@tangle-network/agent-eval 0.65.0 → 0.67.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/CHANGELOG.md +25 -0
  2. package/dist/adapters/otel.d.ts +1 -1
  3. package/dist/campaign/index.d.ts +110 -6
  4. package/dist/campaign/index.js +26 -19
  5. package/dist/campaign/index.js.map +1 -1
  6. package/dist/{chunk-7TPYV2ER.js → chunk-6XQIEUQ2.js} +140 -7
  7. package/dist/chunk-6XQIEUQ2.js.map +1 -0
  8. package/dist/{chunk-HKINEDRZ.js → chunk-DFS3FEXO.js} +3 -2
  9. package/dist/chunk-DFS3FEXO.js.map +1 -0
  10. package/dist/chunk-MZ2IYGGN.js +592 -0
  11. package/dist/chunk-MZ2IYGGN.js.map +1 -0
  12. package/dist/{chunk-4ODZXQV2.js → chunk-NV2PF37Q.js} +645 -2
  13. package/dist/chunk-NV2PF37Q.js.map +1 -0
  14. package/dist/contract/index.d.ts +11 -9
  15. package/dist/contract/index.js +11 -12
  16. package/dist/contract/index.js.map +1 -1
  17. package/dist/hosted/index.d.ts +1 -1
  18. package/dist/hosted/index.js +1 -1
  19. package/dist/{index-CzhtwYBT.d.ts → index-DSEHMwvS.d.ts} +4 -2
  20. package/dist/index.d.ts +251 -7
  21. package/dist/index.js +292 -2
  22. package/dist/index.js.map +1 -1
  23. package/dist/openapi.json +1 -1
  24. package/dist/provenance-CChUqexv.d.ts +314 -0
  25. package/dist/{registry-DPly4_hZ.d.ts → registry-BGKyX6bw.d.ts} +2 -2
  26. package/dist/release-report-CN8hJlhk.d.ts +233 -0
  27. package/dist/reporting.d.ts +4 -3
  28. package/dist/{run-campaign-5J3ED2UJ.js → run-campaign-BVY3RGAZ.js} +2 -3
  29. package/dist/{provenance-lqyLpOYR.d.ts → run-improvement-loop-BKpM5T4t.d.ts} +51 -329
  30. package/dist/statistics-B7yCbi9i.d.ts +253 -0
  31. package/dist/{types-DhqpAi_z.d.ts → types-Croy5h7V.d.ts} +1 -1
  32. package/package.json +1 -1
  33. package/dist/chunk-4ODZXQV2.js.map +0 -1
  34. package/dist/chunk-7TPYV2ER.js.map +0 -1
  35. package/dist/chunk-CZRKD2X2.js +0 -1104
  36. package/dist/chunk-CZRKD2X2.js.map +0 -1
  37. package/dist/chunk-E22YUOAL.js +0 -111
  38. package/dist/chunk-E22YUOAL.js.map +0 -1
  39. package/dist/chunk-HKINEDRZ.js.map +0 -1
  40. package/dist/release-report-DGoeObZT.d.ts +0 -484
  41. package/dist/{run-campaign-5J3ED2UJ.js.map → run-campaign-BVY3RGAZ.js.map} +0 -0
package/dist/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "@tangle-network/agent-eval — wire protocol",
5
- "version": "0.65.0",
5
+ "version": "0.67.0",
6
6
  "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
7
7
  "contact": {
8
8
  "name": "Tangle Network",
package/dist/provenance-CChUqexv.d.ts ADDED
@@ -0,0 +1,314 @@
1
+ import { o as Mutator, I as ImprovementDriver, S as Scenario, G as Gate, f as CampaignResult, M as MutableSurface, k as GateResult, j as GateDecision } from './types-c2R2kfmv.js';
2
+ import { R as RedTeamCase } from './red-team-DW9Ca_tj.js';
3
+ import { R as RunRecord } from './run-record-BgTFzO2r.js';
4
+ import { a as RunCampaignOptions, C as CampaignStorage } from './run-improvement-loop-BKpM5T4t.js';
5
+ import { H as HostedClient, T as TraceSpanEvent } from './index-DSEHMwvS.js';
6
+
7
+ /**
8
+ * @experimental
9
+ *
10
+ * `evolutionaryDriver` — adapts a stateless `Mutator` (population mutation:
11
+ * GEPA / AxGEPA / reflective-mutation) into an `ImprovementDriver`. This is
12
+ * the evolutionary strategy: each generation, mutate the current best surface
13
+ * into N candidates, measure, select. No generation memory beyond the current
14
+ * surface; the loop body handles ranking + promotion.
15
+ *
16
+ * The reflective alternative is agent-runtime's `improvementDriver` with a
17
+ * `reflectiveGenerator` / `agenticGenerator`: it reasons over the report +
18
+ * trace findings to propose targeted edits rather than blind mutations. Both
19
+ * conform to `ImprovementDriver`; the improvement loop is identical regardless
20
+ * of which drives it.
21
+ */
22
+
23
+ interface EvolutionaryDriverOptions<TFindings = unknown> {
24
+ mutator: Mutator<TFindings>;
25
+ /** External findings fed to the mutator each generation. Default: []. */
26
+ findings?: TFindings[];
27
+ }
28
+ declare function evolutionaryDriver<TFindings = unknown>(opts: EvolutionaryDriverOptions<TFindings>): ImprovementDriver<TFindings>;
29
+
30
+ /**
31
+ * @experimental
32
+ *
33
+ * Compose multiple `Gate` implementations — every gate must pass for the
34
+ * composite to ship. Closes the alignment reviewer's "default-only
35
+ * heldOutGate + costGate would happily promote a reward-hacked prompt"
36
+ * concern by making safety gates first-class composable defaults.
37
+ */
38
+
39
+ /** Compose gates — all must `ship` for the composite to `ship`. First
40
+ * non-ship verdict short-circuits the composite verdict, but ALL gates run
41
+ * (so the result records every gate's reason — useful for diagnostics). */
42
+ declare function composeGate<TArtifact = unknown, TScenario extends Scenario = Scenario>(...gates: Array<Gate<TArtifact, TScenario>>): Gate<TArtifact, TScenario>;
43
+
44
+ /**
45
+ * @experimental
46
+ *
47
+ * `defaultProductionGate` — composes the substrate's existing safety
48
+ * primitives (red-team / reward-hacking / canary / heldout) into a single
49
+ * Gate.decide shape. Closes the alignment + Anthropic-SI reviewers' "safety
50
+ * primitives are off the critical path" blocker.
51
+ *
52
+ * The composition is opinionated — when consumers wire `runImprovementLoop`,
53
+ * THIS gate is the default. Consumers can still pass a custom gate to
54
+ * override; the recommended pattern is to compose THIS gate with whatever
55
+ * extra domain-specific gates they need (`composeGate(defaultProductionGate(...), customGate)`).
56
+ */
57
+
58
+ interface DefaultProductionGateOptions {
59
+ /** Required: scenarios held out from training; substrate compares
60
+ * candidate-on-holdout vs baseline-on-holdout. */
61
+ holdoutScenarios: Scenario[];
62
+ /** Minimum held-out lift the **paired-bootstrap CI lower bound** must clear
63
+ * to ship — NOT a point estimate. Default 0 ⇒ "confidently positive at the
64
+ * confidence level". Interpreted in the judge's native composite scale (set
65
+ * e.g. 2 for a 0-100 rubric to require a ≥2-point significant gain). */
66
+ deltaThreshold?: number;
67
+ /** Confidence level for the held-out + dimension bootstraps. Default 0.95. */
68
+ confidence?: number;
69
+ /** Bootstrap resamples. Default 2000. */
70
+ bootstrapResamples?: number;
71
+ /** Fixed bootstrap seed for a deterministic verdict. Default 1337. */
72
+ bootstrapSeed?: number;
73
+ /** Minimum paired holdout observations (scenarios × reps) before a
74
+ * significance claim is allowed; below it the gate HOLDS with `few_runs`
75
+ * rather than reading a degenerate CI. Default 3. */
76
+ minProductiveRuns?: number;
77
+ /** Critical judge dimensions that must NOT significantly regress even when
78
+ * the net composite rises (anti-Goodhart). The gate HOLDS if any listed
79
+ * dimension's paired-delta CI lower bound < −`regressionTolerance`. E.g.
80
+ * `['hallucination_free']` for a legal agent. */
81
+ criticalDimensions?: string[];
82
+ /** Tolerance for the per-dimension regression guard, in the dimension's
83
+ * native scale. When omitted it auto-scales off observed magnitudes:
84
+ * 0.05 on [0,1], 5 on 0-100. */
85
+ regressionTolerance?: number;
86
+ /** Total $ budget for ALL cells in this campaign — including baseline + candidate.
87
+ * Composite verdict refuses to ship when spend exceeded budget. */
88
+ budgetUsd?: number;
89
+ /** Red-team cases to probe candidate outputs against. When omitted the
90
+ * substrate uses `DEFAULT_RED_TEAM_CORPUS`. Provide a domain-specific
91
+ * battery for tighter coverage. */
92
+ redTeamBattery?: RedTeamCase[];
93
+ /** Run records (oldest-first) needed for the reward-hacking detector.
94
+ * Substrate populates from prior production-loop generations. */
95
+ recentRuns?: RunRecord[];
96
+ /** When true, the gate refuses to ship if the reward-hacking detector
97
+ * fires at the `gaming` severity. Default true. */
98
+ blockOnRewardHackingGaming?: boolean;
99
+ }
100
+ declare function defaultProductionGate<TArtifact, TScenario extends Scenario>(options: DefaultProductionGateOptions): Gate<TArtifact, TScenario>;
101
+
102
+ /**
103
+ * @experimental
104
+ *
105
+ * Thin Gate adapter — exposes delta-threshold-on-holdout as a composable
106
+ * `Gate`. Use when you want held-out as one of N composed gates instead of
107
+ * the full `defaultProductionGate` stack.
108
+ */
109
+
110
+ interface HeldOutGateOptions<TScenario extends Scenario = Scenario> {
111
+ scenarios: TScenario[];
112
+ deltaThreshold?: number;
113
+ }
114
+ declare function heldOutGate<TArtifact, TScenario extends Scenario>(options: HeldOutGateOptions<TScenario>): Gate<TArtifact, TScenario>;
115
+
116
+ /**
117
+ * @experimental
118
+ *
119
+ * `runEval` — the simplest preset over `runCampaign`. No optimizer, no
120
+ * gate, no auto-PR. Just: run scenarios through dispatch, score with
121
+ * judges, return CampaignResult.
122
+ *
123
+ * The 80% case for consumers who want a scorecard, not an improvement loop.
124
+ */
125
+
126
+ interface RunEvalOptions<TScenario extends Scenario, TArtifact> extends Omit<RunCampaignOptions<TScenario, TArtifact>, 'runDir'> {
127
+ runDir: string;
128
+ }
129
+ declare function runEval<TScenario extends Scenario, TArtifact>(opts: RunEvalOptions<TScenario, TArtifact>): Promise<CampaignResult<TArtifact, TScenario>>;
130
+
131
+ /**
132
+ * @experimental
133
+ *
134
+ * Loop provenance — the durable, queryable record of WHAT a self-improvement
135
+ * loop did and WHY, plus the OTel spans that let an OTLP collector pivot from
136
+ * an eval-run to the underlying candidate→cell→gate→promote chain.
137
+ *
138
+ * Two artifacts, one source of truth:
139
+ *
140
+ * 1. `LoopProvenanceRecord` — a structured JSON record capturing every
141
+ * candidate (surfaceHash + label + rationale), its measured composite,
142
+ * the gate decision + reasons + delta, the held-out lift, the explicit
143
+ * baseline→candidate diff, and BACKEND PROVENANCE (the
144
+ * `assertRealBackend` verdict + worker call count + model). This is the
145
+ * ingestable audit artifact: the +lift recomputes from it, the "because
146
+ * Z" rationale survives in it, and a stub backend is detectable from it.
147
+ *
148
+ * 2. `loopProvenanceSpans()` — the same chain emitted as OTLP-ingestable
149
+ * `TraceSpanEvent`s, pivoted on the substrate's standard
150
+ * `tangle.runId` / `tangle.scenarioId` / `tangle.cellId` /
151
+ * `tangle.generation` attributes (the same pivots `/adapters/otel`
152
+ * reads). The hosted `/v1/ingest/traces` endpoint receives the FULL loop,
153
+ * not just the `cost.*` spans `runCampaign` already emits per cell.
154
+ *
155
+ * The record is built from the substrate's own loop result + the per-call
156
+ * `RunRecord`s the worker emitted — no new measurement, no recomputation that
157
+ * could drift from what the gate actually saw.
158
+ */
159
+
160
+ /** Stable sha256 (full hex) of a surface's effective text. Code surfaces hash
161
+ * their worktree+base identity since the content lives in git. Distinct from
162
+ * `surfaceHash` (16-char content fingerprint used as a loop identity key);
163
+ * this is the byte-identical-verifiable content hash the provenance record +
164
+ * `RunRecord.promptHash` carry. */
165
+ declare function surfaceContentHash(surface: MutableSurface): string;
166
+ interface LoopProvenanceCandidate {
167
+ /** Generation index this candidate was proposed in. */
168
+ generation: number;
169
+ /** 16-char loop-identity fingerprint (matches `GenerationCandidate.surfaceHash`). */
170
+ surfaceHash: string;
171
+ /** Full sha256 content hash — byte-identical-verifiable. */
172
+ contentHash: string;
173
+ /** Driver label, when the driver returned a `ProposedCandidate`. */
174
+ label?: string;
175
+ /** Driver rationale — the "because Z". When the driver returned a bare
176
+ * surface (blind mutator) this is absent. */
177
+ rationale?: string;
178
+ /** Mean composite this candidate scored on the search split. */
179
+ composite: number;
180
+ /** Whether this candidate was promoted out of its generation. */
181
+ promoted: boolean;
182
+ }
183
+ interface LoopProvenanceBackend {
184
+ /** `assertRealBackend`-grade verdict over the worker call records. */
185
+ verdict: 'real' | 'mixed' | 'stub';
186
+ /** Number of worker LLM calls captured (the audit's "worker call count"). */
187
+ workerCallCount: number;
188
+ /** Distinct model ids observed across worker calls. */
189
+ models: string[];
190
+ totalInputTokens: number;
191
+ totalOutputTokens: number;
192
+ totalCostUsd: number;
193
+ }
194
+ /**
195
+ * The durable provenance record. Aligns to the hosted `EvalRunEvent` path but
196
+ * ADDS the rationale + the explicit baseline→candidate diff (both omitted from
197
+ * the bare hosted event) + backend provenance.
198
+ */
199
+ interface LoopProvenanceRecord {
200
+ schema: 'tangle.loop-provenance.v1';
201
+ runId: string;
202
+ runDir: string;
203
+ timestamp: string;
204
+ /** Baseline + winner surface content hashes — distinguishable, byte-verifiable. */
205
+ baselineContentHash: string;
206
+ winnerContentHash: string;
207
+ /** Driver label/rationale for the promoted change. Absent ⇒ winner == baseline. */
208
+ winnerLabel?: string;
209
+ winnerRationale?: string;
210
+ /** The explicit baseline→winner unified diff the gate decided on. */
211
+ diff: string;
212
+ /** Every candidate across every generation, each carrying its rationale. */
213
+ candidates: LoopProvenanceCandidate[];
214
+ /** The gate verdict — decision + reasons + contributing gates + delta. */
215
+ gate: {
216
+ decision: GateDecision;
217
+ reasons: string[];
218
+ delta?: number;
219
+ contributingGates: Array<{
220
+ name: string;
221
+ passed: boolean;
222
+ }>;
223
+ };
224
+ /** baseline-on-holdout composite mean. */
225
+ baselineHoldoutComposite: number;
226
+ /** winner-on-holdout composite mean. */
227
+ winnerHoldoutComposite: number;
228
+ /** winnerHoldout - baselineHoldout — RECOMPUTABLE from this record. */
229
+ heldOutLift: number;
230
+ /** Backend provenance: stub-vs-real verdict + worker call count + models. */
231
+ backend: LoopProvenanceBackend;
232
+ totalCostUsd: number;
233
+ totalDurationMs: number;
234
+ }
235
+ interface BuildLoopProvenanceArgs<TArtifact, TScenario extends Scenario> {
236
+ runId: string;
237
+ runDir: string;
238
+ timestamp: string;
239
+ baselineSurface: MutableSurface;
240
+ winnerSurface: MutableSurface;
241
+ winnerLabel?: string;
242
+ winnerRationale?: string;
243
+ diff: string;
244
+ /** Per-generation candidate records straight off the loop result. */
245
+ generations: Array<{
246
+ generationIndex: number;
247
+ candidates: Array<{
248
+ surfaceHash: string;
249
+ composite: number;
250
+ label?: string;
251
+ rationale?: string;
252
+ }>;
253
+ promoted: string[];
254
+ /** Surfaces measured this generation, keyed positionally to candidates so
255
+ * the content hash can be computed from the real surface text. */
256
+ surfaces: Array<{
257
+ surfaceHash: string;
258
+ surface: MutableSurface;
259
+ }>;
260
+ }>;
261
+ gate: GateResult;
262
+ baselineOnHoldout: CampaignResult<TArtifact, TScenario>;
263
+ winnerOnHoldout: CampaignResult<TArtifact, TScenario>;
264
+ /** Worker call records — the source for backend provenance. */
265
+ workerRecords: ReadonlyArray<RunRecord>;
266
+ totalCostUsd: number;
267
+ totalDurationMs: number;
268
+ }
269
+ /** Build the durable provenance record from a completed loop result. */
270
+ declare function buildLoopProvenanceRecord<TArtifact, TScenario extends Scenario>(args: BuildLoopProvenanceArgs<TArtifact, TScenario>): LoopProvenanceRecord;
271
+ /**
272
+ * Build the loop's OTLP-ingestable spans from a provenance record. One root
273
+ * span per loop (`tangle.runId`), one span per generation, one span per
274
+ * candidate (carrying its surfaceHash + label), and one span for the gate
275
+ * decision (carrying reasons + delta + lift). Candidate + gate spans pivot on
276
+ * the same `tangle.runId` / `tangle.generation` attributes `/adapters/otel`
277
+ * reads, so the hosted collector reconstructs the full tree.
278
+ *
279
+ * Times are synthesized monotonically off a single base so the span tree is
280
+ * orderable; the substrate does not retain per-candidate wall-clock starts.
281
+ */
282
+ declare function loopProvenanceSpans(record: LoopProvenanceRecord, opts?: {
283
+ baseTimeMs?: number;
284
+ }): TraceSpanEvent[];
285
+ /** Canonical durable paths under the run dir. */
286
+ declare function provenanceRecordPath(runDir: string): string;
287
+ declare function provenanceSpansPath(runDir: string): string;
288
+ interface EmitLoopProvenanceResult {
289
+ record: LoopProvenanceRecord;
290
+ spans: TraceSpanEvent[];
291
+ /** Absolute paths the record + spans were written to, when storage persists. */
292
+ recordPath: string;
293
+ spansPath: string;
294
+ }
295
+ interface EmitLoopProvenanceArgs<TArtifact, TScenario extends Scenario> extends BuildLoopProvenanceArgs<TArtifact, TScenario> {
296
+ /** Storage the record + spans are written through. */
297
+ storage: CampaignStorage;
298
+ /** When set, the spans are also shipped to the hosted `/v1/ingest/traces`
299
+ * endpoint so the collector receives the full loop, not just `cost.*`. */
300
+ hostedClient?: HostedClient;
301
+ }
302
+ /**
303
+ * Build the provenance record + OTel spans and persist them durably under the
304
+ * run dir (and ship spans to a hosted collector when one is wired). Returns
305
+ * both artifacts so the caller can assert on / re-derive from them.
306
+ *
307
+ * Fail-loud: the durable write throws on storage failure (a swallowed write is
308
+ * exactly the "emitted but lost" failure this closes). The hosted span ship is
309
+ * the one best-effort leg — its failure is logged, not thrown, so an offline
310
+ * collector never fails the loop (the durable artifact is the source of truth).
311
+ */
312
+ declare function emitLoopProvenance<TArtifact, TScenario extends Scenario>(args: EmitLoopProvenanceArgs<TArtifact, TScenario>): Promise<EmitLoopProvenanceResult>;
313
+
314
+ export { type BuildLoopProvenanceArgs as B, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, type HeldOutGateOptions as H, type LoopProvenanceRecord as L, type RunEvalOptions as R, type EmitLoopProvenanceArgs as a, type EmitLoopProvenanceResult as b, composeGate as c, defaultProductionGate as d, evolutionaryDriver as e, type LoopProvenanceBackend as f, type LoopProvenanceCandidate as g, heldOutGate as h, buildLoopProvenanceRecord as i, emitLoopProvenance as j, provenanceSpansPath as k, loopProvenanceSpans as l, provenanceRecordPath as p, runEval as r, surfaceContentHash as s };
package/dist/{registry-DPly4_hZ.d.ts → registry-BGKyX6bw.d.ts} CHANGED
@@ -1,7 +1,7 @@
1
1
  import { b as LlmCallRequest, c as LlmCallResult } from './llm-client-DbjLfz-K.js';
2
2
  import { R as RunRecord } from './run-record-BgTFzO2r.js';
3
3
  import { T as TraceAnalysisStore } from './store-jzKpMl16.js';
4
- import { J as JudgeInput } from './types-DhqpAi_z.js';
4
+ import { a as JudgeInput } from './types-Croy5h7V.js';
5
5
 
6
6
  /**
7
7
  * ChatClient — the single LLM abstraction analysts call.
@@ -454,4 +454,4 @@ declare class AnalystRegistry {
454
454
  private routeInput;
455
455
  }
456
456
 
457
- export { AnalystRegistry as A, type BudgetPolicy as B, type ChatCallOpts as C, type DirectProviderTransportOpts as D, type EvidenceRef as E, type MockTransportOpts as M, type RegistryRunOpts as R, type SandboxSdkTransportOpts as S, type Analyst as a, type AnalystSeverity as b, type AnalystFinding as c, type AnalystCost as d, type AnalystContext as e, type AnalystHooks as f, type AnalystInputKind as g, type AnalystRegistryOptions as h, type AnalystRequirements as i, type AnalystRunEvent as j, type AnalystRunInputs as k, type AnalystRunResult as l, type AnalystRunSummary as m, type ChatClient as n, type ChatRequest as o, type ChatResponse as p, type ChatTransport as q, type CliBridgeTransportOpts as r, type CreateChatClientOpts as s, type RouterTransportOpts as t, computeFindingId as u, createChatClient as v, makeFinding as w };
457
+ export { AnalystRegistry as A, type BudgetPolicy as B, type ChatRequest as C, type DirectProviderTransportOpts as D, type EvidenceRef as E, type MockTransportOpts as M, type RegistryRunOpts as R, type SandboxSdkTransportOpts as S, type Analyst as a, type AnalystSeverity as b, type AnalystFinding as c, type AnalystCost as d, type AnalystContext as e, type CreateChatClientOpts as f, type AnalystHooks as g, type AnalystInputKind as h, type AnalystRegistryOptions as i, type AnalystRequirements as j, type AnalystRunEvent as k, type AnalystRunInputs as l, type AnalystRunResult as m, type AnalystRunSummary as n, type ChatCallOpts as o, type ChatClient as p, type ChatResponse as q, type ChatTransport as r, type CliBridgeTransportOpts as s, type RouterTransportOpts as t, computeFindingId as u, createChatClient as v, makeFinding as w };
package/dist/release-report-CN8hJlhk.d.ts ADDED
@@ -0,0 +1,233 @@
1
+ import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-B2kL-fSM.js';
2
+ import { m as GateDecision } from './summary-report-ByiOUrHj.js';
3
+ import { R as RunRecord, b as RunSplitTag } from './run-record-BgTFzO2r.js';
4
+
5
+ /**
6
+ * Release confidence gate.
7
+ *
8
+ * This is the production-facing composition layer over the lower-level
9
+ * primitives:
10
+ * - Dataset manifests prove corpus/version coverage.
11
+ * - RunRecord rows prove reproducible search/holdout outcomes.
12
+ * - Multi-shot trace evidence carries turn counts and ASI diagnostics.
13
+ * - HeldOutGate decisions remain the paired promotion authority.
14
+ *
15
+ * The gate is intentionally pure and conservative. Missing declared evidence
16
+ * fails closed instead of being treated as a neutral zero.
17
+ */
18
+
19
+ /** Severity of an actionable finding attached to a run/trace. */
20
+ type AsiSeverity = 'info' | 'warning' | 'error' | 'critical';
21
+ /** Actionable side-info — a diagnosed finding the loop can act on. */
22
+ interface ActionableSideInfo {
23
+ /** Stable expectation/check id when available. */
24
+ expectationId?: string;
25
+ /** Human-readable diagnosis of what happened. */
26
+ message: string;
27
+ severity?: AsiSeverity;
28
+ /** Concrete trace excerpt, file path, tool call, screenshot id, etc. */
29
+ evidence?: string;
30
+ /** Prompt/tool/context surface likely responsible. */
31
+ responsibleSurface?: string;
32
+ /** Suggested fix in natural language. */
33
+ suggestion?: string;
34
+ /** Whether this expectation was satisfied. Defaults to false for ASI rows. */
35
+ matched?: boolean;
36
+ metadata?: Record<string, unknown>;
37
+ }
38
+ type ReleaseConfidenceStatus = 'pass' | 'warn' | 'fail';
39
+ type ReleaseConfidenceAxisName = 'corpus' | 'quality' | 'generalization' | 'diagnostics' | 'efficiency';
40
+ interface ReleaseTraceEvidence {
41
+ scenarioId: string;
42
+ candidateId?: string;
43
+ split?: RunSplitTag;
44
+ score?: number;
45
+ ok?: boolean;
46
+ turnCount?: number;
47
+ costUsd?: number;
48
+ durationMs?: number;
49
+ failureMode?: string;
50
+ asi?: ActionableSideInfo[];
51
+ metadata?: Record<string, unknown>;
52
+ }
53
+ interface ReleaseConfidenceThresholds {
54
+ /** Require a Dataset manifest or explicit scenarios. Default true. */
55
+ requireCorpus?: boolean;
56
+ minScenarioCount?: number;
57
+ minSearchRuns?: number;
58
+ minHoldoutRuns?: number;
59
+ /** Require at least one holdout scenario/run. Default true. */
60
+ requireHoldout?: boolean;
61
+ minPassRate?: number;
62
+ minMeanScore?: number;
63
+ /** Search mean may exceed holdout mean by at most this much. */
64
+ maxOverfitGap?: number;
65
+ maxMeanCostUsd?: number;
66
+ maxP95WallMs?: number;
67
+ /** Low-score/failed rows must carry ASI. Default true. */
68
+ requireAsiForFailures?: boolean;
69
+ /** Score below this is considered a failure for ASI coverage. Default 0.5. */
70
+ failureScoreThreshold?: number;
71
+ }
72
+ interface ReleaseConfidenceInput {
73
+ target: string;
74
+ candidateId?: string;
75
+ baselineId?: string;
76
+ dataset?: DatasetManifest;
77
+ scenarios?: readonly DatasetScenario[];
78
+ runs?: readonly RunRecord[];
79
+ traces?: readonly ReleaseTraceEvidence[];
80
+ gateDecision?: GateDecision | null;
81
+ thresholds?: ReleaseConfidenceThresholds;
82
+ }
83
+ interface ReleaseConfidenceAxis {
84
+ name: ReleaseConfidenceAxisName;
85
+ status: ReleaseConfidenceStatus;
86
+ score: number;
87
+ detail: string;
88
+ }
89
+ interface ReleaseConfidenceIssue {
90
+ axis: ReleaseConfidenceAxisName;
91
+ severity: 'critical' | 'warning';
92
+ code: string;
93
+ detail: string;
94
+ }
95
+ interface ReleaseConfidenceMetrics {
96
+ scenarioCount: number;
97
+ searchRuns: number;
98
+ holdoutRuns: number;
99
+ passRate: number;
100
+ meanScore: number;
101
+ searchMeanScore: number;
102
+ holdoutMeanScore: number;
103
+ overfitGap: number;
104
+ meanCostUsd: number;
105
+ p95WallMs: number;
106
+ failedRows: number;
107
+ failuresWithAsi: number;
108
+ singleShotTraces: number;
109
+ multiShotTraces: number;
110
+ splitCounts: Record<DatasetSplit, number>;
111
+ domainCounts: Record<string, number>;
112
+ failureModeCounts: Record<string, number>;
113
+ responsibleSurfaceCounts: Record<string, number>;
114
+ }
115
+ interface ReleaseConfidenceScorecard {
116
+ target: string;
117
+ candidateId: string | null;
118
+ baselineId: string | null;
119
+ status: ReleaseConfidenceStatus;
120
+ promote: boolean;
121
+ axes: ReleaseConfidenceAxis[];
122
+ issues: ReleaseConfidenceIssue[];
123
+ metrics: ReleaseConfidenceMetrics;
124
+ dataset: DatasetManifest | null;
125
+ gateDecision: GateDecision | null;
126
+ summary: string;
127
+ }
128
+ declare function evaluateReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
129
+ declare function assertReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
130
+
131
+ /**
132
+ * Bootstrap-CI promotion gate.
133
+ *
134
+ * In any iterative-improvement loop (GEPA, prompt evolution, dataset
135
+ * curation), the question is "did this generation actually improve, or are
136
+ * we celebrating noise?". With small N and noisy outcomes, point-estimate
137
+ * deltas lie. Bootstrap confidence intervals tell the operator whether the
138
+ * delta is real before code or prompts get promoted.
139
+ *
140
+ * This module is pure functions — no I/O, no model calls. Easy to unit-test
141
+ * and to compose into any verdict gate.
142
+ *
143
+ * Default gate:
144
+ * - Bootstrap mean baseline vs candidate (1k resamples).
145
+ * - Compute the delta distribution; pass if the lower CI bound > 0.
146
+ * - Tunable confidence (default 95%) and resample count.
147
+ *
148
+ * Verdict semantics intentionally match the existing `experiments.jsonl`
149
+ * vocabulary:
150
+ * - ADVANCE: candidate's CI lower bound > baseline mean (real win)
151
+ * - KEEP: overlap, but candidate point estimate >= baseline (neutral)
152
+ * - REVERT: candidate's CI upper bound < baseline mean (real regression)
153
+ * - INCONCLUSIVE: not enough samples or CI straddles zero with no signal
154
+ */
155
+ type Verdict = 'ADVANCE' | 'KEEP' | 'REVERT' | 'INCONCLUSIVE';
156
+ interface BootstrapResult {
157
+ baselineMean: number;
158
+ candidateMean: number;
159
+ /** candidateMean - baselineMean, point estimate. */
160
+ delta: number;
161
+ /** Lower bound of the (1 - alpha) CI on the delta. */
162
+ ciLower: number;
163
+ /** Upper bound of the (1 - alpha) CI on the delta. */
164
+ ciUpper: number;
165
+ /** Number of bootstrap resamples used. */
166
+ iterations: number;
167
+ alpha: number;
168
+ verdict: Verdict;
169
+ }
170
+ interface BootstrapOptions {
171
+ /** Confidence level alpha (default 0.05 → 95% CI). */
172
+ alpha?: number;
173
+ /** Number of resamples (default 1000). */
174
+ iterations?: number;
175
+ /**
176
+ * Minimum total samples (baseline + candidate) below which we always
177
+ * return INCONCLUSIVE — bootstrap with too few samples is meaningless.
178
+ * Default 6 (combined).
179
+ */
180
+ minTotalSamples?: number;
181
+ /** RNG seed for reproducibility. Default: Math.random. */
182
+ seed?: number;
183
+ }
184
+ /**
185
+ * Compute the bootstrap CI on (candidateMean - baselineMean) and a verdict.
186
+ *
187
+ * Uses simple percentile bootstrap on the difference of resampled means.
188
+ * That's the standard non-parametric primitive — no distributional
189
+ * assumptions, robust to skew, easy to reason about.
190
+ */
191
+ declare function bootstrapCi(baseline: number[], candidate: number[], options?: BootstrapOptions): BootstrapResult;
192
+ /**
193
+ * Judge-replay promotion gate.
194
+ *
195
+ * The cheap inner-loop judge that drives an evolution run is by definition
196
+ * fast and noisy. When you're about to promote a winning variant to the
197
+ * canonical default, you want a STRONGER judge (a more expensive model, a
198
+ * human grader, a separately-trained reward model) to confirm the win
199
+ * generalises beyond the inner loop.
200
+ *
201
+ * This helper takes raw winner + baseline outputs, scores both through the
202
+ * stronger judge, and applies `bootstrapCi`. ADVANCE means the stronger
203
+ * judge agrees the winner is real with the configured confidence. Doesn't
204
+ * matter what shape your "output" is — pass a string, an object, anything
205
+ * the judge can read.
206
+ */
207
+ interface JudgeReplayGateArgs<TOutput> {
208
+ baselineOutputs: TOutput[];
209
+ candidateOutputs: TOutput[];
210
+ /** Stronger judge — async to allow LLM calls. Return a 0..N scalar score. */
211
+ judge: (output: TOutput) => Promise<number> | number;
212
+ alpha?: number;
213
+ iterations?: number;
214
+ /** RNG seed for reproducibility. */
215
+ seed?: number;
216
+ /** Maximum concurrent judge calls. Default 4. */
217
+ judgeConcurrency?: number;
218
+ }
219
+ declare function judgeReplayGate<TOutput>(args: JudgeReplayGateArgs<TOutput>): Promise<BootstrapResult & {
220
+ baselineSamples: number;
221
+ candidateSamples: number;
222
+ }>;
223
+
224
+ interface RenderReleaseReportOptions {
225
+ title?: string;
226
+ runs?: readonly RunRecord[];
227
+ comparator?: string;
228
+ traceAnalystFindings?: readonly string[];
229
+ nextActions?: readonly string[];
230
+ }
231
+ declare function renderReleaseReport(scorecard: ReleaseConfidenceScorecard, options?: RenderReleaseReportOptions): string;
232
+
233
+ export { type ActionableSideInfo as A, type BootstrapOptions as B, type JudgeReplayGateArgs as J, type ReleaseConfidenceAxis as R, type Verdict as V, type BootstrapResult as a, type ReleaseConfidenceAxisName as b, type ReleaseConfidenceInput as c, type ReleaseConfidenceIssue as d, type ReleaseConfidenceMetrics as e, type ReleaseConfidenceScorecard as f, type ReleaseConfidenceStatus as g, type ReleaseConfidenceThresholds as h, type ReleaseTraceEvidence as i, type RenderReleaseReportOptions as j, assertReleaseConfidence as k, bootstrapCi as l, evaluateReleaseConfidence as m, judgeReplayGate as n, type AsiSeverity as o, renderReleaseReport as r };
package/dist/reporting.d.ts CHANGED
@@ -1,14 +1,15 @@
1
1
  export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-D_4BSXGV.js';
2
- export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-DGoeObZT.js';
2
+ export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, R as ReleaseConfidenceAxis, b as ReleaseConfidenceAxisName, c as ReleaseConfidenceInput, d as ReleaseConfidenceIssue, e as ReleaseConfidenceMetrics, f as ReleaseConfidenceScorecard, g as ReleaseConfidenceStatus, h as ReleaseConfidenceThresholds, i as ReleaseTraceEvidence, j as RenderReleaseReportOptions, V as Verdict, k as assertReleaseConfidence, l as bootstrapCi, m as evaluateReleaseConfidence, n as judgeReplayGate, r as renderReleaseReport } from './release-report-CN8hJlhk.js';
3
3
  export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
4
+ export { P as PairedBootstrapOptions, a as PairedBootstrapResult, b as benjaminiHochberg, p as pairedBootstrap, w as wilcoxonSignedRank } from './statistics-B7yCbi9i.js';
4
5
  export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-ByiOUrHj.js';
5
6
  import './run-record-BgTFzO2r.js';
6
7
  import './errors-Dwqw-T_m.js';
7
8
  import './schema-m0gsnbt3.js';
8
9
  import './outcome-store-D6KWmYvj.js';
10
+ import './dataset-B2kL-fSM.js';
9
11
  import './judge-calibration-DilmB3Ml.js';
10
- import './types-DhqpAi_z.js';
12
+ import './types-Croy5h7V.js';
11
13
  import '@tangle-network/tcloud';
12
- import './dataset-B2kL-fSM.js';
13
14
  import './failure-cluster-CL7IVgkJ.js';
14
15
  import './store-CKUAgsJz.js';
package/dist/{run-campaign-5J3ED2UJ.js → run-campaign-BVY3RGAZ.js} CHANGED
@@ -1,11 +1,10 @@
1
1
  import {
2
2
  runCampaign
3
- } from "./chunk-7TPYV2ER.js";
4
- import "./chunk-E22YUOAL.js";
3
+ } from "./chunk-6XQIEUQ2.js";
5
4
  import "./chunk-ITBRCT73.js";
6
5
  import "./chunk-3BFEG2F6.js";
7
6
  import "./chunk-PZ5AY32C.js";
8
7
  export {
9
8
  runCampaign
10
9
  };
11
- //# sourceMappingURL=run-campaign-5J3ED2UJ.js.map
10
+ //# sourceMappingURL=run-campaign-BVY3RGAZ.js.map