@tangle-network/agent-eval 0.65.0 → 0.67.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +25 -0
- package/dist/adapters/otel.d.ts +1 -1
- package/dist/campaign/index.d.ts +110 -6
- package/dist/campaign/index.js +26 -19
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-7TPYV2ER.js → chunk-6XQIEUQ2.js} +140 -7
- package/dist/chunk-6XQIEUQ2.js.map +1 -0
- package/dist/{chunk-HKINEDRZ.js → chunk-DFS3FEXO.js} +3 -2
- package/dist/chunk-DFS3FEXO.js.map +1 -0
- package/dist/chunk-MZ2IYGGN.js +592 -0
- package/dist/chunk-MZ2IYGGN.js.map +1 -0
- package/dist/{chunk-4ODZXQV2.js → chunk-NV2PF37Q.js} +645 -2
- package/dist/chunk-NV2PF37Q.js.map +1 -0
- package/dist/contract/index.d.ts +11 -9
- package/dist/contract/index.js +11 -12
- package/dist/contract/index.js.map +1 -1
- package/dist/hosted/index.d.ts +1 -1
- package/dist/hosted/index.js +1 -1
- package/dist/{index-CzhtwYBT.d.ts → index-DSEHMwvS.d.ts} +4 -2
- package/dist/index.d.ts +251 -7
- package/dist/index.js +292 -2
- package/dist/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/dist/provenance-CChUqexv.d.ts +314 -0
- package/dist/{registry-DPly4_hZ.d.ts → registry-BGKyX6bw.d.ts} +2 -2
- package/dist/release-report-CN8hJlhk.d.ts +233 -0
- package/dist/reporting.d.ts +4 -3
- package/dist/{run-campaign-5J3ED2UJ.js → run-campaign-BVY3RGAZ.js} +2 -3
- package/dist/{provenance-lqyLpOYR.d.ts → run-improvement-loop-BKpM5T4t.d.ts} +51 -329
- package/dist/statistics-B7yCbi9i.d.ts +253 -0
- package/dist/{types-DhqpAi_z.d.ts → types-Croy5h7V.d.ts} +1 -1
- package/package.json +1 -1
- package/dist/chunk-4ODZXQV2.js.map +0 -1
- package/dist/chunk-7TPYV2ER.js.map +0 -1
- package/dist/chunk-CZRKD2X2.js +0 -1104
- package/dist/chunk-CZRKD2X2.js.map +0 -1
- package/dist/chunk-E22YUOAL.js +0 -111
- package/dist/chunk-E22YUOAL.js.map +0 -1
- package/dist/chunk-HKINEDRZ.js.map +0 -1
- package/dist/release-report-DGoeObZT.d.ts +0 -484
- /package/dist/{run-campaign-5J3ED2UJ.js.map → run-campaign-BVY3RGAZ.js.map} +0 -0
package/dist/openapi.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"openapi": "3.1.0",
|
|
3
3
|
"info": {
|
|
4
4
|
"title": "@tangle-network/agent-eval — wire protocol",
|
|
5
|
-
"version": "0.65.0",
|
|
5
|
+
"version": "0.67.0",
|
|
6
6
|
"description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
|
|
7
7
|
"contact": {
|
|
8
8
|
"name": "Tangle Network",
|
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
import { o as Mutator, I as ImprovementDriver, S as Scenario, G as Gate, f as CampaignResult, M as MutableSurface, k as GateResult, j as GateDecision } from './types-c2R2kfmv.js';
|
|
2
|
+
import { R as RedTeamCase } from './red-team-DW9Ca_tj.js';
|
|
3
|
+
import { R as RunRecord } from './run-record-BgTFzO2r.js';
|
|
4
|
+
import { a as RunCampaignOptions, C as CampaignStorage } from './run-improvement-loop-BKpM5T4t.js';
|
|
5
|
+
import { H as HostedClient, T as TraceSpanEvent } from './index-DSEHMwvS.js';
|
|
6
|
+
|
|
7
|
+
/**
|
|
8
|
+
* @experimental
|
|
9
|
+
*
|
|
10
|
+
* `evolutionaryDriver` — adapts a stateless `Mutator` (population mutation:
|
|
11
|
+
* GEPA / AxGEPA / reflective-mutation) into an `ImprovementDriver`. This is
|
|
12
|
+
* the evolutionary strategy: each generation, mutate the current best surface
|
|
13
|
+
* into N candidates, measure, select. No generation memory beyond the current
|
|
14
|
+
* surface; the loop body handles ranking + promotion.
|
|
15
|
+
*
|
|
16
|
+
* The reflective alternative is agent-runtime's `improvementDriver` with a
|
|
17
|
+
* `reflectiveGenerator` / `agenticGenerator`: it reasons over the report +
|
|
18
|
+
* trace findings to propose targeted edits rather than blind mutations. Both
|
|
19
|
+
* conform to `ImprovementDriver`; the improvement loop is identical regardless
|
|
20
|
+
* of which drives it.
|
|
21
|
+
*/
|
|
22
|
+
|
|
23
|
+
interface EvolutionaryDriverOptions<TFindings = unknown> {
|
|
24
|
+
mutator: Mutator<TFindings>;
|
|
25
|
+
/** External findings fed to the mutator each generation. Default: []. */
|
|
26
|
+
findings?: TFindings[];
|
|
27
|
+
}
|
|
28
|
+
declare function evolutionaryDriver<TFindings = unknown>(opts: EvolutionaryDriverOptions<TFindings>): ImprovementDriver<TFindings>;
|
|
29
|
+
|
|
30
|
+
/**
|
|
31
|
+
* @experimental
|
|
32
|
+
*
|
|
33
|
+
* Compose multiple `Gate` implementations — every gate must pass for the
|
|
34
|
+
* composite to ship. Closes the alignment reviewer's "default-only
|
|
35
|
+
* heldOutGate + costGate would happily promote a reward-hacked prompt"
|
|
36
|
+
* concern by making safety gates first-class composable defaults.
|
|
37
|
+
*/
|
|
38
|
+
|
|
39
|
+
/** Compose gates — all must `ship` for the composite to `ship`. First
|
|
40
|
+
* non-ship verdict decides the composite verdict, but ALL gates still run
|
|
41
|
+
* (so the result records every gate's reason — useful for diagnostics). */
|
|
42
|
+
declare function composeGate<TArtifact = unknown, TScenario extends Scenario = Scenario>(...gates: Array<Gate<TArtifact, TScenario>>): Gate<TArtifact, TScenario>;
|
|
43
|
+
|
|
44
|
+
/**
|
|
45
|
+
* @experimental
|
|
46
|
+
*
|
|
47
|
+
* `defaultProductionGate` — composes the substrate's existing safety
|
|
48
|
+
* primitives (red-team / reward-hacking / canary / heldout) into a single
|
|
49
|
+
* Gate.decide shape. Closes the alignment + Anthropic-SI reviewers' "safety
|
|
50
|
+
* primitives are off the critical path" blocker.
|
|
51
|
+
*
|
|
52
|
+
* The composition is opinionated — when consumers wire `runImprovementLoop`,
|
|
53
|
+
* THIS gate is the default. Consumers can still pass a custom gate to
|
|
54
|
+
* override; the recommended pattern is to compose THIS gate with whatever
|
|
55
|
+
* extra domain-specific gates they need (`composeGate(defaultProductionGate(...), customGate)`).
|
|
56
|
+
*/
|
|
57
|
+
|
|
58
|
+
interface DefaultProductionGateOptions {
|
|
59
|
+
/** Required: scenarios held out from training; substrate compares
|
|
60
|
+
* candidate-on-holdout vs baseline-on-holdout. */
|
|
61
|
+
holdoutScenarios: Scenario[];
|
|
62
|
+
/** Minimum held-out lift the **paired-bootstrap CI lower bound** must clear
|
|
63
|
+
* to ship — NOT a point estimate. Default 0 ⇒ "confidently positive at the
|
|
64
|
+
* confidence level". Interpreted in the judge's native composite scale (set
|
|
65
|
+
* e.g. 2 for a 0-100 rubric to require a ≥2-point significant gain). */
|
|
66
|
+
deltaThreshold?: number;
|
|
67
|
+
/** Confidence level for the held-out + dimension bootstraps. Default 0.95. */
|
|
68
|
+
confidence?: number;
|
|
69
|
+
/** Bootstrap resamples. Default 2000. */
|
|
70
|
+
bootstrapResamples?: number;
|
|
71
|
+
/** Fixed bootstrap seed for a deterministic verdict. Default 1337. */
|
|
72
|
+
bootstrapSeed?: number;
|
|
73
|
+
/** Minimum paired holdout observations (scenarios × reps) before a
|
|
74
|
+
* significance claim is allowed; below it the gate HOLDS with `few_runs`
|
|
75
|
+
* rather than reading a degenerate CI. Default 3. */
|
|
76
|
+
minProductiveRuns?: number;
|
|
77
|
+
/** Critical judge dimensions that must NOT significantly regress even when
|
|
78
|
+
* the net composite rises (anti-Goodhart). The gate HOLDS if any listed
|
|
79
|
+
* dimension's paired-delta CI lower bound < −`regressionTolerance`. E.g.
|
|
80
|
+
* `['hallucination_free']` for a legal agent. */
|
|
81
|
+
criticalDimensions?: string[];
|
|
82
|
+
/** Tolerance for the per-dimension regression guard, in the dimension's
|
|
83
|
+
* native scale. When omitted it auto-scales off observed magnitudes:
|
|
84
|
+
* 0.05 on [0,1], 5 on 0-100. */
|
|
85
|
+
regressionTolerance?: number;
|
|
86
|
+
/** Total $ budget for ALL cells in this campaign — including baseline + candidate.
|
|
87
|
+
* The composite verdict refuses to ship when spend exceeds the budget. */
|
|
88
|
+
budgetUsd?: number;
|
|
89
|
+
/** Red-team cases to probe candidate outputs against. When omitted the
|
|
90
|
+
* substrate uses `DEFAULT_RED_TEAM_CORPUS`. Provide a domain-specific
|
|
91
|
+
* battery for tighter coverage. */
|
|
92
|
+
redTeamBattery?: RedTeamCase[];
|
|
93
|
+
/** Run records (oldest-first) needed for the reward-hacking detector.
|
|
94
|
+
* Substrate populates from prior production-loop generations. */
|
|
95
|
+
recentRuns?: RunRecord[];
|
|
96
|
+
/** When true, the gate refuses to ship if the reward-hacking detector
|
|
97
|
+
* fires at the `gaming` severity. Default true. */
|
|
98
|
+
blockOnRewardHackingGaming?: boolean;
|
|
99
|
+
}
|
|
100
|
+
declare function defaultProductionGate<TArtifact, TScenario extends Scenario>(options: DefaultProductionGateOptions): Gate<TArtifact, TScenario>;
|
|
101
|
+
|
|
102
|
+
/**
|
|
103
|
+
* @experimental
|
|
104
|
+
*
|
|
105
|
+
* Thin Gate adapter — exposes delta-threshold-on-holdout as a composable
|
|
106
|
+
* `Gate`. Use when you want held-out as one of N composed gates instead of
|
|
107
|
+
* the full `defaultProductionGate` stack.
|
|
108
|
+
*/
|
|
109
|
+
|
|
110
|
+
interface HeldOutGateOptions<TScenario extends Scenario = Scenario> {
|
|
111
|
+
scenarios: TScenario[];
|
|
112
|
+
deltaThreshold?: number;
|
|
113
|
+
}
|
|
114
|
+
declare function heldOutGate<TArtifact, TScenario extends Scenario>(options: HeldOutGateOptions<TScenario>): Gate<TArtifact, TScenario>;
|
|
115
|
+
|
|
116
|
+
/**
|
|
117
|
+
* @experimental
|
|
118
|
+
*
|
|
119
|
+
* `runEval` — the simplest preset over `runCampaign`. No optimizer, no
|
|
120
|
+
* gate, no auto-PR. Just: run scenarios through dispatch, score with
|
|
121
|
+
* judges, return CampaignResult.
|
|
122
|
+
*
|
|
123
|
+
* The 80% case for consumers who want a scorecard, not an improvement loop.
|
|
124
|
+
*/
|
|
125
|
+
|
|
126
|
+
interface RunEvalOptions<TScenario extends Scenario, TArtifact> extends Omit<RunCampaignOptions<TScenario, TArtifact>, 'runDir'> {
|
|
127
|
+
runDir: string;
|
|
128
|
+
}
|
|
129
|
+
declare function runEval<TScenario extends Scenario, TArtifact>(opts: RunEvalOptions<TScenario, TArtifact>): Promise<CampaignResult<TArtifact, TScenario>>;
|
|
130
|
+
|
|
131
|
+
/**
|
|
132
|
+
* @experimental
|
|
133
|
+
*
|
|
134
|
+
* Loop provenance — the durable, queryable record of WHAT a self-improvement
|
|
135
|
+
* loop did and WHY, plus the OTel spans that let an OTLP collector pivot from
|
|
136
|
+
* an eval-run to the underlying candidate→cell→gate→promote chain.
|
|
137
|
+
*
|
|
138
|
+
* Two artifacts, one source of truth:
|
|
139
|
+
*
|
|
140
|
+
* 1. `LoopProvenanceRecord` — a structured JSON record capturing every
|
|
141
|
+
* candidate (surfaceHash + label + rationale), its measured composite,
|
|
142
|
+
* the gate decision + reasons + delta, the held-out lift, the explicit
|
|
143
|
+
* baseline→candidate diff, and BACKEND PROVENANCE (the
|
|
144
|
+
* `assertRealBackend` verdict + worker call count + model). This is the
|
|
145
|
+
* ingestable audit artifact: the +lift recomputes from it, the "because
|
|
146
|
+
* Z" rationale survives in it, and a stub backend is detectable from it.
|
|
147
|
+
*
|
|
148
|
+
* 2. `loopProvenanceSpans()` — the same chain emitted as OTLP-ingestable
|
|
149
|
+
* `TraceSpanEvent`s, pivoted on the substrate's standard
|
|
150
|
+
* `tangle.runId` / `tangle.scenarioId` / `tangle.cellId` /
|
|
151
|
+
* `tangle.generation` attributes (the same pivots `/adapters/otel`
|
|
152
|
+
* reads). The hosted `/v1/ingest/traces` endpoint receives the FULL loop,
|
|
153
|
+
* not just the `cost.*` spans `runCampaign` already emits per cell.
|
|
154
|
+
*
|
|
155
|
+
* The record is built from the substrate's own loop result + the per-call
|
|
156
|
+
* `RunRecord`s the worker emitted — no new measurement, no recomputation that
|
|
157
|
+
* could drift from what the gate actually saw.
|
|
158
|
+
*/
|
|
159
|
+
|
|
160
|
+
/** Stable sha256 (full hex) of a surface's effective text. Code surfaces hash
|
|
161
|
+
* their worktree+base identity since the content lives in git. Distinct from
|
|
162
|
+
* `surfaceHash` (16-char content fingerprint used as a loop identity key);
|
|
163
|
+
* this is the byte-identical-verifiable content hash the provenance record +
|
|
164
|
+
* `RunRecord.promptHash` carry. */
|
|
165
|
+
declare function surfaceContentHash(surface: MutableSurface): string;
|
|
166
|
+
interface LoopProvenanceCandidate {
|
|
167
|
+
/** Generation index this candidate was proposed in. */
|
|
168
|
+
generation: number;
|
|
169
|
+
/** 16-char loop-identity fingerprint (matches `GenerationCandidate.surfaceHash`). */
|
|
170
|
+
surfaceHash: string;
|
|
171
|
+
/** Full sha256 content hash — byte-identical-verifiable. */
|
|
172
|
+
contentHash: string;
|
|
173
|
+
/** Driver label, when the driver returned a `ProposedCandidate`. */
|
|
174
|
+
label?: string;
|
|
175
|
+
/** Driver rationale — the "because Z". When the driver returned a bare
|
|
176
|
+
* surface (blind mutator) this is absent. */
|
|
177
|
+
rationale?: string;
|
|
178
|
+
/** Mean composite this candidate scored on the search split. */
|
|
179
|
+
composite: number;
|
|
180
|
+
/** Whether this candidate was promoted out of its generation. */
|
|
181
|
+
promoted: boolean;
|
|
182
|
+
}
|
|
183
|
+
interface LoopProvenanceBackend {
|
|
184
|
+
/** `assertRealBackend`-grade verdict over the worker call records. */
|
|
185
|
+
verdict: 'real' | 'mixed' | 'stub';
|
|
186
|
+
/** Number of worker LLM calls captured (the audit's "worker call count"). */
|
|
187
|
+
workerCallCount: number;
|
|
188
|
+
/** Distinct model ids observed across worker calls. */
|
|
189
|
+
models: string[];
|
|
190
|
+
totalInputTokens: number;
|
|
191
|
+
totalOutputTokens: number;
|
|
192
|
+
totalCostUsd: number;
|
|
193
|
+
}
|
|
194
|
+
/**
|
|
195
|
+
* The durable provenance record. Aligns to the hosted `EvalRunEvent` path but
|
|
196
|
+
* ADDS the rationale + the explicit baseline→candidate diff (both omitted from
|
|
197
|
+
* the bare hosted event) + backend provenance.
|
|
198
|
+
*/
|
|
199
|
+
interface LoopProvenanceRecord {
|
|
200
|
+
schema: 'tangle.loop-provenance.v1';
|
|
201
|
+
runId: string;
|
|
202
|
+
runDir: string;
|
|
203
|
+
timestamp: string;
|
|
204
|
+
/** Baseline + winner surface content hashes — distinguishable, byte-verifiable. */
|
|
205
|
+
baselineContentHash: string;
|
|
206
|
+
winnerContentHash: string;
|
|
207
|
+
/** Driver label/rationale for the promoted change. Absent ⇒ winner == baseline. */
|
|
208
|
+
winnerLabel?: string;
|
|
209
|
+
winnerRationale?: string;
|
|
210
|
+
/** The explicit baseline→winner unified diff the gate decided on. */
|
|
211
|
+
diff: string;
|
|
212
|
+
/** Every candidate across every generation, each carrying its rationale. */
|
|
213
|
+
candidates: LoopProvenanceCandidate[];
|
|
214
|
+
/** The gate verdict — decision + reasons + contributing gates + delta. */
|
|
215
|
+
gate: {
|
|
216
|
+
decision: GateDecision;
|
|
217
|
+
reasons: string[];
|
|
218
|
+
delta?: number;
|
|
219
|
+
contributingGates: Array<{
|
|
220
|
+
name: string;
|
|
221
|
+
passed: boolean;
|
|
222
|
+
}>;
|
|
223
|
+
};
|
|
224
|
+
/** baseline-on-holdout composite mean. */
|
|
225
|
+
baselineHoldoutComposite: number;
|
|
226
|
+
/** winner-on-holdout composite mean. */
|
|
227
|
+
winnerHoldoutComposite: number;
|
|
228
|
+
/** winnerHoldout - baselineHoldout — RECOMPUTABLE from this record. */
|
|
229
|
+
heldOutLift: number;
|
|
230
|
+
/** Backend provenance: stub-vs-real verdict + worker call count + models. */
|
|
231
|
+
backend: LoopProvenanceBackend;
|
|
232
|
+
totalCostUsd: number;
|
|
233
|
+
totalDurationMs: number;
|
|
234
|
+
}
|
|
235
|
+
interface BuildLoopProvenanceArgs<TArtifact, TScenario extends Scenario> {
|
|
236
|
+
runId: string;
|
|
237
|
+
runDir: string;
|
|
238
|
+
timestamp: string;
|
|
239
|
+
baselineSurface: MutableSurface;
|
|
240
|
+
winnerSurface: MutableSurface;
|
|
241
|
+
winnerLabel?: string;
|
|
242
|
+
winnerRationale?: string;
|
|
243
|
+
diff: string;
|
|
244
|
+
/** Per-generation candidate records straight off the loop result. */
|
|
245
|
+
generations: Array<{
|
|
246
|
+
generationIndex: number;
|
|
247
|
+
candidates: Array<{
|
|
248
|
+
surfaceHash: string;
|
|
249
|
+
composite: number;
|
|
250
|
+
label?: string;
|
|
251
|
+
rationale?: string;
|
|
252
|
+
}>;
|
|
253
|
+
promoted: string[];
|
|
254
|
+
/** Surfaces measured this generation, keyed positionally to candidates so
|
|
255
|
+
* the content hash can be computed from the real surface text. */
|
|
256
|
+
surfaces: Array<{
|
|
257
|
+
surfaceHash: string;
|
|
258
|
+
surface: MutableSurface;
|
|
259
|
+
}>;
|
|
260
|
+
}>;
|
|
261
|
+
gate: GateResult;
|
|
262
|
+
baselineOnHoldout: CampaignResult<TArtifact, TScenario>;
|
|
263
|
+
winnerOnHoldout: CampaignResult<TArtifact, TScenario>;
|
|
264
|
+
/** Worker call records — the source for backend provenance. */
|
|
265
|
+
workerRecords: ReadonlyArray<RunRecord>;
|
|
266
|
+
totalCostUsd: number;
|
|
267
|
+
totalDurationMs: number;
|
|
268
|
+
}
|
|
269
|
+
/** Build the durable provenance record from a completed loop result. */
|
|
270
|
+
declare function buildLoopProvenanceRecord<TArtifact, TScenario extends Scenario>(args: BuildLoopProvenanceArgs<TArtifact, TScenario>): LoopProvenanceRecord;
|
|
271
|
+
/**
|
|
272
|
+
* Build the loop's OTLP-ingestable spans from a provenance record. One root
|
|
273
|
+
* span per loop (`tangle.runId`), one span per generation, one span per
|
|
274
|
+
* candidate (carrying its surfaceHash + label), and one span for the gate
|
|
275
|
+
* decision (carrying reasons + delta + lift). Candidate + gate spans pivot on
|
|
276
|
+
* the same `tangle.runId` / `tangle.generation` attributes `/adapters/otel`
|
|
277
|
+
* reads, so the hosted collector reconstructs the full tree.
|
|
278
|
+
*
|
|
279
|
+
* Times are synthesized monotonically off a single base so the span tree is
|
|
280
|
+
* orderable; the substrate does not retain per-candidate wall-clock starts.
|
|
281
|
+
*/
|
|
282
|
+
declare function loopProvenanceSpans(record: LoopProvenanceRecord, opts?: {
|
|
283
|
+
baseTimeMs?: number;
|
|
284
|
+
}): TraceSpanEvent[];
|
|
285
|
+
/** Canonical durable paths under the run dir. */
|
|
286
|
+
declare function provenanceRecordPath(runDir: string): string;
|
|
287
|
+
declare function provenanceSpansPath(runDir: string): string;
|
|
288
|
+
interface EmitLoopProvenanceResult {
|
|
289
|
+
record: LoopProvenanceRecord;
|
|
290
|
+
spans: TraceSpanEvent[];
|
|
291
|
+
/** Absolute paths the record + spans were written to, when storage persists. */
|
|
292
|
+
recordPath: string;
|
|
293
|
+
spansPath: string;
|
|
294
|
+
}
|
|
295
|
+
interface EmitLoopProvenanceArgs<TArtifact, TScenario extends Scenario> extends BuildLoopProvenanceArgs<TArtifact, TScenario> {
|
|
296
|
+
/** Storage the record + spans are written through. */
|
|
297
|
+
storage: CampaignStorage;
|
|
298
|
+
/** When set, the spans are also shipped to the hosted `/v1/ingest/traces`
|
|
299
|
+
* endpoint so the collector receives the full loop, not just `cost.*`. */
|
|
300
|
+
hostedClient?: HostedClient;
|
|
301
|
+
}
|
|
302
|
+
/**
|
|
303
|
+
* Build the provenance record + OTel spans and persist them durably under the
|
|
304
|
+
* run dir (and ship spans to a hosted collector when one is wired). Returns
|
|
305
|
+
* both artifacts so the caller can assert on / re-derive from them.
|
|
306
|
+
*
|
|
307
|
+
* Fail-loud: the durable write throws on storage failure (a swallowed write is
|
|
308
|
+
* exactly the "emitted but lost" failure this closes). The hosted span ship is
|
|
309
|
+
* the one best-effort leg — its failure is logged, not thrown, so an offline
|
|
310
|
+
* collector never fails the loop (the durable artifact is the source of truth).
|
|
311
|
+
*/
|
|
312
|
+
declare function emitLoopProvenance<TArtifact, TScenario extends Scenario>(args: EmitLoopProvenanceArgs<TArtifact, TScenario>): Promise<EmitLoopProvenanceResult>;
|
|
313
|
+
|
|
314
|
+
export { type BuildLoopProvenanceArgs as B, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, type HeldOutGateOptions as H, type LoopProvenanceRecord as L, type RunEvalOptions as R, type EmitLoopProvenanceArgs as a, type EmitLoopProvenanceResult as b, composeGate as c, defaultProductionGate as d, evolutionaryDriver as e, type LoopProvenanceBackend as f, type LoopProvenanceCandidate as g, heldOutGate as h, buildLoopProvenanceRecord as i, emitLoopProvenance as j, provenanceSpansPath as k, loopProvenanceSpans as l, provenanceRecordPath as p, runEval as r, surfaceContentHash as s };
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { b as LlmCallRequest, c as LlmCallResult } from './llm-client-DbjLfz-K.js';
|
|
2
2
|
import { R as RunRecord } from './run-record-BgTFzO2r.js';
|
|
3
3
|
import { T as TraceAnalysisStore } from './store-jzKpMl16.js';
|
|
4
|
-
import {
|
|
4
|
+
import { a as JudgeInput } from './types-Croy5h7V.js';
|
|
5
5
|
|
|
6
6
|
/**
|
|
7
7
|
* ChatClient — the single LLM abstraction analysts call.
|
|
@@ -454,4 +454,4 @@ declare class AnalystRegistry {
|
|
|
454
454
|
private routeInput;
|
|
455
455
|
}
|
|
456
456
|
|
|
457
|
-
export { AnalystRegistry as A, type BudgetPolicy as B, type
|
|
457
|
+
export { AnalystRegistry as A, type BudgetPolicy as B, type ChatRequest as C, type DirectProviderTransportOpts as D, type EvidenceRef as E, type MockTransportOpts as M, type RegistryRunOpts as R, type SandboxSdkTransportOpts as S, type Analyst as a, type AnalystSeverity as b, type AnalystFinding as c, type AnalystCost as d, type AnalystContext as e, type CreateChatClientOpts as f, type AnalystHooks as g, type AnalystInputKind as h, type AnalystRegistryOptions as i, type AnalystRequirements as j, type AnalystRunEvent as k, type AnalystRunInputs as l, type AnalystRunResult as m, type AnalystRunSummary as n, type ChatCallOpts as o, type ChatClient as p, type ChatResponse as q, type ChatTransport as r, type CliBridgeTransportOpts as s, type RouterTransportOpts as t, computeFindingId as u, createChatClient as v, makeFinding as w };
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-B2kL-fSM.js';
|
|
2
|
+
import { m as GateDecision } from './summary-report-ByiOUrHj.js';
|
|
3
|
+
import { R as RunRecord, b as RunSplitTag } from './run-record-BgTFzO2r.js';
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Release confidence gate.
|
|
7
|
+
*
|
|
8
|
+
* This is the production-facing composition layer over the lower-level
|
|
9
|
+
* primitives:
|
|
10
|
+
* - Dataset manifests prove corpus/version coverage.
|
|
11
|
+
* - RunRecord rows prove reproducible search/holdout outcomes.
|
|
12
|
+
* - Multi-shot trace evidence carries turn counts and ASI diagnostics.
|
|
13
|
+
* - HeldOutGate decisions remain the paired promotion authority.
|
|
14
|
+
*
|
|
15
|
+
* The gate is intentionally pure and conservative. Missing declared evidence
|
|
16
|
+
* fails closed instead of being treated as a neutral zero.
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
/** Severity of an actionable finding attached to a run/trace. */
|
|
20
|
+
type AsiSeverity = 'info' | 'warning' | 'error' | 'critical';
|
|
21
|
+
/** Actionable side-info — a diagnosed finding the loop can act on. */
|
|
22
|
+
interface ActionableSideInfo {
|
|
23
|
+
/** Stable expectation/check id when available. */
|
|
24
|
+
expectationId?: string;
|
|
25
|
+
/** Human-readable diagnosis of what happened. */
|
|
26
|
+
message: string;
|
|
27
|
+
severity?: AsiSeverity;
|
|
28
|
+
/** Concrete trace excerpt, file path, tool call, screenshot id, etc. */
|
|
29
|
+
evidence?: string;
|
|
30
|
+
/** Prompt/tool/context surface likely responsible. */
|
|
31
|
+
responsibleSurface?: string;
|
|
32
|
+
/** Suggested fix in natural language. */
|
|
33
|
+
suggestion?: string;
|
|
34
|
+
/** Whether this expectation was satisfied. Defaults to false for ASI rows. */
|
|
35
|
+
matched?: boolean;
|
|
36
|
+
metadata?: Record<string, unknown>;
|
|
37
|
+
}
|
|
38
|
+
type ReleaseConfidenceStatus = 'pass' | 'warn' | 'fail';
|
|
39
|
+
type ReleaseConfidenceAxisName = 'corpus' | 'quality' | 'generalization' | 'diagnostics' | 'efficiency';
|
|
40
|
+
interface ReleaseTraceEvidence {
|
|
41
|
+
scenarioId: string;
|
|
42
|
+
candidateId?: string;
|
|
43
|
+
split?: RunSplitTag;
|
|
44
|
+
score?: number;
|
|
45
|
+
ok?: boolean;
|
|
46
|
+
turnCount?: number;
|
|
47
|
+
costUsd?: number;
|
|
48
|
+
durationMs?: number;
|
|
49
|
+
failureMode?: string;
|
|
50
|
+
asi?: ActionableSideInfo[];
|
|
51
|
+
metadata?: Record<string, unknown>;
|
|
52
|
+
}
|
|
53
|
+
interface ReleaseConfidenceThresholds {
|
|
54
|
+
/** Require a Dataset manifest or explicit scenarios. Default true. */
|
|
55
|
+
requireCorpus?: boolean;
|
|
56
|
+
minScenarioCount?: number;
|
|
57
|
+
minSearchRuns?: number;
|
|
58
|
+
minHoldoutRuns?: number;
|
|
59
|
+
/** Require at least one holdout scenario/run. Default true. */
|
|
60
|
+
requireHoldout?: boolean;
|
|
61
|
+
minPassRate?: number;
|
|
62
|
+
minMeanScore?: number;
|
|
63
|
+
/** Search mean may exceed holdout mean by at most this much. */
|
|
64
|
+
maxOverfitGap?: number;
|
|
65
|
+
maxMeanCostUsd?: number;
|
|
66
|
+
maxP95WallMs?: number;
|
|
67
|
+
/** Low-score/failed rows must carry ASI. Default true. */
|
|
68
|
+
requireAsiForFailures?: boolean;
|
|
69
|
+
/** Score below this is considered a failure for ASI coverage. Default 0.5. */
|
|
70
|
+
failureScoreThreshold?: number;
|
|
71
|
+
}
|
|
72
|
+
interface ReleaseConfidenceInput {
|
|
73
|
+
target: string;
|
|
74
|
+
candidateId?: string;
|
|
75
|
+
baselineId?: string;
|
|
76
|
+
dataset?: DatasetManifest;
|
|
77
|
+
scenarios?: readonly DatasetScenario[];
|
|
78
|
+
runs?: readonly RunRecord[];
|
|
79
|
+
traces?: readonly ReleaseTraceEvidence[];
|
|
80
|
+
gateDecision?: GateDecision | null;
|
|
81
|
+
thresholds?: ReleaseConfidenceThresholds;
|
|
82
|
+
}
|
|
83
|
+
interface ReleaseConfidenceAxis {
|
|
84
|
+
name: ReleaseConfidenceAxisName;
|
|
85
|
+
status: ReleaseConfidenceStatus;
|
|
86
|
+
score: number;
|
|
87
|
+
detail: string;
|
|
88
|
+
}
|
|
89
|
+
interface ReleaseConfidenceIssue {
|
|
90
|
+
axis: ReleaseConfidenceAxisName;
|
|
91
|
+
severity: 'critical' | 'warning';
|
|
92
|
+
code: string;
|
|
93
|
+
detail: string;
|
|
94
|
+
}
|
|
95
|
+
interface ReleaseConfidenceMetrics {
|
|
96
|
+
scenarioCount: number;
|
|
97
|
+
searchRuns: number;
|
|
98
|
+
holdoutRuns: number;
|
|
99
|
+
passRate: number;
|
|
100
|
+
meanScore: number;
|
|
101
|
+
searchMeanScore: number;
|
|
102
|
+
holdoutMeanScore: number;
|
|
103
|
+
overfitGap: number;
|
|
104
|
+
meanCostUsd: number;
|
|
105
|
+
p95WallMs: number;
|
|
106
|
+
failedRows: number;
|
|
107
|
+
failuresWithAsi: number;
|
|
108
|
+
singleShotTraces: number;
|
|
109
|
+
multiShotTraces: number;
|
|
110
|
+
splitCounts: Record<DatasetSplit, number>;
|
|
111
|
+
domainCounts: Record<string, number>;
|
|
112
|
+
failureModeCounts: Record<string, number>;
|
|
113
|
+
responsibleSurfaceCounts: Record<string, number>;
|
|
114
|
+
}
|
|
115
|
+
interface ReleaseConfidenceScorecard {
|
|
116
|
+
target: string;
|
|
117
|
+
candidateId: string | null;
|
|
118
|
+
baselineId: string | null;
|
|
119
|
+
status: ReleaseConfidenceStatus;
|
|
120
|
+
promote: boolean;
|
|
121
|
+
axes: ReleaseConfidenceAxis[];
|
|
122
|
+
issues: ReleaseConfidenceIssue[];
|
|
123
|
+
metrics: ReleaseConfidenceMetrics;
|
|
124
|
+
dataset: DatasetManifest | null;
|
|
125
|
+
gateDecision: GateDecision | null;
|
|
126
|
+
summary: string;
|
|
127
|
+
}
|
|
128
|
+
declare function evaluateReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
|
|
129
|
+
declare function assertReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
|
|
130
|
+
|
|
131
|
+
/**
|
|
132
|
+
* Bootstrap-CI promotion gate.
|
|
133
|
+
*
|
|
134
|
+
* In any iterative-improvement loop (GEPA, prompt evolution, dataset
|
|
135
|
+
* curation), the question is "did this generation actually improve, or are
|
|
136
|
+
* we celebrating noise?". With small N and noisy outcomes, point-estimate
|
|
137
|
+
* deltas lie. Bootstrap confidence intervals tell the operator whether the
|
|
138
|
+
* delta is real before code or prompts get promoted.
|
|
139
|
+
*
|
|
140
|
+
* This module is pure functions — no I/O, no model calls. Easy to unit-test
|
|
141
|
+
* and to compose into any verdict gate.
|
|
142
|
+
*
|
|
143
|
+
* Default gate:
|
|
144
|
+
* - Bootstrap mean baseline vs candidate (1k resamples).
|
|
145
|
+
* - Compute the delta distribution; pass if the lower CI bound > 0.
|
|
146
|
+
* - Tunable confidence (default 95%) and resample count.
|
|
147
|
+
*
|
|
148
|
+
* Verdict semantics intentionally match the existing `experiments.jsonl`
|
|
149
|
+
* vocabulary:
|
|
150
|
+
* - ADVANCE: candidate's CI lower bound > baseline mean (real win)
|
|
151
|
+
* - KEEP: overlap, but candidate point estimate >= baseline (neutral)
|
|
152
|
+
* - REVERT: candidate's CI upper bound < baseline mean (real regression)
|
|
153
|
+
* - INCONCLUSIVE: not enough samples or CI straddles zero with no signal
|
|
154
|
+
*/
|
|
155
|
+
type Verdict = 'ADVANCE' | 'KEEP' | 'REVERT' | 'INCONCLUSIVE';
|
|
156
|
+
interface BootstrapResult {
|
|
157
|
+
baselineMean: number;
|
|
158
|
+
candidateMean: number;
|
|
159
|
+
/** candidateMean - baselineMean, point estimate. */
|
|
160
|
+
delta: number;
|
|
161
|
+
/** Lower bound of the (1 - alpha) CI on the delta. */
|
|
162
|
+
ciLower: number;
|
|
163
|
+
/** Upper bound of the (1 - alpha) CI on the delta. */
|
|
164
|
+
ciUpper: number;
|
|
165
|
+
/** Number of bootstrap resamples used. */
|
|
166
|
+
iterations: number;
|
|
167
|
+
alpha: number;
|
|
168
|
+
verdict: Verdict;
|
|
169
|
+
}
|
|
170
|
+
interface BootstrapOptions {
|
|
171
|
+
/** Significance level alpha (default 0.05 → 95% CI). */
|
|
172
|
+
alpha?: number;
|
|
173
|
+
/** Number of resamples (default 1000). */
|
|
174
|
+
iterations?: number;
|
|
175
|
+
/**
|
|
176
|
+
* Minimum total samples (baseline + candidate) below which we always
|
|
177
|
+
* return INCONCLUSIVE — bootstrap with too few samples is meaningless.
|
|
178
|
+
* Default 6 (combined).
|
|
179
|
+
*/
|
|
180
|
+
minTotalSamples?: number;
|
|
181
|
+
/** RNG seed for reproducibility. When omitted, resampling falls back to Math.random (non-deterministic). */
|
|
182
|
+
seed?: number;
|
|
183
|
+
}
|
|
184
|
+
/**
|
|
185
|
+
* Compute the bootstrap CI on (candidateMean - baselineMean) and a verdict.
|
|
186
|
+
*
|
|
187
|
+
* Uses simple percentile bootstrap on the difference of resampled means.
|
|
188
|
+
* That's the standard non-parametric primitive — no distributional
|
|
189
|
+
* assumptions, robust to skew, easy to reason about.
|
|
190
|
+
*/
|
|
191
|
+
declare function bootstrapCi(baseline: number[], candidate: number[], options?: BootstrapOptions): BootstrapResult;
|
|
192
|
+
/**
|
|
193
|
+
* Judge-replay promotion gate.
|
|
194
|
+
*
|
|
195
|
+
* The cheap inner-loop judge that drives an evolution run is by definition
|
|
196
|
+
* fast and noisy. When you're about to promote a winning variant to the
|
|
197
|
+
* canonical default, you want a STRONGER judge (a more expensive model, a
|
|
198
|
+
* human grader, a separately-trained reward model) to confirm the win
|
|
199
|
+
* generalises beyond the inner loop.
|
|
200
|
+
*
|
|
201
|
+
* This helper takes raw winner + baseline outputs, scores both through the
|
|
202
|
+
* stronger judge, and applies `bootstrapCi`. ADVANCE means the stronger
|
|
203
|
+
* judge agrees the winner is real with the configured confidence. Doesn't
|
|
204
|
+
* matter what shape your "output" is — pass a string, an object, anything
|
|
205
|
+
* the judge can read.
|
|
206
|
+
*/
|
|
207
|
+
interface JudgeReplayGateArgs<TOutput> {
|
|
208
|
+
baselineOutputs: TOutput[];
|
|
209
|
+
candidateOutputs: TOutput[];
|
|
210
|
+
/** Stronger judge — may be async to allow LLM calls. Returns a 0..N scalar score. */
|
|
211
|
+
judge: (output: TOutput) => Promise<number> | number;
|
|
212
|
+
alpha?: number;
|
|
213
|
+
iterations?: number;
|
|
214
|
+
/** RNG seed for reproducibility. */
|
|
215
|
+
seed?: number;
|
|
216
|
+
/** Maximum concurrent judge calls. Default 4. */
|
|
217
|
+
judgeConcurrency?: number;
|
|
218
|
+
}
|
|
219
|
+
declare function judgeReplayGate<TOutput>(args: JudgeReplayGateArgs<TOutput>): Promise<BootstrapResult & {
|
|
220
|
+
baselineSamples: number;
|
|
221
|
+
candidateSamples: number;
|
|
222
|
+
}>;
|
|
223
|
+
|
|
224
|
+
interface RenderReleaseReportOptions {
|
|
225
|
+
title?: string;
|
|
226
|
+
runs?: readonly RunRecord[];
|
|
227
|
+
comparator?: string;
|
|
228
|
+
traceAnalystFindings?: readonly string[];
|
|
229
|
+
nextActions?: readonly string[];
|
|
230
|
+
}
|
|
231
|
+
declare function renderReleaseReport(scorecard: ReleaseConfidenceScorecard, options?: RenderReleaseReportOptions): string;
|
|
232
|
+
|
|
233
|
+
export { type ActionableSideInfo as A, type BootstrapOptions as B, type JudgeReplayGateArgs as J, type ReleaseConfidenceAxis as R, type Verdict as V, type BootstrapResult as a, type ReleaseConfidenceAxisName as b, type ReleaseConfidenceInput as c, type ReleaseConfidenceIssue as d, type ReleaseConfidenceMetrics as e, type ReleaseConfidenceScorecard as f, type ReleaseConfidenceStatus as g, type ReleaseConfidenceThresholds as h, type ReleaseTraceEvidence as i, type RenderReleaseReportOptions as j, assertReleaseConfidence as k, bootstrapCi as l, evaluateReleaseConfidence as m, judgeReplayGate as n, type AsiSeverity as o, renderReleaseReport as r };
|
package/dist/reporting.d.ts
CHANGED
|
@@ -1,14 +1,15 @@
|
|
|
1
1
|
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-D_4BSXGV.js';
|
|
2
|
-
export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs,
|
|
2
|
+
export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, R as ReleaseConfidenceAxis, b as ReleaseConfidenceAxisName, c as ReleaseConfidenceInput, d as ReleaseConfidenceIssue, e as ReleaseConfidenceMetrics, f as ReleaseConfidenceScorecard, g as ReleaseConfidenceStatus, h as ReleaseConfidenceThresholds, i as ReleaseTraceEvidence, j as RenderReleaseReportOptions, V as Verdict, k as assertReleaseConfidence, l as bootstrapCi, m as evaluateReleaseConfidence, n as judgeReplayGate, r as renderReleaseReport } from './release-report-CN8hJlhk.js';
|
|
3
3
|
export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
|
|
4
|
+
export { P as PairedBootstrapOptions, a as PairedBootstrapResult, b as benjaminiHochberg, p as pairedBootstrap, w as wilcoxonSignedRank } from './statistics-B7yCbi9i.js';
|
|
4
5
|
export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-ByiOUrHj.js';
|
|
5
6
|
import './run-record-BgTFzO2r.js';
|
|
6
7
|
import './errors-Dwqw-T_m.js';
|
|
7
8
|
import './schema-m0gsnbt3.js';
|
|
8
9
|
import './outcome-store-D6KWmYvj.js';
|
|
10
|
+
import './dataset-B2kL-fSM.js';
|
|
9
11
|
import './judge-calibration-DilmB3Ml.js';
|
|
10
|
-
import './types-DhqpAi_z.js';
|
|
12
|
+
import './types-Croy5h7V.js';
|
|
11
13
|
import '@tangle-network/tcloud';
|
|
12
|
-
import './dataset-B2kL-fSM.js';
|
|
13
14
|
import './failure-cluster-CL7IVgkJ.js';
|
|
14
15
|
import './store-CKUAgsJz.js';
|
|
@@ -1,11 +1,10 @@
|
|
|
1
1
|
import {
|
|
2
2
|
runCampaign
|
|
3
|
-
} from "./chunk-7TPYV2ER.js";
|
|
4
|
-
import "./chunk-E22YUOAL.js";
|
|
3
|
+
} from "./chunk-6XQIEUQ2.js";
|
|
5
4
|
import "./chunk-ITBRCT73.js";
|
|
6
5
|
import "./chunk-3BFEG2F6.js";
|
|
7
6
|
import "./chunk-PZ5AY32C.js";
|
|
8
7
|
export {
|
|
9
8
|
runCampaign
|
|
10
9
|
};
|
|
11
|
-
//# sourceMappingURL=run-campaign-5J3ED2UJ.js.map
|
|
10
|
+
//# sourceMappingURL=run-campaign-BVY3RGAZ.js.map
|