@tangle-network/agent-eval 0.36.0 → 0.38.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,276 @@
1
+ // src/matrix/aggregation.ts
2
/**
 * Expand `[{ cell, runs }]` records into one `{ cell, result }` row per run.
 *
 * Records whose `runs` list is empty contribute no rows. Row order follows
 * record order, then run order within each record.
 *
 * @param {Iterable<{cell: object, runs: Iterable<object>}>} cells - Cell records.
 * @returns {Array<{cell: object, result: object}>} Flat list of rows.
 */
function flattenRuns(cells) {
  return [...cells].flatMap(({ cell, runs }) =>
    Array.from(runs, (result) => ({ cell, result })),
  );
}
9
/**
 * Linearly interpolated quantile of an ascending-sorted numeric array.
 *
 * @param {number[]} sorted - Values already sorted in ascending order.
 * @param {number} q - Quantile as a fraction (the callers in this file pass 0.5 and 0.9).
 * @returns {number} 0 for an empty array, the sole element for a one-element
 *   array, otherwise the value interpolated between the two neighbouring ranks.
 */
function quantile(sorted, q) {
  const count = sorted.length;
  switch (count) {
    case 0:
      return 0;
    case 1:
      return sorted[0];
    default:
      break;
  }
  // Fractional rank in [0, count - 1]; its neighbours are the two samples
  // we blend between.
  const rank = (count - 1) * q;
  const below = Math.floor(rank);
  const above = Math.ceil(rank);
  const weight = rank - below;
  return below === above
    ? sorted[below]
    : sorted[below] * (1 - weight) + sorted[above] * weight;
}
19
/**
 * Collapse a set of `{ cell, result }` rows into one summary record for a
 * single axis value.
 *
 * A run counts as errored when `result.error` is defined: it scores 0 and
 * never counts as a pass, and its `verdict` is not read. Cost and duration
 * are summed for every row, errored or not.
 *
 * @param {Array<{result: object}>} rows - Rows belonging to this axis value.
 * @param {string} axisName - Name of the axis being summarised.
 * @param {string} axisValue - Display label for the axis value.
 * @returns {{axisName: string, axisValue: string, cells: number,
 *   passRate: number, meanScore: number, p50Score: number, p90Score: number,
 *   totalCostUsd: number, meanDurationMs: number}} All-zero metrics when
 *   `rows` is empty.
 */
function summariseRows(rows, axisName, axisValue) {
  const total = rows.length;
  if (total === 0) {
    return {
      axisName,
      axisValue,
      cells: 0,
      passRate: 0,
      meanScore: 0,
      p50Score: 0,
      p90Score: 0,
      totalCostUsd: 0,
      meanDurationMs: 0
    };
  }

  const succeeded = (result) => result.error === void 0;
  const scores = rows.map(({ result }) => (succeeded(result) ? result.verdict.score : 0));
  const passes = rows.filter(({ result }) => succeeded(result) && result.verdict.valid).length;

  // Sum in row order before sorting so the floating-point totals do not
  // depend on the sorted order.
  const scoreTotal = scores.reduce((acc, score) => acc + score, 0);
  const costTotal = rows.reduce((acc, { result }) => acc + result.costUsd, 0);
  const durationTotal = rows.reduce((acc, { result }) => acc + result.durationMs, 0);

  scores.sort((a, b) => a - b);

  return {
    axisName,
    axisValue,
    cells: total,
    passRate: passes / total,
    meanScore: scoreTotal / total,
    p50Score: quantile(scores, 0.5),
    p90Score: quantile(scores, 0.9),
    totalCostUsd: costTotal,
    meanDurationMs: durationTotal / total
  };
}
61
/**
 * Group rows by the id of their value on one axis and summarise each group.
 *
 * Rows whose cell has no (truthy) entry for `axisName` are ignored. Buckets
 * are written to the result in default-sorted id order so the output is
 * deterministic.
 *
 * @param {Iterable<{cell: {axes: object}, result: object}>} rows - Flattened run rows.
 * @param {string} axisName - Axis to group on.
 * @param {(id: string) => string} labelFor - Maps a value id to its display label.
 * @returns {Object<string, object>} Summary per axis-value id.
 */
function bucketBy(rows, axisName, labelFor) {
  const grouped = /* @__PURE__ */ new Map();
  for (const row of rows) {
    const entry = row.cell.axes[axisName];
    if (entry) {
      const members = grouped.get(entry.id);
      if (members) {
        members.push(row);
      } else {
        grouped.set(entry.id, [row]);
      }
    }
  }

  const summaries = {};
  Array.from(grouped.keys())
    .sort()
    .forEach((id) => {
      summaries[id] = summariseRows(grouped.get(id), axisName, labelFor(id));
    });
  return summaries;
}
80
/**
 * Build the per-axis summary table for the axes named in `aggregateBy`.
 *
 * @param {Array<{cell: object, runs: object[]}>} cells - Cell records with their runs.
 * @param {Array<{name: string, values: Array<{id: string, value: *}>, label?: Function}>} axes
 *   Axis definitions; an axis' optional `label(value, id)` produces display labels.
 * @param {Iterable<string>} aggregateBy - Names of the axes to summarise.
 * @returns {Object<string, Object<string, object>>} Axis name → value id → summary.
 */
function buildByAxis(cells, axes, aggregateBy) {
  const flatRows = flattenRuns(cells);
  const axisIndex = new Map(axes.map((definition) => [definition.name, definition]));
  const table = {};
  for (const axisName of aggregateBy) {
    const definition = axisIndex.get(axisName);
    // Fall back to the raw id when the axis is unknown, has no `label`
    // function, or does not list the id among its values.
    const resolveLabel = (id) => {
      const match = definition?.label
        ? definition.values.find((candidate) => candidate.id === id)
        : void 0;
      return match ? definition.label(match.value, id) : id;
    };
    table[axisName] = bucketBy(flatRows, axisName, resolveLabel);
  }
  return table;
}
96
+
97
+ // src/matrix/runner.ts
98
/**
 * Enumerate the cartesian product of all axis values.
 *
 * An empty `axes` array yields a single cell with no axes; any axis with zero
 * values collapses the product to no cells. Cells are emitted odometer-style
 * with the left-most axis varying fastest.
 *
 * @param {Array<{name: string, values: Array<{id: string, value: *}>}>} axes - Axis definitions.
 * @returns {Array<{axes: Object<string, {id: string, value: *}>}>} One entry per combination.
 */
function cartesian(axes) {
  const sizes = axes.map((axis) => axis.values.length);
  // The product is 1 for no axes and 0 as soon as any axis is empty.
  const total = sizes.reduce((product, size) => product * size, 1);
  const cells = [];
  for (let n = 0; n < total; n++) {
    // Decode `n` as a mixed-radix number whose least significant digit
    // belongs to the first axis.
    const slot = {};
    let rest = n;
    axes.forEach((axis, position) => {
      const size = sizes[position];
      const { id, value } = axis.values[rest % size];
      slot[axis.name] = { id, value };
      rest = Math.floor(rest / size);
    });
    cells.push({ axes: slot });
  }
  return cells;
}
126
/**
 * Produce an id of the form `mtx_<base36 timestamp>_<8 hex chars>`.
 *
 * The suffix comes from `Math.random()`, so the id is not cryptographically
 * unpredictable.
 *
 * @returns {string} The generated id.
 */
function makeMatrixId() {
  const stamp = Date.now().toString(36);
  const suffix = Array.from({ length: 8 }, () =>
    Math.floor(Math.random() * 16).toString(16),
  ).join("");
  return `mtx_${stamp}_${suffix}`;
}
132
/**
 * Wrap a thrown value in a failed, zero-cost cell result.
 *
 * @param {*} err - Whatever was thrown; it does not have to be an `Error`.
 * @returns {{output: undefined, verdict: {valid: boolean, score: number},
 *   costUsd: number, durationMs: number, error: {message: string, kind: string}}}
 *   `error.message` is `err.message` when that is a string, otherwise
 *   `String(err)`; `error.kind` is `err.name` when that is a string,
 *   otherwise `"Error"`.
 */
function makeErrorResult(err) {
  const message = typeof err?.message === "string" ? err.message : String(err);
  const kind = typeof err?.name === "string" ? err.name : "Error";
  return {
    output: void 0,
    verdict: { valid: false, score: 0 },
    costUsd: 0,
    durationMs: 0,
    error: { message, kind }
  };
}
145
/**
 * Run every planned cell (cartesian product of `opts.axes`, optionally
 * filtered, times `reps`) through `opts.runCell` with bounded concurrency,
 * then aggregate the results.
 *
 * Scheduling is a sliding window: at most `maxConcurrency` cells are in
 * flight. No new cell is admitted once the cost ceiling is reached or
 * `opts.signal` is aborted; cells already in flight are allowed to finish.
 *
 * @param {object} opts
 * @param {Array<{name: string, values: Array<{id: string, value: *}>}>} opts.axes
 *   Axis definitions to expand.
 * @param {(cell: {axes: object, rep: number, ordinal: number}) => Promise<object>} opts.runCell
 *   Executes one cell. A throw or rejection is recorded as an error result.
 * @param {number} [opts.reps=1] - Repetitions per combination (at least 1).
 * @param {number} [opts.maxConcurrency=4] - Maximum in-flight cells (at least 1).
 * @param {number} [opts.costCeiling=Infinity] - Stop admitting cells once the
 *   cumulative `costUsd` reaches this value.
 * @param {string[]} [opts.aggregateBy] - Axis names to summarise; defaults to all axes.
 * @param {(cell: {axes: object}) => boolean} [opts.filter] - Keep only combinations
 *   for which this returns truthy.
 * @param {AbortSignal} [opts.signal] - Aborting stops further scheduling.
 * @param {(cell: object, result: object) => void} [opts.onCellComplete]
 *   Observational callback; its failures are ignored.
 * @returns {Promise<{cells: Array<{cell: object, runs: object[]}>, byAxis: object,
 *   summary: object, matrixId: string}>} `cells` is sorted by `cell.ordinal`.
 */
async function runAgentMatrix(opts) {
  const startedAt = Date.now();

  // `Math.max(1, NaN)` is NaN. A NaN `maxConcurrency` would make the
  // admission test `inFlight < maxConcurrency` permanently false (the run
  // would never finish), and a NaN `reps` would silently plan zero cells.
  // Coerce the same way `Math.max` does, fall back to the default on NaN, and
  // round fractions up, which matches how many iterations / slots a
  // fractional bound admitted before.
  const positiveCount = (value, fallback) => {
    const n = Number(value ?? fallback);
    return Number.isNaN(n) ? fallback : Math.max(1, Math.ceil(n));
  };
  const reps = positiveCount(opts.reps, 1);
  const maxConcurrency = positiveCount(opts.maxConcurrency, 4);
  const costCeiling = opts.costCeiling ?? Number.POSITIVE_INFINITY;
  const aggregateBy = opts.aggregateBy ?? opts.axes.map((a) => a.name);

  const base = cartesian(opts.axes);
  const filtered = opts.filter ? base.filter((c) => opts.filter(c)) : base;
  const filteredOut = base.length - filtered.length;

  // Expand each surviving combination `reps` times; `ordinal` fixes the final
  // output order regardless of completion order.
  const planned = [];
  for (let i = 0; i < filtered.length; i++) {
    for (let r = 0; r < reps; r++) {
      planned.push({
        axes: filtered[i].axes,
        rep: r,
        ordinal: i * reps + r
      });
    }
  }

  const cellRecords = [];
  let cumulativeCost = 0;
  let costCeilingReached = false;
  let runsExecuted = 0;
  let cellsUnscheduled = 0;
  const aborted = () => opts.signal?.aborted === true;

  let inFlight = 0;
  let cursor = 0;
  let resolveAll;
  const done = new Promise((res) => {
    resolveAll = res;
  });

  const pump = () => {
    while (inFlight < maxConcurrency && cursor < planned.length) {
      if (aborted() || costCeilingReached) {
        // Everything not yet started is counted as unscheduled.
        const left = planned.length - cursor;
        cellsUnscheduled += left;
        cursor = planned.length;
        break;
      }
      const cell = planned[cursor++];
      inFlight++;
      const record = { cell, runs: [] };
      cellRecords.push(record);
      // This promise never rejects: executor failures become error results.
      const promise = (async () => {
        try {
          return await opts.runCell(cell);
        } catch (err) {
          return makeErrorResult(err);
        }
      })();
      promise.then((raw) => {
        // A `runCell` that resolves to `undefined`/`null`/a primitive used to
        // make this callback throw on `result.costUsd`, leaving the slot
        // occupied and the error unhandled. Record it as a failed run instead.
        const result = raw !== null && typeof raw === "object"
          ? raw
          : makeErrorResult(new TypeError("runCell resolved to a non-object result"));
        record.runs.push(result);
        runsExecuted++;
        // Only well-formed costs feed the ceiling: one missing or NaN
        // `costUsd` would otherwise turn the running total into NaN and
        // disable the ceiling for the rest of the run.
        const cost = result.costUsd;
        if (typeof cost === "number" && !Number.isNaN(cost)) cumulativeCost += cost;
        if (cumulativeCost >= costCeiling && !costCeilingReached) {
          costCeilingReached = true;
          console.warn("[matrix] cost ceiling reached");
        }
        try {
          const pending = opts.onCellComplete?.(cell, result);
          // An async callback reports failure by rejecting, which the
          // surrounding try/catch cannot see; detach it so it cannot surface
          // as an unhandled rejection.
          if (pending && typeof pending.then === "function") {
            pending.then(void 0, () => {});
          }
        } catch {
          // onCellComplete is observational: a throwing callback must not
          // fail the run.
        }
        inFlight--;
        if (cursor < planned.length) {
          pump();
        } else if (inFlight === 0) {
          resolveAll?.();
        }
      });
    }
    if (cursor >= planned.length && inFlight === 0) resolveAll?.();
  };

  const onAbort = () => {
    // Stop scheduling; in-flight cells are left to finish.
    if (cursor < planned.length) {
      cellsUnscheduled += planned.length - cursor;
      cursor = planned.length;
    }
    if (inFlight === 0) resolveAll?.();
  };
  if (opts.signal) {
    if (opts.signal.aborted) {
      // Aborted before we started: nothing is scheduled at all.
      cellsUnscheduled = planned.length;
      cursor = planned.length;
      resolveAll?.();
    } else {
      opts.signal.addEventListener("abort", onAbort, { once: true });
    }
  }

  if (planned.length === 0) {
    resolveAll?.();
  } else {
    pump();
  }
  await done;
  if (opts.signal) opts.signal.removeEventListener("abort", onAbort);

  cellRecords.sort((a, b) => a.cell.ordinal - b.cell.ordinal);

  // Overall metrics: errored runs score 0 and never pass.
  let pass = 0;
  let scoreSum = 0;
  let totalCost = 0;
  let runCount = 0;
  for (const { runs } of cellRecords) {
    for (const r of runs) {
      runCount++;
      const errored = r.error !== void 0;
      if (!errored && r.verdict.valid) pass++;
      scoreSum += errored ? 0 : r.verdict.score;
      totalCost += r.costUsd;
    }
  }

  const byAxis = buildByAxis(cellRecords, opts.axes, aggregateBy);
  return {
    cells: cellRecords,
    byAxis,
    summary: {
      totalCells: planned.length,
      runsExecuted,
      cellsSkipped: cellsUnscheduled + filteredOut * reps,
      overallPassRate: runCount === 0 ? 0 : pass / runCount,
      overallMeanScore: runCount === 0 ? 0 : scoreSum / runCount,
      totalCostUsd: totalCost,
      durationMs: Date.now() - startedAt
    },
    matrixId: makeMatrixId()
  };
}
270
+
271
+ export {
272
+ summariseRows,
273
+ buildByAxis,
274
+ runAgentMatrix
275
+ };
276
+ //# sourceMappingURL=chunk-QWV226SL.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/matrix/aggregation.ts","../src/matrix/runner.ts"],"sourcesContent":["/**\n * Per-axis aggregation of cell runs into `AxisSummary` rows.\n *\n * Pure: consumes the final `cells: [{cell, runs}]` array and returns the\n * `byAxis` table. Error runs contribute 0 to passRate and meanScore. Cost\n * and duration always count — the budget was spent regardless.\n */\n\nimport type { AxisSummary, CellResult, MatrixAxis, MatrixCell, MatrixResult } from './types'\n\ninterface Row<Output> {\n cell: MatrixCell\n result: CellResult<Output>\n}\n\nfunction flattenRuns<Output>(cells: MatrixResult<Output>['cells']): Row<Output>[] {\n const rows: Row<Output>[] = []\n for (const { cell, runs } of cells) {\n for (const result of runs) rows.push({ cell, result })\n }\n return rows\n}\n\nfunction quantile(sorted: number[], q: number): number {\n if (sorted.length === 0) return 0\n if (sorted.length === 1) return sorted[0] as number\n const pos = (sorted.length - 1) * q\n const lo = Math.floor(pos)\n const hi = Math.ceil(pos)\n if (lo === hi) return sorted[lo] as number\n const frac = pos - lo\n return (sorted[lo] as number) * (1 - frac) + (sorted[hi] as number) * frac\n}\n\nexport function summariseRows<Output>(\n rows: Row<Output>[],\n axisName: string,\n axisValue: string,\n): AxisSummary {\n if (rows.length === 0) {\n return {\n axisName,\n axisValue,\n cells: 0,\n passRate: 0,\n meanScore: 0,\n p50Score: 0,\n p90Score: 0,\n totalCostUsd: 0,\n meanDurationMs: 0,\n }\n }\n let pass = 0\n let scoreSum = 0\n let costSum = 0\n let durSum = 0\n const scores: number[] = []\n for (const { result } of rows) {\n const errored = result.error !== undefined\n const score = errored ? 
0 : result.verdict.score\n const valid = !errored && result.verdict.valid\n if (valid) pass++\n scoreSum += score\n scores.push(score)\n costSum += result.costUsd\n durSum += result.durationMs\n }\n scores.sort((a, b) => a - b)\n return {\n axisName,\n axisValue,\n cells: rows.length,\n passRate: pass / rows.length,\n meanScore: scoreSum / rows.length,\n p50Score: quantile(scores, 0.5),\n p90Score: quantile(scores, 0.9),\n totalCostUsd: costSum,\n meanDurationMs: durSum / rows.length,\n }\n}\n\nfunction bucketBy<Output>(\n rows: Row<Output>[],\n axisName: string,\n labelFor: (id: string) => string,\n): Record<string, AxisSummary> {\n const buckets = new Map<string, Row<Output>[]>()\n for (const row of rows) {\n const slot = row.cell.axes[axisName]\n if (!slot) continue\n const id = slot.id\n let arr = buckets.get(id)\n if (!arr) {\n arr = []\n buckets.set(id, arr)\n }\n arr.push(row)\n }\n const out: Record<string, AxisSummary> = {}\n // Sorted keys for deterministic JSON serialisation.\n for (const id of [...buckets.keys()].sort()) {\n out[id] = summariseRows(buckets.get(id) as Row<Output>[], axisName, labelFor(id))\n }\n return out\n}\n\nexport function buildByAxis<Output>(\n cells: MatrixResult<Output>['cells'],\n axes: MatrixAxis<unknown>[],\n aggregateBy: string[],\n): Record<string, Record<string, AxisSummary>> {\n const rows = flattenRuns(cells)\n const byName = new Map(axes.map((a) => [a.name, a]))\n const byAxis: Record<string, Record<string, AxisSummary>> = {}\n for (const name of aggregateBy) {\n const axis = byName.get(name)\n const labelFor = (id: string): string => {\n if (!axis?.label) return id\n const found = axis.values.find((v) => v.id === id)\n if (!found) return id\n return axis.label(found.value, id)\n }\n byAxis[name] = bucketBy(rows, name, labelFor)\n }\n return byAxis\n}\n","/**\n * N-axis cartesian runner.\n *\n * Expansion order: cartesian over `axes` in declared order, then `reps` as the\n * inner-most dim → `ordinal = (cartIdx * reps) + 
rep`. The returned\n * `cells[]` is sorted by `ordinal` so concurrent execution does not reorder\n * the output.\n *\n * Scheduling is a sliding window of in-flight promises capped at\n * `maxConcurrency`. The window stops admitting new cells when the cost\n * ceiling trips or the abort signal fires; in-flight cells finish.\n */\n\nimport { buildByAxis } from './aggregation'\nimport type {\n CellResult,\n MatrixAxis,\n MatrixCell,\n MatrixResult,\n RunAgentMatrixOptions,\n} from './types'\n\ninterface BaseCell {\n axes: Record<string, { id: string; value: unknown }>\n}\n\nfunction cartesian(axes: MatrixAxis<unknown>[]): BaseCell[] {\n // Empty axes (`values=[]`) collapse the whole product to zero cells. An\n // empty `axes` array yields a single empty-axes cell — degenerate but\n // valid (caller is iterating only reps).\n if (axes.length === 0) return [{ axes: {} }]\n for (const a of axes) if (a.values.length === 0) return []\n const out: BaseCell[] = []\n const idx = new Array(axes.length).fill(0)\n while (true) {\n const slot: Record<string, { id: string; value: unknown }> = {}\n for (let i = 0; i < axes.length; i++) {\n const axis = axes[i] as MatrixAxis<unknown>\n const v = axis.values[idx[i] as number] as { id: string; value: unknown }\n slot[axis.name] = { id: v.id, value: v.value }\n }\n out.push({ axes: slot })\n // Increment like an odometer, left-most axis is fastest.\n let i = 0\n while (i < axes.length) {\n const next = (idx[i] as number) + 1\n const axis = axes[i] as MatrixAxis<unknown>\n if (next < axis.values.length) {\n idx[i] = next\n break\n }\n idx[i] = 0\n i++\n }\n if (i === axes.length) break\n }\n return out\n}\n\nfunction makeMatrixId(): string {\n // Stable id-like string: time + 8 random hex chars. 
Avoids node:crypto\n // import to keep the matrix dep-free.\n const t = Date.now().toString(36)\n let r = ''\n for (let i = 0; i < 8; i++) r += Math.floor(Math.random() * 16).toString(16)\n return `mtx_${t}_${r}`\n}\n\nfunction makeErrorResult<Output>(err: unknown): CellResult<Output> {\n const e = err as { message?: string; name?: string }\n return {\n output: undefined as unknown as Output,\n verdict: { valid: false, score: 0 },\n costUsd: 0,\n durationMs: 0,\n error: {\n message: typeof e?.message === 'string' ? e.message : String(err),\n kind: typeof e?.name === 'string' ? e.name : 'Error',\n },\n }\n}\n\nexport async function runAgentMatrix<Output>(\n opts: RunAgentMatrixOptions<Output>,\n): Promise<MatrixResult<Output>> {\n const startedAt = Date.now()\n const reps = Math.max(1, opts.reps ?? 1)\n const maxConcurrency = Math.max(1, opts.maxConcurrency ?? 4)\n const costCeiling = opts.costCeiling ?? Number.POSITIVE_INFINITY\n const aggregateBy = opts.aggregateBy ?? opts.axes.map((a) => a.name)\n\n const base = cartesian(opts.axes)\n const filtered = opts.filter\n ? base.filter((c) => (opts.filter as (b: BaseCell) => boolean)(c))\n : base\n const filteredOut = base.length - filtered.length\n\n const planned: MatrixCell[] = []\n for (let i = 0; i < filtered.length; i++) {\n for (let r = 0; r < reps; r++) {\n planned.push({\n axes: (filtered[i] as BaseCell).axes,\n rep: r,\n ordinal: i * reps + r,\n })\n }\n }\n\n const cellRecords: Array<{ cell: MatrixCell; runs: CellResult<Output>[] }> = []\n let cumulativeCost = 0\n let costCeilingReached = false\n let runsExecuted = 0\n let cellsUnscheduled = 0\n\n const aborted = (): boolean => opts.signal?.aborted === true\n\n // Per-run abort controller forwards the external signal so cell executors\n // see cancellation. We don't expose it on `MatrixCell` — the signature on\n // `runCell` per the public API is `(cell) => Promise<...>`. 
Executors that\n // need cancellation use the external signal directly via closure.\n\n let inFlight = 0\n let cursor = 0\n let resolveAll: (() => void) | undefined\n const done = new Promise<void>((res) => {\n resolveAll = res\n })\n\n const pump = (): void => {\n while (inFlight < maxConcurrency && cursor < planned.length) {\n if (aborted() || costCeilingReached) {\n // Drain remaining as unscheduled.\n const left = planned.length - cursor\n cellsUnscheduled += left\n cursor = planned.length\n break\n }\n const cell = planned[cursor++] as MatrixCell\n inFlight++\n // Lazily allocate the record so cells appear in `cells[]` in any\n // arrival order; we sort by ordinal at the end.\n const record = { cell, runs: [] as CellResult<Output>[] }\n cellRecords.push(record)\n const promise: Promise<CellResult<Output>> = (async () => {\n try {\n return await opts.runCell(cell)\n } catch (err) {\n return makeErrorResult<Output>(err)\n }\n })()\n promise.then((result) => {\n record.runs.push(result)\n runsExecuted++\n cumulativeCost += result.costUsd\n if (cumulativeCost >= costCeiling && !costCeilingReached) {\n costCeilingReached = true\n // eslint-disable-next-line no-console\n console.warn('[matrix] cost ceiling reached')\n }\n try {\n opts.onCellComplete?.(cell, result)\n } catch {\n // onCellComplete is observational — swallow throws so a noisy\n // callback can't tank the run.\n }\n inFlight--\n if (cursor < planned.length) {\n pump()\n } else if (inFlight === 0) {\n resolveAll?.()\n }\n })\n }\n if (cursor >= planned.length && inFlight === 0) resolveAll?.()\n }\n\n const onAbort = (): void => {\n // External abort: stop scheduling. 
In-flight cells finish; their\n // executors observe `opts.signal.aborted` directly via closure.\n if (cursor < planned.length) {\n cellsUnscheduled += planned.length - cursor\n cursor = planned.length\n }\n if (inFlight === 0) resolveAll?.()\n }\n if (opts.signal) {\n if (opts.signal.aborted) {\n cellsUnscheduled = planned.length\n cursor = planned.length\n resolveAll?.()\n } else {\n opts.signal.addEventListener('abort', onAbort, { once: true })\n }\n }\n\n if (planned.length === 0) {\n resolveAll?.()\n } else {\n pump()\n }\n\n await done\n if (opts.signal) opts.signal.removeEventListener('abort', onAbort)\n\n cellRecords.sort((a, b) => a.cell.ordinal - b.cell.ordinal)\n\n let pass = 0\n let scoreSum = 0\n let totalCost = 0\n let runCount = 0\n for (const { runs } of cellRecords) {\n for (const r of runs) {\n runCount++\n const errored = r.error !== undefined\n if (!errored && r.verdict.valid) pass++\n scoreSum += errored ? 0 : r.verdict.score\n totalCost += r.costUsd\n }\n }\n\n const byAxis = buildByAxis(cellRecords, opts.axes, aggregateBy)\n\n return {\n cells: cellRecords,\n byAxis,\n summary: {\n totalCells: planned.length,\n runsExecuted,\n cellsSkipped: cellsUnscheduled + filteredOut * reps,\n overallPassRate: runCount === 0 ? 0 : pass / runCount,\n overallMeanScore: runCount === 0 ? 
0 : scoreSum / runCount,\n totalCostUsd: totalCost,\n durationMs: Date.now() - startedAt,\n },\n matrixId: makeMatrixId(),\n }\n}\n"],"mappings":";AAeA,SAAS,YAAoB,OAAqD;AAChF,QAAM,OAAsB,CAAC;AAC7B,aAAW,EAAE,MAAM,KAAK,KAAK,OAAO;AAClC,eAAW,UAAU,KAAM,MAAK,KAAK,EAAE,MAAM,OAAO,CAAC;AAAA,EACvD;AACA,SAAO;AACT;AAEA,SAAS,SAAS,QAAkB,GAAmB;AACrD,MAAI,OAAO,WAAW,EAAG,QAAO;AAChC,MAAI,OAAO,WAAW,EAAG,QAAO,OAAO,CAAC;AACxC,QAAM,OAAO,OAAO,SAAS,KAAK;AAClC,QAAM,KAAK,KAAK,MAAM,GAAG;AACzB,QAAM,KAAK,KAAK,KAAK,GAAG;AACxB,MAAI,OAAO,GAAI,QAAO,OAAO,EAAE;AAC/B,QAAM,OAAO,MAAM;AACnB,SAAQ,OAAO,EAAE,KAAgB,IAAI,QAAS,OAAO,EAAE,IAAe;AACxE;AAEO,SAAS,cACd,MACA,UACA,WACa;AACb,MAAI,KAAK,WAAW,GAAG;AACrB,WAAO;AAAA,MACL;AAAA,MACA;AAAA,MACA,OAAO;AAAA,MACP,UAAU;AAAA,MACV,WAAW;AAAA,MACX,UAAU;AAAA,MACV,UAAU;AAAA,MACV,cAAc;AAAA,MACd,gBAAgB;AAAA,IAClB;AAAA,EACF;AACA,MAAI,OAAO;AACX,MAAI,WAAW;AACf,MAAI,UAAU;AACd,MAAI,SAAS;AACb,QAAM,SAAmB,CAAC;AAC1B,aAAW,EAAE,OAAO,KAAK,MAAM;AAC7B,UAAM,UAAU,OAAO,UAAU;AACjC,UAAM,QAAQ,UAAU,IAAI,OAAO,QAAQ;AAC3C,UAAM,QAAQ,CAAC,WAAW,OAAO,QAAQ;AACzC,QAAI,MAAO;AACX,gBAAY;AACZ,WAAO,KAAK,KAAK;AACjB,eAAW,OAAO;AAClB,cAAU,OAAO;AAAA,EACnB;AACA,SAAO,KAAK,CAAC,GAAG,MAAM,IAAI,CAAC;AAC3B,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,OAAO,KAAK;AAAA,IACZ,UAAU,OAAO,KAAK;AAAA,IACtB,WAAW,WAAW,KAAK;AAAA,IAC3B,UAAU,SAAS,QAAQ,GAAG;AAAA,IAC9B,UAAU,SAAS,QAAQ,GAAG;AAAA,IAC9B,cAAc;AAAA,IACd,gBAAgB,SAAS,KAAK;AAAA,EAChC;AACF;AAEA,SAAS,SACP,MACA,UACA,UAC6B;AAC7B,QAAM,UAAU,oBAAI,IAA2B;AAC/C,aAAW,OAAO,MAAM;AACtB,UAAM,OAAO,IAAI,KAAK,KAAK,QAAQ;AACnC,QAAI,CAAC,KAAM;AACX,UAAM,KAAK,KAAK;AAChB,QAAI,MAAM,QAAQ,IAAI,EAAE;AACxB,QAAI,CAAC,KAAK;AACR,YAAM,CAAC;AACP,cAAQ,IAAI,IAAI,GAAG;AAAA,IACrB;AACA,QAAI,KAAK,GAAG;AAAA,EACd;AACA,QAAM,MAAmC,CAAC;AAE1C,aAAW,MAAM,CAAC,GAAG,QAAQ,KAAK,CAAC,EAAE,KAAK,GAAG;AAC3C,QAAI,EAAE,IAAI,cAAc,QAAQ,IAAI,EAAE,GAAoB,UAAU,SAAS,EAAE,CAAC;AAAA,EAClF;AACA,SAAO;AACT;AAEO,SAAS,YACd,OACA,MACA,aAC6C;AAC7C,QAAM,OAAO,YAAY,KAAK;AAC9B,QAAM,SAAS,IAAI,IAAI,KAAK,IAAI,CAAC,MAAM,CAAC,EAAE,MAAM,CAAC,CAAC,CAAC;AACnD,QAAM,SAAsD,CA
AC;AAC7D,aAAW,QAAQ,aAAa;AAC9B,UAAM,OAAO,OAAO,IAAI,IAAI;AAC5B,UAAM,WAAW,CAAC,OAAuB;AACvC,UAAI,CAAC,MAAM,MAAO,QAAO;AACzB,YAAM,QAAQ,KAAK,OAAO,KAAK,CAAC,MAAM,EAAE,OAAO,EAAE;AACjD,UAAI,CAAC,MAAO,QAAO;AACnB,aAAO,KAAK,MAAM,MAAM,OAAO,EAAE;AAAA,IACnC;AACA,WAAO,IAAI,IAAI,SAAS,MAAM,MAAM,QAAQ;AAAA,EAC9C;AACA,SAAO;AACT;;;ACnGA,SAAS,UAAU,MAAyC;AAI1D,MAAI,KAAK,WAAW,EAAG,QAAO,CAAC,EAAE,MAAM,CAAC,EAAE,CAAC;AAC3C,aAAW,KAAK,KAAM,KAAI,EAAE,OAAO,WAAW,EAAG,QAAO,CAAC;AACzD,QAAM,MAAkB,CAAC;AACzB,QAAM,MAAM,IAAI,MAAM,KAAK,MAAM,EAAE,KAAK,CAAC;AACzC,SAAO,MAAM;AACX,UAAM,OAAuD,CAAC;AAC9D,aAASA,KAAI,GAAGA,KAAI,KAAK,QAAQA,MAAK;AACpC,YAAM,OAAO,KAAKA,EAAC;AACnB,YAAM,IAAI,KAAK,OAAO,IAAIA,EAAC,CAAW;AACtC,WAAK,KAAK,IAAI,IAAI,EAAE,IAAI,EAAE,IAAI,OAAO,EAAE,MAAM;AAAA,IAC/C;AACA,QAAI,KAAK,EAAE,MAAM,KAAK,CAAC;AAEvB,QAAI,IAAI;AACR,WAAO,IAAI,KAAK,QAAQ;AACtB,YAAM,OAAQ,IAAI,CAAC,IAAe;AAClC,YAAM,OAAO,KAAK,CAAC;AACnB,UAAI,OAAO,KAAK,OAAO,QAAQ;AAC7B,YAAI,CAAC,IAAI;AACT;AAAA,MACF;AACA,UAAI,CAAC,IAAI;AACT;AAAA,IACF;AACA,QAAI,MAAM,KAAK,OAAQ;AAAA,EACzB;AACA,SAAO;AACT;AAEA,SAAS,eAAuB;AAG9B,QAAM,IAAI,KAAK,IAAI,EAAE,SAAS,EAAE;AAChC,MAAI,IAAI;AACR,WAAS,IAAI,GAAG,IAAI,GAAG,IAAK,MAAK,KAAK,MAAM,KAAK,OAAO,IAAI,EAAE,EAAE,SAAS,EAAE;AAC3E,SAAO,OAAO,CAAC,IAAI,CAAC;AACtB;AAEA,SAAS,gBAAwB,KAAkC;AACjE,QAAM,IAAI;AACV,SAAO;AAAA,IACL,QAAQ;AAAA,IACR,SAAS,EAAE,OAAO,OAAO,OAAO,EAAE;AAAA,IAClC,SAAS;AAAA,IACT,YAAY;AAAA,IACZ,OAAO;AAAA,MACL,SAAS,OAAO,GAAG,YAAY,WAAW,EAAE,UAAU,OAAO,GAAG;AAAA,MAChE,MAAM,OAAO,GAAG,SAAS,WAAW,EAAE,OAAO;AAAA,IAC/C;AAAA,EACF;AACF;AAEA,eAAsB,eACpB,MAC+B;AAC/B,QAAM,YAAY,KAAK,IAAI;AAC3B,QAAM,OAAO,KAAK,IAAI,GAAG,KAAK,QAAQ,CAAC;AACvC,QAAM,iBAAiB,KAAK,IAAI,GAAG,KAAK,kBAAkB,CAAC;AAC3D,QAAM,cAAc,KAAK,eAAe,OAAO;AAC/C,QAAM,cAAc,KAAK,eAAe,KAAK,KAAK,IAAI,CAAC,MAAM,EAAE,IAAI;AAEnE,QAAM,OAAO,UAAU,KAAK,IAAI;AAChC,QAAM,WAAW,KAAK,SAClB,KAAK,OAAO,CAAC,MAAO,KAAK,OAAoC,CAAC,CAAC,IAC/D;AACJ,QAAM,cAAc,KAAK,SAAS,SAAS;AAE3C,QAAM,UAAwB,CAAC;AAC/B,WAAS,IAAI,GAAG,IAAI,SAAS,QAAQ,KAAK;AACxC,aAAS,IAAI,GAAG,IAAI,MAAM,KAAK;AAC7B,cAAQ,KAAK;AAAA,QACX,M
AAO,SAAS,CAAC,EAAe;AAAA,QAChC,KAAK;AAAA,QACL,SAAS,IAAI,OAAO;AAAA,MACtB,CAAC;AAAA,IACH;AAAA,EACF;AAEA,QAAM,cAAuE,CAAC;AAC9E,MAAI,iBAAiB;AACrB,MAAI,qBAAqB;AACzB,MAAI,eAAe;AACnB,MAAI,mBAAmB;AAEvB,QAAM,UAAU,MAAe,KAAK,QAAQ,YAAY;AAOxD,MAAI,WAAW;AACf,MAAI,SAAS;AACb,MAAI;AACJ,QAAM,OAAO,IAAI,QAAc,CAAC,QAAQ;AACtC,iBAAa;AAAA,EACf,CAAC;AAED,QAAM,OAAO,MAAY;AACvB,WAAO,WAAW,kBAAkB,SAAS,QAAQ,QAAQ;AAC3D,UAAI,QAAQ,KAAK,oBAAoB;AAEnC,cAAM,OAAO,QAAQ,SAAS;AAC9B,4BAAoB;AACpB,iBAAS,QAAQ;AACjB;AAAA,MACF;AACA,YAAM,OAAO,QAAQ,QAAQ;AAC7B;AAGA,YAAM,SAAS,EAAE,MAAM,MAAM,CAAC,EAA0B;AACxD,kBAAY,KAAK,MAAM;AACvB,YAAM,WAAwC,YAAY;AACxD,YAAI;AACF,iBAAO,MAAM,KAAK,QAAQ,IAAI;AAAA,QAChC,SAAS,KAAK;AACZ,iBAAO,gBAAwB,GAAG;AAAA,QACpC;AAAA,MACF,GAAG;AACH,cAAQ,KAAK,CAAC,WAAW;AACvB,eAAO,KAAK,KAAK,MAAM;AACvB;AACA,0BAAkB,OAAO;AACzB,YAAI,kBAAkB,eAAe,CAAC,oBAAoB;AACxD,+BAAqB;AAErB,kBAAQ,KAAK,+BAA+B;AAAA,QAC9C;AACA,YAAI;AACF,eAAK,iBAAiB,MAAM,MAAM;AAAA,QACpC,QAAQ;AAAA,QAGR;AACA;AACA,YAAI,SAAS,QAAQ,QAAQ;AAC3B,eAAK;AAAA,QACP,WAAW,aAAa,GAAG;AACzB,uBAAa;AAAA,QACf;AAAA,MACF,CAAC;AAAA,IACH;AACA,QAAI,UAAU,QAAQ,UAAU,aAAa,EAAG,cAAa;AAAA,EAC/D;AAEA,QAAM,UAAU,MAAY;AAG1B,QAAI,SAAS,QAAQ,QAAQ;AAC3B,0BAAoB,QAAQ,SAAS;AACrC,eAAS,QAAQ;AAAA,IACnB;AACA,QAAI,aAAa,EAAG,cAAa;AAAA,EACnC;AACA,MAAI,KAAK,QAAQ;AACf,QAAI,KAAK,OAAO,SAAS;AACvB,yBAAmB,QAAQ;AAC3B,eAAS,QAAQ;AACjB,mBAAa;AAAA,IACf,OAAO;AACL,WAAK,OAAO,iBAAiB,SAAS,SAAS,EAAE,MAAM,KAAK,CAAC;AAAA,IAC/D;AAAA,EACF;AAEA,MAAI,QAAQ,WAAW,GAAG;AACxB,iBAAa;AAAA,EACf,OAAO;AACL,SAAK;AAAA,EACP;AAEA,QAAM;AACN,MAAI,KAAK,OAAQ,MAAK,OAAO,oBAAoB,SAAS,OAAO;AAEjE,cAAY,KAAK,CAAC,GAAG,MAAM,EAAE,KAAK,UAAU,EAAE,KAAK,OAAO;AAE1D,MAAI,OAAO;AACX,MAAI,WAAW;AACf,MAAI,YAAY;AAChB,MAAI,WAAW;AACf,aAAW,EAAE,KAAK,KAAK,aAAa;AAClC,eAAW,KAAK,MAAM;AACpB;AACA,YAAM,UAAU,EAAE,UAAU;AAC5B,UAAI,CAAC,WAAW,EAAE,QAAQ,MAAO;AACjC,kBAAY,UAAU,IAAI,EAAE,QAAQ;AACpC,mBAAa,EAAE;AAAA,IACjB;AAAA,EACF;AAEA,QAAM,SAAS,YAAY,aAAa,KAAK,MAAM,WAAW;AAE9D,SAAO;AAAA,IACL,OAAO;AAAA,IACP;AAAA,IACA,SAAS;AAAA,MACP,YAAY,QAAQ;AAAA,MACpB;AAAA,MACA,cAAc,mBAAmB,cAAc;AAAA
,MAC/C,iBAAiB,aAAa,IAAI,IAAI,OAAO;AAAA,MAC7C,kBAAkB,aAAa,IAAI,IAAI,WAAW;AAAA,MAClD,cAAc;AAAA,MACd,YAAY,KAAK,IAAI,IAAI;AAAA,IAC3B;AAAA,IACA,UAAU,aAAa;AAAA,EACzB;AACF;","names":["i"]}
package/dist/index.d.ts CHANGED
@@ -8,8 +8,8 @@ import { R as Run$1, S as Span, b as TraceEvent, A as Artifact$1, B as BudgetLed
8
8
  export { i as EventFilter, E as EventKind, j as FAILURE_CLASSES, F as FailureClass, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, J as JudgeSpan, M as Message, e as RetrievalSpan, m as RunLayer, n as RunStatus, f as SandboxSpan, o as SpanBase, p as SpanFilter, d as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, a as ToolSpan, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-Db2Bv8Cf.js';
9
9
  import { L as LlmClientOptions, m as LlmCallRequest, n as LlmCallResult } from './researcher-DeZ_EArp.js';
10
10
  export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, o as LlmCallError, p as LlmClient, q as LlmMessage, s as LlmRouteAssertionError, t as LlmRouteRequirements, u as LlmUsage, N as NoopResearcher, R as Researcher, S as SteeringChange, v as assertLlmRoute, w as backoffMs, x as callLlm, y as callLlmJson, z as isTransientLlmError, A as probeLlm, r as runEvalCampaign, B as stripFencedJson } from './researcher-DeZ_EArp.js';
11
- import { TraceAnalysisStore, AnalyzeTracesOptions } from './traces.js';
12
- export { AnalyzeTracesInput, AnalyzeTracesResult, AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, DatasetOverview, OTEL_AGENT_EVAL_SCOPE, OtlpExport, OtlpFileTraceStore, OtlpFileTraceStoreOptions, OtlpResourceSpans, OtlpSpan, QueryTracesPage, REDACTION_VERSION, RedactionReport, RedactionRule, ReplayCache, ReplayCacheEntry, ReplayCacheMissError, ReplayCacheStats, ReplayFetchOptions, SearchSpanResult, SearchTraceResult, SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, TraceAnalystByteBudgets, TraceAnalystFilters, TraceAnalystHookOptions, TraceAnalystSpan, TraceAnalystSpanKind, TraceAnalystSpanStatus, TraceAnalystTraceSummary, TraceFileMissingError, TraceInsightContext, TraceInsightFinding, TraceInsightPanelRole, TraceInsightPromptInput, TraceInsightQualityGate, TraceInsightQuestion, TraceInsightReadiness, TraceInsightSuite, TraceInsightTask, TraceNotFoundError, ViewSpansResult, ViewTraceOversized, ViewTraceResult, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, inferDomainKeywords, iterateRawCalls, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete } from './traces.js';
11
+ import { TraceAnalysisStore, AnalyzeTracesOptions, OtelExporter, OtelExportConfig, AnalyzeTracesInput, AnalyzeTracesResult } from './traces.js';
12
+ export { AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, DatasetOverview, ExportableSpan, OTEL_AGENT_EVAL_SCOPE, OtlpExport, OtlpFileTraceStore, OtlpFileTraceStoreOptions, OtlpResourceSpans, OtlpSpan, QueryTracesPage, REDACTION_VERSION, RedactionReport, RedactionRule, ReplayCache, ReplayCacheEntry, ReplayCacheMissError, ReplayCacheStats, ReplayFetchOptions, SearchSpanResult, SearchTraceResult, SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, TraceAnalystByteBudgets, TraceAnalystFilters, TraceAnalystHookOptions, TraceAnalystSpan, TraceAnalystSpanKind, TraceAnalystSpanStatus, TraceAnalystTraceSummary, TraceFileMissingError, TraceInsightContext, TraceInsightFinding, TraceInsightPanelRole, TraceInsightPromptInput, TraceInsightQualityGate, TraceInsightQuestion, TraceInsightReadiness, TraceInsightSuite, TraceInsightTask, TraceNotFoundError, ViewSpansResult, ViewTraceOversized, ViewTraceResult, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, inferDomainKeywords, iterateRawCalls, otelRunCompleteHook, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete } from './traces.js';
13
13
  import { t as JudgeInput, u as JudgeFn, v as BenchmarkRunnerConfig, S as Scenario, x as BenchmarkReport, y as ProductClientConfig, C as CheckResult, T as TestResult, z as PersonaConfig, D as DriverResult, A as DriverState, E as CollectedArtifacts, F as ScenarioResult, i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard, G as TurnMetrics, H as ScenarioFile, I as CompletionCriterion } from './release-report-D2ykiLSe.js';
14
14
  export { K as ArtifactCheck, L as ArtifactResult, B as BootstrapOptions, a as BootstrapResult, M as CorpusAgreementOptions, N as CorpusAgreementPerDimension, O as CorpusAgreementReport, Q as CorpusScoreRecord, U as EvalResult, W as FeedbackPattern, X as JudgeConfig, J as JudgeReplayGateArgs, Y as JudgeRubric, Z as JudgeScore, P as PairedBootstrapOptions, b as PairedBootstrapResult, _ as PersonaRigor, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, $ as RouteMap, a0 as RubricDimension, a1 as Turn, a2 as TurnResult, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, a3 as bonferroni, n as bootstrapCi, a4 as cohensD, a5 as confidenceInterval, a6 as corpusInterRaterAgreement, a7 as corpusInterRaterAgreementFromJudgeScores, o as evaluateReleaseConfidence, a8 as interRaterReliability, p as judgeReplayGate, a9 as mannWhitneyU, aa as normalizeScores, q as pairedBootstrap, ab as pairedMde, ac as pairedTTest, ad as partialCredit, r as releaseTraceEvidenceFromMultiShotTrials, s as renderReleaseReport, ae as requiredSampleSize, af as weightedMean, w as wilcoxonSignedRank } from './release-report-D2ykiLSe.js';
15
15
  import { TCloud } from '@tangle-network/tcloud';
@@ -6278,4 +6278,112 @@ declare function aggregateTrialsByMode(trials: TrialResult[], opts: {
6278
6278
  mode: AggregatorMode;
6279
6279
  }): TrialAggregate;
6280
6280
 
6281
- export { ANALYST_SEVERITIES, ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AgentProfile, type AggregatorMode, type AlignmentOp, type Analyst, type AnalystContext, type AnalystCost, type AnalystFinding, type AnalystHooks, type AnalystInputKind, AnalystRegistry, type AnalystRegistryOptions, type AnalystRequirements, type AnalystRunEvent, type AnalystRunInputs, type AnalystRunResult, type AnalystRunSummary, type AnalystSeverity, AnalyzeTracesOptions, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type Artifact as ArtifactCheckArtifact, type ArtifactEventLike, type ArtifactValidator, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BackendIntegrityError, type BackendIntegrityReport, BaselineReport, BehaviorAssertion, BenchmarkReport, BenchmarkRunner, BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, type BudgetPolicy, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CellVerdict, type ChatCallOpts, type ChatClient, type ChatRequest, type ChatResponse, type ChatTransport, CheckResult, type CliBridgeTransportOpts, type CodeMutationOutcome, type CodeMutationRunner, CollectedArtifacts, type CommandRunner, CompletionCriterion, type CompletionRequirement, type CompletionVerdict, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ControlEvalResult, ConvergenceTracker, type CorrectnessChecker, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type 
CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateChatClientOpts, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_PR_REVIEW_SCORE_WEIGHTS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DecideNextUserTurnOpts, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DiffScorecardOptions, type DirEntry, type DirectProviderTransportOpts, type DiscoverPersonasOptions, type DiscoveredPersona, DriverResult, DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvidenceRef, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, FINDING_SUBJECT_GRAMMAR_PROMPT, FINDING_SUBJECT_KINDS, type FactorContribution, type FactorialCell, type FailureClusterConfig, FeedbackLabel, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingSubject, type FindingSubjectKind, FindingSubjectStringSchema, type FindingsDiff, FindingsStore, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, GateDecision, type GhCliClientOptions, type 
GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGateConfig, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAdapterOpts, type JudgeFleetOptions, JudgeFn, JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, JudgeRunner, KIND_EXPECTED_SUBJECTS, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmCallRequest, LlmCallResult, LlmClientOptions, type LlmCorrectnessCheckerOpts, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MockTransportOpts, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, MultiShotMutateAdapter, MultiShotOptimizationResult, MultiShotRunner, MultiShotScorer, MultiShotTrialResult, 
type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersistedFinding, PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PrReviewAuditCase, type PrReviewBenchmarkSummary, type PrReviewComment, type PrReviewMatchedFinding, type PrReviewOutcome, type PrReviewReferenceFinding, type PrReviewScore, type PrReviewScoreWeights, type PrReviewSeverity, type PrReviewSource, type ProducedProposal, type ProducedState, ProductClient, ProductClientConfig, type ProductionEvolveConfig, type ProductionLoopCronConfig, type ProductionLoopDecision, type ProductionLoopRenderContext, type ProductionLoopResult, type ProductionShipConfig, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, type ProposalEventLike, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, type RawAnalystFinding, RawAnalystFindingSchema, type RecordRunsOptions, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, 
type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type RegistryRunOpts, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type RequirementCheck, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouterTransportOpts, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, RunFilter, type RunProductionLoopOptions, RunRecord, type RunScore, type RunScoreWeights, RunSplitTag, type RunTrace, type RuntimeEventLike, SEMANTIC_CONCEPT_JUDGE_VERSION, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxSdkTransportOpts, type SatisfiedBy, type ScanOptions, Scenario, type ScenarioCost, ScenarioFile, ScenarioRegistry, ScenarioResult, type Scorecard, type ScorecardCell, type ScorecardCellDiff, type ScorecardDiff, type ScorecardEntry, type ScorecardLogLine, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeAdapterOpts, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TaskGold, TestResult, type ThresholdContract, TokenCounter, type TokenSpec, type ToolCallEventLike, TraceAnalysisStore, type 
TraceAnalystAdapterOpts, type TraceAnalystGolden, type TraceAnalystKindSpec, TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, Trajectory, TrajectoryStep, type TrialAggregate, type TrialAttempt, TrialCache, TrialTelemetry, TurnMetrics, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, type VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, agentProfileHash, aggregatePrReviewScore, aggregateRunScore, aggregateTrialsByMode, analyzeAntiSlop, analyzeSeries, appendScorecard, assertRealBackend, attributeCounterfactuals, bisect, buildDriverSystemPrompt, buildReviewerPrompt, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, coherenceJudge, collectionPreserved, commentsForSource, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, computeFindingId, containsAll, createAntiSlopJudge, createChatClient, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createJudgeAdapter, createLlmCorrectnessChecker, createRunCriticAdapter, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, decideNextUserTurn, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultIsMaterial, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, diffFindings, diffScorecard, discoverPersonas, distillPlaybook, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, 
exportRewardModel, extractAssetUrls, extractErrorCount, extractProducedState, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, formatScorecardDiff, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, integrationManifestValidatedPayload, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, keyPreserved, liftSeverity, linterJudge, loadScorecard, loadScorerFromGrader, localCommandRunner, lowercaseMutator, makeFinding, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, notBlocked, paraphraseRobustness, paraphraseRobustnessScenarios, parseCorrectnessResponse, parseFindingSubject, parseRawFinding, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, recordRuns, recordRunsToScorecard, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderFindingSubject, renderMarkdownReport, renderPlaybookMarkdown, renderPriorFindings, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, resetLockedAppendersForTesting, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runProductionLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scanForMuffledGates, scoreContinuity, scorePrReviewComments, scorePrReviewSource, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeBackendIntegrity, 
summarizeHarnessResults, summarizePrReviewBenchmark, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, typoMutator, urlContains, verifyCompletion, verifyManifest, visualDiff, viteDeployRunner, weightedRecall, whitespaceCollapseMutator, withJudgeRetry, wranglerDeployRunner };
6281
+ /**
6282
+ * Pipeline-level OTEL integration — auto-attaches an OTEL exporter when
6283
+ * OTEL_EXPORTER_OTLP_ENDPOINT is set. Pipelines call `withOtelPipeline()`
6284
+ * to get a configured exporter + shutdown handle without manual wiring.
6285
+ *
6286
+ * Used by: runEvalCampaign, runProductionLoop, runAgentMatrix.
6287
+ */
6288
+
6289
+ interface OtelPipelineHandle {
6290
+ /** The active exporter, or `undefined` if no endpoint is configured. */
6291
+ exporter: OtelExporter | undefined;
6292
+ /** Call at pipeline end to flush and shut down. Safe to call even if `exporter` is undefined; resolves with no value. */
6293
+ shutdown(): Promise<void>;
6294
+ }
6295
+ interface OtelPipelineOptions {
6296
+ /** Explicit OTEL export config. When omitted, OTEL_EXPORTER_OTLP_ENDPOINT is auto-read from env. */
6297
+ otelConfig?: OtelExportConfig;
6298
+ /** Pipeline-specific resource attributes. NOTE(review): presumably describes both `pipelineKind` and `pipelineId` — confirm. */
6299
+ pipelineKind?: string;
6300
+ pipelineId?: string;
6301
+ }
6302
+ /**
6303
+ * Create an OTEL exporter scoped to a pipeline run. Auto-reads
6304
+ * OTEL_EXPORTER_OTLP_ENDPOINT from env when no explicit config is passed.
6305
+ * @param opts Optional config override and pipeline resource attributes; may be omitted entirely.
6306
+ * @returns A handle with `exporter` (undefined if no endpoint is configured) and `shutdown()`.
6307
+ */
6308
+ declare function withOtelPipeline(opts?: OtelPipelineOptions): OtelPipelineHandle;
6309
+ /**
6310
+ * Check whether OTEL export is configured (endpoint is set). NOTE(review): presumably the OTEL_EXPORTER_OTLP_ENDPOINT env var — confirm.
6311
+ */
6312
+ declare function isOtelConfigured(): boolean;
6313
+
6314
+ /**
6315
+ * Traced judge wrappers — instruments every LLM call inside the judge
6316
+ * ensemble with child spans so OTEL sinks see per-judge latency, model,
6317
+ * token counts, and score dimensions.
6318
+ *
6319
+ * The ensemble parent span groups all individual judge spans; each judge
6320
+ * gets its own child span with model + score as attributes.
6321
+ */
6322
+
6323
+ interface TracedJudgeOptions {
6324
+ /** TraceEmitter that the judge spans are emitted into. */
6325
+ emitter: TraceEmitter;
6326
+ /** Parent span id for the ensemble. If omitted, uses the emitter stack. NOTE(review): `traceJudge` takes these options too — confirm how the id applies to a single judge. */
6327
+ parentSpanId?: string;
6328
+ }
6329
+ /**
6330
+ * Wrap a single JudgeFn so its LLM call emits a traced span; returns a JudgeFn. NOTE(review): `judgeName` presumably labels the span — confirm.
6331
+ */
6332
+ declare function traceJudge(judge: JudgeFn, judgeName: string, opts: TracedJudgeOptions): JudgeFn;
6333
+ /**
6334
+ * Wrap an array of JudgeFns with tracing, running them inside an ensemble
6335
+ * parent span. Returns a single function that calls all judges and merges
6336
+ * their scores. NOTE(review): `judgeNames` is presumably index-aligned with `judges` — confirm.
6337
+ */
6338
+ declare function traceJudgeEnsemble(judges: JudgeFn[], judgeNames: string[], opts: TracedJudgeOptions): JudgeFn;
6339
+
6340
+ /**
6341
+ * Traced analyst wrapper — instruments `analyzeTraces` with spans so the
6342
+ * analyst's internal LLM calls (actor + responder turns) appear in the
6343
+ * trace tree. Also wraps each actor turn callback with a span.
6344
+ *
6345
+ * Since the analyst uses @ax-llm/ax internally (an agent framework with
6346
+ * its own turn loop), we cannot wrap individual `tc.chat()` calls without
6347
+ * forking ax. Instead, we wrap at the boundary:
6348
+ * 1. A parent span for the entire analyst run.
6349
+ * 2. Per-turn child spans from the `onTurn` callback (captures code,
6350
+ * output size, error status).
6351
+ * 3. Summary attributes on the parent (total turns, usage, findings).
6352
+ */
6353
+
6354
+ interface TracedAnalystOptions {
6355
+ /** TraceEmitter used for span emission. */
6356
+ emitter: TraceEmitter;
6357
+ /** Optional parent span id. If omitted, uses the emitter stack. */
6358
+ parentSpanId?: string;
6359
+ }
6360
+ /**
6361
+ * Run `analyzeTraces` wrapped in a parent span with per-turn child spans; resolves to an AnalyzeTracesResult. NOTE(review): `input` and `options` are presumably forwarded to `analyzeTraces` — confirm.
6362
+ */
6363
+ declare function tracedAnalyzeTraces(input: AnalyzeTracesInput, options: AnalyzeTracesOptions, traceOpts: TracedAnalystOptions): Promise<AnalyzeTracesResult>;
6364
+
6365
+ /**
6366
+ * Traced mutator wrapper — instruments reflective-mutation LLM calls.
6367
+ *
6368
+ * The reflective mutator (used by production-loop + multi-shot-optimization)
6369
+ * builds a prompt via `buildReflectionPrompt` and calls an LLM to produce
6370
+ * candidate mutations. This wrapper emits a span around each mutation call
6371
+ * so OTEL sinks observe:
6372
+ * - Model used for mutation
6373
+ * - Input context (target, trial count, child count)
6374
+ * - Output (proposal count, labels)
6375
+ * - Duration + cost if available
6376
+ */
6377
+
6378
+ interface TracedMutatorOptions {
6379
+ /** TraceEmitter used for span emission. */
6380
+ emitter: TraceEmitter;
6381
+ /** Optional parent span id. If omitted, uses the emitter stack. */
6382
+ parentSpanId?: string;
6383
+ }
6384
+ /**
6385
+ * Wrap a MutateAdapter so every mutate() call emits a span; returns a MutateAdapter over the same payload type `P`.
6386
+ */
6387
+ declare function traceMutator<P>(adapter: MutateAdapter<P>, opts: TracedMutatorOptions): MutateAdapter<P>;
6388
+
6389
+ export { ANALYST_SEVERITIES, ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AgentProfile, type AggregatorMode, type AlignmentOp, type Analyst, type AnalystContext, type AnalystCost, type AnalystFinding, type AnalystHooks, type AnalystInputKind, AnalystRegistry, type AnalystRegistryOptions, type AnalystRequirements, type AnalystRunEvent, type AnalystRunInputs, type AnalystRunResult, type AnalystRunSummary, type AnalystSeverity, AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type Artifact as ArtifactCheckArtifact, type ArtifactEventLike, type ArtifactValidator, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BackendIntegrityError, type BackendIntegrityReport, BaselineReport, BehaviorAssertion, BenchmarkReport, BenchmarkRunner, BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, type BudgetPolicy, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CellVerdict, type ChatCallOpts, type ChatClient, type ChatRequest, type ChatResponse, type ChatTransport, CheckResult, type CliBridgeTransportOpts, type CodeMutationOutcome, type CodeMutationRunner, CollectedArtifacts, type CommandRunner, CompletionCriterion, type CompletionRequirement, type CompletionVerdict, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ControlEvalResult, ConvergenceTracker, type CorrectnessChecker, type CostEntry, CostLedger, type CostLedgerGeneration, type 
CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateChatClientOpts, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_PR_REVIEW_SCORE_WEIGHTS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DecideNextUserTurnOpts, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DiffScorecardOptions, type DirEntry, type DirectProviderTransportOpts, type DiscoverPersonasOptions, type DiscoveredPersona, DriverResult, DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvidenceRef, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, FINDING_SUBJECT_GRAMMAR_PROMPT, FINDING_SUBJECT_KINDS, type FactorContribution, type FactorialCell, type FailureClusterConfig, FeedbackLabel, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingSubject, type FindingSubjectKind, FindingSubjectStringSchema, type FindingsDiff, FindingsStore, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, 
type FlowStep, GateDecision, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGateConfig, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAdapterOpts, type JudgeFleetOptions, JudgeFn, JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, JudgeRunner, KIND_EXPECTED_SUBJECTS, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmCallRequest, LlmCallResult, LlmClientOptions, type LlmCorrectnessCheckerOpts, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MockTransportOpts, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, MultiShotMutateAdapter, 
MultiShotOptimizationResult, MultiShotRunner, MultiShotScorer, MultiShotTrialResult, type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OtelExportConfig, OtelExporter, type OtelPipelineHandle, type OtelPipelineOptions, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersistedFinding, PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PrReviewAuditCase, type PrReviewBenchmarkSummary, type PrReviewComment, type PrReviewMatchedFinding, type PrReviewOutcome, type PrReviewReferenceFinding, type PrReviewScore, type PrReviewScoreWeights, type PrReviewSeverity, type PrReviewSource, type ProducedProposal, type ProducedState, ProductClient, ProductClientConfig, type ProductionEvolveConfig, type ProductionLoopCronConfig, type ProductionLoopDecision, type ProductionLoopRenderContext, type ProductionLoopResult, type ProductionShipConfig, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, type ProposalEventLike, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, type RawAnalystFinding, RawAnalystFindingSchema, type RecordRunsOptions, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type 
ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type RegistryRunOpts, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type RequirementCheck, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouterTransportOpts, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, RunFilter, type RunProductionLoopOptions, RunRecord, type RunScore, type RunScoreWeights, RunSplitTag, type RunTrace, type RuntimeEventLike, SEMANTIC_CONCEPT_JUDGE_VERSION, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxSdkTransportOpts, type SatisfiedBy, type ScanOptions, Scenario, type ScenarioCost, ScenarioFile, ScenarioRegistry, ScenarioResult, type Scorecard, type ScorecardCell, type ScorecardCellDiff, type ScorecardDiff, type ScorecardEntry, type ScorecardLogLine, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeAdapterOpts, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type 
SynthesisReason, type SynthesisTarget, type TaskGold, TestResult, type ThresholdContract, TokenCounter, type TokenSpec, type ToolCallEventLike, TraceAnalysisStore, type TraceAnalystAdapterOpts, type TraceAnalystGolden, type TraceAnalystKindSpec, TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, type TracedAnalystOptions, type TracedJudgeOptions, type TracedMutatorOptions, Trajectory, TrajectoryStep, type TrialAggregate, type TrialAttempt, TrialCache, TrialTelemetry, TurnMetrics, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, type VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, agentProfileHash, aggregatePrReviewScore, aggregateRunScore, aggregateTrialsByMode, analyzeAntiSlop, analyzeSeries, appendScorecard, assertRealBackend, attributeCounterfactuals, bisect, buildDriverSystemPrompt, buildReviewerPrompt, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, coherenceJudge, collectionPreserved, commentsForSource, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, computeFindingId, containsAll, createAntiSlopJudge, createChatClient, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createJudgeAdapter, createLlmCorrectnessChecker, createRunCriticAdapter, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, decideNextUserTurn, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, 
defaultIsMaterial, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, diffFindings, diffScorecard, discoverPersonas, distillPlaybook, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, extractProducedState, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, formatScorecardDiff, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, integrationManifestValidatedPayload, isOtelConfigured, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, keyPreserved, liftSeverity, linterJudge, loadScorecard, loadScorerFromGrader, localCommandRunner, lowercaseMutator, makeFinding, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, notBlocked, paraphraseRobustness, paraphraseRobustnessScenarios, parseCorrectnessResponse, parseFindingSubject, parseRawFinding, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, recordRuns, recordRunsToScorecard, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderFindingSubject, renderMarkdownReport, renderPlaybookMarkdown, renderPriorFindings, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, resetLockedAppendersForTesting, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runProductionLoop, 
runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scanForMuffledGates, scoreContinuity, scorePrReviewComments, scorePrReviewSource, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeBackendIntegrity, summarizeHarnessResults, summarizePrReviewBenchmark, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, traceJudge, traceJudgeEnsemble, traceMutator, tracedAnalyzeTraces, typoMutator, urlContains, verifyCompletion, verifyManifest, visualDiff, viteDeployRunner, weightedRecall, whitespaceCollapseMutator, withJudgeRetry, withOtelPipeline, wranglerDeployRunner };