@tangle-network/agent-eval 0.37.0 → 0.40.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/campaign/index.d.ts +695 -0
- package/dist/campaign/index.js +741 -0
- package/dist/campaign/index.js.map +1 -0
- package/dist/chunk-5U2DOJU4.js +565 -0
- package/dist/chunk-5U2DOJU4.js.map +1 -0
- package/dist/{chunk-KE7TDJUO.js → chunk-AU2JLNSZ.js} +2 -2
- package/dist/{chunk-TSPOEDM3.js → chunk-BWZEGTES.js} +2 -5
- package/dist/chunk-BWZEGTES.js.map +1 -0
- package/dist/{chunk-3HYQXPC2.js → chunk-DMW5VENN.js} +3 -3
- package/dist/{chunk-TQL7BAOY.js → chunk-EGIPWXHL.js} +2 -2
- package/dist/chunk-GGE4NNQT.js +65 -0
- package/dist/chunk-GGE4NNQT.js.map +1 -0
- package/dist/{chunk-7PR3WPWE.js → chunk-L7XMNXLO.js} +2 -2
- package/dist/{chunk-RL6TERL2.js → chunk-LCIDRYGP.js} +3 -3
- package/dist/{chunk-L5UNCDAJ.js → chunk-MAOZCN36.js} +2 -64
- package/dist/chunk-MAOZCN36.js.map +1 -0
- package/dist/{chunk-LGAPK7NA.js → chunk-NKLGKF2Q.js} +2 -2
- package/dist/chunk-QWV226SL.js +276 -0
- package/dist/chunk-QWV226SL.js.map +1 -0
- package/dist/chunk-TMXPFWC7.js +305 -0
- package/dist/chunk-TMXPFWC7.js.map +1 -0
- package/dist/{chunk-KHZRNY3F.js → chunk-WP7SY7AI.js} +5 -4
- package/dist/chunk-WP7SY7AI.js.map +1 -0
- package/dist/chunk-YV7J7X5N.js +313 -0
- package/dist/chunk-YV7J7X5N.js.map +1 -0
- package/dist/{control-DVrmvM_k.d.ts → control-CmLJk3IG.d.ts} +1 -1
- package/dist/control.d.ts +3 -3
- package/dist/control.js +2 -2
- package/dist/{dataset-ueRVTUoY.d.ts → dataset-BlwAtYYf.d.ts} +1 -1
- package/dist/{feedback-trajectory-iATEAHmc.d.ts → feedback-trajectory-Dvy-bt7x.d.ts} +1 -1
- package/dist/governance/index.d.ts +133 -5
- package/dist/index.d.ts +35 -34
- package/dist/index.js +97 -630
- package/dist/index.js.map +1 -1
- package/dist/matrix/index.d.ts +2 -109
- package/dist/matrix/index.js +5 -270
- package/dist/matrix/index.js.map +1 -1
- package/dist/multishot/index.d.ts +276 -0
- package/dist/multishot/index.js +516 -0
- package/dist/multishot/index.js.map +1 -0
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +2 -2
- package/dist/optimization.js +5 -5
- package/dist/pipelines/index.js +2 -2
- package/dist/red-team-30II1T4o.d.ts +63 -0
- package/dist/{release-report-D2ykiLSe.d.ts → release-report-Di84bXD7.d.ts} +5 -2
- package/dist/reporting.d.ts +2 -2
- package/dist/reporting.js +3 -3
- package/dist/rl.js +15 -315
- package/dist/rl.js.map +1 -1
- package/dist/run-campaign-JYJXYHHL.js +10 -0
- package/dist/run-campaign-JYJXYHHL.js.map +1 -0
- package/dist/traces.js +7 -5
- package/dist/types-DHqkLwEU.d.ts +110 -0
- package/dist/wire/index.d.ts +2 -2
- package/docs/design/loop-taxonomy.md +233 -0
- package/package.json +38 -24
- package/dist/chunk-KHZRNY3F.js.map +0 -1
- package/dist/chunk-L5UNCDAJ.js.map +0 -1
- package/dist/chunk-TSPOEDM3.js.map +0 -1
- package/dist/index-CN2agEaO.d.ts +0 -191
- /package/dist/{chunk-KE7TDJUO.js.map → chunk-AU2JLNSZ.js.map} +0 -0
- /package/dist/{chunk-3HYQXPC2.js.map → chunk-DMW5VENN.js.map} +0 -0
- /package/dist/{chunk-TQL7BAOY.js.map → chunk-EGIPWXHL.js.map} +0 -0
- /package/dist/{chunk-7PR3WPWE.js.map → chunk-L7XMNXLO.js.map} +0 -0
- /package/dist/{chunk-RL6TERL2.js.map → chunk-LCIDRYGP.js.map} +0 -0
- /package/dist/{chunk-LGAPK7NA.js.map → chunk-NKLGKF2Q.js.map} +0 -0
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
// src/matrix/aggregation.ts
|
|
2
|
+
/**
 * Expands `[{ cell, runs }]` records into one `{ cell, result }` row per run.
 *
 * Rows keep the input order: records in the order given, and runs within a
 * record in the order they are stored. The `cell` object is shared by
 * reference across all rows produced from the same record.
 *
 * @param {Iterable<{cell: object, runs: Iterable<object>}>} cells
 * @returns {Array<{cell: object, result: object}>}
 */
function flattenRuns(cells) {
  return Array.from(cells).flatMap(({ cell, runs }) =>
    Array.from(runs, (result) => ({ cell, result })),
  );
}
|
|
9
|
+
/**
 * Quantile of an ascending-sorted numeric array using linear interpolation
 * between the two neighbouring ranks.
 *
 * @param {number[]} sorted - Values already sorted ascending.
 * @param {number} q - Quantile position; the rank used is `(length - 1) * q`.
 * @returns {number} `0` for an empty array, the sole element for a
 *   single-element array, otherwise the interpolated value.
 */
function quantile(sorted, q) {
  const count = sorted.length;
  if (count === 0) return 0;
  if (count === 1) return sorted[0];

  const rank = (count - 1) * q;
  const below = Math.floor(rank);
  const above = Math.ceil(rank);
  const weight = rank - below;

  // An integral rank lands exactly on an element; no interpolation needed.
  return below === above
    ? sorted[below]
    : sorted[below] * (1 - weight) + sorted[above] * weight;
}
|
|
19
|
+
/**
 * Collapses a set of `{ cell, result }` rows into a single axis summary.
 *
 * A row whose `result.error` is defined counts as a failure: it contributes a
 * score of 0 and never counts towards `passRate`, and its `verdict` is not
 * read. Cost and duration are summed for every row, errored or not.
 *
 * @param {Array<{result: object}>} rows - Rows belonging to one axis value.
 * @param {string} axisName - Name of the axis being summarised.
 * @param {string} axisValue - Display label of the axis value.
 * @returns {object} Summary with `cells` (the number of rows), `passRate`,
 *   `meanScore`, `p50Score`, `p90Score`, `totalCostUsd` and `meanDurationMs`;
 *   every numeric field is 0 when `rows` is empty.
 */
function summariseRows(rows, axisName, axisValue) {
  const total = rows.length;
  if (total === 0) {
    return {
      axisName,
      axisValue,
      cells: 0,
      passRate: 0,
      meanScore: 0,
      p50Score: 0,
      p90Score: 0,
      totalCostUsd: 0,
      meanDurationMs: 0,
    };
  }

  const scores = [];
  let passed = 0;
  let scoreTotal = 0;
  let costTotal = 0;
  let durationTotal = 0;

  for (const row of rows) {
    const { result } = row;
    const ok = result.error === void 0;
    const score = ok ? result.verdict.score : 0;
    if (ok && result.verdict.valid) passed += 1;
    scoreTotal += score;
    scores.push(score);
    costTotal += result.costUsd;
    durationTotal += result.durationMs;
  }

  // quantile() expects ascending order.
  scores.sort((left, right) => left - right);

  return {
    axisName,
    axisValue,
    cells: total,
    passRate: passed / total,
    meanScore: scoreTotal / total,
    p50Score: quantile(scores, 0.5),
    p90Score: quantile(scores, 0.9),
    totalCostUsd: costTotal,
    meanDurationMs: durationTotal / total,
  };
}
|
|
61
|
+
/**
 * Groups rows by the id they carry on one axis and summarises each group.
 *
 * Rows whose cell has no entry for `axisName` are ignored. Keys are inserted
 * in sorted order so the result serialises deterministically.
 *
 * The result is built with `Object.fromEntries`, which defines own data
 * properties. A plain `out[id] = ...` assignment would silently drop a bucket
 * whose id is "__proto__", because that assignment goes through the
 * prototype setter instead of creating a key.
 *
 * @param {Array<{cell: {axes: object}, result: object}>} rows
 * @param {string} axisName - Axis to group by.
 * @param {(id: string) => string} labelFor - Maps a bucket id to its label.
 * @returns {Object<string, object>} Summary per bucket id.
 */
function bucketBy(rows, axisName, labelFor) {
  const buckets = new Map();
  for (const row of rows) {
    const slot = row.cell.axes[axisName];
    if (!slot) continue;
    const bucket = buckets.get(slot.id);
    if (bucket) {
      bucket.push(row);
    } else {
      buckets.set(slot.id, [row]);
    }
  }

  const sortedIds = [...buckets.keys()].sort();
  return Object.fromEntries(
    sortedIds.map((id) => [id, summariseRows(buckets.get(id), axisName, labelFor(id))]),
  );
}
|
|
80
|
+
/**
 * Builds the `byAxis` table: for every axis name in `aggregateBy`, a map from
 * axis-value id to its summary.
 *
 * Labels come from the axis' optional `label(value, id)` function. When the
 * axis is unknown, has no `label`, or has no value with the requested id, the
 * id itself is used as the label.
 *
 * @param {Array<{cell: object, runs: object[]}>} cells - Executed cell records.
 * @param {Array<{name: string, values: Array<{id: string, value: *}>, label?: Function}>} axes
 * @param {string[]} aggregateBy - Axis names to aggregate over.
 * @returns {Object<string, Object<string, object>>}
 */
function buildByAxis(cells, axes, aggregateBy) {
  const rows = flattenRuns(cells);

  const axisLookup = new Map();
  for (const axis of axes) axisLookup.set(axis.name, axis);

  const table = {};
  for (const axisName of aggregateBy) {
    const axis = axisLookup.get(axisName);
    const toLabel = (id) => {
      if (axis?.label) {
        const match = axis.values.find((candidate) => candidate.id === id);
        if (match) return axis.label(match.value, id);
      }
      return id;
    };
    table[axisName] = bucketBy(rows, axisName, toLabel);
  }
  return table;
}
|
|
96
|
+
|
|
97
|
+
// src/matrix/runner.ts
|
|
98
|
+
/**
 * Enumerates the cartesian product of the axes' values.
 *
 * Ordering is odometer-style with the left-most axis varying fastest. An
 * empty `axes` array yields a single cell with no axes; any axis with zero
 * values collapses the whole product to an empty list.
 *
 * @param {Array<{name: string, values: Array<{id: string, value: *}>}>} axes
 * @returns {Array<{axes: Object<string, {id: string, value: *}>}>}
 */
function cartesian(axes) {
  // Empty product is 1 (one degenerate cell); a zero-length axis makes it 0.
  const total = axes.reduce((product, axis) => product * axis.values.length, 1);

  const combos = [];
  for (let n = 0; n < total; n++) {
    // Decode n as a mixed-radix number whose least-significant digit is the
    // first axis, which is what makes the left-most axis the fastest.
    const slot = {};
    let rest = n;
    for (const axis of axes) {
      const radix = axis.values.length;
      const picked = axis.values[rest % radix];
      slot[axis.name] = { id: picked.id, value: picked.value };
      rest = Math.floor(rest / radix);
    }
    combos.push({ axes: slot });
  }
  return combos;
}
|
|
126
|
+
/**
 * Produces an id of the form `mtx_<time>_<random>`: the current time in
 * base 36 followed by eight random hex digits.
 *
 * Uses `Math.random`, so the id is not suitable as a secret.
 *
 * @returns {string}
 */
function makeMatrixId() {
  const stamp = Date.now().toString(36);
  const suffix = Array.from({ length: 8 }, () =>
    Math.floor(Math.random() * 16).toString(16),
  ).join("");
  return `mtx_${stamp}_${suffix}`;
}
|
|
132
|
+
/**
 * Converts anything thrown by a cell executor into a failed cell result.
 *
 * The result has no output, an invalid zero-score verdict and zero cost and
 * duration. `error.message` is the thrown value's string `message` when it
 * has one, otherwise `String(err)`; `error.kind` is its string `name`, or
 * "Error" when absent.
 *
 * @param {*} err - The thrown value (not necessarily an `Error`).
 * @returns {object} Cell result carrying an `error` field.
 */
function makeErrorResult(err) {
  const message = typeof err?.message === "string" ? err.message : String(err);
  const kind = typeof err?.name === "string" ? err.name : "Error";
  const error = { message, kind };
  return {
    output: void 0,
    verdict: { valid: false, score: 0 },
    costUsd: 0,
    durationMs: 0,
    error,
  };
}
|
|
145
|
+
/**
 * Runs `opts.runCell` once per planned cell and aggregates the results.
 *
 * Planned cells are the cartesian product of `opts.axes` (minus anything the
 * optional `opts.filter` rejects), each repeated `reps` times, with
 * `ordinal = cartesianIndex * reps + rep`. The returned `cells` are sorted by
 * `ordinal`, so concurrent completion order does not leak into the output.
 *
 * Scheduling is a sliding window of at most `maxConcurrency` in-flight calls.
 * The window stops admitting cells once the cumulative `costUsd` reaches
 * `opts.costCeiling` or `opts.signal` aborts; calls already in flight are
 * left to finish and are still recorded.
 *
 * A `runCell` call that throws or rejects is recorded as an error result. The
 * same now applies when it resolves to something that is not an object:
 * previously a `null`/`undefined` result made the completion callback throw,
 * so the in-flight counter was never released and the returned promise never
 * settled.
 *
 * @param {object} opts
 * @param {Array} opts.axes - Axis definitions (`name`, `values`, optional `label`).
 * @param {(cell: object) => Promise<object>} opts.runCell - Cell executor.
 * @param {number} [opts.reps=1] - Repetitions per cell (minimum 1).
 * @param {number} [opts.maxConcurrency=4] - In-flight cap (minimum 1).
 * @param {number} [opts.costCeiling=Infinity] - Stop admitting cells at this cumulative cost.
 * @param {string[]} [opts.aggregateBy] - Axis names to aggregate; defaults to all axes.
 * @param {(cell: object) => boolean} [opts.filter] - Keeps a base cell when truthy.
 * @param {AbortSignal} [opts.signal] - Stops scheduling when aborted.
 * @param {(cell: object, result: object) => void} [opts.onCellComplete] - Observer; throws are ignored.
 * @returns {Promise<{cells: Array, byAxis: object, summary: object, matrixId: string}>}
 */
async function runAgentMatrix(opts) {
  const startedAt = Date.now();
  const reps = Math.max(1, opts.reps ?? 1);
  const maxConcurrency = Math.max(1, opts.maxConcurrency ?? 4);
  const costCeiling = opts.costCeiling ?? Number.POSITIVE_INFINITY;
  const aggregateBy = opts.aggregateBy ?? opts.axes.map((a) => a.name);

  const base = cartesian(opts.axes);
  const filtered = opts.filter ? base.filter((c) => opts.filter(c)) : base;
  const filteredOut = base.length - filtered.length;

  // `reps` is the inner-most dimension of the plan.
  const planned = [];
  for (let i = 0; i < filtered.length; i++) {
    for (let r = 0; r < reps; r++) {
      planned.push({
        axes: filtered[i].axes,
        rep: r,
        ordinal: i * reps + r,
      });
    }
  }

  const cellRecords = [];
  let cumulativeCost = 0;
  let costCeilingReached = false;
  let runsExecuted = 0;
  let cellsUnscheduled = 0;
  const aborted = () => opts.signal?.aborted === true;

  let inFlight = 0;
  let cursor = 0;
  let resolveAll;
  const done = new Promise((res) => {
    resolveAll = res;
  });

  // Invokes the executor and always resolves to a result object, so the
  // bookkeeping below can never throw on a missing result.
  const runOne = async (cell) => {
    try {
      const result = await opts.runCell(cell);
      if (result === null || typeof result !== "object") {
        const got = result === null ? "null" : typeof result;
        return makeErrorResult(
          new TypeError(`runCell resolved to ${got}; expected a result object`),
        );
      }
      return result;
    } catch (err) {
      return makeErrorResult(err);
    }
  };

  const pump = () => {
    while (inFlight < maxConcurrency && cursor < planned.length) {
      if (aborted() || costCeilingReached) {
        // Drain everything not yet started as unscheduled.
        const left = planned.length - cursor;
        cellsUnscheduled += left;
        cursor = planned.length;
        break;
      }
      const cell = planned[cursor++];
      inFlight++;
      // Records are pushed in start order and sorted by ordinal at the end.
      const record = { cell, runs: [] };
      cellRecords.push(record);
      // Deliberately not awaited: completion drives the next pump() call.
      void runOne(cell).then((result) => {
        record.runs.push(result);
        runsExecuted++;
        cumulativeCost += result.costUsd;
        if (cumulativeCost >= costCeiling && !costCeilingReached) {
          costCeilingReached = true;
          console.warn("[matrix] cost ceiling reached");
        }
        try {
          opts.onCellComplete?.(cell, result);
        } catch {
          // onCellComplete is observational; a throwing callback must not
          // fail the run.
        }
        inFlight--;
        if (cursor < planned.length) {
          pump();
        } else if (inFlight === 0) {
          resolveAll?.();
        }
      });
    }
    if (cursor >= planned.length && inFlight === 0) resolveAll?.();
  };

  // External abort: stop scheduling; in-flight cells are left to finish.
  const onAbort = () => {
    if (cursor < planned.length) {
      cellsUnscheduled += planned.length - cursor;
      cursor = planned.length;
    }
    if (inFlight === 0) resolveAll?.();
  };
  if (opts.signal) {
    if (opts.signal.aborted) {
      // Already aborted before starting: nothing gets scheduled.
      cellsUnscheduled = planned.length;
      cursor = planned.length;
      resolveAll?.();
    } else {
      opts.signal.addEventListener("abort", onAbort, { once: true });
    }
  }

  if (planned.length === 0) {
    resolveAll?.();
  } else {
    pump();
  }

  await done;
  if (opts.signal) opts.signal.removeEventListener("abort", onAbort);

  cellRecords.sort((a, b) => a.cell.ordinal - b.cell.ordinal);

  // Errored runs count as score 0 and never pass; their cost still counts.
  let pass = 0;
  let scoreSum = 0;
  let totalCost = 0;
  let runCount = 0;
  for (const { runs } of cellRecords) {
    for (const r of runs) {
      runCount++;
      const errored = r.error !== void 0;
      if (!errored && r.verdict.valid) pass++;
      scoreSum += errored ? 0 : r.verdict.score;
      totalCost += r.costUsd;
    }
  }

  const byAxis = buildByAxis(cellRecords, opts.axes, aggregateBy);

  return {
    cells: cellRecords,
    byAxis,
    summary: {
      totalCells: planned.length,
      runsExecuted,
      cellsSkipped: cellsUnscheduled + filteredOut * reps,
      overallPassRate: runCount === 0 ? 0 : pass / runCount,
      overallMeanScore: runCount === 0 ? 0 : scoreSum / runCount,
      totalCostUsd: totalCost,
      durationMs: Date.now() - startedAt,
    },
    matrixId: makeMatrixId(),
  };
}
|
|
270
|
+
|
|
271
|
+
export {
|
|
272
|
+
summariseRows,
|
|
273
|
+
buildByAxis,
|
|
274
|
+
runAgentMatrix
|
|
275
|
+
};
|
|
276
|
+
//# sourceMappingURL=chunk-QWV226SL.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/matrix/aggregation.ts","../src/matrix/runner.ts"],"sourcesContent":["/**\n * Per-axis aggregation of cell runs into `AxisSummary` rows.\n *\n * Pure: consumes the final `cells: [{cell, runs}]` array and returns the\n * `byAxis` table. Error runs contribute 0 to passRate and meanScore. Cost\n * and duration always count — the budget was spent regardless.\n */\n\nimport type { AxisSummary, CellResult, MatrixAxis, MatrixCell, MatrixResult } from './types'\n\ninterface Row<Output> {\n cell: MatrixCell\n result: CellResult<Output>\n}\n\nfunction flattenRuns<Output>(cells: MatrixResult<Output>['cells']): Row<Output>[] {\n const rows: Row<Output>[] = []\n for (const { cell, runs } of cells) {\n for (const result of runs) rows.push({ cell, result })\n }\n return rows\n}\n\nfunction quantile(sorted: number[], q: number): number {\n if (sorted.length === 0) return 0\n if (sorted.length === 1) return sorted[0] as number\n const pos = (sorted.length - 1) * q\n const lo = Math.floor(pos)\n const hi = Math.ceil(pos)\n if (lo === hi) return sorted[lo] as number\n const frac = pos - lo\n return (sorted[lo] as number) * (1 - frac) + (sorted[hi] as number) * frac\n}\n\nexport function summariseRows<Output>(\n rows: Row<Output>[],\n axisName: string,\n axisValue: string,\n): AxisSummary {\n if (rows.length === 0) {\n return {\n axisName,\n axisValue,\n cells: 0,\n passRate: 0,\n meanScore: 0,\n p50Score: 0,\n p90Score: 0,\n totalCostUsd: 0,\n meanDurationMs: 0,\n }\n }\n let pass = 0\n let scoreSum = 0\n let costSum = 0\n let durSum = 0\n const scores: number[] = []\n for (const { result } of rows) {\n const errored = result.error !== undefined\n const score = errored ? 
0 : result.verdict.score\n const valid = !errored && result.verdict.valid\n if (valid) pass++\n scoreSum += score\n scores.push(score)\n costSum += result.costUsd\n durSum += result.durationMs\n }\n scores.sort((a, b) => a - b)\n return {\n axisName,\n axisValue,\n cells: rows.length,\n passRate: pass / rows.length,\n meanScore: scoreSum / rows.length,\n p50Score: quantile(scores, 0.5),\n p90Score: quantile(scores, 0.9),\n totalCostUsd: costSum,\n meanDurationMs: durSum / rows.length,\n }\n}\n\nfunction bucketBy<Output>(\n rows: Row<Output>[],\n axisName: string,\n labelFor: (id: string) => string,\n): Record<string, AxisSummary> {\n const buckets = new Map<string, Row<Output>[]>()\n for (const row of rows) {\n const slot = row.cell.axes[axisName]\n if (!slot) continue\n const id = slot.id\n let arr = buckets.get(id)\n if (!arr) {\n arr = []\n buckets.set(id, arr)\n }\n arr.push(row)\n }\n const out: Record<string, AxisSummary> = {}\n // Sorted keys for deterministic JSON serialisation.\n for (const id of [...buckets.keys()].sort()) {\n out[id] = summariseRows(buckets.get(id) as Row<Output>[], axisName, labelFor(id))\n }\n return out\n}\n\nexport function buildByAxis<Output>(\n cells: MatrixResult<Output>['cells'],\n axes: MatrixAxis<unknown>[],\n aggregateBy: string[],\n): Record<string, Record<string, AxisSummary>> {\n const rows = flattenRuns(cells)\n const byName = new Map(axes.map((a) => [a.name, a]))\n const byAxis: Record<string, Record<string, AxisSummary>> = {}\n for (const name of aggregateBy) {\n const axis = byName.get(name)\n const labelFor = (id: string): string => {\n if (!axis?.label) return id\n const found = axis.values.find((v) => v.id === id)\n if (!found) return id\n return axis.label(found.value, id)\n }\n byAxis[name] = bucketBy(rows, name, labelFor)\n }\n return byAxis\n}\n","/**\n * N-axis cartesian runner.\n *\n * Expansion order: cartesian over `axes` in declared order, then `reps` as the\n * inner-most dim → `ordinal = (cartIdx * reps) + 
rep`. The returned\n * `cells[]` is sorted by `ordinal` so concurrent execution does not reorder\n * the output.\n *\n * Scheduling is a sliding window of in-flight promises capped at\n * `maxConcurrency`. The window stops admitting new cells when the cost\n * ceiling trips or the abort signal fires; in-flight cells finish.\n */\n\nimport { buildByAxis } from './aggregation'\nimport type {\n CellResult,\n MatrixAxis,\n MatrixCell,\n MatrixResult,\n RunAgentMatrixOptions,\n} from './types'\n\ninterface BaseCell {\n axes: Record<string, { id: string; value: unknown }>\n}\n\nfunction cartesian(axes: MatrixAxis<unknown>[]): BaseCell[] {\n // Empty axes (`values=[]`) collapse the whole product to zero cells. An\n // empty `axes` array yields a single empty-axes cell — degenerate but\n // valid (caller is iterating only reps).\n if (axes.length === 0) return [{ axes: {} }]\n for (const a of axes) if (a.values.length === 0) return []\n const out: BaseCell[] = []\n const idx = new Array(axes.length).fill(0)\n while (true) {\n const slot: Record<string, { id: string; value: unknown }> = {}\n for (let i = 0; i < axes.length; i++) {\n const axis = axes[i] as MatrixAxis<unknown>\n const v = axis.values[idx[i] as number] as { id: string; value: unknown }\n slot[axis.name] = { id: v.id, value: v.value }\n }\n out.push({ axes: slot })\n // Increment like an odometer, left-most axis is fastest.\n let i = 0\n while (i < axes.length) {\n const next = (idx[i] as number) + 1\n const axis = axes[i] as MatrixAxis<unknown>\n if (next < axis.values.length) {\n idx[i] = next\n break\n }\n idx[i] = 0\n i++\n }\n if (i === axes.length) break\n }\n return out\n}\n\nfunction makeMatrixId(): string {\n // Stable id-like string: time + 8 random hex chars. 
Avoids node:crypto\n // import to keep the matrix dep-free.\n const t = Date.now().toString(36)\n let r = ''\n for (let i = 0; i < 8; i++) r += Math.floor(Math.random() * 16).toString(16)\n return `mtx_${t}_${r}`\n}\n\nfunction makeErrorResult<Output>(err: unknown): CellResult<Output> {\n const e = err as { message?: string; name?: string }\n return {\n output: undefined as unknown as Output,\n verdict: { valid: false, score: 0 },\n costUsd: 0,\n durationMs: 0,\n error: {\n message: typeof e?.message === 'string' ? e.message : String(err),\n kind: typeof e?.name === 'string' ? e.name : 'Error',\n },\n }\n}\n\nexport async function runAgentMatrix<Output>(\n opts: RunAgentMatrixOptions<Output>,\n): Promise<MatrixResult<Output>> {\n const startedAt = Date.now()\n const reps = Math.max(1, opts.reps ?? 1)\n const maxConcurrency = Math.max(1, opts.maxConcurrency ?? 4)\n const costCeiling = opts.costCeiling ?? Number.POSITIVE_INFINITY\n const aggregateBy = opts.aggregateBy ?? opts.axes.map((a) => a.name)\n\n const base = cartesian(opts.axes)\n const filtered = opts.filter\n ? base.filter((c) => (opts.filter as (b: BaseCell) => boolean)(c))\n : base\n const filteredOut = base.length - filtered.length\n\n const planned: MatrixCell[] = []\n for (let i = 0; i < filtered.length; i++) {\n for (let r = 0; r < reps; r++) {\n planned.push({\n axes: (filtered[i] as BaseCell).axes,\n rep: r,\n ordinal: i * reps + r,\n })\n }\n }\n\n const cellRecords: Array<{ cell: MatrixCell; runs: CellResult<Output>[] }> = []\n let cumulativeCost = 0\n let costCeilingReached = false\n let runsExecuted = 0\n let cellsUnscheduled = 0\n\n const aborted = (): boolean => opts.signal?.aborted === true\n\n // Per-run abort controller forwards the external signal so cell executors\n // see cancellation. We don't expose it on `MatrixCell` — the signature on\n // `runCell` per the public API is `(cell) => Promise<...>`. 
Executors that\n // need cancellation use the external signal directly via closure.\n\n let inFlight = 0\n let cursor = 0\n let resolveAll: (() => void) | undefined\n const done = new Promise<void>((res) => {\n resolveAll = res\n })\n\n const pump = (): void => {\n while (inFlight < maxConcurrency && cursor < planned.length) {\n if (aborted() || costCeilingReached) {\n // Drain remaining as unscheduled.\n const left = planned.length - cursor\n cellsUnscheduled += left\n cursor = planned.length\n break\n }\n const cell = planned[cursor++] as MatrixCell\n inFlight++\n // Lazily allocate the record so cells appear in `cells[]` in any\n // arrival order; we sort by ordinal at the end.\n const record = { cell, runs: [] as CellResult<Output>[] }\n cellRecords.push(record)\n const promise: Promise<CellResult<Output>> = (async () => {\n try {\n return await opts.runCell(cell)\n } catch (err) {\n return makeErrorResult<Output>(err)\n }\n })()\n promise.then((result) => {\n record.runs.push(result)\n runsExecuted++\n cumulativeCost += result.costUsd\n if (cumulativeCost >= costCeiling && !costCeilingReached) {\n costCeilingReached = true\n // eslint-disable-next-line no-console\n console.warn('[matrix] cost ceiling reached')\n }\n try {\n opts.onCellComplete?.(cell, result)\n } catch {\n // onCellComplete is observational — swallow throws so a noisy\n // callback can't tank the run.\n }\n inFlight--\n if (cursor < planned.length) {\n pump()\n } else if (inFlight === 0) {\n resolveAll?.()\n }\n })\n }\n if (cursor >= planned.length && inFlight === 0) resolveAll?.()\n }\n\n const onAbort = (): void => {\n // External abort: stop scheduling. 
In-flight cells finish; their\n // executors observe `opts.signal.aborted` directly via closure.\n if (cursor < planned.length) {\n cellsUnscheduled += planned.length - cursor\n cursor = planned.length\n }\n if (inFlight === 0) resolveAll?.()\n }\n if (opts.signal) {\n if (opts.signal.aborted) {\n cellsUnscheduled = planned.length\n cursor = planned.length\n resolveAll?.()\n } else {\n opts.signal.addEventListener('abort', onAbort, { once: true })\n }\n }\n\n if (planned.length === 0) {\n resolveAll?.()\n } else {\n pump()\n }\n\n await done\n if (opts.signal) opts.signal.removeEventListener('abort', onAbort)\n\n cellRecords.sort((a, b) => a.cell.ordinal - b.cell.ordinal)\n\n let pass = 0\n let scoreSum = 0\n let totalCost = 0\n let runCount = 0\n for (const { runs } of cellRecords) {\n for (const r of runs) {\n runCount++\n const errored = r.error !== undefined\n if (!errored && r.verdict.valid) pass++\n scoreSum += errored ? 0 : r.verdict.score\n totalCost += r.costUsd\n }\n }\n\n const byAxis = buildByAxis(cellRecords, opts.axes, aggregateBy)\n\n return {\n cells: cellRecords,\n byAxis,\n summary: {\n totalCells: planned.length,\n runsExecuted,\n cellsSkipped: cellsUnscheduled + filteredOut * reps,\n overallPassRate: runCount === 0 ? 0 : pass / runCount,\n overallMeanScore: runCount === 0 ? 
0 : scoreSum / runCount,\n totalCostUsd: totalCost,\n durationMs: Date.now() - startedAt,\n },\n matrixId: makeMatrixId(),\n }\n}\n"],"mappings":";AAeA,SAAS,YAAoB,OAAqD;AAChF,QAAM,OAAsB,CAAC;AAC7B,aAAW,EAAE,MAAM,KAAK,KAAK,OAAO;AAClC,eAAW,UAAU,KAAM,MAAK,KAAK,EAAE,MAAM,OAAO,CAAC;AAAA,EACvD;AACA,SAAO;AACT;AAEA,SAAS,SAAS,QAAkB,GAAmB;AACrD,MAAI,OAAO,WAAW,EAAG,QAAO;AAChC,MAAI,OAAO,WAAW,EAAG,QAAO,OAAO,CAAC;AACxC,QAAM,OAAO,OAAO,SAAS,KAAK;AAClC,QAAM,KAAK,KAAK,MAAM,GAAG;AACzB,QAAM,KAAK,KAAK,KAAK,GAAG;AACxB,MAAI,OAAO,GAAI,QAAO,OAAO,EAAE;AAC/B,QAAM,OAAO,MAAM;AACnB,SAAQ,OAAO,EAAE,KAAgB,IAAI,QAAS,OAAO,EAAE,IAAe;AACxE;AAEO,SAAS,cACd,MACA,UACA,WACa;AACb,MAAI,KAAK,WAAW,GAAG;AACrB,WAAO;AAAA,MACL;AAAA,MACA;AAAA,MACA,OAAO;AAAA,MACP,UAAU;AAAA,MACV,WAAW;AAAA,MACX,UAAU;AAAA,MACV,UAAU;AAAA,MACV,cAAc;AAAA,MACd,gBAAgB;AAAA,IAClB;AAAA,EACF;AACA,MAAI,OAAO;AACX,MAAI,WAAW;AACf,MAAI,UAAU;AACd,MAAI,SAAS;AACb,QAAM,SAAmB,CAAC;AAC1B,aAAW,EAAE,OAAO,KAAK,MAAM;AAC7B,UAAM,UAAU,OAAO,UAAU;AACjC,UAAM,QAAQ,UAAU,IAAI,OAAO,QAAQ;AAC3C,UAAM,QAAQ,CAAC,WAAW,OAAO,QAAQ;AACzC,QAAI,MAAO;AACX,gBAAY;AACZ,WAAO,KAAK,KAAK;AACjB,eAAW,OAAO;AAClB,cAAU,OAAO;AAAA,EACnB;AACA,SAAO,KAAK,CAAC,GAAG,MAAM,IAAI,CAAC;AAC3B,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,OAAO,KAAK;AAAA,IACZ,UAAU,OAAO,KAAK;AAAA,IACtB,WAAW,WAAW,KAAK;AAAA,IAC3B,UAAU,SAAS,QAAQ,GAAG;AAAA,IAC9B,UAAU,SAAS,QAAQ,GAAG;AAAA,IAC9B,cAAc;AAAA,IACd,gBAAgB,SAAS,KAAK;AAAA,EAChC;AACF;AAEA,SAAS,SACP,MACA,UACA,UAC6B;AAC7B,QAAM,UAAU,oBAAI,IAA2B;AAC/C,aAAW,OAAO,MAAM;AACtB,UAAM,OAAO,IAAI,KAAK,KAAK,QAAQ;AACnC,QAAI,CAAC,KAAM;AACX,UAAM,KAAK,KAAK;AAChB,QAAI,MAAM,QAAQ,IAAI,EAAE;AACxB,QAAI,CAAC,KAAK;AACR,YAAM,CAAC;AACP,cAAQ,IAAI,IAAI,GAAG;AAAA,IACrB;AACA,QAAI,KAAK,GAAG;AAAA,EACd;AACA,QAAM,MAAmC,CAAC;AAE1C,aAAW,MAAM,CAAC,GAAG,QAAQ,KAAK,CAAC,EAAE,KAAK,GAAG;AAC3C,QAAI,EAAE,IAAI,cAAc,QAAQ,IAAI,EAAE,GAAoB,UAAU,SAAS,EAAE,CAAC;AAAA,EAClF;AACA,SAAO;AACT;AAEO,SAAS,YACd,OACA,MACA,aAC6C;AAC7C,QAAM,OAAO,YAAY,KAAK;AAC9B,QAAM,SAAS,IAAI,IAAI,KAAK,IAAI,CAAC,MAAM,CAAC,EAAE,MAAM,CAAC,CAAC,CAAC;AACnD,QAAM,SAAsD,CA
AC;AAC7D,aAAW,QAAQ,aAAa;AAC9B,UAAM,OAAO,OAAO,IAAI,IAAI;AAC5B,UAAM,WAAW,CAAC,OAAuB;AACvC,UAAI,CAAC,MAAM,MAAO,QAAO;AACzB,YAAM,QAAQ,KAAK,OAAO,KAAK,CAAC,MAAM,EAAE,OAAO,EAAE;AACjD,UAAI,CAAC,MAAO,QAAO;AACnB,aAAO,KAAK,MAAM,MAAM,OAAO,EAAE;AAAA,IACnC;AACA,WAAO,IAAI,IAAI,SAAS,MAAM,MAAM,QAAQ;AAAA,EAC9C;AACA,SAAO;AACT;;;ACnGA,SAAS,UAAU,MAAyC;AAI1D,MAAI,KAAK,WAAW,EAAG,QAAO,CAAC,EAAE,MAAM,CAAC,EAAE,CAAC;AAC3C,aAAW,KAAK,KAAM,KAAI,EAAE,OAAO,WAAW,EAAG,QAAO,CAAC;AACzD,QAAM,MAAkB,CAAC;AACzB,QAAM,MAAM,IAAI,MAAM,KAAK,MAAM,EAAE,KAAK,CAAC;AACzC,SAAO,MAAM;AACX,UAAM,OAAuD,CAAC;AAC9D,aAASA,KAAI,GAAGA,KAAI,KAAK,QAAQA,MAAK;AACpC,YAAM,OAAO,KAAKA,EAAC;AACnB,YAAM,IAAI,KAAK,OAAO,IAAIA,EAAC,CAAW;AACtC,WAAK,KAAK,IAAI,IAAI,EAAE,IAAI,EAAE,IAAI,OAAO,EAAE,MAAM;AAAA,IAC/C;AACA,QAAI,KAAK,EAAE,MAAM,KAAK,CAAC;AAEvB,QAAI,IAAI;AACR,WAAO,IAAI,KAAK,QAAQ;AACtB,YAAM,OAAQ,IAAI,CAAC,IAAe;AAClC,YAAM,OAAO,KAAK,CAAC;AACnB,UAAI,OAAO,KAAK,OAAO,QAAQ;AAC7B,YAAI,CAAC,IAAI;AACT;AAAA,MACF;AACA,UAAI,CAAC,IAAI;AACT;AAAA,IACF;AACA,QAAI,MAAM,KAAK,OAAQ;AAAA,EACzB;AACA,SAAO;AACT;AAEA,SAAS,eAAuB;AAG9B,QAAM,IAAI,KAAK,IAAI,EAAE,SAAS,EAAE;AAChC,MAAI,IAAI;AACR,WAAS,IAAI,GAAG,IAAI,GAAG,IAAK,MAAK,KAAK,MAAM,KAAK,OAAO,IAAI,EAAE,EAAE,SAAS,EAAE;AAC3E,SAAO,OAAO,CAAC,IAAI,CAAC;AACtB;AAEA,SAAS,gBAAwB,KAAkC;AACjE,QAAM,IAAI;AACV,SAAO;AAAA,IACL,QAAQ;AAAA,IACR,SAAS,EAAE,OAAO,OAAO,OAAO,EAAE;AAAA,IAClC,SAAS;AAAA,IACT,YAAY;AAAA,IACZ,OAAO;AAAA,MACL,SAAS,OAAO,GAAG,YAAY,WAAW,EAAE,UAAU,OAAO,GAAG;AAAA,MAChE,MAAM,OAAO,GAAG,SAAS,WAAW,EAAE,OAAO;AAAA,IAC/C;AAAA,EACF;AACF;AAEA,eAAsB,eACpB,MAC+B;AAC/B,QAAM,YAAY,KAAK,IAAI;AAC3B,QAAM,OAAO,KAAK,IAAI,GAAG,KAAK,QAAQ,CAAC;AACvC,QAAM,iBAAiB,KAAK,IAAI,GAAG,KAAK,kBAAkB,CAAC;AAC3D,QAAM,cAAc,KAAK,eAAe,OAAO;AAC/C,QAAM,cAAc,KAAK,eAAe,KAAK,KAAK,IAAI,CAAC,MAAM,EAAE,IAAI;AAEnE,QAAM,OAAO,UAAU,KAAK,IAAI;AAChC,QAAM,WAAW,KAAK,SAClB,KAAK,OAAO,CAAC,MAAO,KAAK,OAAoC,CAAC,CAAC,IAC/D;AACJ,QAAM,cAAc,KAAK,SAAS,SAAS;AAE3C,QAAM,UAAwB,CAAC;AAC/B,WAAS,IAAI,GAAG,IAAI,SAAS,QAAQ,KAAK;AACxC,aAAS,IAAI,GAAG,IAAI,MAAM,KAAK;AAC7B,cAAQ,KAAK;AAAA,QACX,M
AAO,SAAS,CAAC,EAAe;AAAA,QAChC,KAAK;AAAA,QACL,SAAS,IAAI,OAAO;AAAA,MACtB,CAAC;AAAA,IACH;AAAA,EACF;AAEA,QAAM,cAAuE,CAAC;AAC9E,MAAI,iBAAiB;AACrB,MAAI,qBAAqB;AACzB,MAAI,eAAe;AACnB,MAAI,mBAAmB;AAEvB,QAAM,UAAU,MAAe,KAAK,QAAQ,YAAY;AAOxD,MAAI,WAAW;AACf,MAAI,SAAS;AACb,MAAI;AACJ,QAAM,OAAO,IAAI,QAAc,CAAC,QAAQ;AACtC,iBAAa;AAAA,EACf,CAAC;AAED,QAAM,OAAO,MAAY;AACvB,WAAO,WAAW,kBAAkB,SAAS,QAAQ,QAAQ;AAC3D,UAAI,QAAQ,KAAK,oBAAoB;AAEnC,cAAM,OAAO,QAAQ,SAAS;AAC9B,4BAAoB;AACpB,iBAAS,QAAQ;AACjB;AAAA,MACF;AACA,YAAM,OAAO,QAAQ,QAAQ;AAC7B;AAGA,YAAM,SAAS,EAAE,MAAM,MAAM,CAAC,EAA0B;AACxD,kBAAY,KAAK,MAAM;AACvB,YAAM,WAAwC,YAAY;AACxD,YAAI;AACF,iBAAO,MAAM,KAAK,QAAQ,IAAI;AAAA,QAChC,SAAS,KAAK;AACZ,iBAAO,gBAAwB,GAAG;AAAA,QACpC;AAAA,MACF,GAAG;AACH,cAAQ,KAAK,CAAC,WAAW;AACvB,eAAO,KAAK,KAAK,MAAM;AACvB;AACA,0BAAkB,OAAO;AACzB,YAAI,kBAAkB,eAAe,CAAC,oBAAoB;AACxD,+BAAqB;AAErB,kBAAQ,KAAK,+BAA+B;AAAA,QAC9C;AACA,YAAI;AACF,eAAK,iBAAiB,MAAM,MAAM;AAAA,QACpC,QAAQ;AAAA,QAGR;AACA;AACA,YAAI,SAAS,QAAQ,QAAQ;AAC3B,eAAK;AAAA,QACP,WAAW,aAAa,GAAG;AACzB,uBAAa;AAAA,QACf;AAAA,MACF,CAAC;AAAA,IACH;AACA,QAAI,UAAU,QAAQ,UAAU,aAAa,EAAG,cAAa;AAAA,EAC/D;AAEA,QAAM,UAAU,MAAY;AAG1B,QAAI,SAAS,QAAQ,QAAQ;AAC3B,0BAAoB,QAAQ,SAAS;AACrC,eAAS,QAAQ;AAAA,IACnB;AACA,QAAI,aAAa,EAAG,cAAa;AAAA,EACnC;AACA,MAAI,KAAK,QAAQ;AACf,QAAI,KAAK,OAAO,SAAS;AACvB,yBAAmB,QAAQ;AAC3B,eAAS,QAAQ;AACjB,mBAAa;AAAA,IACf,OAAO;AACL,WAAK,OAAO,iBAAiB,SAAS,SAAS,EAAE,MAAM,KAAK,CAAC;AAAA,IAC/D;AAAA,EACF;AAEA,MAAI,QAAQ,WAAW,GAAG;AACxB,iBAAa;AAAA,EACf,OAAO;AACL,SAAK;AAAA,EACP;AAEA,QAAM;AACN,MAAI,KAAK,OAAQ,MAAK,OAAO,oBAAoB,SAAS,OAAO;AAEjE,cAAY,KAAK,CAAC,GAAG,MAAM,EAAE,KAAK,UAAU,EAAE,KAAK,OAAO;AAE1D,MAAI,OAAO;AACX,MAAI,WAAW;AACf,MAAI,YAAY;AAChB,MAAI,WAAW;AACf,aAAW,EAAE,KAAK,KAAK,aAAa;AAClC,eAAW,KAAK,MAAM;AACpB;AACA,YAAM,UAAU,EAAE,UAAU;AAC5B,UAAI,CAAC,WAAW,EAAE,QAAQ,MAAO;AACjC,kBAAY,UAAU,IAAI,EAAE,QAAQ;AACpC,mBAAa,EAAE;AAAA,IACjB;AAAA,EACF;AAEA,QAAM,SAAS,YAAY,aAAa,KAAK,MAAM,WAAW;AAE9D,SAAO;AAAA,IACL,OAAO;AAAA,IACP;AAAA,IACA,SAAS;AAAA,MACP,YAAY,QAAQ;AAAA,MACpB;AAAA,MACA,cAAc,mBAAmB,cAAc;AAAA
,MAC/C,iBAAiB,aAAa,IAAI,IAAI,OAAO;AAAA,MAC7C,kBAAkB,aAAa,IAAI,IAAI,WAAW;AAAA,MAClD,cAAc;AAAA,MACd,YAAY,KAAK,IAAI,IAAI;AAAA,IAC3B;AAAA,IACA,UAAU,aAAa;AAAA,EACzB;AACF;","names":["i"]}
|
|
@@ -0,0 +1,305 @@
|
|
|
1
|
+
import {
|
|
2
|
+
confidenceInterval
|
|
3
|
+
} from "./chunk-WP7SY7AI.js";
|
|
4
|
+
|
|
5
|
+
// src/campaign/run-campaign.ts
|
|
6
|
+
import { createHash } from "crypto";
|
|
7
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
8
|
+
import { join } from "path";
|
|
9
|
+
/**
 * Runs every scenario `reps` times through `executeCell` using a fixed number
 * of concurrent lanes, then aggregates the collected cells.
 *
 * Each cell gets id `"<scenario.id>:<rep>"` and seed `seed + cellIndex`, where
 * `cellIndex` counts cells in schedule order. Once the running cost total
 * reaches `opts.costCeiling`, remaining cells are recorded through
 * `skippedCell(slot, "cost_ceiling_reached")` instead of being executed.
 * Successful cells are additionally handed to `captureToStore` when
 * `opts.labeledStore` is set and is not "off"; a rejected capture is logged
 * and does not fail the campaign.
 *
 * `maxConcurrency` is clamped to at least 1 and capped at the number of
 * scheduled cells. Before the clamp, `0`, a negative value or `NaN` started
 * no lanes at all, returning an empty campaign without running anything, and
 * `Infinity` never left the lane-spawning loop.
 *
 * @param {object} opts
 * @param {string} opts.runDir - Output directory; created if missing.
 * @param {Array<{id: string, kind: *}>} opts.scenarios
 * @param {Function} opts.dispatch - Its `name` feeds the manifest hash.
 * @param {number} [opts.seed=42]
 * @param {number} [opts.reps=1]
 * @param {boolean} [opts.resumable=true]
 * @param {number} [opts.maxConcurrency=2]
 * @param {() => Date} [opts.now] - Clock override.
 * @param {Array} [opts.judges=[]]
 * @param {number} [opts.costCeiling]
 * @param {*} [opts.labeledStore]
 * @param {Function} [opts.buildTraceWriter]
 * @returns {Promise<object>} Campaign result: manifest hash, seed, timing,
 *   cells sorted by `cellId`, aggregates, `runDir`, artifacts and scenario refs.
 */
async function runCampaign(opts) {
  const seed = opts.seed ?? 42;
  const reps = opts.reps ?? 1;
  const resumable = opts.resumable ?? true;
  const requestedConcurrency = opts.maxConcurrency ?? 2;
  // `>= 1` is false for NaN, 0 and negatives, so all of those fall back to 1.
  const maxConcurrency = requestedConcurrency >= 1 ? requestedConcurrency : 1;
  const now = opts.now ?? (() => new Date());
  const judges = opts.judges ?? [];

  if (!existsSync(opts.runDir)) mkdirSync(opts.runDir, { recursive: true });

  const manifestHash = computeManifestHash({
    scenarios: opts.scenarios,
    judges,
    dispatchRef: opts.dispatch.name || "anonymous",
    seed,
    reps,
  });
  const startedAt = now();

  const cells = [];
  const artifactsByPath = {};

  // Flat schedule: scenarios in declared order, reps inner-most.
  const schedule = [];
  let cellIndex = 0;
  for (const scenario of opts.scenarios) {
    for (let rep = 0; rep < reps; rep++) {
      const cellId = `${scenario.id}:${rep}`;
      const cellSeed = seed + cellIndex;
      schedule.push({ scenario, rep, cellId, cellSeed });
      cellIndex += 1;
    }
  }

  let totalCostUsd = 0;
  let costCeilingReached = false;
  // The signal is handed to every executeCell call; nothing in this function
  // aborts it.
  const abortController = new AbortController();
  let nextIdx = 0;

  // One lane: repeatedly claim the next schedule slot until none are left.
  const runLane = async () => {
    while (true) {
      const myIdx = nextIdx++;
      if (myIdx >= schedule.length) return;
      const slot = schedule[myIdx];
      if (costCeilingReached) {
        cells.push(skippedCell(slot, "cost_ceiling_reached"));
        continue;
      }
      const result = await executeCell({
        slot,
        opts,
        manifestHash,
        resumable,
        now,
        buildTraceWriter: opts.buildTraceWriter ?? defaultBuildTraceWriter,
        signal: abortController.signal,
      });
      cells.push(result.cell);
      totalCostUsd += result.cell.costUsd;
      Object.assign(artifactsByPath, result.artifactsByPath);
      if (opts.costCeiling !== void 0 && totalCostUsd >= opts.costCeiling) {
        costCeilingReached = true;
      }
      if (opts.labeledStore && opts.labeledStore !== "off" && !result.cell.error) {
        await captureToStore({
          store: opts.labeledStore,
          cell: result.cell,
          scenario: slot.scenario,
          opts,
          now,
        }).catch((err) => {
          console.warn(
            `[runCampaign] capture failed for ${result.cell.cellId}: ${err instanceof Error ? err.message : String(err)}`
          );
        });
      }
    }
  };

  // More lanes than scheduled cells would only spin up and exit immediately.
  const laneCount = Math.min(maxConcurrency, schedule.length);
  const lanes = [];
  for (let i = 0; i < laneCount; i++) lanes.push(runLane());
  await Promise.all(lanes);

  const endedAt = now();
  cells.sort((a, b) => a.cellId.localeCompare(b.cellId));
  const aggregates = computeAggregates(cells, judges, seed);

  return {
    manifestHash,
    seed,
    startedAt: startedAt.toISOString(),
    endedAt: endedAt.toISOString(),
    durationMs: endedAt.getTime() - startedAt.getTime(),
    cells,
    aggregates,
    runDir: opts.runDir,
    artifactsByPath,
    scenarios: opts.scenarios.map((s) => ({ id: s.id, kind: s.kind })),
  };
}
|
|
107
|
+
/**
 * Execute one scheduled cell: dispatch the scenario, score the artifact with
 * the applicable judges, flush the trace, and (when resumable) cache the result.
 *
 * The cache file `<cellDir>/cached-result.json` records the manifest hash it
 * was produced under. A cached result is reused only when its `cellId` matches
 * and its recorded hash equals `args.manifestHash`; cache files without a
 * recorded hash (written by earlier versions) are still accepted.
 *
 * @param {object} args
 * @param {object} args.slot - `{ scenario, rep, cellId, cellSeed }`.
 * @param {object} args.opts - Campaign options (`runDir`, `dispatch`, optional `judges`).
 * @param {string} args.manifestHash - Hash of the campaign manifest.
 * @param {boolean} args.resumable - Whether to read and write the per-cell cache.
 * @param {Function} args.buildTraceWriter - Factory called as `(cellId, cellDir)`.
 * @param {AbortSignal} args.signal - Forwarded to dispatch via the context.
 * @returns {Promise<{cell: object, artifactsByPath: object}>}
 */
async function executeCell(args) {
  const { slot, opts, manifestHash, resumable } = args;
  // Cell ids contain ":" (and possibly other characters unsafe in paths).
  const cellDir = join(opts.runDir, slot.cellId.replace(/[^a-zA-Z0-9_-]/g, "_"));
  if (!existsSync(cellDir)) mkdirSync(cellDir, { recursive: true });
  const cachePath = join(cellDir, "cached-result.json");
  if (resumable && existsSync(cachePath)) {
    try {
      const { manifestHash: cachedHash, ...cachedCell } = JSON.parse(readFileSync(cachePath, "utf8"));
      // A result produced under a different manifest (seed, reps, judges,
      // scenarios, dispatch) is stale and must not be reused.
      const sameManifest = cachedHash === void 0 || cachedHash === manifestHash;
      if (cachedCell.cellId === slot.cellId && sameManifest) {
        return { cell: { ...cachedCell, cached: true }, artifactsByPath: {} };
      }
    } catch {
      // Best-effort cache: an unreadable or corrupt file means we re-execute.
    }
  }
  const startMs = Date.now();
  const trace = args.buildTraceWriter(slot.cellId, cellDir);
  const artifactsByPath = {};
  const artifacts = {
    // Writes `content` under the cell directory and records the absolute path.
    async write(path, content) {
      const fullPath = join(cellDir, path);
      const dir = join(fullPath, "..");
      if (!existsSync(dir)) mkdirSync(dir, { recursive: true });
      writeFileSync(fullPath, content);
      artifactsByPath[`${slot.cellId}/${path}`] = fullPath;
      return fullPath;
    },
    async writeJson(path, value) {
      return artifacts.write(path, JSON.stringify(value, null, 2));
    }
  };
  let costSoFar = 0;
  const cost = {
    // Adds to the running cell cost and emits a `cost.<source>` span.
    observe(amount, source) {
      costSoFar += amount;
      trace.span(`cost.${source}`, { amountUsd: amount }).end();
    },
    current() {
      return costSoFar;
    }
  };
  const ctx = {
    cellId: slot.cellId,
    rep: slot.rep,
    seed: slot.cellSeed,
    signal: args.signal,
    trace,
    artifacts,
    cost
  };
  let artifact;
  let errorMessage;
  try {
    artifact = await opts.dispatch(slot.scenario, ctx);
  } catch (err) {
    // A failing dispatch is recorded on the cell instead of aborting the campaign.
    errorMessage = err instanceof Error ? err.message : String(err);
  }
  const judgeScores = {};
  if (artifact !== void 0) {
    for (const judge of opts.judges ?? []) {
      if (judge.appliesTo && !judge.appliesTo(slot.scenario)) continue;
      try {
        judgeScores[judge.name] = await runJudgeCell(judge, { artifact, scenario: slot.scenario });
      } catch (err) {
        // A failing judge yields a zero score with the reason in `notes`.
        judgeScores[judge.name] = {
          dimensions: {},
          composite: 0,
          notes: `judge failed: ${err instanceof Error ? err.message : String(err)}`
        };
      }
    }
  }
  await trace.flush();
  const cell = {
    cellId: slot.cellId,
    scenarioId: slot.scenario.id,
    rep: slot.rep,
    artifact: artifact ?? null,
    judgeScores,
    costUsd: costSoFar,
    durationMs: Date.now() - startMs,
    seed: slot.cellSeed,
    cached: false,
    error: errorMessage
  };
  // Only successful cells are cached, so failed cells are retried on the next run.
  if (!errorMessage && resumable) {
    writeFileSync(cachePath, JSON.stringify({ ...cell, manifestHash }));
  }
  return { cell, artifactsByPath };
}
|
|
196
|
+
/**
 * Phase-1 placeholder judge runner.
 *
 * Both arguments are ignored; the function always resolves to an empty,
 * zero-composite score tagged with the note "phase-1-stub".
 *
 * @param {object} _judge - Unused.
 * @param {object} _input - Unused.
 * @returns {Promise<{dimensions: object, composite: number, notes: string}>}
 */
async function runJudgeCell(_judge, _input) {
  const placeholderScore = {
    dimensions: {},
    composite: 0,
    notes: "phase-1-stub"
  };
  return placeholderScore;
}
|
|
199
|
+
/**
 * Build the default per-cell trace writer. Finished spans are buffered in
 * memory and written to `<dir>/spans.jsonl` (one JSON object per line) on
 * `flush()`.
 *
 * @param {string} cellId - Stamped onto every span record.
 * @param {string} dir - Directory that receives `spans.jsonl`.
 * @returns {{span: Function, flush: Function}} Trace writer.
 */
function defaultBuildTraceWriter(cellId, dir) {
  const spans = [];
  return {
    /**
     * Start a span. The returned handle records the span when `end()` is called;
     * spans that are never ended are not written.
     */
    span(name, attributes) {
      const startMs = Date.now();
      const record = { name, cellId, startMs, ...attributes ?? {} };
      let ended = false;
      return {
        end(endAttrs) {
          // Idempotent: a second end() (e.g. from a finally block) must not
          // emit the same span twice.
          if (ended) return;
          ended = true;
          record.durationMs = Date.now() - startMs;
          if (endAttrs) Object.assign(record, endAttrs);
          spans.push(record);
        },
        setAttribute(key, value) {
          record[key] = value;
        }
      };
    },
    /** Overwrite `spans.jsonl` with every span ended so far. */
    async flush() {
      const path = join(dir, "spans.jsonl");
      writeFileSync(path, spans.map((s) => JSON.stringify(s)).join("\n"));
    }
  };
}
|
|
223
|
+
/**
 * Build the result record for a cell that was never executed.
 *
 * The record carries no artifact, no judge scores, zero cost and zero
 * duration; its `error` is the reason prefixed with "skipped: ".
 *
 * @param {object} slot - Scheduled cell (`cellId`, `scenario`, `rep`, `cellSeed`).
 * @param {string} reason - Why the cell was skipped.
 * @returns {object} Cell result marked as skipped.
 */
function skippedCell(slot, reason) {
  const { cellId, scenario, rep, cellSeed } = slot;
  const placeholder = {
    cellId,
    scenarioId: scenario.id,
    rep,
    artifact: null,
    judgeScores: {},
    costUsd: 0,
    durationMs: 0,
    seed: cellSeed,
    cached: false,
    error: `skipped: ${reason}`
  };
  return placeholder;
}
|
|
237
|
+
/**
 * Forward one finished cell to the labeled store via `store.observe`.
 *
 * The observation combines the scenario, the cell's artifact and judge
 * scores, the configured capture source (default "eval-run"), the source
 * version hash (default "unknown"), the current timestamp from `args.now()`,
 * and a fixed redaction status of "raw".
 *
 * @param {object} args
 * @param {object} args.store - Object exposing `observe(record)`.
 * @param {object} args.cell - Cell result providing `artifact` and `judgeScores`.
 * @param {object} args.scenario - Scenario the cell was run for.
 * @param {object} args.opts - Campaign options (`captureSource`, `captureSourceVersionHash`).
 * @param {Function} args.now - Clock returning a `Date`.
 * @returns {Promise<void>}
 */
async function captureToStore(args) {
  const { cell, opts } = args;
  const observation = {
    scenario: args.scenario,
    artifact: cell.artifact,
    judgeScores: cell.judgeScores,
    source: opts.captureSource ?? "eval-run",
    sourceVersionHash: opts.captureSourceVersionHash ?? "unknown",
    capturedAt: args.now().toISOString(),
    redactionStatus: "raw"
  };
  await args.store.observe(observation);
}
|
|
248
|
+
/**
 * Derive the campaign manifest hash.
 *
 * Only a reduced, canonical view of the input is hashed: each scenario's
 * `id` and `kind`, each judge's `name` and dimension keys, the dispatch
 * reference, the seed and the rep count. The view is serialized with
 * `JSON.stringify` and digested with SHA-256.
 *
 * @param {object} input
 * @param {Array<object>} input.scenarios
 * @param {Array<object>} input.judges - Each judge needs `name` and `dimensions` (items with `key`).
 * @param {string} input.dispatchRef
 * @param {number} input.seed
 * @param {number} input.reps
 * @returns {string} Hex-encoded SHA-256 digest.
 */
function computeManifestHash(input) {
  const scenarios = input.scenarios.map(({ id, kind }) => ({ id, kind }));
  const judges = input.judges.map((judge) => ({
    name: judge.name,
    dims: judge.dimensions.map((dimension) => dimension.key)
  }));
  // Key order matters: it determines the serialized string and thus the digest.
  const canonical = {
    scenarios,
    judges,
    dispatch: input.dispatchRef,
    seed: input.seed,
    reps: input.reps
  };
  const hasher = createHash("sha256");
  hasher.update(JSON.stringify(canonical));
  return hasher.digest("hex");
}
|
|
258
|
+
/**
 * Summarize a finished campaign.
 *
 * - `byJudge`: for each configured judge, `aggregate` over the composite
 *   scores of every cell that has a score under that judge's name.
 * - `byScenario`: for each scenario, `aggregate` over the per-cell mean of all
 *   judge composites (cells without any judge score are left out).
 * - Totals: summed cost, plus counts of executed (no error), skipped (error
 *   starting with "skipped:"), cached, and failed (any other error) cells.
 *
 * @param {Array<object>} cells - Cell results.
 * @param {Array<object>} judges - Judge configs (only `name` is read).
 * @param {number} seed - Passed through to `aggregate`.
 * @returns {object} Aggregates object.
 */
function computeAggregates(cells, judges, seed) {
  const byJudge = {};
  for (const { name } of judges) {
    const judgeComposites = [];
    for (const cell of cells) {
      const score = cell.judgeScores[name];
      if (score === void 0) continue;
      judgeComposites.push(score.composite);
    }
    byJudge[name] = aggregate(judgeComposites, seed);
  }

  // Group the per-cell mean composite by scenario, in first-seen order.
  const meansByScenario = /* @__PURE__ */ new Map();
  for (const cell of cells) {
    const cellComposites = Object.values(cell.judgeScores).map((score) => score.composite);
    if (cellComposites.length === 0) continue;
    const cellMean = cellComposites.reduce((sum, value) => sum + value, 0) / cellComposites.length;
    if (!meansByScenario.has(cell.scenarioId)) meansByScenario.set(cell.scenarioId, []);
    meansByScenario.get(cell.scenarioId).push(cellMean);
  }

  const byScenario = {};
  meansByScenario.forEach((samples, scenarioId) => {
    const { mean, ci95, n } = aggregate(samples, seed);
    byScenario[scenarioId] = { meanComposite: mean, ci95, n };
  });

  // Single pass over the cells for the cost total and status counters.
  let totalCostUsd = 0;
  let cellsExecuted = 0;
  let cellsSkipped = 0;
  let cellsCached = 0;
  let cellsFailed = 0;
  for (const cell of cells) {
    totalCostUsd += cell.costUsd;
    if (cell.cached) cellsCached += 1;
    if (!cell.error) {
      cellsExecuted += 1;
    } else if (cell.error.startsWith("skipped:")) {
      cellsSkipped += 1;
    } else {
      cellsFailed += 1;
    }
  }

  return {
    byJudge,
    byScenario,
    totalCostUsd,
    cellsExecuted,
    cellsSkipped,
    cellsCached,
    cellsFailed
  };
}
|
|
292
|
+
/**
 * Summary statistics for a list of numeric samples.
 *
 * Returns the mean, the sample standard deviation (divisor `n - 1`, floored
 * at 1 so a single sample yields 0), and a 95% interval obtained from
 * `confidenceInterval(samples, 0.95, { seed, resamples: 1000 })`.
 * An empty input short-circuits to all zeros without calling
 * `confidenceInterval`.
 *
 * @param {number[]} samples
 * @param {number} seed - Forwarded to `confidenceInterval`.
 * @returns {{mean: number, stdev: number, ci95: [number, number], n: number}}
 */
function aggregate(samples, seed) {
  const n = samples.length;
  if (n === 0) {
    return { mean: 0, stdev: 0, ci95: [0, 0], n: 0 };
  }
  const total = samples.reduce((acc, value) => acc + value, 0);
  const mean = total / n;
  const sumSquaredDeviations = samples.reduce((acc, value) => acc + (value - mean) ** 2, 0);
  const divisor = Math.max(1, n - 1);
  const stdev = Math.sqrt(sumSquaredDeviations / divisor);
  const interval = confidenceInterval(samples, 0.95, { seed, resamples: 1e3 });
  return { mean, stdev, ci95: [interval.lower, interval.upper], n };
}
|
|
301
|
+
|
|
302
|
+
export {
|
|
303
|
+
runCampaign
|
|
304
|
+
};
|
|
305
|
+
//# sourceMappingURL=chunk-TMXPFWC7.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/campaign/run-campaign.ts"],"sourcesContent":["/**\n * @experimental\n *\n * `runCampaign` — Pass A substrate primitive. ONE function that orchestrates\n * scenarios → dispatch → artifacts → judges → aggregates, with full\n * reproducibility (seed + manifest hash), cell-level resumability, bootstrap\n * CIs, and the `LabeledScenarioStore` capture flywheel.\n *\n * Improvement loops (optimizer / gate / autoOnPromote) ride on top of this\n * primitive but live in `presets/run-improvement-loop.ts`. This file keeps\n * the core orchestrator minimal — Phase 1 of the Pass A track.\n */\n\nimport { createHash } from 'node:crypto'\nimport { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs'\nimport { join } from 'node:path'\nimport { confidenceInterval } from '../statistics'\nimport type {\n CampaignAggregates,\n CampaignArtifactWriter,\n CampaignCellResult,\n CampaignCostMeter,\n CampaignResult,\n CampaignTraceWriter,\n DispatchContext,\n DispatchFn,\n JudgeAggregate,\n JudgeConfig,\n JudgeScore,\n LabeledScenarioStore,\n Scenario,\n ScenarioAggregate,\n TraceSpan,\n} from './types'\n\nexport interface RunCampaignOptions<TScenario extends Scenario, TArtifact> {\n scenarios: TScenario[]\n dispatch: DispatchFn<TScenario, TArtifact>\n judges?: JudgeConfig<TArtifact, TScenario>[]\n /** Required for reproducibility. Default 42. */\n seed?: number\n /** Per-scenario replicates for CI bands. Default 1; raise to 5+ for\n * bootstrap-tight intervals on critical eval. */\n reps?: number\n /** When true (default), completed cells are cached by\n * (manifestHash, scenarioId, rep, generation). Re-runs skip cached cells. */\n resumable?: boolean\n /** Optional store — when present, every artifact + judge score is captured\n * with the configured `captureSource`. Capture is default ON; pass `'off'`\n * to disable. 
*/\n labeledStore?: LabeledScenarioStore | 'off'\n captureSource?: 'production-trace' | 'eval-run' | 'manual' | 'red-team' | 'synthetic'\n captureSourceVersionHash?: string\n /** Wall-clock cost cap across all cells. Cells beyond ceiling are skipped. */\n costCeiling?: number\n /** Max concurrent cells. Default 2. */\n maxConcurrency?: number\n /** Required: where artifacts + traces land. */\n runDir: string\n /** Tracing posture. Default is the substrate's `FileSystemTraceStore` rooted\n * at `<runDir>/traces/`. `'off'` disables capture entirely — substrate\n * refuses this when the caller wires `autoOnPromote !== 'none'`. */\n tracing?: 'on' | 'off'\n /** Test seam — override the wall clock for deterministic tests. */\n now?: () => Date\n /** Test seam — override per-cell trace writer factory. */\n buildTraceWriter?: (cellId: string, dir: string) => CampaignTraceWriter\n}\n\nexport async function runCampaign<TScenario extends Scenario, TArtifact>(\n opts: RunCampaignOptions<TScenario, TArtifact>,\n): Promise<CampaignResult<TArtifact, TScenario>> {\n const seed = opts.seed ?? 42\n const reps = opts.reps ?? 1\n const resumable = opts.resumable ?? true\n const maxConcurrency = opts.maxConcurrency ?? 2\n const now = opts.now ?? (() => new Date())\n const judges = opts.judges ?? 
[]\n\n if (!existsSync(opts.runDir)) mkdirSync(opts.runDir, { recursive: true })\n\n const manifestHash = computeManifestHash({\n scenarios: opts.scenarios,\n judges: judges as unknown as JudgeConfig<unknown>[],\n dispatchRef: opts.dispatch.name || 'anonymous',\n seed,\n reps,\n })\n\n const startedAt = now()\n const cells: CampaignCellResult<TArtifact>[] = []\n const artifactsByPath: Record<string, string> = {}\n\n // Build the cell schedule (scenario × rep).\n const schedule: Array<{ scenario: TScenario; rep: number; cellId: string; cellSeed: number }> = []\n let cellIndex = 0\n for (const scenario of opts.scenarios) {\n for (let rep = 0; rep < reps; rep++) {\n const cellId = `${scenario.id}:${rep}`\n const cellSeed = seed + cellIndex\n schedule.push({ scenario, rep, cellId, cellSeed })\n cellIndex += 1\n }\n }\n\n // Concurrency-limited execution.\n let totalCostUsd = 0\n let costCeilingReached = false\n const abortController = new AbortController()\n // Concurrency lanes that drain the cell schedule. Named \"lanes\" — not\n // \"workers\" — to avoid clashing with the taxonomy's worker (= the agent\n // harness in a sandbox, invoked behind `dispatch`). See loop-taxonomy.md.\n const lanes: Promise<void>[] = []\n let nextIdx = 0\n const cellsRef = cells\n\n for (let i = 0; i < maxConcurrency; i++) {\n lanes.push(\n (async () => {\n while (true) {\n const myIdx = nextIdx++\n if (myIdx >= schedule.length) return\n const slot = schedule[myIdx]!\n if (costCeilingReached) {\n cellsRef.push(skippedCell(slot, 'cost_ceiling_reached'))\n continue\n }\n const result = await executeCell({\n slot,\n opts,\n manifestHash,\n resumable,\n now,\n buildTraceWriter: opts.buildTraceWriter ?? 
defaultBuildTraceWriter,\n signal: abortController.signal,\n })\n cellsRef.push(result.cell)\n totalCostUsd += result.cell.costUsd\n Object.assign(artifactsByPath, result.artifactsByPath)\n if (opts.costCeiling !== undefined && totalCostUsd >= opts.costCeiling) {\n costCeilingReached = true\n }\n // Capture into LabeledScenarioStore unless explicitly disabled.\n if (opts.labeledStore && opts.labeledStore !== 'off' && !result.cell.error) {\n await captureToStore({\n store: opts.labeledStore,\n cell: result.cell,\n scenario: slot.scenario,\n opts,\n now,\n }).catch((err) => {\n // Capture failures are non-fatal — log but don't crash the campaign.\n // (Trace would normally land here.)\n console.warn(\n `[runCampaign] capture failed for ${result.cell.cellId}: ${err instanceof Error ? err.message : String(err)}`,\n )\n })\n }\n }\n })(),\n )\n }\n await Promise.all(lanes)\n\n const endedAt = now()\n cellsRef.sort((a, b) => a.cellId.localeCompare(b.cellId))\n\n const aggregates = computeAggregates(\n cellsRef,\n judges as unknown as JudgeConfig<TArtifact>[],\n seed,\n )\n\n return {\n manifestHash,\n seed,\n startedAt: startedAt.toISOString(),\n endedAt: endedAt.toISOString(),\n durationMs: endedAt.getTime() - startedAt.getTime(),\n cells: cellsRef,\n aggregates,\n runDir: opts.runDir,\n artifactsByPath,\n scenarios: opts.scenarios.map((s) => ({ id: s.id, kind: s.kind })),\n }\n}\n\n// ── Internals ─────────────────────────────────────────────────────────\n\ninterface ExecuteCellArgs<TScenario extends Scenario, TArtifact> {\n slot: { scenario: TScenario; rep: number; cellId: string; cellSeed: number }\n opts: RunCampaignOptions<TScenario, TArtifact>\n manifestHash: string\n resumable: boolean\n now: () => Date\n buildTraceWriter: (cellId: string, dir: string) => CampaignTraceWriter\n signal: AbortSignal\n}\n\nasync function executeCell<TScenario extends Scenario, TArtifact>(\n args: ExecuteCellArgs<TScenario, TArtifact>,\n): Promise<{ cell: 
CampaignCellResult<TArtifact>; artifactsByPath: Record<string, string> }> {\n const cellDir = join(args.opts.runDir, args.slot.cellId.replace(/[^a-zA-Z0-9_-]/g, '_'))\n if (!existsSync(cellDir)) mkdirSync(cellDir, { recursive: true })\n\n // Resumability: cache key = (manifestHash, scenarioId, rep)\n const cachePath = join(cellDir, 'cached-result.json')\n if (args.resumable && existsSync(cachePath)) {\n try {\n const cached = JSON.parse(readFileSync(cachePath, 'utf8')) as CampaignCellResult<TArtifact>\n if (cached.cellId === args.slot.cellId) {\n return { cell: { ...cached, cached: true }, artifactsByPath: {} }\n }\n } catch {\n // Corrupt cache — fall through to re-run.\n }\n }\n\n const startMs = Date.now()\n const trace = args.buildTraceWriter(args.slot.cellId, cellDir)\n const artifactsByPath: Record<string, string> = {}\n const artifacts: CampaignArtifactWriter = {\n async write(path, content) {\n const fullPath = join(cellDir, path)\n const dir = join(fullPath, '..')\n if (!existsSync(dir)) mkdirSync(dir, { recursive: true })\n writeFileSync(fullPath, content as Uint8Array)\n artifactsByPath[`${args.slot.cellId}/${path}`] = fullPath\n return fullPath\n },\n async writeJson(path, value) {\n return artifacts.write(path, JSON.stringify(value, null, 2))\n },\n }\n let costSoFar = 0\n const cost: CampaignCostMeter = {\n observe(amount, source) {\n costSoFar += amount\n trace.span(`cost.${source}`, { amountUsd: amount }).end()\n },\n current() {\n return costSoFar\n },\n }\n\n const ctx: DispatchContext = {\n cellId: args.slot.cellId,\n rep: args.slot.rep,\n seed: args.slot.cellSeed,\n signal: args.signal,\n trace,\n artifacts,\n cost,\n }\n\n let artifact: TArtifact | undefined\n let errorMessage: string | undefined\n try {\n artifact = await args.opts.dispatch(args.slot.scenario, ctx)\n } catch (err) {\n errorMessage = err instanceof Error ? 
err.message : String(err)\n }\n\n // Run judges (only if we have an artifact).\n const judgeScores: Record<string, JudgeScore> = {}\n if (artifact !== undefined) {\n for (const judge of args.opts.judges ?? []) {\n if (judge.appliesTo && !judge.appliesTo(args.slot.scenario)) continue\n try {\n const score = await runJudgeCell(judge, { artifact, scenario: args.slot.scenario })\n judgeScores[judge.name] = score\n } catch (err) {\n judgeScores[judge.name] = {\n dimensions: {},\n composite: 0,\n notes: `judge failed: ${err instanceof Error ? err.message : String(err)}`,\n }\n }\n }\n }\n\n await trace.flush()\n\n const cell: CampaignCellResult<TArtifact> = {\n cellId: args.slot.cellId,\n scenarioId: args.slot.scenario.id,\n rep: args.slot.rep,\n artifact: (artifact ?? null) as TArtifact,\n judgeScores,\n costUsd: costSoFar,\n durationMs: Date.now() - startMs,\n seed: args.slot.cellSeed,\n cached: false,\n error: errorMessage,\n }\n\n if (!errorMessage && args.resumable) {\n writeFileSync(cachePath, JSON.stringify(cell))\n }\n\n return { cell, artifactsByPath }\n}\n\nasync function runJudgeCell<TArtifact, TScenario extends Scenario>(\n _judge: JudgeConfig<TArtifact, TScenario>,\n _input: { artifact: TArtifact; scenario: TScenario },\n): Promise<JudgeScore> {\n // Phase 1 stub — wires to the existing 0.38 runJudge in Phase 2.\n // Returns a zero-score for now; consumer wiring + preset uses this.\n return { dimensions: {}, composite: 0, notes: 'phase-1-stub' }\n}\n\nfunction defaultBuildTraceWriter(cellId: string, dir: string): CampaignTraceWriter {\n const spans: Array<Record<string, unknown>> = []\n return {\n span(name, attributes) {\n const startMs = Date.now()\n const record: Record<string, unknown> = { name, cellId, startMs, ...(attributes ?? 
{}) }\n const finish: TraceSpan = {\n end(endAttrs) {\n record.durationMs = Date.now() - startMs\n if (endAttrs) Object.assign(record, endAttrs)\n spans.push(record)\n },\n setAttribute(key, value) {\n record[key] = value\n },\n }\n return finish\n },\n async flush() {\n const path = join(dir, 'spans.jsonl')\n writeFileSync(path, spans.map((s) => JSON.stringify(s)).join('\\n'))\n },\n }\n}\n\nfunction skippedCell<TScenario extends Scenario, TArtifact>(\n slot: { scenario: TScenario; rep: number; cellId: string; cellSeed: number },\n reason: string,\n): CampaignCellResult<TArtifact> {\n return {\n cellId: slot.cellId,\n scenarioId: slot.scenario.id,\n rep: slot.rep,\n artifact: null as unknown as TArtifact,\n judgeScores: {},\n costUsd: 0,\n durationMs: 0,\n seed: slot.cellSeed,\n cached: false,\n error: `skipped: ${reason}`,\n }\n}\n\ninterface CaptureArgs<TScenario extends Scenario, TArtifact> {\n store: LabeledScenarioStore\n cell: CampaignCellResult<TArtifact>\n scenario: TScenario\n opts: RunCampaignOptions<TScenario, TArtifact>\n now: () => Date\n}\n\nasync function captureToStore<TScenario extends Scenario, TArtifact>(\n args: CaptureArgs<TScenario, TArtifact>,\n): Promise<void> {\n await args.store.observe({\n scenario: args.scenario,\n artifact: args.cell.artifact,\n judgeScores: args.cell.judgeScores,\n source: args.opts.captureSource ?? 'eval-run',\n sourceVersionHash: args.opts.captureSourceVersionHash ?? 
'unknown',\n capturedAt: args.now().toISOString(),\n redactionStatus: 'raw',\n })\n}\n\n// ── Aggregates + manifest hash ────────────────────────────────────────\n\nfunction computeManifestHash(input: {\n scenarios: Scenario[]\n judges: JudgeConfig<unknown>[]\n dispatchRef: string\n seed: number\n reps: number\n}): string {\n const canonical = {\n scenarios: input.scenarios.map((s) => ({ id: s.id, kind: s.kind })),\n judges: input.judges.map((j) => ({ name: j.name, dims: j.dimensions.map((d) => d.key) })),\n dispatch: input.dispatchRef,\n seed: input.seed,\n reps: input.reps,\n }\n return createHash('sha256').update(JSON.stringify(canonical)).digest('hex')\n}\n\nfunction computeAggregates<TArtifact>(\n cells: CampaignCellResult<TArtifact>[],\n judges: JudgeConfig<TArtifact>[],\n seed: number,\n): CampaignAggregates {\n const byJudge: Record<string, JudgeAggregate> = {}\n for (const judge of judges) {\n const scores: number[] = []\n for (const cell of cells) {\n const s = cell.judgeScores[judge.name]\n if (s !== undefined) scores.push(s.composite)\n }\n byJudge[judge.name] = aggregate(scores, seed)\n }\n const byScenario: Record<string, ScenarioAggregate> = {}\n const scenarioGroups = new Map<string, number[]>()\n for (const cell of cells) {\n const composites = Object.values(cell.judgeScores).map((s) => s.composite)\n if (composites.length === 0) continue\n const mean = composites.reduce((a, b) => a + b, 0) / composites.length\n const arr = scenarioGroups.get(cell.scenarioId) ?? 
[]\n arr.push(mean)\n scenarioGroups.set(cell.scenarioId, arr)\n }\n for (const [scenarioId, samples] of scenarioGroups) {\n const ag = aggregate(samples, seed)\n byScenario[scenarioId] = { meanComposite: ag.mean, ci95: ag.ci95, n: ag.n }\n }\n return {\n byJudge,\n byScenario,\n totalCostUsd: cells.reduce((a, c) => a + c.costUsd, 0),\n cellsExecuted: cells.filter((c) => !c.error).length,\n cellsSkipped: cells.filter((c) => c.error?.startsWith('skipped:')).length,\n cellsCached: cells.filter((c) => c.cached).length,\n cellsFailed: cells.filter((c) => c.error && !c.error.startsWith('skipped:')).length,\n }\n}\n\n// Percentile bootstrap CI95 via seeded resampling. Deterministic for a given\n// seed — same campaign re-run produces identical CI bands. Falls back to\n// degenerate intervals at n<=1 (the bootstrap is undefined there).\nfunction aggregate(samples: number[], seed: number): JudgeAggregate {\n const n = samples.length\n if (n === 0) return { mean: 0, stdev: 0, ci95: [0, 0], n: 0 }\n const mean = samples.reduce((a, b) => a + b, 0) / n\n const variance = samples.reduce((a, b) => a + (b - mean) ** 2, 0) / Math.max(1, n - 1)\n const stdev = Math.sqrt(variance)\n const ci = confidenceInterval(samples, 0.95, { seed, resamples: 1000 })\n return { mean, stdev, ci95: [ci.lower, ci.upper], n 
}\n}\n"],"mappings":";;;;;AAaA,SAAS,kBAAkB;AAC3B,SAAS,YAAY,WAAW,cAAc,qBAAqB;AACnE,SAAS,YAAY;AAsDrB,eAAsB,YACpB,MAC+C;AAC/C,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,YAAY,KAAK,aAAa;AACpC,QAAM,iBAAiB,KAAK,kBAAkB;AAC9C,QAAM,MAAM,KAAK,QAAQ,MAAM,oBAAI,KAAK;AACxC,QAAM,SAAS,KAAK,UAAU,CAAC;AAE/B,MAAI,CAAC,WAAW,KAAK,MAAM,EAAG,WAAU,KAAK,QAAQ,EAAE,WAAW,KAAK,CAAC;AAExE,QAAM,eAAe,oBAAoB;AAAA,IACvC,WAAW,KAAK;AAAA,IAChB;AAAA,IACA,aAAa,KAAK,SAAS,QAAQ;AAAA,IACnC;AAAA,IACA;AAAA,EACF,CAAC;AAED,QAAM,YAAY,IAAI;AACtB,QAAM,QAAyC,CAAC;AAChD,QAAM,kBAA0C,CAAC;AAGjD,QAAM,WAA0F,CAAC;AACjG,MAAI,YAAY;AAChB,aAAW,YAAY,KAAK,WAAW;AACrC,aAAS,MAAM,GAAG,MAAM,MAAM,OAAO;AACnC,YAAM,SAAS,GAAG,SAAS,EAAE,IAAI,GAAG;AACpC,YAAM,WAAW,OAAO;AACxB,eAAS,KAAK,EAAE,UAAU,KAAK,QAAQ,SAAS,CAAC;AACjD,mBAAa;AAAA,IACf;AAAA,EACF;AAGA,MAAI,eAAe;AACnB,MAAI,qBAAqB;AACzB,QAAM,kBAAkB,IAAI,gBAAgB;AAI5C,QAAM,QAAyB,CAAC;AAChC,MAAI,UAAU;AACd,QAAM,WAAW;AAEjB,WAAS,IAAI,GAAG,IAAI,gBAAgB,KAAK;AACvC,UAAM;AAAA,OACH,YAAY;AACX,eAAO,MAAM;AACX,gBAAM,QAAQ;AACd,cAAI,SAAS,SAAS,OAAQ;AAC9B,gBAAM,OAAO,SAAS,KAAK;AAC3B,cAAI,oBAAoB;AACtB,qBAAS,KAAK,YAAY,MAAM,sBAAsB,CAAC;AACvD;AAAA,UACF;AACA,gBAAM,SAAS,MAAM,YAAY;AAAA,YAC/B;AAAA,YACA;AAAA,YACA;AAAA,YACA;AAAA,YACA;AAAA,YACA,kBAAkB,KAAK,oBAAoB;AAAA,YAC3C,QAAQ,gBAAgB;AAAA,UAC1B,CAAC;AACD,mBAAS,KAAK,OAAO,IAAI;AACzB,0BAAgB,OAAO,KAAK;AAC5B,iBAAO,OAAO,iBAAiB,OAAO,eAAe;AACrD,cAAI,KAAK,gBAAgB,UAAa,gBAAgB,KAAK,aAAa;AACtE,iCAAqB;AAAA,UACvB;AAEA,cAAI,KAAK,gBAAgB,KAAK,iBAAiB,SAAS,CAAC,OAAO,KAAK,OAAO;AAC1E,kBAAM,eAAe;AAAA,cACnB,OAAO,KAAK;AAAA,cACZ,MAAM,OAAO;AAAA,cACb,UAAU,KAAK;AAAA,cACf;AAAA,cACA;AAAA,YACF,CAAC,EAAE,MAAM,CAAC,QAAQ;AAGhB,sBAAQ;AAAA,gBACN,oCAAoC,OAAO,KAAK,MAAM,KAAK,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,cAC7G;AAAA,YACF,CAAC;AAAA,UACH;AAAA,QACF;AAAA,MACF,GAAG;AAAA,IACL;AAAA,EACF;AACA,QAAM,QAAQ,IAAI,KAAK;AAEvB,QAAM,UAAU,IAAI;AACpB,WAAS,KAAK,CAAC,GAAG,MAAM,EAAE,OAAO,cAAc,EAAE,MAAM,CAAC;AAExD,QAAM,aAAa;AAAA,IACjB;AAAA,IACA;AAAA,IACA;AAAA,EACF;AAEA,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,WAAW,UAAU,
YAAY;AAAA,IACjC,SAAS,QAAQ,YAAY;AAAA,IAC7B,YAAY,QAAQ,QAAQ,IAAI,UAAU,QAAQ;AAAA,IAClD,OAAO;AAAA,IACP;AAAA,IACA,QAAQ,KAAK;AAAA,IACb;AAAA,IACA,WAAW,KAAK,UAAU,IAAI,CAAC,OAAO,EAAE,IAAI,EAAE,IAAI,MAAM,EAAE,KAAK,EAAE;AAAA,EACnE;AACF;AAcA,eAAe,YACb,MAC2F;AAC3F,QAAM,UAAU,KAAK,KAAK,KAAK,QAAQ,KAAK,KAAK,OAAO,QAAQ,mBAAmB,GAAG,CAAC;AACvF,MAAI,CAAC,WAAW,OAAO,EAAG,WAAU,SAAS,EAAE,WAAW,KAAK,CAAC;AAGhE,QAAM,YAAY,KAAK,SAAS,oBAAoB;AACpD,MAAI,KAAK,aAAa,WAAW,SAAS,GAAG;AAC3C,QAAI;AACF,YAAM,SAAS,KAAK,MAAM,aAAa,WAAW,MAAM,CAAC;AACzD,UAAI,OAAO,WAAW,KAAK,KAAK,QAAQ;AACtC,eAAO,EAAE,MAAM,EAAE,GAAG,QAAQ,QAAQ,KAAK,GAAG,iBAAiB,CAAC,EAAE;AAAA,MAClE;AAAA,IACF,QAAQ;AAAA,IAER;AAAA,EACF;AAEA,QAAM,UAAU,KAAK,IAAI;AACzB,QAAM,QAAQ,KAAK,iBAAiB,KAAK,KAAK,QAAQ,OAAO;AAC7D,QAAM,kBAA0C,CAAC;AACjD,QAAM,YAAoC;AAAA,IACxC,MAAM,MAAM,MAAM,SAAS;AACzB,YAAM,WAAW,KAAK,SAAS,IAAI;AACnC,YAAM,MAAM,KAAK,UAAU,IAAI;AAC/B,UAAI,CAAC,WAAW,GAAG,EAAG,WAAU,KAAK,EAAE,WAAW,KAAK,CAAC;AACxD,oBAAc,UAAU,OAAqB;AAC7C,sBAAgB,GAAG,KAAK,KAAK,MAAM,IAAI,IAAI,EAAE,IAAI;AACjD,aAAO;AAAA,IACT;AAAA,IACA,MAAM,UAAU,MAAM,OAAO;AAC3B,aAAO,UAAU,MAAM,MAAM,KAAK,UAAU,OAAO,MAAM,CAAC,CAAC;AAAA,IAC7D;AAAA,EACF;AACA,MAAI,YAAY;AAChB,QAAM,OAA0B;AAAA,IAC9B,QAAQ,QAAQ,QAAQ;AACtB,mBAAa;AACb,YAAM,KAAK,QAAQ,MAAM,IAAI,EAAE,WAAW,OAAO,CAAC,EAAE,IAAI;AAAA,IAC1D;AAAA,IACA,UAAU;AACR,aAAO;AAAA,IACT;AAAA,EACF;AAEA,QAAM,MAAuB;AAAA,IAC3B,QAAQ,KAAK,KAAK;AAAA,IAClB,KAAK,KAAK,KAAK;AAAA,IACf,MAAM,KAAK,KAAK;AAAA,IAChB,QAAQ,KAAK;AAAA,IACb;AAAA,IACA;AAAA,IACA;AAAA,EACF;AAEA,MAAI;AACJ,MAAI;AACJ,MAAI;AACF,eAAW,MAAM,KAAK,KAAK,SAAS,KAAK,KAAK,UAAU,GAAG;AAAA,EAC7D,SAAS,KAAK;AACZ,mBAAe,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAAA,EAChE;AAGA,QAAM,cAA0C,CAAC;AACjD,MAAI,aAAa,QAAW;AAC1B,eAAW,SAAS,KAAK,KAAK,UAAU,CAAC,GAAG;AAC1C,UAAI,MAAM,aAAa,CAAC,MAAM,UAAU,KAAK,KAAK,QAAQ,EAAG;AAC7D,UAAI;AACF,cAAM,QAAQ,MAAM,aAAa,OAAO,EAAE,UAAU,UAAU,KAAK,KAAK,SAAS,CAAC;AAClF,oBAAY,MAAM,IAAI,IAAI;AAAA,MAC5B,SAAS,KAAK;AACZ,oBAAY,MAAM,IAAI,IAAI;AAAA,UACxB,YAAY,CAAC;AAAA,UACb,WAAW;AAAA,UACX,OAAO,iBAAiB,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;
AAAA,QAC1E;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,QAAM,MAAM,MAAM;AAElB,QAAM,OAAsC;AAAA,IAC1C,QAAQ,KAAK,KAAK;AAAA,IAClB,YAAY,KAAK,KAAK,SAAS;AAAA,IAC/B,KAAK,KAAK,KAAK;AAAA,IACf,UAAW,YAAY;AAAA,IACvB;AAAA,IACA,SAAS;AAAA,IACT,YAAY,KAAK,IAAI,IAAI;AAAA,IACzB,MAAM,KAAK,KAAK;AAAA,IAChB,QAAQ;AAAA,IACR,OAAO;AAAA,EACT;AAEA,MAAI,CAAC,gBAAgB,KAAK,WAAW;AACnC,kBAAc,WAAW,KAAK,UAAU,IAAI,CAAC;AAAA,EAC/C;AAEA,SAAO,EAAE,MAAM,gBAAgB;AACjC;AAEA,eAAe,aACb,QACA,QACqB;AAGrB,SAAO,EAAE,YAAY,CAAC,GAAG,WAAW,GAAG,OAAO,eAAe;AAC/D;AAEA,SAAS,wBAAwB,QAAgB,KAAkC;AACjF,QAAM,QAAwC,CAAC;AAC/C,SAAO;AAAA,IACL,KAAK,MAAM,YAAY;AACrB,YAAM,UAAU,KAAK,IAAI;AACzB,YAAM,SAAkC,EAAE,MAAM,QAAQ,SAAS,GAAI,cAAc,CAAC,EAAG;AACvF,YAAM,SAAoB;AAAA,QACxB,IAAI,UAAU;AACZ,iBAAO,aAAa,KAAK,IAAI,IAAI;AACjC,cAAI,SAAU,QAAO,OAAO,QAAQ,QAAQ;AAC5C,gBAAM,KAAK,MAAM;AAAA,QACnB;AAAA,QACA,aAAa,KAAK,OAAO;AACvB,iBAAO,GAAG,IAAI;AAAA,QAChB;AAAA,MACF;AACA,aAAO;AAAA,IACT;AAAA,IACA,MAAM,QAAQ;AACZ,YAAM,OAAO,KAAK,KAAK,aAAa;AACpC,oBAAc,MAAM,MAAM,IAAI,CAAC,MAAM,KAAK,UAAU,CAAC,CAAC,EAAE,KAAK,IAAI,CAAC;AAAA,IACpE;AAAA,EACF;AACF;AAEA,SAAS,YACP,MACA,QAC+B;AAC/B,SAAO;AAAA,IACL,QAAQ,KAAK;AAAA,IACb,YAAY,KAAK,SAAS;AAAA,IAC1B,KAAK,KAAK;AAAA,IACV,UAAU;AAAA,IACV,aAAa,CAAC;AAAA,IACd,SAAS;AAAA,IACT,YAAY;AAAA,IACZ,MAAM,KAAK;AAAA,IACX,QAAQ;AAAA,IACR,OAAO,YAAY,MAAM;AAAA,EAC3B;AACF;AAUA,eAAe,eACb,MACe;AACf,QAAM,KAAK,MAAM,QAAQ;AAAA,IACvB,UAAU,KAAK;AAAA,IACf,UAAU,KAAK,KAAK;AAAA,IACpB,aAAa,KAAK,KAAK;AAAA,IACvB,QAAQ,KAAK,KAAK,iBAAiB;AAAA,IACnC,mBAAmB,KAAK,KAAK,4BAA4B;AAAA,IACzD,YAAY,KAAK,IAAI,EAAE,YAAY;AAAA,IACnC,iBAAiB;AAAA,EACnB,CAAC;AACH;AAIA,SAAS,oBAAoB,OAMlB;AACT,QAAM,YAAY;AAAA,IAChB,WAAW,MAAM,UAAU,IAAI,CAAC,OAAO,EAAE,IAAI,EAAE,IAAI,MAAM,EAAE,KAAK,EAAE;AAAA,IAClE,QAAQ,MAAM,OAAO,IAAI,CAAC,OAAO,EAAE,MAAM,EAAE,MAAM,MAAM,EAAE,WAAW,IAAI,CAAC,MAAM,EAAE,GAAG,EAAE,EAAE;AAAA,IACxF,UAAU,MAAM;AAAA,IAChB,MAAM,MAAM;AAAA,IACZ,MAAM,MAAM;AAAA,EACd;AACA,SAAO,WAAW,QAAQ,EAAE,OAAO,KAAK,UAAU,SAAS,CAAC,EAAE,OAAO,KAAK;AAC5E;AAEA,SAAS,kBACP,OACA,QACA,MACoB;AACpB,QAAM,UAA0C,CAAC;AACjD,aAAW,SAAS,QAAQ
;AAC1B,UAAM,SAAmB,CAAC;AAC1B,eAAW,QAAQ,OAAO;AACxB,YAAM,IAAI,KAAK,YAAY,MAAM,IAAI;AACrC,UAAI,MAAM,OAAW,QAAO,KAAK,EAAE,SAAS;AAAA,IAC9C;AACA,YAAQ,MAAM,IAAI,IAAI,UAAU,QAAQ,IAAI;AAAA,EAC9C;AACA,QAAM,aAAgD,CAAC;AACvD,QAAM,iBAAiB,oBAAI,IAAsB;AACjD,aAAW,QAAQ,OAAO;AACxB,UAAM,aAAa,OAAO,OAAO,KAAK,WAAW,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS;AACzE,QAAI,WAAW,WAAW,EAAG;AAC7B,UAAM,OAAO,WAAW,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,WAAW;AAChE,UAAM,MAAM,eAAe,IAAI,KAAK,UAAU,KAAK,CAAC;AACpD,QAAI,KAAK,IAAI;AACb,mBAAe,IAAI,KAAK,YAAY,GAAG;AAAA,EACzC;AACA,aAAW,CAAC,YAAY,OAAO,KAAK,gBAAgB;AAClD,UAAM,KAAK,UAAU,SAAS,IAAI;AAClC,eAAW,UAAU,IAAI,EAAE,eAAe,GAAG,MAAM,MAAM,GAAG,MAAM,GAAG,GAAG,EAAE;AAAA,EAC5E;AACA,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,cAAc,MAAM,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,SAAS,CAAC;AAAA,IACrD,eAAe,MAAM,OAAO,CAAC,MAAM,CAAC,EAAE,KAAK,EAAE;AAAA,IAC7C,cAAc,MAAM,OAAO,CAAC,MAAM,EAAE,OAAO,WAAW,UAAU,CAAC,EAAE;AAAA,IACnE,aAAa,MAAM,OAAO,CAAC,MAAM,EAAE,MAAM,EAAE;AAAA,IAC3C,aAAa,MAAM,OAAO,CAAC,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,WAAW,UAAU,CAAC,EAAE;AAAA,EAC/E;AACF;AAKA,SAAS,UAAU,SAAmB,MAA8B;AAClE,QAAM,IAAI,QAAQ;AAClB,MAAI,MAAM,EAAG,QAAO,EAAE,MAAM,GAAG,OAAO,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,GAAG,EAAE;AAC5D,QAAM,OAAO,QAAQ,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI;AAClD,QAAM,WAAW,QAAQ,OAAO,CAAC,GAAG,MAAM,KAAK,IAAI,SAAS,GAAG,CAAC,IAAI,KAAK,IAAI,GAAG,IAAI,CAAC;AACrF,QAAM,QAAQ,KAAK,KAAK,QAAQ;AAChC,QAAM,KAAK,mBAAmB,SAAS,MAAM,EAAE,MAAM,WAAW,IAAK,CAAC;AACtE,SAAO,EAAE,MAAM,OAAO,MAAM,CAAC,GAAG,OAAO,GAAG,KAAK,GAAG,EAAE;AACtD;","names":[]}
|