@tangle-network/agent-eval 0.37.0 → 0.38.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,274 +1,9 @@
1
+ import {
2
+ buildByAxis,
3
+ runAgentMatrix,
4
+ summariseRows
5
+ } from "../chunk-QWV226SL.js";
1
6
  import "../chunk-PZ5AY32C.js";
2
-
3
- // src/matrix/aggregation.ts
4
- function flattenRuns(cells) {
5
- const rows = [];
6
- for (const { cell, runs } of cells) {
7
- for (const result of runs) rows.push({ cell, result });
8
- }
9
- return rows;
10
- }
11
- function quantile(sorted, q) {
12
- if (sorted.length === 0) return 0;
13
- if (sorted.length === 1) return sorted[0];
14
- const pos = (sorted.length - 1) * q;
15
- const lo = Math.floor(pos);
16
- const hi = Math.ceil(pos);
17
- if (lo === hi) return sorted[lo];
18
- const frac = pos - lo;
19
- return sorted[lo] * (1 - frac) + sorted[hi] * frac;
20
- }
21
- function summariseRows(rows, axisName, axisValue) {
22
- if (rows.length === 0) {
23
- return {
24
- axisName,
25
- axisValue,
26
- cells: 0,
27
- passRate: 0,
28
- meanScore: 0,
29
- p50Score: 0,
30
- p90Score: 0,
31
- totalCostUsd: 0,
32
- meanDurationMs: 0
33
- };
34
- }
35
- let pass = 0;
36
- let scoreSum = 0;
37
- let costSum = 0;
38
- let durSum = 0;
39
- const scores = [];
40
- for (const { result } of rows) {
41
- const errored = result.error !== void 0;
42
- const score = errored ? 0 : result.verdict.score;
43
- const valid = !errored && result.verdict.valid;
44
- if (valid) pass++;
45
- scoreSum += score;
46
- scores.push(score);
47
- costSum += result.costUsd;
48
- durSum += result.durationMs;
49
- }
50
- scores.sort((a, b) => a - b);
51
- return {
52
- axisName,
53
- axisValue,
54
- cells: rows.length,
55
- passRate: pass / rows.length,
56
- meanScore: scoreSum / rows.length,
57
- p50Score: quantile(scores, 0.5),
58
- p90Score: quantile(scores, 0.9),
59
- totalCostUsd: costSum,
60
- meanDurationMs: durSum / rows.length
61
- };
62
- }
63
- function bucketBy(rows, axisName, labelFor) {
64
- const buckets = /* @__PURE__ */ new Map();
65
- for (const row of rows) {
66
- const slot = row.cell.axes[axisName];
67
- if (!slot) continue;
68
- const id = slot.id;
69
- let arr = buckets.get(id);
70
- if (!arr) {
71
- arr = [];
72
- buckets.set(id, arr);
73
- }
74
- arr.push(row);
75
- }
76
- const out = {};
77
- for (const id of [...buckets.keys()].sort()) {
78
- out[id] = summariseRows(buckets.get(id), axisName, labelFor(id));
79
- }
80
- return out;
81
- }
82
- function buildByAxis(cells, axes, aggregateBy) {
83
- const rows = flattenRuns(cells);
84
- const byName = new Map(axes.map((a) => [a.name, a]));
85
- const byAxis = {};
86
- for (const name of aggregateBy) {
87
- const axis = byName.get(name);
88
- const labelFor = (id) => {
89
- if (!axis?.label) return id;
90
- const found = axis.values.find((v) => v.id === id);
91
- if (!found) return id;
92
- return axis.label(found.value, id);
93
- };
94
- byAxis[name] = bucketBy(rows, name, labelFor);
95
- }
96
- return byAxis;
97
- }
98
-
99
- // src/matrix/runner.ts
100
- function cartesian(axes) {
101
- if (axes.length === 0) return [{ axes: {} }];
102
- for (const a of axes) if (a.values.length === 0) return [];
103
- const out = [];
104
- const idx = new Array(axes.length).fill(0);
105
- while (true) {
106
- const slot = {};
107
- for (let i2 = 0; i2 < axes.length; i2++) {
108
- const axis = axes[i2];
109
- const v = axis.values[idx[i2]];
110
- slot[axis.name] = { id: v.id, value: v.value };
111
- }
112
- out.push({ axes: slot });
113
- let i = 0;
114
- while (i < axes.length) {
115
- const next = idx[i] + 1;
116
- const axis = axes[i];
117
- if (next < axis.values.length) {
118
- idx[i] = next;
119
- break;
120
- }
121
- idx[i] = 0;
122
- i++;
123
- }
124
- if (i === axes.length) break;
125
- }
126
- return out;
127
- }
128
- function makeMatrixId() {
129
- const t = Date.now().toString(36);
130
- let r = "";
131
- for (let i = 0; i < 8; i++) r += Math.floor(Math.random() * 16).toString(16);
132
- return `mtx_${t}_${r}`;
133
- }
134
- function makeErrorResult(err) {
135
- const e = err;
136
- return {
137
- output: void 0,
138
- verdict: { valid: false, score: 0 },
139
- costUsd: 0,
140
- durationMs: 0,
141
- error: {
142
- message: typeof e?.message === "string" ? e.message : String(err),
143
- kind: typeof e?.name === "string" ? e.name : "Error"
144
- }
145
- };
146
- }
147
- async function runAgentMatrix(opts) {
148
- const startedAt = Date.now();
149
- const reps = Math.max(1, opts.reps ?? 1);
150
- const maxConcurrency = Math.max(1, opts.maxConcurrency ?? 4);
151
- const costCeiling = opts.costCeiling ?? Number.POSITIVE_INFINITY;
152
- const aggregateBy = opts.aggregateBy ?? opts.axes.map((a) => a.name);
153
- const base = cartesian(opts.axes);
154
- const filtered = opts.filter ? base.filter((c) => opts.filter(c)) : base;
155
- const filteredOut = base.length - filtered.length;
156
- const planned = [];
157
- for (let i = 0; i < filtered.length; i++) {
158
- for (let r = 0; r < reps; r++) {
159
- planned.push({
160
- axes: filtered[i].axes,
161
- rep: r,
162
- ordinal: i * reps + r
163
- });
164
- }
165
- }
166
- const cellRecords = [];
167
- let cumulativeCost = 0;
168
- let costCeilingReached = false;
169
- let runsExecuted = 0;
170
- let cellsUnscheduled = 0;
171
- const aborted = () => opts.signal?.aborted === true;
172
- let inFlight = 0;
173
- let cursor = 0;
174
- let resolveAll;
175
- const done = new Promise((res) => {
176
- resolveAll = res;
177
- });
178
- const pump = () => {
179
- while (inFlight < maxConcurrency && cursor < planned.length) {
180
- if (aborted() || costCeilingReached) {
181
- const left = planned.length - cursor;
182
- cellsUnscheduled += left;
183
- cursor = planned.length;
184
- break;
185
- }
186
- const cell = planned[cursor++];
187
- inFlight++;
188
- const record = { cell, runs: [] };
189
- cellRecords.push(record);
190
- const promise = (async () => {
191
- try {
192
- return await opts.runCell(cell);
193
- } catch (err) {
194
- return makeErrorResult(err);
195
- }
196
- })();
197
- promise.then((result) => {
198
- record.runs.push(result);
199
- runsExecuted++;
200
- cumulativeCost += result.costUsd;
201
- if (cumulativeCost >= costCeiling && !costCeilingReached) {
202
- costCeilingReached = true;
203
- console.warn("[matrix] cost ceiling reached");
204
- }
205
- try {
206
- opts.onCellComplete?.(cell, result);
207
- } catch {
208
- }
209
- inFlight--;
210
- if (cursor < planned.length) {
211
- pump();
212
- } else if (inFlight === 0) {
213
- resolveAll?.();
214
- }
215
- });
216
- }
217
- if (cursor >= planned.length && inFlight === 0) resolveAll?.();
218
- };
219
- const onAbort = () => {
220
- if (cursor < planned.length) {
221
- cellsUnscheduled += planned.length - cursor;
222
- cursor = planned.length;
223
- }
224
- if (inFlight === 0) resolveAll?.();
225
- };
226
- if (opts.signal) {
227
- if (opts.signal.aborted) {
228
- cellsUnscheduled = planned.length;
229
- cursor = planned.length;
230
- resolveAll?.();
231
- } else {
232
- opts.signal.addEventListener("abort", onAbort, { once: true });
233
- }
234
- }
235
- if (planned.length === 0) {
236
- resolveAll?.();
237
- } else {
238
- pump();
239
- }
240
- await done;
241
- if (opts.signal) opts.signal.removeEventListener("abort", onAbort);
242
- cellRecords.sort((a, b) => a.cell.ordinal - b.cell.ordinal);
243
- let pass = 0;
244
- let scoreSum = 0;
245
- let totalCost = 0;
246
- let runCount = 0;
247
- for (const { runs } of cellRecords) {
248
- for (const r of runs) {
249
- runCount++;
250
- const errored = r.error !== void 0;
251
- if (!errored && r.verdict.valid) pass++;
252
- scoreSum += errored ? 0 : r.verdict.score;
253
- totalCost += r.costUsd;
254
- }
255
- }
256
- const byAxis = buildByAxis(cellRecords, opts.axes, aggregateBy);
257
- return {
258
- cells: cellRecords,
259
- byAxis,
260
- summary: {
261
- totalCells: planned.length,
262
- runsExecuted,
263
- cellsSkipped: cellsUnscheduled + filteredOut * reps,
264
- overallPassRate: runCount === 0 ? 0 : pass / runCount,
265
- overallMeanScore: runCount === 0 ? 0 : scoreSum / runCount,
266
- totalCostUsd: totalCost,
267
- durationMs: Date.now() - startedAt
268
- },
269
- matrixId: makeMatrixId()
270
- };
271
- }
272
7
  export {
273
8
  buildByAxis,
274
9
  runAgentMatrix,
@@ -1 +1 @@
1
- {"version":3,"sources":["../../src/matrix/aggregation.ts","../../src/matrix/runner.ts"],"sourcesContent":["/**\n * Per-axis aggregation of cell runs into `AxisSummary` rows.\n *\n * Pure: consumes the final `cells: [{cell, runs}]` array and returns the\n * `byAxis` table. Error runs contribute 0 to passRate and meanScore. Cost\n * and duration always count — the budget was spent regardless.\n */\n\nimport type { AxisSummary, CellResult, MatrixAxis, MatrixCell, MatrixResult } from './types'\n\ninterface Row<Output> {\n cell: MatrixCell\n result: CellResult<Output>\n}\n\nfunction flattenRuns<Output>(cells: MatrixResult<Output>['cells']): Row<Output>[] {\n const rows: Row<Output>[] = []\n for (const { cell, runs } of cells) {\n for (const result of runs) rows.push({ cell, result })\n }\n return rows\n}\n\nfunction quantile(sorted: number[], q: number): number {\n if (sorted.length === 0) return 0\n if (sorted.length === 1) return sorted[0] as number\n const pos = (sorted.length - 1) * q\n const lo = Math.floor(pos)\n const hi = Math.ceil(pos)\n if (lo === hi) return sorted[lo] as number\n const frac = pos - lo\n return (sorted[lo] as number) * (1 - frac) + (sorted[hi] as number) * frac\n}\n\nexport function summariseRows<Output>(\n rows: Row<Output>[],\n axisName: string,\n axisValue: string,\n): AxisSummary {\n if (rows.length === 0) {\n return {\n axisName,\n axisValue,\n cells: 0,\n passRate: 0,\n meanScore: 0,\n p50Score: 0,\n p90Score: 0,\n totalCostUsd: 0,\n meanDurationMs: 0,\n }\n }\n let pass = 0\n let scoreSum = 0\n let costSum = 0\n let durSum = 0\n const scores: number[] = []\n for (const { result } of rows) {\n const errored = result.error !== undefined\n const score = errored ? 
0 : result.verdict.score\n const valid = !errored && result.verdict.valid\n if (valid) pass++\n scoreSum += score\n scores.push(score)\n costSum += result.costUsd\n durSum += result.durationMs\n }\n scores.sort((a, b) => a - b)\n return {\n axisName,\n axisValue,\n cells: rows.length,\n passRate: pass / rows.length,\n meanScore: scoreSum / rows.length,\n p50Score: quantile(scores, 0.5),\n p90Score: quantile(scores, 0.9),\n totalCostUsd: costSum,\n meanDurationMs: durSum / rows.length,\n }\n}\n\nfunction bucketBy<Output>(\n rows: Row<Output>[],\n axisName: string,\n labelFor: (id: string) => string,\n): Record<string, AxisSummary> {\n const buckets = new Map<string, Row<Output>[]>()\n for (const row of rows) {\n const slot = row.cell.axes[axisName]\n if (!slot) continue\n const id = slot.id\n let arr = buckets.get(id)\n if (!arr) {\n arr = []\n buckets.set(id, arr)\n }\n arr.push(row)\n }\n const out: Record<string, AxisSummary> = {}\n // Sorted keys for deterministic JSON serialisation.\n for (const id of [...buckets.keys()].sort()) {\n out[id] = summariseRows(buckets.get(id) as Row<Output>[], axisName, labelFor(id))\n }\n return out\n}\n\nexport function buildByAxis<Output>(\n cells: MatrixResult<Output>['cells'],\n axes: MatrixAxis<unknown>[],\n aggregateBy: string[],\n): Record<string, Record<string, AxisSummary>> {\n const rows = flattenRuns(cells)\n const byName = new Map(axes.map((a) => [a.name, a]))\n const byAxis: Record<string, Record<string, AxisSummary>> = {}\n for (const name of aggregateBy) {\n const axis = byName.get(name)\n const labelFor = (id: string): string => {\n if (!axis?.label) return id\n const found = axis.values.find((v) => v.id === id)\n if (!found) return id\n return axis.label(found.value, id)\n }\n byAxis[name] = bucketBy(rows, name, labelFor)\n }\n return byAxis\n}\n","/**\n * N-axis cartesian runner.\n *\n * Expansion order: cartesian over `axes` in declared order, then `reps` as the\n * inner-most dim → `ordinal = (cartIdx * reps) + 
rep`. The returned\n * `cells[]` is sorted by `ordinal` so concurrent execution does not reorder\n * the output.\n *\n * Scheduling is a sliding window of in-flight promises capped at\n * `maxConcurrency`. The window stops admitting new cells when the cost\n * ceiling trips or the abort signal fires; in-flight cells finish.\n */\n\nimport { buildByAxis } from './aggregation'\nimport type {\n CellResult,\n MatrixAxis,\n MatrixCell,\n MatrixResult,\n RunAgentMatrixOptions,\n} from './types'\n\ninterface BaseCell {\n axes: Record<string, { id: string; value: unknown }>\n}\n\nfunction cartesian(axes: MatrixAxis<unknown>[]): BaseCell[] {\n // Empty axes (`values=[]`) collapse the whole product to zero cells. An\n // empty `axes` array yields a single empty-axes cell — degenerate but\n // valid (caller is iterating only reps).\n if (axes.length === 0) return [{ axes: {} }]\n for (const a of axes) if (a.values.length === 0) return []\n const out: BaseCell[] = []\n const idx = new Array(axes.length).fill(0)\n while (true) {\n const slot: Record<string, { id: string; value: unknown }> = {}\n for (let i = 0; i < axes.length; i++) {\n const axis = axes[i] as MatrixAxis<unknown>\n const v = axis.values[idx[i] as number] as { id: string; value: unknown }\n slot[axis.name] = { id: v.id, value: v.value }\n }\n out.push({ axes: slot })\n // Increment like an odometer, left-most axis is fastest.\n let i = 0\n while (i < axes.length) {\n const next = (idx[i] as number) + 1\n const axis = axes[i] as MatrixAxis<unknown>\n if (next < axis.values.length) {\n idx[i] = next\n break\n }\n idx[i] = 0\n i++\n }\n if (i === axes.length) break\n }\n return out\n}\n\nfunction makeMatrixId(): string {\n // Stable id-like string: time + 8 random hex chars. 
Avoids node:crypto\n // import to keep the matrix dep-free.\n const t = Date.now().toString(36)\n let r = ''\n for (let i = 0; i < 8; i++) r += Math.floor(Math.random() * 16).toString(16)\n return `mtx_${t}_${r}`\n}\n\nfunction makeErrorResult<Output>(err: unknown): CellResult<Output> {\n const e = err as { message?: string; name?: string }\n return {\n output: undefined as unknown as Output,\n verdict: { valid: false, score: 0 },\n costUsd: 0,\n durationMs: 0,\n error: {\n message: typeof e?.message === 'string' ? e.message : String(err),\n kind: typeof e?.name === 'string' ? e.name : 'Error',\n },\n }\n}\n\nexport async function runAgentMatrix<Output>(\n opts: RunAgentMatrixOptions<Output>,\n): Promise<MatrixResult<Output>> {\n const startedAt = Date.now()\n const reps = Math.max(1, opts.reps ?? 1)\n const maxConcurrency = Math.max(1, opts.maxConcurrency ?? 4)\n const costCeiling = opts.costCeiling ?? Number.POSITIVE_INFINITY\n const aggregateBy = opts.aggregateBy ?? opts.axes.map((a) => a.name)\n\n const base = cartesian(opts.axes)\n const filtered = opts.filter\n ? base.filter((c) => (opts.filter as (b: BaseCell) => boolean)(c))\n : base\n const filteredOut = base.length - filtered.length\n\n const planned: MatrixCell[] = []\n for (let i = 0; i < filtered.length; i++) {\n for (let r = 0; r < reps; r++) {\n planned.push({\n axes: (filtered[i] as BaseCell).axes,\n rep: r,\n ordinal: i * reps + r,\n })\n }\n }\n\n const cellRecords: Array<{ cell: MatrixCell; runs: CellResult<Output>[] }> = []\n let cumulativeCost = 0\n let costCeilingReached = false\n let runsExecuted = 0\n let cellsUnscheduled = 0\n\n const aborted = (): boolean => opts.signal?.aborted === true\n\n // Per-run abort controller forwards the external signal so cell executors\n // see cancellation. We don't expose it on `MatrixCell` — the signature on\n // `runCell` per the public API is `(cell) => Promise<...>`. 
Executors that\n // need cancellation use the external signal directly via closure.\n\n let inFlight = 0\n let cursor = 0\n let resolveAll: (() => void) | undefined\n const done = new Promise<void>((res) => {\n resolveAll = res\n })\n\n const pump = (): void => {\n while (inFlight < maxConcurrency && cursor < planned.length) {\n if (aborted() || costCeilingReached) {\n // Drain remaining as unscheduled.\n const left = planned.length - cursor\n cellsUnscheduled += left\n cursor = planned.length\n break\n }\n const cell = planned[cursor++] as MatrixCell\n inFlight++\n // Lazily allocate the record so cells appear in `cells[]` in any\n // arrival order; we sort by ordinal at the end.\n const record = { cell, runs: [] as CellResult<Output>[] }\n cellRecords.push(record)\n const promise: Promise<CellResult<Output>> = (async () => {\n try {\n return await opts.runCell(cell)\n } catch (err) {\n return makeErrorResult<Output>(err)\n }\n })()\n promise.then((result) => {\n record.runs.push(result)\n runsExecuted++\n cumulativeCost += result.costUsd\n if (cumulativeCost >= costCeiling && !costCeilingReached) {\n costCeilingReached = true\n // eslint-disable-next-line no-console\n console.warn('[matrix] cost ceiling reached')\n }\n try {\n opts.onCellComplete?.(cell, result)\n } catch {\n // onCellComplete is observational — swallow throws so a noisy\n // callback can't tank the run.\n }\n inFlight--\n if (cursor < planned.length) {\n pump()\n } else if (inFlight === 0) {\n resolveAll?.()\n }\n })\n }\n if (cursor >= planned.length && inFlight === 0) resolveAll?.()\n }\n\n const onAbort = (): void => {\n // External abort: stop scheduling. 
In-flight cells finish; their\n // executors observe `opts.signal.aborted` directly via closure.\n if (cursor < planned.length) {\n cellsUnscheduled += planned.length - cursor\n cursor = planned.length\n }\n if (inFlight === 0) resolveAll?.()\n }\n if (opts.signal) {\n if (opts.signal.aborted) {\n cellsUnscheduled = planned.length\n cursor = planned.length\n resolveAll?.()\n } else {\n opts.signal.addEventListener('abort', onAbort, { once: true })\n }\n }\n\n if (planned.length === 0) {\n resolveAll?.()\n } else {\n pump()\n }\n\n await done\n if (opts.signal) opts.signal.removeEventListener('abort', onAbort)\n\n cellRecords.sort((a, b) => a.cell.ordinal - b.cell.ordinal)\n\n let pass = 0\n let scoreSum = 0\n let totalCost = 0\n let runCount = 0\n for (const { runs } of cellRecords) {\n for (const r of runs) {\n runCount++\n const errored = r.error !== undefined\n if (!errored && r.verdict.valid) pass++\n scoreSum += errored ? 0 : r.verdict.score\n totalCost += r.costUsd\n }\n }\n\n const byAxis = buildByAxis(cellRecords, opts.axes, aggregateBy)\n\n return {\n cells: cellRecords,\n byAxis,\n summary: {\n totalCells: planned.length,\n runsExecuted,\n cellsSkipped: cellsUnscheduled + filteredOut * reps,\n overallPassRate: runCount === 0 ? 0 : pass / runCount,\n overallMeanScore: runCount === 0 ? 
0 : scoreSum / runCount,\n totalCostUsd: totalCost,\n durationMs: Date.now() - startedAt,\n },\n matrixId: makeMatrixId(),\n }\n}\n"],"mappings":";;;AAeA,SAAS,YAAoB,OAAqD;AAChF,QAAM,OAAsB,CAAC;AAC7B,aAAW,EAAE,MAAM,KAAK,KAAK,OAAO;AAClC,eAAW,UAAU,KAAM,MAAK,KAAK,EAAE,MAAM,OAAO,CAAC;AAAA,EACvD;AACA,SAAO;AACT;AAEA,SAAS,SAAS,QAAkB,GAAmB;AACrD,MAAI,OAAO,WAAW,EAAG,QAAO;AAChC,MAAI,OAAO,WAAW,EAAG,QAAO,OAAO,CAAC;AACxC,QAAM,OAAO,OAAO,SAAS,KAAK;AAClC,QAAM,KAAK,KAAK,MAAM,GAAG;AACzB,QAAM,KAAK,KAAK,KAAK,GAAG;AACxB,MAAI,OAAO,GAAI,QAAO,OAAO,EAAE;AAC/B,QAAM,OAAO,MAAM;AACnB,SAAQ,OAAO,EAAE,KAAgB,IAAI,QAAS,OAAO,EAAE,IAAe;AACxE;AAEO,SAAS,cACd,MACA,UACA,WACa;AACb,MAAI,KAAK,WAAW,GAAG;AACrB,WAAO;AAAA,MACL;AAAA,MACA;AAAA,MACA,OAAO;AAAA,MACP,UAAU;AAAA,MACV,WAAW;AAAA,MACX,UAAU;AAAA,MACV,UAAU;AAAA,MACV,cAAc;AAAA,MACd,gBAAgB;AAAA,IAClB;AAAA,EACF;AACA,MAAI,OAAO;AACX,MAAI,WAAW;AACf,MAAI,UAAU;AACd,MAAI,SAAS;AACb,QAAM,SAAmB,CAAC;AAC1B,aAAW,EAAE,OAAO,KAAK,MAAM;AAC7B,UAAM,UAAU,OAAO,UAAU;AACjC,UAAM,QAAQ,UAAU,IAAI,OAAO,QAAQ;AAC3C,UAAM,QAAQ,CAAC,WAAW,OAAO,QAAQ;AACzC,QAAI,MAAO;AACX,gBAAY;AACZ,WAAO,KAAK,KAAK;AACjB,eAAW,OAAO;AAClB,cAAU,OAAO;AAAA,EACnB;AACA,SAAO,KAAK,CAAC,GAAG,MAAM,IAAI,CAAC;AAC3B,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,OAAO,KAAK;AAAA,IACZ,UAAU,OAAO,KAAK;AAAA,IACtB,WAAW,WAAW,KAAK;AAAA,IAC3B,UAAU,SAAS,QAAQ,GAAG;AAAA,IAC9B,UAAU,SAAS,QAAQ,GAAG;AAAA,IAC9B,cAAc;AAAA,IACd,gBAAgB,SAAS,KAAK;AAAA,EAChC;AACF;AAEA,SAAS,SACP,MACA,UACA,UAC6B;AAC7B,QAAM,UAAU,oBAAI,IAA2B;AAC/C,aAAW,OAAO,MAAM;AACtB,UAAM,OAAO,IAAI,KAAK,KAAK,QAAQ;AACnC,QAAI,CAAC,KAAM;AACX,UAAM,KAAK,KAAK;AAChB,QAAI,MAAM,QAAQ,IAAI,EAAE;AACxB,QAAI,CAAC,KAAK;AACR,YAAM,CAAC;AACP,cAAQ,IAAI,IAAI,GAAG;AAAA,IACrB;AACA,QAAI,KAAK,GAAG;AAAA,EACd;AACA,QAAM,MAAmC,CAAC;AAE1C,aAAW,MAAM,CAAC,GAAG,QAAQ,KAAK,CAAC,EAAE,KAAK,GAAG;AAC3C,QAAI,EAAE,IAAI,cAAc,QAAQ,IAAI,EAAE,GAAoB,UAAU,SAAS,EAAE,CAAC;AAAA,EAClF;AACA,SAAO;AACT;AAEO,SAAS,YACd,OACA,MACA,aAC6C;AAC7C,QAAM,OAAO,YAAY,KAAK;AAC9B,QAAM,SAAS,IAAI,IAAI,KAAK,IAAI,CAAC,MAAM,CAAC,EAAE,MAAM,CAAC,CAAC,CAAC;AACnD,QAAM,SAAsD,
CAAC;AAC7D,aAAW,QAAQ,aAAa;AAC9B,UAAM,OAAO,OAAO,IAAI,IAAI;AAC5B,UAAM,WAAW,CAAC,OAAuB;AACvC,UAAI,CAAC,MAAM,MAAO,QAAO;AACzB,YAAM,QAAQ,KAAK,OAAO,KAAK,CAAC,MAAM,EAAE,OAAO,EAAE;AACjD,UAAI,CAAC,MAAO,QAAO;AACnB,aAAO,KAAK,MAAM,MAAM,OAAO,EAAE;AAAA,IACnC;AACA,WAAO,IAAI,IAAI,SAAS,MAAM,MAAM,QAAQ;AAAA,EAC9C;AACA,SAAO;AACT;;;ACnGA,SAAS,UAAU,MAAyC;AAI1D,MAAI,KAAK,WAAW,EAAG,QAAO,CAAC,EAAE,MAAM,CAAC,EAAE,CAAC;AAC3C,aAAW,KAAK,KAAM,KAAI,EAAE,OAAO,WAAW,EAAG,QAAO,CAAC;AACzD,QAAM,MAAkB,CAAC;AACzB,QAAM,MAAM,IAAI,MAAM,KAAK,MAAM,EAAE,KAAK,CAAC;AACzC,SAAO,MAAM;AACX,UAAM,OAAuD,CAAC;AAC9D,aAASA,KAAI,GAAGA,KAAI,KAAK,QAAQA,MAAK;AACpC,YAAM,OAAO,KAAKA,EAAC;AACnB,YAAM,IAAI,KAAK,OAAO,IAAIA,EAAC,CAAW;AACtC,WAAK,KAAK,IAAI,IAAI,EAAE,IAAI,EAAE,IAAI,OAAO,EAAE,MAAM;AAAA,IAC/C;AACA,QAAI,KAAK,EAAE,MAAM,KAAK,CAAC;AAEvB,QAAI,IAAI;AACR,WAAO,IAAI,KAAK,QAAQ;AACtB,YAAM,OAAQ,IAAI,CAAC,IAAe;AAClC,YAAM,OAAO,KAAK,CAAC;AACnB,UAAI,OAAO,KAAK,OAAO,QAAQ;AAC7B,YAAI,CAAC,IAAI;AACT;AAAA,MACF;AACA,UAAI,CAAC,IAAI;AACT;AAAA,IACF;AACA,QAAI,MAAM,KAAK,OAAQ;AAAA,EACzB;AACA,SAAO;AACT;AAEA,SAAS,eAAuB;AAG9B,QAAM,IAAI,KAAK,IAAI,EAAE,SAAS,EAAE;AAChC,MAAI,IAAI;AACR,WAAS,IAAI,GAAG,IAAI,GAAG,IAAK,MAAK,KAAK,MAAM,KAAK,OAAO,IAAI,EAAE,EAAE,SAAS,EAAE;AAC3E,SAAO,OAAO,CAAC,IAAI,CAAC;AACtB;AAEA,SAAS,gBAAwB,KAAkC;AACjE,QAAM,IAAI;AACV,SAAO;AAAA,IACL,QAAQ;AAAA,IACR,SAAS,EAAE,OAAO,OAAO,OAAO,EAAE;AAAA,IAClC,SAAS;AAAA,IACT,YAAY;AAAA,IACZ,OAAO;AAAA,MACL,SAAS,OAAO,GAAG,YAAY,WAAW,EAAE,UAAU,OAAO,GAAG;AAAA,MAChE,MAAM,OAAO,GAAG,SAAS,WAAW,EAAE,OAAO;AAAA,IAC/C;AAAA,EACF;AACF;AAEA,eAAsB,eACpB,MAC+B;AAC/B,QAAM,YAAY,KAAK,IAAI;AAC3B,QAAM,OAAO,KAAK,IAAI,GAAG,KAAK,QAAQ,CAAC;AACvC,QAAM,iBAAiB,KAAK,IAAI,GAAG,KAAK,kBAAkB,CAAC;AAC3D,QAAM,cAAc,KAAK,eAAe,OAAO;AAC/C,QAAM,cAAc,KAAK,eAAe,KAAK,KAAK,IAAI,CAAC,MAAM,EAAE,IAAI;AAEnE,QAAM,OAAO,UAAU,KAAK,IAAI;AAChC,QAAM,WAAW,KAAK,SAClB,KAAK,OAAO,CAAC,MAAO,KAAK,OAAoC,CAAC,CAAC,IAC/D;AACJ,QAAM,cAAc,KAAK,SAAS,SAAS;AAE3C,QAAM,UAAwB,CAAC;AAC/B,WAAS,IAAI,GAAG,IAAI,SAAS,QAAQ,KAAK;AACxC,aAAS,IAAI,GAAG,IAAI,MAAM,KAAK;AAC7B,cAAQ,KAAK;AAAA,QACX
,MAAO,SAAS,CAAC,EAAe;AAAA,QAChC,KAAK;AAAA,QACL,SAAS,IAAI,OAAO;AAAA,MACtB,CAAC;AAAA,IACH;AAAA,EACF;AAEA,QAAM,cAAuE,CAAC;AAC9E,MAAI,iBAAiB;AACrB,MAAI,qBAAqB;AACzB,MAAI,eAAe;AACnB,MAAI,mBAAmB;AAEvB,QAAM,UAAU,MAAe,KAAK,QAAQ,YAAY;AAOxD,MAAI,WAAW;AACf,MAAI,SAAS;AACb,MAAI;AACJ,QAAM,OAAO,IAAI,QAAc,CAAC,QAAQ;AACtC,iBAAa;AAAA,EACf,CAAC;AAED,QAAM,OAAO,MAAY;AACvB,WAAO,WAAW,kBAAkB,SAAS,QAAQ,QAAQ;AAC3D,UAAI,QAAQ,KAAK,oBAAoB;AAEnC,cAAM,OAAO,QAAQ,SAAS;AAC9B,4BAAoB;AACpB,iBAAS,QAAQ;AACjB;AAAA,MACF;AACA,YAAM,OAAO,QAAQ,QAAQ;AAC7B;AAGA,YAAM,SAAS,EAAE,MAAM,MAAM,CAAC,EAA0B;AACxD,kBAAY,KAAK,MAAM;AACvB,YAAM,WAAwC,YAAY;AACxD,YAAI;AACF,iBAAO,MAAM,KAAK,QAAQ,IAAI;AAAA,QAChC,SAAS,KAAK;AACZ,iBAAO,gBAAwB,GAAG;AAAA,QACpC;AAAA,MACF,GAAG;AACH,cAAQ,KAAK,CAAC,WAAW;AACvB,eAAO,KAAK,KAAK,MAAM;AACvB;AACA,0BAAkB,OAAO;AACzB,YAAI,kBAAkB,eAAe,CAAC,oBAAoB;AACxD,+BAAqB;AAErB,kBAAQ,KAAK,+BAA+B;AAAA,QAC9C;AACA,YAAI;AACF,eAAK,iBAAiB,MAAM,MAAM;AAAA,QACpC,QAAQ;AAAA,QAGR;AACA;AACA,YAAI,SAAS,QAAQ,QAAQ;AAC3B,eAAK;AAAA,QACP,WAAW,aAAa,GAAG;AACzB,uBAAa;AAAA,QACf;AAAA,MACF,CAAC;AAAA,IACH;AACA,QAAI,UAAU,QAAQ,UAAU,aAAa,EAAG,cAAa;AAAA,EAC/D;AAEA,QAAM,UAAU,MAAY;AAG1B,QAAI,SAAS,QAAQ,QAAQ;AAC3B,0BAAoB,QAAQ,SAAS;AACrC,eAAS,QAAQ;AAAA,IACnB;AACA,QAAI,aAAa,EAAG,cAAa;AAAA,EACnC;AACA,MAAI,KAAK,QAAQ;AACf,QAAI,KAAK,OAAO,SAAS;AACvB,yBAAmB,QAAQ;AAC3B,eAAS,QAAQ;AACjB,mBAAa;AAAA,IACf,OAAO;AACL,WAAK,OAAO,iBAAiB,SAAS,SAAS,EAAE,MAAM,KAAK,CAAC;AAAA,IAC/D;AAAA,EACF;AAEA,MAAI,QAAQ,WAAW,GAAG;AACxB,iBAAa;AAAA,EACf,OAAO;AACL,SAAK;AAAA,EACP;AAEA,QAAM;AACN,MAAI,KAAK,OAAQ,MAAK,OAAO,oBAAoB,SAAS,OAAO;AAEjE,cAAY,KAAK,CAAC,GAAG,MAAM,EAAE,KAAK,UAAU,EAAE,KAAK,OAAO;AAE1D,MAAI,OAAO;AACX,MAAI,WAAW;AACf,MAAI,YAAY;AAChB,MAAI,WAAW;AACf,aAAW,EAAE,KAAK,KAAK,aAAa;AAClC,eAAW,KAAK,MAAM;AACpB;AACA,YAAM,UAAU,EAAE,UAAU;AAC5B,UAAI,CAAC,WAAW,EAAE,QAAQ,MAAO;AACjC,kBAAY,UAAU,IAAI,EAAE,QAAQ;AACpC,mBAAa,EAAE;AAAA,IACjB;AAAA,EACF;AAEA,QAAM,SAAS,YAAY,aAAa,KAAK,MAAM,WAAW;AAE9D,SAAO;AAAA,IACL,OAAO;AAAA,IACP;AAAA,IACA,SAAS;AAAA,MACP,YAAY,QAAQ;AAAA,MACpB;AAAA,MACA,cAAc,mBAAmB,cAAc;AA
AA,MAC/C,iBAAiB,aAAa,IAAI,IAAI,OAAO;AAAA,MAC7C,kBAAkB,aAAa,IAAI,IAAI,WAAW;AAAA,MAClD,cAAc;AAAA,MACd,YAAY,KAAK,IAAI,IAAI;AAAA,IAC3B;AAAA,IACA,UAAU,aAAa;AAAA,EACzB;AACF;","names":["i"]}
1
+ {"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}
@@ -0,0 +1,276 @@
1
+ import { AgentProfile } from '@tangle-network/sandbox';
2
+ import { M as MatrixResult } from '../types-DHqkLwEU.js';
3
+ import '@tangle-network/agent-runtime/loops';
4
+
5
+ interface MultishotMessage {
6
+ role: 'user' | 'assistant' | 'tool';
7
+ content: string;
8
+ toolCallId?: string;
9
+ toolCalls?: Array<{
10
+ id: string;
11
+ name: string;
12
+ args: Record<string, unknown>;
13
+ }>;
14
+ }
15
+ interface MultishotArtifact {
16
+ type: string;
17
+ turn: number;
18
+ invocation: {
19
+ name: string;
20
+ args: Record<string, unknown>;
21
+ };
22
+ content: string;
23
+ }
24
+ interface MultishotResult {
25
+ transcript: MultishotMessage[];
26
+ artifacts: MultishotArtifact[];
27
+ toolCalls: number;
28
+ durationMs: number;
29
+ costUsd: number;
30
+ }
31
+ interface MultishotToolDefinition {
32
+ type: 'function';
33
+ function: {
34
+ name: string;
35
+ description: string;
36
+ parameters: Record<string, unknown>;
37
+ };
38
+ }
39
+ type MultishotToolExecutor = (args: Record<string, unknown>, ctx: {
40
+ apiKey: string;
41
+ baseUrl: string;
42
+ signal?: AbortSignal;
43
+ }) => Promise<{
44
+ content: string;
45
+ costUsd: number;
46
+ }>;
47
+ interface MultishotPersona {
48
+ /** Stable identifier — used for per-cell artifact paths + matrix axis keys. */
49
+ id: string;
50
+ /** Per-domain payload (income/profile/voice/etc.) shaped by the consumer. */
51
+ [k: string]: unknown;
52
+ }
53
+ interface MultishotShape<TPersona extends MultishotPersona> {
54
+ /** Opening user message (turn 0) — the persona's first ask. */
55
+ buildOpener: (persona: TPersona) => string;
56
+ /** System prompt the driver LLM uses to roleplay the persona. Should set
57
+ * voice, goals, constraints, time-pressure, and the "never go silent" rule. */
58
+ buildDriverSystemPrompt: (persona: TPersona) => string;
59
+ }
60
+ declare class MultishotDriverEmptyError extends Error {
61
+ readonly turn: number;
62
+ constructor(turn: number);
63
+ }
64
+
65
+ interface RouterCompletionRequest {
66
+ apiKey: string;
67
+ baseUrl: string;
68
+ model: string;
69
+ messages: Array<Record<string, unknown>>;
70
+ tools?: MultishotToolDefinition[];
71
+ temperature?: number;
72
+ maxTokens?: number;
73
+ signal?: AbortSignal;
74
+ }
75
+ interface RouterToolCall {
76
+ id: string;
77
+ type: 'function';
78
+ function: {
79
+ name: string;
80
+ arguments: string;
81
+ };
82
+ }
83
+ interface RouterCompletionResponse {
84
+ message: {
85
+ content?: string | null;
86
+ tool_calls?: RouterToolCall[];
87
+ };
88
+ usage?: {
89
+ prompt_tokens?: number;
90
+ completion_tokens?: number;
91
+ };
92
+ }
93
+ declare function routerCompletion(req: RouterCompletionRequest): Promise<RouterCompletionResponse>;
94
+ declare function estimateRouterCost(model: string, usage?: {
95
+ prompt_tokens?: number;
96
+ completion_tokens?: number;
97
+ }): number;
98
+ declare function defaultRouterBaseUrl(): string;
99
+ declare function requireRouterApiKey(): string;
100
+
101
+ declare const DEFAULT_RESEARCHER_MODEL = "openai/gpt-4o-mini";
102
+ declare const DEFAULT_CODER_MODEL = "openai/gpt-4o-mini";
103
+ interface DefaultResearcherConfig {
104
+ /** Replace the system prompt to bias the researcher toward a domain's
105
+ * citation style. Defaults to a generic "cite sources by name" prompt. */
106
+ systemPrompt?: string;
107
+ model?: string;
108
+ }
109
+ interface DefaultCoderConfig {
110
+ /** Replace the system prompt to bias the coder toward a language /
111
+ * framework / artifact style. */
112
+ systemPrompt?: string;
113
+ model?: string;
114
+ }
115
+ declare const DEFAULT_DELEGATE_RESEARCH_TOOL: MultishotToolDefinition;
116
+ declare const DEFAULT_DELEGATE_CODE_TOOL: MultishotToolDefinition;
117
+ declare function createResearchExecutor(config?: DefaultResearcherConfig): MultishotToolExecutor;
118
+ declare function createCodeExecutor(config?: DefaultCoderConfig): MultishotToolExecutor;
119
+ interface DefaultToolsConfig {
120
+ research?: DefaultResearcherConfig;
121
+ code?: DefaultCoderConfig;
122
+ /** When true (default), each tool result is recorded as a typed artifact:
123
+ * research → type='research', code → type='code'. */
124
+ recordArtifacts?: boolean;
125
+ }
126
+ interface DefaultToolsBundle {
127
+ tools: MultishotToolDefinition[];
128
+ executors: Record<string, MultishotToolExecutor>;
129
+ artifactTypeFor: (toolName: string) => string | undefined;
130
+ }
131
+ declare function defaultDelegationTools(config?: DefaultToolsConfig): DefaultToolsBundle;
132
+
133
+ declare const DEFAULT_JUDGE_MODEL = "openai/gpt-4o-mini";
134
+ interface JudgeDimension {
135
+ /** JSON field name + score key. */
136
+ key: string;
137
+ /** Description shown in the judge's user prompt. */
138
+ description: string;
139
+ }
140
+ interface JudgeConfig<TInput> {
141
+ /** Display name (for trace + log). */
142
+ name: string;
143
+ /** Model used for this judge. */
144
+ model?: string;
145
+ /** 0-10 scored dimensions. */
146
+ dimensions: JudgeDimension[];
147
+ /** Judge system prompt — sets persona + JSON-only constraint. */
148
+ systemPrompt: string;
149
+ /** Build the user prompt from the typed input. Must include "Respond with
150
+ * ONLY this JSON: { ... }" listing each dimension key. */
151
+ buildPrompt: (input: TInput) => string;
152
+ /** Optional model + api overrides. */
153
+ apiKey?: string;
154
+ baseUrl?: string;
155
+ }
156
+ interface JudgeScore {
157
+ /** Per-dimension 0-10 score. Missing dims default to 0. */
158
+ dimensions: Record<string, number>;
159
+ /** Mean across dimensions. */
160
+ composite: number;
161
+ /** Free-form 1-2 sentence critique from the judge (when provided). */
162
+ notes: string;
163
+ }
164
+ declare function runJudge<TInput>(judge: JudgeConfig<TInput>, input: TInput): Promise<JudgeScore>;
165
+ /** Convenience: stringified dimension list for inclusion in a judge prompt.
166
+ * Returns lines like `- audience_fit: Does this match what the audience cares about? (0-10)`. */
167
+ declare function renderDimensions(dims: readonly JudgeDimension[]): string;
168
+ /** Convenience: build the "Respond with ONLY this JSON" footer for a judge prompt. */
169
+ declare function renderJsonFooter(dims: readonly JudgeDimension[]): string;
170
+
171
+ /** Options accepted by `runMultishot`, generic over the persona type. */ interface RunMultishotOptions<TPersona extends MultishotPersona> {
172
+ profile: AgentProfile;
173
+ persona: TPersona;
174
+ shape: MultishotShape<TPersona>;
175
+ /** Tool definitions advertised to the agent. Defaults to delegate_research + delegate_code. */
176
+ tools?: MultishotToolDefinition[];
177
+ /** Map from tool name → executor invoked inline when the agent emits a tool_call. */
178
+ toolExecutors?: Record<string, MultishotToolExecutor>;
179
+ /** Callback from tool name → artifact type label written into MultishotArtifact.type.
180
+ * Tools without a mapping still execute, but their results aren't surfaced as
181
+ * typed artifacts (only as tool messages in the transcript). */
182
+ artifactTypeFor?: (toolName: string) => string | undefined;
183
+ /** NOTE(review): presumably the maximum number of conversation turns — confirm. */ maxTurns?: number;
184
+ /** NOTE(review): presumably the model used for the agent side — confirm. */ agentModel?: string;
185
+ /** NOTE(review): presumably the model used for the driver side — confirm. */ driverModel?: string;
186
+ apiKey?: string;
187
+ baseUrl?: string;
188
+ /** NOTE(review): presumably lets the caller cancel the run — confirm. */ signal?: AbortSignal;
189
+ }
190
+ /** Runs a multishot session described by `opts` and resolves to a `MultishotResult`. NOTE(review): declaration only — the turn loop and tool handling are not visible here. */ declare function runMultishot<TPersona extends MultishotPersona>(opts: RunMultishotOptions<TPersona>): Promise<MultishotResult>;
191
+
192
+ /** Input type of the `conversation` judge in `MultishotJudges`: a transcript plus the persona. */ interface ConversationJudgeInput<TPersona extends MultishotPersona> {
193
+ transcript: MultishotMessage[];
194
+ persona: TPersona;
195
+ }
196
+ /** Input type of the `codeReview` and `contentQuality` judges in `MultishotJudges`: one artifact plus the persona. */ interface ArtifactJudgeInput<TPersona extends MultishotPersona> {
197
+ artifact: MultishotArtifact;
198
+ persona: TPersona;
199
+ }
200
+ /** Judge configurations for a multishot matrix (the `judges` field of `RunMultishotMatrixOptions`), plus artifact-type routing. */ interface MultishotJudges<TPersona extends MultishotPersona> {
201
+ /** Scores the full transcript end-to-end (always runs). */
202
+ conversation: JudgeConfig<ConversationJudgeInput<TPersona>>;
203
+ /** Scores each code-type artifact. Optional — omit when the domain has no code artifacts. */
204
+ codeReview?: JudgeConfig<ArtifactJudgeInput<TPersona>>;
205
+ /** Scores each non-code (research/content/template) artifact. Optional. */
206
+ contentQuality?: JudgeConfig<ArtifactJudgeInput<TPersona>>;
207
+ /** Artifact types routed to `codeReview`. Defaults to ['code']. */
208
+ codeArtifactTypes?: string[];
209
+ /** Artifact types routed to `contentQuality`. Defaults to ['research']. */
210
+ contentArtifactTypes?: string[];
211
+ }
212
+ /** Judge scores gathered under one record: an overall composite, the conversation score, and optional per-artifact groups. NOTE(review): presumably one record per matrix cell; how `composite` is derived is not visible here — confirm. */ interface CellCompositeScore {
213
+ composite: number;
214
+ conversation: JudgeScore;
215
+ /** Present only when code-review scores exist — presumably when the `codeReview` judge ran; confirm. */ codeReview?: {
216
+ /** One `JudgeScore` per artifact, extended with `turn` and `type` (presumably the artifact's turn index and type label — confirm). */ perArtifact: Array<JudgeScore & {
217
+ turn: number;
218
+ type: string;
219
+ }>;
220
+ composite: number;
221
+ };
222
+ /** Same shape as `codeReview` — presumably filled by the `contentQuality` judge; confirm. */ contentQuality?: {
223
+ perArtifact: Array<JudgeScore & {
224
+ turn: number;
225
+ type: string;
226
+ }>;
227
+ composite: number;
228
+ };
229
+ }
230
+ /** Options accepted by `runMultishotMatrix`: the profile and persona axes, judges, tools, and run limits. */ interface RunMultishotMatrixOptions<TPersona extends MultishotPersona> {
231
+ /** AgentProfile axis (matrix primary): each entry pairs an `id` with the profile `value`. */
232
+ profiles: Array<{
233
+ id: string;
234
+ value: AgentProfile;
235
+ }>;
236
+ /** Persona axis. */
237
+ personas: TPersona[];
238
+ /** Persona-shaping callbacks. */
239
+ shape: MultishotShape<TPersona>;
240
+ /** Judge configurations. */
241
+ judges: MultishotJudges<TPersona>;
242
+ /** Tool definitions advertised to the agent. Defaults to delegate_research + delegate_code. */
243
+ tools?: MultishotToolDefinition[];
244
+ /** Map from tool name → inline executor. Must align with `tools`. */
245
+ toolExecutors?: Record<string, MultishotToolExecutor>;
246
+ /** Callback from tool name → artifact type label. Defaults to the research/code mapping. */
247
+ artifactTypeFor?: (toolName: string) => string | undefined;
248
+ /** Where per-cell artifacts land. Cells write to `<runDir>/<profileId>/<personaId>/rep-N/`. */
249
+ runDir: string;
250
+ /** Number of replicates per (profile, persona) cell. */
251
+ reps?: number;
252
+ /** Maximum conversation turns per cell. */
253
+ maxTurns?: number;
254
+ /** Maximum number of cells run concurrently. */
255
+ maxConcurrency?: number;
256
+ /** Total cost ceiling in $ across the whole matrix; cells are aborted past this. */
257
+ costCeiling?: number;
258
+ /** Agent model. */
259
+ agentModel?: string;
260
+ /** Driver model. */
261
+ driverModel?: string;
262
+ /** Pass-through fields (API key and base URL). NOTE(review): presumably forwarded unchanged to the underlying model calls — confirm. */
263
+ apiKey?: string;
264
+ baseUrl?: string;
265
+ }
266
+ /** Output payload type used as `MatrixResult<CellOutput>` in `RunMultishotMatrixResult`. NOTE(review): fields are presumably per-cell counts of turns, tool calls, and artifacts — confirm. */ interface CellOutput {
267
+ turns: number;
268
+ toolCalls: number;
269
+ artifactCount: number;
270
+ }
271
+ /** Value that `runMultishotMatrix` resolves to; wraps the matrix result whose cell output type is `CellOutput`. */ interface RunMultishotMatrixResult {
272
+ matrix: MatrixResult<CellOutput>;
273
+ }
274
+ /** Runs the multishot matrix described by `opts` and resolves to a `RunMultishotMatrixResult`. NOTE(review): declaration only — scheduling, judging, and cost enforcement are not visible here. */ declare function runMultishotMatrix<TPersona extends MultishotPersona>(opts: RunMultishotMatrixOptions<TPersona>): Promise<RunMultishotMatrixResult>;
275
+
276
+ export { type ArtifactJudgeInput, type CellCompositeScore, type ConversationJudgeInput, DEFAULT_CODER_MODEL, DEFAULT_DELEGATE_CODE_TOOL, DEFAULT_DELEGATE_RESEARCH_TOOL, DEFAULT_JUDGE_MODEL, DEFAULT_RESEARCHER_MODEL, type DefaultCoderConfig, type DefaultResearcherConfig, type DefaultToolsBundle, type DefaultToolsConfig, type JudgeConfig, type JudgeDimension, type JudgeScore, type MultishotArtifact, MultishotDriverEmptyError, type MultishotJudges, type MultishotMessage, type MultishotPersona, type MultishotResult, type MultishotShape, type MultishotToolDefinition, type MultishotToolExecutor, type RouterCompletionRequest, type RouterCompletionResponse, type RouterToolCall, type RunMultishotMatrixOptions, type RunMultishotMatrixResult, type RunMultishotOptions, createCodeExecutor, createResearchExecutor, defaultDelegationTools, defaultRouterBaseUrl, estimateRouterCost, renderDimensions, renderJsonFooter, requireRouterApiKey, routerCompletion, runJudge, runMultishot, runMultishotMatrix };