@tangle-network/agent-eval 0.35.0 → 0.37.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +34 -0
- package/dist/{chunk-HIO4UIS5.js → chunk-L5UNCDAJ.js} +207 -1
- package/dist/chunk-L5UNCDAJ.js.map +1 -0
- package/dist/index.d.ts +111 -3
- package/dist/index.js +204 -1
- package/dist/index.js.map +1 -1
- package/dist/matrix/index.d.ts +141 -0
- package/dist/matrix/index.js +277 -0
- package/dist/matrix/index.js.map +1 -0
- package/dist/openapi.json +1 -1
- package/dist/traces.d.ts +81 -2
- package/dist/traces.js +7 -1
- package/package.json +17 -2
- package/dist/chunk-HIO4UIS5.js.map +0 -1
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
import { DefaultVerdict } from '@tangle-network/agent-runtime/loops';
|
|
2
|
+
export { DefaultVerdict } from '@tangle-network/agent-runtime/loops';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* @experimental
|
|
6
|
+
*
|
|
7
|
+
* N-axis cartesian matrix over substrate types — types module.
|
|
8
|
+
*
|
|
9
|
+
* The matrix is a runner + aggregator. It iterates the cartesian product of
|
|
10
|
+
* caller-provided axes (any value type — `AgentProfile` from sandbox, `Driver`
|
|
11
|
+
* / `Validator` from agent-runtime, rubric records, thinking levels, anything)
|
|
12
|
+
* and aggregates per-axis pass/score/cost summaries. Substrate types are
|
|
13
|
+
* imported at the boundary by JSDoc only; the matrix never wraps them.
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
/**
 * A single dimension of the matrix.
 *
 * `V` is the caller's value type; this declaration places no constraint on it.
 */
interface MatrixAxis<V> {
    /** Axis name; also the key this axis is filed under in `MatrixResult.byAxis`. */
    name: string;
    /** Values to iterate. Each entry's stable `id` is the bucket key used in aggregation. */
    values: {
        id: string;
        value: V;
    }[];
    /**
     * Optional bucket-label override, called with an entry's `value` and `id`.
     * When omitted, the `id` itself is used as the label.
     */
    label?: (value: V, id: string) => string;
}
|
|
30
|
+
/** One point in the matrix: the entry picked from every axis, keyed by axis name. */
interface MatrixCell {
    axes: Record<string, {
        id: string;
        value: unknown;
    }>;
    /** Zero-based replicate number within one axis combination. */
    rep: number;
    /** Sort key that keeps the cartesian order stable when cells finish out of order. */
    ordinal: number;
}
|
|
41
|
+
/** What one `runCell` invocation reports back to the matrix. */
interface CellResult<Output> {
    output: Output;
    verdict: DefaultVerdict;
    costUsd: number;
    durationMs: number;
    runId?: string;
    /**
     * Set when `runCell` threw. A run carrying `error` counts as 0 towards both
     * passRate and meanScore, whatever `verdict` says.
     */
    error?: {
        message: string;
        kind: string;
    };
}
|
|
54
|
+
/** Aggregate statistics for the runs that share one value of one axis. */
interface AxisSummary {
    axisName: string;
    /** Label of the axis value: its `id`, unless the axis supplies a `label` function. */
    axisValue: string;
    /** Number of runs aggregated into this summary. */
    cells: number;
    /** Fraction of runs that passed; errored runs never count as passed. */
    passRate: number;
    /** Mean score; errored runs contribute a score of 0. */
    meanScore: number;
    /** Median score (linear interpolation between neighbours). */
    p50Score: number;
    /** 90th-percentile score (linear interpolation between neighbours). */
    p90Score: number;
    /** Sum of `costUsd` over all runs, errored ones included. */
    totalCostUsd: number;
    /** Mean of `durationMs` over all runs, errored ones included. */
    meanDurationMs: number;
}
|
|
65
|
+
/** Everything `runAgentMatrix` resolves with. */
interface MatrixResult<Output> {
    /** One record per scheduled cell, sorted by `cell.ordinal`. */
    cells: {
        cell: MatrixCell;
        runs: CellResult<Output>[];
    }[];
    /**
     * `byAxis[axisName][axisValueId] = summary`. Only axes listed in
     * `aggregateBy` appear here (by default, every axis in `axes`).
     */
    byAxis: Record<string, Record<string, AxisSummary>>;
    summary: {
        /** Planned cells after `filter` and replicate expansion. */
        totalCells: number;
        runsExecuted: number;
        /**
         * Cells dropped by `filter` (counted once per replicate) plus cells left
         * unscheduled once the cost ceiling or the abort signal tripped.
         */
        cellsSkipped: number;
        overallPassRate: number;
        overallMeanScore: number;
        totalCostUsd: number;
        /** Wall-clock duration of the whole matrix run, in milliseconds. */
        durationMs: number;
    };
    /** Id-like string generated when the run finishes. */
    matrixId: string;
}
|
|
87
|
+
/** Options accepted by `runAgentMatrix`. */
interface RunAgentMatrixOptions<Output> {
    axes: MatrixAxis<unknown>[];
    /**
     * Caller-supplied cell executor. It may throw: the matrix records the throw
     * as `CellResult.error` and carries on with the remaining cells.
     */
    runCell: (cell: MatrixCell) => Promise<CellResult<Output>>;
    /** Number of replicates per cell. Defaults to 1. */
    reps?: number;
    /** Predicate that prunes cells from the cartesian product BEFORE replicate expansion. */
    filter?: (cell: Omit<MatrixCell, 'rep' | 'ordinal'>) => boolean;
    /** Names of the axes to aggregate into `byAxis`. Defaults to every axis in `axes`. */
    aggregateBy?: string[];
    /** Upper bound on concurrently running `runCell` invocations. Defaults to 4. */
    maxConcurrency?: number;
    /**
     * Cumulative-cost threshold in USD. Once the running sum of `result.costUsd`
     * reaches this value no further cells are scheduled; cells already in flight
     * run to completion. Defaults to `Infinity`.
     */
    costCeiling?: number;
    /**
     * Called once per executed cell, after its promise settles. Exceptions
     * thrown by this callback are swallowed.
     */
    onCellComplete?: (cell: MatrixCell, result: CellResult<Output>) => void;
    /**
     * External cancellation. Once aborted, no further cells are scheduled and
     * cells already in flight run to completion. The runner does not pass the
     * signal to `runCell`; an executor that needs cancellation must observe the
     * signal itself.
     */
    signal?: AbortSignal;
}
|
|
110
|
+
|
|
111
|
+
/**
|
|
112
|
+
* Per-axis aggregation of cell runs into `AxisSummary` rows.
|
|
113
|
+
*
|
|
114
|
+
* Pure: consumes the final `cells: [{cell, runs}]` array and returns the
|
|
115
|
+
* `byAxis` table. Error runs contribute 0 to passRate and meanScore. Cost
|
|
116
|
+
* and duration always count — the budget was spent regardless.
|
|
117
|
+
*/
|
|
118
|
+
|
|
119
|
+
/** A single run paired with the cell that produced it. */
interface Row<Output> {
    cell: MatrixCell;
    result: CellResult<Output>;
}
|
|
123
|
+
/**
 * Folds a list of runs into one `AxisSummary`.
 *
 * An empty `rows` list yields a summary whose numeric fields are all 0.
 * Runs carrying `error` count as failed with a score of 0, but their cost and
 * duration are still included.
 *
 * @param rows Runs to summarise.
 * @param axisName Copied to `AxisSummary.axisName`.
 * @param axisValue Copied to `AxisSummary.axisValue`.
 */
declare function summariseRows<Output>(rows: Row<Output>[], axisName: string, axisValue: string): AxisSummary;
|
|
124
|
+
/**
 * Builds the `byAxis` table: for every axis name in `aggregateBy`, one
 * `AxisSummary` per axis-value id that occurs in `cells`.
 *
 * @param cells Cell records, as found in `MatrixResult.cells`.
 * @param axes Axis definitions; used to look up the optional `label` function.
 * @param aggregateBy Names of the axes to aggregate.
 */
declare function buildByAxis<Output>(cells: MatrixResult<Output>['cells'], axes: MatrixAxis<unknown>[], aggregateBy: string[]): Record<string, Record<string, AxisSummary>>;
|
|
125
|
+
|
|
126
|
+
/**
|
|
127
|
+
* N-axis cartesian runner.
|
|
128
|
+
*
|
|
129
|
+
* Expansion order: cartesian over `axes` in declared order, then `reps` as the
|
|
130
|
+
* inner-most dim → `ordinal = (cartIdx * reps) + rep`. The returned
|
|
131
|
+
* `cells[]` is sorted by `ordinal` so concurrent execution does not reorder
|
|
132
|
+
* the output.
|
|
133
|
+
*
|
|
134
|
+
* Scheduling is a sliding window of in-flight promises capped at
|
|
135
|
+
* `maxConcurrency`. The window stops admitting new cells when the cost
|
|
136
|
+
* ceiling trips or the abort signal fires; in-flight cells finish.
|
|
137
|
+
*/
|
|
138
|
+
|
|
139
|
+
/**
 * Runs `opts.runCell` over the cartesian product of `opts.axes` (times
 * `opts.reps`) and resolves with the per-cell records, the per-axis summaries
 * and the overall summary.
 */
declare function runAgentMatrix<Output>(opts: RunAgentMatrixOptions<Output>): Promise<MatrixResult<Output>>;
|
|
140
|
+
|
|
141
|
+
export { type AxisSummary, type CellResult, type MatrixAxis, type MatrixCell, type MatrixResult, type RunAgentMatrixOptions, buildByAxis, runAgentMatrix, summariseRows };
|
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
import "../chunk-PZ5AY32C.js";
|
|
2
|
+
|
|
3
|
+
// src/matrix/aggregation.ts
|
|
4
|
+
/**
 * Expands per-cell records into one row per run.
 *
 * @param cells Records of the form `{ cell, runs }`.
 * @returns A flat list of `{ cell, result }` pairs, in input order.
 */
function flattenRuns(cells) {
  const flat = [];
  for (const record of cells) {
    const { cell, runs } = record;
    for (const result of runs) {
      flat.push({ cell, result });
    }
  }
  return flat;
}
|
|
11
|
+
/**
 * Quantile of an ascending-sorted list of numbers, linearly interpolated
 * between the two neighbouring elements.
 *
 * @param sorted Numbers, already sorted ascending.
 * @param q Quantile to read; the callers in this module pass 0.5 and 0.9.
 * @returns 0 for an empty list, the only element for a one-element list,
 *   otherwise the interpolated value.
 */
function quantile(sorted, q) {
  const count = sorted.length;
  if (count < 2) {
    return count === 0 ? 0 : sorted[0];
  }
  const position = (count - 1) * q;
  const below = Math.floor(position);
  const above = Math.ceil(position);
  if (below === above) {
    // The position lands exactly on an element: nothing to interpolate.
    return sorted[below];
  }
  const weight = position - below;
  return sorted[below] * (1 - weight) + sorted[above] * weight;
}
|
|
21
|
+
/**
 * Folds a list of runs into one axis summary.
 *
 * A run carrying `error` counts as failed with a score of 0; its `verdict` is
 * not consulted. Cost and duration are accumulated for every run.
 *
 * @param rows `{ cell, result }` pairs to summarise.
 * @param axisName Copied into the summary.
 * @param axisValue Copied into the summary.
 * @returns The summary; every numeric field is 0 when `rows` is empty.
 */
function summariseRows(rows, axisName, axisValue) {
  const total = rows.length;
  if (total === 0) {
    return {
      axisName,
      axisValue,
      cells: 0,
      passRate: 0,
      meanScore: 0,
      p50Score: 0,
      p90Score: 0,
      totalCostUsd: 0,
      meanDurationMs: 0
    };
  }

  const scores = [];
  let passed = 0;
  let scoreTotal = 0;
  let costTotal = 0;
  let durationTotal = 0;
  for (let i = 0; i < total; i++) {
    const { result } = rows[i];
    const ranCleanly = result.error === undefined;
    const score = ranCleanly ? result.verdict.score : 0;
    if (ranCleanly && result.verdict.valid) {
      passed += 1;
    }
    scoreTotal += score;
    scores.push(score);
    costTotal += result.costUsd;
    durationTotal += result.durationMs;
  }

  // The quantile helper expects ascending order.
  scores.sort((a, b) => a - b);

  return {
    axisName,
    axisValue,
    cells: total,
    passRate: passed / total,
    meanScore: scoreTotal / total,
    p50Score: quantile(scores, 0.5),
    p90Score: quantile(scores, 0.9),
    totalCostUsd: costTotal,
    meanDurationMs: durationTotal / total
  };
}
|
|
63
|
+
/**
 * Groups rows by the id they carry on one axis and summarises every group.
 *
 * Rows whose cell has no entry for `axisName` are ignored.
 *
 * @param rows `{ cell, result }` pairs.
 * @param axisName Axis to group by.
 * @param labelFor Maps an axis-value id to the label stored in the summary.
 * @returns Summaries keyed by axis-value id, inserted in sorted id order.
 */
function bucketBy(rows, axisName, labelFor) {
  const buckets = new Map();
  for (const row of rows) {
    const slot = row.cell.axes[axisName];
    if (!slot) continue;
    const bucket = buckets.get(slot.id);
    if (bucket) {
      bucket.push(row);
    } else {
      buckets.set(slot.id, [row]);
    }
  }

  // Sorted ids give a deterministic key order in the returned object.
  const ids = [...buckets.keys()].sort();

  // Object.fromEntries defines every key as an own data property. A plain
  // `out[id] = ...` assignment would hit the prototype setter for the id
  // "__proto__" and silently drop that bucket from the result.
  return Object.fromEntries(
    ids.map((id) => [id, summariseRows(buckets.get(id), axisName, labelFor(id))])
  );
}
|
|
82
|
+
/**
 * Builds the per-axis summary table.
 *
 * @param cells Records of the form `{ cell, runs }`.
 * @param axes Axis definitions; consulted only for the optional `label` function.
 * @param aggregateBy Names of the axes to aggregate.
 * @returns `byAxis[axisName][axisValueId] = summary` for every name in
 *   `aggregateBy`. A name with no matching runs maps to an empty object.
 */
function buildByAxis(cells, axes, aggregateBy) {
  const rows = flattenRuns(cells);
  const axisByName = new Map(axes.map((axis) => [axis.name, axis]));
  const tables = [];
  for (const name of aggregateBy) {
    const axis = axisByName.get(name);
    // Without a label function (or for an unknown axis) the id is the label.
    let labelFor = (id) => id;
    if (axis?.label) {
      // Index the entries once instead of scanning `values` for every bucket.
      // The first entry wins on duplicate ids, matching a left-to-right search.
      const entryById = new Map();
      for (const entry of axis.values) {
        if (!entryById.has(entry.id)) entryById.set(entry.id, entry);
      }
      labelFor = (id) => {
        const entry = entryById.get(id);
        return entry ? axis.label(entry.value, id) : id;
      };
    }
    tables.push([name, bucketBy(rows, name, labelFor)]);
  }
  // Own data properties only, so an axis named "__proto__" is not lost to the
  // prototype setter as it would be with plain `byAxis[name] = ...`.
  return Object.fromEntries(tables);
}
|
|
98
|
+
|
|
99
|
+
// src/matrix/runner.ts
|
|
100
|
+
/**
 * Enumerates the cartesian product of the axes.
 *
 * The first axis varies fastest. An empty `axes` list yields a single cell
 * with no axes; an axis without values yields no cells at all.
 *
 * @param axes Axis definitions (`{ name, values: [{ id, value }] }`).
 * @returns One `{ axes }` object per combination, keyed by axis name.
 */
function cartesian(axes) {
  // Number of combinations. The empty product is 1, and a single axis with no
  // values brings it down to 0.
  let total = 1;
  for (const axis of axes) {
    total *= axis.values.length;
  }

  const combos = [];
  for (let n = 0; n < total; n++) {
    // Decode n as a mixed-radix number whose least significant digit belongs
    // to the first axis.
    const picked = {};
    let rest = n;
    for (const axis of axes) {
      const size = axis.values.length;
      const entry = axis.values[rest % size];
      picked[axis.name] = { id: entry.id, value: entry.value };
      rest = Math.floor(rest / size);
    }
    combos.push({ axes: picked });
  }
  return combos;
}
|
|
128
|
+
/**
 * Generates an id-like string for a matrix run: the `mtx_` prefix, the current
 * time in base 36, and eight random hexadecimal digits.
 */
function makeMatrixId() {
  const timePart = Date.now().toString(36);
  const randomPart = Array.from({ length: 8 }, () =>
    Math.floor(Math.random() * 16).toString(16)
  ).join("");
  return `mtx_${timePart}_${randomPart}`;
}
|
|
134
|
+
/**
 * Converts a value thrown by `runCell` into a failed cell result.
 *
 * @param err The thrown value; it need not be an `Error`.
 * @returns A result with no output, a failing verdict, zero cost and duration,
 *   and an `error` describing `err`. The message is `err.message` when that is
 *   a string, otherwise `String(err)`; the kind is `err.name` when that is a
 *   string, otherwise `"Error"`.
 */
function makeErrorResult(err) {
  let message;
  if (typeof err?.message === "string") {
    message = err.message;
  } else {
    message = String(err);
  }

  let kind = "Error";
  if (typeof err?.name === "string") {
    kind = err.name;
  }

  return {
    output: undefined,
    verdict: { valid: false, score: 0 },
    costUsd: 0,
    durationMs: 0,
    error: { message, kind }
  };
}
|
|
147
|
+
/**
 * Runs `opts.runCell` over the cartesian product of `opts.axes`, `reps` times
 * per combination, and aggregates the results.
 *
 * Scheduling keeps at most `maxConcurrency` cells in flight. Once the
 * cumulative cost reaches `costCeiling`, or `opts.signal` aborts, no further
 * cells are started; cells already in flight run to completion. The signal is
 * not passed to `runCell`.
 *
 * A `runCell` that throws, rejects, or resolves with something other than an
 * object is recorded as an errored run; it never rejects the matrix.
 *
 * @param opts See `RunAgentMatrixOptions`.
 * @returns The cell records (sorted by ordinal), the per-axis summaries, the
 *   overall summary and a generated matrix id.
 * @throws RangeError (as a rejection) when `opts.reps` is `Infinity`.
 */
async function runAgentMatrix(opts) {
  const startedAt = Date.now();

  // Numeric options are normalised up front. `Math.max(1, NaN)` is NaN, and a
  // NaN concurrency cap makes `inFlight < maxConcurrency` permanently false:
  // nothing would be scheduled and the returned promise would never settle.
  const requestedReps = Number(opts.reps ?? 1);
  if (requestedReps === Number.POSITIVE_INFINITY) {
    throw new RangeError("[matrix] reps must be a finite number");
  }
  // Rounding up keeps the run count of a fractional value while making the
  // ordinals integers.
  const reps = Number.isNaN(requestedReps) ? 1 : Math.max(1, Math.ceil(requestedReps));
  const requestedConcurrency = Number(opts.maxConcurrency ?? 4);
  // Infinity stays valid here: it means "no concurrency cap".
  const maxConcurrency = Number.isNaN(requestedConcurrency)
    ? 4
    : Math.max(1, requestedConcurrency);
  const costCeiling = opts.costCeiling ?? Number.POSITIVE_INFINITY;
  const aggregateBy = opts.aggregateBy ?? opts.axes.map((a) => a.name);

  // Plan: cartesian product, then filter, then replicate expansion.
  const base = cartesian(opts.axes);
  const filtered = opts.filter ? base.filter((c) => opts.filter(c)) : base;
  const filteredOut = base.length - filtered.length;
  const planned = [];
  for (let i = 0; i < filtered.length; i++) {
    for (let r = 0; r < reps; r++) {
      planned.push({
        axes: filtered[i].axes,
        rep: r,
        ordinal: i * reps + r
      });
    }
  }

  const cellRecords = [];
  let cumulativeCost = 0;
  let costCeilingReached = false;
  let runsExecuted = 0;
  let cellsUnscheduled = 0;
  const aborted = () => opts.signal?.aborted === true;

  let inFlight = 0;
  let cursor = 0;
  let resolveAll;
  const done = new Promise((res) => {
    resolveAll = res;
  });

  // Starts cells until the window is full or the plan is exhausted.
  const pump = () => {
    while (inFlight < maxConcurrency && cursor < planned.length) {
      if (aborted() || costCeilingReached) {
        // Everything not yet started is counted as unscheduled.
        const left = planned.length - cursor;
        cellsUnscheduled += left;
        cursor = planned.length;
        break;
      }
      const cell = planned[cursor++];
      inFlight++;
      // Records are pushed in start order and sorted by ordinal at the end.
      const record = { cell, runs: [] };
      cellRecords.push(record);
      const promise = (async () => {
        try {
          const result = await opts.runCell(cell);
          if (result === null || typeof result !== "object") {
            // Without this guard the completion handler below would throw on
            // `result.costUsd`, leaving `inFlight` stuck and the matrix hanging.
            throw new TypeError("runCell resolved with a non-object result");
          }
          return result;
        } catch (err) {
          return makeErrorResult(err);
        }
      })();
      promise.then((result) => {
        record.runs.push(result);
        runsExecuted++;
        cumulativeCost += result.costUsd;
        if (cumulativeCost >= costCeiling && !costCeilingReached) {
          costCeilingReached = true;
          console.warn("[matrix] cost ceiling reached");
        }
        try {
          opts.onCellComplete?.(cell, result);
        } catch {
          // The callback is observational: a throw must not derail the run.
        }
        inFlight--;
        if (cursor < planned.length) {
          pump();
        } else if (inFlight === 0) {
          resolveAll?.();
        }
      });
    }
    if (cursor >= planned.length && inFlight === 0) resolveAll?.();
  };

  // Abort handler: stop scheduling; cells already in flight still finish.
  const onAbort = () => {
    if (cursor < planned.length) {
      cellsUnscheduled += planned.length - cursor;
      cursor = planned.length;
    }
    if (inFlight === 0) resolveAll?.();
  };
  if (opts.signal) {
    if (opts.signal.aborted) {
      // Aborted before the run started: nothing is scheduled.
      cellsUnscheduled = planned.length;
      cursor = planned.length;
      resolveAll?.();
    } else {
      opts.signal.addEventListener("abort", onAbort, { once: true });
    }
  }

  if (planned.length === 0) {
    resolveAll?.();
  } else {
    pump();
  }
  await done;
  if (opts.signal) opts.signal.removeEventListener("abort", onAbort);

  // Restore cartesian order regardless of completion order.
  cellRecords.sort((a, b) => a.cell.ordinal - b.cell.ordinal);

  let pass = 0;
  let scoreSum = 0;
  let totalCost = 0;
  let runCount = 0;
  for (const { runs } of cellRecords) {
    for (const r of runs) {
      runCount++;
      const errored = r.error !== undefined;
      if (!errored && r.verdict.valid) pass++;
      scoreSum += errored ? 0 : r.verdict.score;
      // Cost counts even for errored runs.
      totalCost += r.costUsd;
    }
  }

  const byAxis = buildByAxis(cellRecords, opts.axes, aggregateBy);
  return {
    cells: cellRecords,
    byAxis,
    summary: {
      totalCells: planned.length,
      runsExecuted,
      cellsSkipped: cellsUnscheduled + filteredOut * reps,
      overallPassRate: runCount === 0 ? 0 : pass / runCount,
      overallMeanScore: runCount === 0 ? 0 : scoreSum / runCount,
      totalCostUsd: totalCost,
      durationMs: Date.now() - startedAt
    },
    matrixId: makeMatrixId()
  };
}
|
|
272
|
+
export {
|
|
273
|
+
buildByAxis,
|
|
274
|
+
runAgentMatrix,
|
|
275
|
+
summariseRows
|
|
276
|
+
};
|
|
277
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../../src/matrix/aggregation.ts","../../src/matrix/runner.ts"],"sourcesContent":["/**\n * Per-axis aggregation of cell runs into `AxisSummary` rows.\n *\n * Pure: consumes the final `cells: [{cell, runs}]` array and returns the\n * `byAxis` table. Error runs contribute 0 to passRate and meanScore. Cost\n * and duration always count — the budget was spent regardless.\n */\n\nimport type { AxisSummary, CellResult, MatrixAxis, MatrixCell, MatrixResult } from './types'\n\ninterface Row<Output> {\n cell: MatrixCell\n result: CellResult<Output>\n}\n\nfunction flattenRuns<Output>(cells: MatrixResult<Output>['cells']): Row<Output>[] {\n const rows: Row<Output>[] = []\n for (const { cell, runs } of cells) {\n for (const result of runs) rows.push({ cell, result })\n }\n return rows\n}\n\nfunction quantile(sorted: number[], q: number): number {\n if (sorted.length === 0) return 0\n if (sorted.length === 1) return sorted[0] as number\n const pos = (sorted.length - 1) * q\n const lo = Math.floor(pos)\n const hi = Math.ceil(pos)\n if (lo === hi) return sorted[lo] as number\n const frac = pos - lo\n return (sorted[lo] as number) * (1 - frac) + (sorted[hi] as number) * frac\n}\n\nexport function summariseRows<Output>(\n rows: Row<Output>[],\n axisName: string,\n axisValue: string,\n): AxisSummary {\n if (rows.length === 0) {\n return {\n axisName,\n axisValue,\n cells: 0,\n passRate: 0,\n meanScore: 0,\n p50Score: 0,\n p90Score: 0,\n totalCostUsd: 0,\n meanDurationMs: 0,\n }\n }\n let pass = 0\n let scoreSum = 0\n let costSum = 0\n let durSum = 0\n const scores: number[] = []\n for (const { result } of rows) {\n const errored = result.error !== undefined\n const score = errored ? 
0 : result.verdict.score\n const valid = !errored && result.verdict.valid\n if (valid) pass++\n scoreSum += score\n scores.push(score)\n costSum += result.costUsd\n durSum += result.durationMs\n }\n scores.sort((a, b) => a - b)\n return {\n axisName,\n axisValue,\n cells: rows.length,\n passRate: pass / rows.length,\n meanScore: scoreSum / rows.length,\n p50Score: quantile(scores, 0.5),\n p90Score: quantile(scores, 0.9),\n totalCostUsd: costSum,\n meanDurationMs: durSum / rows.length,\n }\n}\n\nfunction bucketBy<Output>(\n rows: Row<Output>[],\n axisName: string,\n labelFor: (id: string) => string,\n): Record<string, AxisSummary> {\n const buckets = new Map<string, Row<Output>[]>()\n for (const row of rows) {\n const slot = row.cell.axes[axisName]\n if (!slot) continue\n const id = slot.id\n let arr = buckets.get(id)\n if (!arr) {\n arr = []\n buckets.set(id, arr)\n }\n arr.push(row)\n }\n const out: Record<string, AxisSummary> = {}\n // Sorted keys for deterministic JSON serialisation.\n for (const id of [...buckets.keys()].sort()) {\n out[id] = summariseRows(buckets.get(id) as Row<Output>[], axisName, labelFor(id))\n }\n return out\n}\n\nexport function buildByAxis<Output>(\n cells: MatrixResult<Output>['cells'],\n axes: MatrixAxis<unknown>[],\n aggregateBy: string[],\n): Record<string, Record<string, AxisSummary>> {\n const rows = flattenRuns(cells)\n const byName = new Map(axes.map((a) => [a.name, a]))\n const byAxis: Record<string, Record<string, AxisSummary>> = {}\n for (const name of aggregateBy) {\n const axis = byName.get(name)\n const labelFor = (id: string): string => {\n if (!axis?.label) return id\n const found = axis.values.find((v) => v.id === id)\n if (!found) return id\n return axis.label(found.value, id)\n }\n byAxis[name] = bucketBy(rows, name, labelFor)\n }\n return byAxis\n}\n","/**\n * N-axis cartesian runner.\n *\n * Expansion order: cartesian over `axes` in declared order, then `reps` as the\n * inner-most dim → `ordinal = (cartIdx * reps) + 
rep`. The returned\n * `cells[]` is sorted by `ordinal` so concurrent execution does not reorder\n * the output.\n *\n * Scheduling is a sliding window of in-flight promises capped at\n * `maxConcurrency`. The window stops admitting new cells when the cost\n * ceiling trips or the abort signal fires; in-flight cells finish.\n */\n\nimport { buildByAxis } from './aggregation'\nimport type {\n CellResult,\n MatrixAxis,\n MatrixCell,\n MatrixResult,\n RunAgentMatrixOptions,\n} from './types'\n\ninterface BaseCell {\n axes: Record<string, { id: string; value: unknown }>\n}\n\nfunction cartesian(axes: MatrixAxis<unknown>[]): BaseCell[] {\n // Empty axes (`values=[]`) collapse the whole product to zero cells. An\n // empty `axes` array yields a single empty-axes cell — degenerate but\n // valid (caller is iterating only reps).\n if (axes.length === 0) return [{ axes: {} }]\n for (const a of axes) if (a.values.length === 0) return []\n const out: BaseCell[] = []\n const idx = new Array(axes.length).fill(0)\n while (true) {\n const slot: Record<string, { id: string; value: unknown }> = {}\n for (let i = 0; i < axes.length; i++) {\n const axis = axes[i] as MatrixAxis<unknown>\n const v = axis.values[idx[i] as number] as { id: string; value: unknown }\n slot[axis.name] = { id: v.id, value: v.value }\n }\n out.push({ axes: slot })\n // Increment like an odometer, left-most axis is fastest.\n let i = 0\n while (i < axes.length) {\n const next = (idx[i] as number) + 1\n const axis = axes[i] as MatrixAxis<unknown>\n if (next < axis.values.length) {\n idx[i] = next\n break\n }\n idx[i] = 0\n i++\n }\n if (i === axes.length) break\n }\n return out\n}\n\nfunction makeMatrixId(): string {\n // Stable id-like string: time + 8 random hex chars. 
Avoids node:crypto\n // import to keep the matrix dep-free.\n const t = Date.now().toString(36)\n let r = ''\n for (let i = 0; i < 8; i++) r += Math.floor(Math.random() * 16).toString(16)\n return `mtx_${t}_${r}`\n}\n\nfunction makeErrorResult<Output>(err: unknown): CellResult<Output> {\n const e = err as { message?: string; name?: string }\n return {\n output: undefined as unknown as Output,\n verdict: { valid: false, score: 0 },\n costUsd: 0,\n durationMs: 0,\n error: {\n message: typeof e?.message === 'string' ? e.message : String(err),\n kind: typeof e?.name === 'string' ? e.name : 'Error',\n },\n }\n}\n\nexport async function runAgentMatrix<Output>(\n opts: RunAgentMatrixOptions<Output>,\n): Promise<MatrixResult<Output>> {\n const startedAt = Date.now()\n const reps = Math.max(1, opts.reps ?? 1)\n const maxConcurrency = Math.max(1, opts.maxConcurrency ?? 4)\n const costCeiling = opts.costCeiling ?? Number.POSITIVE_INFINITY\n const aggregateBy = opts.aggregateBy ?? opts.axes.map((a) => a.name)\n\n const base = cartesian(opts.axes)\n const filtered = opts.filter\n ? base.filter((c) => (opts.filter as (b: BaseCell) => boolean)(c))\n : base\n const filteredOut = base.length - filtered.length\n\n const planned: MatrixCell[] = []\n for (let i = 0; i < filtered.length; i++) {\n for (let r = 0; r < reps; r++) {\n planned.push({\n axes: (filtered[i] as BaseCell).axes,\n rep: r,\n ordinal: i * reps + r,\n })\n }\n }\n\n const cellRecords: Array<{ cell: MatrixCell; runs: CellResult<Output>[] }> = []\n let cumulativeCost = 0\n let costCeilingReached = false\n let runsExecuted = 0\n let cellsUnscheduled = 0\n\n const aborted = (): boolean => opts.signal?.aborted === true\n\n // Per-run abort controller forwards the external signal so cell executors\n // see cancellation. We don't expose it on `MatrixCell` — the signature on\n // `runCell` per the public API is `(cell) => Promise<...>`. 
Executors that\n // need cancellation use the external signal directly via closure.\n\n let inFlight = 0\n let cursor = 0\n let resolveAll: (() => void) | undefined\n const done = new Promise<void>((res) => {\n resolveAll = res\n })\n\n const pump = (): void => {\n while (inFlight < maxConcurrency && cursor < planned.length) {\n if (aborted() || costCeilingReached) {\n // Drain remaining as unscheduled.\n const left = planned.length - cursor\n cellsUnscheduled += left\n cursor = planned.length\n break\n }\n const cell = planned[cursor++] as MatrixCell\n inFlight++\n // Lazily allocate the record so cells appear in `cells[]` in any\n // arrival order; we sort by ordinal at the end.\n const record = { cell, runs: [] as CellResult<Output>[] }\n cellRecords.push(record)\n const promise: Promise<CellResult<Output>> = (async () => {\n try {\n return await opts.runCell(cell)\n } catch (err) {\n return makeErrorResult<Output>(err)\n }\n })()\n promise.then((result) => {\n record.runs.push(result)\n runsExecuted++\n cumulativeCost += result.costUsd\n if (cumulativeCost >= costCeiling && !costCeilingReached) {\n costCeilingReached = true\n // eslint-disable-next-line no-console\n console.warn('[matrix] cost ceiling reached')\n }\n try {\n opts.onCellComplete?.(cell, result)\n } catch {\n // onCellComplete is observational — swallow throws so a noisy\n // callback can't tank the run.\n }\n inFlight--\n if (cursor < planned.length) {\n pump()\n } else if (inFlight === 0) {\n resolveAll?.()\n }\n })\n }\n if (cursor >= planned.length && inFlight === 0) resolveAll?.()\n }\n\n const onAbort = (): void => {\n // External abort: stop scheduling. 
In-flight cells finish; their\n // executors observe `opts.signal.aborted` directly via closure.\n if (cursor < planned.length) {\n cellsUnscheduled += planned.length - cursor\n cursor = planned.length\n }\n if (inFlight === 0) resolveAll?.()\n }\n if (opts.signal) {\n if (opts.signal.aborted) {\n cellsUnscheduled = planned.length\n cursor = planned.length\n resolveAll?.()\n } else {\n opts.signal.addEventListener('abort', onAbort, { once: true })\n }\n }\n\n if (planned.length === 0) {\n resolveAll?.()\n } else {\n pump()\n }\n\n await done\n if (opts.signal) opts.signal.removeEventListener('abort', onAbort)\n\n cellRecords.sort((a, b) => a.cell.ordinal - b.cell.ordinal)\n\n let pass = 0\n let scoreSum = 0\n let totalCost = 0\n let runCount = 0\n for (const { runs } of cellRecords) {\n for (const r of runs) {\n runCount++\n const errored = r.error !== undefined\n if (!errored && r.verdict.valid) pass++\n scoreSum += errored ? 0 : r.verdict.score\n totalCost += r.costUsd\n }\n }\n\n const byAxis = buildByAxis(cellRecords, opts.axes, aggregateBy)\n\n return {\n cells: cellRecords,\n byAxis,\n summary: {\n totalCells: planned.length,\n runsExecuted,\n cellsSkipped: cellsUnscheduled + filteredOut * reps,\n overallPassRate: runCount === 0 ? 0 : pass / runCount,\n overallMeanScore: runCount === 0 ? 
0 : scoreSum / runCount,\n totalCostUsd: totalCost,\n durationMs: Date.now() - startedAt,\n },\n matrixId: makeMatrixId(),\n }\n}\n"],"mappings":";;;AAeA,SAAS,YAAoB,OAAqD;AAChF,QAAM,OAAsB,CAAC;AAC7B,aAAW,EAAE,MAAM,KAAK,KAAK,OAAO;AAClC,eAAW,UAAU,KAAM,MAAK,KAAK,EAAE,MAAM,OAAO,CAAC;AAAA,EACvD;AACA,SAAO;AACT;AAEA,SAAS,SAAS,QAAkB,GAAmB;AACrD,MAAI,OAAO,WAAW,EAAG,QAAO;AAChC,MAAI,OAAO,WAAW,EAAG,QAAO,OAAO,CAAC;AACxC,QAAM,OAAO,OAAO,SAAS,KAAK;AAClC,QAAM,KAAK,KAAK,MAAM,GAAG;AACzB,QAAM,KAAK,KAAK,KAAK,GAAG;AACxB,MAAI,OAAO,GAAI,QAAO,OAAO,EAAE;AAC/B,QAAM,OAAO,MAAM;AACnB,SAAQ,OAAO,EAAE,KAAgB,IAAI,QAAS,OAAO,EAAE,IAAe;AACxE;AAEO,SAAS,cACd,MACA,UACA,WACa;AACb,MAAI,KAAK,WAAW,GAAG;AACrB,WAAO;AAAA,MACL;AAAA,MACA;AAAA,MACA,OAAO;AAAA,MACP,UAAU;AAAA,MACV,WAAW;AAAA,MACX,UAAU;AAAA,MACV,UAAU;AAAA,MACV,cAAc;AAAA,MACd,gBAAgB;AAAA,IAClB;AAAA,EACF;AACA,MAAI,OAAO;AACX,MAAI,WAAW;AACf,MAAI,UAAU;AACd,MAAI,SAAS;AACb,QAAM,SAAmB,CAAC;AAC1B,aAAW,EAAE,OAAO,KAAK,MAAM;AAC7B,UAAM,UAAU,OAAO,UAAU;AACjC,UAAM,QAAQ,UAAU,IAAI,OAAO,QAAQ;AAC3C,UAAM,QAAQ,CAAC,WAAW,OAAO,QAAQ;AACzC,QAAI,MAAO;AACX,gBAAY;AACZ,WAAO,KAAK,KAAK;AACjB,eAAW,OAAO;AAClB,cAAU,OAAO;AAAA,EACnB;AACA,SAAO,KAAK,CAAC,GAAG,MAAM,IAAI,CAAC;AAC3B,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,OAAO,KAAK;AAAA,IACZ,UAAU,OAAO,KAAK;AAAA,IACtB,WAAW,WAAW,KAAK;AAAA,IAC3B,UAAU,SAAS,QAAQ,GAAG;AAAA,IAC9B,UAAU,SAAS,QAAQ,GAAG;AAAA,IAC9B,cAAc;AAAA,IACd,gBAAgB,SAAS,KAAK;AAAA,EAChC;AACF;AAEA,SAAS,SACP,MACA,UACA,UAC6B;AAC7B,QAAM,UAAU,oBAAI,IAA2B;AAC/C,aAAW,OAAO,MAAM;AACtB,UAAM,OAAO,IAAI,KAAK,KAAK,QAAQ;AACnC,QAAI,CAAC,KAAM;AACX,UAAM,KAAK,KAAK;AAChB,QAAI,MAAM,QAAQ,IAAI,EAAE;AACxB,QAAI,CAAC,KAAK;AACR,YAAM,CAAC;AACP,cAAQ,IAAI,IAAI,GAAG;AAAA,IACrB;AACA,QAAI,KAAK,GAAG;AAAA,EACd;AACA,QAAM,MAAmC,CAAC;AAE1C,aAAW,MAAM,CAAC,GAAG,QAAQ,KAAK,CAAC,EAAE,KAAK,GAAG;AAC3C,QAAI,EAAE,IAAI,cAAc,QAAQ,IAAI,EAAE,GAAoB,UAAU,SAAS,EAAE,CAAC;AAAA,EAClF;AACA,SAAO;AACT;AAEO,SAAS,YACd,OACA,MACA,aAC6C;AAC7C,QAAM,OAAO,YAAY,KAAK;AAC9B,QAAM,SAAS,IAAI,IAAI,KAAK,IAAI,CAAC,MAAM,CAAC,EAAE,MAAM,CAAC,CAAC,CAAC;AACnD,QAAM,SAAsD,
CAAC;AAC7D,aAAW,QAAQ,aAAa;AAC9B,UAAM,OAAO,OAAO,IAAI,IAAI;AAC5B,UAAM,WAAW,CAAC,OAAuB;AACvC,UAAI,CAAC,MAAM,MAAO,QAAO;AACzB,YAAM,QAAQ,KAAK,OAAO,KAAK,CAAC,MAAM,EAAE,OAAO,EAAE;AACjD,UAAI,CAAC,MAAO,QAAO;AACnB,aAAO,KAAK,MAAM,MAAM,OAAO,EAAE;AAAA,IACnC;AACA,WAAO,IAAI,IAAI,SAAS,MAAM,MAAM,QAAQ;AAAA,EAC9C;AACA,SAAO;AACT;;;ACnGA,SAAS,UAAU,MAAyC;AAI1D,MAAI,KAAK,WAAW,EAAG,QAAO,CAAC,EAAE,MAAM,CAAC,EAAE,CAAC;AAC3C,aAAW,KAAK,KAAM,KAAI,EAAE,OAAO,WAAW,EAAG,QAAO,CAAC;AACzD,QAAM,MAAkB,CAAC;AACzB,QAAM,MAAM,IAAI,MAAM,KAAK,MAAM,EAAE,KAAK,CAAC;AACzC,SAAO,MAAM;AACX,UAAM,OAAuD,CAAC;AAC9D,aAASA,KAAI,GAAGA,KAAI,KAAK,QAAQA,MAAK;AACpC,YAAM,OAAO,KAAKA,EAAC;AACnB,YAAM,IAAI,KAAK,OAAO,IAAIA,EAAC,CAAW;AACtC,WAAK,KAAK,IAAI,IAAI,EAAE,IAAI,EAAE,IAAI,OAAO,EAAE,MAAM;AAAA,IAC/C;AACA,QAAI,KAAK,EAAE,MAAM,KAAK,CAAC;AAEvB,QAAI,IAAI;AACR,WAAO,IAAI,KAAK,QAAQ;AACtB,YAAM,OAAQ,IAAI,CAAC,IAAe;AAClC,YAAM,OAAO,KAAK,CAAC;AACnB,UAAI,OAAO,KAAK,OAAO,QAAQ;AAC7B,YAAI,CAAC,IAAI;AACT;AAAA,MACF;AACA,UAAI,CAAC,IAAI;AACT;AAAA,IACF;AACA,QAAI,MAAM,KAAK,OAAQ;AAAA,EACzB;AACA,SAAO;AACT;AAEA,SAAS,eAAuB;AAG9B,QAAM,IAAI,KAAK,IAAI,EAAE,SAAS,EAAE;AAChC,MAAI,IAAI;AACR,WAAS,IAAI,GAAG,IAAI,GAAG,IAAK,MAAK,KAAK,MAAM,KAAK,OAAO,IAAI,EAAE,EAAE,SAAS,EAAE;AAC3E,SAAO,OAAO,CAAC,IAAI,CAAC;AACtB;AAEA,SAAS,gBAAwB,KAAkC;AACjE,QAAM,IAAI;AACV,SAAO;AAAA,IACL,QAAQ;AAAA,IACR,SAAS,EAAE,OAAO,OAAO,OAAO,EAAE;AAAA,IAClC,SAAS;AAAA,IACT,YAAY;AAAA,IACZ,OAAO;AAAA,MACL,SAAS,OAAO,GAAG,YAAY,WAAW,EAAE,UAAU,OAAO,GAAG;AAAA,MAChE,MAAM,OAAO,GAAG,SAAS,WAAW,EAAE,OAAO;AAAA,IAC/C;AAAA,EACF;AACF;AAEA,eAAsB,eACpB,MAC+B;AAC/B,QAAM,YAAY,KAAK,IAAI;AAC3B,QAAM,OAAO,KAAK,IAAI,GAAG,KAAK,QAAQ,CAAC;AACvC,QAAM,iBAAiB,KAAK,IAAI,GAAG,KAAK,kBAAkB,CAAC;AAC3D,QAAM,cAAc,KAAK,eAAe,OAAO;AAC/C,QAAM,cAAc,KAAK,eAAe,KAAK,KAAK,IAAI,CAAC,MAAM,EAAE,IAAI;AAEnE,QAAM,OAAO,UAAU,KAAK,IAAI;AAChC,QAAM,WAAW,KAAK,SAClB,KAAK,OAAO,CAAC,MAAO,KAAK,OAAoC,CAAC,CAAC,IAC/D;AACJ,QAAM,cAAc,KAAK,SAAS,SAAS;AAE3C,QAAM,UAAwB,CAAC;AAC/B,WAAS,IAAI,GAAG,IAAI,SAAS,QAAQ,KAAK;AACxC,aAAS,IAAI,GAAG,IAAI,MAAM,KAAK;AAC7B,cAAQ,KAAK;AAAA,QACX
,MAAO,SAAS,CAAC,EAAe;AAAA,QAChC,KAAK;AAAA,QACL,SAAS,IAAI,OAAO;AAAA,MACtB,CAAC;AAAA,IACH;AAAA,EACF;AAEA,QAAM,cAAuE,CAAC;AAC9E,MAAI,iBAAiB;AACrB,MAAI,qBAAqB;AACzB,MAAI,eAAe;AACnB,MAAI,mBAAmB;AAEvB,QAAM,UAAU,MAAe,KAAK,QAAQ,YAAY;AAOxD,MAAI,WAAW;AACf,MAAI,SAAS;AACb,MAAI;AACJ,QAAM,OAAO,IAAI,QAAc,CAAC,QAAQ;AACtC,iBAAa;AAAA,EACf,CAAC;AAED,QAAM,OAAO,MAAY;AACvB,WAAO,WAAW,kBAAkB,SAAS,QAAQ,QAAQ;AAC3D,UAAI,QAAQ,KAAK,oBAAoB;AAEnC,cAAM,OAAO,QAAQ,SAAS;AAC9B,4BAAoB;AACpB,iBAAS,QAAQ;AACjB;AAAA,MACF;AACA,YAAM,OAAO,QAAQ,QAAQ;AAC7B;AAGA,YAAM,SAAS,EAAE,MAAM,MAAM,CAAC,EAA0B;AACxD,kBAAY,KAAK,MAAM;AACvB,YAAM,WAAwC,YAAY;AACxD,YAAI;AACF,iBAAO,MAAM,KAAK,QAAQ,IAAI;AAAA,QAChC,SAAS,KAAK;AACZ,iBAAO,gBAAwB,GAAG;AAAA,QACpC;AAAA,MACF,GAAG;AACH,cAAQ,KAAK,CAAC,WAAW;AACvB,eAAO,KAAK,KAAK,MAAM;AACvB;AACA,0BAAkB,OAAO;AACzB,YAAI,kBAAkB,eAAe,CAAC,oBAAoB;AACxD,+BAAqB;AAErB,kBAAQ,KAAK,+BAA+B;AAAA,QAC9C;AACA,YAAI;AACF,eAAK,iBAAiB,MAAM,MAAM;AAAA,QACpC,QAAQ;AAAA,QAGR;AACA;AACA,YAAI,SAAS,QAAQ,QAAQ;AAC3B,eAAK;AAAA,QACP,WAAW,aAAa,GAAG;AACzB,uBAAa;AAAA,QACf;AAAA,MACF,CAAC;AAAA,IACH;AACA,QAAI,UAAU,QAAQ,UAAU,aAAa,EAAG,cAAa;AAAA,EAC/D;AAEA,QAAM,UAAU,MAAY;AAG1B,QAAI,SAAS,QAAQ,QAAQ;AAC3B,0BAAoB,QAAQ,SAAS;AACrC,eAAS,QAAQ;AAAA,IACnB;AACA,QAAI,aAAa,EAAG,cAAa;AAAA,EACnC;AACA,MAAI,KAAK,QAAQ;AACf,QAAI,KAAK,OAAO,SAAS;AACvB,yBAAmB,QAAQ;AAC3B,eAAS,QAAQ;AACjB,mBAAa;AAAA,IACf,OAAO;AACL,WAAK,OAAO,iBAAiB,SAAS,SAAS,EAAE,MAAM,KAAK,CAAC;AAAA,IAC/D;AAAA,EACF;AAEA,MAAI,QAAQ,WAAW,GAAG;AACxB,iBAAa;AAAA,EACf,OAAO;AACL,SAAK;AAAA,EACP;AAEA,QAAM;AACN,MAAI,KAAK,OAAQ,MAAK,OAAO,oBAAoB,SAAS,OAAO;AAEjE,cAAY,KAAK,CAAC,GAAG,MAAM,EAAE,KAAK,UAAU,EAAE,KAAK,OAAO;AAE1D,MAAI,OAAO;AACX,MAAI,WAAW;AACf,MAAI,YAAY;AAChB,MAAI,WAAW;AACf,aAAW,EAAE,KAAK,KAAK,aAAa;AAClC,eAAW,KAAK,MAAM;AACpB;AACA,YAAM,UAAU,EAAE,UAAU;AAC5B,UAAI,CAAC,WAAW,EAAE,QAAQ,MAAO;AACjC,kBAAY,UAAU,IAAI,EAAE,QAAQ;AACpC,mBAAa,EAAE;AAAA,IACjB;AAAA,EACF;AAEA,QAAM,SAAS,YAAY,aAAa,KAAK,MAAM,WAAW;AAE9D,SAAO;AAAA,IACL,OAAO;AAAA,IACP;AAAA,IACA,SAAS;AAAA,MACP,YAAY,QAAQ;AAAA,MACpB;AAAA,MACA,cAAc,mBAAmB,cAAc;AA
AA,MAC/C,iBAAiB,aAAa,IAAI,IAAI,OAAO;AAAA,MAC7C,kBAAkB,aAAa,IAAI,IAAI,WAAW;AAAA,MAClD,cAAc;AAAA,MACd,YAAY,KAAK,IAAI,IAAI;AAAA,IAC3B;AAAA,IACA,UAAU,aAAa;AAAA,EACzB;AACF;","names":["i"]}
|
package/dist/openapi.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"openapi": "3.1.0",
|
|
3
3
|
"info": {
|
|
4
4
|
"title": "@tangle-network/agent-eval — wire protocol",
|
|
5
|
-
"version": "0.35.0",
|
|
5
|
+
"version": "0.37.0",
|
|
6
6
|
"description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
|
|
7
7
|
"contact": {
|
|
8
8
|
"name": "Tangle Network",
|
package/dist/traces.d.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { N as NotFoundError, R as ReplayError } from './errors-mje_cKOs.js';
|
|
2
2
|
import { R as RawProviderSink, f as RawProviderEvent } from './integrity-DYR5gWlb.js';
|
|
3
3
|
export { F as FileSystemRawProviderSink, c as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, d as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, e as RawProviderDirection, g as RawProviderSinkFilter, h as RunIntegrityError, a as RunIntegrityExpectations, i as RunIntegrityIssue, j as RunIntegrityIssueCode, b as RunIntegrityReport, k as assertRunCaptured, l as defaultProviderRedactor, p as providerFromBaseUrl, t as throwIfRunIncomplete } from './integrity-DYR5gWlb.js';
|
|
4
|
-
import {
|
|
4
|
+
import { R as RunCompleteHook, a as RunCompleteHookContext } from './emitter-DP_cSSiw.js';
|
|
5
5
|
export { S as SpanHandle, T as TraceEmitter, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-DP_cSSiw.js';
|
|
6
6
|
import { T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
7
7
|
export { A as Artifact, B as BudgetLedgerEntry, g as BudgetSpec, i as EventFilter, E as EventKind, j as FAILURE_CLASSES, F as FailureClass, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, J as JudgeSpan, L as LlmSpan, M as Message, e as RetrievalSpan, R as Run, h as RunFilter, m as RunLayer, c as RunOutcome, n as RunStatus, f as SandboxSpan, S as Span, o as SpanBase, p as SpanFilter, d as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, a as ToolSpan, b as TraceEvent, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-Db2Bv8Cf.js';
|
|
@@ -65,6 +65,85 @@ interface OtlpExport {
|
|
|
65
65
|
/** Export a single run's spans + events in OTLP/JSON. */
|
|
66
66
|
declare function exportRunAsOtlp(store: TraceStore, runId: string, resourceAttrs?: Record<string, string | number | boolean>): Promise<OtlpExport>;
|
|
67
67
|
|
|
68
|
+
/**
|
|
69
|
+
* OTEL span exporter — streams spans to an OTLP/HTTP collector.
|
|
70
|
+
*
|
|
71
|
+
* Reads OTEL_EXPORTER_OTLP_ENDPOINT + OTEL_EXPORTER_OTLP_HEADERS from env
|
|
72
|
+
* when no explicit config is given. Batches spans and flushes periodically
|
|
73
|
+
* or when the batch fills. No @opentelemetry SDK dependency — minimal
|
|
74
|
+
* OTLP/JSON serializer (~120 LOC) using the existing otel.ts helpers.
|
|
75
|
+
*/
|
|
76
|
+
interface OtelExportConfig {
|
|
77
|
+
/** OTLP endpoint. Reads OTEL_EXPORTER_OTLP_ENDPOINT env by default. */
|
|
78
|
+
endpoint?: string;
|
|
79
|
+
/** OTLP headers. Reads OTEL_EXPORTER_OTLP_HEADERS env by default. */
|
|
80
|
+
headers?: Record<string, string>;
|
|
81
|
+
/** Batch size before flush. Default 64. */
|
|
82
|
+
batchSize?: number;
|
|
83
|
+
/** Flush interval ms. Default 5000. */
|
|
84
|
+
flushIntervalMs?: number;
|
|
85
|
+
/** Resource attributes stamped on every export. */
|
|
86
|
+
resourceAttributes?: Record<string, string | number | boolean>;
|
|
87
|
+
/** Service name. Default 'agent-eval'. */
|
|
88
|
+
serviceName?: string;
|
|
89
|
+
}
|
|
90
|
+
interface OtelExporter {
|
|
91
|
+
/** Called by the TraceEmitter on every span close. */
|
|
92
|
+
exportSpan(span: ExportableSpan): void;
|
|
93
|
+
/** Force flush pending spans. */
|
|
94
|
+
flush(): Promise<void>;
|
|
95
|
+
/** Shutdown cleanly — flushes remaining spans and stops the timer. */
|
|
96
|
+
shutdown(): Promise<void>;
|
|
97
|
+
}
|
|
98
|
+
interface ExportableSpan {
|
|
99
|
+
traceId: string;
|
|
100
|
+
spanId: string;
|
|
101
|
+
parentSpanId?: string;
|
|
102
|
+
name: string;
|
|
103
|
+
kind: string;
|
|
104
|
+
startedAt: number;
|
|
105
|
+
endedAt?: number;
|
|
106
|
+
status?: string;
|
|
107
|
+
error?: string;
|
|
108
|
+
model?: string;
|
|
109
|
+
inputTokens?: number;
|
|
110
|
+
outputTokens?: number;
|
|
111
|
+
costUsd?: number;
|
|
112
|
+
attributes?: Record<string, unknown>;
|
|
113
|
+
}
|
|
114
|
+
/**
|
|
115
|
+
* Create an OTEL exporter. Returns undefined when no endpoint is configured
|
|
116
|
+
* (neither via config nor env) — callers should check before attaching.
|
|
117
|
+
*/
|
|
118
|
+
declare function createOtelExporter(config?: OtelExportConfig): OtelExporter | undefined;
|
|
119
|
+
|
|
120
|
+
/**
|
|
121
|
+
* OTEL bridge — connects TraceEmitter span lifecycle to the OtelExporter.
|
|
122
|
+
*
|
|
123
|
+
* When an OtelExporter is active, every span that closes through the
|
|
124
|
+
* TraceEmitter is also pushed to the exporter for real-time streaming to
|
|
125
|
+
* the user's OTEL collector.
|
|
126
|
+
*
|
|
127
|
+
* The bridge is opt-in: attach via `otelRunCompleteHook(exporter)` as a
|
|
128
|
+
* RunCompleteHook, or wrap the store with `createOtelTracingStore` for
|
|
129
|
+
* real-time per-span export.
|
|
130
|
+
*/
|
|
131
|
+
|
|
132
|
+
/**
|
|
133
|
+
* Create a RunCompleteHook that exports all spans from the completed run
|
|
134
|
+
* to the OTEL exporter, then flushes.
|
|
135
|
+
*/
|
|
136
|
+
declare function otelRunCompleteHook(exporter: OtelExporter): RunCompleteHook;
|
|
137
|
+
/**
|
|
138
|
+
* Create an auto-exporting TraceStore wrapper that intercepts updateSpan
|
|
139
|
+
* calls. When a span gets an endedAt, it's exported immediately. This
|
|
140
|
+
* gives real-time streaming instead of batch-at-end.
|
|
141
|
+
*
|
|
142
|
+
* This is the preferred integration path: wrap the store before
|
|
143
|
+
* constructing the TraceEmitter.
|
|
144
|
+
*/
|
|
145
|
+
declare function createOtelTracingStore(inner: TraceStore, exporter: OtelExporter, traceId: string): TraceStore;
|
|
146
|
+
|
|
68
147
|
/**
|
|
69
148
|
* Redaction — remove PII / secrets from trace payloads before persist.
|
|
70
149
|
*
|
|
@@ -807,4 +886,4 @@ declare function iterateRawCalls(sink: RawProviderSink, filter?: {
|
|
|
807
886
|
spanId?: string;
|
|
808
887
|
}): AsyncGenerator<ReplayCacheEntry>;
|
|
809
888
|
|
|
810
|
-
export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, type DatasetOverview, OTEL_AGENT_EVAL_SCOPE, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, type QueryTracesPage, REDACTION_VERSION, RawProviderEvent, RawProviderSink, type RedactionReport, type RedactionRule, ReplayCache, type ReplayCacheEntry, ReplayCacheMissError, type ReplayCacheStats, type ReplayFetchOptions, RunCompleteHook, RunCompleteHookContext, type SearchSpanResult, type SearchTraceResult, type SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystHookOptions, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, TraceStore, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, inferDomainKeywords, iterateRawCalls, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete };
|
|
889
|
+
export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, type DatasetOverview, type ExportableSpan, OTEL_AGENT_EVAL_SCOPE, type OtelExportConfig, type OtelExporter, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, type QueryTracesPage, REDACTION_VERSION, RawProviderEvent, RawProviderSink, type RedactionReport, type RedactionRule, ReplayCache, type ReplayCacheEntry, ReplayCacheMissError, type ReplayCacheStats, type ReplayFetchOptions, RunCompleteHook, RunCompleteHookContext, type SearchSpanResult, type SearchTraceResult, type SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystHookOptions, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, TraceStore, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, inferDomainKeywords, iterateRawCalls, otelRunCompleteHook, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete };
|
package/dist/traces.js
CHANGED
|
@@ -19,6 +19,8 @@ import {
|
|
|
19
19
|
buildTraceAnalystTools,
|
|
20
20
|
buildTraceInsightContext,
|
|
21
21
|
buildTraceInsightPrompt,
|
|
22
|
+
createOtelExporter,
|
|
23
|
+
createOtelTracingStore,
|
|
22
24
|
createReplayFetch,
|
|
23
25
|
defaultTraceInsightPanel,
|
|
24
26
|
describeTraceInsightScope,
|
|
@@ -26,6 +28,7 @@ import {
|
|
|
26
28
|
exportRunAsOtlp,
|
|
27
29
|
inferDomainKeywords,
|
|
28
30
|
iterateRawCalls,
|
|
31
|
+
otelRunCompleteHook,
|
|
29
32
|
planTraceInsightQuestions,
|
|
30
33
|
redactString,
|
|
31
34
|
redactValue,
|
|
@@ -33,7 +36,7 @@ import {
|
|
|
33
36
|
tokenizeDomainWords,
|
|
34
37
|
traceAnalystFunctionGroup,
|
|
35
38
|
traceAnalystOnRunComplete
|
|
36
|
-
} from "./chunk-HIO4UIS5.js";
|
|
39
|
+
} from "./chunk-L5UNCDAJ.js";
|
|
37
40
|
import {
|
|
38
41
|
aggregateLlm,
|
|
39
42
|
argHash,
|
|
@@ -103,6 +106,8 @@ export {
|
|
|
103
106
|
buildTraceAnalystTools,
|
|
104
107
|
buildTraceInsightContext,
|
|
105
108
|
buildTraceInsightPrompt,
|
|
109
|
+
createOtelExporter,
|
|
110
|
+
createOtelTracingStore,
|
|
106
111
|
createReplayFetch,
|
|
107
112
|
defaultProviderRedactor,
|
|
108
113
|
defaultTraceInsightPanel,
|
|
@@ -120,6 +125,7 @@ export {
|
|
|
120
125
|
judgeSpans,
|
|
121
126
|
llmSpanFromProvider,
|
|
122
127
|
llmSpans,
|
|
128
|
+
otelRunCompleteHook,
|
|
123
129
|
planTraceInsightQuestions,
|
|
124
130
|
providerFromBaseUrl,
|
|
125
131
|
redactString,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tangle-network/agent-eval",
|
|
3
|
-
"version": "0.35.0",
|
|
3
|
+
"version": "0.37.0",
|
|
4
4
|
"description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
|
|
5
5
|
"homepage": "https://github.com/tangle-network/agent-eval#readme",
|
|
6
6
|
"repository": {
|
|
@@ -94,6 +94,11 @@
|
|
|
94
94
|
"import": "./dist/knowledge/index.js",
|
|
95
95
|
"default": "./dist/knowledge/index.js"
|
|
96
96
|
},
|
|
97
|
+
"./matrix": {
|
|
98
|
+
"types": "./dist/matrix/index.d.ts",
|
|
99
|
+
"import": "./dist/matrix/index.js",
|
|
100
|
+
"default": "./dist/matrix/index.js"
|
|
101
|
+
},
|
|
97
102
|
"./openapi.json": {
|
|
98
103
|
"default": "./dist/openapi.json"
|
|
99
104
|
}
|
|
@@ -128,8 +133,18 @@
|
|
|
128
133
|
"hono": "^4.12.16",
|
|
129
134
|
"zod": "^4.3.6"
|
|
130
135
|
},
|
|
136
|
+
"peerDependencies": {
|
|
137
|
+
"@tangle-network/agent-runtime": "^0.21.0",
|
|
138
|
+
"@tangle-network/sandbox": "^0.2.1"
|
|
139
|
+
},
|
|
140
|
+
"peerDependenciesMeta": {
|
|
141
|
+
"@tangle-network/agent-runtime": { "optional": true },
|
|
142
|
+
"@tangle-network/sandbox": { "optional": true }
|
|
143
|
+
},
|
|
131
144
|
"devDependencies": {
|
|
132
145
|
"@biomejs/biome": "^2.4.15",
|
|
146
|
+
"@tangle-network/agent-runtime": "^0.21.0",
|
|
147
|
+
"@tangle-network/sandbox": "^0.2.1",
|
|
133
148
|
"@types/node": "^25.6.0",
|
|
134
149
|
"openapi3-ts": "^4.5.0",
|
|
135
150
|
"tsup": "^8.0.0",
|
|
@@ -138,7 +153,7 @@
|
|
|
138
153
|
},
|
|
139
154
|
"pnpm": {
|
|
140
155
|
"minimumReleaseAge": 4320,
|
|
141
|
-
"minimumReleaseAgeExclude": [],
|
|
156
|
+
"minimumReleaseAgeExclude": ["@tangle-network/sandbox", "@tangle-network/agent-runtime"],
|
|
142
157
|
"overrides": {
|
|
143
158
|
"postcss@<8.5.10": "^8.5.10",
|
|
144
159
|
"ws@>=8.0.0 <8.20.1": "^8.20.1"
|