@tangle-network/agent-eval 0.37.0 → 0.38.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-QWV226SL.js +276 -0
- package/dist/chunk-QWV226SL.js.map +1 -0
- package/dist/matrix/index.d.ts +2 -109
- package/dist/matrix/index.js +5 -270
- package/dist/matrix/index.js.map +1 -1
- package/dist/multishot/index.d.ts +276 -0
- package/dist/multishot/index.js +467 -0
- package/dist/multishot/index.js.map +1 -0
- package/dist/openapi.json +1 -1
- package/dist/types-DHqkLwEU.d.ts +110 -0
- package/package.json +6 -1
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
import { DefaultVerdict } from '@tangle-network/agent-runtime/loops';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* @experimental
|
|
5
|
+
*
|
|
6
|
+
* N-axis cartesian matrix over substrate types — types module.
|
|
7
|
+
*
|
|
8
|
+
* The matrix is a runner + aggregator. It iterates the cartesian product of
|
|
9
|
+
* caller-provided axes (any value type — `AgentProfile` from sandbox, `Driver`
|
|
10
|
+
* / `Validator` from agent-runtime, rubric records, thinking levels, anything)
|
|
11
|
+
* and aggregates per-axis pass/score/cost summaries. Substrate types are
|
|
12
|
+
* imported at the boundary by JSDoc only; the matrix never wraps them.
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
/** One axis = one dimension to iterate. `V` is the value type — pass any
|
|
16
|
+
* substrate type (AgentProfile, Driver, Validator, rubric record). */
|
|
17
|
+
interface MatrixAxis<V> {
|
|
18
|
+
/** Axis name. Becomes the key in `MatrixResult.byAxis`. */
|
|
19
|
+
name: string;
|
|
20
|
+
/** Values to iterate along this axis. Each entry's `id` is a stable id for that value and is used as the bucket key in aggregation. */
|
|
21
|
+
values: Array<{
|
|
22
|
+
id: string;
|
|
23
|
+
value: V;
|
|
24
|
+
}>;
|
|
25
|
+
/** Optional bucket label override. Receives the same `(value, id)` the
|
|
26
|
+
* runner stored on the cell; default label is `id`. */
|
|
27
|
+
label?: (value: V, id: string) => string;
|
|
28
|
+
}
|
|
29
|
+
/** A cell carries one picked value from each axis, keyed by axis name. */
|
|
30
|
+
interface MatrixCell {
|
|
31
|
+
axes: Record<string, {
|
|
32
|
+
id: string;
|
|
33
|
+
value: unknown;
|
|
34
|
+
}>;
|
|
35
|
+
/** 0-based replicate index within the same axis combination. */
|
|
36
|
+
rep: number;
|
|
37
|
+
/** Stable sort key — preserves cartesian order across concurrent execution. */
|
|
38
|
+
ordinal: number;
|
|
39
|
+
}
|
|
40
|
+
interface CellResult<Output> {
|
|
41
|
+
output: Output;
|
|
42
|
+
verdict: DefaultVerdict;
|
|
43
|
+
costUsd: number;
|
|
44
|
+
durationMs: number;
|
|
45
|
+
runId?: string;
|
|
46
|
+
/** Populated when `runCell` threw. The cell contributes 0 to passRate AND
|
|
47
|
+
* meanScore regardless of `verdict`. */
|
|
48
|
+
error?: {
|
|
49
|
+
message: string;
|
|
50
|
+
kind: string;
|
|
51
|
+
};
|
|
52
|
+
}
|
|
53
|
+
interface AxisSummary {
|
|
54
|
+
axisName: string;
|
|
55
|
+
axisValue: string;
|
|
56
|
+
cells: number;
|
|
57
|
+
passRate: number;
|
|
58
|
+
meanScore: number;
|
|
59
|
+
p50Score: number;
|
|
60
|
+
p90Score: number;
|
|
61
|
+
totalCostUsd: number;
|
|
62
|
+
meanDurationMs: number;
|
|
63
|
+
}
|
|
64
|
+
interface MatrixResult<Output> {
|
|
65
|
+
cells: Array<{
|
|
66
|
+
cell: MatrixCell;
|
|
67
|
+
runs: CellResult<Output>[];
|
|
68
|
+
}>;
|
|
69
|
+
/** `byAxis[axisName][axisValueId] = summary`. Populated only for axes
|
|
70
|
+
* named in `aggregateBy` (default = every axis in `axes`). */
|
|
71
|
+
byAxis: Record<string, Record<string, AxisSummary>>;
|
|
72
|
+
summary: {
|
|
73
|
+
totalCells: number;
|
|
74
|
+
runsExecuted: number;
|
|
75
|
+
/** Cells removed by `filter` plus cells unscheduled after the cost
|
|
76
|
+
* ceiling or abort signal tripped. */
|
|
77
|
+
cellsSkipped: number;
|
|
78
|
+
overallPassRate: number;
|
|
79
|
+
overallMeanScore: number;
|
|
80
|
+
totalCostUsd: number;
|
|
81
|
+
durationMs: number;
|
|
82
|
+
};
|
|
83
|
+
/** Stable id-like string generated at the end of the run. */
|
|
84
|
+
matrixId: string;
|
|
85
|
+
}
|
|
86
|
+
interface RunAgentMatrixOptions<Output> {
|
|
87
|
+
axes: MatrixAxis<unknown>[];
|
|
88
|
+
/** User-supplied cell executor. May throw; the matrix captures throws as
|
|
89
|
+
* `CellResult.error` and continues. */
|
|
90
|
+
runCell: (cell: MatrixCell) => Promise<CellResult<Output>>;
|
|
91
|
+
/** Replicates per cell. Default 1. */
|
|
92
|
+
reps?: number;
|
|
93
|
+
/** Prune cells from the cartesian product BEFORE rep expansion. */
|
|
94
|
+
filter?: (cell: Omit<MatrixCell, 'rep' | 'ordinal'>) => boolean;
|
|
95
|
+
/** Axes to aggregate into `byAxis`. Default: every axis in `axes`. */
|
|
96
|
+
aggregateBy?: string[];
|
|
97
|
+
/** Max concurrent in-flight `runCell` invocations. Default 4. */
|
|
98
|
+
maxConcurrency?: number;
|
|
99
|
+
/** Cumulative-cost abort threshold (USD). When the running sum of
|
|
100
|
+
* `result.costUsd` crosses this value, no new cells are scheduled.
|
|
101
|
+
* In-flight cells finish. Default `Infinity`. */
|
|
102
|
+
costCeiling?: number;
|
|
103
|
+
/** Fires once per executed cell, after its promise settles. */
|
|
104
|
+
onCellComplete?: (cell: MatrixCell, result: CellResult<Output>) => void;
|
|
105
|
+
/** External cancellation. Aborts in-flight cells via a forwarded signal
|
|
106
|
+
* and suppresses scheduling of new ones. */
|
|
107
|
+
signal?: AbortSignal;
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
export type { AxisSummary as A, CellResult as C, MatrixResult as M, RunAgentMatrixOptions as R, MatrixAxis as a, MatrixCell as b };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tangle-network/agent-eval",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.38.0",
|
|
4
4
|
"description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
|
|
5
5
|
"homepage": "https://github.com/tangle-network/agent-eval#readme",
|
|
6
6
|
"repository": {
|
|
@@ -99,6 +99,11 @@
|
|
|
99
99
|
"import": "./dist/matrix/index.js",
|
|
100
100
|
"default": "./dist/matrix/index.js"
|
|
101
101
|
},
|
|
102
|
+
"./multishot": {
|
|
103
|
+
"types": "./dist/multishot/index.d.ts",
|
|
104
|
+
"import": "./dist/multishot/index.js",
|
|
105
|
+
"default": "./dist/multishot/index.js"
|
|
106
|
+
},
|
|
102
107
|
"./openapi.json": {
|
|
103
108
|
"default": "./dist/openapi.json"
|
|
104
109
|
}
|