@tangle-network/agent-eval 0.37.0 → 0.40.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/campaign/index.d.ts +695 -0
- package/dist/campaign/index.js +741 -0
- package/dist/campaign/index.js.map +1 -0
- package/dist/chunk-5U2DOJU4.js +565 -0
- package/dist/chunk-5U2DOJU4.js.map +1 -0
- package/dist/{chunk-KE7TDJUO.js → chunk-AU2JLNSZ.js} +2 -2
- package/dist/{chunk-TSPOEDM3.js → chunk-BWZEGTES.js} +2 -5
- package/dist/chunk-BWZEGTES.js.map +1 -0
- package/dist/{chunk-3HYQXPC2.js → chunk-DMW5VENN.js} +3 -3
- package/dist/{chunk-TQL7BAOY.js → chunk-EGIPWXHL.js} +2 -2
- package/dist/chunk-GGE4NNQT.js +65 -0
- package/dist/chunk-GGE4NNQT.js.map +1 -0
- package/dist/{chunk-7PR3WPWE.js → chunk-L7XMNXLO.js} +2 -2
- package/dist/{chunk-RL6TERL2.js → chunk-LCIDRYGP.js} +3 -3
- package/dist/{chunk-L5UNCDAJ.js → chunk-MAOZCN36.js} +2 -64
- package/dist/chunk-MAOZCN36.js.map +1 -0
- package/dist/{chunk-LGAPK7NA.js → chunk-NKLGKF2Q.js} +2 -2
- package/dist/chunk-QWV226SL.js +276 -0
- package/dist/chunk-QWV226SL.js.map +1 -0
- package/dist/chunk-TMXPFWC7.js +305 -0
- package/dist/chunk-TMXPFWC7.js.map +1 -0
- package/dist/{chunk-KHZRNY3F.js → chunk-WP7SY7AI.js} +5 -4
- package/dist/chunk-WP7SY7AI.js.map +1 -0
- package/dist/chunk-YV7J7X5N.js +313 -0
- package/dist/chunk-YV7J7X5N.js.map +1 -0
- package/dist/{control-DVrmvM_k.d.ts → control-CmLJk3IG.d.ts} +1 -1
- package/dist/control.d.ts +3 -3
- package/dist/control.js +2 -2
- package/dist/{dataset-ueRVTUoY.d.ts → dataset-BlwAtYYf.d.ts} +1 -1
- package/dist/{feedback-trajectory-iATEAHmc.d.ts → feedback-trajectory-Dvy-bt7x.d.ts} +1 -1
- package/dist/governance/index.d.ts +133 -5
- package/dist/index.d.ts +35 -34
- package/dist/index.js +97 -630
- package/dist/index.js.map +1 -1
- package/dist/matrix/index.d.ts +2 -109
- package/dist/matrix/index.js +5 -270
- package/dist/matrix/index.js.map +1 -1
- package/dist/multishot/index.d.ts +276 -0
- package/dist/multishot/index.js +516 -0
- package/dist/multishot/index.js.map +1 -0
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +2 -2
- package/dist/optimization.js +5 -5
- package/dist/pipelines/index.js +2 -2
- package/dist/red-team-30II1T4o.d.ts +63 -0
- package/dist/{release-report-D2ykiLSe.d.ts → release-report-Di84bXD7.d.ts} +5 -2
- package/dist/reporting.d.ts +2 -2
- package/dist/reporting.js +3 -3
- package/dist/rl.js +15 -315
- package/dist/rl.js.map +1 -1
- package/dist/run-campaign-JYJXYHHL.js +10 -0
- package/dist/run-campaign-JYJXYHHL.js.map +1 -0
- package/dist/traces.js +7 -5
- package/dist/types-DHqkLwEU.d.ts +110 -0
- package/dist/wire/index.d.ts +2 -2
- package/docs/design/loop-taxonomy.md +233 -0
- package/package.json +38 -24
- package/dist/chunk-KHZRNY3F.js.map +0 -1
- package/dist/chunk-L5UNCDAJ.js.map +0 -1
- package/dist/chunk-TSPOEDM3.js.map +0 -1
- package/dist/index-CN2agEaO.d.ts +0 -191
- /package/dist/{chunk-KE7TDJUO.js.map → chunk-AU2JLNSZ.js.map} +0 -0
- /package/dist/{chunk-3HYQXPC2.js.map → chunk-DMW5VENN.js.map} +0 -0
- /package/dist/{chunk-TQL7BAOY.js.map → chunk-EGIPWXHL.js.map} +0 -0
- /package/dist/{chunk-7PR3WPWE.js.map → chunk-L7XMNXLO.js.map} +0 -0
- /package/dist/{chunk-RL6TERL2.js.map → chunk-LCIDRYGP.js.map} +0 -0
- /package/dist/{chunk-LGAPK7NA.js.map → chunk-NKLGKF2Q.js.map} +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}
|
package/dist/traces.js
CHANGED
|
@@ -1,11 +1,9 @@
|
|
|
1
1
|
import {
|
|
2
|
-
DEFAULT_REDACTION_RULES,
|
|
3
2
|
DEFAULT_TRACE_ANALYST_BUDGETS,
|
|
4
3
|
FileSystemTraceStore,
|
|
5
4
|
InMemoryTraceStore,
|
|
6
5
|
OTEL_AGENT_EVAL_SCOPE,
|
|
7
6
|
OtlpFileTraceStore,
|
|
8
|
-
REDACTION_VERSION,
|
|
9
7
|
ReplayCache,
|
|
10
8
|
ReplayCacheMissError,
|
|
11
9
|
SpanNotFoundError,
|
|
@@ -30,13 +28,17 @@ import {
|
|
|
30
28
|
iterateRawCalls,
|
|
31
29
|
otelRunCompleteHook,
|
|
32
30
|
planTraceInsightQuestions,
|
|
33
|
-
redactString,
|
|
34
|
-
redactValue,
|
|
35
31
|
scoreTraceInsightReadiness,
|
|
36
32
|
tokenizeDomainWords,
|
|
37
33
|
traceAnalystFunctionGroup,
|
|
38
34
|
traceAnalystOnRunComplete
|
|
39
|
-
} from "./chunk-L5UNCDAJ.js";
|
|
35
|
+
} from "./chunk-MAOZCN36.js";
|
|
36
|
+
import {
|
|
37
|
+
DEFAULT_REDACTION_RULES,
|
|
38
|
+
REDACTION_VERSION,
|
|
39
|
+
redactString,
|
|
40
|
+
redactValue
|
|
41
|
+
} from "./chunk-GGE4NNQT.js";
|
|
40
42
|
import {
|
|
41
43
|
aggregateLlm,
|
|
42
44
|
argHash,
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
import { DefaultVerdict } from '@tangle-network/agent-runtime/loops';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* @experimental
|
|
5
|
+
*
|
|
6
|
+
* N-axis cartesian matrix over substrate types — types module.
|
|
7
|
+
*
|
|
8
|
+
* The matrix is a runner + aggregator. It iterates the cartesian product of
|
|
9
|
+
* caller-provided axes (any value type — `AgentProfile` from sandbox, `Driver`
|
|
10
|
+
* / `Validator` from agent-runtime, rubric records, thinking levels, anything)
|
|
11
|
+
* and aggregates per-axis pass/score/cost summaries. Substrate types are
|
|
12
|
+
* imported at the boundary by JSDoc only; the matrix never wraps them.
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
/** One axis = one dimension to iterate. `V` is the value type — pass any
|
|
16
|
+
* substrate type (AgentProfile, Driver, Validator, rubric record). */
|
|
17
|
+
interface MatrixAxis<V> {
|
|
18
|
+
/** Axis name. Becomes the key in `MatrixResult.byAxis`. */
|
|
19
|
+
name: string;
|
|
20
|
+
/** Stable id per value. Used as the bucket key in aggregation. */
|
|
21
|
+
values: Array<{
|
|
22
|
+
id: string;
|
|
23
|
+
value: V;
|
|
24
|
+
}>;
|
|
25
|
+
/** Optional bucket label override. Receives the same `(value, id)` the
|
|
26
|
+
* runner stored on the cell; default label is `id`. */
|
|
27
|
+
label?: (value: V, id: string) => string;
|
|
28
|
+
}
|
|
29
|
+
/** A cell carries one picked value from each axis, keyed by axis name. */
|
|
30
|
+
interface MatrixCell {
|
|
31
|
+
axes: Record<string, {
|
|
32
|
+
id: string;
|
|
33
|
+
value: unknown;
|
|
34
|
+
}>;
|
|
35
|
+
/** 0-based replicate index within the same axis combination. */
|
|
36
|
+
rep: number;
|
|
37
|
+
/** Stable sort key — preserves cartesian order across concurrent execution. */
|
|
38
|
+
ordinal: number;
|
|
39
|
+
}
|
|
40
|
+
interface CellResult<Output> {
|
|
41
|
+
output: Output;
|
|
42
|
+
verdict: DefaultVerdict;
|
|
43
|
+
costUsd: number;
|
|
44
|
+
durationMs: number;
|
|
45
|
+
runId?: string;
|
|
46
|
+
/** Populated when `runCell` threw. The cell contributes 0 to passRate AND
|
|
47
|
+
* meanScore regardless of `verdict`. */
|
|
48
|
+
error?: {
|
|
49
|
+
message: string;
|
|
50
|
+
kind: string;
|
|
51
|
+
};
|
|
52
|
+
}
|
|
53
|
+
interface AxisSummary {
|
|
54
|
+
axisName: string;
|
|
55
|
+
axisValue: string;
|
|
56
|
+
cells: number;
|
|
57
|
+
passRate: number;
|
|
58
|
+
meanScore: number;
|
|
59
|
+
p50Score: number;
|
|
60
|
+
p90Score: number;
|
|
61
|
+
totalCostUsd: number;
|
|
62
|
+
meanDurationMs: number;
|
|
63
|
+
}
|
|
64
|
+
interface MatrixResult<Output> {
|
|
65
|
+
cells: Array<{
|
|
66
|
+
cell: MatrixCell;
|
|
67
|
+
runs: CellResult<Output>[];
|
|
68
|
+
}>;
|
|
69
|
+
/** `byAxis[axisName][axisValueId] = summary`. Populated only for axes
|
|
70
|
+
* named in `aggregateBy` (default = every axis in `axes`). */
|
|
71
|
+
byAxis: Record<string, Record<string, AxisSummary>>;
|
|
72
|
+
summary: {
|
|
73
|
+
totalCells: number;
|
|
74
|
+
runsExecuted: number;
|
|
75
|
+
/** Cells removed by `filter` plus cells unscheduled after the cost
|
|
76
|
+
* ceiling or abort signal tripped. */
|
|
77
|
+
cellsSkipped: number;
|
|
78
|
+
overallPassRate: number;
|
|
79
|
+
overallMeanScore: number;
|
|
80
|
+
totalCostUsd: number;
|
|
81
|
+
durationMs: number;
|
|
82
|
+
};
|
|
83
|
+
/** Stable id-like string generated at the end of the run. */
|
|
84
|
+
matrixId: string;
|
|
85
|
+
}
|
|
86
|
+
interface RunAgentMatrixOptions<Output> {
|
|
87
|
+
axes: MatrixAxis<unknown>[];
|
|
88
|
+
/** User-supplied cell executor. May throw; the matrix captures throws as
|
|
89
|
+
* `CellResult.error` and continues. */
|
|
90
|
+
runCell: (cell: MatrixCell) => Promise<CellResult<Output>>;
|
|
91
|
+
/** Replicates per cell. Default 1. */
|
|
92
|
+
reps?: number;
|
|
93
|
+
/** Prune cells from the cartesian BEFORE rep expansion. */
|
|
94
|
+
filter?: (cell: Omit<MatrixCell, 'rep' | 'ordinal'>) => boolean;
|
|
95
|
+
/** Axes to aggregate into `byAxis`. Default: every axis in `axes`. */
|
|
96
|
+
aggregateBy?: string[];
|
|
97
|
+
/** Max concurrent in-flight `runCell` invocations. Default 4. */
|
|
98
|
+
maxConcurrency?: number;
|
|
99
|
+
/** Cumulative-cost abort threshold (USD). When the running sum of
|
|
100
|
+
* `result.costUsd` crosses this value, no new cells are scheduled.
|
|
101
|
+
* In-flight cells finish. Default `Infinity`. */
|
|
102
|
+
costCeiling?: number;
|
|
103
|
+
/** Fires once per executed cell, after its promise settles. */
|
|
104
|
+
onCellComplete?: (cell: MatrixCell, result: CellResult<Output>) => void;
|
|
105
|
+
/** External cancellation. Aborts in-flight cells via a forwarded signal
|
|
106
|
+
* and suppresses scheduling of new ones. */
|
|
107
|
+
signal?: AbortSignal;
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
export type { AxisSummary as A, CellResult as C, MatrixResult as M, RunAgentMatrixOptions as R, MatrixAxis as a, MatrixCell as b };
|
package/dist/wire/index.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { n as FeedbackTrajectoryStore } from '../feedback-trajectory-iATEAHmc.js';
|
|
1
|
+
import { n as FeedbackTrajectoryStore } from '../feedback-trajectory-Dvy-bt7x.js';
|
|
2
2
|
import { T as TraceStore } from '../store-Db2Bv8Cf.js';
|
|
3
3
|
import { z } from 'zod';
|
|
4
4
|
import { OpenAPIObject } from 'openapi3-ts/oas31';
|
|
@@ -7,7 +7,7 @@ import { ServerType } from '@hono/node-server';
|
|
|
7
7
|
import { Hono } from 'hono';
|
|
8
8
|
import '../control-runtime-BZ_lVLYW.js';
|
|
9
9
|
import '../emitter-DP_cSSiw.js';
|
|
10
|
-
import '../dataset-ueRVTUoY.js';
|
|
10
|
+
import '../dataset-BlwAtYYf.js';
|
|
11
11
|
import '../errors-mje_cKOs.js';
|
|
12
12
|
|
|
13
13
|
declare const RubricDimensionSchema: z.ZodObject<{
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
# Loop taxonomy: driver, worker, measurement, and the improvement loop
|
|
2
|
+
|
|
3
|
+
This is the canonical vocabulary for the Tangle agent stack. It exists because
|
|
4
|
+
the same word ("loop", "shot", "worker") was being used at three different
|
|
5
|
+
layers, and the layers were getting conflated. Every role below has exactly
|
|
6
|
+
one meaning. Use these words and nothing else.
|
|
7
|
+
|
|
8
|
+
Cross-links: [`three-package-architecture.md`](../three-package-architecture.md)
|
|
9
|
+
(who owns what), [`concepts.md`](../concepts.md) (eval mental model),
|
|
10
|
+
[`multi-shot-optimization.md`](../multi-shot-optimization.md) (GEPA),
|
|
11
|
+
[`auto-research-loop-end-to-end.md`](../auto-research-loop-end-to-end.md)
|
|
12
|
+
(analyst / autoresearch).
|
|
13
|
+
|
|
14
|
+
## The three roles
|
|
15
|
+
|
|
16
|
+
| Role | Definition | Lives at |
|
|
17
|
+
|---|---|---|
|
|
18
|
+
| **Driver** | The thing that *decides what happens next*. Plans, then decides whether to continue. | Both layers (see below) |
|
|
19
|
+
| **Worker** | An agent harness instance (Claude Code, Codex, OpenCode, …) running inside a sandbox. Does the actual work; responds in chat. | Inner layer only |
|
|
20
|
+
| **Sandbox** | A multi-harness VM. Hosts **1..N workers**, which can share a workspace. Not an agent — the substrate an agent runs in. | Inner layer only |
|
|
21
|
+
| **Measurement** | Runs the worker over a set of scenarios and judges the outputs into a scorecard with confidence intervals. This is `runCampaign`. | Outer layer |
|
|
22
|
+
|
|
23
|
+
Two facts that trip people up:
|
|
24
|
+
|
|
25
|
+
1. **A sandbox is not a worker.** One sandbox can hold ten workers — a driver
|
|
26
|
+
can coordinate Claude Code (CC) + Codex + OpenCode siblings sharing one workspace, or a
|
|
27
|
+
fleet spread across machines. `runLoop`'s placement encodes exactly this:
|
|
28
|
+
`{ sibling, sandboxId }` = co-located workers; `{ fleet, fleetId,
|
|
29
|
+
machineId, sandboxId }` = workers across machines.
|
|
30
|
+
|
|
31
|
+
2. **"Driver" exists at two layers and means the same *kind* of thing
|
|
32
|
+
(a decider) at each, but the things it decides differ:**
|
|
33
|
+
- **Conversation driver** (inner): decides the next *turn* — a persona/user
|
|
34
|
+
simulating chat, or a planner fanning work to workers.
|
|
35
|
+
- **Improvement driver** (outer): decides the next *surface* — what system
|
|
36
|
+
prompt / tool config / code the workers should run.
|
|
37
|
+
|
|
38
|
+
## The nesting
|
|
39
|
+
|
|
40
|
+
There are two loops. The outer one improves the thing the inner one runs.
|
|
41
|
+
|
|
42
|
+
```
|
|
43
|
+
runImprovementLoop OUTER loop — improve the agent over time
|
|
44
|
+
│
|
|
45
|
+
├─ DRIVER = ImprovementDriver proposes a candidate SURFACE
|
|
46
|
+
│ (evolutionary mutator | (the worker's system prompt / tools / config)
|
|
47
|
+
│ reflective analyst) — NOT a conversation turn
|
|
48
|
+
│
|
|
49
|
+
└─ for each candidate surface:
|
|
50
|
+
│
|
|
51
|
+
runCampaign a MEASUREMENT — scores ONE surface
|
|
52
|
+
│
|
|
53
|
+
└─ for each scenario × rep:
|
|
54
|
+
│
|
|
55
|
+
dispatch(scenario) THE SEAM — topology-opaque, returns an artifact
|
|
56
|
+
│
|
|
57
|
+
└─ runLoop / runMultishot INNER loop — one conversation
|
|
58
|
+
├─ DRIVER = persona / user / planner chats with ↓
|
|
59
|
+
└─ WORKERS = 1..N agent harnesses in 1..M sandboxes
|
|
60
|
+
│
|
|
61
|
+
→ transcript / artifact
|
|
62
|
+
judge(artifact) → score
|
|
63
|
+
→ scorecard + CIs
|
|
64
|
+
gate(winner vs baseline) → PR
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### `dispatch` is the topology-opaque seam
|
|
68
|
+
|
|
69
|
+
`dispatch(scenario) → artifact` is the boundary between the measurement layer
|
|
70
|
+
and the execution layer. The measurement does **not** know or care how the
|
|
71
|
+
artifact was produced. Behind the seam can be:
|
|
72
|
+
|
|
73
|
+
- one LLM call,
|
|
74
|
+
- one worker (CC) in one sandbox,
|
|
75
|
+
- a conversation driver coordinating 10 workers (CC + Codex + OpenCode)
|
|
76
|
+
sharing a workspace in one sandbox,
|
|
77
|
+
- a fleet across machines.
|
|
78
|
+
|
|
79
|
+
All of it is invisible to `runCampaign`. This is why the substrate has no
|
|
80
|
+
opinion about execution topology: the topology lives inside `dispatch`.
|
|
81
|
+
|
|
82
|
+
### Corrected statements (things that were said backwards)
|
|
83
|
+
|
|
84
|
+
- The worker is the agent in the sandbox. The driver talks to it. ✓
|
|
85
|
+
- `runCampaign` is a **measurement**, not a worker. It *runs the worker* (via
|
|
86
|
+
`dispatch`); the worker does not "run the eval".
|
|
87
|
+
- The outer improvement loop has **no single worker** — its driver proposes a
|
|
88
|
+
*surface*, and each surface is scored by a *measurement* that drives the
|
|
89
|
+
inner workers.
|
|
90
|
+
|
|
91
|
+
## The dataset flywheel — why every loop run matters
|
|
92
|
+
|
|
93
|
+
**Every loop run, regardless of why it ran, feeds the same dataset.** This is
|
|
94
|
+
the through-line that ties measurement and improvement together.
|
|
95
|
+
|
|
96
|
+
When `runCampaign` runs with a `labeledStore`, each cell captures
|
|
97
|
+
`(scenario, artifact, judgeScore, source)` into the `LabeledScenarioStore`.
|
|
98
|
+
The `source` discriminates *why* the run happened — but the captured tuple is
|
|
99
|
+
identical in shape:
|
|
100
|
+
|
|
101
|
+
| `captureSource` | The run that produced it |
|
|
102
|
+
|---|---|
|
|
103
|
+
| `'eval-run'` | a plain evaluation campaign |
|
|
104
|
+
| `'production-trace'` | a real user conversation in production |
|
|
105
|
+
| `'red-team'` | an adversarial probe |
|
|
106
|
+
| `'synthetic'` | a generated scenario |
|
|
107
|
+
| `'manual'` | a human-curated example |
|
|
108
|
+
|
|
109
|
+
That captured corpus **is the GEPA training set.** A basic eval run, a
|
|
110
|
+
production conversation, and an autoresearch loop all deposit the same
|
|
111
|
+
`(input, output, reward)` tuples. The optimization driver later samples from
|
|
112
|
+
that corpus to evolve the surface. So:
|
|
113
|
+
|
|
114
|
+
> Running *any* loop — even one whose purpose is not optimization — builds the
|
|
115
|
+
> dataset that optimization needs. The flywheel turns whether or not you are
|
|
116
|
+
> currently optimizing.
|
|
117
|
+
|
|
118
|
+
This is enforced, not aspirational: `runImprovementLoop` **refuses**
|
|
119
|
+
`tracing: 'off'` whenever a driver is wired, precisely because a loop that
|
|
120
|
+
doesn't feed the dataset is a loop that breaks the flywheel.
|
|
121
|
+
|
|
122
|
+
Temporal-split discipline (train vs holdout, `capturedBefore`) and
|
|
123
|
+
default-off-for-training of `production-trace` are enforced at the
|
|
124
|
+
`LabeledScenarioStore.sample()` boundary so the flywheel cannot contaminate
|
|
125
|
+
the holdout it is judged against. See `src/campaign/labeled-store/`.
|
|
126
|
+
|
|
127
|
+
## One improvement loop, pluggable drivers
|
|
128
|
+
|
|
129
|
+
The improvement loop is **driver-agnostic**. `runOptimization` (the loop body)
|
|
130
|
+
and `runImprovementLoop` (the gated-promotion shell) call
|
|
131
|
+
`driver.propose(...)` → measure → `driver.decide(...)`. They do not know which
|
|
132
|
+
strategy is driving. Two strategies conform to the same `ImprovementDriver`
|
|
133
|
+
interface:
|
|
134
|
+
|
|
135
|
+
```ts
|
|
136
|
+
interface ImprovementDriver<TFindings = unknown> {
|
|
137
|
+
kind: string
|
|
138
|
+
propose(args: {
|
|
139
|
+
currentSurface: MutableSurface
|
|
140
|
+
history: GenerationRecord[] // what's been tried + scored
|
|
141
|
+
findings: TFindings[] // external signal (e.g. analyst output)
|
|
142
|
+
populationSize: number
|
|
143
|
+
generation: number
|
|
144
|
+
signal: AbortSignal
|
|
145
|
+
}): Promise<MutableSurface[]>
|
|
146
|
+
decide?(args: { history: GenerationRecord[] }): { stop: boolean; reason?: string }
|
|
147
|
+
}
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
| Driver | Strategy | How it proposes | Where it lives |
|
|
151
|
+
|---|---|---|---|
|
|
152
|
+
| `evolutionaryDriver` | Evolutionary (GEPA / AxGEPA) | Mutates the current best surface into N candidates, blind to history beyond the current best. Optimizes against the dataset's rewards. | **agent-eval** (pure: dataset → surface, no sandbox) |
|
|
153
|
+
| `analystDriver` *(planned)* | Reflective | Reads trace findings + generation history, reasons about *why* candidates failed, proposes targeted edits. | **agent-runtime** (runs sandboxes to do research) — implements agent-eval's `ImprovementDriver` |
|
|
154
|
+
|
|
155
|
+
This resolves the prior duplication where `runImprovementLoop` (evolutionary,
|
|
156
|
+
agent-eval) and `runAnalystLoop` (reflective, agent-runtime) were two parallel
|
|
157
|
+
loops doing "propose change → measure → gate → PR". There is **one loop**;
|
|
158
|
+
the analyst becomes a driver of it. The dependency direction permits this
|
|
159
|
+
cleanly: agent-eval is the leaf and owns the `ImprovementDriver` contract;
|
|
160
|
+
agent-runtime imports agent-eval and implements the contract.
|
|
161
|
+
|
|
162
|
+
## What "the surface" is — improvement tiers
|
|
163
|
+
|
|
164
|
+
`MutableSurface` is the thing a driver changes. It has tiers, least → most
|
|
165
|
+
invasive. The `string` form of `MutableSurface` models tiers 1–2; how tiers 3–4
|
|
166
|
+
are handled is covered under "Resolved design decisions" below.
|
|
167
|
+
|
|
168
|
+
| Tier | Surface | Driver that changes it | Blast radius |
|
|
169
|
+
|---|---|---|---|
|
|
170
|
+
| 1 | System prompt / prompt-signature addendum | `evolutionaryDriver` (GEPA), `analystDriver` | prompt only |
|
|
171
|
+
| 2 | Tool config / tool signatures | `analystDriver` | which tools, their schemas |
|
|
172
|
+
| 3 | Knowledge (wiki / knowledge graph) | agent-knowledge's knowledge adapter | what the agent *knows* |
|
|
173
|
+
| 4 | Code / scaffolding | autoresearch (reads codebase + traces) → worktree / PR | the implementation itself |
|
|
174
|
+
|
|
175
|
+
The key distinction Drew drew:
|
|
176
|
+
|
|
177
|
+
- **Analyst** updates the *signatures* — the prompt and tool surface (tiers
|
|
178
|
+
1–2). Cheap, reversible, measured directly against the dataset.
|
|
179
|
+
- **Autoresearch** updates the *code* (tier 4). It reads the repository plus
|
|
180
|
+
the trace findings, opens a worktree, and proposes implementation changes —
|
|
181
|
+
measured by re-running the inner loop against the changed code.
|
|
182
|
+
|
|
183
|
+
Both are `ImprovementDriver`s in the abstract (propose a change → measure →
|
|
184
|
+
gate → PR). They differ only in *what* they edit and *how invasive* it is. And
|
|
185
|
+
both consume the **same dataset** the flywheel builds.
|
|
186
|
+
|
|
187
|
+
## Resolved design decisions
|
|
188
|
+
|
|
189
|
+
1. **`MutableSurface` widens to span all tiers.** `MutableSurface = string |
|
|
190
|
+
CodeSurface`. The `string` form is tiers 1–2 (prompt / serialized tool
|
|
191
|
+
config); `CodeSurface = { kind: 'code'; worktreeRef; baseRef?; summary? }`
|
|
192
|
+
is tier 4 (an implementation change behind a worktree ref). One loop spans
|
|
193
|
+
prompt *and* code improvement. `surfaceHash` hashes a string by content and
|
|
194
|
+
a code surface by its `(worktreeRef, baseRef)` identity (the content lives
|
|
195
|
+
in git). **Shipped in agent-eval 0.40.1.** The consumer's
|
|
196
|
+
`dispatchWithSurface` is responsible for checking out a code surface's
|
|
197
|
+
worktree before running the worker.
|
|
198
|
+
|
|
199
|
+
2. **`runAnalystLoop` (agent-runtime): analyst becomes a driver; knowledge
|
|
200
|
+
stays separate.** Extract an `analystDriver` (implements agent-eval's
|
|
201
|
+
`ImprovementDriver`) for the surface-proposal part, and feed it into
|
|
202
|
+
`runImprovementLoop`'s gate + PR machinery. `runAnalystLoop`'s other
|
|
203
|
+
responsibilities — the findings ledger and knowledge-graph updates, which
|
|
204
|
+
are *not* surface optimization — stay where they are. **Phase 3
|
|
205
|
+
(agent-runtime); the `ImprovementDriver` contract it implements is already
|
|
206
|
+
shipped in agent-eval 0.40.1.**
|
|
207
|
+
|
|
208
|
+
3. **`runLoop` + `runMultishot` converge into one parameterized
|
|
209
|
+
`runConversationLoop`** with a pluggable backend (`sandbox | router`). The
|
|
210
|
+
two are the same shape (driver ↔ workers, iterate) differing only in
|
|
211
|
+
backend and intent; unify them. **Phase 3+ (cross-repo); needs its own
|
|
212
|
+
design pass — introduces a backend abstraction and couples the two repos'
|
|
213
|
+
inner loops, so it lands after the `ImprovementDriver` model is proven in
|
|
214
|
+
product use.**
|
|
215
|
+
|
|
216
|
+
## Vocabulary quick reference
|
|
217
|
+
|
|
218
|
+
- **shot** — one conversational turn (driver says X, worker responds Y). Used
|
|
219
|
+
in `runMultishot`. Never used to mean a whole eval run.
|
|
220
|
+
- **runMultishot** — many shots in one conversation; persona-driver ↔ one
|
|
221
|
+
router-agent. agent-eval.
|
|
222
|
+
- **runLoop** — driver ↔ workers in sandboxes; topology-agnostic execution.
|
|
223
|
+
agent-runtime.
|
|
224
|
+
- **runCampaign** — a measurement: a surface scored over N scenarios × M reps.
|
|
225
|
+
agent-eval. (A "campaign" = a coordinated batch of measurements.)
|
|
226
|
+
- **runOptimization** — the improvement loop body: driver proposes surfaces,
|
|
227
|
+
each measured by a campaign, top-K promoted per generation. agent-eval.
|
|
228
|
+
- **runImprovementLoop** — `runOptimization` + holdout re-score + release gate
|
|
229
|
+
+ optional PR. agent-eval.
|
|
230
|
+
- **runAnalystLoop** — reflective autoresearch: findings + knowledge updates +
|
|
231
|
+
improvement proposals. agent-runtime.
|
|
232
|
+
- **ImprovementDriver** — the pluggable strategy that proposes surfaces;
|
|
233
|
+
`evolutionaryDriver` and (planned) `analystDriver` conform.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tangle-network/agent-eval",
|
|
3
|
-
"version": "0.37.0",
|
|
3
|
+
"version": "0.40.1",
|
|
4
4
|
"description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
|
|
5
5
|
"homepage": "https://github.com/tangle-network/agent-eval#readme",
|
|
6
6
|
"repository": {
|
|
@@ -99,6 +99,16 @@
|
|
|
99
99
|
"import": "./dist/matrix/index.js",
|
|
100
100
|
"default": "./dist/matrix/index.js"
|
|
101
101
|
},
|
|
102
|
+
"./multishot": {
|
|
103
|
+
"types": "./dist/multishot/index.d.ts",
|
|
104
|
+
"import": "./dist/multishot/index.js",
|
|
105
|
+
"default": "./dist/multishot/index.js"
|
|
106
|
+
},
|
|
107
|
+
"./campaign": {
|
|
108
|
+
"types": "./dist/campaign/index.d.ts",
|
|
109
|
+
"import": "./dist/campaign/index.js",
|
|
110
|
+
"default": "./dist/campaign/index.js"
|
|
111
|
+
},
|
|
102
112
|
"./openapi.json": {
|
|
103
113
|
"default": "./dist/openapi.json"
|
|
104
114
|
}
|
|
@@ -114,17 +124,6 @@
|
|
|
114
124
|
"publishConfig": {
|
|
115
125
|
"access": "public"
|
|
116
126
|
},
|
|
117
|
-
"scripts": {
|
|
118
|
-
"build": "tsup && pnpm openapi",
|
|
119
|
-
"dev": "tsup --watch",
|
|
120
|
-
"prepare": "pnpm build",
|
|
121
|
-
"test": "vitest run",
|
|
122
|
-
"test:watch": "vitest",
|
|
123
|
-
"typecheck": "tsc --noEmit",
|
|
124
|
-
"lint": "biome check src",
|
|
125
|
-
"format": "biome format --write src",
|
|
126
|
-
"openapi": "node dist/cli.js openapi --out dist/openapi.json"
|
|
127
|
-
},
|
|
128
127
|
"dependencies": {
|
|
129
128
|
"@asteasolutions/zod-to-openapi": "^8.5.0",
|
|
130
129
|
"@ax-llm/ax": "^19.0.25",
|
|
@@ -138,30 +137,45 @@
|
|
|
138
137
|
"@tangle-network/sandbox": "^0.2.1"
|
|
139
138
|
},
|
|
140
139
|
"peerDependenciesMeta": {
|
|
141
|
-
"@tangle-network/agent-runtime": {
|
|
142
|
-
|
|
140
|
+
"@tangle-network/agent-runtime": {
|
|
141
|
+
"optional": true
|
|
142
|
+
},
|
|
143
|
+
"@tangle-network/sandbox": {
|
|
144
|
+
"optional": true
|
|
145
|
+
}
|
|
143
146
|
},
|
|
144
147
|
"devDependencies": {
|
|
145
148
|
"@biomejs/biome": "^2.4.15",
|
|
146
149
|
"@tangle-network/agent-runtime": "^0.21.0",
|
|
147
150
|
"@tangle-network/sandbox": "^0.2.1",
|
|
148
151
|
"@types/node": "^25.6.0",
|
|
152
|
+
"husky": "^9.1.7",
|
|
153
|
+
"lint-staged": "^17.0.5",
|
|
149
154
|
"openapi3-ts": "^4.5.0",
|
|
150
155
|
"tsup": "^8.0.0",
|
|
151
156
|
"typescript": "^5.7.0",
|
|
152
157
|
"vitest": "^3.0.0"
|
|
153
158
|
},
|
|
154
|
-
"pnpm": {
|
|
155
|
-
"minimumReleaseAge": 4320,
|
|
156
|
-
"minimumReleaseAgeExclude": ["@tangle-network/sandbox", "@tangle-network/agent-runtime"],
|
|
157
|
-
"overrides": {
|
|
158
|
-
"postcss@<8.5.10": "^8.5.10",
|
|
159
|
-
"ws@>=8.0.0 <8.20.1": "^8.20.1"
|
|
160
|
-
}
|
|
161
|
-
},
|
|
162
159
|
"engines": {
|
|
163
160
|
"node": ">=20"
|
|
164
161
|
},
|
|
162
|
+
"lint-staged": {
|
|
163
|
+
"src/**/*.{ts,tsx}": [
|
|
164
|
+
"biome check --write --no-errors-on-unmatched"
|
|
165
|
+
],
|
|
166
|
+
"tests/**/*.{ts,tsx}": [
|
|
167
|
+
"biome check --write --no-errors-on-unmatched"
|
|
168
|
+
]
|
|
169
|
+
},
|
|
165
170
|
"license": "MIT",
|
|
166
|
-
"
|
|
167
|
-
|
|
171
|
+
"scripts": {
|
|
172
|
+
"build": "tsup && pnpm openapi",
|
|
173
|
+
"dev": "tsup --watch",
|
|
174
|
+
"test": "vitest run",
|
|
175
|
+
"test:watch": "vitest",
|
|
176
|
+
"typecheck": "tsc --noEmit",
|
|
177
|
+
"lint": "biome check src",
|
|
178
|
+
"format": "biome format --write src",
|
|
179
|
+
"openapi": "node dist/cli.js openapi --out dist/openapi.json"
|
|
180
|
+
}
|
|
181
|
+
}
|