@tangle-network/agent-eval 0.38.0 → 0.40.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/campaign/index.d.ts +775 -0
- package/dist/campaign/index.js +807 -0
- package/dist/campaign/index.js.map +1 -0
- package/dist/chunk-5U2DOJU4.js +565 -0
- package/dist/chunk-5U2DOJU4.js.map +1 -0
- package/dist/{chunk-KE7TDJUO.js → chunk-AU2JLNSZ.js} +2 -2
- package/dist/{chunk-TSPOEDM3.js → chunk-BWZEGTES.js} +2 -5
- package/dist/chunk-BWZEGTES.js.map +1 -0
- package/dist/{chunk-3HYQXPC2.js → chunk-DMW5VENN.js} +3 -3
- package/dist/{chunk-TQL7BAOY.js → chunk-EGIPWXHL.js} +2 -2
- package/dist/chunk-GGE4NNQT.js +65 -0
- package/dist/chunk-GGE4NNQT.js.map +1 -0
- package/dist/{chunk-7PR3WPWE.js → chunk-L7XMNXLO.js} +2 -2
- package/dist/{chunk-RL6TERL2.js → chunk-LCIDRYGP.js} +3 -3
- package/dist/{chunk-L5UNCDAJ.js → chunk-MAOZCN36.js} +2 -64
- package/dist/chunk-MAOZCN36.js.map +1 -0
- package/dist/{chunk-LGAPK7NA.js → chunk-NKLGKF2Q.js} +2 -2
- package/dist/chunk-TMXPFWC7.js +305 -0
- package/dist/chunk-TMXPFWC7.js.map +1 -0
- package/dist/{chunk-KHZRNY3F.js → chunk-WP7SY7AI.js} +5 -4
- package/dist/chunk-WP7SY7AI.js.map +1 -0
- package/dist/chunk-YV7J7X5N.js +313 -0
- package/dist/chunk-YV7J7X5N.js.map +1 -0
- package/dist/{control-DVrmvM_k.d.ts → control-CmLJk3IG.d.ts} +1 -1
- package/dist/control.d.ts +3 -3
- package/dist/control.js +2 -2
- package/dist/{dataset-ueRVTUoY.d.ts → dataset-BlwAtYYf.d.ts} +1 -1
- package/dist/{feedback-trajectory-iATEAHmc.d.ts → feedback-trajectory-Dvy-bt7x.d.ts} +1 -1
- package/dist/governance/index.d.ts +133 -5
- package/dist/index.d.ts +35 -34
- package/dist/index.js +97 -630
- package/dist/index.js.map +1 -1
- package/dist/multishot/index.d.ts +21 -21
- package/dist/multishot/index.js +64 -15
- package/dist/multishot/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +2 -2
- package/dist/optimization.js +5 -5
- package/dist/pipelines/index.js +2 -2
- package/dist/red-team-30II1T4o.d.ts +63 -0
- package/dist/{release-report-D2ykiLSe.d.ts → release-report-Di84bXD7.d.ts} +5 -2
- package/dist/reporting.d.ts +2 -2
- package/dist/reporting.js +3 -3
- package/dist/rl.js +15 -315
- package/dist/rl.js.map +1 -1
- package/dist/run-campaign-JYJXYHHL.js +10 -0
- package/dist/run-campaign-JYJXYHHL.js.map +1 -0
- package/dist/traces.js +7 -5
- package/dist/wire/index.d.ts +2 -2
- package/docs/design/loop-taxonomy.md +233 -0
- package/docs/design/self-improvement-engine.md +130 -0
- package/package.json +33 -24
- package/dist/chunk-KHZRNY3F.js.map +0 -1
- package/dist/chunk-L5UNCDAJ.js.map +0 -1
- package/dist/chunk-TSPOEDM3.js.map +0 -1
- package/dist/index-CN2agEaO.d.ts +0 -191
- /package/dist/{chunk-KE7TDJUO.js.map → chunk-AU2JLNSZ.js.map} +0 -0
- /package/dist/{chunk-3HYQXPC2.js.map → chunk-DMW5VENN.js.map} +0 -0
- /package/dist/{chunk-TQL7BAOY.js.map → chunk-EGIPWXHL.js.map} +0 -0
- /package/dist/{chunk-7PR3WPWE.js.map → chunk-L7XMNXLO.js.map} +0 -0
- /package/dist/{chunk-RL6TERL2.js.map → chunk-LCIDRYGP.js.map} +0 -0
- /package/dist/{chunk-LGAPK7NA.js.map → chunk-NKLGKF2Q.js.map} +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}
|
package/dist/traces.js
CHANGED
|
@@ -1,11 +1,9 @@
|
|
|
1
1
|
import {
|
|
2
|
-
DEFAULT_REDACTION_RULES,
|
|
3
2
|
DEFAULT_TRACE_ANALYST_BUDGETS,
|
|
4
3
|
FileSystemTraceStore,
|
|
5
4
|
InMemoryTraceStore,
|
|
6
5
|
OTEL_AGENT_EVAL_SCOPE,
|
|
7
6
|
OtlpFileTraceStore,
|
|
8
|
-
REDACTION_VERSION,
|
|
9
7
|
ReplayCache,
|
|
10
8
|
ReplayCacheMissError,
|
|
11
9
|
SpanNotFoundError,
|
|
@@ -30,13 +28,17 @@ import {
|
|
|
30
28
|
iterateRawCalls,
|
|
31
29
|
otelRunCompleteHook,
|
|
32
30
|
planTraceInsightQuestions,
|
|
33
|
-
redactString,
|
|
34
|
-
redactValue,
|
|
35
31
|
scoreTraceInsightReadiness,
|
|
36
32
|
tokenizeDomainWords,
|
|
37
33
|
traceAnalystFunctionGroup,
|
|
38
34
|
traceAnalystOnRunComplete
|
|
39
|
-
} from "./chunk-L5UNCDAJ.js";
|
|
35
|
+
} from "./chunk-MAOZCN36.js";
|
|
36
|
+
import {
|
|
37
|
+
DEFAULT_REDACTION_RULES,
|
|
38
|
+
REDACTION_VERSION,
|
|
39
|
+
redactString,
|
|
40
|
+
redactValue
|
|
41
|
+
} from "./chunk-GGE4NNQT.js";
|
|
40
42
|
import {
|
|
41
43
|
aggregateLlm,
|
|
42
44
|
argHash,
|
package/dist/wire/index.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { n as FeedbackTrajectoryStore } from '../feedback-trajectory-iATEAHmc.js';
|
|
1
|
+
import { n as FeedbackTrajectoryStore } from '../feedback-trajectory-Dvy-bt7x.js';
|
|
2
2
|
import { T as TraceStore } from '../store-Db2Bv8Cf.js';
|
|
3
3
|
import { z } from 'zod';
|
|
4
4
|
import { OpenAPIObject } from 'openapi3-ts/oas31';
|
|
@@ -7,7 +7,7 @@ import { ServerType } from '@hono/node-server';
|
|
|
7
7
|
import { Hono } from 'hono';
|
|
8
8
|
import '../control-runtime-BZ_lVLYW.js';
|
|
9
9
|
import '../emitter-DP_cSSiw.js';
|
|
10
|
-
import '../dataset-ueRVTUoY.js';
|
|
10
|
+
import '../dataset-BlwAtYYf.js';
|
|
11
11
|
import '../errors-mje_cKOs.js';
|
|
12
12
|
|
|
13
13
|
declare const RubricDimensionSchema: z.ZodObject<{
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
# Loop taxonomy: driver, worker, measurement, and the improvement loop
|
|
2
|
+
|
|
3
|
+
This is the canonical vocabulary for the Tangle agent stack. It exists because
|
|
4
|
+
the same word ("loop", "shot", "worker") was being used at three different
|
|
5
|
+
layers, and the layers were getting conflated. Every role below has exactly
|
|
6
|
+
one meaning. Use these words and nothing else.
|
|
7
|
+
|
|
8
|
+
Cross-links: [`three-package-architecture.md`](../three-package-architecture.md)
|
|
9
|
+
(who owns what), [`concepts.md`](../concepts.md) (eval mental model),
|
|
10
|
+
[`multi-shot-optimization.md`](../multi-shot-optimization.md) (GEPA),
|
|
11
|
+
[`auto-research-loop-end-to-end.md`](../auto-research-loop-end-to-end.md)
|
|
12
|
+
(analyst / autoresearch).
|
|
13
|
+
|
|
14
|
+
## The three roles
|
|
15
|
+
|
|
16
|
+
| Role | Definition | Lives at |
|
|
17
|
+
|---|---|---|
|
|
18
|
+
| **Driver** | The thing that *decides what happens next*. Plans, then decides whether to continue. | Both layers (see below) |
|
|
19
|
+
| **Worker** | An agent harness instance (Claude Code, Codex, OpenCode, …) running inside a sandbox. Does the actual work; responds in chat. | Inner layer only |
|
|
20
|
+
| **Sandbox** | A multi-harness VM. Hosts **1..N workers**, which can share a workspace. Not an agent — the substrate an agent runs in. | Inner layer only |
|
|
21
|
+
| **Measurement** | Runs the worker over a set of scenarios and judges the outputs into a scorecard with confidence intervals. This is `runCampaign`. | Outer layer |
|
|
22
|
+
|
|
23
|
+
Two facts that trip people up:
|
|
24
|
+
|
|
25
|
+
1. **A sandbox is not a worker.** One sandbox can hold ten workers — a driver
|
|
26
|
+
can coordinate CC + Codex + OpenCode siblings sharing one workspace, or a
|
|
27
|
+
fleet spread across machines. `runLoop`'s placement encodes exactly this:
|
|
28
|
+
`{ sibling, sandboxId }` = co-located workers; `{ fleet, fleetId,
|
|
29
|
+
machineId, sandboxId }` = workers across machines.
|
|
30
|
+
|
|
31
|
+
2. **"Driver" exists at two layers and means the same *kind* of thing
|
|
32
|
+
(a decider) at each, but the things it decides differ:**
|
|
33
|
+
- **Conversation driver** (inner): decides the next *turn* — a persona/user
|
|
34
|
+
simulating chat, or a planner fanning work to workers.
|
|
35
|
+
- **Improvement driver** (outer): decides the next *surface* — what system
|
|
36
|
+
prompt / tool config / code the workers should run.
|
|
37
|
+
|
|
38
|
+
## The nesting
|
|
39
|
+
|
|
40
|
+
There are two loops. The outer one improves the thing the inner one runs.
|
|
41
|
+
|
|
42
|
+
```
|
|
43
|
+
runImprovementLoop OUTER loop — improve the agent over time
|
|
44
|
+
│
|
|
45
|
+
├─ DRIVER = ImprovementDriver proposes a candidate SURFACE
|
|
46
|
+
│ (evolutionary mutator | (the worker's system prompt / tools / config)
|
|
47
|
+
│ reflective analyst) — NOT a conversation turn
|
|
48
|
+
│
|
|
49
|
+
└─ for each candidate surface:
|
|
50
|
+
│
|
|
51
|
+
runCampaign a MEASUREMENT — scores ONE surface
|
|
52
|
+
│
|
|
53
|
+
└─ for each scenario × rep:
|
|
54
|
+
│
|
|
55
|
+
dispatch(scenario) THE SEAM — topology-opaque, returns an artifact
|
|
56
|
+
│
|
|
57
|
+
└─ runLoop / runMultishot INNER loop — one conversation
|
|
58
|
+
├─ DRIVER = persona / user / planner chats with ↓
|
|
59
|
+
└─ WORKERS = 1..N agent harnesses in 1..M sandboxes
|
|
60
|
+
│
|
|
61
|
+
→ transcript / artifact
|
|
62
|
+
judge(artifact) → score
|
|
63
|
+
→ scorecard + CIs
|
|
64
|
+
gate(winner vs baseline) → PR
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### `dispatch` is the topology-opaque seam
|
|
68
|
+
|
|
69
|
+
`dispatch(scenario) → artifact` is the boundary between the measurement layer
|
|
70
|
+
and the execution layer. The measurement does **not** know or care how the
|
|
71
|
+
artifact was produced. Behind the seam can be:
|
|
72
|
+
|
|
73
|
+
- one LLM call,
|
|
74
|
+
- one worker (CC) in one sandbox,
|
|
75
|
+
- a conversation driver coordinating 10 workers (CC + Codex + OpenCode)
|
|
76
|
+
sharing a workspace in one sandbox,
|
|
77
|
+
- a fleet across machines.
|
|
78
|
+
|
|
79
|
+
All of it is invisible to `runCampaign`. This is why the substrate has no
|
|
80
|
+
opinion about execution topology: the topology lives inside `dispatch`.
|
|
81
|
+
|
|
82
|
+
### Corrected statements (things that were said backwards)
|
|
83
|
+
|
|
84
|
+
- The worker is the agent in the sandbox. The driver talks to it. ✓
|
|
85
|
+
- `runCampaign` is a **measurement**, not a worker. It *runs the worker* (via
|
|
86
|
+
`dispatch`); the worker does not "run the eval".
|
|
87
|
+
- The outer improvement loop has **no single worker** — its driver proposes a
|
|
88
|
+
*surface*, and each surface is scored by a *measurement* that drives the
|
|
89
|
+
inner workers.
|
|
90
|
+
|
|
91
|
+
## The dataset flywheel — why every loop run matters
|
|
92
|
+
|
|
93
|
+
**Every loop run, regardless of why it ran, feeds the same dataset.** This is
|
|
94
|
+
the through-line that ties measurement and improvement together.
|
|
95
|
+
|
|
96
|
+
When `runCampaign` runs with a `labeledStore`, each cell captures
|
|
97
|
+
`(scenario, artifact, judgeScore, source)` into the `LabeledScenarioStore`.
|
|
98
|
+
The `source` discriminates *why* the run happened — but the captured tuple is
|
|
99
|
+
identical in shape:
|
|
100
|
+
|
|
101
|
+
| `captureSource` | The run that produced it |
|
|
102
|
+
|---|---|
|
|
103
|
+
| `'eval-run'` | a plain evaluation campaign |
|
|
104
|
+
| `'production-trace'` | a real user conversation in production |
|
|
105
|
+
| `'red-team'` | an adversarial probe |
|
|
106
|
+
| `'synthetic'` | a generated scenario |
|
|
107
|
+
| `'manual'` | a human-curated example |
|
|
108
|
+
|
|
109
|
+
That captured corpus **is the GEPA training set.** A basic eval run, a
|
|
110
|
+
production conversation, and an autoresearch loop all deposit the same
|
|
111
|
+
`(input, output, reward)` tuples. The optimization driver later samples from
|
|
112
|
+
that corpus to evolve the surface. So:
|
|
113
|
+
|
|
114
|
+
> Running *any* loop — even one whose purpose is not optimization — builds the
|
|
115
|
+
> dataset that optimization needs. The flywheel turns whether or not you are
|
|
116
|
+
> currently optimizing.
|
|
117
|
+
|
|
118
|
+
This is enforced, not aspirational: `runImprovementLoop` **refuses**
|
|
119
|
+
`tracing: 'off'` whenever a driver is wired, precisely because a loop that
|
|
120
|
+
doesn't feed the dataset is a loop that breaks the flywheel.
|
|
121
|
+
|
|
122
|
+
Temporal-split discipline (train vs holdout, `capturedBefore`) and
|
|
123
|
+
default-off-for-training of `production-trace` are enforced at the
|
|
124
|
+
`LabeledScenarioStore.sample()` boundary so the flywheel cannot contaminate
|
|
125
|
+
the holdout it is judged against. See `src/campaign/labeled-store/`.
|
|
126
|
+
|
|
127
|
+
## One improvement loop, pluggable drivers
|
|
128
|
+
|
|
129
|
+
The improvement loop is **driver-agnostic**. `runOptimization` (the loop body)
|
|
130
|
+
and `runImprovementLoop` (the gated-promotion shell) call
|
|
131
|
+
`driver.propose(...)` → measure → `driver.decide(...)`. They do not know which
|
|
132
|
+
strategy is driving. Two strategies conform to the same `ImprovementDriver`
|
|
133
|
+
interface:
|
|
134
|
+
|
|
135
|
+
```ts
|
|
136
|
+
interface ImprovementDriver<TFindings = unknown> {
|
|
137
|
+
kind: string
|
|
138
|
+
propose(args: {
|
|
139
|
+
currentSurface: MutableSurface
|
|
140
|
+
history: GenerationRecord[] // what's been tried + scored
|
|
141
|
+
findings: TFindings[] // external signal (e.g. analyst output)
|
|
142
|
+
populationSize: number
|
|
143
|
+
generation: number
|
|
144
|
+
signal: AbortSignal
|
|
145
|
+
}): Promise<MutableSurface[]>
|
|
146
|
+
decide?(args: { history: GenerationRecord[] }): { stop: boolean; reason?: string }
|
|
147
|
+
}
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
| Driver | Strategy | How it proposes | Where it lives |
|
|
151
|
+
|---|---|---|---|
|
|
152
|
+
| `evolutionaryDriver` | Evolutionary (GEPA / AxGEPA) | Mutates the current best surface into N candidates, blind to history beyond the current best. Optimizes against the dataset's rewards. | **agent-eval** (pure: dataset → surface, no sandbox) |
|
|
153
|
+
| `analystDriver` *(planned)* | Reflective | Reads trace findings + generation history, reasons about *why* candidates failed, proposes targeted edits. | **agent-runtime** (runs sandboxes to do research) — implements agent-eval's `ImprovementDriver` |
|
|
154
|
+
|
|
155
|
+
This resolves the prior duplication where `runImprovementLoop` (evolutionary,
|
|
156
|
+
agent-eval) and `runAnalystLoop` (reflective, agent-runtime) were two parallel
|
|
157
|
+
loops doing "propose change → measure → gate → PR". There is **one loop**;
|
|
158
|
+
the analyst becomes a driver of it. The dependency direction permits this
|
|
159
|
+
cleanly: agent-eval is the leaf and owns the `ImprovementDriver` contract;
|
|
160
|
+
agent-runtime imports agent-eval and implements the contract.
|
|
161
|
+
|
|
162
|
+
## What "the surface" is — improvement tiers
|
|
163
|
+
|
|
164
|
+
`MutableSurface` is the thing a driver changes. It has tiers, least → most
|
|
165
|
+
invasive. Today `MutableSurface = string` models tiers 1–2; tiers 3–4 are the
|
|
166
|
+
design question resolved below.
|
|
167
|
+
|
|
168
|
+
| Tier | Surface | Driver that changes it | Blast radius |
|
|
169
|
+
|---|---|---|---|
|
|
170
|
+
| 1 | System prompt / prompt-signature addendum | `evolutionaryDriver` (GEPA), `analystDriver` | prompt only |
|
|
171
|
+
| 2 | Tool config / tool signatures | `analystDriver` | which tools, their schemas |
|
|
172
|
+
| 3 | Knowledge (wiki / knowledge graph) | agent-knowledge's knowledge adapter | what the agent *knows* |
|
|
173
|
+
| 4 | Code / scaffolding | autoresearch (reads codebase + traces) → worktree / PR | the implementation itself |
|
|
174
|
+
|
|
175
|
+
The key distinction Drew drew:
|
|
176
|
+
|
|
177
|
+
- **Analyst** updates the *signatures* — the prompt and tool surface (tiers
|
|
178
|
+
1–2). Cheap, reversible, measured directly against the dataset.
|
|
179
|
+
- **Autoresearch** updates the *code* (tier 4). It reads the repository plus
|
|
180
|
+
the trace findings, opens a worktree, and proposes implementation changes —
|
|
181
|
+
measured by re-running the inner loop against the changed code.
|
|
182
|
+
|
|
183
|
+
Both are `ImprovementDriver`s in the abstract (propose a change → measure →
|
|
184
|
+
gate → PR). They differ only in *what* they edit and *how invasive* it is. And
|
|
185
|
+
both consume the **same dataset** the flywheel builds.
|
|
186
|
+
|
|
187
|
+
## Resolved design decisions
|
|
188
|
+
|
|
189
|
+
1. **`MutableSurface` widens to span all tiers.** `MutableSurface = string |
|
|
190
|
+
CodeSurface`. The `string` form is tiers 1–2 (prompt / serialized tool
|
|
191
|
+
config); `CodeSurface = { kind: 'code'; worktreeRef; baseRef?; summary? }`
|
|
192
|
+
is tier 4 (an implementation change behind a worktree ref). One loop spans
|
|
193
|
+
prompt *and* code improvement. `surfaceHash` hashes a string by content and
|
|
194
|
+
a code surface by its `(worktreeRef, baseRef)` identity (the content lives
|
|
195
|
+
in git). **Shipped in agent-eval 0.40.1.** The consumer's
|
|
196
|
+
`dispatchWithSurface` is responsible for checking out a code surface's
|
|
197
|
+
worktree before running the worker.
|
|
198
|
+
|
|
199
|
+
2. **`runAnalystLoop` (agent-runtime): analyst becomes a driver; knowledge
|
|
200
|
+
stays separate.** Extract an `analystDriver` (implements agent-eval's
|
|
201
|
+
`ImprovementDriver`) for the surface-proposal part, and feed it into
|
|
202
|
+
`runImprovementLoop`'s gate + PR machinery. `runAnalystLoop`'s other
|
|
203
|
+
responsibilities — the findings ledger and knowledge-graph updates, which
|
|
204
|
+
are *not* surface optimization — stay where they are. **Phase 3
|
|
205
|
+
(agent-runtime); the `ImprovementDriver` contract it implements is already
|
|
206
|
+
shipped in agent-eval 0.40.1.**
|
|
207
|
+
|
|
208
|
+
3. **`runLoop` + `runMultishot` converge into one parameterized
|
|
209
|
+
`runConversationLoop`** with a pluggable backend (`sandbox | router`). The
|
|
210
|
+
two are the same shape (driver ↔ workers, iterate) differing only in
|
|
211
|
+
backend and intent; unify them. **Phase 3+ (cross-repo); needs its own
|
|
212
|
+
design pass — introduces a backend abstraction and couples the two repos'
|
|
213
|
+
inner loops, so it lands after the `ImprovementDriver` model is proven in
|
|
214
|
+
product use.**
|
|
215
|
+
|
|
216
|
+
## Vocabulary quick reference
|
|
217
|
+
|
|
218
|
+
- **shot** — one conversational turn (driver says X, worker responds Y). Used
|
|
219
|
+
in `runMultishot`. Never used to mean a whole eval run.
|
|
220
|
+
- **runMultishot** — many shots in one conversation; persona-driver ↔ one
|
|
221
|
+
router-agent. agent-eval.
|
|
222
|
+
- **runLoop** — driver ↔ workers in sandboxes; topology-agnostic execution.
|
|
223
|
+
agent-runtime.
|
|
224
|
+
- **runCampaign** — a measurement: a surface scored over N scenarios × M reps.
|
|
225
|
+
agent-eval. (A "campaign" = a coordinated batch of measurements.)
|
|
226
|
+
- **runOptimization** — the improvement loop body: driver proposes surfaces,
|
|
227
|
+
each measured by a campaign, top-K promoted per generation. agent-eval.
|
|
228
|
+
- **runImprovementLoop** — `runOptimization` + holdout re-score + release gate
|
|
229
|
+
+ optional PR. agent-eval.
|
|
230
|
+
- **runAnalystLoop** — reflective autoresearch: findings + knowledge updates +
|
|
231
|
+
improvement proposals. agent-runtime.
|
|
232
|
+
- **ImprovementDriver** — the pluggable strategy that proposes surfaces;
|
|
233
|
+
`evolutionaryDriver` and (planned) `analystDriver` conform.
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# The self-improvement engine
|
|
2
|
+
|
|
3
|
+
How the pieces compose into a closed loop that improves an agent over time.
|
|
4
|
+
This builds on [`loop-taxonomy.md`](./loop-taxonomy.md) (the role vocabulary)
|
|
5
|
+
— read that first. Here we describe the *engine*: the phases, the data flow,
|
|
6
|
+
and where each existing primitive plugs in.
|
|
7
|
+
|
|
8
|
+
## The closed loop, by phase
|
|
9
|
+
|
|
10
|
+
```
|
|
11
|
+
PHASE 1 — RUN
|
|
12
|
+
driver ↔ workers (sandbox) over scenarios
|
|
13
|
+
→ traces emitted → TraceStore + LabeledScenarioStore (the dataset)
|
|
14
|
+
Every run feeds the dataset regardless of why it ran (see the flywheel
|
|
15
|
+
section in loop-taxonomy.md). This is the only source of improvement signal.
|
|
16
|
+
|
|
17
|
+
PHASE 2 — ANALYZE ← the research report is born here
|
|
18
|
+
trace analysts run over the accumulated traces
|
|
19
|
+
(today: runAnalystLoop steps 2–4 in agent-runtime)
|
|
20
|
+
- run the analyst registry over traces → findings
|
|
21
|
+
- persist findings to the ledger
|
|
22
|
+
- diff the new findings vs the baseline → research report
|
|
23
|
+
Output: a research report = { findings, diff } grounded in real traces.
|
|
24
|
+
|
|
25
|
+
PHASE 3 — PROPOSE
|
|
26
|
+
ImprovementDriver.propose(input) → MutableSurface[]
|
|
27
|
+
input carries:
|
|
28
|
+
- currentSurface the current best surface (prompt string or CodeSurface)
|
|
29
|
+
- history prior generations + their scores
|
|
30
|
+
- report the Phase-2 research report (findings + diff)
|
|
31
|
+
- traces all traces (read access) — "all the data"
|
|
32
|
+
- dataset the LabeledScenarioStore handle
|
|
33
|
+
- populationSize BREADTH: how many candidate surfaces to return
|
|
34
|
+
- maxImprovementShots DEPTH: how many runLoop iterations each candidate
|
|
35
|
+
generation may take (1..MAX_IMPROVEMENT_SHOTS)
|
|
36
|
+
For the code-tier (autoresearch) driver, propose() runs a FULL sandbox
|
|
37
|
+
runLoop: a driver↔worker(s) loop that reads report+traces+codebase and
|
|
38
|
+
produces the improvement as commits in ONE worktree per candidate.
|
|
39
|
+
Output: CodeSurface{ worktreeRef }[] (or string[] for prompt-tier).
|
|
40
|
+
|
|
41
|
+
PHASE 4 — MEASURE
|
|
42
|
+
each candidate → runCampaign on the holdout set
|
|
43
|
+
(checks out the candidate's worktree, runs the worker against the changed
|
|
44
|
+
code/prompt, judges, scores). The measurement is driver-agnostic.
|
|
45
|
+
|
|
46
|
+
PHASE 5 — GATE + PROMOTE
|
|
47
|
+
defaultProductionGate(winner vs baseline on holdout) → ship | hold | …
|
|
48
|
+
on ship → open a PR from the winning worktree (one worktree = one PR).
|
|
49
|
+
|
|
50
|
+
↺ loop back to PHASE 1 with the promoted surface as the new baseline.
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
The improvement loop body (`runOptimization`) owns Phases 3–4; the gated
|
|
54
|
+
shell (`runImprovementLoop`) adds Phase 5. Phases 1–2 are upstream — the
|
|
55
|
+
run that produces traces, and the analysts that turn traces into a report.
|
|
56
|
+
|
|
57
|
+
## `propose()` — the plan step, recursively agentic
|
|
58
|
+
|
|
59
|
+
`propose()` does NOT run the worker and does NOT measure. It returns N
|
|
60
|
+
candidate surfaces to measure next. *How* it produces them is per-driver and
|
|
61
|
+
spans a cost spectrum:
|
|
62
|
+
|
|
63
|
+
| Driver | `propose()` mechanism | Sandbox? | Output |
|
|
64
|
+
|---|---|---|---|
|
|
65
|
+
| `evolutionaryDriver` | mutate current surface text into N variants | no | `string[]` |
|
|
66
|
+
| `analystDriver` (reflective) | LLM reads the report → drafts edits | LLM call | `string[]` / `CodeSurface[]` |
|
|
67
|
+
| `autoresearchDriver` (code-tier) | **full sandbox runLoop** (≤ `maxImprovementShots`) reads report+traces+codebase → commits in one worktree | **yes** | `CodeSurface[]` |
|
|
68
|
+
|
|
69
|
+
The recursion: generating *one* candidate (autoresearch `propose`) is itself a
|
|
70
|
+
driver↔worker-in-a-sandbox loop, nested inside the *measurement* of that
|
|
71
|
+
candidate (Phase 4), nested inside the improvement loop. "A loop whose step
|
|
72
|
+
contains a loop."
|
|
73
|
+
|
|
74
|
+
Two knobs, not one:
|
|
75
|
+
- **`populationSize`** — breadth: how many candidates `propose()` returns.
|
|
76
|
+
- **`maxImprovementShots`** — depth: how many runLoop iterations the
|
|
77
|
+
generating agent gets per candidate (N=1 → single-shot; N>1 → it can
|
|
78
|
+
iterate on its own change before handing it back to be measured).
|
|
79
|
+
|
|
80
|
+
## Package boundaries (respecting the leaf direction)
|
|
81
|
+
|
|
82
|
+
agent-eval is the leaf (imports nothing upstream). agent-runtime imports it.
|
|
83
|
+
So:
|
|
84
|
+
|
|
85
|
+
| Piece | Package | Why |
|
|
86
|
+
|---|---|---|
|
|
87
|
+
| `ImprovementDriver` contract | agent-eval | the shared interface; everyone implements it |
|
|
88
|
+
| widened `propose()` input (report/traces/dataset) | agent-eval | part of the contract |
|
|
89
|
+
| `evolutionaryDriver` | agent-eval | pure: dataset → surface, no sandbox |
|
|
90
|
+
| **VCS-pluggable worktree adapter** | agent-eval | pure git/FS, no sandbox; produces `CodeSurface` |
|
|
91
|
+
| `runOptimization` / `runImprovementLoop` | agent-eval | driver-agnostic loop body + gated shell |
|
|
92
|
+
| `defaultProductionGate` | agent-eval | measurement-side safety |
|
|
93
|
+
| **`autoresearchDriver`** (sandbox-spawning `propose`) | agent-runtime | needs the sandbox SDK + `runLoop` |
|
|
94
|
+
| `analystDriver` (wraps the improvement adapter) | agent-runtime | depends on `runAnalystLoop` machinery |
|
|
95
|
+
| trace analysts / `runAnalystLoop` (Phase 2) | agent-runtime | runs agents to analyze |
|
|
96
|
+
|
|
97
|
+
## The worktree adapter (VCS-pluggable)
|
|
98
|
+
|
|
99
|
+
One improvement = one worktree, PR-like (multiple commits allowed). The
|
|
100
|
+
adapter abstracts the VCS so the driver code is VCS-agnostic:
|
|
101
|
+
|
|
102
|
+
```ts
|
|
103
|
+
interface WorktreeAdapter {
|
|
104
|
+
create(opts: { baseRef: string; label: string }): Promise<Worktree>
|
|
105
|
+
// ... agent commits into worktree.path ...
|
|
106
|
+
finalize(wt: Worktree, summary: string): Promise<CodeSurface> // → { kind:'code', worktreeRef, baseRef, summary }
|
|
107
|
+
discard(wt: Worktree): Promise<void>
|
|
108
|
+
}
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
- **git** impl ships first (`git worktree add` / branch / commit).
|
|
112
|
+
- **jj** ([jj-vcs](https://github.com/jj-vcs/jj)) is a candidate second impl —
|
|
113
|
+
not built now; the interface exists so it can slot in without touching
|
|
114
|
+
driver code.
|
|
115
|
+
|
|
116
|
+
The measurement (Phase 4) consumes a `CodeSurface` by checking out
|
|
117
|
+
`worktreeRef` before running the worker; on promotion (Phase 5) the worktree
|
|
118
|
+
becomes the PR branch.
|
|
119
|
+
|
|
120
|
+
## Build sequence
|
|
121
|
+
|
|
122
|
+
1. **agent-eval 0.40.2**: widen `propose()` input (additive optional
|
|
123
|
+
`report` / `traces` / `dataset` / `maxImprovementShots`); add the
|
|
124
|
+
VCS-pluggable worktree adapter with a git impl; multi-sink trace fan-out
|
|
125
|
+
helper.
|
|
126
|
+
2. **agent-runtime 0.25.0**: `analystDriver` (wraps the improvement adapter,
|
|
127
|
+
fed the Phase-2 report); `autoresearchDriver` (sandbox runLoop `propose`);
|
|
128
|
+
default-on multi-sink tracing in `handleChatTurn`.
|
|
129
|
+
3. Wire one consumer end-to-end (Phase 4 of the broader rollout), prove it,
|
|
130
|
+
then fan out.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tangle-network/agent-eval",
|
|
3
|
-
"version": "0.38.0",
|
|
3
|
+
"version": "0.40.2",
|
|
4
4
|
"description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
|
|
5
5
|
"homepage": "https://github.com/tangle-network/agent-eval#readme",
|
|
6
6
|
"repository": {
|
|
@@ -104,6 +104,11 @@
|
|
|
104
104
|
"import": "./dist/multishot/index.js",
|
|
105
105
|
"default": "./dist/multishot/index.js"
|
|
106
106
|
},
|
|
107
|
+
"./campaign": {
|
|
108
|
+
"types": "./dist/campaign/index.d.ts",
|
|
109
|
+
"import": "./dist/campaign/index.js",
|
|
110
|
+
"default": "./dist/campaign/index.js"
|
|
111
|
+
},
|
|
107
112
|
"./openapi.json": {
|
|
108
113
|
"default": "./dist/openapi.json"
|
|
109
114
|
}
|
|
@@ -119,17 +124,6 @@
|
|
|
119
124
|
"publishConfig": {
|
|
120
125
|
"access": "public"
|
|
121
126
|
},
|
|
122
|
-
"scripts": {
|
|
123
|
-
"build": "tsup && pnpm openapi",
|
|
124
|
-
"dev": "tsup --watch",
|
|
125
|
-
"prepare": "pnpm build",
|
|
126
|
-
"test": "vitest run",
|
|
127
|
-
"test:watch": "vitest",
|
|
128
|
-
"typecheck": "tsc --noEmit",
|
|
129
|
-
"lint": "biome check src",
|
|
130
|
-
"format": "biome format --write src",
|
|
131
|
-
"openapi": "node dist/cli.js openapi --out dist/openapi.json"
|
|
132
|
-
},
|
|
133
127
|
"dependencies": {
|
|
134
128
|
"@asteasolutions/zod-to-openapi": "^8.5.0",
|
|
135
129
|
"@ax-llm/ax": "^19.0.25",
|
|
@@ -143,30 +137,45 @@
|
|
|
143
137
|
"@tangle-network/sandbox": "^0.2.1"
|
|
144
138
|
},
|
|
145
139
|
"peerDependenciesMeta": {
|
|
146
|
-
"@tangle-network/agent-runtime": {
|
|
147
|
-
|
|
140
|
+
"@tangle-network/agent-runtime": {
|
|
141
|
+
"optional": true
|
|
142
|
+
},
|
|
143
|
+
"@tangle-network/sandbox": {
|
|
144
|
+
"optional": true
|
|
145
|
+
}
|
|
148
146
|
},
|
|
149
147
|
"devDependencies": {
|
|
150
148
|
"@biomejs/biome": "^2.4.15",
|
|
151
149
|
"@tangle-network/agent-runtime": "^0.21.0",
|
|
152
150
|
"@tangle-network/sandbox": "^0.2.1",
|
|
153
151
|
"@types/node": "^25.6.0",
|
|
152
|
+
"husky": "^9.1.7",
|
|
153
|
+
"lint-staged": "^17.0.5",
|
|
154
154
|
"openapi3-ts": "^4.5.0",
|
|
155
155
|
"tsup": "^8.0.0",
|
|
156
156
|
"typescript": "^5.7.0",
|
|
157
157
|
"vitest": "^3.0.0"
|
|
158
158
|
},
|
|
159
|
-
"pnpm": {
|
|
160
|
-
"minimumReleaseAge": 4320,
|
|
161
|
-
"minimumReleaseAgeExclude": ["@tangle-network/sandbox", "@tangle-network/agent-runtime"],
|
|
162
|
-
"overrides": {
|
|
163
|
-
"postcss@<8.5.10": "^8.5.10",
|
|
164
|
-
"ws@>=8.0.0 <8.20.1": "^8.20.1"
|
|
165
|
-
}
|
|
166
|
-
},
|
|
167
159
|
"engines": {
|
|
168
160
|
"node": ">=20"
|
|
169
161
|
},
|
|
162
|
+
"lint-staged": {
|
|
163
|
+
"src/**/*.{ts,tsx}": [
|
|
164
|
+
"biome check --write --no-errors-on-unmatched"
|
|
165
|
+
],
|
|
166
|
+
"tests/**/*.{ts,tsx}": [
|
|
167
|
+
"biome check --write --no-errors-on-unmatched"
|
|
168
|
+
]
|
|
169
|
+
},
|
|
170
170
|
"license": "MIT",
|
|
171
|
-
"
|
|
172
|
-
|
|
171
|
+
"scripts": {
|
|
172
|
+
"build": "tsup && pnpm openapi",
|
|
173
|
+
"dev": "tsup --watch",
|
|
174
|
+
"test": "vitest run",
|
|
175
|
+
"test:watch": "vitest",
|
|
176
|
+
"typecheck": "tsc --noEmit",
|
|
177
|
+
"lint": "biome check src",
|
|
178
|
+
"format": "biome format --write src",
|
|
179
|
+
"openapi": "node dist/cli.js openapi --out dist/openapi.json"
|
|
180
|
+
}
|
|
181
|
+
}
|