@tangle-network/agent-eval 0.38.0 → 0.40.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. package/dist/campaign/index.d.ts +775 -0
  2. package/dist/campaign/index.js +807 -0
  3. package/dist/campaign/index.js.map +1 -0
  4. package/dist/chunk-5U2DOJU4.js +565 -0
  5. package/dist/chunk-5U2DOJU4.js.map +1 -0
  6. package/dist/{chunk-KE7TDJUO.js → chunk-AU2JLNSZ.js} +2 -2
  7. package/dist/{chunk-TSPOEDM3.js → chunk-BWZEGTES.js} +2 -5
  8. package/dist/chunk-BWZEGTES.js.map +1 -0
  9. package/dist/{chunk-3HYQXPC2.js → chunk-DMW5VENN.js} +3 -3
  10. package/dist/{chunk-TQL7BAOY.js → chunk-EGIPWXHL.js} +2 -2
  11. package/dist/chunk-GGE4NNQT.js +65 -0
  12. package/dist/chunk-GGE4NNQT.js.map +1 -0
  13. package/dist/{chunk-7PR3WPWE.js → chunk-L7XMNXLO.js} +2 -2
  14. package/dist/{chunk-RL6TERL2.js → chunk-LCIDRYGP.js} +3 -3
  15. package/dist/{chunk-L5UNCDAJ.js → chunk-MAOZCN36.js} +2 -64
  16. package/dist/chunk-MAOZCN36.js.map +1 -0
  17. package/dist/{chunk-LGAPK7NA.js → chunk-NKLGKF2Q.js} +2 -2
  18. package/dist/chunk-TMXPFWC7.js +305 -0
  19. package/dist/chunk-TMXPFWC7.js.map +1 -0
  20. package/dist/{chunk-KHZRNY3F.js → chunk-WP7SY7AI.js} +5 -4
  21. package/dist/chunk-WP7SY7AI.js.map +1 -0
  22. package/dist/chunk-YV7J7X5N.js +313 -0
  23. package/dist/chunk-YV7J7X5N.js.map +1 -0
  24. package/dist/{control-DVrmvM_k.d.ts → control-CmLJk3IG.d.ts} +1 -1
  25. package/dist/control.d.ts +3 -3
  26. package/dist/control.js +2 -2
  27. package/dist/{dataset-ueRVTUoY.d.ts → dataset-BlwAtYYf.d.ts} +1 -1
  28. package/dist/{feedback-trajectory-iATEAHmc.d.ts → feedback-trajectory-Dvy-bt7x.d.ts} +1 -1
  29. package/dist/governance/index.d.ts +133 -5
  30. package/dist/index.d.ts +35 -34
  31. package/dist/index.js +97 -630
  32. package/dist/index.js.map +1 -1
  33. package/dist/multishot/index.d.ts +21 -21
  34. package/dist/multishot/index.js +64 -15
  35. package/dist/multishot/index.js.map +1 -1
  36. package/dist/openapi.json +1 -1
  37. package/dist/optimization.d.ts +2 -2
  38. package/dist/optimization.js +5 -5
  39. package/dist/pipelines/index.js +2 -2
  40. package/dist/red-team-30II1T4o.d.ts +63 -0
  41. package/dist/{release-report-D2ykiLSe.d.ts → release-report-Di84bXD7.d.ts} +5 -2
  42. package/dist/reporting.d.ts +2 -2
  43. package/dist/reporting.js +3 -3
  44. package/dist/rl.js +15 -315
  45. package/dist/rl.js.map +1 -1
  46. package/dist/run-campaign-JYJXYHHL.js +10 -0
  47. package/dist/run-campaign-JYJXYHHL.js.map +1 -0
  48. package/dist/traces.js +7 -5
  49. package/dist/wire/index.d.ts +2 -2
  50. package/docs/design/loop-taxonomy.md +233 -0
  51. package/docs/design/self-improvement-engine.md +130 -0
  52. package/package.json +33 -24
  53. package/dist/chunk-KHZRNY3F.js.map +0 -1
  54. package/dist/chunk-L5UNCDAJ.js.map +0 -1
  55. package/dist/chunk-TSPOEDM3.js.map +0 -1
  56. package/dist/index-CN2agEaO.d.ts +0 -191
  57. /package/dist/{chunk-KE7TDJUO.js.map → chunk-AU2JLNSZ.js.map} +0 -0
  58. /package/dist/{chunk-3HYQXPC2.js.map → chunk-DMW5VENN.js.map} +0 -0
  59. /package/dist/{chunk-TQL7BAOY.js.map → chunk-EGIPWXHL.js.map} +0 -0
  60. /package/dist/{chunk-7PR3WPWE.js.map → chunk-L7XMNXLO.js.map} +0 -0
  61. /package/dist/{chunk-RL6TERL2.js.map → chunk-LCIDRYGP.js.map} +0 -0
  62. /package/dist/{chunk-LGAPK7NA.js.map → chunk-NKLGKF2Q.js.map} +0 -0
@@ -0,0 +1,10 @@
1
+ import {
2
+ runCampaign
3
+ } from "./chunk-TMXPFWC7.js";
4
+ import "./chunk-WP7SY7AI.js";
5
+ import "./chunk-QYJT52YW.js";
6
+ import "./chunk-PZ5AY32C.js";
7
+ export {
8
+ runCampaign
9
+ };
10
+ //# sourceMappingURL=run-campaign-JYJXYHHL.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}
package/dist/traces.js CHANGED
@@ -1,11 +1,9 @@
1
1
  import {
2
- DEFAULT_REDACTION_RULES,
3
2
  DEFAULT_TRACE_ANALYST_BUDGETS,
4
3
  FileSystemTraceStore,
5
4
  InMemoryTraceStore,
6
5
  OTEL_AGENT_EVAL_SCOPE,
7
6
  OtlpFileTraceStore,
8
- REDACTION_VERSION,
9
7
  ReplayCache,
10
8
  ReplayCacheMissError,
11
9
  SpanNotFoundError,
@@ -30,13 +28,17 @@ import {
30
28
  iterateRawCalls,
31
29
  otelRunCompleteHook,
32
30
  planTraceInsightQuestions,
33
- redactString,
34
- redactValue,
35
31
  scoreTraceInsightReadiness,
36
32
  tokenizeDomainWords,
37
33
  traceAnalystFunctionGroup,
38
34
  traceAnalystOnRunComplete
39
- } from "./chunk-L5UNCDAJ.js";
35
+ } from "./chunk-MAOZCN36.js";
36
+ import {
37
+ DEFAULT_REDACTION_RULES,
38
+ REDACTION_VERSION,
39
+ redactString,
40
+ redactValue
41
+ } from "./chunk-GGE4NNQT.js";
40
42
  import {
41
43
  aggregateLlm,
42
44
  argHash,
@@ -1,4 +1,4 @@
1
- import { n as FeedbackTrajectoryStore } from '../feedback-trajectory-iATEAHmc.js';
1
+ import { n as FeedbackTrajectoryStore } from '../feedback-trajectory-Dvy-bt7x.js';
2
2
  import { T as TraceStore } from '../store-Db2Bv8Cf.js';
3
3
  import { z } from 'zod';
4
4
  import { OpenAPIObject } from 'openapi3-ts/oas31';
@@ -7,7 +7,7 @@ import { ServerType } from '@hono/node-server';
7
7
  import { Hono } from 'hono';
8
8
  import '../control-runtime-BZ_lVLYW.js';
9
9
  import '../emitter-DP_cSSiw.js';
10
- import '../dataset-ueRVTUoY.js';
10
+ import '../dataset-BlwAtYYf.js';
11
11
  import '../errors-mje_cKOs.js';
12
12
 
13
13
  declare const RubricDimensionSchema: z.ZodObject<{
@@ -0,0 +1,233 @@
1
+ # Loop taxonomy: driver, worker, measurement, and the improvement loop
2
+
3
+ This is the canonical vocabulary for the Tangle agent stack. It exists because
4
+ the same word ("loop", "shot", "worker") was being used at three different
5
+ layers, and the layers were getting conflated. Every role below has exactly
6
+ one meaning. Use these words and nothing else.
7
+
8
+ Cross-links: [`three-package-architecture.md`](../three-package-architecture.md)
9
+ (who owns what), [`concepts.md`](../concepts.md) (eval mental model),
10
+ [`multi-shot-optimization.md`](../multi-shot-optimization.md) (GEPA),
11
+ [`auto-research-loop-end-to-end.md`](../auto-research-loop-end-to-end.md)
12
+ (analyst / autoresearch).
13
+
14
+ ## The three roles
15
+
16
+ | Role | Definition | Lives at |
17
+ |---|---|---|
18
+ | **Driver** | The thing that *decides what happens next*. Plans, then decides whether to continue. | Both layers (see below) |
19
+ | **Worker** | An agent harness instance (Claude Code, Codex, OpenCode, …) running inside a sandbox. Does the actual work; responds in chat. | Inner layer only |
20
+ | **Sandbox** | A multi-harness VM. Hosts **1..N workers**, which can share a workspace. Not an agent — the substrate an agent runs in. | Inner layer only |
21
+ | **Measurement** | Runs the worker over a set of scenarios and judges the outputs into a scorecard with confidence intervals. This is `runCampaign`. | Outer layer |
22
+
23
+ Two facts that trip people up:
24
+
25
+ 1. **A sandbox is not a worker.** One sandbox can hold ten workers — a driver
26
+ can coordinate CC (Claude Code) + Codex + OpenCode siblings sharing one workspace, or a
27
+ fleet spread across machines. `runLoop`'s placement encodes exactly this:
28
+ `{ sibling, sandboxId }` = co-located workers; `{ fleet, fleetId,
29
+ machineId, sandboxId }` = workers across machines.
30
+
31
+ 2. **"Driver" exists at two layers and means the same *kind* of thing
32
+ (a decider) at each, but the things it decides differ:**
33
+ - **Conversation driver** (inner): decides the next *turn* — a persona/user
34
+ simulating chat, or a planner fanning work to workers.
35
+ - **Improvement driver** (outer): decides the next *surface* — what system
36
+ prompt / tool config / code the workers should run.
37
+
38
+ ## The nesting
39
+
40
+ There are two loops. The outer one improves the thing the inner one runs.
41
+
42
+ ```
43
+ runImprovementLoop OUTER loop — improve the agent over time
44
+
45
+ ├─ DRIVER = ImprovementDriver proposes a candidate SURFACE
46
+ │ (evolutionary mutator | (the worker's system prompt / tools / config)
47
+ │ reflective analyst) — NOT a conversation turn
48
+
49
+ └─ for each candidate surface:
50
+
51
+ runCampaign a MEASUREMENT — scores ONE surface
52
+
53
+ └─ for each scenario × rep:
54
+
55
+ dispatch(scenario) THE SEAM — topology-opaque, returns an artifact
56
+
57
+ └─ runLoop / runMultishot INNER loop — one conversation
58
+ ├─ DRIVER = persona / user / planner chats with ↓
59
+ └─ WORKERS = 1..N agent harnesses in 1..M sandboxes
60
+
61
+ → transcript / artifact
62
+ judge(artifact) → score
63
+ → scorecard + CIs
64
+ gate(winner vs baseline) → PR
65
+ ```
66
+
67
+ ### `dispatch` is the topology-opaque seam
68
+
69
+ `dispatch(scenario) → artifact` is the boundary between the measurement layer
70
+ and the execution layer. The measurement does **not** know or care how the
71
+ artifact was produced. Behind the seam can be:
72
+
73
+ - one LLM call,
74
+ - one worker (CC) in one sandbox,
75
+ - a conversation driver coordinating 10 workers (CC + Codex + OpenCode)
76
+ sharing a workspace in one sandbox,
77
+ - a fleet across machines.
78
+
79
+ All of it is invisible to `runCampaign`. This is why the substrate has no
80
+ opinion about execution topology: the topology lives inside `dispatch`.
81
+
82
+ ### Corrected statements (things that were said backwards)
83
+
84
+ - The worker is the agent in the sandbox. The driver talks to it. ✓
85
+ - `runCampaign` is a **measurement**, not a worker. It *runs the worker* (via
86
+ `dispatch`); the worker does not "run the eval".
87
+ - The outer improvement loop has **no single worker** — its driver proposes a
88
+ *surface*, and each surface is scored by a *measurement* that drives the
89
+ inner workers.
90
+
91
+ ## The dataset flywheel — why every loop run matters
92
+
93
+ **Every loop run, regardless of why it ran, feeds the same dataset.** This is
94
+ the through-line that ties measurement and improvement together.
95
+
96
+ When `runCampaign` runs with a `labeledStore`, each cell captures
97
+ `(scenario, artifact, judgeScore, source)` into the `LabeledScenarioStore`.
98
+ The `source` discriminates *why* the run happened — but the captured tuple is
99
+ identical in shape:
100
+
101
+ | `captureSource` | The run that produced it |
102
+ |---|---|
103
+ | `'eval-run'` | a plain evaluation campaign |
104
+ | `'production-trace'` | a real user conversation in production |
105
+ | `'red-team'` | an adversarial probe |
106
+ | `'synthetic'` | a generated scenario |
107
+ | `'manual'` | a human-curated example |
108
+
109
+ That captured corpus **is the GEPA training set.** A basic eval run, a
110
+ production conversation, and an autoresearch loop all deposit the same
111
+ `(input, output, reward)` tuples. The optimization driver later samples from
112
+ that corpus to evolve the surface. So:
113
+
114
+ > Running *any* loop — even one whose purpose is not optimization — builds the
115
+ > dataset that optimization needs. The flywheel turns whether or not you are
116
+ > currently optimizing.
117
+
118
+ This is enforced, not aspirational: `runImprovementLoop` **refuses**
119
+ `tracing: 'off'` whenever a driver is wired, precisely because a loop that
120
+ doesn't feed the dataset is a loop that breaks the flywheel.
121
+
122
+ Temporal-split discipline (train vs holdout, `capturedBefore`) and
123
+ default-off-for-training of `production-trace` are enforced at the
124
+ `LabeledScenarioStore.sample()` boundary so the flywheel cannot contaminate
125
+ the holdout it is judged against. See `src/campaign/labeled-store/`.
126
+
127
+ ## One improvement loop, pluggable drivers
128
+
129
+ The improvement loop is **driver-agnostic**. `runOptimization` (the loop body)
130
+ and `runImprovementLoop` (the gated-promotion shell) call
131
+ `driver.propose(...)` → measure → `driver.decide(...)`. They do not know which
132
+ strategy is driving. Two strategies conform to the same `ImprovementDriver`
133
+ interface:
134
+
135
+ ```ts
136
+ interface ImprovementDriver<TFindings = unknown> {
137
+ kind: string
138
+ propose(args: {
139
+ currentSurface: MutableSurface
140
+ history: GenerationRecord[] // what's been tried + scored
141
+ findings: TFindings[] // external signal (e.g. analyst output)
142
+ populationSize: number
143
+ generation: number
144
+ signal: AbortSignal
145
+ }): Promise<MutableSurface[]>
146
+ decide?(args: { history: GenerationRecord[] }): { stop: boolean; reason?: string }
147
+ }
148
+ ```
149
+
150
+ | Driver | Strategy | How it proposes | Where it lives |
151
+ |---|---|---|---|
152
+ | `evolutionaryDriver` | Evolutionary (GEPA / AxGEPA) | Mutates the current best surface into N candidates, blind to history beyond the current best. Optimizes against the dataset's rewards. | **agent-eval** (pure: dataset → surface, no sandbox) |
153
+ | `analystDriver` *(planned)* | Reflective | Reads trace findings + generation history, reasons about *why* candidates failed, proposes targeted edits. | **agent-runtime** (runs sandboxes to do research) — implements agent-eval's `ImprovementDriver` |
154
+
155
+ This resolves the prior duplication where `runImprovementLoop` (evolutionary,
156
+ agent-eval) and `runAnalystLoop` (reflective, agent-runtime) were two parallel
157
+ loops doing "propose change → measure → gate → PR". There is **one loop**;
158
+ the analyst becomes a driver of it. The dependency direction permits this
159
+ cleanly: agent-eval is the leaf and owns the `ImprovementDriver` contract;
160
+ agent-runtime imports agent-eval and implements the contract.
161
+
162
+ ## What "the surface" is — improvement tiers
163
+
164
+ `MutableSurface` is the thing a driver changes. It has tiers, least → most
165
+ invasive. `MutableSurface = string | CodeSurface` models tiers 1–2 (`string`) and tier 4
166
+ (`CodeSurface`); tier 3 (knowledge) stays separate — see "Resolved design decisions" below.
167
+
168
+ | Tier | Surface | Driver that changes it | Blast radius |
169
+ |---|---|---|---|
170
+ | 1 | System prompt / prompt-signature addendum | `evolutionaryDriver` (GEPA), `analystDriver` | prompt only |
171
+ | 2 | Tool config / tool signatures | `analystDriver` | which tools, their schemas |
172
+ | 3 | Knowledge (wiki / knowledge graph) | agent-knowledge's knowledge adapter | what the agent *knows* |
173
+ | 4 | Code / scaffolding | autoresearch (reads codebase + traces) → worktree / PR | the implementation itself |
174
+
175
+ The key distinction (as Drew framed it):
176
+
177
+ - **Analyst** updates the *signatures* — the prompt and tool surface (tiers
178
+ 1–2). Cheap, reversible, measured directly against the dataset.
179
+ - **Autoresearch** updates the *code* (tier 4). It reads the repository plus
180
+ the trace findings, opens a worktree, and proposes implementation changes —
181
+ measured by re-running the inner loop against the changed code.
182
+
183
+ Both are `ImprovementDriver`s in the abstract (propose a change → measure →
184
+ gate → PR). They differ only in *what* they edit and *how invasive* it is. And
185
+ both consume the **same dataset** the flywheel builds.
186
+
187
+ ## Resolved design decisions
188
+
189
+ 1. **`MutableSurface` widens to span all tiers.** `MutableSurface = string |
190
+ CodeSurface`. The `string` form is tiers 1–2 (prompt / serialized tool
191
+ config); `CodeSurface = { kind: 'code'; worktreeRef; baseRef?; summary? }`
192
+ is tier 4 (an implementation change behind a worktree ref). One loop spans
193
+ prompt *and* code improvement. `surfaceHash` hashes a string by content and
194
+ a code surface by its `(worktreeRef, baseRef)` identity (the content lives
195
+ in git). **Shipped in agent-eval 0.40.1.** The consumer's
196
+ `dispatchWithSurface` is responsible for checking out a code surface's
197
+ worktree before running the worker.
198
+
199
+ 2. **`runAnalystLoop` (agent-runtime): analyst becomes a driver; knowledge
200
+ stays separate.** Extract an `analystDriver` (implements agent-eval's
201
+ `ImprovementDriver`) for the surface-proposal part, and feed it into
202
+ `runImprovementLoop`'s gate + PR machinery. `runAnalystLoop`'s other
203
+ responsibilities — the findings ledger and knowledge-graph updates, which
204
+ are *not* surface optimization — stay where they are. **Phase 3
205
+ (agent-runtime); the `ImprovementDriver` contract it implements is already
206
+ shipped in agent-eval 0.40.1.**
207
+
208
+ 3. **`runLoop` + `runMultishot` converge into one parameterized
209
+ `runConversationLoop`** with a pluggable backend (`sandbox | router`). The
210
+ two are the same shape (driver ↔ workers, iterate) differing only in
211
+ backend and intent; unify them. **Phase 3+ (cross-repo); needs its own
212
+ design pass — introduces a backend abstraction and couples the two repos'
213
+ inner loops, so it lands after the `ImprovementDriver` model is proven in
214
+ product use.**
215
+
216
+ ## Vocabulary quick reference
217
+
218
+ - **shot** — one conversational turn (driver says X, worker responds Y). Used
219
+ in `runMultishot`. Never used to mean a whole eval run.
220
+ - **runMultishot** — many shots in one conversation; persona-driver ↔ one
221
+ router-agent. agent-eval.
222
+ - **runLoop** — driver ↔ workers in sandboxes; topology-agnostic execution.
223
+ agent-runtime.
224
+ - **runCampaign** — a measurement: a surface scored over N scenarios × M reps.
225
+ agent-eval. (A "campaign" = a coordinated batch of measurements.)
226
+ - **runOptimization** — the improvement loop body: driver proposes surfaces,
227
+ each measured by a campaign, top-K promoted per generation. agent-eval.
228
+ - **runImprovementLoop** — `runOptimization` + holdout re-score + release gate
229
+ + optional PR. agent-eval.
230
+ - **runAnalystLoop** — reflective autoresearch: findings + knowledge updates +
231
+ improvement proposals. agent-runtime.
232
+ - **ImprovementDriver** — the pluggable strategy that proposes surfaces;
233
+ `evolutionaryDriver` and (planned) `analystDriver` conform.
@@ -0,0 +1,130 @@
1
+ # The self-improvement engine
2
+
3
+ How the pieces compose into a closed loop that improves an agent over time.
4
+ This builds on [`loop-taxonomy.md`](./loop-taxonomy.md) (the role vocabulary)
5
+ — read that first. Here we describe the *engine*: the phases, the data flow,
6
+ and where each existing primitive plugs in.
7
+
8
+ ## The closed loop, by phase
9
+
10
+ ```
11
+ PHASE 1 — RUN
12
+ driver ↔ workers (sandbox) over scenarios
13
+ → traces emitted → TraceStore + LabeledScenarioStore (the dataset)
14
+ Every run feeds the dataset regardless of why it ran (see the flywheel
15
+ section in loop-taxonomy.md). This is the only source of improvement signal.
16
+
17
+ PHASE 2 — ANALYZE ← the research report is born here
18
+ trace analysts run over the accumulated traces
19
+ (today: runAnalystLoop steps 2–4 in agent-runtime)
20
+ - run the analyst registry over traces → findings
21
+ - persist findings to the ledger
22
+ - diff the new findings vs the baseline → research report
23
+ Output: a research report = { findings, diff } grounded in real traces.
24
+
25
+ PHASE 3 — PROPOSE
26
+ ImprovementDriver.propose(input) → MutableSurface[]
27
+ input carries:
28
+ - currentSurface the current best surface (prompt string or CodeSurface)
29
+ - history prior generations + their scores
30
+ - report the Phase-2 research report (findings + diff)
31
+ - traces all traces (read access) — "all the data"
32
+ - dataset the LabeledScenarioStore handle
33
+ - populationSize BREADTH: how many candidate surfaces to return
34
+ - maxImprovementShots DEPTH: how many runLoop iterations each candidate
35
+ generation may take (1..MAX_IMPROVEMENT_SHOTS)
36
+ For the code-tier (autoresearch) driver, propose() runs a FULL sandbox
37
+ runLoop: a driver↔worker(s) loop that reads report+traces+codebase and
38
+ produces the improvement as commits in ONE worktree per candidate.
39
+ Output: CodeSurface{ worktreeRef }[] (or string[] for prompt-tier).
40
+
41
+ PHASE 4 — MEASURE
42
+ each candidate → runCampaign on the holdout set
43
+ (checks out the candidate's worktree, runs the worker against the changed
44
+ code/prompt, judges, scores). The measurement is driver-agnostic.
45
+
46
+ PHASE 5 — GATE + PROMOTE
47
+ defaultProductionGate(winner vs baseline on holdout) → ship | hold | …
48
+ on ship → open a PR from the winning worktree (one worktree = one PR).
49
+
50
+ ↺ loop back to PHASE 1 with the promoted surface as the new baseline.
51
+ ```
52
+
53
+ The improvement loop body (`runOptimization`) owns Phases 3–4; the gated
54
+ shell (`runImprovementLoop`) adds Phase 5. Phases 1–2 are upstream — the
55
+ run that produces traces, and the analysts that turn traces into a report.
56
+
57
+ ## `propose()` — the plan step, recursively agentic
58
+
59
+ `propose()` does NOT run the worker over scenarios and does NOT measure. It returns N
60
+ candidate surfaces to measure next. *How* it produces them is per-driver and
61
+ spans a cost spectrum:
62
+
63
+ | Driver | `propose()` mechanism | Sandbox? | Output |
64
+ |---|---|---|---|
65
+ | `evolutionaryDriver` | mutate current surface text into N variants | no | `string[]` |
66
+ | `analystDriver` (reflective) | LLM reads the report → drafts edits | LLM call | `string[]` / `CodeSurface[]` |
67
+ | `autoresearchDriver` (code-tier) | **full sandbox runLoop** (≤ `maxImprovementShots`) reads report+traces+codebase → commits in one worktree | **yes** | `CodeSurface[]` |
68
+
69
+ The recursion: generating *one* candidate (autoresearch `propose`) is itself a
70
+ driver↔worker-in-a-sandbox loop that runs before the *measurement* of that
71
+ candidate (Phase 4), both nested inside the improvement loop. "A loop whose step
72
+ contains a loop."
73
+
74
+ Two knobs, not one:
75
+ - **`populationSize`** — breadth: how many candidates `propose()` returns.
76
+ - **`maxImprovementShots`** — depth: how many runLoop iterations the
77
+ generating agent gets per candidate (N=1 → single-shot; N>1 → it can
78
+ iterate on its own change before handing it back to be measured).
79
+
80
+ ## Package boundaries (respecting the leaf direction)
81
+
82
+ agent-eval is the leaf (imports nothing upstream). agent-runtime imports it.
83
+ So:
84
+
85
+ | Piece | Package | Why |
86
+ |---|---|---|
87
+ | `ImprovementDriver` contract | agent-eval | the shared interface; everyone implements it |
88
+ | widened `propose()` input (report/traces/dataset) | agent-eval | part of the contract |
89
+ | `evolutionaryDriver` | agent-eval | pure: dataset → surface, no sandbox |
90
+ | **VCS-pluggable worktree adapter** | agent-eval | pure git/FS, no sandbox; produces `CodeSurface` |
91
+ | `runOptimization` / `runImprovementLoop` | agent-eval | driver-agnostic loop body + gated shell |
92
+ | `defaultProductionGate` | agent-eval | measurement-side safety |
93
+ | **`autoresearchDriver`** (sandbox-spawning `propose`) | agent-runtime | needs the sandbox SDK + `runLoop` |
94
+ | `analystDriver` (wraps the improvement adapter) | agent-runtime | depends on `runAnalystLoop` machinery |
95
+ | trace analysts / `runAnalystLoop` (Phase 2) | agent-runtime | runs agents to analyze |
96
+
97
+ ## The worktree adapter (VCS-pluggable)
98
+
99
+ One improvement = one worktree, PR-like (multiple commits allowed). The
100
+ adapter abstracts the VCS so the driver code is VCS-agnostic:
101
+
102
+ ```ts
103
+ interface WorktreeAdapter {
104
+ create(opts: { baseRef: string; label: string }): Promise<Worktree>
105
+ // ... agent commits into worktree.path ...
106
+ finalize(wt: Worktree, summary: string): Promise<CodeSurface> // → { kind:'code', worktreeRef, baseRef, summary }
107
+ discard(wt: Worktree): Promise<void>
108
+ }
109
+ ```
110
+
111
+ - **git** impl ships first (`git worktree add` / branch / commit).
112
+ - **jj** ([jj-vcs](https://github.com/jj-vcs/jj)) is a candidate second impl —
113
+ not built now; the interface exists so it can slot in without touching
114
+ driver code.
115
+
116
+ The measurement (Phase 4) consumes a `CodeSurface` by checking out
117
+ `worktreeRef` before running the worker; on promotion (Phase 5) the worktree
118
+ becomes the PR branch.
119
+
120
+ ## Build sequence
121
+
122
+ 1. **agent-eval 0.40.2**: widen `propose()` input (additive optional
123
+ `report` / `traces` / `dataset` / `maxImprovementShots`); add the
124
+ VCS-pluggable worktree adapter with a git impl; multi-sink trace fan-out
125
+ helper.
126
+ 2. **agent-runtime 0.25.0**: `analystDriver` (wraps the improvement adapter,
127
+ fed the Phase-2 report); `autoresearchDriver` (sandbox runLoop `propose`);
128
+ default-on multi-sink tracing in `handleChatTurn`.
129
+ 3. Wire one consumer end-to-end (Phase 4 of the broader rollout), prove it,
130
+ then fan out.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tangle-network/agent-eval",
3
- "version": "0.38.0",
3
+ "version": "0.40.2",
4
4
  "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
5
5
  "homepage": "https://github.com/tangle-network/agent-eval#readme",
6
6
  "repository": {
@@ -104,6 +104,11 @@
104
104
  "import": "./dist/multishot/index.js",
105
105
  "default": "./dist/multishot/index.js"
106
106
  },
107
+ "./campaign": {
108
+ "types": "./dist/campaign/index.d.ts",
109
+ "import": "./dist/campaign/index.js",
110
+ "default": "./dist/campaign/index.js"
111
+ },
107
112
  "./openapi.json": {
108
113
  "default": "./dist/openapi.json"
109
114
  }
@@ -119,17 +124,6 @@
119
124
  "publishConfig": {
120
125
  "access": "public"
121
126
  },
122
- "scripts": {
123
- "build": "tsup && pnpm openapi",
124
- "dev": "tsup --watch",
125
- "prepare": "pnpm build",
126
- "test": "vitest run",
127
- "test:watch": "vitest",
128
- "typecheck": "tsc --noEmit",
129
- "lint": "biome check src",
130
- "format": "biome format --write src",
131
- "openapi": "node dist/cli.js openapi --out dist/openapi.json"
132
- },
133
127
  "dependencies": {
134
128
  "@asteasolutions/zod-to-openapi": "^8.5.0",
135
129
  "@ax-llm/ax": "^19.0.25",
@@ -143,30 +137,45 @@
143
137
  "@tangle-network/sandbox": "^0.2.1"
144
138
  },
145
139
  "peerDependenciesMeta": {
146
- "@tangle-network/agent-runtime": { "optional": true },
147
- "@tangle-network/sandbox": { "optional": true }
140
+ "@tangle-network/agent-runtime": {
141
+ "optional": true
142
+ },
143
+ "@tangle-network/sandbox": {
144
+ "optional": true
145
+ }
148
146
  },
149
147
  "devDependencies": {
150
148
  "@biomejs/biome": "^2.4.15",
151
149
  "@tangle-network/agent-runtime": "^0.21.0",
152
150
  "@tangle-network/sandbox": "^0.2.1",
153
151
  "@types/node": "^25.6.0",
152
+ "husky": "^9.1.7",
153
+ "lint-staged": "^17.0.5",
154
154
  "openapi3-ts": "^4.5.0",
155
155
  "tsup": "^8.0.0",
156
156
  "typescript": "^5.7.0",
157
157
  "vitest": "^3.0.0"
158
158
  },
159
- "pnpm": {
160
- "minimumReleaseAge": 4320,
161
- "minimumReleaseAgeExclude": ["@tangle-network/sandbox", "@tangle-network/agent-runtime"],
162
- "overrides": {
163
- "postcss@<8.5.10": "^8.5.10",
164
- "ws@>=8.0.0 <8.20.1": "^8.20.1"
165
- }
166
- },
167
159
  "engines": {
168
160
  "node": ">=20"
169
161
  },
162
+ "lint-staged": {
163
+ "src/**/*.{ts,tsx}": [
164
+ "biome check --write --no-errors-on-unmatched"
165
+ ],
166
+ "tests/**/*.{ts,tsx}": [
167
+ "biome check --write --no-errors-on-unmatched"
168
+ ]
169
+ },
170
170
  "license": "MIT",
171
- "packageManager": "pnpm@10.22.0"
172
- }
171
+ "scripts": {
172
+ "build": "tsup && pnpm openapi",
173
+ "dev": "tsup --watch",
174
+ "test": "vitest run",
175
+ "test:watch": "vitest",
176
+ "typecheck": "tsc --noEmit",
177
+ "lint": "biome check src",
178
+ "format": "biome format --write src",
179
+ "openapi": "node dist/cli.js openapi --out dist/openapi.json"
180
+ }
181
+ }