@tangle-network/agent-eval 0.40.2 → 0.40.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/campaign/index.d.ts +34 -29
- package/dist/campaign/index.js +2 -2
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-TMXPFWC7.js → chunk-YNMCYUWT.js} +10 -10
- package/dist/chunk-YNMCYUWT.js.map +1 -0
- package/dist/openapi.json +1 -1
- package/dist/{run-campaign-JYJXYHHL.js → run-campaign-KEJK5KFT.js} +2 -2
- package/docs/design/loop-taxonomy.md +40 -32
- package/docs/design/phase4-consumer-migration.md +70 -0
- package/docs/design/primitives-integration-spec.md +393 -0
- package/docs/design/product-self-improvement-loop.md +146 -0
- package/docs/design/self-improvement-engine.md +27 -17
- package/package.json +1 -1
- package/dist/chunk-TMXPFWC7.js.map +0 -1
- package/dist/{run-campaign-JYJXYHHL.js.map → run-campaign-KEJK5KFT.js.map} +0 -0
package/dist/openapi.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"openapi": "3.1.0",
|
|
3
3
|
"info": {
|
|
4
4
|
"title": "@tangle-network/agent-eval — wire protocol",
|
|
5
|
-
"version": "0.40.2",
|
|
5
|
+
"version": "0.40.4",
|
|
6
6
|
"description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
|
|
7
7
|
"contact": {
|
|
8
8
|
"name": "Tangle Network",
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import {
|
|
2
2
|
runCampaign
|
|
3
|
-
} from "./chunk-TMXPFWC7.js";
|
|
3
|
+
} from "./chunk-YNMCYUWT.js";
|
|
4
4
|
import "./chunk-WP7SY7AI.js";
|
|
5
5
|
import "./chunk-QYJT52YW.js";
|
|
6
6
|
import "./chunk-PZ5AY32C.js";
|
|
7
7
|
export {
|
|
8
8
|
runCampaign
|
|
9
9
|
};
|
|
10
|
-
//# sourceMappingURL=run-campaign-JYJXYHHL.js.map
|
|
10
|
+
//# sourceMappingURL=run-campaign-KEJK5KFT.js.map
|
|
@@ -147,42 +147,47 @@ interface ImprovementDriver<TFindings = unknown> {
|
|
|
147
147
|
}
|
|
148
148
|
```
|
|
149
149
|
|
|
150
|
-
|
|
|
150
|
+
| Implementation | Strategy | How it proposes | Where it lives |
|
|
151
151
|
|---|---|---|---|
|
|
152
|
-
| `evolutionaryDriver` | Evolutionary (GEPA / AxGEPA) | Mutates the current best surface into N candidates, blind to history beyond the current best. Optimizes against the dataset's rewards. | **agent-eval** (pure: dataset → surface, no sandbox) |
|
|
153
|
-
| `
|
|
152
|
+
| `evolutionaryDriver` | Evolutionary (GEPA / AxGEPA) | Standalone `ImprovementDriver`. Mutates the current best surface into N candidates, blind to history beyond the current best. Optimizes against the dataset's rewards. | **agent-eval** (pure: dataset → surface, no sandbox) |
|
|
153
|
+
| `improvementDriver` + `reflectiveGenerator` | Reflective | One driver, cheap generator: drafts patches from the report and applies them into a worktree (shots=1, no sandbox). | **agent-runtime** — implements agent-eval's `ImprovementDriver` |
|
|
154
|
+
| `improvementDriver` + `agenticGenerator` | Agentic | Same driver, full generator: runs a coding harness in the worktree (≤ `maxImprovementShots`) to edit in place. | **agent-runtime** |
|
|
154
155
|
|
|
155
156
|
This resolves the prior duplication where `runImprovementLoop` (evolutionary,
|
|
156
157
|
agent-eval) and `runAnalystLoop` (reflective, agent-runtime) were two parallel
|
|
157
|
-
loops doing "propose change → measure → gate → PR". There is **one loop
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
158
|
+
loops doing "propose change → measure → gate → PR". There is **one loop** and
|
|
159
|
+
**one driver** (`improvementDriver`); the reflective and agentic paths are
|
|
160
|
+
pluggable *generators* of it (the same operation at two settings of a cost
|
|
161
|
+
dial), not separate drivers. The dependency direction permits this cleanly:
|
|
162
|
+
agent-eval is the leaf and owns the `ImprovementDriver` contract; agent-runtime
|
|
163
|
+
imports agent-eval and implements it.
|
|
161
164
|
|
|
162
165
|
## What "the surface" is — improvement tiers
|
|
163
166
|
|
|
164
|
-
`MutableSurface` is the thing
|
|
165
|
-
invasive.
|
|
166
|
-
|
|
167
|
+
`MutableSurface` is the thing the driver changes. It has tiers, least → most
|
|
168
|
+
invasive. `MutableSurface = string | CodeSurface` spans all of them: `string`
|
|
169
|
+
for tiers 1–2, `CodeSurface{ worktreeRef }` for tier 4.
|
|
167
170
|
|
|
168
|
-
| Tier | Surface |
|
|
171
|
+
| Tier | Surface | Generator that changes it | Blast radius |
|
|
169
172
|
|---|---|---|---|
|
|
170
|
-
| 1 | System prompt / prompt-signature addendum | `evolutionaryDriver` (GEPA), `
|
|
171
|
-
| 2 | Tool config / tool signatures | `
|
|
173
|
+
| 1 | System prompt / prompt-signature addendum | `evolutionaryDriver` (GEPA), `reflectiveGenerator` | prompt only |
|
|
174
|
+
| 2 | Tool config / tool signatures | `reflectiveGenerator` | which tools, their schemas |
|
|
172
175
|
| 3 | Knowledge (wiki / knowledge graph) | agent-knowledge's knowledge adapter | what the agent *knows* |
|
|
173
|
-
| 4 | Code / scaffolding |
|
|
176
|
+
| 4 | Code / scaffolding | `agenticGenerator` (coding harness reads codebase + report) → worktree / PR | the implementation itself |
|
|
174
177
|
|
|
175
|
-
The
|
|
178
|
+
The cost/capability distinction:
|
|
176
179
|
|
|
177
|
-
-
|
|
178
|
-
1–2). Cheap, reversible, measured
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
180
|
+
- **`reflectiveGenerator`** updates the *signatures* — prompt + tool surface
|
|
181
|
+
(tiers 1–2). Cheap (drafts patches, no sandbox), reversible, measured
|
|
182
|
+
directly against the dataset.
|
|
183
|
+
- **`agenticGenerator`** updates the *code* (tier 4). A coding harness reads
|
|
184
|
+
the repository + the report, edits in a worktree, iterates up to
|
|
185
|
+
`maxImprovementShots` — measured by re-running the inner loop against the
|
|
186
|
+
changed code.
|
|
182
187
|
|
|
183
|
-
Both are
|
|
184
|
-
|
|
185
|
-
|
|
188
|
+
Both are generators of the one `improvementDriver` (propose → measure → gate →
|
|
189
|
+
PR). They differ only in *what* they edit and *how invasive* it is — and both
|
|
190
|
+
consume the **same dataset** the flywheel builds.
|
|
186
191
|
|
|
187
192
|
## Resolved design decisions
|
|
188
193
|
|
|
@@ -196,14 +201,13 @@ both consume the **same dataset** the flywheel builds.
|
|
|
196
201
|
`dispatchWithSurface` is responsible for checking out a code surface's
|
|
197
202
|
worktree before running the worker.
|
|
198
203
|
|
|
199
|
-
2. **`runAnalystLoop` (agent-runtime): analyst
|
|
200
|
-
stays separate.**
|
|
201
|
-
`
|
|
202
|
-
|
|
204
|
+
2. **`runAnalystLoop` (agent-runtime): the analyst is a GENERATOR, knowledge
|
|
205
|
+
stays separate.** Shipped in agent-runtime 0.25.0 as `improvementDriver` +
|
|
206
|
+
`reflectiveGenerator` (drafts patches from the report) / `agenticGenerator`
|
|
207
|
+
(coding harness in the worktree) — one driver, pluggable generators, fed
|
|
208
|
+
into `runImprovementLoop`'s gate + PR machinery. `runAnalystLoop`'s other
|
|
203
209
|
responsibilities — the findings ledger and knowledge-graph updates, which
|
|
204
|
-
are *not* surface optimization — stay where they are.
|
|
205
|
-
(agent-runtime); the `ImprovementDriver` contract it implements is already
|
|
206
|
-
shipped in agent-eval 0.40.1.**
|
|
210
|
+
are *not* surface optimization — stay where they are.
|
|
207
211
|
|
|
208
212
|
3. **`runLoop` + `runMultishot` converge into one parameterized
|
|
209
213
|
`runConversationLoop`** with a pluggable backend (`sandbox | router`). The
|
|
@@ -229,5 +233,9 @@ both consume the **same dataset** the flywheel builds.
|
|
|
229
233
|
+ optional PR. agent-eval.
|
|
230
234
|
- **runAnalystLoop** — reflective autoresearch: findings + knowledge updates +
|
|
231
235
|
improvement proposals. agent-runtime.
|
|
232
|
-
- **ImprovementDriver** — the
|
|
233
|
-
`evolutionaryDriver`
|
|
236
|
+
- **ImprovementDriver** — the contract a surface-proposer implements.
|
|
237
|
+
`evolutionaryDriver` (agent-eval) is one; agent-runtime's `improvementDriver`
|
|
238
|
+
is another, with pluggable `reflectiveGenerator` / `agenticGenerator`.
|
|
239
|
+
- **CandidateGenerator** — the byte-producing seam inside `improvementDriver`;
|
|
240
|
+
`reflectiveGenerator` (cheap, no sandbox) and `agenticGenerator` (coding
|
|
241
|
+
harness in the worktree) are the two cost settings. agent-runtime.
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# Phase 4 — consumer migration tracking
|
|
2
|
+
|
|
3
|
+
Migrate the product repos off their duplicated eval / prompt-evolution
|
|
4
|
+
orchestration onto the published substrate (`@tangle-network/agent-eval@^0.40.3`
|
|
5
|
+
+ `@tangle-network/agent-runtime@^0.25.0`). Integration contract:
|
|
6
|
+
[`primitives-integration-spec.md`](./primitives-integration-spec.md).
|
|
7
|
+
|
|
8
|
+
**Strategy:** prove **gtm end-to-end first** (the canonical consumer), then fan
|
|
9
|
+
the proven migration pattern to the rest via parallel subagents, each briefed
|
|
10
|
+
with the gtm reference diff + the spec's forbidden-anti-patterns list. Each
|
|
11
|
+
migration is its own reviewable, rollback-able PR.
|
|
12
|
+
|
|
13
|
+
## Status board
|
|
14
|
+
|
|
15
|
+
| Repo | Deletable orchestration (LOC est.) | Dispatch seam | Status | PR |
|
|
16
|
+
|---|---|---|---|---|
|
|
17
|
+
| gtm-agent | ~2,420 | `runChatThroughRuntime` | **IN PROGRESS** | — |
|
|
18
|
+
| legal-agent | tbd | tbd | queued | — |
|
|
19
|
+
| tax-agent | tbd | tbd | queued | — |
|
|
20
|
+
| creative-agent | tbd | tbd | queued | — |
|
|
21
|
+
| agent-builder | tbd | tbd | queued | — |
|
|
22
|
+
| blueprint-agent | tbd | tbd | queued (Drew dispatching via spec) | — |
|
|
23
|
+
| physim | tbd (MultiLayerVerifier adapter) | tbd | queued | — |
|
|
24
|
+
|
|
25
|
+
## Per-repo migration checklist
|
|
26
|
+
|
|
27
|
+
For each repo, in order:
|
|
28
|
+
|
|
29
|
+
- [ ] **Survey** — inventory eval + prompt-evolution wrappers (file:line + LOC).
|
|
30
|
+
Identify the dispatch seam, scenarios, judges, mutation strategy.
|
|
31
|
+
- [ ] **Bump deps** — `@tangle-network/agent-eval` → `^0.40.3`,
|
|
32
|
+
`@tangle-network/agent-runtime` → `^0.25.0`; `pnpm update`; baseline
|
|
33
|
+
typecheck green.
|
|
34
|
+
- [ ] **Rewire seams** — `dispatch`/`dispatchWithSurface`, `judges`,
|
|
35
|
+
`scenarios` extracted from the existing wrappers (KEEP domain logic).
|
|
36
|
+
- [ ] **Replace orchestration** — swap the local generation/population/scorecard
|
|
37
|
+
loop for `runImprovementLoop` (or `runCampaign` for eval-only). DELETE the
|
|
38
|
+
wrapper body.
|
|
39
|
+
- [ ] **Gate** — compose domain gates with `defaultProductionGate`.
|
|
40
|
+
- [ ] **Dataset** — wire `FsLabeledScenarioStore` with correct `captureSource`.
|
|
41
|
+
- [ ] **Tests** — port wrapper contract tests to assert the substrate wiring;
|
|
42
|
+
keep judge/scenario tests. Suite green.
|
|
43
|
+
- [ ] **Prove** — one real eval/improve run end-to-end; confirm scorecard +
|
|
44
|
+
(if applicable) a PR opens on a shipping gate.
|
|
45
|
+
- [ ] **Anti-pattern sweep** — no silent fallbacks, no reimplemented loop, no
|
|
46
|
+
train/holdout conflation, tracing on, dispatch named.
|
|
47
|
+
- [ ] **PR** — open, independent-review, merge.
|
|
48
|
+
|
|
49
|
+
## gtm-agent — migration map (from survey)
|
|
50
|
+
|
|
51
|
+
- **Branch base:** off the repo's working branch (`feat/gtm-rich-chat-actions`)
|
|
52
|
+
or main — confirm before starting.
|
|
53
|
+
- **Dispatch seam:** `runChatThroughRuntime(ctx)`
|
|
54
|
+
(`src/lib/.server/agent-runtime/chat.ts`) — prompt variant + scenario → real
|
|
55
|
+
agent run → artifact + events + token usage.
|
|
56
|
+
- **Scenarios:** `src/lib/.server/production-loop/scenarios.ts` (3 holdout) +
|
|
57
|
+
`eval/business-owner/personas.json` (canonical personas).
|
|
58
|
+
- **Judges:** `src/lib/.server/production-loop/judges.ts` (`runEnsembleJudge`,
|
|
59
|
+
3-model ensemble) + canonical 12-dimension judges in `eval/canonical.ts`.
|
|
60
|
+
- **Delete (~2,420 LOC orchestration):** the generation/population/reps loop in
|
|
61
|
+
`src/lib/.server/production-loop/index.ts` (~450), the checkpoint loop in
|
|
62
|
+
`eval/canonical.ts` (~600), `eval/run-prompt-evolution.ts` wrapper (~800),
|
|
63
|
+
`eval/analyst-loop.ts` wrapper (~300), `eval/optimization-campaign.ts` (~170),
|
|
64
|
+
`scripts/evals/run-optimization-campaign.ts` (~100 scaffold).
|
|
65
|
+
- **Rewire:** `buildHoldoutRunner` → `dispatchWithSurface`; `buildScorer` →
|
|
66
|
+
`judges`; `buildMutator` → `evolutionaryDriver({ mutator })`;
|
|
67
|
+
`runProductionLoop` → `runImprovementLoop`.
|
|
68
|
+
- **Keep:** judges, scenarios, persona data + reactive driver, deterministic
|
|
69
|
+
anti-slop/brief checks, GitHub PR wiring, feedback/trace ingestion.
|
|
70
|
+
- **Net:** ~1,400–1,600 LOC reduction.
|
|
@@ -0,0 +1,393 @@
|
|
|
1
|
+
# Self-improvement primitives — integration spec
|
|
2
|
+
|
|
3
|
+
**Audience:** an engineer (or agent) wiring a product onto the Tangle
|
|
4
|
+
self-improvement stack. This is the authoritative "how to use the primitives"
|
|
5
|
+
reference. It is exact: every signature, every seam, every forbidden pattern.
|
|
6
|
+
|
|
7
|
+
**Packages (published):**
|
|
8
|
+
- `@tangle-network/agent-eval@^0.40.3` — measurement + improvement loop +
|
|
9
|
+
worktree adapter + gates + dataset store. The leaf; depends on nothing
|
|
10
|
+
upstream. Import the loop surface from `@tangle-network/agent-eval/campaign`.
|
|
11
|
+
- `@tangle-network/agent-runtime@^0.25.0` — the runtime-side improvement
|
|
12
|
+
driver (`improvementDriver`) + generators (`reflectiveGenerator`,
|
|
13
|
+
`agenticGenerator`). Import from `@tangle-network/agent-runtime/improvement`.
|
|
14
|
+
|
|
15
|
+
Read [`loop-taxonomy.md`](./loop-taxonomy.md) (vocabulary) and
|
|
16
|
+
[`self-improvement-engine.md`](./self-improvement-engine.md) (phases) first.
|
|
17
|
+
This doc is the contract-level detail under them.
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## 0. The one-paragraph model
|
|
22
|
+
|
|
23
|
+
A **measurement** (`runCampaign`) runs your agent (behind a `dispatch` seam)
|
|
24
|
+
over `scenarios`, judges the outputs, and returns a scorecard with confidence
|
|
25
|
+
intervals. An **improvement loop** (`runImprovementLoop`) drives an
|
|
26
|
+
`ImprovementDriver` to propose candidate **surfaces** (a prompt string, or a
|
|
27
|
+
`CodeSurface` = a git worktree of code edits), measures each on a **holdout**,
|
|
28
|
+
runs a release **gate**, and opens a **PR** for the winner. Every run feeds a
|
|
29
|
+
**dataset** (`LabeledScenarioStore`) — the same corpus the optimizer learns
|
|
30
|
+
from. Three roles, fixed meaning: **driver** decides what's next; **worker** =
|
|
31
|
+
the agent in a sandbox (invoked behind `dispatch`); **measurement** runs the
|
|
32
|
+
worker and scores it.
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## 1. The seams you implement (everything else is substrate)
|
|
37
|
+
|
|
38
|
+
You implement exactly three things. The substrate owns the rest.
|
|
39
|
+
|
|
40
|
+
| Seam | Type | What it is |
|
|
41
|
+
|---|---|---|
|
|
42
|
+
| `dispatch` | `(scenario, ctx) => Promise<TArtifact>` | invoke YOUR agent on one scenario → the artifact judges score. Topology-opaque: one LLM call, or a driver↔workers-in-a-sandbox loop — substrate doesn't care. |
|
|
43
|
+
| `judges` | `JudgeConfig<TArtifact, TScenario>[]` | score an artifact on named dimensions → composite. Your rubrics. |
|
|
44
|
+
| `scenarios` | `Scenario[]` | the inputs (`{ id, kind, ... }`). Your eval set. |
|
|
45
|
+
|
|
46
|
+
If you are also improving a surface, you additionally provide:
|
|
47
|
+
|
|
48
|
+
| Seam | Type | What it is |
|
|
49
|
+
|---|---|---|
|
|
50
|
+
| `dispatchWithSurface` | `(surface, scenario, ctx) => Promise<TArtifact>` | like `dispatch`, but takes the candidate surface (prompt string or `CodeSurface`) — swap it into your agent before running. |
|
|
51
|
+
| a **driver** | `ImprovementDriver` | how candidates are proposed (see §5–§6). Use a shipped one; don't hand-roll. |
|
|
52
|
+
| a **gate** | `Gate` | ship/hold decision (use `defaultProductionGate`). |
|
|
53
|
+
|
|
54
|
+
**You never implement:** generation loops, population/top-K selection, seed
|
|
55
|
+
propagation, manifest hashing, cell caching, bootstrap CIs, worktree git
|
|
56
|
+
plumbing, PR-opening, or trace capture. Reimplementing any of these is the
|
|
57
|
+
anti-pattern this whole stack exists to delete.
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## 2. `runCampaign` — the measurement primitive
|
|
62
|
+
|
|
63
|
+
```ts
|
|
64
|
+
import { runCampaign, type RunCampaignOptions } from '@tangle-network/agent-eval/campaign'
|
|
65
|
+
|
|
66
|
+
const result = await runCampaign<MyScenario, MyArtifact>({
|
|
67
|
+
scenarios, // MyScenario[]
|
|
68
|
+
dispatch, // (scenario, ctx) => Promise<MyArtifact>
|
|
69
|
+
judges, // JudgeConfig<MyArtifact, MyScenario>[] (optional)
|
|
70
|
+
runDir: '/abs/run/dir', // REQUIRED — where artifacts + traces land
|
|
71
|
+
seed: 42, // default 42 — reproducibility
|
|
72
|
+
reps: 1, // per-scenario replicates; raise to 5+ for tight CIs
|
|
73
|
+
maxConcurrency: 2, // parallel cells
|
|
74
|
+
costCeiling: 5.0, // optional USD soft-abort
|
|
75
|
+
tracing: 'on', // default on; 'off' refused by improvement loop w/ a driver
|
|
76
|
+
labeledStore: store, // optional capture (see §8); 'off' to disable
|
|
77
|
+
captureSource: 'eval-run', // provenance for captured rows
|
|
78
|
+
})
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Returns `CampaignResult<TArtifact, TScenario>`:
|
|
82
|
+
```ts
|
|
83
|
+
{
|
|
84
|
+
manifestHash: string // sha256(scenarios, judges, dispatch ref, seed, reps) — run identity
|
|
85
|
+
seed: number
|
|
86
|
+
startedAt, endedAt, durationMs
|
|
87
|
+
cells: CampaignCellResult[] // one per scenario×rep: { cellId, scenarioId, rep, artifact, judgeScores, costUsd, cached, error? }
|
|
88
|
+
aggregates: {
|
|
89
|
+
byJudge: Record<string, JudgeAggregate> // { mean, stdev, ci95:[lo,hi], n } — bootstrap CIs
|
|
90
|
+
byScenario: Record<string, ScenarioAggregate>
|
|
91
|
+
totalCostUsd, cellsExecuted, cellsSkipped, cellsCached, cellsFailed
|
|
92
|
+
}
|
|
93
|
+
runDir, artifactsByPath, scenarios
|
|
94
|
+
}
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
**Rules:**
|
|
98
|
+
- `dispatch` must be a *named* function (`dispatch.name` feeds the manifest hash
|
|
99
|
+
— anonymous arrows weaken reproducibility identity).
|
|
100
|
+
- Inspect `cell.error` before trusting `cell.artifact`. Cells fail-soft
|
|
101
|
+
individually (one bad scenario doesn't kill the run) but the error is
|
|
102
|
+
recorded, never swallowed.
|
|
103
|
+
- Re-running the same `runDir` with `resumable: true` (default) skips cached
|
|
104
|
+
cells by `(manifestHash, scenarioId, rep)`.
|
|
105
|
+
|
|
106
|
+
`runEval(opts)` is a thin alias for the scorecard-only case (no improvement).
|
|
107
|
+
|
|
108
|
+
---
|
|
109
|
+
|
|
110
|
+
## 3. `JudgeConfig`, `Scenario` — the domain types you own
|
|
111
|
+
|
|
112
|
+
```ts
|
|
113
|
+
interface Scenario { id: string; kind: string; /* + your fields */ }
|
|
114
|
+
|
|
115
|
+
interface JudgeConfig<TArtifact, TScenario = Scenario> {
|
|
116
|
+
name: string
|
|
117
|
+
dimensions: { key: string; weight?: number }[]
|
|
118
|
+
appliesTo?: (scenario: TScenario) => boolean // scope a judge to some scenarios
|
|
119
|
+
score(args: { artifact: TArtifact; scenario: TScenario; signal: AbortSignal })
|
|
120
|
+
: Promise<JudgeScore> | JudgeScore
|
|
121
|
+
}
|
|
122
|
+
interface JudgeScore { composite: number; dimensions: Record<string, number>; notes: string }
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
Judges are where your rubric lives. They MUST fail loud: if the judge LLM call
|
|
126
|
+
fails, throw — do not return a `composite: 0` (a fake zero is indistinguishable
|
|
127
|
+
from a real zero and silently corrupts every aggregate downstream).
|
|
128
|
+
|
|
129
|
+
---
|
|
130
|
+
|
|
131
|
+
## 4. The improvement loop — `runImprovementLoop`
|
|
132
|
+
|
|
133
|
+
```ts
|
|
134
|
+
import {
|
|
135
|
+
runImprovementLoop, defaultProductionGate, evolutionaryDriver,
|
|
136
|
+
} from '@tangle-network/agent-eval/campaign'
|
|
137
|
+
|
|
138
|
+
const result = await runImprovementLoop({
|
|
139
|
+
// --- measurement config (same as runCampaign, minus dispatch) ---
|
|
140
|
+
scenarios: trainScenarios,
|
|
141
|
+
judges,
|
|
142
|
+
runDir,
|
|
143
|
+
// --- surface improvement ---
|
|
144
|
+
baselineSurface, // string | CodeSurface — current best
|
|
145
|
+
dispatchWithSurface, // (surface, scenario, ctx) => artifact
|
|
146
|
+
driver, // ImprovementDriver — see §5/§6
|
|
147
|
+
populationSize: 4, // BREADTH: candidates per generation
|
|
148
|
+
maxGenerations: 3,
|
|
149
|
+
promoteTopK: 2,
|
|
150
|
+
maxImprovementShots: 3, // DEPTH: forwarded to the driver's propose()
|
|
151
|
+
// --- gated promotion ---
|
|
152
|
+
holdoutScenarios, // NEVER in the training pool — gate scores on these
|
|
153
|
+
gate: defaultProductionGate({ holdoutScenarios, deltaThreshold: 0.02 }),
|
|
154
|
+
autoOnPromote: 'pr', // 'pr' | 'none' (NO 'config' in v0.40 — throws)
|
|
155
|
+
ghOwner: 'tangle-network',
|
|
156
|
+
ghRepo: 'gtm-agent', // required when autoOnPromote: 'pr'
|
|
157
|
+
})
|
|
158
|
+
// → { winnerSurface, winnerSurfaceHash, generations, baselineOnHoldout,
|
|
159
|
+
// winnerOnHoldout, gateResult, prResult? }
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
`runOptimization(opts)` is the loop body without the gate/holdout/PR (use it
|
|
163
|
+
when you want candidates + a winner but will gate yourself).
|
|
164
|
+
|
|
165
|
+
**Hard refusals (by design — these throw):**
|
|
166
|
+
- `autoOnPromote: 'config'` → deferred to a later pass (live self-mutation
|
|
167
|
+
needs the full safety stack). Use `'pr'` or `'none'`.
|
|
168
|
+
- `tracing: 'off'` while a `driver` is wired → an improvement loop that doesn't
|
|
169
|
+
feed the dataset is unattributable.
|
|
170
|
+
- `autoOnPromote: 'pr'` without `ghOwner`/`ghRepo`.
|
|
171
|
+
|
|
172
|
+
---
|
|
173
|
+
|
|
174
|
+
## 5. `ImprovementDriver` + `ProposeContext` — the contract
|
|
175
|
+
|
|
176
|
+
```ts
|
|
177
|
+
interface ImprovementDriver<TFindings = unknown> {
|
|
178
|
+
kind: string
|
|
179
|
+
propose(ctx: ProposeContext<TFindings>): Promise<MutableSurface[]> // PLAN
|
|
180
|
+
decide?(args: { history: GenerationRecord[] }): { stop: boolean; reason?: string }
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
interface ProposeContext<TFindings = unknown> {
|
|
184
|
+
currentSurface: MutableSurface
|
|
185
|
+
history: GenerationRecord[] // prior generations + scores
|
|
186
|
+
findings: TFindings[]
|
|
187
|
+
populationSize: number // how many candidates to return
|
|
188
|
+
generation: number
|
|
189
|
+
signal: AbortSignal
|
|
190
|
+
report?: unknown // Phase-2 research report (analyst findings + diff)
|
|
191
|
+
dataset?: LabeledScenarioStore // handle to all captured data
|
|
192
|
+
maxImprovementShots?: number // DEPTH knob
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
type MutableSurface = string | CodeSurface
|
|
196
|
+
interface CodeSurface { kind: 'code'; worktreeRef: string; baseRef?: string; summary?: string }
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
`propose()` returns candidates; it does NOT measure (the loop measures). For a
|
|
200
|
+
code-tier driver, `propose()` may itself be agentic (spawn a harness, write a
|
|
201
|
+
worktree) — that's the recursion. Pick a shipped driver (see §6).
|
|
202
|
+
|
|
203
|
+
---
|
|
204
|
+
|
|
205
|
+
## 6. The shipped drivers (use these; don't hand-roll)
|
|
206
|
+
|
|
207
|
+
### `evolutionaryDriver` (agent-eval) — prompt mutation, no sandbox
|
|
208
|
+
```ts
|
|
209
|
+
import { evolutionaryDriver } from '@tangle-network/agent-eval/campaign'
|
|
210
|
+
|
|
211
|
+
const driver = evolutionaryDriver({
|
|
212
|
+
mutator: { // YOUR Mutator (the only domain bit)
|
|
213
|
+
kind: 'reflection',
|
|
214
|
+
async mutate({ currentSurface, populationSize, findings, signal }) {
|
|
215
|
+
// return N prompt-string variants of currentSurface
|
|
216
|
+
return [...]
|
|
217
|
+
},
|
|
218
|
+
},
|
|
219
|
+
})
|
|
220
|
+
```
|
|
221
|
+
Use when the surface is a **prompt string** and you have a mutation strategy
|
|
222
|
+
(reflection, GEPA, AxGEPA). Cheap, deterministic-friendly.
|
|
223
|
+
|
|
224
|
+
### `improvementDriver` + generators (agent-runtime) — one driver, a cost dial
|
|
225
|
+
```ts
|
|
226
|
+
import {
|
|
227
|
+
improvementDriver, reflectiveGenerator, agenticGenerator,
|
|
228
|
+
} from '@tangle-network/agent-runtime/improvement'
|
|
229
|
+
import { gitWorktreeAdapter } from '@tangle-network/agent-eval/campaign'
|
|
230
|
+
|
|
231
|
+
const worktree = gitWorktreeAdapter({ repoRoot: '/abs/repo' })
|
|
232
|
+
|
|
233
|
+
// cheap, no sandbox: drafts patches from findings, applies them
|
|
234
|
+
const cheap = improvementDriver({
|
|
235
|
+
worktree,
|
|
236
|
+
generator: reflectiveGenerator({ improvementAdapter }), // wraps proposeFromFindings
|
|
237
|
+
baseRef: 'main',
|
|
238
|
+
})
|
|
239
|
+
|
|
240
|
+
// full agentic: a real coding harness edits the worktree, retries up to maxImprovementShots
|
|
241
|
+
const deep = improvementDriver({
|
|
242
|
+
worktree,
|
|
243
|
+
generator: agenticGenerator({ harness: 'claude' }), // claude | codex | opencode
|
|
244
|
+
baseRef: 'main',
|
|
245
|
+
})
|
|
246
|
+
```
|
|
247
|
+
One driver; the generator is the cost dial. Both emit `CodeSurface`s the loop
|
|
248
|
+
measures + gates. `agenticGenerator.generate()` runs the harness with
|
|
249
|
+
`cwd = worktree`, trusts the **git diff** (not harness stdout) to decide
|
|
250
|
+
"applied", and retries up to `maxImprovementShots` on a clean tree.
|
|
251
|
+
|
|
252
|
+
---
|
|
253
|
+
|
|
254
|
+
## 7. Gates — `defaultProductionGate`, `composeGate`, `heldOutGate`
|
|
255
|
+
|
|
256
|
+
```ts
|
|
257
|
+
import { defaultProductionGate, composeGate, heldOutGate } from '@tangle-network/agent-eval/campaign'
|
|
258
|
+
|
|
259
|
+
// opinionated default: heldout-delta + budget + red-team + reward-hacking + canary
|
|
260
|
+
const gate = defaultProductionGate({
|
|
261
|
+
holdoutScenarios,
|
|
262
|
+
deltaThreshold: 0.02, // winner must beat baseline by this on holdout
|
|
263
|
+
budgetUsd: 5, // optional cost ceiling
|
|
264
|
+
redTeamBattery: [...], // optional adversarial probes
|
|
265
|
+
})
|
|
266
|
+
|
|
267
|
+
// compose your own: ALL must ship, else the worst verdict wins
|
|
268
|
+
const custom = composeGate(heldOutGate({ scenarios: holdoutScenarios, deltaThreshold: 0.02 }), myDomainGate)
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
`Gate.decide(ctx) → GateResult` with a 5-valued verdict:
|
|
272
|
+
`GateDecision = 'ship' | 'hold' | 'need_more_work' | 'model_ceiling' | 'arch_ceiling'`.
|
|
273
|
+
`composeGate` returns `ship` only if all sub-gates ship; otherwise the
|
|
274
|
+
precedence is `arch_ceiling > model_ceiling > hold > need_more_work`. Use the
|
|
275
|
+
non-ship verdicts to route: `need_more_work` → more data, `model_ceiling` →
|
|
276
|
+
try a stronger model, `arch_ceiling` → the surface can't fix it.
|
|
277
|
+
|
|
278
|
+
`openAutoPr({ result, gate, promotedDiff, ghOwner, ghRepo })` opens the PR —
|
|
279
|
+
**refuses unless `gate.decision === 'ship'`**, dry-runs without a GH token.
|
|
280
|
+
|
|
281
|
+
---
|
|
282
|
+
|
|
283
|
+
## 8. The dataset flywheel — `FsLabeledScenarioStore`
|
|
284
|
+
|
|
285
|
+
```ts
|
|
286
|
+
import { FsLabeledScenarioStore } from '@tangle-network/agent-eval/campaign'
|
|
287
|
+
|
|
288
|
+
const store = new FsLabeledScenarioStore({ root: '/abs/dataset', maxWritesPerMinutePerBucket: 60 })
|
|
289
|
+
// pass to runCampaign({ labeledStore: store, captureSource: 'production-trace' })
|
|
290
|
+
```
|
|
291
|
+
Every campaign cell captures `(scenario, artifact, judgeScore, source)`. This
|
|
292
|
+
corpus IS the optimizer's training set. Discipline enforced at the store:
|
|
293
|
+
- **provenance required** on every write (source / sourceVersionHash /
|
|
294
|
+
capturedAt / redactionStatus).
|
|
295
|
+
- **temporal split**: `sample()` requires explicit `split` + `capturedBefore`.
|
|
296
|
+
- **`production-trace` is excluded from the train split by default** (no
|
|
297
|
+
contamination of the holdout it's judged against).
|
|
298
|
+
|
|
299
|
+
---
|
|
300
|
+
|
|
301
|
+
## 9. The migration recipe (what to DELETE / KEEP / REWIRE)
|
|
302
|
+
|
|
303
|
+
For a product that already has eval + prompt-evolution wrappers:
|
|
304
|
+
|
|
305
|
+
**DELETE (orchestration the substrate now owns):**
|
|
306
|
+
- generation/population/top-K loops, trial-matrix construction, frontier
|
|
307
|
+
tracking, seed plumbing, manifest hashing, cell caching, scorecard
|
|
308
|
+
aggregation, CI math, PR-opening scaffolding, worktree git commands.
|
|
309
|
+
- any local `runProductionLoop` / `runPromptEvolution` / `runAnalystLoop`
|
|
310
|
+
wrapper whose body is a loop over generations × candidates × reps.
|
|
311
|
+
|
|
312
|
+
**KEEP (domain logic — it does not move):**
|
|
313
|
+
- scenarios (your eval inputs) → become `scenarios`.
|
|
314
|
+
- judges/rubrics/dimension weights → become `judges`.
|
|
315
|
+
- the agent-invocation function → becomes `dispatch` / `dispatchWithSurface`.
|
|
316
|
+
- the mutation strategy (reflection prompt) → becomes a `Mutator` or a
|
|
317
|
+
generator's `buildPrompt`.
|
|
318
|
+
- domain gates (e.g. anti-fabrication) → compose with `defaultProductionGate`.
|
|
319
|
+
|
|
320
|
+
**REWIRE:**
|
|
321
|
+
- `buildHoldoutRunner()` → `dispatchWithSurface`.
|
|
322
|
+
- `buildScorer()` → `judges`.
|
|
323
|
+
- `buildMutator()` → `evolutionaryDriver({ mutator })`.
|
|
324
|
+
- `runProductionLoop(...)` → `runImprovementLoop(...)`.
|
|
325
|
+
- `runPromptEvolution(...)` → `runImprovementLoop` (surface = prompt string).
|
|
326
|
+
- `runAnalystLoop(...)` improvement step → `improvementDriver` + a generator;
|
|
327
|
+
its findings-ledger + knowledge-graph writes stay.
|
|
328
|
+
|
|
329
|
+
Net for a typical consumer: ~2,400 LOC of orchestration deleted, ~800 LOC
|
|
330
|
+
rewired into the three seams.
|
|
331
|
+
|
|
332
|
+
---
|
|
333
|
+
|
|
334
|
+
## 10. Forbidden anti-patterns (a review will reject these)
|
|
335
|
+
|
|
336
|
+
1. **No silent fallbacks.** No `catch { return null }`, no `?? 0` on a judge
|
|
337
|
+
composite, no returning `false`/empty on an error you can't interpret.
|
|
338
|
+
External-boundary calls return typed outcomes or throw. A git/LLM/subprocess
|
|
339
|
+
failure is a *throw*, never a fold-into-a-default.
|
|
340
|
+
2. **Don't reimplement the loop.** If you write a `for (gen of generations)`
|
|
341
|
+
that mutates + scores + selects, you've rebuilt the substrate. Stop; call
|
|
342
|
+
`runImprovementLoop`.
|
|
343
|
+
3. **Don't conflate train and holdout.** Holdout scenarios never enter the
|
|
344
|
+
training pool. The gate scores on holdout only.
|
|
345
|
+
4. **Don't trust harness stdout.** For code edits, the git diff is the truth,
|
|
346
|
+
not what the agent says it did.
|
|
347
|
+
5. **Account for every worktree.** A created worktree is finalized into a
|
|
348
|
+
surface or discarded — never leaked, even on throw (the shipped
|
|
349
|
+
`improvementDriver` already guarantees this; preserve it if you extend).
|
|
350
|
+
6. **Don't auto-deploy.** Promotion opens a PR (`autoOnPromote: 'pr'`). Live
|
|
351
|
+
self-mutation (`'config'`) is deferred behind the full safety stack.
|
|
352
|
+
7. **Tracing stays on when improving.** The loop refuses `tracing: 'off'` with
|
|
353
|
+
a driver wired — the dataset must be fed.
|
|
354
|
+
8. **Name your `dispatch`.** Anonymous dispatch weakens the manifest-hash
|
|
355
|
+
reproducibility identity.
|
|
356
|
+
|
|
357
|
+
---
|
|
358
|
+
|
|
359
|
+
## 11. Minimal end-to-end skeleton
|
|
360
|
+
|
|
361
|
+
```ts
|
|
362
|
+
import {
|
|
363
|
+
runImprovementLoop, defaultProductionGate, evolutionaryDriver,
|
|
364
|
+
FsLabeledScenarioStore,
|
|
365
|
+
} from '@tangle-network/agent-eval/campaign'
|
|
366
|
+
|
|
367
|
+
const store = new FsLabeledScenarioStore({ root: '.dataset' })
|
|
368
|
+
|
|
369
|
+
async function dispatchWithSurface(surface: string, scenario: MyScenario) {
|
|
370
|
+
return runMyAgent({ systemPrompt: surface, input: scenario }) // → MyArtifact
|
|
371
|
+
}
|
|
372
|
+
|
|
373
|
+
const judges = [{
|
|
374
|
+
name: 'quality',
|
|
375
|
+
dimensions: [{ key: 'grounding' }, { key: 'actionability' }],
|
|
376
|
+
async score({ artifact, scenario }) { /* → JudgeScore, throw on failure */ },
|
|
377
|
+
}]
|
|
378
|
+
|
|
379
|
+
const result = await runImprovementLoop<MyScenario, MyArtifact>({
|
|
380
|
+
scenarios: train, holdoutScenarios: holdout, judges,
|
|
381
|
+
baselineSurface: CURRENT_PROMPT,
|
|
382
|
+
dispatchWithSurface,
|
|
383
|
+
driver: evolutionaryDriver({ mutator: myReflectionMutator }),
|
|
384
|
+
populationSize: 4, maxGenerations: 3, promoteTopK: 2,
|
|
385
|
+
gate: defaultProductionGate({ holdoutScenarios: holdout, deltaThreshold: 0.02 }),
|
|
386
|
+
autoOnPromote: 'pr', ghOwner: 'tangle-network', ghRepo: 'my-agent',
|
|
387
|
+
runDir: '.runs/improve', labeledStore: store, captureSource: 'eval-run',
|
|
388
|
+
})
|
|
389
|
+
|
|
390
|
+
if (result.gateResult.decision === 'ship') console.log('PR:', result.prResult?.prUrl)
|
|
391
|
+
```
|
|
392
|
+
|
|
393
|
+
That is the whole integration. Everything not in this skeleton is substrate.
|