@tangle-network/agent-eval 0.43.1 → 0.44.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
  import { R as RunRecord } from './run-record-BGY6bHRh.js';
2
- import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
2
+ import { O as OutcomeStore } from './outcome-store-BxJ3DQKJ.js';
3
3
 
4
4
  /**
5
5
  * Rubric predictive validity — does our eval rubric predict deployment
@@ -0,0 +1,401 @@
1
+ import { S as Scenario, d as CampaignResult, j as GateResult, o as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, J as JudgeConfig, L as LabeledScenarioStore, e as CampaignTraceWriter, M as MutableSurface, l as GenerationRecord } from './types-DToGONFA.js';
2
+ import { L as LlmClientOptions } from './llm-client-BXVRUZyX.js';
3
+ import { RunRecord } from '@tangle-network/agent-runtime';
4
+ import { R as RedTeamCase } from './red-team-30II1T4o.js';
5
+
6
+ /**
7
+ * @experimental
8
+ *
9
+ * `openAutoPr` — thin shell-out helper for the `runImprovementLoop` preset's
10
+ * `autoOnPromote: 'pr'` mode. Substitutes for the per-product PR-opening
11
+ * code consumers duplicated 4 times. The PR body includes the campaign's
12
+ * manifest hash, gate verdict, and scorecard summary so reviewers can see
13
+ * exactly what was promoted + why.
14
+ *
15
+ * NOT a deploy mechanism — this only OPENS a PR. The human reviews + merges.
16
+ * The Shape B (`autoOnPromote: 'config'`) live-runtime-mutation path is
17
+ * deferred to Pass B with the full shadow / canary / rollback stack.
18
+ */
19
+
20
+ interface OpenAutoPrOptions<TArtifact, TScenario extends Scenario> {
21
+ /** Campaign result to attach to the PR. */
22
+ result: CampaignResult<TArtifact, TScenario>;
23
+ /** Gate verdict explaining the promotion. Substrate refuses to open a PR
24
+ * when `gate.decision !== 'ship'` — fails loud. */
25
+ gate: GateResult;
26
+ /** Promoted surface diff — typically the new system prompt addendum or
27
+ * full profile diff. Substrate embeds it in the PR body. */
28
+ promotedDiff: string;
29
+ /** GH owner/repo target (e.g., `tangle-network/gtm-agent`). */
30
+ ghOwner: string;
31
+ ghRepo: string;
32
+ /** Branch name for the PR. Default `auto/<manifestHash[:12]>`. */
33
+ branch?: string;
34
+ /** PR title. Default includes manifest hash. */
35
+ title?: string;
36
+ /** When true, only dry-run — no PR is opened. Default is derived from the
37
+ * `GH_AUTO_PR_TOKEN` env var — present = open, absent = dry-run. */
38
+ dryRun?: boolean;
39
+ /** Test seam — substitute `gh pr create` invocation. */
40
+ ghExec?: (args: string[]) => {
41
+ stdout: string;
42
+ stderr: string;
43
+ status: number;
44
+ };
45
+ }
46
+ interface OpenAutoPrResult {
47
+ opened: boolean;
48
+ prUrl?: string;
49
+ dryRun: boolean;
50
+ reason: string;
51
+ }
52
+ declare function openAutoPr<TArtifact, TScenario extends Scenario>(options: OpenAutoPrOptions<TArtifact, TScenario>): OpenAutoPrResult;
53
+
54
+ /**
55
+ * @experimental
56
+ *
57
+ * `evolutionaryDriver` — adapts a stateless `Mutator` (population mutation:
58
+ * GEPA / AxGEPA / reflective-mutation) into an `ImprovementDriver`. This is
59
+ * the evolutionary strategy: each generation, mutate the current best surface
60
+ * into N candidates, measure, select. No generation memory beyond the current
61
+ * surface; the loop body handles ranking + promotion.
62
+ *
63
+ * The reflective alternative is agent-runtime's `improvementDriver` with a
64
+ * `reflectiveGenerator` / `agenticGenerator`: it reasons over the report +
65
+ * trace findings to propose targeted edits rather than blind mutations. Both
66
+ * conform to `ImprovementDriver`; the improvement loop is identical regardless
67
+ * of which drives it.
68
+ */
69
+
70
+ interface EvolutionaryDriverOptions<TFindings = unknown> {
71
+ mutator: Mutator<TFindings>;
72
+ /** External findings fed to the mutator each generation. Default: []. */
73
+ findings?: TFindings[];
74
+ }
75
+ declare function evolutionaryDriver<TFindings = unknown>(opts: EvolutionaryDriverOptions<TFindings>): ImprovementDriver<TFindings>;
76
+
77
+ /**
78
+ * @experimental
79
+ *
80
+ * `gepaDriver` — a reflective `ImprovementDriver` for prompt-tier surfaces.
81
+ * Each generation it reflects on the prior best candidate's per-scenario
82
+ * scores + weakest dimensions (the `GenerationCandidate` evidence from
83
+ * `runOptimization`), asks an LLM to propose targeted rewrites of the current
84
+ * surface, and returns them as the next population.
85
+ *
86
+ * This is the substrate's best-in-class prompt optimizer: surface-agnostic, so
87
+ * ANY string surface in ANY consumer opts in by selecting it — system prompts,
88
+ * prompt addenda, judge/reviewer prompts, even a driver's own reflection
89
+ * prompt. It reuses the generic reflection primitive (`buildReflectionPrompt` /
90
+ * `parseReflectionResponse`) and the router client; it has NO dependency on the
91
+ * legacy `runMultiShotOptimization` / `prompt-evolution` orchestration.
92
+ *
93
+ * It earns its keep where there is real per-instance signal (which the
94
+ * dimensional + per-scenario evidence + the `LabeledScenarioStore` flywheel
95
+ * now provide). For thin-signal surfaces it degrades to plain reflection — so
96
+ * it is a SELECTABLE driver, never a forced default. On generation 0 (no
97
+ * history) it reflects on the current surface against the mutation primitives
98
+ * alone.
99
+ */
100
+
101
+ interface GepaDriverOptions {
102
+ /** Router transport (apiKey/baseUrl). */
103
+ llm: LlmClientOptions;
104
+ /** Model that performs the reflection. */
105
+ model: string;
106
+ /** What is being optimized — appears in the reflection prompt for orientation. */
107
+ target: string;
108
+ /** Surface-specific mutation levers offered to the model. */
109
+ mutationPrimitives?: string[];
110
+ /** Top/bottom scenarios surfaced as evidence each generation. Default 3. */
111
+ evidenceK?: number;
112
+ /** Reflection sampling temperature. Default 0.7. */
113
+ temperature?: number;
114
+ /** Reflection max tokens. Default 6000. */
115
+ maxTokens?: number;
116
+ }
117
+ declare function gepaDriver(opts: GepaDriverOptions): ImprovementDriver;
118
+
119
+ /**
120
+ * @experimental
121
+ *
122
+ * Compose multiple `Gate` implementations — every gate must pass for the
123
+ * composite to ship. Closes the alignment reviewer's "default-only
124
+ * heldOutGate + costGate would happily promote a reward-hacked prompt"
125
+ * concern by making safety gates first-class composable defaults.
126
+ */
127
+
128
+ /** Compose gates — all must `ship` for the composite to `ship`. The first
129
+ * non-ship verdict decides the composite verdict, but ALL gates still run
130
+ * (so the result records every gate's reason — useful for diagnostics). */
131
+ declare function composeGate<TArtifact = unknown, TScenario extends Scenario = Scenario>(...gates: Array<Gate<TArtifact, TScenario>>): Gate<TArtifact, TScenario>;
132
+
133
+ /**
134
+ * @experimental
135
+ *
136
+ * `defaultProductionGate` — composes the substrate's existing safety
137
+ * primitives (red-team / reward-hacking / canary / heldout) into a single
138
+ * Gate.decide shape. Closes the alignment + Anthropic-SI reviewers' "safety
139
+ * primitives are off the critical path" blocker.
140
+ *
141
+ * The composition is opinionated — when consumers wire `runImprovementLoop`,
142
+ * THIS gate is the default. Consumers can still pass a custom gate to
143
+ * override; the recommended pattern is to compose THIS gate with whatever
144
+ * extra domain-specific gates they need (`composeGate(defaultProductionGate(...), customGate)`).
145
+ */
146
+
147
+ interface DefaultProductionGateOptions {
148
+ /** Required: scenarios held out from training; substrate compares
149
+ * candidate-on-holdout vs baseline-on-holdout. */
150
+ holdoutScenarios: Scenario[];
151
+ /** Minimum mean-composite improvement required to ship. Default 0.5. */
152
+ deltaThreshold?: number;
153
+ /** Total $ budget for ALL cells in this campaign — including baseline + candidate.
154
+ * Composite verdict refuses to ship when spend exceeds the budget. */
155
+ budgetUsd?: number;
156
+ /** Red-team cases to probe candidate outputs against. When omitted the
157
+ * substrate uses `DEFAULT_RED_TEAM_CORPUS`. Provide a domain-specific
158
+ * battery for tighter coverage. */
159
+ redTeamBattery?: RedTeamCase[];
160
+ /** Run records (oldest-first) needed for the reward-hacking detector.
161
+ * Substrate populates from prior production-loop generations. */
162
+ recentRuns?: RunRecord[];
163
+ /** When true, the gate refuses to ship if the reward-hacking detector
164
+ * fires at the `gaming` severity. Default true. */
165
+ blockOnRewardHackingGaming?: boolean;
166
+ }
167
+ declare function defaultProductionGate<TArtifact, TScenario extends Scenario>(options: DefaultProductionGateOptions): Gate<TArtifact, TScenario>;
168
+
169
+ /**
170
+ * @experimental
171
+ *
172
+ * Thin Gate adapter — exposes delta-threshold-on-holdout as a composable
173
+ * `Gate`. Use when you want held-out as one of N composed gates instead of
174
+ * the full `defaultProductionGate` stack.
175
+ */
176
+
177
+ interface HeldOutGateOptions<TScenario extends Scenario = Scenario> {
178
+ scenarios: TScenario[];
179
+ deltaThreshold?: number;
180
+ }
181
+ declare function heldOutGate<TArtifact, TScenario extends Scenario>(options: HeldOutGateOptions<TScenario>): Gate<TArtifact, TScenario>;
182
+
183
+ /**
184
+ * @experimental
185
+ *
186
+ * `CampaignStorage` — the filesystem seam `runCampaign` writes through
187
+ * (run/cell dirs, the resumability cache, per-cell artifacts, trace spans).
188
+ *
189
+ * The default (`fsCampaignStorage`) is the Node filesystem — identical
190
+ * behavior to the inline `node:fs` calls it replaces, so existing CLI
191
+ * consumers are unaffected. `inMemoryCampaignStorage` keeps everything in a
192
+ * `Map`, so the substrate runs in environments WITHOUT a filesystem
193
+ * (Cloudflare Workers, Deno Deploy, other edge runtimes) — the campaign
194
+ * still produces its `CampaignResult` (cells + aggregates) in memory;
195
+ * artifacts/traces simply aren't persisted to disk.
196
+ *
197
+ * Paths are opaque keys to the in-memory adapter — it does not parse them,
198
+ * so the same `join(...)`-built paths work unchanged across both adapters.
199
+ */
200
+ interface CampaignStorage {
201
+ /** Ensure a directory exists (recursive). No-op for in-memory. */
202
+ ensureDir(dir: string): void;
203
+ /** Does this path exist (as a written file or an ensured dir)? */
204
+ exists(path: string): boolean;
205
+ /** Read a UTF-8 file; `undefined` when missing or unreadable. */
206
+ read(path: string): string | undefined;
207
+ /** Write a file (string or bytes). Parent dir is assumed ensured. */
208
+ write(path: string, content: string | Uint8Array): void;
209
+ }
210
+ /** Node-filesystem storage — the default. Lazily requires `node:fs` so the
211
+ * module imports cleanly in non-Node runtimes (where the caller passes
212
+ * `inMemoryCampaignStorage` instead and never constructs this). */
213
+ declare function fsCampaignStorage(): CampaignStorage;
214
+ /** In-memory storage for filesystem-less runtimes. Artifacts + trace spans
215
+ * live in a `Map` for the duration of the run; the `CampaignResult` is
216
+ * fully populated, but nothing is persisted to disk. */
217
+ declare function inMemoryCampaignStorage(): CampaignStorage;
218
+
219
+ /**
220
+ * @experimental
221
+ *
222
+ * `runCampaign` — Pass A substrate primitive. ONE function that orchestrates
223
+ * scenarios → dispatch → artifacts → judges → aggregates, with full
224
+ * reproducibility (seed + manifest hash), cell-level resumability, bootstrap
225
+ * CIs, and the `LabeledScenarioStore` capture flywheel.
226
+ *
227
+ * Improvement loops (optimizer / gate / autoOnPromote) ride on top of this
228
+ * primitive but live in `presets/run-improvement-loop.ts`. This file keeps
229
+ * the core orchestrator minimal — Phase 1 of the Pass A track.
230
+ */
231
+
232
+ interface RunCampaignOptions<TScenario extends Scenario, TArtifact> {
233
+ scenarios: TScenario[];
234
+ dispatch: DispatchFn<TScenario, TArtifact>;
235
+ judges?: JudgeConfig<TArtifact, TScenario>[];
236
+ /** Seed for reproducibility. Optional; defaults to 42. */
237
+ seed?: number;
238
+ /** Per-scenario replicates for CI bands. Default 1; raise to 5+ for
239
+ * bootstrap-tight intervals on critical eval. */
240
+ reps?: number;
241
+ /** When true (default), completed cells are cached by
242
+ * (manifestHash, scenarioId, rep, generation). Re-runs skip cached cells. */
243
+ resumable?: boolean;
244
+ /** Optional store — when present, every artifact + judge score is captured
245
+ * with the configured `captureSource`. Capture is default ON; pass `'off'`
246
+ * to disable. */
247
+ labeledStore?: LabeledScenarioStore | 'off';
248
+ captureSource?: 'production-trace' | 'eval-run' | 'manual' | 'red-team' | 'synthetic';
249
+ captureSourceVersionHash?: string;
250
+ /** Wall-clock cost cap across all cells. Cells beyond ceiling are skipped. */
251
+ costCeiling?: number;
252
+ /** Max concurrent cells. Default 2. */
253
+ maxConcurrency?: number;
254
+ /** Required: where artifacts + traces land. */
255
+ runDir: string;
256
+ /** Tracing posture. Default is the substrate's `FileSystemTraceStore` rooted
257
+ * at `<runDir>/traces/`. `'off'` disables capture entirely — substrate
258
+ * refuses this when the caller wires `autoOnPromote !== 'none'`. */
259
+ tracing?: 'on' | 'off';
260
+ /** Test seam — override the wall clock for deterministic tests. */
261
+ now?: () => Date;
262
+ /** Test seam — override per-cell trace writer factory. */
263
+ buildTraceWriter?: (cellId: string, dir: string) => CampaignTraceWriter;
264
+ /** Storage backend for run/cell dirs, the resumability cache, artifacts,
265
+ * and trace spans. Default: the Node filesystem (`fsCampaignStorage`).
266
+ * Pass `inMemoryCampaignStorage()` to run in a filesystem-less runtime
267
+ * (Cloudflare Workers, Deno, edge) — the `CampaignResult` is still
268
+ * produced; artifacts/traces just aren't persisted to disk. */
269
+ storage?: CampaignStorage;
270
+ }
271
+ declare function runCampaign<TScenario extends Scenario, TArtifact>(opts: RunCampaignOptions<TScenario, TArtifact>): Promise<CampaignResult<TArtifact, TScenario>>;
272
+
273
+ /**
274
+ * @experimental
275
+ *
276
+ * `runEval` — the simplest preset over `runCampaign`. No optimizer, no
277
+ * gate, no auto-PR. Just: run scenarios through dispatch, score with
278
+ * judges, return CampaignResult.
279
+ *
280
+ * The 80% case for consumers who want a scorecard, not an improvement loop.
281
+ */
282
+
283
+ interface RunEvalOptions<TScenario extends Scenario, TArtifact> extends Omit<RunCampaignOptions<TScenario, TArtifact>, 'runDir'> {
284
+ runDir: string;
285
+ }
286
+ declare function runEval<TScenario extends Scenario, TArtifact>(opts: RunEvalOptions<TScenario, TArtifact>): Promise<CampaignResult<TArtifact, TScenario>>;
287
+
288
+ /**
289
+ * @experimental
290
+ *
291
+ * `runOptimization` — the improvement loop body. Runs N generations: the
292
+ * `ImprovementDriver` proposes K candidate surfaces per generation, each
293
+ * candidate runs a campaign (the measurement), top-scoring promote to the
294
+ * next generation. Driver-agnostic — the same loop runs an evolutionary
295
+ * population mutator (`evolutionaryDriver`) or agent-runtime's
296
+ * `improvementDriver` (reflective / agentic generators); they differ only in
297
+ * how `propose()` picks candidates.
298
+ *
299
+ * This is `runLoop`'s shape (plan → measure → decide) specialized to surface
300
+ * improvement: `driver.propose` = plan, `runCampaign` = the measurement (which
301
+ * runs the worker behind `dispatch`), the mean-composite ranking = the
302
+ * validator, `driver.decide` = the stop check.
303
+ *
304
+ * The gated-promotion shell (`runImprovementLoop`) wraps this with a holdout
305
+ * re-score + release gate + optional PR.
306
+ */
307
+
308
+ interface RunOptimizationOptions<TScenario extends Scenario, TArtifact> extends Omit<RunCampaignOptions<TScenario, TArtifact>, 'dispatch'> {
309
+ /** Initial mutable surface (typically system prompt or addendum). */
310
+ baselineSurface: MutableSurface;
311
+ /** Dispatcher that takes the CURRENT surface + scenario → artifact. */
312
+ dispatchWithSurface: (surface: MutableSurface, scenario: TScenario, ctx: Parameters<RunCampaignOptions<TScenario, TArtifact>['dispatch']>[1]) => Promise<TArtifact>;
313
+ /** The improvement strategy. Wrap a population `Mutator` via
314
+ * `evolutionaryDriver({ mutator })`, or pass agent-runtime's
315
+ * `improvementDriver` (reflective / agentic generators). */
316
+ driver: ImprovementDriver;
317
+ populationSize: number;
318
+ maxGenerations: number;
319
+ /** How many top-scoring candidates carry to the next generation. Default 2. */
320
+ promoteTopK?: number;
321
+ /** DEPTH knob forwarded to the driver's `propose()` — max iterations the
322
+ * agentic generator may take per candidate. */
323
+ maxImprovementShots?: number;
324
+ /** Phase-2 research report forwarded to `propose()` (analyst findings +
325
+ * diff). Opaque here; the driver types it. */
326
+ report?: unknown;
327
+ }
328
+ interface RunOptimizationResult<TArtifact, TScenario extends Scenario> {
329
+ generations: Array<{
330
+ record: GenerationRecord;
331
+ surfaces: Array<{
332
+ surfaceHash: string;
333
+ surface: MutableSurface;
334
+ campaign: CampaignResult<TArtifact, TScenario>;
335
+ }>;
336
+ }>;
337
+ winnerSurface: MutableSurface;
338
+ winnerSurfaceHash: string;
339
+ baselineCampaign: CampaignResult<TArtifact, TScenario>;
340
+ }
341
+ declare function runOptimization<TScenario extends Scenario, TArtifact>(opts: RunOptimizationOptions<TScenario, TArtifact>): Promise<RunOptimizationResult<TArtifact, TScenario>>;
342
+ declare function surfaceHash(surface: MutableSurface): string;
343
+
344
+ /**
345
+ * @experimental
346
+ *
347
+ * `runImprovementLoop` — the gated-promotion shell around the improvement
348
+ * loop body (`runOptimization`). Drives candidate surfaces via the
349
+ * `ImprovementDriver`, re-scores the winner against the baseline on a
350
+ * holdout set, runs the release gate, and optionally opens a PR.
351
+ *
352
+ * Role vocabulary (see docs/design/loop-taxonomy.md):
353
+ * - DRIVER = the `ImprovementDriver` (evolutionary GEPA mutator OR
354
+ * reflective analyst). Proposes candidate SURFACES — the
355
+ * worker's system prompt / tool config — NOT conversation
356
+ * turns.
357
+ * - MEASUREMENT= `runCampaign`. Scores one surface by running the worker
358
+ * (via `dispatch`) over scenarios and judging the output.
359
+ * - WORKER = the agent harness in the sandbox, invoked behind the
360
+ * topology-opaque `dispatch` seam — never referenced here.
361
+ *
362
+ * Distinct from `runLoop` in `@tangle-network/agent-runtime`, which is the
363
+ * INNER conversation loop (driver↔workers in a sandbox). `runImprovementLoop`
364
+ * is the OUTER loop: it improves the surface that those workers run.
365
+ *
366
+ * Hard-refuses unsafe configurations:
367
+ * - `tracing: 'off'` when a driver is wired (improvement is unattributable)
368
+ * - `autoOnPromote: 'config'` — DEFERRED to Pass B; v0.40 only ships
369
+ * `'pr'` and `'none'`.
370
+ */
371
+
372
+ interface RunImprovementLoopOptions<TScenario extends Scenario, TArtifact> extends RunOptimizationOptions<TScenario, TArtifact> {
373
+ /** Holdout scenarios kept OUT of the training optimization pool — used
374
+ * ONLY to score baseline vs winner for the gate. */
375
+ holdoutScenarios: TScenario[];
376
+ /** Promotion gate. Substrate strongly recommends `defaultProductionGate`
377
+ * for production wiring (composes red-team / reward-hacking / canary /
378
+ * heldout). */
379
+ gate: Gate<TArtifact, TScenario>;
380
+ /** What to do when the gate ships:
381
+ * - `'pr'`: open a PR via `openAutoPr`
382
+ * - `'none'`: just report — caller decides what to do with the winner
383
+ * v0.40 does NOT support `'config'` (live-runtime self-mutation) —
384
+ * deferred to Pass B behind safety stack. */
385
+ autoOnPromote: 'pr' | 'none';
386
+ /** GH owner / repo for the auto-PR. Required when autoOnPromote === 'pr'. */
387
+ ghOwner?: string;
388
+ ghRepo?: string;
389
+ /** Optional render override — substrate writes a diff-shaped surface; pass
390
+ * a function to format the promoted surface differently. */
391
+ renderPromotedDiff?: (winnerSurface: MutableSurface, baselineSurface: MutableSurface) => string;
392
+ }
393
+ interface RunImprovementLoopResult<TArtifact, TScenario extends Scenario> extends RunOptimizationResult<TArtifact, TScenario> {
394
+ baselineOnHoldout: CampaignResult<TArtifact, TScenario>;
395
+ winnerOnHoldout: CampaignResult<TArtifact, TScenario>;
396
+ gateResult: Awaited<ReturnType<Gate<TArtifact, TScenario>['decide']>>;
397
+ prResult?: ReturnType<typeof openAutoPr>;
398
+ }
399
+ declare function runImprovementLoop<TScenario extends Scenario, TArtifact>(opts: RunImprovementLoopOptions<TScenario, TArtifact>): Promise<RunImprovementLoopResult<TArtifact, TScenario>>;
400
+
401
+ export { type CampaignStorage as C, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, type GepaDriverOptions as G, type HeldOutGateOptions as H, type OpenAutoPrOptions as O, type RunCampaignOptions as R, type RunEvalOptions as a, type RunImprovementLoopOptions as b, type RunImprovementLoopResult as c, composeGate as d, defaultProductionGate as e, evolutionaryDriver as f, fsCampaignStorage as g, gepaDriver as h, heldOutGate as i, inMemoryCampaignStorage as j, runEval as k, runImprovementLoop as l, type OpenAutoPrResult as m, type RunOptimizationOptions as n, type RunOptimizationResult as o, openAutoPr as p, runOptimization as q, runCampaign as r, surfaceHash as s };
@@ -364,4 +364,4 @@ interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scena
364
364
  scenarios: Array<Pick<TScenario, 'id' | 'kind'>>;
365
365
  }
366
366
 
367
- export type { CampaignResult as C, DispatchFn as D, GateResult as G, ImprovementDriver as I, JudgeConfig as J, LabeledScenarioStore as L, Mutator as M, OptimizerConfig as O, ProposeContext as P, RedactionStatus as R, Scenario as S, TraceSpan as T, Gate as a, LabeledScenarioWrite as b, LabeledScenarioSampleArgs as c, LabeledScenarioRecord as d, CampaignTraceWriter as e, MutableSurface as f, GenerationRecord as g, CodeSurface as h, CampaignAggregates as i, CampaignArtifactWriter as j, CampaignCellResult as k, CampaignCostMeter as l, DispatchContext as m, GateContext as n, GateDecision as o, GenerationCandidate as p, JudgeAggregate as q, JudgeDimension as r, JudgeScore as s, LabeledScenarioSource as t, ScenarioAggregate as u, SessionScript as v };
367
+ export type { CampaignAggregates as C, DispatchFn as D, Gate as G, ImprovementDriver as I, JudgeConfig as J, LabeledScenarioStore as L, MutableSurface as M, OptimizerConfig as O, ProposeContext as P, RedactionStatus as R, Scenario as S, TraceSpan as T, CampaignArtifactWriter as a, CampaignCellResult as b, CampaignCostMeter as c, CampaignResult as d, CampaignTraceWriter as e, CodeSurface as f, DispatchContext as g, GateContext as h, GateDecision as i, GateResult as j, GenerationCandidate as k, GenerationRecord as l, JudgeDimension as m, JudgeScore as n, Mutator as o, SessionScript as p, LabeledScenarioWrite as q, LabeledScenarioSampleArgs as r, LabeledScenarioRecord as s, JudgeAggregate as t, LabeledScenarioSource as u, ScenarioAggregate as v };
@@ -0,0 +1,89 @@
1
+ # External-Agent Wedge — plug-in self-improvement for any agent
2
+
3
+ **Status:** proposed · **Owner:** Drew · **Tracking:** ops-board #507 (epic) · **Updated:** 2026-05-26
4
+
5
+ > One sentence: **expose the self-improvement engine our own fleet runs on as a drop-in library any agent builder can install — land free, bill the hosted intelligence.**
6
+
7
+ This doc locks direction so the team executes without thrash. It is deliberately short. If a question isn't answered here, the default is "do the cheapest thing that keeps the LAND tier free and the EXPAND tier billable."
8
+
9
+ ---
10
+
11
+ ## Why now
12
+
13
+ Repeated signal from agent founders (latest: a high-quality GTM/social-marketing agent, sharp CTO): **no evals, no observability, hand-built, no way for the agent to improve itself.** This is the median serious agent team. They don't need another trace viewer — they need their agent to *get better on its own* and proof that it did.
14
+
15
+ We already built that engine and hardened it the hard way: **6 of our own agents** (tax, legal, creative, gtm, agent-builder, physim) now run the *same* `runCampaign` / `runImprovementLoop` / `gepaDriver` / gate surface. It is `@tangle-network/agent-eval` — already on npm, runtime-agnostic, dogfooded in production. Exposing it externally is distribution of an asset we already own, not a new product to invent.
16
+
17
+ ## Positioning — self-improvement first, not observability
18
+
19
+ If we sell "observability," we are a worse Langfuse/Braintrust/Arize and it's a margin race. Our wedge is the **closed self-improvement loop** (eval campaign → reflective `gepaDriver` prompt optimization → gated promotion → Bradley-Terry tournaments → predictive-validity calibration). Observability is the *byproduct*, not the pitch.
20
+
21
+ **The pitch:** "Plug us in. Your agent runs a closed self-improvement loop against your own use case, gated so it never ships a regression — and you get the eval + trace observability for free as it does."
22
+
23
+ ### Backend-agnostic by design — the sandbox is swappable, not required
24
+
25
+ The whole stack already runs **without our sandbox**, at two granularities:
26
+ - **`agent-eval`** (the self-improvement engine) touches the sandbox only as a *type* — the LAND tier needs no sandbox.
27
+ - **`agent-runtime`** is runtime-decoupled too: every sandbox import is `import type`, and the sandbox backend takes an *injected* instance. Its `Backend` seam already ships `createOpenAICompatibleBackend` (pure LLM) and `createIterableBackend` (bring-your-own) next to `createSandboxPromptBackend`. A builder runs the full runtime + self-improvement loop on **their own execution backend**, no Tangle sandbox installed.
28
+
29
+ So adoption is *graduated*, and the builder picks the depth: (1) **trace-analysis only** over their existing runtime; (2) the **full self-improvement loop** on their own backend; (3) **our sandbox as a swap-in backend** for the batteries. Our sandbox becomes the best *option*, never the cost of entry. The only work to make this explicit: mark the `@tangle-network/sandbox` peer `optional` in agent-runtime + document the `Backend` interface as the swap seam (Phase A).
30
+
31
+ ## The surface — three tiers (land → expand → platform)
32
+
33
+ | Tier | What they do | What they get | Billing |
34
+ |---|---|---|---|
35
+ | **LAND** (exists today) | `npm i @tangle-network/agent-eval`, wrap their agent behind one `dispatch` seam, bring a judge | Full self-improvement loop + **local** trace/eval artifacts. Any infra, no sandbox. | Free (lib) |
36
+ | **EXPAND** (the build) | Route trace/eval/labeled-scenario data to our orchestrator | Hosted dashboards, cross-run intelligence, the capture flywheel as a service | **Metered** — composes with existing sandbox Stripe + cost-ledger |
37
+ | **PLATFORM** (the carrot) | Move execution into our sandbox (agent-dev-container) | Substrate + orchestrator data/intelligence pre-wired, batteries included | Sandbox usage |
38
+
39
+ The free lib casts the widest possible net at near-zero cost (it's already published). Value capture is EXPAND: hosting their data/intelligence = a billable surface on the dimensions we already meter (ingested/retained volume, eval-campaign compute, loop runs, seats). "We don't host observability unless they route to us" is the *business model*, not a gap.
40
+
41
+ ## Plan & gates — land-first, validate, then build
42
+
43
+ The non-negotiable discipline: **do not build the hosted/billing tier before the free LAND is validated on a foreign agent.** Reality on someone else's real agent is cheaper than our imagination.
44
+
45
+ - **Phase A — unblock (ungated, now):** upgrade agent-dev-container to the latest substrate; export `OutcomeStore`/`DeploymentOutcome` from `/rl`; mark agent-runtime's `@tangle-network/sandbox` peer `optional` + document the `Backend` swap seam (make sandbox-free adoption explicitly supported). Pure wins, correct regardless of the wedge.
46
+ - **Phase B — design-partner LAND validation (forcing function):** wrap the founder's agent behind `dispatch`, author a marketing-quality judge, run one real campaign + `gepaDriver` loop. Instrument integration friction, judge cold-start, and actual quality lift.
47
+ - **GATE — go/no-go + pricing:** decided from Phase B evidence, not theory.
48
+ - **Phase C — LAND ergonomics:** external 15-minute quickstart + a stable `dispatch`/judge/scenario contract + ≥1 reference framework adapter.
49
+ - **Phase D — EXPAND (gated):** hosted OTLP/eval-run HTTP sink (client in agent-eval) + multi-tenant orchestrator ingest + metered billing + minimal dashboard (server in `@tangle-network/monorepo`).
50
+
51
+ ## Success metrics
52
+
53
+ - **Phase B:** ≥1 measurable quality lift on the partner's own use case; integration takes no more than a 1–2 day pairing session.
54
+ - **LAND:** time-to-first-self-improvement-loop for a new external agent < 1 day from the quickstart.
55
+ - **EXPAND:** first external tenant routed + first metered dollar.
56
+
57
+ ## Risk register
58
+
59
+ **Knowns (high confidence)**
60
+ - The substrate is a portable lib; it wraps our 6 agents behind `dispatch`/judge/gate in production.
61
+ - It's runtime-agnostic (FS + in-memory storage, Node + edge) and emits OTLP traces.
62
+ - **agent-runtime is already sandbox-decoupled at runtime** — sandbox is type-only + dependency-injected; the `Backend` seam (`createOpenAICompatibleBackend` / `createIterableBackend` / `createSandboxPromptBackend`) makes the sandbox one swappable option. Only the peer-dep label needs to change.
63
+ - The orchestrator/observability/billing platform lives in `@tangle-network/monorepo`.
64
+
65
+ **Known-unknowns (Phase B answers these)**
66
+ - Does `dispatch` wrap a *foreign* agent as cleanly as ours, or are there hidden Tangle assumptions?
67
+ - Judge cold-start: can a team with no prior evals author a usable judge for a subjective domain (marketing quality)?
68
+ - Data cold-start: no evals = no scenarios = no labels; how do they bootstrap the flywheel day 1?
69
+ - Orchestrator multi-tenancy: it's only run internally — external auth/isolation/privacy is unbuilt.
70
+ - Pricing line + OSS boundary (the lib is already public).
71
+
72
+ **Unknown-unknowns (instrument, don't predict)**
73
+ - Integration-surface explosion → keep a tiny stable contract + reference adapters; refuse to special-case.
74
+ - Foreign-domain eval semantics we've never seen → the design partner is the discovery mechanism.
75
+ - Multi-tenant orchestrator failure modes at external/adversarial scale.
76
+ - Supply-chain/trust — we enter their dependency tree; our security posture becomes their procurement question.
77
+ - *Mitigation for all:* land-first-with-a-partner surfaces surprises cheaply, on a real agent, before any hosted spend. Timebox Phase D behind the gate.
78
+
79
+ ## Non-goals (anti-thrash)
80
+
81
+ - **Not** a standalone observability product. Observability ships only as a byproduct of self-improvement.
82
+ - **Not** building the hosted/billing tier before Phase B validates the lib.
83
+ - **Not** per-framework bespoke integrations — one stable contract + a couple of reference adapters.
84
+ - **Not** re-architecting the substrate for hypothetical external needs — extend at the `dispatch`/judge/store seams only.
85
+ - **Not** changing the sandbox/products roadmap — this *exposes the same engine*, it doesn't fork it.
86
+
87
+ ## Tracking
88
+
89
+ ops-board epic **#507** + children: agent-dev-container stack upgrade (eng), foreign-agent adoption surface (eng), design-partner LAND validation (gtm), hosted orchestrator routing + billing (eng, blocked on the gate), GTM-fit + pricing decision (gtm).
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tangle-network/agent-eval",
3
- "version": "0.43.1",
3
+ "version": "0.44.0",
4
4
  "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
5
5
  "homepage": "https://github.com/tangle-network/agent-eval#readme",
6
6
  "repository": {
@@ -104,6 +104,11 @@
104
104
  "import": "./dist/campaign/index.js",
105
105
  "default": "./dist/campaign/index.js"
106
106
  },
107
+ "./contract": {
108
+ "types": "./dist/contract/index.d.ts",
109
+ "import": "./dist/contract/index.js",
110
+ "default": "./dist/contract/index.js"
111
+ },
107
112
  "./openapi.json": {
108
113
  "default": "./dist/openapi.json"
109
114
  }
@@ -140,7 +145,7 @@
140
145
  "zod": "^4.3.6"
141
146
  },
142
147
  "peerDependencies": {
143
- "@tangle-network/agent-runtime": "^0.21.0",
148
+ "@tangle-network/agent-runtime": ">=0.21.0 <0.26.0",
144
149
  "@tangle-network/sandbox": ">=0.2.1 <0.4.0"
145
150
  },
146
151
  "peerDependenciesMeta": {
@@ -153,7 +158,7 @@
153
158
  },
154
159
  "devDependencies": {
155
160
  "@biomejs/biome": "^2.4.15",
156
- "@tangle-network/agent-runtime": "^0.21.0",
161
+ "@tangle-network/agent-runtime": ">=0.21.0 <0.26.0",
157
162
  "@tangle-network/sandbox": "0.3.0",
158
163
  "@types/node": "^25.6.0",
159
164
  "husky": "^9.1.7",