@tangle-network/agent-eval 0.38.0 → 0.40.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. package/dist/campaign/index.d.ts +775 -0
  2. package/dist/campaign/index.js +807 -0
  3. package/dist/campaign/index.js.map +1 -0
  4. package/dist/chunk-5U2DOJU4.js +565 -0
  5. package/dist/chunk-5U2DOJU4.js.map +1 -0
  6. package/dist/{chunk-KE7TDJUO.js → chunk-AU2JLNSZ.js} +2 -2
  7. package/dist/{chunk-TSPOEDM3.js → chunk-BWZEGTES.js} +2 -5
  8. package/dist/chunk-BWZEGTES.js.map +1 -0
  9. package/dist/{chunk-3HYQXPC2.js → chunk-DMW5VENN.js} +3 -3
  10. package/dist/{chunk-TQL7BAOY.js → chunk-EGIPWXHL.js} +2 -2
  11. package/dist/chunk-GGE4NNQT.js +65 -0
  12. package/dist/chunk-GGE4NNQT.js.map +1 -0
  13. package/dist/{chunk-7PR3WPWE.js → chunk-L7XMNXLO.js} +2 -2
  14. package/dist/{chunk-RL6TERL2.js → chunk-LCIDRYGP.js} +3 -3
  15. package/dist/{chunk-L5UNCDAJ.js → chunk-MAOZCN36.js} +2 -64
  16. package/dist/chunk-MAOZCN36.js.map +1 -0
  17. package/dist/{chunk-LGAPK7NA.js → chunk-NKLGKF2Q.js} +2 -2
  18. package/dist/chunk-TMXPFWC7.js +305 -0
  19. package/dist/chunk-TMXPFWC7.js.map +1 -0
  20. package/dist/{chunk-KHZRNY3F.js → chunk-WP7SY7AI.js} +5 -4
  21. package/dist/chunk-WP7SY7AI.js.map +1 -0
  22. package/dist/chunk-YV7J7X5N.js +313 -0
  23. package/dist/chunk-YV7J7X5N.js.map +1 -0
  24. package/dist/{control-DVrmvM_k.d.ts → control-CmLJk3IG.d.ts} +1 -1
  25. package/dist/control.d.ts +3 -3
  26. package/dist/control.js +2 -2
  27. package/dist/{dataset-ueRVTUoY.d.ts → dataset-BlwAtYYf.d.ts} +1 -1
  28. package/dist/{feedback-trajectory-iATEAHmc.d.ts → feedback-trajectory-Dvy-bt7x.d.ts} +1 -1
  29. package/dist/governance/index.d.ts +133 -5
  30. package/dist/index.d.ts +35 -34
  31. package/dist/index.js +97 -630
  32. package/dist/index.js.map +1 -1
  33. package/dist/multishot/index.d.ts +21 -21
  34. package/dist/multishot/index.js +64 -15
  35. package/dist/multishot/index.js.map +1 -1
  36. package/dist/openapi.json +1 -1
  37. package/dist/optimization.d.ts +2 -2
  38. package/dist/optimization.js +5 -5
  39. package/dist/pipelines/index.js +2 -2
  40. package/dist/red-team-30II1T4o.d.ts +63 -0
  41. package/dist/{release-report-D2ykiLSe.d.ts → release-report-Di84bXD7.d.ts} +5 -2
  42. package/dist/reporting.d.ts +2 -2
  43. package/dist/reporting.js +3 -3
  44. package/dist/rl.js +15 -315
  45. package/dist/rl.js.map +1 -1
  46. package/dist/run-campaign-JYJXYHHL.js +10 -0
  47. package/dist/run-campaign-JYJXYHHL.js.map +1 -0
  48. package/dist/traces.js +7 -5
  49. package/dist/wire/index.d.ts +2 -2
  50. package/docs/design/loop-taxonomy.md +233 -0
  51. package/docs/design/self-improvement-engine.md +130 -0
  52. package/package.json +33 -24
  53. package/dist/chunk-KHZRNY3F.js.map +0 -1
  54. package/dist/chunk-L5UNCDAJ.js.map +0 -1
  55. package/dist/chunk-TSPOEDM3.js.map +0 -1
  56. package/dist/index-CN2agEaO.d.ts +0 -191
  57. /package/dist/{chunk-KE7TDJUO.js.map → chunk-AU2JLNSZ.js.map} +0 -0
  58. /package/dist/{chunk-3HYQXPC2.js.map → chunk-DMW5VENN.js.map} +0 -0
  59. /package/dist/{chunk-TQL7BAOY.js.map → chunk-EGIPWXHL.js.map} +0 -0
  60. /package/dist/{chunk-7PR3WPWE.js.map → chunk-L7XMNXLO.js.map} +0 -0
  61. /package/dist/{chunk-RL6TERL2.js.map → chunk-LCIDRYGP.js.map} +0 -0
  62. /package/dist/{chunk-LGAPK7NA.js.map → chunk-NKLGKF2Q.js.map} +0 -0
@@ -0,0 +1,775 @@
1
+ import { RunRecord } from '@tangle-network/agent-runtime';
2
+ import { R as RedTeamCase } from '../red-team-30II1T4o.js';
3
+ import '../dataset-BlwAtYYf.js';
4
+ import '../errors-mje_cKOs.js';
5
+ import '../store-Db2Bv8Cf.js';
6
+
7
+ /**
8
+ * @experimental
9
+ *
10
+ * Pass A substrate types — `runCampaign` is the one primitive every
11
+ * eval flow composes from. Three contracts in this file:
12
+ *
13
+ * - `Scenario` input set
14
+ * - `DispatchFn` how to run one scenario → artifact
15
+ * - `CampaignResult` defined output schema (the contract downstream tools depend on)
16
+ *
17
+ * Three more lifted from earlier substrate work (re-exported):
18
+ *
19
+ * - `JudgeConfig` pluggable dimensional scorer (0.38)
20
+ * - `Mutator` optimization-loop surface mutator
21
+ * - `Gate` promotion gate (`HeldOutGate` and friends adapt to this)
22
+ *
23
+ * No new architecture vs 0.38 — Pass A formalizes the shapes so consumers
24
+ * can build dashboards / CI gates / regression diffs against a stable schema.
25
+ */
26
+ /** @experimental Stable identifier + kind tag for any scenario. Consumers
27
+ * extend with their per-domain payload (persona, task, requirement, ...). */
28
+ interface Scenario {
29
+ id: string;
30
+ kind: string;
31
+ tags?: string[];
32
+ }
33
+ /** @experimental Context handed to every dispatch invocation. Scoped — every
34
+ * trace/span carries the cellId, every artifact write lands under the cell's
35
+ * artifact root, the cost meter accumulates per cell. */
36
+ interface DispatchContext {
37
+ cellId: string;
38
+ rep: number;
39
+ generation?: number;
40
+ seed: number;
41
+ signal: AbortSignal;
42
+ trace: CampaignTraceWriter;
43
+ artifacts: CampaignArtifactWriter;
44
+ cost: CampaignCostMeter;
45
+ /** Populated when this run is part of a multi-cycle improvement loop. */
46
+ cycleId?: string;
47
+ /** Populated when the substrate resumed from a prior cache hit. */
48
+ resumedFrom?: string;
49
+ }
50
+ /** @experimental One function: scenario + ctx → artifact. Dispatcher chooses
51
+ * whether to call `runMultishot`, `runLoop`, raw `streamPrompt`, anything. */
52
+ type DispatchFn<TScenario extends Scenario, TArtifact> = (scenario: TScenario, ctx: DispatchContext) => Promise<TArtifact>;
53
+ /** @experimental One session within a multi-session journey. Dispatch is
54
+ * invoked once per session in order; state from prior session's artifact
55
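+ * is exposed via `ctx.priorSessionArtifact` (NOTE: `DispatchContext` as declared in this file has no such field — confirm where it is attached). */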
56
+ interface SessionScript<TScenario, TArtifact> {
57
+ id: string;
58
+ intent: string;
59
+ maxTurns?: number;
60
+ /** When true, knowledge accumulated during this session persists to the next session. */
61
+ affectsKnowledge?: boolean;
62
+ /** Optional per-session persona evolution — called after the session
63
+ * resolves. Returns the persona shape used by the NEXT session. */
64
+ evolveAfterSession?: (artifact: TArtifact, sessionIndex: number, scenario: TScenario) => TScenario;
65
+ }
66
+ interface JudgeDimension {
67
+ /** JSON field name + score key. */
68
+ key: string;
69
+ /** Description shown in the judge's user prompt. */
70
+ description: string;
71
+ }
72
+ /** @experimental Pluggable dimensional scorer. Consumers register one per
73
+ * scoring concern. `appliesTo` lets a judge run only on scenarios that match
74
+ * (e.g., legal_citation judge only on legal scenarios). */
75
+ interface JudgeConfig<TArtifact, TScenario extends Scenario = Scenario> {
76
+ name: string;
77
+ model?: string;
78
+ dimensions: JudgeDimension[];
79
+ systemPrompt: string;
80
+ buildPrompt: (input: {
81
+ artifact: TArtifact;
82
+ scenario: TScenario;
83
+ }) => string;
84
+ appliesTo?: (scenario: TScenario) => boolean;
85
+ apiKey?: string;
86
+ baseUrl?: string;
87
+ }
88
+ interface JudgeScore {
89
+ dimensions: Record<string, number>;
90
+ composite: number;
91
+ notes: string;
92
+ }
93
+ /** @experimental A tier-4 code surface — a candidate change to the agent's
94
+ * IMPLEMENTATION, not its prompt. Produced by autoresearch (reads codebase +
95
+ * trace findings → opens a worktree). Measured by checking out `worktreeRef`
96
+ * and running the worker against the changed code. See the improvement-tier
97
+ * table in `docs/design/loop-taxonomy.md`. */
98
+ interface CodeSurface {
99
+ kind: 'code';
100
+ /** Worktree path or git ref holding the candidate code change. The
101
+ * consumer's `dispatchWithSurface` checks this out before running. */
102
+ worktreeRef: string;
103
+ /** Base ref the change is measured against. Default: the repo's main. */
104
+ baseRef?: string;
105
+ /** Human summary of what changed — rendered into the auto-PR body. */
106
+ summary?: string;
107
+ }
108
+ /** @experimental The mutable surface a driver proposes. Tiers (see
109
+ * `docs/design/loop-taxonomy.md`):
110
+ * - `string` — tiers 1-2: system-prompt addendum / serialized tool
111
+ * config. Cheap, reversible, text-diffable.
112
+ * - `CodeSurface` — tier 4: an implementation change behind a worktree ref.
113
+ * Tier 3 (knowledge) is owned by agent-knowledge and rides its own adapter,
114
+ * not this type. */
115
+ type MutableSurface = string | CodeSurface;
116
+ /** @experimental Stateless surface mutation — given findings + current
117
+ * surface, return N candidate surfaces. Pure transform, no generation
118
+ * awareness. Reflective-mutation, `runMultiShotOptimization`, `AxGEPA`
119
+ * conform. Wrapped by `evolutionaryDriver` to become an `ImprovementDriver`. */
120
+ interface Mutator<TFindings = unknown> {
121
+ kind: string;
122
+ mutate(args: {
123
+ findings: TFindings[];
124
+ currentSurface: MutableSurface;
125
+ populationSize: number;
126
+ signal: AbortSignal;
127
+ }): Promise<MutableSurface[]>;
128
+ }
129
+ /** @experimental Everything a driver's `propose()` may read to plan the next
130
+ * batch of candidates. The first six fields are always present; the rest are
131
+ * optional context the loop supplies when available, so cheap drivers
132
+ * (`evolutionaryDriver`) can ignore them while a code-tier `autoresearchDriver`
133
+ * consumes the research report + dataset to drive a sandbox runLoop.
134
+ * See `docs/design/self-improvement-engine.md`. */
135
+ interface ProposeContext<TFindings = unknown> {
136
+ currentSurface: MutableSurface;
137
+ history: GenerationRecord[];
138
+ findings: TFindings[];
139
+ /** BREADTH: how many candidate surfaces to return this generation. */
140
+ populationSize: number;
141
+ generation: number;
142
+ signal: AbortSignal;
143
+ /** The Phase-2 research report (analyst findings + diff), produced AFTER the
144
+ * trace analysts run. Opaque to the substrate — the driver that consumes it
145
+ * types it. See the phase diagram in self-improvement-engine.md. */
146
+ report?: unknown;
147
+ /** Handle to all captured data — the driver samples traces / artifacts /
148
+ * rewards here to ground its proposals. */
149
+ dataset?: LabeledScenarioStore;
150
+ /** DEPTH: max runLoop iterations the generating agent may take per candidate
151
+ * (autoresearchDriver). 1 = single-shot; >1 = it may iterate on its own
152
+ * change before handing it back to be measured. */
153
+ maxImprovementShots?: number;
154
+ }
155
+ /** @experimental A surface-improvement strategy — the DRIVER of the
156
+ * improvement loop. Given the current best surface, the history of what's
157
+ * been tried + scored, and any external findings, propose the next batch of
158
+ * candidate surfaces to measure. Optionally decide to stop early.
159
+ *
160
+ * The evolutionary mutator (`evolutionaryDriver`) and a reflective analyst
161
+ * (`analystDriver`, consumer-wired from `@tangle-network/agent-runtime`'s
162
+ * `runAnalystLoop`) are two drivers of the SAME loop — not two loops. The
163
+ * loop body (`runOptimization`) and the gated promotion shell
164
+ * (`runImprovementLoop`) are driver-agnostic. */
165
+ interface ImprovementDriver<TFindings = unknown> {
166
+ kind: string;
167
+ /** Plan: propose N candidate surfaces for the next generation. */
168
+ propose(ctx: ProposeContext<TFindings>): Promise<MutableSurface[]>;
169
+ /** Decide: stop early when the driver judges the search converged or
170
+ * exhausted. Default (omitted) runs all `maxGenerations`. */
171
+ decide?(args: {
172
+ history: GenerationRecord[];
173
+ }): {
174
+ stop: boolean;
175
+ reason?: string;
176
+ };
177
+ }
178
+ interface OptimizerConfig {
179
+ driver: ImprovementDriver;
180
+ populationSize: number;
181
+ maxGenerations: number;
182
+ surfaceExtractor: (profile: unknown) => MutableSurface;
183
+ }
184
+ /** @experimental Five-valued verdict taxonomy (MOSS-paper alignment). */
185
+ type GateDecision = 'ship' | 'hold' | 'need_more_work' | 'model_ceiling' | 'arch_ceiling';
186
+ interface GateContext<TArtifact, TScenario extends Scenario> {
187
+ candidateArtifacts: Map<string, TArtifact>;
188
+ baselineArtifacts?: Map<string, TArtifact>;
189
+ judgeScores: Map<string, Record<string, JudgeScore>>;
190
+ scenarios: TScenario[];
191
+ cost: {
192
+ candidate: number;
193
+ baseline: number;
194
+ };
195
+ signal: AbortSignal;
196
+ }
197
+ interface GateResult {
198
+ decision: GateDecision;
199
+ reasons: string[];
200
+ contributingGates: Array<{
201
+ name: string;
202
+ passed: boolean;
203
+ detail: unknown;
204
+ }>;
205
+ delta?: number;
206
+ }
207
+ /** @experimental Composable promotion gate. */
208
+ interface Gate<TArtifact = unknown, TScenario extends Scenario = Scenario> {
209
+ name: string;
210
+ decide(ctx: GateContext<TArtifact, TScenario>): Promise<GateResult>;
211
+ }
212
+ /** @experimental Scoped trace writer handed to each dispatch — every span
213
+ * auto-tagged with the cellId so traces filter cleanly. */
214
+ interface CampaignTraceWriter {
215
+ span(name: string, attributes?: Record<string, unknown>): TraceSpan;
216
+ flush(): Promise<void>;
217
+ }
218
+ interface TraceSpan {
219
+ end(attributes?: Record<string, unknown>): void;
220
+ setAttribute(key: string, value: unknown): void;
221
+ }
222
+ /** @experimental Scoped artifact writer — `write(path, content)` lands under
223
+ * `<runDir>/<cellId>/<path>`. */
224
+ interface CampaignArtifactWriter {
225
+ write(path: string, content: string | Uint8Array): Promise<string>;
226
+ writeJson(path: string, value: unknown): Promise<string>;
227
+ }
228
+ /** @experimental Cell-scoped cost meter. Substrate auto-tracks LLM costs
229
+ * via the cost-ledger backend hooks; consumers can record additional
230
+ * spend (sandbox time, tool costs) via `observe`. */
231
+ interface CampaignCostMeter {
232
+ observe(amountUsd: number, source: string): void;
233
+ current(): number;
234
+ }
235
+ /** @experimental Source tag — required on every store write. Used by the
236
+ * default training-source filter (production-trace samples NOT used as
237
+ * training scenarios unless explicitly opted in). */
238
+ type LabeledScenarioSource = 'production-trace' | 'eval-run' | 'manual' | 'red-team' | 'synthetic';
239
+ type RedactionStatus = 'raw' | 'redacted-pii' | 'redacted-secrets' | 'fully-redacted';
240
+ /** @experimental Required-provenance write. The store rejects writes that
241
+ * lack provenance — a default-on flywheel without provenance is the
242
+ * data-poisoning vector flagged in the alignment review. */
243
+ interface LabeledScenarioWrite<TScenario extends Scenario = Scenario, TArtifact = unknown> {
244
+ scenario: TScenario;
245
+ artifact: TArtifact;
246
+ judgeScores: Record<string, JudgeScore>;
247
+ source: LabeledScenarioSource;
248
+ sourceVersionHash: string;
249
+ capturedAt: string;
250
+ redactionStatus: RedactionStatus;
251
+ /** Optional per-source rate-limit bucket key (e.g., the tenant id). */
252
+ rateLimitBucket?: string;
253
+ }
254
+ interface LabeledScenarioRecord<TScenario extends Scenario = Scenario, TArtifact = unknown> extends LabeledScenarioWrite<TScenario, TArtifact> {
255
+ /** Stable hash of (scenario.id, source, capturedAt, sourceVersionHash). */
256
+ recordHash: string;
257
+ /** Substrate-assigned split — train if captured before the campaign's
258
+ * `temporalCutoff`, test if after. Explicit override allowed via filter. */
259
+ split: 'train' | 'test';
260
+ }
261
+ interface LabeledScenarioSampleArgs {
262
+ count: number;
263
+ /** REQUIRED — substrate refuses to sample without an explicit split. */
264
+ split: 'train' | 'test';
265
+ /** REQUIRED — only records captured before this timestamp are returned.
266
+ * Enforces temporal split discipline (test scenarios captured AFTER train
267
+ * cannot enter the training pool). */
268
+ capturedBefore: string;
269
+ filter?: {
270
+ kind?: string;
271
+ source?: LabeledScenarioSource | LabeledScenarioSource[];
272
+ minComposite?: number;
273
+ maxComposite?: number;
274
+ };
275
+ }
276
+ interface LabeledScenarioStore {
277
+ observe(write: LabeledScenarioWrite): Promise<void>;
278
+ sample(args: LabeledScenarioSampleArgs): Promise<LabeledScenarioRecord[]>;
279
+ size(): Promise<{
280
+ train: number;
281
+ test: number;
282
+ bySource: Record<string, number>;
283
+ }>;
284
+ }
285
+ interface CampaignCellResult<TArtifact> {
286
+ cellId: string;
287
+ scenarioId: string;
288
+ rep: number;
289
+ generation?: number;
290
+ artifact: TArtifact;
291
+ judgeScores: Record<string, JudgeScore>;
292
+ costUsd: number;
293
+ durationMs: number;
294
+ seed: number;
295
+ cached: boolean;
296
+ error?: string;
297
+ }
298
+ interface JudgeAggregate {
299
+ mean: number;
300
+ stdev: number;
301
+ ci95: [number, number];
302
+ n: number;
303
+ }
304
+ interface ScenarioAggregate {
305
+ meanComposite: number;
306
+ ci95: [number, number];
307
+ n: number;
308
+ }
309
+ interface GenerationRecord {
310
+ generationIndex: number;
311
+ candidates: Array<{
312
+ surfaceHash: string;
313
+ composite: number;
314
+ ci95: [number, number];
315
+ }>;
316
+ promoted: string[];
317
+ }
318
+ interface CampaignAggregates {
319
+ byJudge: Record<string, JudgeAggregate>;
320
+ byScenario: Record<string, ScenarioAggregate>;
321
+ totalCostUsd: number;
322
+ cellsExecuted: number;
323
+ cellsSkipped: number;
324
+ cellsCached: number;
325
+ cellsFailed: number;
326
+ }
327
+ interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scenario> {
328
+ /** sha256(scenarios, judges, dispatch source ref, optimizer config, seed). Stable identity for reruns. */
329
+ manifestHash: string;
330
+ seed: number;
331
+ startedAt: string;
332
+ endedAt: string;
333
+ durationMs: number;
334
+ cells: Array<CampaignCellResult<TArtifact>>;
335
+ aggregates: CampaignAggregates;
336
+ optimization?: {
337
+ generations: GenerationRecord[];
338
+ winnerSurfaceHash?: string;
339
+ };
340
+ gate?: GateResult;
341
+ prUrl?: string;
342
+ runDir: string;
343
+ artifactsByPath: Record<string, string>;
344
+ /** Substrate strips the input scenarios to id+kind for the result manifest;
345
+ * consumers needing full payload look it up via the original input. The
346
+ * type parameter `TScenario` is propagated for downstream consumers that
347
+ * want narrowed types when extending `CampaignResult`. */
348
+ scenarios: Array<Pick<TScenario, 'id' | 'kind'>>;
349
+ }
350
+
351
+ /**
352
+ * @experimental
353
+ *
354
+ * `openAutoPr` — thin shell-out helper for the `runImprovementLoop` preset's
355
+ * `autoOnPromote: 'pr'` mode. Substitutes for the per-product PR-opening
356
+ * code consumers duplicated 4 times. The PR body includes the campaign's
357
+ * manifest hash, gate verdict, and scorecard summary so reviewers can see
358
+ * exactly what was promoted + why.
359
+ *
360
+ * NOT a deploy mechanism — this only OPENS a PR. The human reviews + merges.
361
+ * The Shape B (`autoOnPromote: 'config'`) live-runtime-mutation path is
362
+ * deferred to Pass B with the full shadow / canary / rollback stack.
363
+ */
364
+
365
+ interface OpenAutoPrOptions<TArtifact, TScenario extends Scenario> {
366
+ /** Campaign result to attach to the PR. */
367
+ result: CampaignResult<TArtifact, TScenario>;
368
+ /** Gate verdict explaining the promotion. Substrate refuses to open a PR
369
+ * when `gate.decision !== 'ship'` — fails loud. */
370
+ gate: GateResult;
371
+ /** Promoted surface diff — typically the new system prompt addendum or
372
+ * full profile diff. Substrate writes it into the PR body. */
373
+ promotedDiff: string;
374
+ /** GH owner/repo target (e.g., `tangle-network/gtm-agent`). */
375
+ ghOwner: string;
376
+ ghRepo: string;
377
+ /** Branch name for the PR. Default `auto/<manifestHash[:12]>`. */
378
+ branch?: string;
379
+ /** PR title. Default includes manifest hash. */
380
+ title?: string;
381
+ /** When true, dry-run instead of actually opening the PR. When omitted, the
382
+ * default is read from the `GH_AUTO_PR_TOKEN` env — present = open, absent = dry-run. */
383
+ dryRun?: boolean;
384
+ /** Test seam — substitute `gh pr create` invocation. */
385
+ ghExec?: (args: string[]) => {
386
+ stdout: string;
387
+ stderr: string;
388
+ status: number;
389
+ };
390
+ }
391
+ interface OpenAutoPrResult {
392
+ opened: boolean;
393
+ prUrl?: string;
394
+ dryRun: boolean;
395
+ reason: string;
396
+ }
397
+ declare function openAutoPr<TArtifact, TScenario extends Scenario>(options: OpenAutoPrOptions<TArtifact, TScenario>): OpenAutoPrResult;
398
+
399
+ /**
400
+ * @experimental
401
+ *
402
+ * `evolutionaryDriver` — adapts a stateless `Mutator` (population mutation:
403
+ * GEPA / AxGEPA / reflective-mutation) into an `ImprovementDriver`. This is
404
+ * the evolutionary strategy: each generation, mutate the current best surface
405
+ * into N candidates, measure, select. No generation memory beyond the current
406
+ * surface; the loop body handles ranking + promotion.
407
+ *
408
+ * The reflective alternative — `analystDriver` — is consumer-wired from
409
+ * `@tangle-network/agent-runtime`'s `runAnalystLoop`: it reasons over the
410
+ * full generation history + trace findings to propose targeted edits rather
411
+ * than blind mutations. Both conform to `ImprovementDriver`; the improvement
412
+ * loop is identical regardless of which drives it.
413
+ */
414
+
415
+ interface EvolutionaryDriverOptions<TFindings = unknown> {
416
+ mutator: Mutator<TFindings>;
417
+ /** External findings fed to the mutator each generation. Default: []. */
418
+ findings?: TFindings[];
419
+ }
420
+ declare function evolutionaryDriver<TFindings = unknown>(opts: EvolutionaryDriverOptions<TFindings>): ImprovementDriver<TFindings>;
421
+
422
+ /**
423
+ * @experimental
424
+ *
425
+ * Compose multiple `Gate` implementations — every gate must pass for the
426
+ * composite to ship. Closes the alignment reviewer's "default-only
427
+ * heldOutGate + costGate would happily promote a reward-hacked prompt"
428
+ * concern by making safety gates first-class composable defaults.
429
+ */
430
+
431
+ /** Compose gates — all must `ship` for the composite to `ship`. First
432
+ * non-ship verdict decides the composite verdict, but ALL gates still run
433
+ * (so the result records every gate's reason — useful for diagnostics). */
434
+ declare function composeGate<TArtifact = unknown, TScenario extends Scenario = Scenario>(...gates: Array<Gate<TArtifact, TScenario>>): Gate<TArtifact, TScenario>;
435
+
436
+ /**
437
+ * @experimental
438
+ *
439
+ * `defaultProductionGate` — composes the substrate's existing safety
440
+ * primitives (red-team / reward-hacking / canary / heldout) into a single
441
+ * Gate.decide shape. Closes the alignment + Anthropic-SI reviewers' "safety
442
+ * primitives are off the critical path" blocker.
443
+ *
444
+ * The composition is opinionated — when consumers wire `runImprovementLoop`,
445
+ * THIS gate is the default. Consumers can still pass a custom gate to
446
+ * override; the recommended pattern is to compose THIS gate with whatever
447
+ * extra domain-specific gates they need (`composeGate(defaultProductionGate(...), customGate)`).
448
+ */
449
+
450
+ interface DefaultProductionGateOptions {
451
+ /** Required: scenarios held out from training; substrate compares
452
+ * candidate-on-holdout vs baseline-on-holdout. */
453
+ holdoutScenarios: Scenario[];
454
+ /** Minimum mean-composite improvement required to ship. Default 0.5. */
455
+ deltaThreshold?: number;
456
+ /** Total $ budget for ALL cells in this campaign — including baseline + candidate.
457
+ * Composite verdict refuses to ship when spend exceeds the budget. */
458
+ budgetUsd?: number;
459
+ /** Red-team cases to probe candidate outputs against. When omitted the
460
+ * substrate uses `DEFAULT_RED_TEAM_CORPUS`. Provide a domain-specific
461
+ * battery for tighter coverage. */
462
+ redTeamBattery?: RedTeamCase[];
463
+ /** Run records (oldest-first) needed for the reward-hacking detector.
464
+ * Substrate populates from prior production-loop generations. */
465
+ recentRuns?: RunRecord[];
466
+ /** When true, the gate refuses to ship if the reward-hacking detector
467
+ * fires at the `gaming` severity. Default true. */
468
+ blockOnRewardHackingGaming?: boolean;
469
+ }
470
+ declare function defaultProductionGate<TArtifact, TScenario extends Scenario>(options: DefaultProductionGateOptions): Gate<TArtifact, TScenario>;
471
+
472
+ /**
473
+ * @experimental
474
+ *
475
+ * Thin Gate adapter — exposes delta-threshold-on-holdout as a composable
476
+ * `Gate`. Use when you want held-out as one of N composed gates instead of
477
+ * the full `defaultProductionGate` stack.
478
+ */
479
+
480
+ interface HeldOutGateOptions<TScenario extends Scenario = Scenario> {
481
+ scenarios: TScenario[];
482
+ deltaThreshold?: number;
483
+ }
484
+ declare function heldOutGate<TArtifact, TScenario extends Scenario>(options: HeldOutGateOptions<TScenario>): Gate<TArtifact, TScenario>;
485
+
486
+ /**
487
+ * @experimental
488
+ *
489
+ * Filesystem `LabeledScenarioStore` adapter. The default capture sink for
490
+ * traces + eval artifacts. Production deployments typically swap for a
491
+ * Turso/SQLite adapter (same interface).
492
+ *
493
+ * Records land as one JSONL file per source under `<root>/<source>.jsonl`.
494
+ * Each line is a `LabeledScenarioRecord`. Append-only — no in-place edits.
495
+ *
496
+ * Safety properties enforced at write-time:
497
+ *
498
+ * - **Provenance required**: writes without `source`, `sourceVersionHash`,
499
+ * `capturedAt`, `redactionStatus` are rejected. Closes the alignment
500
+ * reviewer's data-poisoning gap.
501
+ * - **Per-source rate limits**: optional `rateLimitBucket` + `maxWritesPerMinutePerBucket`
502
+ * stops a single tenant/source from flooding the store.
503
+ *
504
+ * Safety properties enforced at sample-time:
505
+ *
506
+ * - **Required split + capturedBefore**: substrate refuses to sample without
507
+ * an explicit `split` ('train' | 'test') AND a temporal cutoff. Eliminates
508
+ * accidental train/test contamination.
509
+ * - **Default training-source filter**: when the store is sampled with
510
+ * `split: 'train'`, production-trace records are EXCLUDED unless the
511
+ * caller passes `filter.source: 'production-trace'` explicitly. Closes
512
+ * the contamination-by-default gap flagged by the senior eval engineer.
513
+ */
514
+
515
+ interface FsLabeledScenarioStoreOptions {
516
+ /** Root directory for JSONL files. Created if missing. */
517
+ root: string;
518
+ /** Per-bucket rate limit (writes per minute). When set, writes exceeding the cap are rejected
519
+ * with a typed error. Default: no limit. */
520
+ maxWritesPerMinutePerBucket?: number;
521
+ /** Test seam — override `Date.now()` for deterministic tests. */
522
+ now?: () => number;
523
+ }
524
+ declare class LabeledScenarioStoreError extends Error {
525
+ readonly code: string;
526
+ constructor(code: string, message: string);
527
+ }
528
+ declare class FsLabeledScenarioStore implements LabeledScenarioStore {
529
+ private readonly options;
530
+ private readonly now;
531
+ private readonly rateLimits;
532
+ constructor(options: FsLabeledScenarioStoreOptions);
533
+ observe(write: LabeledScenarioWrite): Promise<void>;
534
+ sample(args: LabeledScenarioSampleArgs): Promise<LabeledScenarioRecord[]>;
535
+ size(): Promise<{
536
+ train: number;
537
+ test: number;
538
+ bySource: Record<string, number>;
539
+ }>;
540
+ private assertProvenance;
541
+ private assertRateLimit;
542
+ private toRecord;
543
+ private pathForSource;
544
+ }
545
+
546
+ /**
547
+ * @experimental
548
+ *
549
+ * `runCampaign` — Pass A substrate primitive. ONE function that orchestrates
550
+ * scenarios → dispatch → artifacts → judges → aggregates, with full
551
+ * reproducibility (seed + manifest hash), cell-level resumability, bootstrap
552
+ * CIs, and the `LabeledScenarioStore` capture flywheel.
553
+ *
554
+ * Improvement loops (optimizer / gate / autoOnPromote) ride on top of this
555
+ * primitive but live in `presets/run-improvement-loop.ts`. This file keeps
556
+ * the core orchestrator minimal — Phase 1 of the Pass A track.
557
+ */
558
+
559
+ interface RunCampaignOptions<TScenario extends Scenario, TArtifact> {
560
+ scenarios: TScenario[];
561
+ dispatch: DispatchFn<TScenario, TArtifact>;
562
+ judges?: JudgeConfig<TArtifact, TScenario>[];
563
+ /** Seed for reproducibility. Optional; defaults to 42. */
564
+ seed?: number;
565
+ /** Per-scenario replicates for CI bands. Default 1; raise to 5+ for
566
+ * bootstrap-tight intervals on critical eval. */
567
+ reps?: number;
568
+ /** When true (default), completed cells are cached by
569
+ * (manifestHash, scenarioId, rep, generation). Re-runs skip cached cells. */
570
+ resumable?: boolean;
571
+ /** Optional store — when present, every artifact + judge score is captured
572
+ * with the configured `captureSource`. Capture is default ON; pass `'off'`
573
+ * to disable. */
574
+ labeledStore?: LabeledScenarioStore | 'off';
575
+ captureSource?: 'production-trace' | 'eval-run' | 'manual' | 'red-team' | 'synthetic';
576
+ captureSourceVersionHash?: string;
577
+ /** Wall-clock cost cap across all cells. Cells beyond ceiling are skipped. */
578
+ costCeiling?: number;
579
+ /** Max concurrent cells. Default 2. */
580
+ maxConcurrency?: number;
581
+ /** Required: where artifacts + traces land. */
582
+ runDir: string;
583
+ /** Tracing posture. Default is the substrate's `FileSystemTraceStore` rooted
584
+ * at `<runDir>/traces/`. `'off'` disables capture entirely — substrate
585
+ * refuses this when the caller wires `autoOnPromote !== 'none'`. */
586
+ tracing?: 'on' | 'off';
587
+ /** Test seam — override the wall clock for deterministic tests. */
588
+ now?: () => Date;
589
+ /** Test seam — override per-cell trace writer factory. */
590
+ buildTraceWriter?: (cellId: string, dir: string) => CampaignTraceWriter;
591
+ }
592
+ declare function runCampaign<TScenario extends Scenario, TArtifact>(opts: RunCampaignOptions<TScenario, TArtifact>): Promise<CampaignResult<TArtifact, TScenario>>;
593
+
594
+ /**
595
+ * @experimental
596
+ *
597
+ * `runEval` — the simplest preset over `runCampaign`. No optimizer, no
598
+ * gate, no auto-PR. Just: run scenarios through dispatch, score with
599
+ * judges, return CampaignResult.
600
+ *
601
+ * The 80% case for consumers who want a scorecard, not an improvement loop.
602
+ */
603
+
604
+ interface RunEvalOptions<TScenario extends Scenario, TArtifact> extends Omit<RunCampaignOptions<TScenario, TArtifact>, 'runDir'> {
605
+ runDir: string;
606
+ }
607
+ declare function runEval<TScenario extends Scenario, TArtifact>(opts: RunEvalOptions<TScenario, TArtifact>): Promise<CampaignResult<TArtifact, TScenario>>;
608
+
609
+ /**
610
+ * @experimental
611
+ *
612
+ * `runOptimization` — the improvement loop body. Runs N generations: the
613
+ * `ImprovementDriver` proposes K candidate surfaces per generation, each
614
+ * candidate runs a campaign (the measurement), and the top scorers promote to the
615
+ * next generation. Driver-agnostic — the same loop runs an evolutionary
616
+ * population mutator (`evolutionaryDriver`) or a reflective analyst
617
+ * (`analystDriver`); they differ only in how `propose()` picks candidates.
618
+ *
619
+ * This is `runLoop`'s shape (plan → measure → decide) specialized to surface
620
+ * improvement: `driver.propose` = plan, `runCampaign` = the measurement (which
621
+ * runs the worker behind `dispatch`), the mean-composite ranking = the
622
+ * validator, `driver.decide` = the stop check.
623
+ *
624
+ * The gated-promotion shell (`runImprovementLoop`) wraps this with a holdout
625
+ * re-score + release gate + optional PR.
626
+ */
627
+
628
+ /** Options for `runOptimization`: `RunCampaignOptions` without `dispatch` (this interface declares `dispatchWithSurface` instead), plus the driver and generation knobs. */ interface RunOptimizationOptions<TScenario extends Scenario, TArtifact> extends Omit<RunCampaignOptions<TScenario, TArtifact>, 'dispatch'> {
628
+ /** Initial mutable surface (typically system prompt or addendum). */
629
+ baselineSurface: MutableSurface;
630
+ /** Dispatcher that takes the CURRENT surface + scenario → artifact. `ctx` has the type of the second parameter of `RunCampaignOptions['dispatch']`. */
631
+ dispatchWithSurface: (surface: MutableSurface, scenario: TScenario, ctx: Parameters<RunCampaignOptions<TScenario, TArtifact>['dispatch']>[1]) => Promise<TArtifact>;
632
+ /** The improvement strategy. Wrap a population `Mutator` via
633
+ * `evolutionaryDriver({ mutator })`, or pass a reflective `analystDriver`. */
634
+ driver: ImprovementDriver;
635
+ /** Presumably the number of candidate surfaces the driver proposes per generation — confirm against the implementation. */ populationSize: number;
636
+ /** Presumably the upper bound on generations the loop runs — confirm against the implementation. */ maxGenerations: number;
637
+ /** How many top-scoring candidates carry to the next generation. Default 2. */
638
+ promoteTopK?: number;
639
+ /** DEPTH knob forwarded to the driver's `propose()` — max runLoop iterations
640
+ * the generating agent may take per candidate (autoresearchDriver). */
641
+ maxImprovementShots?: number;
642
+ /** Phase-2 research report forwarded to `propose()` (analyst findings +
643
+ * diff). Opaque here; the driver types it. */
644
+ report?: unknown;
645
+ }
647
+ /** Value `runOptimization` resolves to. */ interface RunOptimizationResult<TArtifact, TScenario extends Scenario> {
647
+ /** One entry per generation: its `GenerationRecord` plus each surface listed for it, with that surface's hash and `CampaignResult`. */ generations: Array<{
648
+ record: GenerationRecord;
649
+ surfaces: Array<{
650
+ surfaceHash: string;
651
+ surface: MutableSurface;
652
+ campaign: CampaignResult<TArtifact, TScenario>;
653
+ }>;
654
+ }>;
655
+ /** The surface picked as the winner (presumably by the mean-composite ranking — confirm). */ winnerSurface: MutableSurface;
656
+ /** Hash string for `winnerSurface` (presumably as produced by `surfaceHash` — confirm). */ winnerSurfaceHash: string;
657
+ /** Campaign result for the baseline (presumably `baselineSurface` measured before any candidates — confirm). */ baselineCampaign: CampaignResult<TArtifact, TScenario>;
658
+ }
660
+ /** Improvement loop body: runs generations in which the `ImprovementDriver` proposes candidate surfaces and each candidate is measured by a campaign; resolves to the `RunOptimizationResult`. */ declare function runOptimization<TScenario extends Scenario, TArtifact>(opts: RunOptimizationOptions<TScenario, TArtifact>): Promise<RunOptimizationResult<TArtifact, TScenario>>;
661
+ /** Returns a string hash for `surface`. NOTE(review): the hash algorithm and its stability across versions are not visible in this declaration — confirm before persisting hashes. */ declare function surfaceHash(surface: MutableSurface): string;
662
+
663
+ /**
664
+ * @experimental
665
+ *
666
+ * `runImprovementLoop` — the gated-promotion shell around the improvement
667
+ * loop body (`runOptimization`). Drives candidate surfaces via the
668
+ * `ImprovementDriver`, re-scores the winner against the baseline on a
669
+ * holdout set, runs the release gate, and optionally opens a PR.
670
+ *
671
+ * Role vocabulary (see docs/design/loop-taxonomy.md):
672
+ * - DRIVER = the `ImprovementDriver` (evolutionary GEPA mutator OR
673
+ * reflective analyst). Proposes candidate SURFACES — the
674
+ * worker's system prompt / tool config — NOT conversation
675
+ * turns.
676
+ * - MEASUREMENT = `runCampaign`. Scores one surface by running the worker
677
+ * (via `dispatch`) over scenarios and judging the output.
678
+ * - WORKER = the agent harness in the sandbox, invoked behind the
679
+ * topology-opaque `dispatch` seam — never referenced here.
680
+ *
681
+ * Distinct from `runLoop` in `@tangle-network/agent-runtime`, which is the
682
+ * INNER conversation loop (driver↔workers in a sandbox). `runImprovementLoop`
683
+ * is the OUTER loop: it improves the surface that those workers run.
684
+ *
685
+ * Hard-refuses unsafe configurations:
686
+ * - `tracing: 'off'` when a driver is wired (improvement is unattributable)
687
+ * - `autoOnPromote: 'config'` — DEFERRED to Pass B; v0.40 only ships
688
+ * `'pr'` and `'none'`.
689
+ */
690
+
691
+ /** Options for `runImprovementLoop`: everything `runOptimization` takes, plus the holdout set, the promotion gate, and the on-promote action. */ interface RunImprovementLoopOptions<TScenario extends Scenario, TArtifact> extends RunOptimizationOptions<TScenario, TArtifact> {
692
+ /** Holdout scenarios kept OUT of the training optimization pool — used
693
+ * ONLY to score baseline vs winner for the gate. */
694
+ holdoutScenarios: TScenario[];
695
+ /** Promotion gate. Substrate strongly recommends `defaultProductionGate`
696
+ * for production wiring (composes red-team / reward-hacking / canary /
697
+ * heldout). */
698
+ gate: Gate<TArtifact, TScenario>;
699
+ /** What to do when the gate ships:
700
+ * - `'pr'`: open a PR via `openAutoPr`
701
+ * - `'none'`: just report — caller decides what to do with the winner
702
+ * v0.40 does NOT support `'config'` (live-runtime self-mutation) —
703
+ * deferred to Pass B behind safety stack. */
704
+ autoOnPromote: 'pr' | 'none';
705
+ /** GH owner / repo for the auto-PR. Required when autoOnPromote === 'pr'. */
706
+ ghOwner?: string;
707
+ /** GH repo for the auto-PR; pairs with `ghOwner` and is likewise required when autoOnPromote === 'pr'. */ ghRepo?: string;
708
+ /** Optional render override — substrate writes a diff-shaped surface; pass
709
+ * a function to format the promoted surface differently. */
710
+ renderPromotedDiff?: (winnerSurface: MutableSurface, baselineSurface: MutableSurface) => string;
711
+ }
712
+ /** Value `runImprovementLoop` resolves to: the `RunOptimizationResult` fields plus the holdout re-scores, the gate outcome, and the optional PR result. */ interface RunImprovementLoopResult<TArtifact, TScenario extends Scenario> extends RunOptimizationResult<TArtifact, TScenario> {
713
+ /** Campaign result for the baseline on the holdout scenarios. */ baselineOnHoldout: CampaignResult<TArtifact, TScenario>;
714
+ /** Campaign result for the winner on the holdout scenarios. */ winnerOnHoldout: CampaignResult<TArtifact, TScenario>;
715
+ /** The awaited return value of the gate's `decide`. */ gateResult: Awaited<ReturnType<Gate<TArtifact, TScenario>['decide']>>;
716
+ /** Return value of `openAutoPr`; presumably present only when `autoOnPromote === 'pr'` and the gate ships — confirm. NOTE(review): typed as the raw `ReturnType` (not `Awaited<...>` like `gateResult`), so this is a Promise if `openAutoPr` is async — confirm that is intended. */ prResult?: ReturnType<typeof openAutoPr>;
717
+ }
718
+ /** Gated-promotion shell around `runOptimization`: drives candidate surfaces via the `ImprovementDriver`, re-scores the winner against the baseline on the holdout set, runs the release gate, and optionally opens a PR. */ declare function runImprovementLoop<TScenario extends Scenario, TArtifact>(opts: RunImprovementLoopOptions<TScenario, TArtifact>): Promise<RunImprovementLoopResult<TArtifact, TScenario>>;
719
+
720
+ /**
721
+ * @experimental
722
+ *
723
+ * VCS-pluggable worktree adapter. One improvement = one worktree, PR-like
724
+ * (multiple commits allowed). A code-tier driver's `propose()` creates a
725
+ * worktree, an agent commits the change into it, and `finalize()` returns a
726
+ * `CodeSurface{ worktreeRef }` the measurement checks out to run the worker
727
+ * against the changed code. On promotion the worktree becomes the PR branch.
728
+ *
729
+ * The interface is VCS-agnostic so a future `jj` ([jj-vcs](https://github.com/jj-vcs/jj))
730
+ * adapter can slot in without touching driver code. Only the git adapter
731
+ * ships today. See `docs/design/self-improvement-engine.md`.
732
+ */
733
+
734
+ /** Handle for one worktree: returned by `WorktreeAdapter.create` and passed to `finalize` / `discard`. */ interface Worktree {
735
+ /** Absolute path to the checked-out worktree directory. */
736
+ path: string;
737
+ /** The branch the worktree is on (becomes the PR branch on promotion). */
738
+ branch: string;
739
+ /** The ref the worktree was forked from. */
740
+ baseRef: string;
741
+ }
742
+ /** VCS-agnostic worktree lifecycle: `create` a worktree, then either `finalize` it into a `CodeSurface` or `discard` it. All three methods are async. */ interface WorktreeAdapter {
743
+ /** Create an isolated worktree on a fresh branch off `baseRef`. */
744
+ create(opts: {
745
+ baseRef: string;
746
+ label: string;
747
+ }): Promise<Worktree>;
748
+ /** Commit any pending changes in the worktree, then return a CodeSurface
749
+ * pointing at it. The agent has already written its change into
750
+ * `worktree.path` by the time this is called. */
751
+ finalize(worktree: Worktree, summary: string): Promise<CodeSurface>;
752
+ /** Remove the worktree (and its branch) — called for losing candidates. */
753
+ discard(worktree: Worktree): Promise<void>;
754
+ }
755
+ /** `Error` subclass exported alongside the worktree adapter; presumably what adapter operations throw on failure — confirm against the implementation. */ declare class WorktreeAdapterError extends Error {
756
+ /** Optional underlying cause; same type as the constructor's `cause` argument. */ readonly cause?: unknown | undefined;
757
+ /** Takes the error message and an optional cause (presumably stored on `cause` — confirm). */ constructor(message: string, cause?: unknown | undefined);
758
+ }
759
+ /** Options for `gitWorktreeAdapter`. Only `repoRoot` is required. */ interface GitWorktreeAdapterOptions {
760
+ /** Repo root the worktrees fork from. */
761
+ repoRoot: string;
762
+ /** Directory worktrees are created under. Default: `<repoRoot>/.worktrees`. */
763
+ worktreeDir?: string;
764
+ /** Branch-name prefix. Default: `improve`. */
765
+ branchPrefix?: string;
766
+ /** Test seam — defaults to a real `git` runner. Receives the argument list and a working directory and returns a string synchronously (presumably git's stdout — confirm). */
767
+ git?: (args: string[], cwd: string) => string;
768
+ }
769
+ /** Builds the git-backed `WorktreeAdapter` from `opts` — the only adapter that ships today. */ declare function gitWorktreeAdapter(opts: GitWorktreeAdapterOptions): WorktreeAdapter;
770
+ /** Resolve a `CodeSurface`'s worktreeRef to a directory the measurement can
770
+ * run the worker in. A path ref is returned as-is; anything else is treated
771
+ * as a ref under the adapter's worktree dir.
+ *
+ * @param surface - The `CodeSurface` whose worktreeRef is resolved.
+ * @param worktreeDir - Optional worktree dir used for non-path refs. The
+ * fallback when omitted is not visible in this declaration (presumably the
+ * adapter default, `<repoRoot>/.worktrees` — confirm).
+ * @returns The resolved directory path. */
772
+ declare function resolveWorktreePath(surface: CodeSurface, worktreeDir?: string): string;
774
+
775
+ export { type CampaignAggregates, type CampaignArtifactWriter, type CampaignCellResult, type CampaignCostMeter, type CampaignResult, type CampaignTraceWriter, type CodeSurface, type DefaultProductionGateOptions, type DispatchContext, type DispatchFn, type EvolutionaryDriverOptions, FsLabeledScenarioStore, type FsLabeledScenarioStoreOptions, type Gate, type GateContext, type GateDecision, type GateResult, type GenerationRecord, type GitWorktreeAdapterOptions, type HeldOutGateOptions, type ImprovementDriver, type JudgeAggregate, type JudgeConfig, type JudgeDimension, type JudgeScore, type LabeledScenarioRecord, type LabeledScenarioSampleArgs, type LabeledScenarioSource, type LabeledScenarioStore, LabeledScenarioStoreError, type LabeledScenarioWrite, type MutableSurface, type Mutator, type OpenAutoPrOptions, type OpenAutoPrResult, type OptimizerConfig, type ProposeContext, type RedactionStatus, type RunCampaignOptions, type RunEvalOptions, type RunImprovementLoopOptions, type RunImprovementLoopResult, type RunOptimizationOptions, type RunOptimizationResult, type Scenario, type ScenarioAggregate, type SessionScript, type TraceSpan, type Worktree, type WorktreeAdapter, WorktreeAdapterError, composeGate, defaultProductionGate, evolutionaryDriver, gitWorktreeAdapter, heldOutGate, openAutoPr, resolveWorktreePath, runCampaign, runEval, runImprovementLoop, runOptimization, surfaceHash };