@tangle-network/agent-eval 0.38.0 → 0.40.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. package/dist/campaign/index.d.ts +695 -0
  2. package/dist/campaign/index.js +741 -0
  3. package/dist/campaign/index.js.map +1 -0
  4. package/dist/chunk-5U2DOJU4.js +565 -0
  5. package/dist/chunk-5U2DOJU4.js.map +1 -0
  6. package/dist/{chunk-KE7TDJUO.js → chunk-AU2JLNSZ.js} +2 -2
  7. package/dist/{chunk-TSPOEDM3.js → chunk-BWZEGTES.js} +2 -5
  8. package/dist/chunk-BWZEGTES.js.map +1 -0
  9. package/dist/{chunk-3HYQXPC2.js → chunk-DMW5VENN.js} +3 -3
  10. package/dist/{chunk-TQL7BAOY.js → chunk-EGIPWXHL.js} +2 -2
  11. package/dist/chunk-GGE4NNQT.js +65 -0
  12. package/dist/chunk-GGE4NNQT.js.map +1 -0
  13. package/dist/{chunk-7PR3WPWE.js → chunk-L7XMNXLO.js} +2 -2
  14. package/dist/{chunk-RL6TERL2.js → chunk-LCIDRYGP.js} +3 -3
  15. package/dist/{chunk-L5UNCDAJ.js → chunk-MAOZCN36.js} +2 -64
  16. package/dist/chunk-MAOZCN36.js.map +1 -0
  17. package/dist/{chunk-LGAPK7NA.js → chunk-NKLGKF2Q.js} +2 -2
  18. package/dist/chunk-TMXPFWC7.js +305 -0
  19. package/dist/chunk-TMXPFWC7.js.map +1 -0
  20. package/dist/{chunk-KHZRNY3F.js → chunk-WP7SY7AI.js} +5 -4
  21. package/dist/chunk-WP7SY7AI.js.map +1 -0
  22. package/dist/chunk-YV7J7X5N.js +313 -0
  23. package/dist/chunk-YV7J7X5N.js.map +1 -0
  24. package/dist/{control-DVrmvM_k.d.ts → control-CmLJk3IG.d.ts} +1 -1
  25. package/dist/control.d.ts +3 -3
  26. package/dist/control.js +2 -2
  27. package/dist/{dataset-ueRVTUoY.d.ts → dataset-BlwAtYYf.d.ts} +1 -1
  28. package/dist/{feedback-trajectory-iATEAHmc.d.ts → feedback-trajectory-Dvy-bt7x.d.ts} +1 -1
  29. package/dist/governance/index.d.ts +133 -5
  30. package/dist/index.d.ts +35 -34
  31. package/dist/index.js +97 -630
  32. package/dist/index.js.map +1 -1
  33. package/dist/multishot/index.d.ts +21 -21
  34. package/dist/multishot/index.js +64 -15
  35. package/dist/multishot/index.js.map +1 -1
  36. package/dist/openapi.json +1 -1
  37. package/dist/optimization.d.ts +2 -2
  38. package/dist/optimization.js +5 -5
  39. package/dist/pipelines/index.js +2 -2
  40. package/dist/red-team-30II1T4o.d.ts +63 -0
  41. package/dist/{release-report-D2ykiLSe.d.ts → release-report-Di84bXD7.d.ts} +5 -2
  42. package/dist/reporting.d.ts +2 -2
  43. package/dist/reporting.js +3 -3
  44. package/dist/rl.js +15 -315
  45. package/dist/rl.js.map +1 -1
  46. package/dist/run-campaign-JYJXYHHL.js +10 -0
  47. package/dist/run-campaign-JYJXYHHL.js.map +1 -0
  48. package/dist/traces.js +7 -5
  49. package/dist/wire/index.d.ts +2 -2
  50. package/docs/design/loop-taxonomy.md +233 -0
  51. package/package.json +33 -24
  52. package/dist/chunk-KHZRNY3F.js.map +0 -1
  53. package/dist/chunk-L5UNCDAJ.js.map +0 -1
  54. package/dist/chunk-TSPOEDM3.js.map +0 -1
  55. package/dist/index-CN2agEaO.d.ts +0 -191
  56. /package/dist/{chunk-KE7TDJUO.js.map → chunk-AU2JLNSZ.js.map} +0 -0
  57. /package/dist/{chunk-3HYQXPC2.js.map → chunk-DMW5VENN.js.map} +0 -0
  58. /package/dist/{chunk-TQL7BAOY.js.map → chunk-EGIPWXHL.js.map} +0 -0
  59. /package/dist/{chunk-7PR3WPWE.js.map → chunk-L7XMNXLO.js.map} +0 -0
  60. /package/dist/{chunk-RL6TERL2.js.map → chunk-LCIDRYGP.js.map} +0 -0
  61. /package/dist/{chunk-LGAPK7NA.js.map → chunk-NKLGKF2Q.js.map} +0 -0
@@ -0,0 +1,695 @@
1
+ import { RunRecord } from '@tangle-network/agent-runtime';
2
+ import { R as RedTeamCase } from '../red-team-30II1T4o.js';
3
+ import '../dataset-BlwAtYYf.js';
4
+ import '../errors-mje_cKOs.js';
5
+ import '../store-Db2Bv8Cf.js';
6
+
7
+ /**
8
+ * @experimental
9
+ *
10
+ * Pass A substrate types — `runCampaign` is the one primitive every
11
+ * eval flow composes from. Three contracts in this file:
12
+ *
13
+ * - `Scenario` input set
14
+ * - `DispatchFn` how to run one scenario → artifact
15
+ * - `CampaignResult` defined output schema (the contract downstream tools depend on)
16
+ *
17
+ * Three more lifted from earlier substrate work (re-exported):
18
+ *
19
+ * - `JudgeConfig` pluggable dimensional scorer (0.38)
20
+ * - `Mutator` optimization-loop surface mutator
21
+ * - `Gate` promotion gate (`HeldOutGate` and friends adapt to this)
22
+ *
23
+ * No new architecture vs 0.38 — Pass A formalizes the shapes so consumers
24
+ * can build dashboards / CI gates / regression diffs against a stable schema.
25
+ */
26
+ /** @experimental Stable identifier + kind tag for any scenario. Consumers
27
+ * extend with their per-domain payload (persona, task, requirement, ...). */
28
+ interface Scenario {
29
+ id: string;
30
+ kind: string;
31
+ tags?: string[];
32
+ }
33
+ /** @experimental Context handed to every dispatch invocation. Scoped — every
34
+ * trace/span carries the cellId, every artifact write lands under the cell's
35
+ * artifact root, the cost meter accumulates per cell. */
36
+ interface DispatchContext {
37
+ cellId: string;
38
+ rep: number;
39
+ generation?: number;
40
+ seed: number;
41
+ signal: AbortSignal;
42
+ trace: CampaignTraceWriter;
43
+ artifacts: CampaignArtifactWriter;
44
+ cost: CampaignCostMeter;
45
+ /** Populated when this run is part of a multi-cycle improvement loop. */
46
+ cycleId?: string;
47
+ /** Populated when the substrate resumed from a prior cache hit. */
48
+ resumedFrom?: string;
49
+ }
50
+ /** @experimental One function: scenario + ctx → artifact. Dispatcher chooses
51
+ * whether to call `runMultishot`, `runLoop`, raw `streamPrompt`, anything. */
52
+ type DispatchFn<TScenario extends Scenario, TArtifact> = (scenario: TScenario, ctx: DispatchContext) => Promise<TArtifact>;
53
+ /** @experimental One session within a multi-session journey. Dispatch is
54
+ * invoked once per session in order; state from prior session's artifact
55
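+ * is exposed via `ctx.priorSessionArtifact` (NOTE(review): `DispatchContext` as declared in this file has no `priorSessionArtifact` field — confirm where it is surfaced). */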
56
+ interface SessionScript<TScenario, TArtifact> {
57
+ id: string;
58
+ intent: string;
59
+ maxTurns?: number;
60
+ /** When true, knowledge accumulated during this session persists to the next session. */
61
+ affectsKnowledge?: boolean;
62
+ /** Optional per-session persona evolution — called after the session
63
+ * resolves. Returns the persona shape used by the NEXT session. */
64
+ evolveAfterSession?: (artifact: TArtifact, sessionIndex: number, scenario: TScenario) => TScenario;
65
+ }
66
+ interface JudgeDimension {
67
+ /** JSON field name + score key. */
68
+ key: string;
69
+ /** Description shown in the judge's user prompt. */
70
+ description: string;
71
+ }
72
+ /** @experimental Pluggable dimensional scorer. Consumers register one per
73
+ * scoring concern. `appliesTo` lets a judge run only on scenarios that match
74
+ * (e.g., legal_citation judge only on legal scenarios). */
75
+ interface JudgeConfig<TArtifact, TScenario extends Scenario = Scenario> {
76
+ name: string;
77
+ model?: string;
78
+ dimensions: JudgeDimension[];
79
+ systemPrompt: string;
80
+ buildPrompt: (input: {
81
+ artifact: TArtifact;
82
+ scenario: TScenario;
83
+ }) => string;
84
+ appliesTo?: (scenario: TScenario) => boolean;
85
+ apiKey?: string;
86
+ baseUrl?: string;
87
+ }
88
+ interface JudgeScore {
89
+ dimensions: Record<string, number>;
90
+ composite: number;
91
+ notes: string;
92
+ }
93
+ /** @experimental A tier-4 code surface — a candidate change to the agent's
94
+ * IMPLEMENTATION, not its prompt. Produced by autoresearch (reads codebase +
95
+ * trace findings → opens a worktree). Measured by checking out `worktreeRef`
96
+ * and running the worker against the changed code. See the improvement-tier
97
+ * table in `docs/design/loop-taxonomy.md`. */
98
+ interface CodeSurface {
99
+ kind: 'code';
100
+ /** Worktree path or git ref holding the candidate code change. The
101
+ * consumer's `dispatchWithSurface` checks this out before running. */
102
+ worktreeRef: string;
103
+ /** Base ref the change is measured against. Default: the repo's main. */
104
+ baseRef?: string;
105
+ /** Human summary of what changed — rendered into the auto-PR body. */
106
+ summary?: string;
107
+ }
108
+ /** @experimental The mutable surface a driver proposes. Tiers (see
109
+ * `docs/design/loop-taxonomy.md`):
110
+ * - `string` — tiers 1-2: system-prompt addendum / serialized tool
111
+ * config. Cheap, reversible, text-diffable.
112
+ * - `CodeSurface` — tier 4: an implementation change behind a worktree ref.
113
+ * Tier 3 (knowledge) is owned by agent-knowledge and rides its own adapter,
114
+ * not this type. */
115
+ type MutableSurface = string | CodeSurface;
116
+ /** @experimental Stateless surface mutation — given findings + current
117
+ * surface, return N candidate surfaces. Pure transform, no generation
118
+ * awareness. Reflective-mutation, `runMultiShotOptimization`, `AxGEPA`
119
+ * conform. Wrapped by `evolutionaryDriver` to become an `ImprovementDriver`. */
120
+ interface Mutator<TFindings = unknown> {
121
+ kind: string;
122
+ mutate(args: {
123
+ findings: TFindings[];
124
+ currentSurface: MutableSurface;
125
+ populationSize: number;
126
+ signal: AbortSignal;
127
+ }): Promise<MutableSurface[]>;
128
+ }
129
+ /** @experimental A surface-improvement strategy — the DRIVER of the
130
+ * improvement loop. Given the current best surface, the history of what's
131
+ * been tried + scored, and any external findings, propose the next batch of
132
+ * candidate surfaces to measure. Optionally decide to stop early.
133
+ *
134
+ * The evolutionary mutator (`evolutionaryDriver`) and a reflective analyst
135
+ * (`analystDriver`, consumer-wired from `@tangle-network/agent-runtime`'s
136
+ * `runAnalystLoop`) are two drivers of the SAME loop — not two loops. The
137
+ * loop body (`runOptimization`) and the gated promotion shell
138
+ * (`runImprovementLoop`) are driver-agnostic. */
139
+ interface ImprovementDriver<TFindings = unknown> {
140
+ kind: string;
141
+ /** Plan: propose N candidate surfaces for the next generation. */
142
+ propose(args: {
143
+ currentSurface: MutableSurface;
144
+ history: GenerationRecord[];
145
+ findings: TFindings[];
146
+ populationSize: number;
147
+ generation: number;
148
+ signal: AbortSignal;
149
+ }): Promise<MutableSurface[]>;
150
+ /** Decide: stop early when the driver judges the search converged or
151
+ * exhausted. Default (omitted) runs all `maxGenerations`. */
152
+ decide?(args: {
153
+ history: GenerationRecord[];
154
+ }): {
155
+ stop: boolean;
156
+ reason?: string;
157
+ };
158
+ }
159
+ interface OptimizerConfig {
160
+ driver: ImprovementDriver;
161
+ populationSize: number;
162
+ maxGenerations: number;
163
+ surfaceExtractor: (profile: unknown) => MutableSurface;
164
+ }
165
+ /** @experimental Five-valued verdict taxonomy (MOSS-paper alignment). */
166
+ type GateDecision = 'ship' | 'hold' | 'need_more_work' | 'model_ceiling' | 'arch_ceiling';
167
+ interface GateContext<TArtifact, TScenario extends Scenario> {
168
+ candidateArtifacts: Map<string, TArtifact>;
169
+ baselineArtifacts?: Map<string, TArtifact>;
170
+ judgeScores: Map<string, Record<string, JudgeScore>>;
171
+ scenarios: TScenario[];
172
+ cost: {
173
+ candidate: number;
174
+ baseline: number;
175
+ };
176
+ signal: AbortSignal;
177
+ }
178
+ interface GateResult {
179
+ decision: GateDecision;
180
+ reasons: string[];
181
+ contributingGates: Array<{
182
+ name: string;
183
+ passed: boolean;
184
+ detail: unknown;
185
+ }>;
186
+ delta?: number;
187
+ }
188
+ /** @experimental Composable promotion gate. */
189
+ interface Gate<TArtifact = unknown, TScenario extends Scenario = Scenario> {
190
+ name: string;
191
+ decide(ctx: GateContext<TArtifact, TScenario>): Promise<GateResult>;
192
+ }
193
+ /** @experimental Scoped trace writer handed to each dispatch — every span
194
+ * auto-tagged with the cellId so traces filter cleanly. */
195
+ interface CampaignTraceWriter {
196
+ span(name: string, attributes?: Record<string, unknown>): TraceSpan;
197
+ flush(): Promise<void>;
198
+ }
199
+ interface TraceSpan {
200
+ end(attributes?: Record<string, unknown>): void;
201
+ setAttribute(key: string, value: unknown): void;
202
+ }
203
+ /** @experimental Scoped artifact writer — `write(path, content)` lands under
204
+ * `<runDir>/<cellId>/<path>`. */
205
+ interface CampaignArtifactWriter {
206
+ write(path: string, content: string | Uint8Array): Promise<string>;
207
+ writeJson(path: string, value: unknown): Promise<string>;
208
+ }
209
+ /** @experimental Cell-scoped cost meter. Substrate auto-tracks LLM costs
210
+ * via the cost-ledger backend hooks; consumers can record additional
211
+ * spend (sandbox time, tool costs) via `observe`. */
212
+ interface CampaignCostMeter {
213
+ observe(amountUsd: number, source: string): void;
214
+ current(): number;
215
+ }
216
+ /** @experimental Source tag — required on every store write. Used by the
217
+ * default training-source filter (production-trace samples NOT used as
218
+ * training scenarios unless explicitly opted in). */
219
+ type LabeledScenarioSource = 'production-trace' | 'eval-run' | 'manual' | 'red-team' | 'synthetic';
220
+ type RedactionStatus = 'raw' | 'redacted-pii' | 'redacted-secrets' | 'fully-redacted';
221
+ /** @experimental Required-provenance write. The store rejects writes that
222
+ * lack provenance — a default-on flywheel without provenance is the
223
+ * data-poisoning vector flagged in the alignment review. */
224
+ interface LabeledScenarioWrite<TScenario extends Scenario = Scenario, TArtifact = unknown> {
225
+ scenario: TScenario;
226
+ artifact: TArtifact;
227
+ judgeScores: Record<string, JudgeScore>;
228
+ source: LabeledScenarioSource;
229
+ sourceVersionHash: string;
230
+ capturedAt: string;
231
+ redactionStatus: RedactionStatus;
232
+ /** Optional per-source rate-limit bucket key (e.g., the tenant id). */
233
+ rateLimitBucket?: string;
234
+ }
235
+ interface LabeledScenarioRecord<TScenario extends Scenario = Scenario, TArtifact = unknown> extends LabeledScenarioWrite<TScenario, TArtifact> {
236
+ /** Stable hash of (scenario.id, source, capturedAt, sourceVersionHash). */
237
+ recordHash: string;
238
+ /** Substrate-assigned split — train if captured before the campaign's
239
+ * `temporalCutoff`, test if after. Explicit override allowed via filter. */
240
+ split: 'train' | 'test';
241
+ }
242
+ interface LabeledScenarioSampleArgs {
243
+ count: number;
244
+ /** REQUIRED — substrate refuses to sample without an explicit split. */
245
+ split: 'train' | 'test';
246
+ /** REQUIRED — only records captured before this timestamp are returned.
247
+ * Enforces temporal split discipline (test scenarios captured AFTER train
248
+ * cannot enter the training pool). */
249
+ capturedBefore: string;
250
+ filter?: {
251
+ kind?: string;
252
+ source?: LabeledScenarioSource | LabeledScenarioSource[];
253
+ minComposite?: number;
254
+ maxComposite?: number;
255
+ };
256
+ }
257
+ interface LabeledScenarioStore {
258
+ observe(write: LabeledScenarioWrite): Promise<void>;
259
+ sample(args: LabeledScenarioSampleArgs): Promise<LabeledScenarioRecord[]>;
260
+ size(): Promise<{
261
+ train: number;
262
+ test: number;
263
+ bySource: Record<string, number>;
264
+ }>;
265
+ }
266
+ interface CampaignCellResult<TArtifact> {
267
+ cellId: string;
268
+ scenarioId: string;
269
+ rep: number;
270
+ generation?: number;
271
+ artifact: TArtifact;
272
+ judgeScores: Record<string, JudgeScore>;
273
+ costUsd: number;
274
+ durationMs: number;
275
+ seed: number;
276
+ cached: boolean;
277
+ error?: string;
278
+ }
279
+ interface JudgeAggregate {
280
+ mean: number;
281
+ stdev: number;
282
+ ci95: [number, number];
283
+ n: number;
284
+ }
285
+ interface ScenarioAggregate {
286
+ meanComposite: number;
287
+ ci95: [number, number];
288
+ n: number;
289
+ }
290
+ interface GenerationRecord {
291
+ generationIndex: number;
292
+ candidates: Array<{
293
+ surfaceHash: string;
294
+ composite: number;
295
+ ci95: [number, number];
296
+ }>;
297
+ promoted: string[];
298
+ }
299
+ interface CampaignAggregates {
300
+ byJudge: Record<string, JudgeAggregate>;
301
+ byScenario: Record<string, ScenarioAggregate>;
302
+ totalCostUsd: number;
303
+ cellsExecuted: number;
304
+ cellsSkipped: number;
305
+ cellsCached: number;
306
+ cellsFailed: number;
307
+ }
308
+ interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scenario> {
309
+ /** sha256(scenarios, judges, dispatch source ref, optimizer config, seed). Stable identity for reruns. */
310
+ manifestHash: string;
311
+ seed: number;
312
+ startedAt: string;
313
+ endedAt: string;
314
+ durationMs: number;
315
+ cells: Array<CampaignCellResult<TArtifact>>;
316
+ aggregates: CampaignAggregates;
317
+ optimization?: {
318
+ generations: GenerationRecord[];
319
+ winnerSurfaceHash?: string;
320
+ };
321
+ gate?: GateResult;
322
+ prUrl?: string;
323
+ runDir: string;
324
+ artifactsByPath: Record<string, string>;
325
+ /** Substrate strips the input scenarios to id+kind for the result manifest;
326
+ * consumers needing full payload look it up via the original input. The
327
+ * type parameter `TScenario` is propagated for downstream consumers that
328
+ * want narrowed types when extending `CampaignResult`. */
329
+ scenarios: Array<Pick<TScenario, 'id' | 'kind'>>;
330
+ }
331
+
332
+ /**
333
+ * @experimental
334
+ *
335
+ * `openAutoPr` — thin shell-out helper for the `runImprovementLoop` preset's
336
+ * `autoOnPromote: 'pr'` mode. Substitutes for the per-product PR-opening
337
+ * code consumers duplicated 4 times. The PR body includes the campaign's
338
+ * manifest hash, gate verdict, and scorecard summary so reviewers can see
339
+ * exactly what was promoted + why.
340
+ *
341
+ * NOT a deploy mechanism — this only OPENS a PR. The human reviews + merges.
342
+ * The Shape B (`autoOnPromote: 'config'`) live-runtime-mutation path is
343
+ * deferred to Pass B with the full shadow / canary / rollback stack.
344
+ */
345
+
346
+ interface OpenAutoPrOptions<TArtifact, TScenario extends Scenario> {
347
+ /** Campaign result to attach to the PR. */
348
+ result: CampaignResult<TArtifact, TScenario>;
349
+ /** Gate verdict explaining the promotion. Substrate refuses to open a PR
350
+ * when `gate.decision !== 'ship'` — fails loud. */
351
+ gate: GateResult;
352
+ /** Promoted surface diff — typically the new system prompt addendum or
353
+ * full profile diff. Substrate writes it into the PR body. */
354
+ promotedDiff: string;
355
+ /** GH owner/repo target (e.g., `tangle-network/gtm-agent`). */
356
+ ghOwner: string;
357
+ ghRepo: string;
358
+ /** Branch name for the PR. Default `auto/<manifestHash[:12]>`. */
359
+ branch?: string;
360
+ /** PR title. Default includes manifest hash. */
361
+ title?: string;
362
+ /** Whether to actually open the PR or just dry-run. Default reads
363
+ * `GH_AUTO_PR_TOKEN` env — present = open, absent = dry-run. */
364
+ dryRun?: boolean;
365
+ /** Test seam — substitute `gh pr create` invocation. */
366
+ ghExec?: (args: string[]) => {
367
+ stdout: string;
368
+ stderr: string;
369
+ status: number;
370
+ };
371
+ }
372
+ interface OpenAutoPrResult {
373
+ opened: boolean;
374
+ prUrl?: string;
375
+ dryRun: boolean;
376
+ reason: string;
377
+ }
378
+ declare function openAutoPr<TArtifact, TScenario extends Scenario>(options: OpenAutoPrOptions<TArtifact, TScenario>): OpenAutoPrResult;
379
+
380
+ /**
381
+ * @experimental
382
+ *
383
+ * `evolutionaryDriver` — adapts a stateless `Mutator` (population mutation:
384
+ * GEPA / AxGEPA / reflective-mutation) into an `ImprovementDriver`. This is
385
+ * the evolutionary strategy: each generation, mutate the current best surface
386
+ * into N candidates, measure, select. No generation memory beyond the current
387
+ * surface; the loop body handles ranking + promotion.
388
+ *
389
+ * The reflective alternative — `analystDriver` — is consumer-wired from
390
+ * `@tangle-network/agent-runtime`'s `runAnalystLoop`: it reasons over the
391
+ * full generation history + trace findings to propose targeted edits rather
392
+ * than blind mutations. Both conform to `ImprovementDriver`; the improvement
393
+ * loop is identical regardless of which drives it.
394
+ */
395
+
396
+ interface EvolutionaryDriverOptions<TFindings = unknown> {
397
+ mutator: Mutator<TFindings>;
398
+ /** External findings fed to the mutator each generation. Default: []. */
399
+ findings?: TFindings[];
400
+ }
401
+ declare function evolutionaryDriver<TFindings = unknown>(opts: EvolutionaryDriverOptions<TFindings>): ImprovementDriver<TFindings>;
402
+
403
+ /**
404
+ * @experimental
405
+ *
406
+ * Compose multiple `Gate` implementations — every gate must pass for the
407
+ * composite to ship. Closes the alignment reviewer's "default-only
408
+ * heldOutGate + costGate would happily promote a reward-hacked prompt"
409
+ * concern by making safety gates first-class composable defaults.
410
+ */
411
+
412
+ /** Compose gates — all must `ship` for the composite to `ship`. First
413
+ * non-ship verdict decides the composite verdict, but ALL gates still run
414
+ * (so the result records every gate's reason — useful for diagnostics). */
415
+ declare function composeGate<TArtifact = unknown, TScenario extends Scenario = Scenario>(...gates: Array<Gate<TArtifact, TScenario>>): Gate<TArtifact, TScenario>;
416
+
417
+ /**
418
+ * @experimental
419
+ *
420
+ * `defaultProductionGate` — composes the substrate's existing safety
421
+ * primitives (red-team / reward-hacking / canary / heldout) into a single
422
+ * Gate.decide shape. Closes the alignment + Anthropic-SI reviewers' "safety
423
+ * primitives are off the critical path" blocker.
424
+ *
425
+ * The composition is opinionated — when consumers wire `runImprovementLoop`,
426
+ * THIS gate is the default. Consumers can still pass a custom gate to
427
+ * override; the recommended pattern is to compose THIS gate with whatever
428
+ * extra domain-specific gates they need (`composeGate(defaultProductionGate(...), customGate)`).
429
+ */
430
+
431
+ interface DefaultProductionGateOptions {
432
+ /** Required: scenarios held out from training; substrate compares
433
+ * candidate-on-holdout vs baseline-on-holdout. */
434
+ holdoutScenarios: Scenario[];
435
+ /** Minimum mean-composite improvement required to ship. Default 0.5. */
436
+ deltaThreshold?: number;
437
+ /** Total $ budget for ALL cells in this campaign — including baseline + candidate.
438
+ * The composite verdict refuses to ship when spend exceeds the budget. */
439
+ budgetUsd?: number;
440
+ /** Red-team cases to probe candidate outputs against. When omitted the
441
+ * substrate uses `DEFAULT_RED_TEAM_CORPUS`. Provide a domain-specific
442
+ * battery for tighter coverage. */
443
+ redTeamBattery?: RedTeamCase[];
444
+ /** Run records (oldest-first) needed for the reward-hacking detector.
445
+ * Substrate populates from prior production-loop generations. */
446
+ recentRuns?: RunRecord[];
447
+ /** When true, the gate refuses to ship if the reward-hacking detector
448
+ * fires at the `gaming` severity. Default true. */
449
+ blockOnRewardHackingGaming?: boolean;
450
+ }
451
+ declare function defaultProductionGate<TArtifact, TScenario extends Scenario>(options: DefaultProductionGateOptions): Gate<TArtifact, TScenario>;
452
+
453
+ /**
454
+ * @experimental
455
+ *
456
+ * Thin Gate adapter — exposes delta-threshold-on-holdout as a composable
457
+ * `Gate`. Use when you want held-out as one of N composed gates instead of
458
+ * the full `defaultProductionGate` stack.
459
+ */
460
+
461
+ interface HeldOutGateOptions<TScenario extends Scenario = Scenario> {
462
+ scenarios: TScenario[];
463
+ deltaThreshold?: number;
464
+ }
465
+ declare function heldOutGate<TArtifact, TScenario extends Scenario>(options: HeldOutGateOptions<TScenario>): Gate<TArtifact, TScenario>;
466
+
467
+ /**
468
+ * @experimental
469
+ *
470
+ * Filesystem `LabeledScenarioStore` adapter. The default capture sink for
471
+ * traces + eval artifacts. Production deployments typically swap for a
472
+ * Turso/SQLite adapter (same interface).
473
+ *
474
+ * Records land as one JSONL file per source under `<root>/<source>.jsonl`.
475
+ * Each line is a `LabeledScenarioRecord`. Append-only — no in-place edits.
476
+ *
477
+ * Safety properties enforced at write-time:
478
+ *
479
+ * - **Provenance required**: writes without `source`, `sourceVersionHash`,
480
+ * `capturedAt`, `redactionStatus` are rejected. Closes the alignment
481
+ * reviewer's data-poisoning gap.
482
+ * - **Per-source rate limits**: optional `rateLimitBucket` + `maxWritesPerMinutePerBucket`
483
+ * stops a single tenant/source from flooding the store.
484
+ *
485
+ * Safety properties enforced at sample-time:
486
+ *
487
+ * - **Required split + capturedBefore**: substrate refuses to sample without
488
+ * an explicit `split` ('train' | 'test') AND a temporal cutoff. Eliminates
489
+ * accidental train/test contamination.
490
+ * - **Default training-source filter**: when the store is sampled with
491
+ * `split: 'train'`, production-trace records are EXCLUDED unless the
492
+ * caller passes `filter.source: 'production-trace'` explicitly. Closes
493
+ * the contamination-by-default gap flagged by the senior eval engineer.
494
+ */
495
+
496
+ interface FsLabeledScenarioStoreOptions {
497
+ /** Root directory for JSONL files. Created if missing. */
498
+ root: string;
499
+ /** Per-bucket rate limit (writes per minute). When set, writes exceeding the cap are rejected
500
+ * with a typed error. Default: no limit. */
501
+ maxWritesPerMinutePerBucket?: number;
502
+ /** Test seam — override `Date.now()` for deterministic tests. */
503
+ now?: () => number;
504
+ }
505
+ declare class LabeledScenarioStoreError extends Error {
506
+ readonly code: string;
507
+ constructor(code: string, message: string);
508
+ }
509
+ declare class FsLabeledScenarioStore implements LabeledScenarioStore {
510
+ private readonly options;
511
+ private readonly now;
512
+ private readonly rateLimits;
513
+ constructor(options: FsLabeledScenarioStoreOptions);
514
+ observe(write: LabeledScenarioWrite): Promise<void>;
515
+ sample(args: LabeledScenarioSampleArgs): Promise<LabeledScenarioRecord[]>;
516
+ size(): Promise<{
517
+ train: number;
518
+ test: number;
519
+ bySource: Record<string, number>;
520
+ }>;
521
+ private assertProvenance;
522
+ private assertRateLimit;
523
+ private toRecord;
524
+ private pathForSource;
525
+ }
526
+
527
+ /**
528
+ * @experimental
529
+ *
530
+ * `runCampaign` — Pass A substrate primitive. ONE function that orchestrates
531
+ * scenarios → dispatch → artifacts → judges → aggregates, with full
532
+ * reproducibility (seed + manifest hash), cell-level resumability, bootstrap
533
+ * CIs, and the `LabeledScenarioStore` capture flywheel.
534
+ *
535
+ * Improvement loops (optimizer / gate / autoOnPromote) ride on top of this
536
+ * primitive but live in `presets/run-improvement-loop.ts`. This file keeps
537
+ * the core orchestrator minimal — Phase 1 of the Pass A track.
538
+ */
539
+
540
+ interface RunCampaignOptions<TScenario extends Scenario, TArtifact> {
541
+ scenarios: TScenario[];
542
+ dispatch: DispatchFn<TScenario, TArtifact>;
543
+ judges?: JudgeConfig<TArtifact, TScenario>[];
544
+ /** Seed for reproducibility. Optional; defaults to 42. */
545
+ seed?: number;
546
+ /** Per-scenario replicates for CI bands. Default 1; raise to 5+ for
547
+ * bootstrap-tight intervals on critical eval. */
548
+ reps?: number;
549
+ /** When true (default), completed cells are cached by
550
+ * (manifestHash, scenarioId, rep, generation). Re-runs skip cached cells. */
551
+ resumable?: boolean;
552
+ /** Optional store — when present, every artifact + judge score is captured
553
+ * with the configured `captureSource`. Capture is default ON; pass `'off'`
554
+ * to disable. */
555
+ labeledStore?: LabeledScenarioStore | 'off';
556
+ captureSource?: 'production-trace' | 'eval-run' | 'manual' | 'red-team' | 'synthetic';
557
+ captureSourceVersionHash?: string;
558
+ /** Cumulative cost cap across all cells (presumably USD, as in `totalCostUsd` — confirm). Cells beyond the ceiling are skipped. */
559
+ costCeiling?: number;
560
+ /** Max concurrent cells. Default 2. */
561
+ maxConcurrency?: number;
562
+ /** Required: where artifacts + traces land. */
563
+ runDir: string;
564
+ /** Tracing posture. Default is the substrate's `FileSystemTraceStore` rooted
565
+ * at `<runDir>/traces/`. `'off'` disables capture entirely — substrate
566
+ * refuses this when the caller wires `autoOnPromote !== 'none'`. */
567
+ tracing?: 'on' | 'off';
568
+ /** Test seam — override the wall clock for deterministic tests. */
569
+ now?: () => Date;
570
+ /** Test seam — override per-cell trace writer factory. */
571
+ buildTraceWriter?: (cellId: string, dir: string) => CampaignTraceWriter;
572
+ }
573
+ declare function runCampaign<TScenario extends Scenario, TArtifact>(opts: RunCampaignOptions<TScenario, TArtifact>): Promise<CampaignResult<TArtifact, TScenario>>;
574
+
575
+ /**
576
+ * @experimental
577
+ *
578
+ * `runEval` — the simplest preset over `runCampaign`. No optimizer, no
579
+ * gate, no auto-PR. Just: run scenarios through dispatch, score with
580
+ * judges, return CampaignResult.
581
+ *
582
+ * The 80% case for consumers who want a scorecard, not an improvement loop.
583
+ */
584
+
585
+ interface RunEvalOptions<TScenario extends Scenario, TArtifact> extends Omit<RunCampaignOptions<TScenario, TArtifact>, 'runDir'> {
586
+ runDir: string;
587
+ }
588
+ declare function runEval<TScenario extends Scenario, TArtifact>(opts: RunEvalOptions<TScenario, TArtifact>): Promise<CampaignResult<TArtifact, TScenario>>;
589
+
590
+ /**
591
+ * @experimental
592
+ *
593
+ * `runOptimization` — the improvement loop body. Runs N generations: the
594
+ * `ImprovementDriver` proposes K candidate surfaces per generation, each
595
+ * candidate runs a campaign (the measurement), top-scoring promote to the
596
+ * next generation. Driver-agnostic — the same loop runs an evolutionary
597
+ * population mutator (`evolutionaryDriver`) or a reflective analyst
598
+ * (`analystDriver`); they differ only in how `propose()` picks candidates.
599
+ *
600
+ * This is `runLoop`'s shape (plan → measure → decide) specialized to surface
601
+ * improvement: `driver.propose` = plan, `runCampaign` = the measurement (which
602
+ * runs the worker behind `dispatch`), the mean-composite ranking = the
603
+ * validator, `driver.decide` = the stop check.
604
+ *
605
+ * The gated-promotion shell (`runImprovementLoop`) wraps this with a holdout
606
+ * re-score + release gate + optional PR.
607
+ */
608
+
609
+ interface RunOptimizationOptions<TScenario extends Scenario, TArtifact> extends Omit<RunCampaignOptions<TScenario, TArtifact>, 'dispatch'> {
610
+ /** Initial mutable surface (typically system prompt or addendum). */
611
+ baselineSurface: MutableSurface;
612
+ /** Dispatcher that takes the CURRENT surface + scenario → artifact. */
613
+ dispatchWithSurface: (surface: MutableSurface, scenario: TScenario, ctx: Parameters<RunCampaignOptions<TScenario, TArtifact>['dispatch']>[1]) => Promise<TArtifact>;
614
+ /** The improvement strategy. Wrap a population `Mutator` via
615
+ * `evolutionaryDriver({ mutator })`, or pass a reflective `analystDriver`. */
616
+ driver: ImprovementDriver;
617
+ populationSize: number;
618
+ maxGenerations: number;
619
+ /** How many top-scoring candidates carry to the next generation. Default 2. */
620
+ promoteTopK?: number;
621
+ }
622
+ interface RunOptimizationResult<TArtifact, TScenario extends Scenario> { // Value the promise returned by `runOptimization` resolves to. Caveat: type parameters are ordered <TArtifact, TScenario>, the reverse of `RunOptimizationOptions<TScenario, TArtifact>`.
623
+ generations: Array<{ // NOTE(review): presumably one entry per generation that ran — confirm.
624
+ record: GenerationRecord;
625
+ surfaces: Array<{ // Each entry pairs a surface with its hash and a campaign result.
626
+ surfaceHash: string;
627
+ surface: MutableSurface;
628
+ campaign: CampaignResult<TArtifact, TScenario>; // NOTE(review): presumably the campaign that scored `surface` — confirm.
629
+ }>;
630
+ }>;
631
+ winnerSurface: MutableSurface;
632
+ winnerSurfaceHash: string; // NOTE(review): presumably `surfaceHash(winnerSurface)` — confirm.
633
+ baselineCampaign: CampaignResult<TArtifact, TScenario>; // NOTE(review): presumably the campaign run on `baselineSurface` — confirm.
634
+ }
635
+ declare function runOptimization<TScenario extends Scenario, TArtifact>(opts: RunOptimizationOptions<TScenario, TArtifact>): Promise<RunOptimizationResult<TArtifact, TScenario>>; // Async entry point: takes `RunOptimizationOptions` and resolves to a `RunOptimizationResult`. Declaration only; the implementation is not part of this file.
636
+ declare function surfaceHash(surface: MutableSurface): string; // Maps a `MutableSurface` to a string. NOTE(review): presumably a deterministic content hash (algorithm not visible in this declaration) — confirm.
637
+
638
+ /**
639
+ * @experimental
640
+ *
641
+ * `runImprovementLoop` — the gated-promotion shell around the improvement
642
+ * loop body (`runOptimization`). Drives candidate surfaces via the
643
+ * `ImprovementDriver`, re-scores the winner against the baseline on a
644
+ * holdout set, runs the release gate, and optionally opens a PR.
645
+ *
646
+ * Role vocabulary (see docs/design/loop-taxonomy.md):
647
+ * - DRIVER = the `ImprovementDriver` (evolutionary GEPA mutator OR
648
+ * reflective analyst). Proposes candidate SURFACES — the
649
+ * worker's system prompt / tool config — NOT conversation
650
+ * turns.
651
+ * - MEASUREMENT= `runCampaign`. Scores one surface by running the worker
652
+ * (via `dispatch`) over scenarios and judging the output.
653
+ * - WORKER = the agent harness in the sandbox, invoked behind the
654
+ * topology-opaque `dispatch` seam — never referenced here.
655
+ *
656
+ * Distinct from `runLoop` in `@tangle-network/agent-runtime`, which is the
657
+ * INNER conversation loop (driver↔workers in a sandbox). `runImprovementLoop`
658
+ * is the OUTER loop: it improves the surface that those workers run.
659
+ *
660
+ * Hard-refuses unsafe configurations:
661
+ * - `tracing: 'off'` when a driver is wired (improvement is unattributable)
662
+ * - `autoOnPromote: 'config'` — DEFERRED to Pass B; v0.40 only ships
663
+ * `'pr'` and `'none'`.
664
+ */
665
+
666
+ interface RunImprovementLoopOptions<TScenario extends Scenario, TArtifact> extends RunOptimizationOptions<TScenario, TArtifact> { // Options accepted by `runImprovementLoop`: all `RunOptimizationOptions` fields plus the holdout set, gate, and promotion settings below.
667
+ /** Holdout scenarios kept OUT of the training optimization pool — used
668
+ * ONLY to score baseline vs winner for the gate. */
669
+ holdoutScenarios: TScenario[];
670
+ /** Promotion gate. Substrate strongly recommends `defaultProductionGate`
671
+ * for production wiring (composes red-team / reward-hacking / canary /
672
+ * heldout). */
673
+ gate: Gate<TArtifact, TScenario>;
674
+ /** What to do when the gate ships:
675
+ * - `'pr'`: open a PR via `openAutoPr`
676
+ * - `'none'`: just report — caller decides what to do with the winner
677
+ * v0.40 does NOT support `'config'` (live-runtime self-mutation) —
678
+ * deferred to Pass B behind safety stack. */
679
+ autoOnPromote: 'pr' | 'none';
680
+ /** GH owner / repo for the auto-PR. Required when autoOnPromote === 'pr'. Both fields are optional in the type, so this requirement is not enforced by the compiler — NOTE(review): presumably checked at runtime; confirm. */
681
+ ghOwner?: string;
682
+ ghRepo?: string;
683
+ /** Optional render override — substrate writes a diff-shaped surface; pass
684
+ * a function to format the promoted surface differently. Argument order is (winner, baseline). */
685
+ renderPromotedDiff?: (winnerSurface: MutableSurface, baselineSurface: MutableSurface) => string;
686
+ }
687
+ interface RunImprovementLoopResult<TArtifact, TScenario extends Scenario> extends RunOptimizationResult<TArtifact, TScenario> { // Value the promise returned by `runImprovementLoop` resolves to: all `RunOptimizationResult` fields plus the holdout campaigns, gate result, and optional PR result.
688
+ baselineOnHoldout: CampaignResult<TArtifact, TScenario>; // NOTE(review): presumably the baseline surface scored on `holdoutScenarios` — confirm.
689
+ winnerOnHoldout: CampaignResult<TArtifact, TScenario>; // NOTE(review): presumably the winner surface scored on `holdoutScenarios` — confirm.
690
+ gateResult: Awaited<ReturnType<Gate<TArtifact, TScenario>['decide']>>; // The awaited return type of the gate's `decide` method.
691
+ prResult?: ReturnType<typeof openAutoPr>; // Typed as the raw return type of `openAutoPr`, with no `Awaited<>` (unlike `gateResult`). NOTE(review): if `openAutoPr` is async, this field is typed as a Promise rather than the resolved result — confirm which is intended.
692
+ }
693
+ declare function runImprovementLoop<TScenario extends Scenario, TArtifact>(opts: RunImprovementLoopOptions<TScenario, TArtifact>): Promise<RunImprovementLoopResult<TArtifact, TScenario>>; // Async entry point: takes `RunImprovementLoopOptions` and resolves to a `RunImprovementLoopResult`. Declaration only; the implementation is not part of this file.
694
+
695
+ export { type CampaignAggregates, type CampaignArtifactWriter, type CampaignCellResult, type CampaignCostMeter, type CampaignResult, type CampaignTraceWriter, type CodeSurface, type DefaultProductionGateOptions, type DispatchContext, type DispatchFn, type EvolutionaryDriverOptions, FsLabeledScenarioStore, type FsLabeledScenarioStoreOptions, type Gate, type GateContext, type GateDecision, type GateResult, type GenerationRecord, type HeldOutGateOptions, type ImprovementDriver, type JudgeAggregate, type JudgeConfig, type JudgeDimension, type JudgeScore, type LabeledScenarioRecord, type LabeledScenarioSampleArgs, type LabeledScenarioSource, type LabeledScenarioStore, LabeledScenarioStoreError, type LabeledScenarioWrite, type MutableSurface, type Mutator, type OpenAutoPrOptions, type OpenAutoPrResult, type OptimizerConfig, type RedactionStatus, type RunCampaignOptions, type RunEvalOptions, type RunImprovementLoopOptions, type RunImprovementLoopResult, type RunOptimizationOptions, type RunOptimizationResult, type Scenario, type ScenarioAggregate, type SessionScript, type TraceSpan, composeGate, defaultProductionGate, evolutionaryDriver, heldOutGate, openAutoPr, runCampaign, runEval, runImprovementLoop, runOptimization, surfaceHash };