@tangle-network/agent-eval 0.77.0 → 0.80.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/README.md +50 -19
  2. package/dist/adapters/http.d.ts +2 -2
  3. package/dist/adapters/langchain.d.ts +2 -2
  4. package/dist/adapters/otel.d.ts +4 -4
  5. package/dist/{agent-profile-DYRboYWu.d.ts → agent-profile-aSEaJ9Pl.d.ts} +1 -1
  6. package/dist/analyst/index.d.ts +42 -8
  7. package/dist/analyst/index.js +32 -2
  8. package/dist/analyst/index.js.map +1 -1
  9. package/dist/authenticity/index.d.ts +54 -1
  10. package/dist/authenticity/index.js +88 -1
  11. package/dist/authenticity/index.js.map +1 -1
  12. package/dist/belief-state/index.d.ts +188 -0
  13. package/dist/belief-state/index.js +486 -0
  14. package/dist/belief-state/index.js.map +1 -0
  15. package/dist/benchmarks/index.d.ts +2 -2
  16. package/dist/calibration-Cpr3WaX3.d.ts +101 -0
  17. package/dist/campaign/index.d.ts +11 -11
  18. package/dist/campaign/index.js +4 -4
  19. package/dist/chunk-4DIJWVUT.js +131 -0
  20. package/dist/chunk-4DIJWVUT.js.map +1 -0
  21. package/dist/{chunk-7W4SM7FD.js → chunk-5LVWPNS5.js} +91 -91
  22. package/dist/chunk-5LVWPNS5.js.map +1 -0
  23. package/dist/{chunk-WYIHD6EB.js → chunk-CF67I6QY.js} +1 -1
  24. package/dist/chunk-CF67I6QY.js.map +1 -0
  25. package/dist/{chunk-XPILG2CA.js → chunk-GXHLRXDI.js} +2 -2
  26. package/dist/{chunk-F3SRAAZO.js → chunk-KWRRMR3J.js} +15 -1
  27. package/dist/chunk-KWRRMR3J.js.map +1 -0
  28. package/dist/chunk-NPCTHQIO.js +91 -0
  29. package/dist/chunk-NPCTHQIO.js.map +1 -0
  30. package/dist/{chunk-JYE3WOTE.js → chunk-RPLZ4OIB.js} +10 -1
  31. package/dist/chunk-RPLZ4OIB.js.map +1 -0
  32. package/dist/{chunk-6EKXFFGQ.js → chunk-RTWFUK6A.js} +2 -2
  33. package/dist/{chunk-XGNCBAVZ.js → chunk-XQL22JDG.js} +2 -2
  34. package/dist/{chunk-GJJNJVIR.js → chunk-XXNIODOM.js} +2 -2
  35. package/dist/contract/index.d.ts +128 -15
  36. package/dist/contract/index.js +118 -2
  37. package/dist/contract/index.js.map +1 -1
  38. package/dist/{control-BgA6BYTm.d.ts → control-CehLtoET.d.ts} +1 -1
  39. package/dist/control.d.ts +2 -2
  40. package/dist/control.js +2 -2
  41. package/dist/governance/index.d.ts +1 -1
  42. package/dist/hosted/index.d.ts +4 -4
  43. package/dist/{index-DsnOpCO6.d.ts → index-B1RKber3.d.ts} +1 -1
  44. package/dist/index.d.ts +127 -26
  45. package/dist/index.js +32 -7
  46. package/dist/index.js.map +1 -1
  47. package/dist/{insight-report-Df3lxYXM.d.ts → insight-report-dlpEzQDi.d.ts} +1 -1
  48. package/dist/{kind-factory-DW9XWPvM.d.ts → kind-factory-DqV2t1Xk.d.ts} +1 -1
  49. package/dist/meta-eval/index.d.ts +6 -99
  50. package/dist/meta-eval/index.js +7 -76
  51. package/dist/meta-eval/index.js.map +1 -1
  52. package/dist/off-policy-DiwuKKg7.d.ts +132 -0
  53. package/dist/openapi.json +1 -1
  54. package/dist/{outcome-store-D6KWmYvj.d.ts → outcome-store-rnXLEqSn.d.ts} +1 -1
  55. package/dist/{provenance-B-TFszPW.d.ts → provenance-jG-Gngg8.d.ts} +3 -3
  56. package/dist/{registry-DuVYiTvw.d.ts → registry-BK0Zee01.d.ts} +1 -1
  57. package/dist/{release-report-CN8hJlhk.d.ts → release-report-CXXZlR8g.d.ts} +2 -2
  58. package/dist/reporting.d.ts +5 -5
  59. package/dist/{researcher-C_KJyIGg.d.ts → researcher-rInLj9De.d.ts} +2 -2
  60. package/dist/rl.d.ts +10 -140
  61. package/dist/rl.js +8 -122
  62. package/dist/rl.js.map +1 -1
  63. package/dist/{rubric-predictive-validity-D_4BSXGV.d.ts → rubric-predictive-validity-CLPuwiUw.d.ts} +2 -2
  64. package/dist/{run-improvement-loop-BqYH2vCR.d.ts → run-improvement-loop-BAl_aVOZ.d.ts} +2 -4
  65. package/dist/{run-record-BgTFzO2r.d.ts → run-record-sItO5ftF.d.ts} +11 -0
  66. package/dist/{semantic-concept-judge-CV9Wlx4t.d.ts → semantic-concept-judge-qXEUV2w7.d.ts} +3 -3
  67. package/dist/{summary-report-ByiOUrHj.d.ts → summary-report-BTaXq1TS.d.ts} +1 -1
  68. package/dist/traces.d.ts +1 -1
  69. package/dist/traces.js +2 -2
  70. package/dist/{types-Bba0vl1V.d.ts → types-4mm2msnR.d.ts} +12 -4
  71. package/dist/{types-CRD68aH7.d.ts → types-DRvV0zRo.d.ts} +10 -1
  72. package/dist/workflow/index.d.ts +4 -4
  73. package/dist/workflow/index.js +1 -1
  74. package/docs/auto-research-loop-end-to-end.md +1 -1
  75. package/docs/feature-guide.md +4 -4
  76. package/docs/multi-shot-optimization.md +61 -115
  77. package/docs/product-eval-adoption.md +1 -1
  78. package/docs/research/belief-state-agent-eval-roadmap.md +558 -0
  79. package/docs/research/research-roadmap.md +1 -0
  80. package/docs/three-package-architecture.md +1 -1
  81. package/docs/trace-analysis.md +19 -0
  82. package/package.json +7 -2
  83. package/dist/chunk-7W4SM7FD.js.map +0 -1
  84. package/dist/chunk-F3SRAAZO.js.map +0 -1
  85. package/dist/chunk-JYE3WOTE.js.map +0 -1
  86. package/dist/chunk-WYIHD6EB.js.map +0 -1
  87. /package/dist/{chunk-XPILG2CA.js.map → chunk-GXHLRXDI.js.map} +0 -0
  88. /package/dist/{chunk-6EKXFFGQ.js.map → chunk-RTWFUK6A.js.map} +0 -0
  89. /package/dist/{chunk-XGNCBAVZ.js.map → chunk-XQL22JDG.js.map} +0 -0
  90. /package/dist/{chunk-GJJNJVIR.js.map → chunk-XXNIODOM.js.map} +0 -0
@@ -1,5 +1,5 @@
1
- import { R as RunRecord } from './run-record-BgTFzO2r.js';
2
- import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
1
+ import { R as RunRecord } from './run-record-sItO5ftF.js';
2
+ import { b as OutcomeStore } from './outcome-store-rnXLEqSn.js';
3
3
 
4
4
  /**
5
5
  * Rubric predictive validity — does our eval rubric predict deployment
@@ -1,5 +1,5 @@
1
1
  import { L as LlmClientOptions } from './llm-client-DbjLfz-K.js';
2
- import { I as ImprovementDriver, S as Scenario, f as CampaignResult, k as GateResult, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, g as CampaignTraceWriter, m as GenerationRecord, M as MutableSurface, P as ParetoParent, G as Gate } from './types-Bba0vl1V.js';
2
+ import { I as ImprovementDriver, S as Scenario, g as CampaignResult, k as GateResult, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, h as CampaignTraceWriter, m as GenerationRecord, M as MutableSurface, P as ParetoParent, G as Gate } from './types-4mm2msnR.js';
3
3
 
4
4
  /**
5
5
  * @experimental
@@ -28,9 +28,7 @@ import { I as ImprovementDriver, S as Scenario, f as CampaignResult, k as GateRe
28
28
  *
29
29
  * The driver is surface-agnostic — any string surface in any consumer opts
30
30
  * in by selecting it. Reuses the generic reflection primitive
31
- * (`buildReflectionPrompt` / `parseReflectionResponse`) and the router
32
- * client; no dependency on the legacy `runMultiShotOptimization` /
33
- * `prompt-evolution` orchestration.
31
+ * (`buildReflectionPrompt` / `parseReflectionResponse`) and the router client.
34
32
  *
35
33
  * Earns its keep where there is real per-instance signal (which the
36
34
  * dimensional + per-scenario evidence + the `LabeledScenarioStore` flywheel
@@ -200,6 +200,17 @@ interface RunOutcome {
200
200
  * these records as input. Optional — single-judge or scalar-only
201
201
  * runs leave it unset. */
202
202
  judgeScores?: JudgeScoresRecord;
203
+ /** Authenticity / realness verdict — did the run build the REAL thing on the
204
+ * intended infra, or fake it (see `./authenticity`)? Optional: only domains
205
+ * with an authenticity config populate it. Carried in the corpus so the
206
+ * flywheel / off-policy learning can optimize for real completion, not gamed
207
+ * pass-rate. `score` is 0-1; `gated` is the anti-Goodhart flag — a gated run
208
+ * must not count as a real success regardless of `score`. */
209
+ realness?: {
210
+ score: number;
211
+ gated: boolean;
212
+ reason?: string;
213
+ };
203
214
  }
204
215
  /**
205
216
  * Mandatory paper-grade fields for a single evaluation run. Optional
@@ -1,8 +1,8 @@
1
1
  import { AxAIService } from '@ax-llm/ax';
2
- import { c as TraceAnalystKindSpec } from './kind-factory-DW9XWPvM.js';
3
- import { b as AnalystRegistryOptions, a as AnalystRegistry } from './registry-DuVYiTvw.js';
2
+ import { c as TraceAnalystKindSpec } from './kind-factory-DqV2t1Xk.js';
3
+ import { b as AnalystRegistryOptions, a as AnalystRegistry } from './registry-BK0Zee01.js';
4
4
  import { z } from 'zod';
5
- import { c as AnalystFinding, A as Analyst, a as AnalystContext } from './types-CRD68aH7.js';
5
+ import { c as AnalystFinding, A as Analyst, a as AnalystContext } from './types-DRvV0zRo.js';
6
6
  import { a as TraceAnalystSpan } from './store-GmBE2pZZ.js';
7
7
  import { L as LlmClientOptions } from './llm-client-DbjLfz-K.js';
8
8
  import { S as Severity } from './multi-layer-verifier-DlWCXuxL.js';
@@ -1,4 +1,4 @@
1
- import { R as RunRecord } from './run-record-BgTFzO2r.js';
1
+ import { R as RunRecord } from './run-record-sItO5ftF.js';
2
2
  import { F as FailureClusterReport } from './failure-cluster-CL7IVgkJ.js';
3
3
 
4
4
  /**
package/dist/traces.d.ts CHANGED
@@ -14,7 +14,7 @@ import { A as AnalyzeTracesOptions, b as AnalyzeTracesResult } from './analyst-t
14
14
  export { a as AnalyzeTracesInput, c as AnalyzeTracesTurnSnapshot, d as analyzeTraces } from './analyst-t7zZS3TV.js';
15
15
  import { h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, T as TraceAnalysisStore, g as TraceAnalystFilters, b as DatasetOverview, Q as QueryTracesPage, l as ViewTraceResult, V as ViewSpansResult, c as SearchTraceResult, S as SearchSpanResult } from './store-GmBE2pZZ.js';
16
16
  export { D as DEFAULT_TRACE_ANALYST_BUDGETS, d as SpanMatchRecord, e as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, f as TraceAnalystByteBudgets, a as TraceAnalystSpan, j as TraceAnalystTraceSummary, k as ViewTraceOversized } from './store-GmBE2pZZ.js';
17
- import { b as RunSplitTag, a as RunTokenUsage, R as RunRecord } from './run-record-BgTFzO2r.js';
17
+ import { b as RunSplitTag, a as RunTokenUsage, R as RunRecord } from './run-record-sItO5ftF.js';
18
18
  import { AxFunction } from '@ax-llm/ax';
19
19
 
20
20
  /**
package/dist/traces.js CHANGED
@@ -25,7 +25,7 @@ import {
25
25
  scoreTraceInsightReadiness,
26
26
  tokenizeDomainWords,
27
27
  traceAnalystOnRunComplete
28
- } from "./chunk-XGNCBAVZ.js";
28
+ } from "./chunk-XQL22JDG.js";
29
29
  import {
30
30
  DEFAULT_REDACTION_RULES,
31
31
  REDACTION_VERSION,
@@ -86,7 +86,7 @@ import {
86
86
  defaultProviderRedactor,
87
87
  providerFromBaseUrl
88
88
  } from "./chunk-PC4UYEBM.js";
89
- import "./chunk-F3SRAAZO.js";
89
+ import "./chunk-KWRRMR3J.js";
90
90
  import {
91
91
  TraceEmitter,
92
92
  llmSpanFromProvider
@@ -1,4 +1,4 @@
1
- import { a as RunTokenUsage } from './run-record-BgTFzO2r.js';
1
+ import { a as RunTokenUsage } from './run-record-sItO5ftF.js';
2
2
 
3
3
  /**
4
4
  * @experimental
@@ -163,8 +163,8 @@ interface ParetoParent {
163
163
  }
164
164
  /** @experimental Stateless surface mutation — given findings + current
165
165
  * surface, return N candidate surfaces. Pure transform, no generation
166
- * awareness. Reflective-mutation, `runMultiShotOptimization`, `AxGEPA`
167
- * conform. Wrapped by `evolutionaryDriver` to become an `ImprovementDriver`. */
166
+ * awareness. Reflective-mutation and `AxGEPA` mutators conform. Wrapped by
167
+ * `evolutionaryDriver` to become an `ImprovementDriver`. */
168
168
  interface Mutator<TFindings = unknown> {
169
169
  kind: string;
170
170
  mutate(args: {
@@ -206,6 +206,14 @@ interface ProposeContext<TFindings = unknown> {
206
206
  * scenarios) into a merged candidate. Drivers doing pure single-parent
207
207
  * reflection may ignore it. See {@link ParetoParent}. */
208
208
  paretoParents?: ParetoParent[];
209
+ /** FIREWALL (non-negotiable): the held-out judge is write-only — its verdicts
210
+ * score the chosen output and gate promotion, and are NEVER an input to
211
+ * proposal/steering (else the optimizer games the acceptance axis = an
212
+ * oracle). This `never`-typed field makes that a compile-time tripwire: a
213
+ * driver that tries to thread judge verdicts into the proposal will not type.
214
+ * Steering may consume TRACE-OBSERVABLE signals (what the agent did) via
215
+ * `findings`/`report`; it may NOT consume the judge's held-out verdict. */
216
+ judgeScores?: never;
209
217
  }
210
218
  /** @experimental A surface-improvement strategy — the DRIVER of the
211
219
  * improvement loop. Given the current best surface, the history of what's
@@ -489,4 +497,4 @@ interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scena
489
497
  scenarios: Array<Pick<TScenario, 'id' | 'kind'>>;
490
498
  }
491
499
 
492
- export { isProposedCandidate as A, labelTrustRank as B, type CampaignAggregates as C, type DispatchFn as D, type Gate as G, type ImprovementDriver as I, type JudgeScore as J, type LabeledScenarioStore as L, type MutableSurface as M, type OptimizerConfig as O, type ParetoParent as P, type RedactionStatus as R, type Scenario as S, type TraceSpan as T, type JudgeConfig as a, type DispatchContext as b, type CampaignArtifactWriter as c, type CampaignCellResult as d, type CampaignCostMeter as e, type CampaignResult as f, type CampaignTraceWriter as g, type CodeSurface as h, type GateContext as i, type GateDecision as j, type GateResult as k, type GenerationCandidate as l, type GenerationRecord as m, type JudgeDimension as n, type Mutator as o, type SessionScript as p, type ProposeContext as q, type LabeledScenarioWrite as r, type LabeledScenarioSampleArgs as s, type LabeledScenarioRecord as t, type LabelTrust as u, type LabeledScenarioSource as v, type CampaignTokenUsage as w, type JudgeAggregate as x, type ProposedCandidate as y, type ScenarioAggregate as z };
500
+ export { isProposedCandidate as A, labelTrustRank as B, type CampaignAggregates as C, type DispatchFn as D, type Gate as G, type ImprovementDriver as I, type JudgeScore as J, type LabeledScenarioStore as L, type MutableSurface as M, type OptimizerConfig as O, type ParetoParent as P, type RedactionStatus as R, type Scenario as S, type TraceSpan as T, type JudgeConfig as a, type DispatchContext as b, type GateDecision as c, type CampaignArtifactWriter as d, type CampaignCellResult as e, type CampaignCostMeter as f, type CampaignResult as g, type CampaignTraceWriter as h, type CodeSurface as i, type GateContext as j, type GateResult as k, type GenerationCandidate as l, type GenerationRecord as m, type JudgeDimension as n, type Mutator as o, type SessionScript as p, type ProposeContext as q, type LabeledScenarioWrite as r, type LabeledScenarioSampleArgs as s, type LabeledScenarioRecord as t, type LabelTrust as u, type LabeledScenarioSource as v, type CampaignTokenUsage as w, type JudgeAggregate as x, type ProposedCandidate as y, type ScenarioAggregate as z };
@@ -1,4 +1,4 @@
1
- import { R as RunRecord } from './run-record-BgTFzO2r.js';
1
+ import { R as RunRecord } from './run-record-sItO5ftF.js';
2
2
  import { T as TraceAnalysisStore } from './store-GmBE2pZZ.js';
3
3
  import { a as JudgeInput } from './types-Croy5h7V.js';
4
4
  import { b as LlmCallRequest, c as LlmCallResult } from './llm-client-DbjLfz-K.js';
@@ -146,6 +146,15 @@ interface AnalystFinding {
146
146
  * diff cleanly across runs.
147
147
  */
148
148
  subject?: string;
149
+ /** FIREWALL provenance (docs/learning-flywheel.md): true iff this finding was
150
+ * lifted from a JUDGE verdict (an acceptance score), not OBSERVED from the
151
+ * agent's behavior. A judge-derived finding must NEVER be admitted as a
152
+ * steering input — that is the held-out judge leaking into the loop. Set at
153
+ * the lift site (createJudgeAdapter); checked by `assertNoJudgeVerdict`.
154
+ * Provenance, not evidence presence, is the correct discriminator: an
155
+ * evidence-less trace-analyst observation legitimately steers, while a judge
156
+ * verdict that happens to cite an artifact must not. */
157
+ derived_from_judge?: boolean;
149
158
  /** Analyst-private extras; renderers ignore unless they know the analyst. */
150
159
  metadata?: Record<string, unknown>;
151
160
  }
@@ -1,7 +1,7 @@
1
1
  import { W as WorkflowTopology } from '../harness-optimizer-EnEnQPsr.js';
2
- import { b as RunSplitTag, a as RunTokenUsage, R as RunRecord } from '../run-record-BgTFzO2r.js';
3
- import { c as AnalystFinding, h as AnalystSeverity, E as EvidenceRef } from '../types-CRD68aH7.js';
4
- import { F as FailureClusterInsight } from '../insight-report-Df3lxYXM.js';
2
+ import { b as RunSplitTag, a as RunTokenUsage, R as RunRecord } from '../run-record-sItO5ftF.js';
3
+ import { c as AnalystFinding, h as AnalystSeverity, E as EvidenceRef } from '../types-DRvV0zRo.js';
4
+ import { F as FailureClusterInsight } from '../insight-report-dlpEzQDi.js';
5
5
  import { a as VerificationReport, L as LayerResult } from '../multi-layer-verifier-DlWCXuxL.js';
6
6
  import { F as FailureClusterReport } from '../failure-cluster-CL7IVgkJ.js';
7
7
  import { R as RedactionRule, a as RedactionReport } from '../redact-B40YG2M_.js';
@@ -18,7 +18,7 @@ import '../types-Croy5h7V.js';
18
18
  import '@tangle-network/tcloud';
19
19
  import '../llm-client-DbjLfz-K.js';
20
20
  import '../raw-provider-sink-C46HDghv.js';
21
- import '../summary-report-ByiOUrHj.js';
21
+ import '../summary-report-BTaXq1TS.js';
22
22
  import '../judge-calibration-DilmB3Ml.js';
23
23
  import '../control-runtime-DuFBYg7A.js';
24
24
  import '../emitter-DEZwY14K.js';
@@ -7,7 +7,7 @@ import {
7
7
  } from "../chunk-GGE4NNQT.js";
8
8
  import {
9
9
  validateRunRecord
10
- } from "../chunk-F3SRAAZO.js";
10
+ } from "../chunk-KWRRMR3J.js";
11
11
  import "../chunk-VSMTAMNK.js";
12
12
  import {
13
13
  ValidationError
@@ -152,7 +152,7 @@ async function runAutoResearchLoop(opts: {
152
152
  Two cases:
153
153
 
154
154
  1. **Trajectory-shaped optimization with steering.** Use
155
- `runMultiShotOptimization` directly — it already runs the inner
155
+ `runImprovementLoop` directly — it already runs the inner
156
156
  search-vs-holdout loop. Wrap with `analyzeOptimizationResult` after
157
157
  for the RL bridge.
158
158
 
@@ -33,8 +33,8 @@ trying, and whether a change made them better or worse.
33
33
  | “Human feedback should become reusable eval data.” | `FeedbackTrajectory` | Captures approvals, rejections, edits, choices, metrics, and policy blocks. |
34
34
  | “Can this action run, or does it need approval?” | `evaluateActionPolicy` | Generic preflight for side effects, budgets, and required evidence. |
35
35
  | “I need train/dev/test/holdout examples.” | `Dataset` plus feedback trajectory conversion | Stable splits and contamination control. |
36
- | “Which prompt or signature wins?” | `runMultiShotOptimization`, steering optimizers | Runs variants on scenarios and compares scores. |
37
- | “Improve a multi-turn agent over real task traces.” | `runMultiShotOptimization` | GEPA-style trajectory optimization with ASI and held-out promotion. |
36
+ | “Which prompt or signature wins?” | `runImprovementLoop`, steering optimizers | Runs variants on scenarios and compares scores. |
37
+ | “Improve a multi-turn agent over real task traces.” | `runImprovementLoop` | GEPA-style trajectory optimization with ASI and held-out promotion. |
38
38
  | “Improve prompts, then code if prompts plateau.” | `runPromptEvolution`, composite mutator, code mutator | Bounded evolution with telemetry and lineage. |
39
39
  | “Find why a regression happened.” | bisector, traces, run records | Narrows changes and preserves evidence. |
40
40
  | “Expose evals to another language.” | Wire protocol and Python client | HTTP/RPC boundary for non-TypeScript apps. |
@@ -105,7 +105,7 @@ generated code -> build/test/runtime gates -> score -> ship or revise
105
105
 
106
106
  Use when you want Ax/GEPA-style improvement.
107
107
 
108
- 1. For variable-length agent tasks, use `runMultiShotOptimization`.
108
+ 1. For variable-length agent tasks, use `runImprovementLoop`.
109
109
  2. Build search/dev/test/holdout splits from the real product loop.
110
110
  3. Score full trajectories, not just final text.
111
111
  4. Emit actionable side information for failures the mutator can fix.
@@ -156,7 +156,7 @@ Store as `FeedbackTrajectory`, then derive:
156
156
  | Feedback data | `FeedbackTrajectory`, stores, converters | Human/environment labels | Domain adapters live in downstream repos. |
157
157
  | Action policy | `evaluateActionPolicy` | Approval/budget preflight | Blocks or labels actions before `act()`. |
158
158
  | Datasets | `Dataset`, holdout tools, canaries | Train/dev/test/holdout corpora | Keeps optimization honest. |
159
- | Optimization | `runMultiShotOptimization`, steering optimizers | Prompt/signature comparison | Use held-out gates before promotion. |
159
+ | Optimization | `runImprovementLoop`, steering optimizers | Prompt/signature comparison | Use held-out gates before promotion. |
160
160
  | Evolution | prompt/code mutators, sandbox pool, telemetry | Autoresearch and mutation loops | Use budgets and lineage; do not run unbounded. |
161
161
  | Telemetry | `TraceStore`, OTLP, file sinks | Audit and replay | Treat traces as evidence, not just logs. |
162
162
  | Reporting | summaries, pareto, cost tracker | Decision support | Useful for PRs, launch gates, research notes. |
@@ -1,129 +1,75 @@
1
1
  # Multi-Shot Optimization
2
2
 
3
- `runMultiShotOptimization` is the public adapter for GEPA-style optimization over
4
- variable-length agent conversations.
3
+ > **Renamed.** `runMultiShotOptimization` was retired. The live API is
4
+ > `runImprovementLoop` (driver-agnostic, gated promotion) driven by `gepaDriver`,
5
+ > with `compareDrivers` for head-to-head driver lift. This doc was rewritten to the
6
+ > live API; see also [feature-guide.md](./feature-guide.md) and [concepts.md](./concepts.md).
5
7
 
6
- Use it when the thing you want to improve is not a single model call. Typical
7
- targets are agent system prompts, tool descriptions, routing policies, retrieval
8
- plans, or app-specific scaffolding that affects an entire task trajectory.
8
+ `runImprovementLoop` is the public entry for GEPA-style optimization over a whole
9
+ task trajectory — the thing you improve is not a single model call but an agent
10
+ system prompt, tool descriptions, a routing policy, or any scaffolding that affects
11
+ the entire run. It is the OUTER loop: it improves the SURFACE the inner workers run.
9
12
 
10
- The primitive is intentionally small. Your app owns the domain logic:
13
+ ## The shape
11
14
 
12
- - `seedVariants`: prompt/config/tool-policy candidates
13
- - `runner`: executes one complete task trajectory for one variant
14
- - `scorer`: scores the trajectory and emits actionable side information
15
- - `mutateAdapter`: proposes new variants from top and bottom trials
15
+ You own a few seams; the loop owns the release-critical glue (paired seeds, the
16
+ held-out re-score, the promotion gate, provenance):
16
17
 
17
- `agent-eval` owns the release-critical glue:
18
+ - **`baselineSurface`** — the current surface (a prompt string, or a `CodeSurface`).
19
+ - **`dispatchWithSurface(surface, scenario, ctx)`** — run one task to completion
20
+ under a candidate surface; return the artifact the judges score.
21
+ - **`judges`** — score the artifact (`{ composite, dimensions }`).
22
+ - **`driver`** — proposes candidate surfaces each generation: `gepaDriver`
23
+ (reflective + Pareto frontier) or `evolutionaryDriver` (mutator).
24
+ - **`gate`** — `defaultProductionGate` (held-out significance + red-team +
25
+ reward-hacking + canary). Ships ONLY on a CI-lower-bound held-out lift.
18
26
 
19
- - stable paired seeds
20
- - search-split prompt evolution
21
- - cost/score Pareto objectives
22
- - failed-run conversion into failed trials
23
- - ASI projection into reflection traces and numeric metrics
24
- - optional paired holdout gating through `HeldOutGate`
25
- - validated `RunRecord` rows for promotion evidence
26
-
27
- ## Result Contract
28
-
29
- The return shape separates discovery from promotion:
30
-
31
- - `searchBestVariant`: best variant on the optimizer-visible search scenarios
32
- - `searchBestAggregate`: aggregate for that search winner
33
- - `promotedVariant`: variant callers should ship
34
- - `promotedAggregate`: aggregate for the promoted variant
35
- - `gate`: holdout decision and evidence, or `null` when no gate ran
36
-
37
- If a holdout gate is configured and rejects the search winner,
38
- `promotedVariant` is the baseline. Do not ship `searchBestVariant` directly
39
- unless you intentionally run without a holdout gate.
40
-
41
- ## Actionable Side Information
42
-
43
- The scorer should return `asi` rows for concrete failure modes:
44
-
45
- ```ts
46
- {
47
- expectationId: 'used-primary-sources',
48
- message: 'The final answer cited secondary summaries instead of primary sources.',
49
- severity: 'error',
50
- responsibleSurface: 'retrieval-policy',
51
- suggestion: 'Prefer primary-source domains during source-gathering turns.',
52
- }
53
- ```
54
-
55
- Standard knowledge-related responsible surfaces are:
56
-
57
- - `knowledge-requirements`
58
- - `data-acquisition`
59
- - `retrieval-policy`
60
- - `user-question-policy`
61
-
62
- These rows become:
63
-
64
- - reflection expectations via `trialTraceFromMultiShotTrial`
65
- - aggregate metrics like `asi.error` and `surface.retrieval-policy`
66
- - trace evidence available to downstream reports
67
-
68
- This is the main reason to use this primitive instead of reducing each run to a
69
- single scalar reward.
70
-
71
- ## Holdout Discipline
72
-
73
- For release gates, configure `gate`. The first seed variant is the baseline and
74
- `gate.gate.baselineKey` must match its id.
75
-
76
- Holdout scenarios must be disjoint from `searchScenarioIds`. The adapter runs
77
- baseline and candidate with the same `(scenarioId, rep)` seed, validates every
78
- row with `validateRunRecord`, then asks `HeldOutGate` whether to promote.
79
-
80
- When `gate.searchScenarioIds` is omitted, the adapter reuses
81
- `searchScenarioIds` for the overfit-gap check.
82
-
83
- ## Minimal Shape
27
+ ## Minimal example
84
28
 
85
29
  ```ts
86
30
  import {
87
- runMultiShotOptimization,
88
- trialTraceFromMultiShotTrial,
89
- type MultiShotVariant,
90
- } from '@tangle-network/agent-eval'
91
-
92
- type Payload = { systemPrompt: string }
93
-
94
- const baseline: MultiShotVariant<Payload> = {
95
- id: 'baseline',
96
- label: 'baseline',
97
- generation: 0,
98
- payload: { systemPrompt: currentPrompt },
99
- }
100
-
101
- const result = await runMultiShotOptimization<Payload>({
102
- runId: `research-agent-${Date.now()}`,
103
- target: 'research-agent-system-prompt',
104
- seedVariants: [baseline],
105
- searchScenarioIds: searchScenarios.map((s) => s.id),
106
- reps: 2,
107
- generations: 4,
31
+ runImprovementLoop,
32
+ gepaDriver,
33
+ defaultProductionGate,
34
+ } from '@tangle-network/agent-eval/contract'
35
+
36
+ const result = await runImprovementLoop({
37
+ baselineSurface: currentSystemPrompt,
38
+ scenarios: trainScenarios, // optimizer-visible
39
+ holdoutScenarios, // DISJOINT — only the gate sees these
40
+ dispatchWithSurface: async (surface, scenario) =>
41
+ runYourAgentToCompletion({ scenario, prompt: String(surface) }),
42
+ judges: [myJudge],
43
+ driver: gepaDriver({
44
+ llm: { apiKey, baseUrl },
45
+ model: 'gpt-5',
46
+ target: 'enforce a strict output schema',
47
+ }),
108
48
  populationSize: 4,
109
- scoreConcurrency: 4,
110
- runner: {
111
- async run({ variant, scenarioId, seed }) {
112
- return runYourAgentToCompletion({ scenarioId, seed, prompt: variant.payload.systemPrompt })
113
- },
114
- },
115
- scorer: {
116
- async score({ run }) {
117
- return scoreFullTrajectory(run.trace)
118
- },
119
- },
120
- mutateAdapter: {
121
- async mutate({ parent, bottomTrials, childCount, generation }) {
122
- const traces = bottomTrials.map((t) => trialTraceFromMultiShotTrial(t))
123
- return proposePromptMutations({ parent, traces, childCount, generation })
124
- },
125
- },
49
+ maxGenerations: 4,
50
+ gate: defaultProductionGate({ holdoutScenarios, deltaThreshold: 0 }),
51
+ autoOnPromote: 'none', // or 'pr' (+ ghOwner/ghRepo) to open a PR on ship
52
+ runDir,
126
53
  })
127
54
 
128
- deploy(result.promotedVariant.payload)
55
+ if (result.gateResult.decision === 'ship') {
56
+ deploy(result.winnerSurface) // the driver's proposal, gated on a real held-out lift
57
+ }
129
58
  ```
59
+
60
+ ## Discipline (what makes it trustworthy)
61
+
62
+ - **Holdout is disjoint + gated.** `holdoutScenarios` must not overlap the training
63
+ pool. The gate re-scores baseline vs winner on the holdout and ships only when the
64
+ paired-bootstrap CI lower bound clears `deltaThreshold`; a few-instance swing at
65
+ thin `n` is held (`few_runs`), not promoted.
66
+ - **No-op never ships.** If no candidate beats the baseline, the winner IS the
67
+ baseline (empty diff) and the loop forces `hold` — it does not score
68
+ baseline-vs-itself and read model noise as lift.
69
+ - **Provenance falls out.** `result.promotedDiff` + `emitLoopProvenance` give the
70
+ auditable candidate→gate→promote chain (rationale, content hashes, a held-out lift
71
+ recomputable from the emitted record).
72
+
73
+ Reach for `compareDrivers` when the question is "which DRIVER wins" rather than
74
+ "improve this surface", and see `tests/campaign/presets.test.ts` for the executable
75
+ contract (no-op guard, fail-loud holdout, gate promotion).
@@ -152,7 +152,7 @@ set with a signed note.
152
152
 
153
153
  ## Optimization
154
154
 
155
- Use `runMultiShotOptimization()` when the system is a multi-step agent, not a
155
+ Use `runImprovementLoop()` when the system is a multi-step agent, not a
156
156
  single prompt.
157
157
 
158
158
  Good optimization targets: