@tangle-network/agent-eval 0.72.0 → 0.72.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. package/CHANGELOG.md +39 -0
  2. package/dist/adapters/http.d.ts +1 -1
  3. package/dist/adapters/langchain.d.ts +1 -1
  4. package/dist/adapters/otel.d.ts +3 -2
  5. package/dist/agent-profile-DYRboYWu.d.ts +364 -0
  6. package/dist/analyst/index.d.ts +221 -0
  7. package/dist/analyst/index.js +371 -0
  8. package/dist/analyst/index.js.map +1 -0
  9. package/dist/analyst-t7zZS3TV.d.ts +88 -0
  10. package/dist/campaign/index.d.ts +485 -9
  11. package/dist/campaign/index.js +597 -22
  12. package/dist/campaign/index.js.map +1 -1
  13. package/dist/chunk-7W4SM7FD.js +1075 -0
  14. package/dist/chunk-7W4SM7FD.js.map +1 -0
  15. package/dist/{chunk-AIWHLG7J.js → chunk-GJJNJVIR.js} +11 -11
  16. package/dist/chunk-JHA3ZGSO.js +1496 -0
  17. package/dist/chunk-JHA3ZGSO.js.map +1 -0
  18. package/dist/{chunk-4QJN7RDX.js → chunk-JYE3WOTE.js} +55 -7
  19. package/dist/{chunk-4QJN7RDX.js.map → chunk-JYE3WOTE.js.map} +1 -1
  20. package/dist/chunk-LB2UOI5F.js +412 -0
  21. package/dist/chunk-LB2UOI5F.js.map +1 -0
  22. package/dist/{chunk-ODGETRTM.js → chunk-VUINJM5M.js} +234 -1415
  23. package/dist/chunk-VUINJM5M.js.map +1 -0
  24. package/dist/chunk-WYIHD6EB.js +1044 -0
  25. package/dist/chunk-WYIHD6EB.js.map +1 -0
  26. package/dist/{chunk-UD6EF73X.js → chunk-XPILG2CA.js} +119 -2
  27. package/dist/chunk-XPILG2CA.js.map +1 -0
  28. package/dist/contract/index.d.ts +17 -13
  29. package/dist/contract/index.js +13 -7
  30. package/dist/contract/index.js.map +1 -1
  31. package/dist/{control-DxvZeV5X.d.ts → control-BgA6BYTm.d.ts} +1 -1
  32. package/dist/control.d.ts +2 -2
  33. package/dist/{feedback-trajectory-8hKC5EOb.d.ts → feedback-trajectory-B3rErRsh.d.ts} +1 -1
  34. package/dist/harness-optimizer-EnEnQPsr.d.ts +106 -0
  35. package/dist/hosted/index.d.ts +223 -2
  36. package/dist/index.d.ts +49 -1323
  37. package/dist/index.js +353 -2496
  38. package/dist/index.js.map +1 -1
  39. package/dist/{index-BGBrVS24.d.ts → insight-report-Df3lxYXM.d.ts} +1 -221
  40. package/dist/kind-factory-DW9XWPvM.d.ts +172 -0
  41. package/dist/multi-layer-verifier-DlWCXuxL.d.ts +141 -0
  42. package/dist/openapi.json +1 -1
  43. package/dist/pareto-E-pembql.d.ts +81 -0
  44. package/dist/{provenance-C69gLUXH.d.ts → provenance-B-TFszPW.d.ts} +131 -4
  45. package/dist/redact-B40YG2M_.d.ts +45 -0
  46. package/dist/registry-DuVYiTvw.d.ts +128 -0
  47. package/dist/{researcher-WJvIpX3L.d.ts → researcher-C_KJyIGg.d.ts} +1 -141
  48. package/dist/rl.d.ts +4 -3
  49. package/dist/rl.js +4 -4
  50. package/dist/run-critic-BAIjX99r.d.ts +56 -0
  51. package/dist/{run-improvement-loop-Bzamo6GB.d.ts → run-improvement-loop-BqYH2vCR.d.ts} +25 -1
  52. package/dist/semantic-concept-judge-CV9Wlx4t.d.ts +650 -0
  53. package/dist/{store-jzKpMl16.d.ts → store-GmBE2pZZ.d.ts} +1 -1
  54. package/dist/traces.d.ts +371 -308
  55. package/dist/traces.js +43 -18
  56. package/dist/{types-CnmZ2bkP.d.ts → types-Bba0vl1V.d.ts} +1 -1
  57. package/dist/{registry-BGKyX6bw.d.ts → types-CRD68aH7.d.ts} +3 -128
  58. package/dist/wire/index.d.ts +1 -1
  59. package/dist/workflow/index.d.ts +494 -0
  60. package/dist/workflow/index.js +2177 -0
  61. package/dist/workflow/index.js.map +1 -0
  62. package/docs/design/self-improvement-roadmap.md +106 -0
  63. package/package.json +36 -12
  64. package/dist/agent-profile-DzcPHR1Z.d.ts +0 -114
  65. package/dist/chunk-ODGETRTM.js.map +0 -1
  66. package/dist/chunk-SL55X4VN.js +0 -186
  67. package/dist/chunk-SL55X4VN.js.map +0 -1
  68. package/dist/chunk-UD6EF73X.js.map +0 -1
  69. package/dist/{chunk-AIWHLG7J.js.map → chunk-GJJNJVIR.js.map} +0 -0
@@ -1,25 +1,329 @@
1
- import { a as RunCampaignOptions, C as CampaignStorage } from '../run-improvement-loop-Bzamo6GB.js';
2
- export { d as GepaDriverConstraints, G as GepaDriverOptions, O as OpenAutoPrOptions, e as OpenAutoPrResult, b as RunImprovementLoopOptions, R as RunImprovementLoopResult, h as RunOptimizationOptions, j as RunOptimizationResult, k as countSentenceEdits, l as defaultRenderDiff, m as extractH2Sections, f as fsCampaignStorage, g as gepaDriver, i as inMemoryCampaignStorage, o as openAutoPr, r as runCampaign, c as runImprovementLoop, n as runOptimization, s as surfaceHash } from '../run-improvement-loop-Bzamo6GB.js';
3
- export { B as BuildLoopProvenanceArgs, D as DefaultProductionGateOptions, a as EmitLoopProvenanceArgs, b as EmitLoopProvenanceResult, E as EvolutionaryDriverOptions, H as HeldOutGateOptions, f as LoopProvenanceBackend, g as LoopProvenanceCandidate, L as LoopProvenanceRecord, R as RunEvalOptions, i as buildLoopProvenanceRecord, c as composeGate, d as defaultProductionGate, j as emitLoopProvenance, e as evolutionaryDriver, h as heldOutGate, l as loopProvenanceSpans, p as provenanceRecordPath, k as provenanceSpansPath, r as runEval, s as surfaceContentHash } from '../provenance-C69gLUXH.js';
1
+ import { A as AnalyzeTracesOptions, a as AnalyzeTracesInput, b as AnalyzeTracesResult } from '../analyst-t7zZS3TV.js';
2
+ import { S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, I as ImprovementDriver, q as ProposeContext, J as JudgeScore, L as LabeledScenarioStore, r as LabeledScenarioWrite, s as LabeledScenarioSampleArgs, t as LabeledScenarioRecord, u as LabelTrust, v as LabeledScenarioSource, f as CampaignResult, h as CodeSurface } from '../types-Bba0vl1V.js';
3
+ export { C as CampaignAggregates, c as CampaignArtifactWriter, d as CampaignCellResult, e as CampaignCostMeter, w as CampaignTokenUsage, g as CampaignTraceWriter, D as DispatchFn, G as Gate, i as GateContext, j as GateDecision, k as GateResult, l as GenerationCandidate, m as GenerationRecord, x as JudgeAggregate, n as JudgeDimension, o as Mutator, O as OptimizerConfig, P as ParetoParent, y as ProposedCandidate, R as RedactionStatus, z as ScenarioAggregate, p as SessionScript, T as TraceSpan, A as isProposedCandidate, B as labelTrustRank } from '../types-Bba0vl1V.js';
4
+ import { a as RunCampaignOptions, b as RunImprovementLoopOptions, C as CampaignStorage } from '../run-improvement-loop-BqYH2vCR.js';
5
+ export { d as GepaDriverConstraints, G as GepaDriverOptions, O as OpenAutoPrOptions, e as OpenAutoPrResult, R as RunImprovementLoopResult, h as RunOptimizationOptions, j as RunOptimizationResult, k as countSentenceEdits, l as defaultRenderDiff, m as extractH2Sections, f as fsCampaignStorage, g as gepaDriver, i as inMemoryCampaignStorage, o as openAutoPr, r as runCampaign, c as runImprovementLoop, n as runOptimization, s as surfaceHash } from '../run-improvement-loop-BqYH2vCR.js';
6
+ export { A as AxisEvidence, a as AxisVerdict, B as BuildEvidenceVectorOptions, k as BuildLoopProvenanceArgs, D as DefaultProductionGateOptions, l as EmitLoopProvenanceArgs, m as EmitLoopProvenanceResult, E as EvidenceVector, b as EvolutionaryDriverOptions, H as HeldOutGateOptions, n as LoopProvenanceBackend, o as LoopProvenanceCandidate, L as LoopProvenanceRecord, O as ObjectiveSource, P as ParetoSignificanceGateOptions, c as PromotionObjective, d as PromotionPolicy, R as RunEvalOptions, e as buildEvidenceVector, q as buildLoopProvenanceRecord, f as composeGate, g as defaultProductionGate, s as emitLoopProvenance, h as evolutionaryDriver, i as heldOutGate, t as loopProvenanceSpans, p as paretoPolicy, j as paretoSignificanceGate, u as provenanceRecordPath, v as provenanceSpansPath, r as runEval, w as surfaceContentHash } from '../provenance-B-TFszPW.js';
4
7
  import { L as LlmClientOptions } from '../llm-client-DbjLfz-K.js';
5
- import { I as ImprovementDriver, J as JudgeScore, L as LabeledScenarioStore, q as LabeledScenarioWrite, r as LabeledScenarioSampleArgs, s as LabeledScenarioRecord, t as LabelTrust, S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, u as LabeledScenarioSource, f as CampaignResult, h as CodeSurface } from '../types-CnmZ2bkP.js';
6
- export { C as CampaignAggregates, c as CampaignArtifactWriter, d as CampaignCellResult, e as CampaignCostMeter, v as CampaignTokenUsage, g as CampaignTraceWriter, D as DispatchFn, G as Gate, i as GateContext, j as GateDecision, k as GateResult, l as GenerationCandidate, m as GenerationRecord, w as JudgeAggregate, n as JudgeDimension, o as Mutator, O as OptimizerConfig, P as ParetoParent, x as ProposeContext, y as ProposedCandidate, R as RedactionStatus, z as ScenarioAggregate, p as SessionScript, T as TraceSpan, A as isProposedCandidate, B as labelTrustRank } from '../types-CnmZ2bkP.js';
8
+ import { c as TraceAnalystKindSpec } from '../kind-factory-DW9XWPvM.js';
9
+ import { c as AnalystFinding } from '../types-CRD68aH7.js';
7
10
  import { a as PairedBootstrapResult } from '../statistics-B7yCbi9i.js';
8
- import { A as AgentProfile, B as BackendIntegrityReport } from '../agent-profile-DzcPHR1Z.js';
11
+ import { A as AgentProfile, B as BackendIntegrityReport, C as CompletionRequirement, R as RuntimeEventLike, a as CompletionVerdict, P as ProducedState, b as CorrectnessChecker } from '../agent-profile-DYRboYWu.js';
9
12
  import { A as AgentEvalError } from '../errors-Dwqw-T_m.js';
10
13
  import { b as RunSplitTag, R as RunRecord } from '../run-record-BgTFzO2r.js';
14
+ import '@ax-llm/ax';
15
+ import '../store-GmBE2pZZ.js';
11
16
  import '../red-team-DW9Ca_tj.js';
12
17
  import '../dataset-B2kL-fSM.js';
13
18
  import '../store-CKUAgsJz.js';
14
19
  import '../schema-m0gsnbt3.js';
15
- import '../index-BGBrVS24.js';
20
+ import '../pareto-E-pembql.js';
21
+ import '../hosted/index.js';
22
+ import '../insight-report-Df3lxYXM.js';
16
23
  import '../summary-report-ByiOUrHj.js';
17
24
  import '../failure-cluster-CL7IVgkJ.js';
18
25
  import '../judge-calibration-DilmB3Ml.js';
19
26
  import '../raw-provider-sink-C46HDghv.js';
27
+ import 'zod';
20
28
  import '../types-Croy5h7V.js';
21
29
  import '@tangle-network/tcloud';
22
30
 
31
+ /**
32
+ * @experimental
33
+ *
34
+ * Make the trace-analyst's OWN prompt a GEPA-optimizable surface.
35
+ *
36
+ * The analyst that drives self-improvement is itself a prompt — and a
37
+ * hand-tuned one (a hardcoded, hand-versioned `const`). This module lets the
38
+ * loop optimize it: the analyst `actorDescription` becomes a `MutableSurface`
39
+ * that `gepaDriver` / `haloDriver` / any `ImprovementDriver` can mutate inside
40
+ * `runImprovementLoop` or `compareDrivers`. That is the second-order loop —
41
+ * optimizing the optimizer's eyes, not just the agent's prompt.
42
+ *
43
+ * Two pieces, both deliberately small (the loop engine already exists — this
44
+ * only supplies the analyst-shaped dispatch + an objective scorer):
45
+ *
46
+ * - `buildAnalystSurfaceDispatch` — `dispatchWithSurface(surface, scenario)`
47
+ * runs `analyzeTraces` with `surface` as the actorDescription over the
48
+ * scenario's fixed trace corpus, returning its findings.
49
+ * - `failureModeRecallJudge` — a DETERMINISTIC judge (no LLM, no opinion)
50
+ * that scores those findings against the scenario's GROUND-TRUTH failure
51
+ * modes. This is what keeps optimizing the analyst prompt ungameable: the
52
+ * labels come from objective signal (e.g. AppWorld `world.evaluate()` tells
53
+ * us which task failed and which API calls were wrong), so we reward an
54
+ * analyst for surfacing the failures that really happened — not for
55
+ * pleasing a judge that could be talked into anything (Goodhart).
56
+ *
57
+ * Wiring (the loop is unchanged; you only pass these in):
58
+ *
59
+ * const dispatchWithSurface = buildAnalystSurfaceDispatch({ analystOptions: { ai } })
60
+ * await runImprovementLoop({
61
+ * baselineSurface: TRACE_ANALYST_ACTOR_DESCRIPTION, // the prompt under optimization
62
+ * scenarios: trainScenarios, // labeled trace corpora
63
+ * holdoutScenarios: heldOutScenarios,
64
+ * dispatchWithSurface,
65
+ * judges: [failureModeRecallJudge()],
66
+ * driver: gepaDriver({ baseUrl, apiKey }),
67
+ * gate: heldOutGate({ minDelta: 0.02 }),
68
+ * autoOnPromote: 'none',
69
+ * })
70
+ */
71
+
72
+ /**
73
+ * A labeled trace scenario: a FIXED trace corpus plus the failure modes a
74
+ * competent analyst MUST surface from it. The labels are ground truth — the
75
+ * objective failures that actually occurred — which is what makes optimizing
76
+ * the analyst prompt against them meaningful rather than circular.
77
+ */
78
+ interface AnalystScenario extends Scenario {
79
+ kind: 'analyst-surface';
80
+ /** OTLP-JSONL path or an in-memory store of the traces to analyze. */
81
+ source: AnalyzeTracesOptions['source'];
82
+ /** The domain question handed to the analyst (framing lives here, not in
83
+ * the surface under optimization). */
84
+ question: string;
85
+ /**
86
+ * Ground-truth failure modes a good analyst must identify. A finding "hits"
87
+ * a mode when it contains ANY of the mode's case-insensitive cues. Derive
88
+ * these from objective signal (failed task + which step broke), never from
89
+ * the analyst's own prior output.
90
+ */
91
+ expectedFailureModes: Array<{
92
+ id: string;
93
+ cues: string[];
94
+ }>;
95
+ /**
96
+ * Cues that mark a finding as HALLUCINATED / out-of-scope for this corpus —
97
+ * naming a tool, error, or failure that did not occur. Presence penalizes
98
+ * precision. Optional; omit to score recall only.
99
+ */
100
+ forbiddenCues?: string[];
101
+ }
102
+ /** The analyst's output for one scenario — the artifact the judge scores. */
103
+ interface AnalystArtifact {
104
+ answer: string;
105
+ findings: string[];
106
+ /** The hardcoded-prompt version the analyst reported (provenance only; the
107
+ * optimized surface overrides the actual prompt text used). */
108
+ actorPromptVersion: string;
109
+ }
110
+ interface BuildAnalystSurfaceDispatchOptions {
111
+ /**
112
+ * Everything `analyzeTraces` needs EXCEPT `actorDescription` (supplied by the
113
+ * surface under optimization) and `source` (supplied by the scenario). `ai`
114
+ * (the AxAIService) is required for a live run.
115
+ */
116
+ analystOptions: Omit<AnalyzeTracesOptions, 'actorDescription' | 'source'>;
117
+ /** Test seam: defaults to the real `analyzeTraces`. */
118
+ analyze?: (input: AnalyzeTracesInput, options: AnalyzeTracesOptions) => Promise<AnalyzeTracesResult>;
119
+ }
120
+ /**
121
+ * Build the `dispatchWithSurface(surface, scenario, ctx)` the improvement loop
122
+ * calls: run the analyst with `surface` as its actorDescription over the
123
+ * scenario's trace corpus and return its findings.
124
+ */
125
+ declare function buildAnalystSurfaceDispatch(opts: BuildAnalystSurfaceDispatchOptions): (surface: MutableSurface, scenario: AnalystScenario, ctx: DispatchContext) => Promise<AnalystArtifact>;
126
+ interface FailureModeRecallJudgeOptions {
127
+ /** Weight on recall when precision is also scored (forbiddenCues present).
128
+ * Default 0.5 (equal). Recall-only when no forbiddenCues exist. */
129
+ recallWeight?: number;
130
+ }
131
+ /**
132
+ * Deterministic, ground-truth judge for analyst findings. Composite =
133
+ * recall of the scenario's `expectedFailureModes` (optionally blended with a
134
+ * precision term that penalizes findings tripping `forbiddenCues`). No LLM —
135
+ * the score is a function of the labels, so the analyst prompt is optimized
136
+ * toward surfacing real failures, not toward a judge it can flatter.
137
+ */
138
+ declare function failureModeRecallJudge(opts?: FailureModeRecallJudgeOptions): JudgeConfig<AnalystArtifact, AnalystScenario>;
139
+
140
+ /**
141
+ * @experimental
142
+ *
143
+ * `aceDriver` — Agentic Context Engineering: an APPEND-MOSTLY curator, the
144
+ * deliberate contrast to `memoryCurationDriver`'s dedup-and-replace. ACE's
145
+ * thesis (arXiv:2510.04618) is that aggressively deduping/rewriting a context
146
+ * causes "context collapse" — hard-won specific lessons get summarized away. So
147
+ * the playbook GROWS by appending each generation's new lessons as provenance-
148
+ * tagged delta bullets; existing bullets are preserved verbatim, never merged.
149
+ *
150
+ * Each generation it:
151
+ * 1. reads the playbook block already in the parent surface (verbatim);
152
+ * 2. turns this generation's `findings` into lessons, keeping only the ones not
153
+ * already present (idempotency — a recurring finding is not re-appended, but
154
+ * a genuinely NEW lesson always is, even if similar to an old one);
155
+ * 3. appends the new lessons as `- [gN] <lesson>` deltas and re-emits the block.
156
+ *
157
+ * Bounded WITHOUT collapse: when the playbook exceeds `maxEntries`, the OLDEST
158
+ * deltas are evicted (FIFO) — recency is kept, but no two distinct lessons are
159
+ * ever merged into one. Deterministic (no LLM) so a lift is attributable to the
160
+ * accumulated lessons, not a rewrite's model noise.
161
+ *
162
+ * Fail-loud: with no new lesson this generation it returns NO candidate (the
163
+ * playbook is unchanged — nothing to propose), never a fabricated bullet.
164
+ */
165
+
166
+ interface AceDriverOptions {
167
+ /** Max delta bullets retained in the playbook. On overflow the OLDEST are
168
+ * evicted (FIFO) — never merged. Default 50 (ACE keeps a long context). */
169
+ maxEntries?: number;
170
+ /** Heading rendered above the bullets inside the block. */
171
+ sectionHeading?: string;
172
+ }
173
+ declare function aceDriver(opts?: AceDriverOptions): ImprovementDriver;
174
+
175
+ /**
176
+ * Driver selection guide — "which `ImprovementDriver` do I pick, and why?"
177
+ *
178
+ * The substrate ships seven drivers with overlapping shapes. This is the
179
+ * decision table (data, not behavior): each entry says what a driver mutates,
180
+ * how it proposes changes, when to reach for it, and its relative cost.
181
+ * `selectDriver()` turns a goal + surface into a ranked recommendation.
182
+ *
183
+ * Import the actual driver functions from `@tangle-network/agent-eval/campaign`
184
+ * (gepaDriver, skillOptDriver, aceDriver, memoryCurationDriver, haloDriver,
185
+ * traceAnalystDriver, evolutionaryDriver); this module only helps you choose.
186
+ */
187
+ type DriverName = 'gepa' | 'skillOpt' | 'ace' | 'memoryCuration' | 'halo' | 'traceAnalyst' | 'evolutionary';
188
+ /** The mutable surface a driver targets. */
189
+ type DriverSurface = 'prompt' | 'skill-doc' | 'playbook' | 'memory' | 'any';
190
+ /** How a driver turns evidence into the next candidate. */
191
+ type DriverStrategy = 'reflective-rewrite' | 'anchored-patch' | 'append-only' | 'dedup-curate' | 'analysis-edit' | 'population-mutate';
192
+ /** What a caller is trying to do this run. */
193
+ type DriverGoal = 'explore' | 'refine' | 'accumulate' | 'benchmark';
194
+ interface DriverGuideEntry {
195
+ /** One-line description of the mechanism. */
196
+ summary: string;
197
+ /** The surface the driver edits. */
198
+ surface: DriverSurface;
199
+ /** How it proposes the next candidate. */
200
+ strategy: DriverStrategy;
201
+ /** When to reach for this driver. */
202
+ whenUse: string;
203
+ /** Relative LLM cost per generation. */
204
+ cost: 'low' | 'medium' | 'high';
205
+ /** True when the driver shells out to an external engine (extra setup). */
206
+ external?: boolean;
207
+ }
208
+ declare const DRIVER_GUIDE: Record<DriverName, DriverGuideEntry>;
209
+ interface SelectDriverCriteria {
210
+ /** What you're trying to do this run. */
211
+ goal: DriverGoal;
212
+ /** Restrict to drivers that edit this surface (optional). */
213
+ surface?: DriverSurface;
214
+ }
215
+ interface DriverRecommendation {
216
+ name: DriverName;
217
+ entry: DriverGuideEntry;
218
+ reason: string;
219
+ }
220
+ /**
221
+ * Rank the drivers for a goal (and optional surface filter), best first.
222
+ * Returns the recommendation list, not instances — import the chosen driver
223
+ * function yourself. Always returns at least the goal's primary driver.
224
+ */
225
+ declare function selectDriver(criteria: SelectDriverCriteria): DriverRecommendation[];
226
+
227
+ /**
228
+ * @experimental
229
+ *
230
+ * `haloDriver` — wraps the REAL halo-engine (Inference.net's hierarchical
231
+ * agentic trace analyzer, `pip install halo-engine`, repo context-labs/halo)
232
+ * as an agent-eval `ImprovementDriver`, so HALO competes head-to-head with
233
+ * `gepaDriver` — and with our own `traceAnalystDriver` — inside
234
+ * `compareDrivers` on identical traces / scenarios / held-out scoring.
235
+ *
236
+ * It PRESERVES halo's actual working usage — `propose()` shells out to the
237
+ * published CLI (`halo <traces.jsonl> -p <prompt> -m <model> --base-url
238
+ * --api-key`) and uses its real RLM findings verbatim. We do NOT reimplement
239
+ * its analysis; that would make the benchmark meaningless. The only adaptation
240
+ * is applying HALO's findings to the current prompt surface via one LLM edit —
241
+ * exactly what makes the comparison prompt-tier apples-to-apples with
242
+ * `gepaDriver` (which also mutates the prompt). The analysis is HALO's; only
243
+ * the surface-application is ours, and it is identical in spirit to how HALO's
244
+ * own loop feeds findings to a coding agent.
245
+ *
246
+ * Fail-loud: no traces → throw; halo errors → throw; empty findings → throw.
247
+ * Never fabricate a candidate (that would silently flatter or penalize HALO).
248
+ */
249
+
250
+ interface HaloDriverOptions {
251
+ /** OpenAI-compatible base URL for BOTH halo's RLM analysis and the apply
252
+ * step (e.g. the Tangle router `https://router.tangle.tools/v1`). */
253
+ baseUrl: string;
254
+ /** Bearer key (else relies on OPENAI_API_KEY in the env halo inherits). */
255
+ apiKey?: string;
256
+ /** Model for halo's `--model` (its RLM). Default 'gpt-5.4-mini' (halo's own default). */
257
+ model?: string;
258
+ /** Model used to APPLY halo's findings to the prompt surface. Default = `model`. */
259
+ applyModel?: string;
260
+ /** The real halo binary. Default 'halo' (from `pip install halo-engine`). */
261
+ haloBin?: string;
262
+ /**
263
+ * Resolve the OTLP traces (JSONL string) halo should analyze for THIS
264
+ * generation — wired by the bench to the captured AppWorld OTLP for the
265
+ * current surface. Returning empty throws (halo has nothing to analyze).
266
+ */
267
+ resolveTraces: (ctx: ProposeContext) => string | Promise<string>;
268
+ /** halo's analysis prompt (`-p`). Default targets the failure taxonomy. */
269
+ analysisPrompt?: string;
270
+ /** halo `--max-depth` / `--max-turns` passthrough. */
271
+ maxDepth?: number;
272
+ maxTurns?: number;
273
+ /** Test seam: inject a fetch for the apply-step callLlm (no network in unit tests). */
274
+ fetchImpl?: LlmClientOptions['fetch'];
275
+ }
276
+ /** Wrap the real halo-engine CLI as an ImprovementDriver (prompt-tier). */
277
+ declare function haloDriver(opts: HaloDriverOptions): ImprovementDriver;
278
+
279
+ /**
280
+ * @experimental
281
+ *
282
+ * `memoryCurationDriver` — a CURATOR `ImprovementDriver`, the complement to the
283
+ * OPTIMIZER drivers (`gepaDriver` rewrites the prompt; this one BUILDS a
284
+ * searchable memory of what prior trajectories taught and grafts the most
285
+ * relevant lessons onto the surface).
286
+ *
287
+ * Each generation it:
288
+ * 1. collects lessons — this generation's trace-analyst `findings` PLUS the
289
+ * memory already carried in the parent surface (so memory accumulates
290
+ * across generations instead of resetting);
291
+ * 2. curates them — normalizes, deduplicates near-identical lessons, and ranks
292
+ * by recurrence (a lesson seen across many findings outranks a one-off);
293
+ * 3. retrieves the top-K and writes them back as a single delimited memory
294
+ * block in the surface (idempotent — the block is replaced, never stacked,
295
+ * so the prompt does not grow without bound).
296
+ *
297
+ * This is the substrate behind the "knowledge base of working trajectories" the
298
+ * agent searches: the curated block IS the retrieved memory the next run reads.
299
+ * Curation is DETERMINISTIC (no LLM) so a lift it produces is attributable to
300
+ * the lessons, not to model noise in a rewrite. An optional `distill` LLM step
301
+ * can compress raw findings into crisp imperatives; default is verbatim.
302
+ *
303
+ * Fail-loud: never fabricates a lesson. With no findings and no prior memory it
304
+ * returns no candidate (nothing learned yet — gen 0). It does not throw on an
305
+ * empty generation because early generations legitimately have no findings.
306
+ */
307
+
308
+ interface MemoryCurationDriverOptions {
309
+ /** Top-K lessons retained in the surface memory block. Default 12. */
310
+ maxEntries?: number;
311
+ /** Heading rendered above the lessons inside the block. Default below. */
312
+ sectionHeading?: string;
313
+ /**
314
+ * Optional LLM distillation: compress raw findings into crisp, generalizable
315
+ * one-line imperatives before curating. Omit for verbatim (deterministic).
316
+ */
317
+ distill?: {
318
+ baseUrl: string;
319
+ apiKey?: string;
320
+ model: string;
321
+ fetchImpl?: LlmClientOptions['fetch'];
322
+ };
323
+ }
324
+ /** Build the CURATOR driver. */
325
+ declare function memoryCurationDriver(opts?: MemoryCurationDriverOptions): ImprovementDriver;
326
+
23
327
  /**
24
328
  * @experimental
25
329
  *
@@ -134,6 +438,11 @@ interface ProposePatchesArgs {
134
438
  rejectedBuffer: RejectedEdit[];
135
439
  /** Slow-update meta guidance accumulated across epochs. */
136
440
  metaNote?: string;
441
+ /** Analyst findings + research report rendered as a prompt block (the
442
+ * EYES→HANDS wire) so a patch targets a NAMED diagnosed root cause. Built by
443
+ * the driver from `ctx.findings`/`ctx.report`; the patch-native `runSkillOpt`
444
+ * path may also supply it. */
445
+ findingsNote?: string;
137
446
  /** How many candidate patches to propose. */
138
447
  count: number;
139
448
  signal: AbortSignal;
@@ -167,6 +476,74 @@ declare class SkillPatchParseError extends Error {
167
476
  }
168
477
  declare function parseSkillPatchResponse(raw: string, maxPatches: number, editBudget: number): SkillPatch[];
169
478
 
479
+ /**
480
+ * @experimental
481
+ *
482
+ * `traceAnalystDriver` — wraps agent-eval's OWN trace-analyst engine
483
+ * (`AnalystRegistry` over the agentic OTLP reader) as an `ImprovementDriver`.
484
+ * It is the symmetric opponent to `haloDriver`: both consume the SAME OTLP
485
+ * corpus and apply their findings to the prompt surface via one IDENTICAL
486
+ * LLM edit, so a `compareDrivers` lift delta isolates a single variable —
487
+ * ANALYSIS QUALITY. The benchmark answers "is our HALO clone as good as the
488
+ * real HALO?" as a held-out lift CI, not a vibe.
489
+ *
490
+ * The fairness contract (the only thing that makes the head-to-head honest):
491
+ * - SAME input: both engines read the identical `traces.jsonl` (haloDriver
492
+ * hands it to the halo CLI; this driver wraps it in an `OtlpFileTraceStore`).
493
+ * - SAME application: the apply-step here is byte-for-byte the apply-step in
494
+ * `haloDriver` (same `APPLY_SYSTEM`, same one-shot `callLlm` prompt edit).
495
+ * - ONLY difference: who produced the findings — the real halo-engine vs our
496
+ * `AnalystRegistry` (whose actor prompt is a near-verbatim port of HALO's).
497
+ *
498
+ * Findings come from the REGISTRY (structured `AnalystFinding[]` carrying
499
+ * area / severity / recommended_action), NOT bare `analyzeTraces` (which emits
500
+ * `string[]`). The registry is the productized engine; raw `analyzeTraces` is
501
+ * the unstructured escape hatch.
502
+ *
503
+ * Fail-loud: no traces → throw; analyst run errors → throw; zero findings →
504
+ * throw. Never fabricate a candidate (that would silently flatter or penalize
505
+ * our engine relative to HALO).
506
+ */
507
+
508
+ interface TraceAnalystDriverOptions {
509
+ /** OpenAI-compatible base URL for BOTH the analyst's agentic reads and the
510
+ * apply step (e.g. `https://api.deepseek.com/v1` or the Tangle router). */
511
+ baseUrl: string;
512
+ /** Bearer key. Required — the Ax AI service has no env fallback here. */
513
+ apiKey: string;
514
+ /** Model the analyst kinds use for their agentic trace reads. */
515
+ model: string;
516
+ /** Model used to APPLY findings to the prompt surface. Default = `model`.
517
+ * Keep this EQUAL to haloDriver's `applyModel` for an apples-to-apples run. */
518
+ applyModel?: string;
519
+ /** Ax provider name. Default 'openai' — works for any OpenAI-compatible base
520
+ * via `apiURL`. Use 'deepseek' to hit DeepSeek's native provider. */
521
+ provider?: string;
522
+ /** Which analyst kinds to run. Default = the full shipped suite
523
+ * (`DEFAULT_TRACE_ANALYST_KINDS`: failure-mode, knowledge-gap,
524
+ * knowledge-poisoning, improvement). Narrow it for cost-parity runs. */
525
+ kinds?: readonly TraceAnalystKindSpec[];
526
+ /**
527
+ * Resolve the OTLP traces (JSONL string) the analyst should read for THIS
528
+ * generation — identical contract to `haloDriver.resolveTraces`, wired by
529
+ * the bench to the captured AppWorld OTLP for the current surface. Returning
530
+ * empty throws (the analyst has nothing to read).
531
+ */
532
+ resolveTraces: (ctx: ProposeContext) => string | Promise<string>;
533
+ /**
534
+ * Override the findings producer. Default: the shipped `AnalystRegistry`
535
+ * over `kinds`, reading the resolved traces as an `OtlpFileTraceStore`. A
536
+ * consumer may inject a pre-built registry / alternate engine here; the
537
+ * unit suite injects canned findings to exercise the apply path without
538
+ * driving the agentic loop.
539
+ */
540
+ analyze?: (tracePath: string, ctx: ProposeContext) => Promise<ReadonlyArray<AnalystFinding>>;
541
+ /** Test seam: inject a fetch for the apply-step `callLlm` (no network in unit tests). */
542
+ fetchImpl?: LlmClientOptions['fetch'];
543
+ }
544
+ /** Wrap agent-eval's trace-analyst registry as an ImprovementDriver (prompt-tier). */
545
+ declare function traceAnalystDriver(opts: TraceAnalystDriverOptions): ImprovementDriver;
546
+
170
547
  /**
171
548
  * @experimental
172
549
  *
@@ -428,12 +805,28 @@ interface OptimizerEntryConfig<TScenario extends Scenario, TArtifact> {
428
805
  /** SkillOpt epochs. Default 6. */
429
806
  maxEpochs?: number;
430
807
  mutationPrimitives?: string[];
808
+ /** Static findings seed forwarded to each GEPA driver's `propose()` as
809
+ * `ctx.findings` (the EYES→HANDS wire). Forwarded by `gepaReflectionEntry` /
810
+ * `gepaParetoEntry`; `skillOptEntry` runs findings-BLIND (see its doc). */
811
+ findings?: unknown[];
812
+ /** Per-generation findings producer (EYES→HANDS loop closure): after each
813
+ * generation scores, this re-diagnoses and REPLACES `ctx.findings` for the
814
+ * next generation's `propose()`. Reuses the `runOptimization` field type so
815
+ * it cannot drift. GEPA entries only. */
816
+ analyzeGeneration?: RunImprovementLoopOptions<TScenario, TArtifact>['analyzeGeneration'];
817
+ /** Phase-2 research report forwarded to `propose()` as `ctx.report`. */
818
+ report?: unknown;
431
819
  }
432
820
  /** GEPA, reflection-only (single-parent, no Pareto combine). */
433
821
  declare function gepaReflectionEntry<TScenario extends Scenario, TArtifact>(config: OptimizerEntryConfig<TScenario, TArtifact>, name?: string): DriverEntry;
434
822
  /** GEPA with the Pareto frontier + combine-complementary-lessons. */
435
823
  declare function gepaParetoEntry<TScenario extends Scenario, TArtifact>(config: OptimizerEntryConfig<TScenario, TArtifact>, name?: string): DriverEntry;
436
- /** SkillOpt patch-mode hill-climb. */
824
+ /** SkillOpt patch-mode hill-climb. Runs findings-BLIND: `runSkillOpt` owns its
825
+ * own epoch acceptance/budget loop and does not thread `analyzeGeneration`, so
826
+ * `config.findings` is intentionally NOT forwarded here. In a findings-fed
827
+ * comparison this entry is the blind control — do not read its result as
828
+ * findings-fed. (Threading findings into the SkillOpt epoch loop is a separate
829
+ * refactor, deferred not faked.) */
437
830
  declare function skillOptEntry<TScenario extends Scenario, TArtifact>(config: OptimizerEntryConfig<TScenario, TArtifact>, name?: string): DriverEntry;
438
831
 
439
832
  /**
@@ -584,6 +977,89 @@ interface RunProfileMatrixResult<TArtifact, TScenario extends Scenario> {
584
977
  }
585
978
  declare function runProfileMatrix<TScenario extends Scenario, TArtifact>(opts: RunProfileMatrixOptions<TScenario, TArtifact>): Promise<RunProfileMatrixResult<TArtifact, TScenario>>;
586
979
 
980
+ /**
981
+ * Product-flow playback — drive the REAL product through a user story and
982
+ * score the produced state per requirement (the launch "Jira tick-off").
983
+ *
984
+ * This is the substrate adapter + contract only. It plugs a `PlaybackDriver`
985
+ * into the existing `runProfileMatrix` dispatch seam: a driver drives the real
986
+ * product (a Playwright UI session or a sandbox workspace) and returns the
987
+ * runtime event stream; `extractProducedState` + `verifyCompletion` then score
988
+ * each requirement PASS/FAIL. The concrete drivers live in consumers — they
989
+ * depend on browser / runtime infra the substrate must not import — so
990
+ * agent-eval owns the seam, the `UserStory` contract, and the scoreboard.
991
+ */
992
+
993
+ /** One step of a user story — what the user does. The driver interprets
994
+ * `payload` (a Playwright selector + action, or a sandbox chat turn). */
995
+ interface PlaybackStep {
996
+ /** Human-readable action, captured verbatim in the UX narrative. */
997
+ action: string;
998
+ /** Optional driver-specific payload (e.g. `{ selector, fill }` or `{ turn }`). Typed as an open string-keyed record, so this contract does not constrain its keys. */
999
+ payload?: Record<string, unknown>;
1000
+ }
1001
+ /**
1002
+ * A user story is a runnable product journey plus the requirements that define
1003
+ * "this story works". Each requirement is one Jira ticket line. Extends
1004
+ * `Scenario` so a catalog of stories can be passed straight to `runProfileMatrix({ scenarios })`.
1005
+ */
1006
+ interface UserStory extends Scenario {
1007
+ /** Human-readable story title (the ticket headline). */
1008
+ title: string;
1009
+ /** Ordered steps the driver executes. */
1010
+ steps: PlaybackStep[];
1011
+ /** What must hold in the produced state for the story to pass; `scoreUserStory` reports PASS/FAIL per requirement. */
1012
+ requirements: CompletionRequirement[];
1013
+ }
1014
+ /** The context handed to `PlaybackDriver.run`: every `DispatchContext` member plus `profile`, the agent profile under test (which cheap model, etc.). */
1015
+ interface PlaybackContext extends DispatchContext {
1016
+ profile: AgentProfile;
1017
+ }
1018
+ /**
1019
+ * Drives the real product through a story and returns the runtime event stream
1020
+ * that `extractProducedState` consumes. Implemented by CONSUMERS —
1021
+ * `SandboxPlaybackDriver` (real API / sandbox workspace) and
1022
+ * `PlaywrightPlaybackDriver` (real UI) — because they depend on runtime /
1023
+ * browser infra the substrate must not import. The driver MUST report LLM
1024
+ * usage via `ctx.cost.observeTokens` so the backend-integrity guard sees real
1025
+ * tokens (a run that never reports tokens reads as a stub).
1026
+ * `run(story, ctx)` is asynchronous and resolves to a read-only array of `RuntimeEventLike`; `TStory` defaults to `UserStory`.
1027
+ */
1027
+ interface PlaybackDriver<TStory extends UserStory = UserStory> {
1028
+ run(story: TStory, ctx: PlaybackContext): Promise<readonly RuntimeEventLike[]>;
1029
+ }
1030
+ /**
1031
+ * Adapt a `PlaybackDriver` into a `runProfileMatrix` dispatch. The artifact the
1032
+ * matrix scores is the `ProducedState` extracted from the driver's event
1033
+ * stream — grade it with `scoreUserStory` (or a judge wrapping it).
1034
+ * Returns a `ProfileDispatchFn` whose scenario type is the driver's story type and whose artifact type is `ProducedState`.
1034
+ */
1035
+ declare function makePlaybackDispatch<TStory extends UserStory>(driver: PlaybackDriver<TStory>): ProfileDispatchFn<TStory, ProducedState>;
1036
+ /** A scored user story — every `CompletionVerdict` member plus `title`, the story's human title. Produced by `scoreUserStory` and consumed by `userStoryScoreboard`. */
1037
+ interface UserStoryVerdict extends CompletionVerdict {
1038
+ title: string;
1039
+ }
1040
+ /**
1041
+ * Score one story's produced state against its requirements. Thin wrapper over
1042
+ * `verifyCompletion` that builds the gold from the story and returns a
1043
+ * per-requirement PASS/FAIL verdict. `checkCorrectness` is injected — a
1044
+ * deterministic stub in tests, `createLlmCorrectnessChecker` in production.
1045
+ * Asynchronous: resolves to a `UserStoryVerdict` for the given `story` and `state`.
1045
+ */
1046
+ declare function scoreUserStory(story: UserStory, state: ProducedState, checkCorrectness: CorrectnessChecker): Promise<UserStoryVerdict>;
1047
+ /** One row of the launch scoreboard — story × requirement → PASS/FAIL. `status` is the verdict for that (story, requirement) pair and `evidence` holds the evidence strings behind it; `storyId`/`storyTitle` and `reqId`/`reqTitle` presumably identify the story and the requirement (confirm against `userStoryScoreboard`). */
1049
+ interface ScoreboardRow {
1050
+ storyId: string;
1051
+ storyTitle: string;
1052
+ reqId: string;
1053
+ reqTitle: string;
1054
+ status: 'PASS' | 'FAIL';
1055
+ evidence: string[];
1056
+ }
1056
+ /**
1057
+ * Flatten story verdicts into the per-requirement scoreboard — the literal
1058
+ * Jira tick-off: one row per (story, requirement) with PASS/FAIL and the
1059
+ * evidence behind the verdict.
1060
+ * Takes a read-only array of `UserStoryVerdict` and returns the rows as `ScoreboardRow[]`.
1060
+ */
1061
+ declare function userStoryScoreboard(verdicts: readonly UserStoryVerdict[]): ScoreboardRow[];
1062
+
587
1063
  /**
588
1064
  * @experimental
589
1065
  *
@@ -763,4 +1239,4 @@ declare function gitWorktreeAdapter(opts: GitWorktreeAdapterOptions): WorktreeAd
763
1239
  * as a ref under the adapter's worktree dir. */
764
1240
  declare function resolveWorktreePath(surface: CodeSurface, worktreeDir?: string): string;
765
1241
 
766
- export { type AcceptedEdit, type ApplySkillPatchResult, type CampaignBreakdown, CampaignResult, CampaignStorage, CodeSurface, type CompareDriversOptions, type DimensionRegression, DispatchContext, type DriverComparison, type DriverEntry, type DriverPairwise, type DriverScore, FsLabeledScenarioStore, type FsLabeledScenarioStoreOptions, type GitWorktreeAdapterOptions, type HeldoutSignificance, type HeldoutSignificanceOptions, ImprovementDriver, JudgeConfig, JudgeScore, LabelTrust, LabeledScenarioRecord, LabeledScenarioSampleArgs, LabeledScenarioSource, LabeledScenarioStore, LabeledScenarioStoreError, LabeledScenarioWrite, MutableSurface, type OptimizerEntryConfig, type PairedHoldout, type ProfileDispatchFn, ProfileMatrixError, type ProfileSummary, type ProposePatchesArgs, type RejectedEdit, RunCampaignOptions, type RunProfileMatrixOptions, type RunProfileMatrixResult, type RunSkillOptOptions, type RunSkillOptResult, Scenario, type ScenarioRollup, type SkillOptDriver, type SkillOptDriverOptions, type SkillOptEpochRecord, type SkillOptEvidence, type SkillPatch, type SkillPatchOp, SkillPatchParseError, type SkillPatchRejection, type Worktree, type WorktreeAdapter, WorktreeAdapterError, applySkillPatch, campaignBreakdown, campaignMeanComposite, compareDrivers, detectScale, dimensionRegressions, gepaParetoEntry, gepaReflectionEntry, gitWorktreeAdapter, heldoutSignificance, pairHoldout, parseSkillPatchResponse, patchEditCount, resolveWorktreePath, runProfileMatrix, runSkillOpt, skillOptDriver, skillOptEntry };
1242
+ export { type AcceptedEdit, type AceDriverOptions, type AnalystArtifact, type AnalystScenario, type ApplySkillPatchResult, type BuildAnalystSurfaceDispatchOptions, type CampaignBreakdown, CampaignResult, CampaignStorage, CodeSurface, type CompareDriversOptions, DRIVER_GUIDE, type DimensionRegression, DispatchContext, type DriverComparison, type DriverEntry, type DriverGoal, type DriverGuideEntry, type DriverName, type DriverPairwise, type DriverRecommendation, type DriverScore, type DriverStrategy, type DriverSurface, type FailureModeRecallJudgeOptions, FsLabeledScenarioStore, type FsLabeledScenarioStoreOptions, type GitWorktreeAdapterOptions, type HaloDriverOptions, type HeldoutSignificance, type HeldoutSignificanceOptions, ImprovementDriver, JudgeConfig, JudgeScore, LabelTrust, LabeledScenarioRecord, LabeledScenarioSampleArgs, LabeledScenarioSource, LabeledScenarioStore, LabeledScenarioStoreError, LabeledScenarioWrite, type MemoryCurationDriverOptions, MutableSurface, type OptimizerEntryConfig, type PairedHoldout, type PlaybackContext, type PlaybackDriver, type PlaybackStep, type ProfileDispatchFn, ProfileMatrixError, type ProfileSummary, ProposeContext, type ProposePatchesArgs, type RejectedEdit, RunCampaignOptions, RunImprovementLoopOptions, type RunProfileMatrixOptions, type RunProfileMatrixResult, type RunSkillOptOptions, type RunSkillOptResult, Scenario, type ScenarioRollup, type ScoreboardRow, type SelectDriverCriteria, type SkillOptDriver, type SkillOptDriverOptions, type SkillOptEpochRecord, type SkillOptEvidence, type SkillPatch, type SkillPatchOp, SkillPatchParseError, type SkillPatchRejection, type TraceAnalystDriverOptions, type UserStory, type UserStoryVerdict, type Worktree, type WorktreeAdapter, WorktreeAdapterError, aceDriver, applySkillPatch, buildAnalystSurfaceDispatch, campaignBreakdown, campaignMeanComposite, compareDrivers, detectScale, dimensionRegressions, failureModeRecallJudge, gepaParetoEntry, gepaReflectionEntry, gitWorktreeAdapter, 
haloDriver, heldoutSignificance, makePlaybackDispatch, memoryCurationDriver, pairHoldout, parseSkillPatchResponse, patchEditCount, resolveWorktreePath, runProfileMatrix, runSkillOpt, scoreUserStory, selectDriver, skillOptDriver, skillOptEntry, traceAnalystDriver, userStoryScoreboard };