npm - @tangle-network/agent-eval - Versions diffs - 0.72.0 → 0.72.3 - Mend

@tangle-network/agent-eval 0.72.0 → 0.72.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (69)

package/CHANGELOG.md +39 -0
package/dist/adapters/http.d.ts +1 -1
package/dist/adapters/langchain.d.ts +1 -1
package/dist/adapters/otel.d.ts +3 -2
package/dist/agent-profile-DYRboYWu.d.ts +364 -0
package/dist/analyst/index.d.ts +221 -0
package/dist/analyst/index.js +371 -0
package/dist/analyst/index.js.map +1 -0
package/dist/analyst-t7zZS3TV.d.ts +88 -0
package/dist/campaign/index.d.ts +485 -9
package/dist/campaign/index.js +597 -22
package/dist/campaign/index.js.map +1 -1
package/dist/chunk-7W4SM7FD.js +1075 -0
package/dist/chunk-7W4SM7FD.js.map +1 -0
package/dist/{chunk-AIWHLG7J.js → chunk-GJJNJVIR.js} +11 -11
package/dist/chunk-JHA3ZGSO.js +1496 -0
package/dist/chunk-JHA3ZGSO.js.map +1 -0
package/dist/{chunk-4QJN7RDX.js → chunk-JYE3WOTE.js} +55 -7
package/dist/{chunk-4QJN7RDX.js.map → chunk-JYE3WOTE.js.map} +1 -1
package/dist/chunk-LB2UOI5F.js +412 -0
package/dist/chunk-LB2UOI5F.js.map +1 -0
package/dist/{chunk-ODGETRTM.js → chunk-VUINJM5M.js} +234 -1415
package/dist/chunk-VUINJM5M.js.map +1 -0
package/dist/chunk-WYIHD6EB.js +1044 -0
package/dist/chunk-WYIHD6EB.js.map +1 -0
package/dist/{chunk-UD6EF73X.js → chunk-XPILG2CA.js} +119 -2
package/dist/chunk-XPILG2CA.js.map +1 -0
package/dist/contract/index.d.ts +17 -13
package/dist/contract/index.js +13 -7
package/dist/contract/index.js.map +1 -1
package/dist/{control-DxvZeV5X.d.ts → control-BgA6BYTm.d.ts} +1 -1
package/dist/control.d.ts +2 -2
package/dist/{feedback-trajectory-8hKC5EOb.d.ts → feedback-trajectory-B3rErRsh.d.ts} +1 -1
package/dist/harness-optimizer-EnEnQPsr.d.ts +106 -0
package/dist/hosted/index.d.ts +223 -2
package/dist/index.d.ts +49 -1323
package/dist/index.js +353 -2496
package/dist/index.js.map +1 -1
package/dist/{index-BGBrVS24.d.ts → insight-report-Df3lxYXM.d.ts} +1 -221
package/dist/kind-factory-DW9XWPvM.d.ts +172 -0
package/dist/multi-layer-verifier-DlWCXuxL.d.ts +141 -0
package/dist/openapi.json +1 -1
package/dist/pareto-E-pembql.d.ts +81 -0
package/dist/{provenance-C69gLUXH.d.ts → provenance-B-TFszPW.d.ts} +131 -4
package/dist/redact-B40YG2M_.d.ts +45 -0
package/dist/registry-DuVYiTvw.d.ts +128 -0
package/dist/{researcher-WJvIpX3L.d.ts → researcher-C_KJyIGg.d.ts} +1 -141
package/dist/rl.d.ts +4 -3
package/dist/rl.js +4 -4
package/dist/run-critic-BAIjX99r.d.ts +56 -0
package/dist/{run-improvement-loop-Bzamo6GB.d.ts → run-improvement-loop-BqYH2vCR.d.ts} +25 -1
package/dist/semantic-concept-judge-CV9Wlx4t.d.ts +650 -0
package/dist/{store-jzKpMl16.d.ts → store-GmBE2pZZ.d.ts} +1 -1
package/dist/traces.d.ts +371 -308
package/dist/traces.js +43 -18
package/dist/{types-CnmZ2bkP.d.ts → types-Bba0vl1V.d.ts} +1 -1
package/dist/{registry-BGKyX6bw.d.ts → types-CRD68aH7.d.ts} +3 -128
package/dist/wire/index.d.ts +1 -1
package/dist/workflow/index.d.ts +494 -0
package/dist/workflow/index.js +2177 -0
package/dist/workflow/index.js.map +1 -0
package/docs/design/self-improvement-roadmap.md +106 -0
package/package.json +36 -12
package/dist/agent-profile-DzcPHR1Z.d.ts +0 -114
package/dist/chunk-ODGETRTM.js.map +0 -1
package/dist/chunk-SL55X4VN.js +0 -186
package/dist/chunk-SL55X4VN.js.map +0 -1
package/dist/chunk-UD6EF73X.js.map +0 -1
package/dist/{chunk-AIWHLG7J.js.map → chunk-GJJNJVIR.js.map} +0 -0

package/dist/campaign/index.d.ts CHANGED

@@ -1,25 +1,329 @@
-import { a as RunCampaignOptions, C as CampaignStorage } from '../run-improvement-loop-Bzamo6GB.js';
-export { d as GepaDriverConstraints, G as GepaDriverOptions, O as OpenAutoPrOptions, e as OpenAutoPrResult, b as RunImprovementLoopOptions, R as RunImprovementLoopResult, h as RunOptimizationOptions, j as RunOptimizationResult, k as countSentenceEdits, l as defaultRenderDiff, m as extractH2Sections, f as fsCampaignStorage, g as gepaDriver, i as inMemoryCampaignStorage, o as openAutoPr, r as runCampaign, c as runImprovementLoop, n as runOptimization, s as surfaceHash } from '../run-improvement-loop-Bzamo6GB.js';
-export { B as BuildLoopProvenanceArgs, D as DefaultProductionGateOptions, a as EmitLoopProvenanceArgs, b as EmitLoopProvenanceResult, E as EvolutionaryDriverOptions, H as HeldOutGateOptions, f as LoopProvenanceBackend, g as LoopProvenanceCandidate, L as LoopProvenanceRecord, R as RunEvalOptions, i as buildLoopProvenanceRecord, c as composeGate, d as defaultProductionGate, j as emitLoopProvenance, e as evolutionaryDriver, h as heldOutGate, l as loopProvenanceSpans, p as provenanceRecordPath, k as provenanceSpansPath, r as runEval, s as surfaceContentHash } from '../provenance-C69gLUXH.js';
+import { A as AnalyzeTracesOptions, a as AnalyzeTracesInput, b as AnalyzeTracesResult } from '../analyst-t7zZS3TV.js';
+import { S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, I as ImprovementDriver, q as ProposeContext, J as JudgeScore, L as LabeledScenarioStore, r as LabeledScenarioWrite, s as LabeledScenarioSampleArgs, t as LabeledScenarioRecord, u as LabelTrust, v as LabeledScenarioSource, f as CampaignResult, h as CodeSurface } from '../types-Bba0vl1V.js';
+export { C as CampaignAggregates, c as CampaignArtifactWriter, d as CampaignCellResult, e as CampaignCostMeter, w as CampaignTokenUsage, g as CampaignTraceWriter, D as DispatchFn, G as Gate, i as GateContext, j as GateDecision, k as GateResult, l as GenerationCandidate, m as GenerationRecord, x as JudgeAggregate, n as JudgeDimension, o as Mutator, O as OptimizerConfig, P as ParetoParent, y as ProposedCandidate, R as RedactionStatus, z as ScenarioAggregate, p as SessionScript, T as TraceSpan, A as isProposedCandidate, B as labelTrustRank } from '../types-Bba0vl1V.js';
+import { a as RunCampaignOptions, b as RunImprovementLoopOptions, C as CampaignStorage } from '../run-improvement-loop-BqYH2vCR.js';
+export { d as GepaDriverConstraints, G as GepaDriverOptions, O as OpenAutoPrOptions, e as OpenAutoPrResult, R as RunImprovementLoopResult, h as RunOptimizationOptions, j as RunOptimizationResult, k as countSentenceEdits, l as defaultRenderDiff, m as extractH2Sections, f as fsCampaignStorage, g as gepaDriver, i as inMemoryCampaignStorage, o as openAutoPr, r as runCampaign, c as runImprovementLoop, n as runOptimization, s as surfaceHash } from '../run-improvement-loop-BqYH2vCR.js';
+export { A as AxisEvidence, a as AxisVerdict, B as BuildEvidenceVectorOptions, k as BuildLoopProvenanceArgs, D as DefaultProductionGateOptions, l as EmitLoopProvenanceArgs, m as EmitLoopProvenanceResult, E as EvidenceVector, b as EvolutionaryDriverOptions, H as HeldOutGateOptions, n as LoopProvenanceBackend, o as LoopProvenanceCandidate, L as LoopProvenanceRecord, O as ObjectiveSource, P as ParetoSignificanceGateOptions, c as PromotionObjective, d as PromotionPolicy, R as RunEvalOptions, e as buildEvidenceVector, q as buildLoopProvenanceRecord, f as composeGate, g as defaultProductionGate, s as emitLoopProvenance, h as evolutionaryDriver, i as heldOutGate, t as loopProvenanceSpans, p as paretoPolicy, j as paretoSignificanceGate, u as provenanceRecordPath, v as provenanceSpansPath, r as runEval, w as surfaceContentHash } from '../provenance-B-TFszPW.js';
 import { L as LlmClientOptions } from '../llm-client-DbjLfz-K.js';
-import { I as ImprovementDriver, J as JudgeScore, L as LabeledScenarioStore, q as LabeledScenarioWrite, r as LabeledScenarioSampleArgs, s as LabeledScenarioRecord, t as LabelTrust, S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, u as LabeledScenarioSource, f as CampaignResult, h as CodeSurface } from '../types-CnmZ2bkP.js';
-export { C as CampaignAggregates, c as CampaignArtifactWriter, d as CampaignCellResult, e as CampaignCostMeter, v as CampaignTokenUsage, g as CampaignTraceWriter, D as DispatchFn, G as Gate, i as GateContext, j as GateDecision, k as GateResult, l as GenerationCandidate, m as GenerationRecord, w as JudgeAggregate, n as JudgeDimension, o as Mutator, O as OptimizerConfig, P as ParetoParent, x as ProposeContext, y as ProposedCandidate, R as RedactionStatus, z as ScenarioAggregate, p as SessionScript, T as TraceSpan, A as isProposedCandidate, B as labelTrustRank } from '../types-CnmZ2bkP.js';
+import { c as TraceAnalystKindSpec } from '../kind-factory-DW9XWPvM.js';
+import { c as AnalystFinding } from '../types-CRD68aH7.js';
 import { a as PairedBootstrapResult } from '../statistics-B7yCbi9i.js';
-import { A as AgentProfile, B as BackendIntegrityReport } from '../agent-profile-DzcPHR1Z.js';
+import { A as AgentProfile, B as BackendIntegrityReport, C as CompletionRequirement, R as RuntimeEventLike, a as CompletionVerdict, P as ProducedState, b as CorrectnessChecker } from '../agent-profile-DYRboYWu.js';
 import { A as AgentEvalError } from '../errors-Dwqw-T_m.js';
 import { b as RunSplitTag, R as RunRecord } from '../run-record-BgTFzO2r.js';
+import '@ax-llm/ax';
+import '../store-GmBE2pZZ.js';
 import '../red-team-DW9Ca_tj.js';
 import '../dataset-B2kL-fSM.js';
 import '../store-CKUAgsJz.js';
 import '../schema-m0gsnbt3.js';
-import '../index-BGBrVS24.js';
+import '../pareto-E-pembql.js';
+import '../hosted/index.js';
+import '../insight-report-Df3lxYXM.js';
 import '../summary-report-ByiOUrHj.js';
 import '../failure-cluster-CL7IVgkJ.js';
 import '../judge-calibration-DilmB3Ml.js';
 import '../raw-provider-sink-C46HDghv.js';
+import 'zod';
 import '../types-Croy5h7V.js';
 import '@tangle-network/tcloud';
+/**
+ * @experimental
+ *
+ * Make the trace-analyst's OWN prompt a GEPA-optimizable surface.
+ *
+ * The analyst that drives self-improvement is itself a prompt — and a
+ * hand-tuned one (a hardcoded, hand-versioned `const`). This module lets the
+ * loop optimize it: the analyst `actorDescription` becomes a `MutableSurface`
+ * that `gepaDriver` / `haloDriver` / any `ImprovementDriver` can mutate inside
+ * `runImprovementLoop` or `compareDrivers`. That is the second-order loop —
+ * optimizing the optimizer's eyes, not just the agent's prompt.
+ *
+ * Two pieces, both deliberately small (the loop engine already exists — this
+ * only supplies the analyst-shaped dispatch + an objective scorer):
+ *
+ *  - `buildAnalystSurfaceDispatch` — `dispatchWithSurface(surface, scenario)`
+ *    runs `analyzeTraces` with `surface` as the actorDescription over the
+ *    scenario's fixed trace corpus, returning its findings.
+ *  - `failureModeRecallJudge` — a DETERMINISTIC judge (no LLM, no opinion)
+ *    that scores those findings against the scenario's GROUND-TRUTH failure
+ *    modes. This is what keeps optimizing the analyst prompt ungameable: the
+ *    labels come from objective signal (e.g. AppWorld `world.evaluate()` tells
+ *    us which task failed and which API calls were wrong), so we reward an
+ *    analyst for surfacing the failures that really happened — not for
+ *    pleasing a judge that could be talked into anything (Goodhart).
+ *
+ * Wiring (the loop is unchanged; you only pass these in):
+ *
+ *   const dispatchWithSurface = buildAnalystSurfaceDispatch({ analystOptions: { ai } })
+ *   await runImprovementLoop({
+ *     baselineSurface: TRACE_ANALYST_ACTOR_DESCRIPTION,  // the prompt under optimization
+ *     scenarios: trainScenarios,                          // labeled trace corpora
+ *     holdoutScenarios: heldOutScenarios,
+ *     dispatchWithSurface,
+ *     judges: [failureModeRecallJudge()],
+ *     driver: gepaDriver({ baseUrl, apiKey }),
+ *     gate: heldOutGate({ minDelta: 0.02 }),
+ *     autoOnPromote: 'none',
+ *   })
+ */
+/**
+ * A labeled trace scenario: a FIXED trace corpus plus the failure modes a
+ * competent analyst MUST surface from it. The labels are ground truth — the
+ * objective failures that actually occurred — which is what makes optimizing
+ * the analyst prompt against them meaningful rather than circular.
+ */
+interface AnalystScenario extends Scenario {
+    kind: 'analyst-surface';
+    /** OTLP-JSONL path or an in-memory store of the traces to analyze. */
+    source: AnalyzeTracesOptions['source'];
+    /** The domain question handed to the analyst (framing lives here, not in
+     *  the surface under optimization). */
+    question: string;
+    /**
+     * Ground-truth failure modes a good analyst must identify. A finding "hits"
+     * a mode when it contains ANY of the mode's case-insensitive cues. Derive
+     * these from objective signal (failed task + which step broke), never from
+     * the analyst's own prior output.
+     */
+    expectedFailureModes: Array<{
+        id: string;
+        cues: string[];
+    }>;
+    /**
+     * Cues that mark a finding as HALLUCINATED / out-of-scope for this corpus —
+     * naming a tool, error, or failure that did not occur. Presence penalizes
+     * precision. Optional; omit to score recall only.
+     */
+    forbiddenCues?: string[];
+}
+/** The analyst's output for one scenario — the artifact the judge scores. */
+interface AnalystArtifact {
+    answer: string;
+    findings: string[];
+    /** The hardcoded-prompt version the analyst reported (provenance only; the
+     *  optimized surface overrides the actual prompt text used). */
+    actorPromptVersion: string;
+}
+interface BuildAnalystSurfaceDispatchOptions {
+    /**
+     * Everything `analyzeTraces` needs EXCEPT `actorDescription` (supplied by the
+     * surface under optimization) and `source` (supplied by the scenario). `ai`
+     * (the AxAIService) is required for a live run.
+     */
+    analystOptions: Omit<AnalyzeTracesOptions, 'actorDescription' | 'source'>;
+    /** Test seam: defaults to the real `analyzeTraces`. */
+    analyze?: (input: AnalyzeTracesInput, options: AnalyzeTracesOptions) => Promise<AnalyzeTracesResult>;
+}
+/**
+ * Build the `dispatchWithSurface(surface, scenario, ctx)` the improvement loop
+ * calls: run the analyst with `surface` as its actorDescription over the
+ * scenario's trace corpus and return its findings.
+ */
+declare function buildAnalystSurfaceDispatch(opts: BuildAnalystSurfaceDispatchOptions): (surface: MutableSurface, scenario: AnalystScenario, ctx: DispatchContext) => Promise<AnalystArtifact>;
+interface FailureModeRecallJudgeOptions {
+    /** Weight on recall when precision is also scored (forbiddenCues present).
+     *  Default 0.5 (equal). Recall-only when no forbiddenCues exist. */
+    recallWeight?: number;
+}
+/**
+ * Deterministic, ground-truth judge for analyst findings. Composite =
+ * recall of the scenario's `expectedFailureModes` (optionally blended with a
+ * precision term that penalizes findings tripping `forbiddenCues`). No LLM —
+ * the score is a function of the labels, so the analyst prompt is optimized
+ * toward surfacing real failures, not toward a judge it can flatter.
+ */
+declare function failureModeRecallJudge(opts?: FailureModeRecallJudgeOptions): JudgeConfig<AnalystArtifact, AnalystScenario>;
+/**
+ * @experimental
+ *
+ * `aceDriver` — Agentic Context Engineering: an APPEND-MOSTLY curator, the
+ * deliberate contrast to `memoryCurationDriver`'s dedup-and-replace. ACE's
+ * thesis (arXiv:2510.04618) is that aggressively deduping/rewriting a context
+ * causes "context collapse" — hard-won specific lessons get summarized away. So
+ * the playbook GROWS by appending each generation's new lessons as provenance-
+ * tagged delta bullets; existing bullets are preserved verbatim, never merged.
+ *
+ * Each generation it:
+ *  1. reads the playbook block already in the parent surface (verbatim);
+ *  2. turns this generation's `findings` into lessons, keeping only the ones not
+ *     already present (idempotency — a recurring finding is not re-appended, but
+ *     a genuinely NEW lesson always is, even if similar to an old one);
+ *  3. appends the new lessons as `- [gN] <lesson>` deltas and re-emits the block.
+ *
+ * Bounded WITHOUT collapse: when the playbook exceeds `maxEntries`, the OLDEST
+ * deltas are evicted (FIFO) — recency is kept, but no two distinct lessons are
+ * ever merged into one. Deterministic (no LLM) so a lift is attributable to the
+ * accumulated lessons, not a rewrite's model noise.
+ *
+ * Fail-loud: with no new lesson this generation it returns NO candidate (the
+ * playbook is unchanged — nothing to propose), never a fabricated bullet.
+ */
+interface AceDriverOptions {
+    /** Max delta bullets retained in the playbook. On overflow the OLDEST are
+     *  evicted (FIFO) — never merged. Default 50 (ACE keeps a long context). */
+    maxEntries?: number;
+    /** Heading rendered above the bullets inside the block. */
+    sectionHeading?: string;
+}
+declare function aceDriver(opts?: AceDriverOptions): ImprovementDriver;
+/**
+ * Driver selection guide — "which `ImprovementDriver` do I pick, and why?"
+ *
+ * The substrate ships seven drivers with overlapping shapes. This is the
+ * decision table (data, not behavior): each entry says what a driver mutates,
+ * how it proposes changes, when to reach for it, and its relative cost.
+ * `selectDriver()` turns a goal + surface into a ranked recommendation.
+ *
+ * Import the actual driver functions from `@tangle-network/agent-eval/campaign`
+ * (gepaDriver, skillOptDriver, aceDriver, memoryCurationDriver, haloDriver,
+ * traceAnalystDriver, evolutionaryDriver); this module only helps you choose.
+ */
+type DriverName = 'gepa' | 'skillOpt' | 'ace' | 'memoryCuration' | 'halo' | 'traceAnalyst' | 'evolutionary';
+/** The mutable surface a driver targets. */
+type DriverSurface = 'prompt' | 'skill-doc' | 'playbook' | 'memory' | 'any';
+/** How a driver turns evidence into the next candidate. */
+type DriverStrategy = 'reflective-rewrite' | 'anchored-patch' | 'append-only' | 'dedup-curate' | 'analysis-edit' | 'population-mutate';
+/** What a caller is trying to do this run. */
+type DriverGoal = 'explore' | 'refine' | 'accumulate' | 'benchmark';
+interface DriverGuideEntry {
+    /** One-line description of the mechanism. */
+    summary: string;
+    /** The surface the driver edits. */
+    surface: DriverSurface;
+    /** How it proposes the next candidate. */
+    strategy: DriverStrategy;
+    /** When to reach for this driver. */
+    whenUse: string;
+    /** Relative LLM cost per generation. */
+    cost: 'low' | 'medium' | 'high';
+    /** True when the driver shells out to an external engine (extra setup). */
+    external?: boolean;
+}
+declare const DRIVER_GUIDE: Record<DriverName, DriverGuideEntry>;
+interface SelectDriverCriteria {
+    /** What you're trying to do this run. */
+    goal: DriverGoal;
+    /** Restrict to drivers that edit this surface (optional). */
+    surface?: DriverSurface;
+}
+interface DriverRecommendation {
+    name: DriverName;
+    entry: DriverGuideEntry;
+    reason: string;
+}
+/**
+ * Rank the drivers for a goal (and optional surface filter), best first.
+ * Returns the recommendation list, not instances — import the chosen driver
+ * function yourself. Always returns at least the goal's primary driver.
+ */
+declare function selectDriver(criteria: SelectDriverCriteria): DriverRecommendation[];
+/**
+ * @experimental
+ *
+ * `haloDriver` — wraps the REAL halo-engine (Inference.net's hierarchical
+ * agentic trace analyzer, `pip install halo-engine`, repo context-labs/halo)
+ * as an agent-eval `ImprovementDriver`, so HALO competes head-to-head with
+ * `gepaDriver` — and with our own `traceAnalystDriver` — inside
+ * `compareDrivers` on identical traces / scenarios / held-out scoring.
+ *
+ * It PRESERVES halo's actual working usage — `propose()` shells out to the
+ * published CLI (`halo <traces.jsonl> -p <prompt> -m <model> --base-url
+ * --api-key`) and uses its real RLM findings verbatim. We do NOT reimplement
+ * its analysis; that would make the benchmark meaningless. The only adaptation
+ * is applying HALO's findings to the current prompt surface via one LLM edit —
+ * exactly what makes the comparison prompt-tier apples-to-apples with
+ * `gepaDriver` (which also mutates the prompt). The analysis is HALO's; only
+ * the surface-application is ours, and it is identical in spirit to how HALO's
+ * own loop feeds findings to a coding agent.
+ *
+ * Fail-loud: no traces → throw; halo errors → throw; empty findings → throw.
+ * Never fabricate a candidate (that would silently flatter or penalize HALO).
+ */
+interface HaloDriverOptions {
+    /** OpenAI-compatible base URL for BOTH halo's RLM analysis and the apply
+     *  step (e.g. the Tangle router `https://router.tangle.tools/v1`). */
+    baseUrl: string;
+    /** Bearer key (else relies on OPENAI_API_KEY in the env halo inherits). */
+    apiKey?: string;
+    /** Model for halo's `--model` (its RLM). Default 'gpt-5.4-mini' (halo's own default). */
+    model?: string;
+    /** Model used to APPLY halo's findings to the prompt surface. Default = `model`. */
+    applyModel?: string;
+    /** The real halo binary. Default 'halo' (from `pip install halo-engine`). */
+    haloBin?: string;
+    /**
+     * Resolve the OTLP traces (JSONL string) halo should analyze for THIS
+     * generation — wired by the bench to the captured AppWorld OTLP for the
+     * current surface. Returning empty throws (halo has nothing to analyze).
+     */
+    resolveTraces: (ctx: ProposeContext) => string | Promise<string>;
+    /** halo's analysis prompt (`-p`). Default targets the failure taxonomy. */
+    analysisPrompt?: string;
+    /** halo `--max-depth` / `--max-turns` passthrough. */
+    maxDepth?: number;
+    maxTurns?: number;
+    /** Test seam: inject a fetch for the apply-step callLlm (no network in unit tests). */
+    fetchImpl?: LlmClientOptions['fetch'];
+}
+/** Wrap the real halo-engine CLI as an ImprovementDriver (prompt-tier). */
+declare function haloDriver(opts: HaloDriverOptions): ImprovementDriver;
+/**
+ * @experimental
+ *
+ * `memoryCurationDriver` — a CURATOR `ImprovementDriver`, the complement to the
+ * OPTIMIZER drivers (`gepaDriver` rewrites the prompt; this one BUILDS a
+ * searchable memory of what prior trajectories taught and grafts the most
+ * relevant lessons onto the surface).
+ *
+ * Each generation it:
+ *  1. collects lessons — this generation's trace-analyst `findings` PLUS the
+ *     memory already carried in the parent surface (so memory accumulates
+ *     across generations instead of resetting);
+ *  2. curates them — normalizes, deduplicates near-identical lessons, and ranks
+ *     by recurrence (a lesson seen across many findings outranks a one-off);
+ *  3. retrieves the top-K and writes them back as a single delimited memory
+ *     block in the surface (idempotent — the block is replaced, never stacked,
+ *     so the prompt does not grow without bound).
+ *
+ * This is the substrate behind the "knowledge base of working trajectories" the
+ * agent searches: the curated block IS the retrieved memory the next run reads.
+ * Curation is DETERMINISTIC (no LLM) so a lift it produces is attributable to
+ * the lessons, not to model noise in a rewrite. An optional `distill` LLM step
+ * can compress raw findings into crisp imperatives; default is verbatim.
+ *
+ * Fail-loud: never fabricates a lesson. With no findings and no prior memory it
+ * returns no candidate (nothing learned yet — gen 0). It does not throw on an
+ * empty generation because early generations legitimately have no findings.
+ */
+interface MemoryCurationDriverOptions {
+    /** Top-K lessons retained in the surface memory block. Default 12. */
+    maxEntries?: number;
+    /** Heading rendered above the lessons inside the block. Default below. */
+    sectionHeading?: string;
+    /**
+     * Optional LLM distillation: compress raw findings into crisp, generalizable
+     * one-line imperatives before curating. Omit for verbatim (deterministic).
+     */
+    distill?: {
+        baseUrl: string;
+        apiKey?: string;
+        model: string;
+        fetchImpl?: LlmClientOptions['fetch'];
+    };
+}
+/** Build the CURATOR driver. */
+declare function memoryCurationDriver(opts?: MemoryCurationDriverOptions): ImprovementDriver;
 /**
  * @experimental
  *
@@ -134,6 +438,11 @@ interface ProposePatchesArgs {
     rejectedBuffer: RejectedEdit[];
     /** Slow-update meta guidance accumulated across epochs. */
     metaNote?: string;
+    /** Analyst findings + research report rendered as a prompt block (the
+     *  EYES→HANDS wire) so a patch targets a NAMED diagnosed root cause. Built by
+     *  the driver from `ctx.findings`/`ctx.report`; the patch-native `runSkillOpt`
+     *  path may also supply it. */
+    findingsNote?: string;
     /** How many candidate patches to propose. */
     count: number;
     signal: AbortSignal;
@@ -167,6 +476,74 @@ declare class SkillPatchParseError extends Error {
 }
 declare function parseSkillPatchResponse(raw: string, maxPatches: number, editBudget: number): SkillPatch[];
+/**
+ * @experimental
+ *
+ * `traceAnalystDriver` — wraps agent-eval's OWN trace-analyst engine
+ * (`AnalystRegistry` over the agentic OTLP reader) as an `ImprovementDriver`.
+ * It is the symmetric opponent to `haloDriver`: both consume the SAME OTLP
+ * corpus and apply their findings to the prompt surface via one IDENTICAL
+ * LLM edit, so a `compareDrivers` lift delta isolates a single variable —
+ * ANALYSIS QUALITY. The benchmark answers "is our HALO clone as good as the
+ * real HALO?" as a held-out lift CI, not a vibe.
+ *
+ * The fairness contract (the only thing that makes the head-to-head honest):
+ *   - SAME input: both engines read the identical `traces.jsonl` (haloDriver
+ *     hands it to the halo CLI; this driver wraps it in an `OtlpFileTraceStore`).
+ *   - SAME application: the apply-step here is byte-for-byte the apply-step in
+ *     `haloDriver` (same `APPLY_SYSTEM`, same one-shot `callLlm` prompt edit).
+ *   - ONLY difference: who produced the findings — the real halo-engine vs our
+ *     `AnalystRegistry` (whose actor prompt is a near-verbatim port of HALO's).
+ *
+ * Findings come from the REGISTRY (structured `AnalystFinding[]` carrying
+ * area / severity / recommended_action), NOT bare `analyzeTraces` (which emits
+ * `string[]`). The registry is the productized engine; raw `analyzeTraces` is
+ * the unstructured escape hatch.
+ *
+ * Fail-loud: no traces → throw; analyst run errors → throw; zero findings →
+ * throw. Never fabricate a candidate (that would silently flatter or penalize
+ * our engine relative to HALO).
+ */
+interface TraceAnalystDriverOptions {
+    /** OpenAI-compatible base URL for BOTH the analyst's agentic reads and the
+     *  apply step (e.g. `https://api.deepseek.com/v1` or the Tangle router). */
+    baseUrl: string;
+    /** Bearer key. Required — the Ax AI service has no env fallback here. */
+    apiKey: string;
+    /** Model the analyst kinds use for their agentic trace reads. */
+    model: string;
+    /** Model used to APPLY findings to the prompt surface. Default = `model`.
+     *  Keep this EQUAL to haloDriver's `applyModel` for an apples-to-apples run. */
+    applyModel?: string;
+    /** Ax provider name. Default 'openai' — works for any OpenAI-compatible base
+     *  via `apiURL`. Use 'deepseek' to hit DeepSeek's native provider. */
+    provider?: string;
+    /** Which analyst kinds to run. Default = the full shipped suite
+     *  (`DEFAULT_TRACE_ANALYST_KINDS`: failure-mode, knowledge-gap,
+     *  knowledge-poisoning, improvement). Narrow it for cost-parity runs. */
+    kinds?: readonly TraceAnalystKindSpec[];
+    /**
+     * Resolve the OTLP traces (JSONL string) the analyst should read for THIS
+     * generation — identical contract to `haloDriver.resolveTraces`, wired by
+     * the bench to the captured AppWorld OTLP for the current surface. Returning
+     * empty throws (the analyst has nothing to read).
+     */
+    resolveTraces: (ctx: ProposeContext) => string | Promise<string>;
+    /**
+     * Override the findings producer. Default: the shipped `AnalystRegistry`
+     * over `kinds`, reading the resolved traces as an `OtlpFileTraceStore`. A
+     * consumer may inject a pre-built registry / alternate engine here; the
+     * unit suite injects canned findings to exercise the apply path without
+     * driving the agentic loop.
+     */
+    analyze?: (tracePath: string, ctx: ProposeContext) => Promise<ReadonlyArray<AnalystFinding>>;
+    /** Test seam: inject a fetch for the apply-step `callLlm` (no network in unit tests). */
+    fetchImpl?: LlmClientOptions['fetch'];
+}
+/** Wrap agent-eval's trace-analyst registry as an ImprovementDriver (prompt-tier). */
+declare function traceAnalystDriver(opts: TraceAnalystDriverOptions): ImprovementDriver;
 /**
  * @experimental
  *
@@ -428,12 +805,28 @@ interface OptimizerEntryConfig<TScenario extends Scenario, TArtifact> {
     /** SkillOpt epochs. Default 6. */
     maxEpochs?: number;
     mutationPrimitives?: string[];
+    /** Static findings seed forwarded to each GEPA driver's `propose()` as
+     *  `ctx.findings` (the EYES→HANDS wire). Forwarded by `gepaReflectionEntry` /
+     *  `gepaParetoEntry`; `skillOptEntry` runs findings-BLIND (see its doc). */
+    findings?: unknown[];
+    /** Per-generation findings producer (EYES→HANDS loop closure): after each
+     *  generation scores, this re-diagnoses and REPLACES `ctx.findings` for the
+     *  next generation's `propose()`. Reuses the `runOptimization` field type so
+     *  it cannot drift. GEPA entries only. */
+    analyzeGeneration?: RunImprovementLoopOptions<TScenario, TArtifact>['analyzeGeneration'];
+    /** Phase-2 research report forwarded to `propose()` as `ctx.report`. */
+    report?: unknown;
 }
 /** GEPA, reflection-only (single-parent, no Pareto combine). */
 declare function gepaReflectionEntry<TScenario extends Scenario, TArtifact>(config: OptimizerEntryConfig<TScenario, TArtifact>, name?: string): DriverEntry;
 /** GEPA with the Pareto frontier + combine-complementary-lessons. */
 declare function gepaParetoEntry<TScenario extends Scenario, TArtifact>(config: OptimizerEntryConfig<TScenario, TArtifact>, name?: string): DriverEntry;
-/** SkillOpt patch-mode hill-climb. */
+/** SkillOpt patch-mode hill-climb. Runs findings-BLIND: `runSkillOpt` owns its
+ *  own epoch acceptance/budget loop and does not thread `analyzeGeneration`, so
+ *  `config.findings` is intentionally NOT forwarded here. In a findings-fed
+ *  comparison this entry is the blind control — do not read its result as
+ *  findings-fed. (Threading findings into the SkillOpt epoch loop is a separate
+ *  refactor, deferred not faked.) */
 declare function skillOptEntry<TScenario extends Scenario, TArtifact>(config: OptimizerEntryConfig<TScenario, TArtifact>, name?: string): DriverEntry;
 /**
@@ -584,6 +977,89 @@ interface RunProfileMatrixResult<TArtifact, TScenario extends Scenario> {
 }
 declare function runProfileMatrix<TScenario extends Scenario, TArtifact>(opts: RunProfileMatrixOptions<TScenario, TArtifact>): Promise<RunProfileMatrixResult<TArtifact, TScenario>>;
+/**
+ * Product-flow playback — drive the REAL product through a user story and
+ * score the produced state per requirement (the launch "Jira tick-off").
+ *
+ * This is the substrate adapter + contract only. It plugs a `PlaybackDriver`
+ * into the existing `runProfileMatrix` dispatch seam: a driver drives the real
+ * product (a Playwright UI session or a sandbox workspace) and returns the
+ * runtime event stream; `extractProducedState` + `verifyCompletion` then score
+ * each requirement PASS/FAIL. The concrete drivers live in consumers — they
+ * depend on browser / runtime infra the substrate must not import — so
+ * agent-eval owns the seam, the `UserStory` contract, and the scoreboard.
+ */
+/** One step of a user story — what the user does; an element of the ordered
+ *  `UserStory.steps` list. The driver interprets `payload` (a Playwright
+ *  selector + action, or a sandbox chat turn). */
+interface PlaybackStep {
+    /** Human-readable action, captured verbatim in the UX narrative. */
+    action: string;
+    /** Optional driver-specific payload (e.g. `{ selector, fill }` or
+     *  `{ turn }`). Values are typed `unknown`, so each driver narrows them. */
+    payload?: Record<string, unknown>;
+}
+/**
+ * A user story = a runnable product journey plus the requirements that define
+ * "this story works". Each requirement is one Jira ticket line. Extends
+ * `Scenario` so a catalog drops straight into `runProfileMatrix({ scenarios })`.
+ */
+interface UserStory extends Scenario {
+    /** Human-readable story title (the ticket headline). */
+    title: string;
+    /** Ordered steps the driver executes. */
+    steps: PlaybackStep[];
+    /** What must hold in the produced state for the story to pass;
+     *  `scoreUserStory` reports a PASS/FAIL verdict per requirement. */
+    requirements: CompletionRequirement[];
+}
+/** Dispatch context plus the profile under test (which cheap model, etc.).
+ *  This is the `ctx` argument handed to `PlaybackDriver.run`. */
+interface PlaybackContext extends DispatchContext {
+    profile: AgentProfile;
+}
+/**
+ * Drives the real product through a story and returns the runtime event stream
+ * `extractProducedState` consumes. Implemented by CONSUMERS —
+ * `SandboxPlaybackDriver` (real API / sandbox workspace) and
+ * `PlaywrightPlaybackDriver` (real UI) — because they depend on runtime /
+ * browser infra the substrate must not import. The driver MUST report LLM
+ * usage via `ctx.cost.observeTokens` so the backend-integrity guard sees real
+ * tokens (a run that never reports tokens reads as a stub).
+ *
+ * `run` is asynchronous and resolves to a read-only list of
+ * `RuntimeEventLike`. `TStory` defaults to `UserStory`, so a driver may narrow
+ * it to a richer story type.
+ */
+interface PlaybackDriver<TStory extends UserStory = UserStory> {
+    run(story: TStory, ctx: PlaybackContext): Promise<readonly RuntimeEventLike[]>;
+}
+/**
+ * Adapt a `PlaybackDriver` into a `runProfileMatrix` dispatch. The artifact the
+ * matrix scores is the `ProducedState` extracted from the driver's event
+ * stream — grade it with `scoreUserStory` (or a judge wrapping it). The
+ * returned dispatch is typed `ProfileDispatchFn<TStory, ProducedState>`, so the
+ * story type of the supplied `driver` carries through to the matrix.
+ */
+declare function makePlaybackDispatch<TStory extends UserStory>(driver: PlaybackDriver<TStory>): ProfileDispatchFn<TStory, ProducedState>;
+/** A scored user story — the completion verdict plus its human title. This is
+ *  what `scoreUserStory` resolves to and what `userStoryScoreboard` accepts. */
+interface UserStoryVerdict extends CompletionVerdict {
+    title: string;
+}
+/**
+ * Score one story's produced state against its requirements. Thin wrapper over
+ * `verifyCompletion` that builds the gold from the story and returns a
+ * per-requirement PASS/FAIL verdict. `checkCorrectness` is injected — a
+ * deterministic stub in tests, `createLlmCorrectnessChecker` in production.
+ * `state` is the `ProducedState` being graded; the call is asynchronous and
+ * resolves to a `UserStoryVerdict`.
+ */
+declare function scoreUserStory(story: UserStory, state: ProducedState, checkCorrectness: CorrectnessChecker): Promise<UserStoryVerdict>;
+/** One row of the launch scoreboard — story × requirement → PASS/FAIL.
+ *  `storyId`/`storyTitle` name the story and `reqId`/`reqTitle` the
+ *  requirement; `evidence` carries the evidence behind `status`. */
+interface ScoreboardRow {
+    storyId: string;
+    storyTitle: string;
+    reqId: string;
+    reqTitle: string;
+    status: 'PASS' | 'FAIL';
+    evidence: string[];
+}
+/**
+ * Flatten story verdicts into the per-requirement scoreboard — the literal
+ * Jira tick-off: one row per (story, requirement) with PASS/FAIL and the
+ * evidence behind the verdict. Takes a read-only list of `UserStoryVerdict`
+ * and returns the rows synchronously as a plain `ScoreboardRow[]`.
+ */
+declare function userStoryScoreboard(verdicts: readonly UserStoryVerdict[]): ScoreboardRow[];
 /**
  * @experimental
  *
@@ -763,4 +1239,4 @@ declare function gitWorktreeAdapter(opts: GitWorktreeAdapterOptions): WorktreeAd
  *  as a ref under the adapter's worktree dir. */
 declare function resolveWorktreePath(surface: CodeSurface, worktreeDir?: string): string;
-export { type AcceptedEdit, type ApplySkillPatchResult, type CampaignBreakdown, CampaignResult, CampaignStorage, CodeSurface, type CompareDriversOptions, type DimensionRegression, DispatchContext, type DriverComparison, type DriverEntry, type DriverPairwise, type DriverScore, FsLabeledScenarioStore, type FsLabeledScenarioStoreOptions, type GitWorktreeAdapterOptions, type HeldoutSignificance, type HeldoutSignificanceOptions, ImprovementDriver, JudgeConfig, JudgeScore, LabelTrust, LabeledScenarioRecord, LabeledScenarioSampleArgs, LabeledScenarioSource, LabeledScenarioStore, LabeledScenarioStoreError, LabeledScenarioWrite, MutableSurface, type OptimizerEntryConfig, type PairedHoldout, type ProfileDispatchFn, ProfileMatrixError, type ProfileSummary, type ProposePatchesArgs, type RejectedEdit, RunCampaignOptions, type RunProfileMatrixOptions, type RunProfileMatrixResult, type RunSkillOptOptions, type RunSkillOptResult, Scenario, type ScenarioRollup, type SkillOptDriver, type SkillOptDriverOptions, type SkillOptEpochRecord, type SkillOptEvidence, type SkillPatch, type SkillPatchOp, SkillPatchParseError, type SkillPatchRejection, type Worktree, type WorktreeAdapter, WorktreeAdapterError, applySkillPatch, campaignBreakdown, campaignMeanComposite, compareDrivers, detectScale, dimensionRegressions, gepaParetoEntry, gepaReflectionEntry, gitWorktreeAdapter, heldoutSignificance, pairHoldout, parseSkillPatchResponse, patchEditCount, resolveWorktreePath, runProfileMatrix, runSkillOpt, skillOptDriver, skillOptEntry };
+export { type AcceptedEdit, type AceDriverOptions, type AnalystArtifact, type AnalystScenario, type ApplySkillPatchResult, type BuildAnalystSurfaceDispatchOptions, type CampaignBreakdown, CampaignResult, CampaignStorage, CodeSurface, type CompareDriversOptions, DRIVER_GUIDE, type DimensionRegression, DispatchContext, type DriverComparison, type DriverEntry, type DriverGoal, type DriverGuideEntry, type DriverName, type DriverPairwise, type DriverRecommendation, type DriverScore, type DriverStrategy, type DriverSurface, type FailureModeRecallJudgeOptions, FsLabeledScenarioStore, type FsLabeledScenarioStoreOptions, type GitWorktreeAdapterOptions, type HaloDriverOptions, type HeldoutSignificance, type HeldoutSignificanceOptions, ImprovementDriver, JudgeConfig, JudgeScore, LabelTrust, LabeledScenarioRecord, LabeledScenarioSampleArgs, LabeledScenarioSource, LabeledScenarioStore, LabeledScenarioStoreError, LabeledScenarioWrite, type MemoryCurationDriverOptions, MutableSurface, type OptimizerEntryConfig, type PairedHoldout, type PlaybackContext, type PlaybackDriver, type PlaybackStep, type ProfileDispatchFn, ProfileMatrixError, type ProfileSummary, ProposeContext, type ProposePatchesArgs, type RejectedEdit, RunCampaignOptions, RunImprovementLoopOptions, type RunProfileMatrixOptions, type RunProfileMatrixResult, type RunSkillOptOptions, type RunSkillOptResult, Scenario, type ScenarioRollup, type ScoreboardRow, type SelectDriverCriteria, type SkillOptDriver, type SkillOptDriverOptions, type SkillOptEpochRecord, type SkillOptEvidence, type SkillPatch, type SkillPatchOp, SkillPatchParseError, type SkillPatchRejection, type TraceAnalystDriverOptions, type UserStory, type UserStoryVerdict, type Worktree, type WorktreeAdapter, WorktreeAdapterError, aceDriver, applySkillPatch, buildAnalystSurfaceDispatch, campaignBreakdown, campaignMeanComposite, compareDrivers, detectScale, dimensionRegressions, failureModeRecallJudge, gepaParetoEntry, gepaReflectionEntry, gitWorktreeAdapter, 
haloDriver, heldoutSignificance, makePlaybackDispatch, memoryCurationDriver, pairHoldout, parseSkillPatchResponse, patchEditCount, resolveWorktreePath, runProfileMatrix, runSkillOpt, scoreUserStory, selectDriver, skillOptDriver, skillOptEntry, traceAnalystDriver, userStoryScoreboard };