npm - @tangle-network/agent-eval - Versions diffs - 0.77.0 → 0.80.0 - Mend

@tangle-network/agent-eval 0.77.0 → 0.80.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (90)

package/README.md +50 -19
package/dist/adapters/http.d.ts +2 -2
package/dist/adapters/langchain.d.ts +2 -2
package/dist/adapters/otel.d.ts +4 -4
package/dist/{agent-profile-DYRboYWu.d.ts → agent-profile-aSEaJ9Pl.d.ts} +1 -1
package/dist/analyst/index.d.ts +42 -8
package/dist/analyst/index.js +32 -2
package/dist/analyst/index.js.map +1 -1
package/dist/authenticity/index.d.ts +54 -1
package/dist/authenticity/index.js +88 -1
package/dist/authenticity/index.js.map +1 -1
package/dist/belief-state/index.d.ts +188 -0
package/dist/belief-state/index.js +486 -0
package/dist/belief-state/index.js.map +1 -0
package/dist/benchmarks/index.d.ts +2 -2
package/dist/calibration-Cpr3WaX3.d.ts +101 -0
package/dist/campaign/index.d.ts +11 -11
package/dist/campaign/index.js +4 -4
package/dist/chunk-4DIJWVUT.js +131 -0
package/dist/chunk-4DIJWVUT.js.map +1 -0
package/dist/{chunk-7W4SM7FD.js → chunk-5LVWPNS5.js} +91 -91
package/dist/chunk-5LVWPNS5.js.map +1 -0
package/dist/{chunk-WYIHD6EB.js → chunk-CF67I6QY.js} +1 -1
package/dist/chunk-CF67I6QY.js.map +1 -0
package/dist/{chunk-XPILG2CA.js → chunk-GXHLRXDI.js} +2 -2
package/dist/{chunk-F3SRAAZO.js → chunk-KWRRMR3J.js} +15 -1
package/dist/chunk-KWRRMR3J.js.map +1 -0
package/dist/chunk-NPCTHQIO.js +91 -0
package/dist/chunk-NPCTHQIO.js.map +1 -0
package/dist/{chunk-JYE3WOTE.js → chunk-RPLZ4OIB.js} +10 -1
package/dist/chunk-RPLZ4OIB.js.map +1 -0
package/dist/{chunk-6EKXFFGQ.js → chunk-RTWFUK6A.js} +2 -2
package/dist/{chunk-XGNCBAVZ.js → chunk-XQL22JDG.js} +2 -2
package/dist/{chunk-GJJNJVIR.js → chunk-XXNIODOM.js} +2 -2
package/dist/contract/index.d.ts +128 -15
package/dist/contract/index.js +118 -2
package/dist/contract/index.js.map +1 -1
package/dist/{control-BgA6BYTm.d.ts → control-CehLtoET.d.ts} +1 -1
package/dist/control.d.ts +2 -2
package/dist/control.js +2 -2
package/dist/governance/index.d.ts +1 -1
package/dist/hosted/index.d.ts +4 -4
package/dist/{index-DsnOpCO6.d.ts → index-B1RKber3.d.ts} +1 -1
package/dist/index.d.ts +127 -26
package/dist/index.js +32 -7
package/dist/index.js.map +1 -1
package/dist/{insight-report-Df3lxYXM.d.ts → insight-report-dlpEzQDi.d.ts} +1 -1
package/dist/{kind-factory-DW9XWPvM.d.ts → kind-factory-DqV2t1Xk.d.ts} +1 -1
package/dist/meta-eval/index.d.ts +6 -99
package/dist/meta-eval/index.js +7 -76
package/dist/meta-eval/index.js.map +1 -1
package/dist/off-policy-DiwuKKg7.d.ts +132 -0
package/dist/openapi.json +1 -1
package/dist/{outcome-store-D6KWmYvj.d.ts → outcome-store-rnXLEqSn.d.ts} +1 -1
package/dist/{provenance-B-TFszPW.d.ts → provenance-jG-Gngg8.d.ts} +3 -3
package/dist/{registry-DuVYiTvw.d.ts → registry-BK0Zee01.d.ts} +1 -1
package/dist/{release-report-CN8hJlhk.d.ts → release-report-CXXZlR8g.d.ts} +2 -2
package/dist/reporting.d.ts +5 -5
package/dist/{researcher-C_KJyIGg.d.ts → researcher-rInLj9De.d.ts} +2 -2
package/dist/rl.d.ts +10 -140
package/dist/rl.js +8 -122
package/dist/rl.js.map +1 -1
package/dist/{rubric-predictive-validity-D_4BSXGV.d.ts → rubric-predictive-validity-CLPuwiUw.d.ts} +2 -2
package/dist/{run-improvement-loop-BqYH2vCR.d.ts → run-improvement-loop-BAl_aVOZ.d.ts} +2 -4
package/dist/{run-record-BgTFzO2r.d.ts → run-record-sItO5ftF.d.ts} +11 -0
package/dist/{semantic-concept-judge-CV9Wlx4t.d.ts → semantic-concept-judge-qXEUV2w7.d.ts} +3 -3
package/dist/{summary-report-ByiOUrHj.d.ts → summary-report-BTaXq1TS.d.ts} +1 -1
package/dist/traces.d.ts +1 -1
package/dist/traces.js +2 -2
package/dist/{types-Bba0vl1V.d.ts → types-4mm2msnR.d.ts} +12 -4
package/dist/{types-CRD68aH7.d.ts → types-DRvV0zRo.d.ts} +10 -1
package/dist/workflow/index.d.ts +4 -4
package/dist/workflow/index.js +1 -1
package/docs/auto-research-loop-end-to-end.md +1 -1
package/docs/feature-guide.md +4 -4
package/docs/multi-shot-optimization.md +61 -115
package/docs/product-eval-adoption.md +1 -1
package/docs/research/belief-state-agent-eval-roadmap.md +558 -0
package/docs/research/research-roadmap.md +1 -0
package/docs/three-package-architecture.md +1 -1
package/docs/trace-analysis.md +19 -0
package/package.json +7 -2
package/dist/chunk-7W4SM7FD.js.map +0 -1
package/dist/chunk-F3SRAAZO.js.map +0 -1
package/dist/chunk-JYE3WOTE.js.map +0 -1
package/dist/chunk-WYIHD6EB.js.map +0 -1
package/dist/{chunk-XPILG2CA.js.map → chunk-GXHLRXDI.js.map} +0 -0
package/dist/{chunk-6EKXFFGQ.js.map → chunk-RTWFUK6A.js.map} +0 -0
package/dist/{chunk-XGNCBAVZ.js.map → chunk-XQL22JDG.js.map} +0 -0
package/dist/{chunk-GJJNJVIR.js.map → chunk-XXNIODOM.js.map} +0 -0

package/dist/{rubric-predictive-validity-D_4BSXGV.d.ts → rubric-predictive-validity-CLPuwiUw.d.ts} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { R as RunRecord } from './run-record-BgTFzO2r.js';
-import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
+import { R as RunRecord } from './run-record-sItO5ftF.js';
+import { b as OutcomeStore } from './outcome-store-rnXLEqSn.js';
 /**
  * Rubric predictive validity — does our eval rubric predict deployment

package/dist/{run-improvement-loop-BqYH2vCR.d.ts → run-improvement-loop-BAl_aVOZ.d.ts} RENAMED Viewed

@@ -1,5 +1,5 @@
 import { L as LlmClientOptions } from './llm-client-DbjLfz-K.js';
-import { I as ImprovementDriver, S as Scenario, f as CampaignResult, k as GateResult, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, g as CampaignTraceWriter, m as GenerationRecord, M as MutableSurface, P as ParetoParent, G as Gate } from './types-Bba0vl1V.js';
+import { I as ImprovementDriver, S as Scenario, g as CampaignResult, k as GateResult, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, h as CampaignTraceWriter, m as GenerationRecord, M as MutableSurface, P as ParetoParent, G as Gate } from './types-4mm2msnR.js';
 /**
  * @experimental
@@ -28,9 +28,7 @@ import { I as ImprovementDriver, S as Scenario, f as CampaignResult, k as GateRe
  *
  * The driver is surface-agnostic — any string surface in any consumer opts
  * in by selecting it. Reuses the generic reflection primitive
- * (`buildReflectionPrompt` / `parseReflectionResponse`) and the router
- * client; no dependency on the legacy `runMultiShotOptimization` /
- * `prompt-evolution` orchestration.
+ * (`buildReflectionPrompt` / `parseReflectionResponse`) and the router client.
  *
  * Earns its keep where there is real per-instance signal (which the
  * dimensional + per-scenario evidence + the `LabeledScenarioStore` flywheel

package/dist/{run-record-BgTFzO2r.d.ts → run-record-sItO5ftF.d.ts} RENAMED Viewed

@@ -200,6 +200,17 @@ interface RunOutcome {
      *  these records as input. Optional — single-judge or scalar-only
      *  runs leave it unset. */
     judgeScores?: JudgeScoresRecord;
+    /** Authenticity / realness verdict — did the run build the REAL thing on the
+     *  intended infra, or fake it (see `./authenticity`)? Optional: only domains
+     *  with an authenticity config populate it. Carried in the corpus so the
+     *  flywheel / off-policy learning can optimize for real completion, not gamed
+     *  pass-rate. `score` is 0-1; `gated` is the anti-Goodhart flag — a gated run
+     *  must not count as a real success regardless of `score`. */
+    realness?: {
+        score: number;
+        gated: boolean;
+        reason?: string;
+    };
 }
 /**
  * Mandatory paper-grade fields for a single evaluation run. Optional

package/dist/{semantic-concept-judge-CV9Wlx4t.d.ts → semantic-concept-judge-qXEUV2w7.d.ts} RENAMED Viewed

@@ -1,8 +1,8 @@
 import { AxAIService } from '@ax-llm/ax';
-import { c as TraceAnalystKindSpec } from './kind-factory-DW9XWPvM.js';
-import { b as AnalystRegistryOptions, a as AnalystRegistry } from './registry-DuVYiTvw.js';
+import { c as TraceAnalystKindSpec } from './kind-factory-DqV2t1Xk.js';
+import { b as AnalystRegistryOptions, a as AnalystRegistry } from './registry-BK0Zee01.js';
 import { z } from 'zod';
-import { c as AnalystFinding, A as Analyst, a as AnalystContext } from './types-CRD68aH7.js';
+import { c as AnalystFinding, A as Analyst, a as AnalystContext } from './types-DRvV0zRo.js';
 import { a as TraceAnalystSpan } from './store-GmBE2pZZ.js';
 import { L as LlmClientOptions } from './llm-client-DbjLfz-K.js';
 import { S as Severity } from './multi-layer-verifier-DlWCXuxL.js';

package/dist/{summary-report-ByiOUrHj.d.ts → summary-report-BTaXq1TS.d.ts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { R as RunRecord } from './run-record-BgTFzO2r.js';
+import { R as RunRecord } from './run-record-sItO5ftF.js';
 import { F as FailureClusterReport } from './failure-cluster-CL7IVgkJ.js';
 /**

package/dist/traces.d.ts CHANGED Viewed

@@ -14,7 +14,7 @@ import { A as AnalyzeTracesOptions, b as AnalyzeTracesResult } from './analyst-t
 export { a as AnalyzeTracesInput, c as AnalyzeTracesTurnSnapshot, d as analyzeTraces } from './analyst-t7zZS3TV.js';
 import { h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, T as TraceAnalysisStore, g as TraceAnalystFilters, b as DatasetOverview, Q as QueryTracesPage, l as ViewTraceResult, V as ViewSpansResult, c as SearchTraceResult, S as SearchSpanResult } from './store-GmBE2pZZ.js';
 export { D as DEFAULT_TRACE_ANALYST_BUDGETS, d as SpanMatchRecord, e as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, f as TraceAnalystByteBudgets, a as TraceAnalystSpan, j as TraceAnalystTraceSummary, k as ViewTraceOversized } from './store-GmBE2pZZ.js';
-import { b as RunSplitTag, a as RunTokenUsage, R as RunRecord } from './run-record-BgTFzO2r.js';
+import { b as RunSplitTag, a as RunTokenUsage, R as RunRecord } from './run-record-sItO5ftF.js';
 import { AxFunction } from '@ax-llm/ax';
 /**

package/dist/traces.js CHANGED Viewed

@@ -25,7 +25,7 @@ import {
   scoreTraceInsightReadiness,
   tokenizeDomainWords,
   traceAnalystOnRunComplete
-} from "./chunk-XGNCBAVZ.js";
+} from "./chunk-XQL22JDG.js";
 import {
   DEFAULT_REDACTION_RULES,
   REDACTION_VERSION,
@@ -86,7 +86,7 @@ import {
   defaultProviderRedactor,
   providerFromBaseUrl
 } from "./chunk-PC4UYEBM.js";
-import "./chunk-F3SRAAZO.js";
+import "./chunk-KWRRMR3J.js";
 import {
   TraceEmitter,
   llmSpanFromProvider

package/dist/{types-Bba0vl1V.d.ts → types-4mm2msnR.d.ts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { a as RunTokenUsage } from './run-record-BgTFzO2r.js';
+import { a as RunTokenUsage } from './run-record-sItO5ftF.js';
 /**
  * @experimental
@@ -163,8 +163,8 @@ interface ParetoParent {
 }
 /** @experimental Stateless surface mutation — given findings + current
  *  surface, return N candidate surfaces. Pure transform, no generation
- *  awareness. Reflective-mutation, `runMultiShotOptimization`, `AxGEPA`
- *  conform. Wrapped by `evolutionaryDriver` to become an `ImprovementDriver`. */
+ *  awareness. Reflective-mutation and `AxGEPA` mutators conform. Wrapped by
+ *  `evolutionaryDriver` to become an `ImprovementDriver`. */
 interface Mutator<TFindings = unknown> {
     kind: string;
     mutate(args: {
@@ -206,6 +206,14 @@ interface ProposeContext<TFindings = unknown> {
      *  scenarios) into a merged candidate. Drivers doing pure single-parent
      *  reflection may ignore it. See {@link ParetoParent}. */
     paretoParents?: ParetoParent[];
+    /** FIREWALL (non-negotiable): the held-out judge is write-only — its verdicts
+     *  score the chosen output and gate promotion, and are NEVER an input to
+     *  proposal/steering (else the optimizer games the acceptance axis = an
+     *  oracle). This `never`-typed field makes that a compile-time tripwire: a
+     *  driver that tries to thread judge verdicts into the proposal will not type.
+     *  Steering may consume TRACE-OBSERVABLE signals (what the agent did) via
+     *  `findings`/`report`; it may NOT consume the judge's held-out verdict. */
+    judgeScores?: never;
 }
 /** @experimental A surface-improvement strategy — the DRIVER of the
  *  improvement loop. Given the current best surface, the history of what's
@@ -489,4 +497,4 @@ interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scena
     scenarios: Array<Pick<TScenario, 'id' | 'kind'>>;
 }
-export { isProposedCandidate as A, labelTrustRank as B, type CampaignAggregates as C, type DispatchFn as D, type Gate as G, type ImprovementDriver as I, type JudgeScore as J, type LabeledScenarioStore as L, type MutableSurface as M, type OptimizerConfig as O, type ParetoParent as P, type RedactionStatus as R, type Scenario as S, type TraceSpan as T, type JudgeConfig as a, type DispatchContext as b, type CampaignArtifactWriter as c, type CampaignCellResult as d, type CampaignCostMeter as e, type CampaignResult as f, type CampaignTraceWriter as g, type CodeSurface as h, type GateContext as i, type GateDecision as j, type GateResult as k, type GenerationCandidate as l, type GenerationRecord as m, type JudgeDimension as n, type Mutator as o, type SessionScript as p, type ProposeContext as q, type LabeledScenarioWrite as r, type LabeledScenarioSampleArgs as s, type LabeledScenarioRecord as t, type LabelTrust as u, type LabeledScenarioSource as v, type CampaignTokenUsage as w, type JudgeAggregate as x, type ProposedCandidate as y, type ScenarioAggregate as z };
+export { isProposedCandidate as A, labelTrustRank as B, type CampaignAggregates as C, type DispatchFn as D, type Gate as G, type ImprovementDriver as I, type JudgeScore as J, type LabeledScenarioStore as L, type MutableSurface as M, type OptimizerConfig as O, type ParetoParent as P, type RedactionStatus as R, type Scenario as S, type TraceSpan as T, type JudgeConfig as a, type DispatchContext as b, type GateDecision as c, type CampaignArtifactWriter as d, type CampaignCellResult as e, type CampaignCostMeter as f, type CampaignResult as g, type CampaignTraceWriter as h, type CodeSurface as i, type GateContext as j, type GateResult as k, type GenerationCandidate as l, type GenerationRecord as m, type JudgeDimension as n, type Mutator as o, type SessionScript as p, type ProposeContext as q, type LabeledScenarioWrite as r, type LabeledScenarioSampleArgs as s, type LabeledScenarioRecord as t, type LabelTrust as u, type LabeledScenarioSource as v, type CampaignTokenUsage as w, type JudgeAggregate as x, type ProposedCandidate as y, type ScenarioAggregate as z };

package/dist/{types-CRD68aH7.d.ts → types-DRvV0zRo.d.ts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { R as RunRecord } from './run-record-BgTFzO2r.js';
+import { R as RunRecord } from './run-record-sItO5ftF.js';
 import { T as TraceAnalysisStore } from './store-GmBE2pZZ.js';
 import { a as JudgeInput } from './types-Croy5h7V.js';
 import { b as LlmCallRequest, c as LlmCallResult } from './llm-client-DbjLfz-K.js';
@@ -146,6 +146,15 @@ interface AnalystFinding {
      * diff cleanly across runs.
      */
     subject?: string;
+    /** FIREWALL provenance (docs/learning-flywheel.md): true iff this finding was
+     *  lifted from a JUDGE verdict (an acceptance score), not OBSERVED from the
+     *  agent's behavior. A judge-derived finding must NEVER be admitted as a
+     *  steering input — that is the held-out judge leaking into the loop. Set at
+     *  the lift site (createJudgeAdapter); checked by `assertNoJudgeVerdict`.
+     *  Provenance, not evidence presence, is the correct discriminator: an
+     *  evidence-less trace-analyst observation legitimately steers, while a judge
+     *  verdict that happens to cite an artifact must not. */
+    derived_from_judge?: boolean;
     /** Analyst-private extras; renderers ignore unless they know the analyst. */
     metadata?: Record<string, unknown>;
 }

package/dist/workflow/index.d.ts CHANGED Viewed

@@ -1,7 +1,7 @@
 import { W as WorkflowTopology } from '../harness-optimizer-EnEnQPsr.js';
-import { b as RunSplitTag, a as RunTokenUsage, R as RunRecord } from '../run-record-BgTFzO2r.js';
-import { c as AnalystFinding, h as AnalystSeverity, E as EvidenceRef } from '../types-CRD68aH7.js';
-import { F as FailureClusterInsight } from '../insight-report-Df3lxYXM.js';
+import { b as RunSplitTag, a as RunTokenUsage, R as RunRecord } from '../run-record-sItO5ftF.js';
+import { c as AnalystFinding, h as AnalystSeverity, E as EvidenceRef } from '../types-DRvV0zRo.js';
+import { F as FailureClusterInsight } from '../insight-report-dlpEzQDi.js';
 import { a as VerificationReport, L as LayerResult } from '../multi-layer-verifier-DlWCXuxL.js';
 import { F as FailureClusterReport } from '../failure-cluster-CL7IVgkJ.js';
 import { R as RedactionRule, a as RedactionReport } from '../redact-B40YG2M_.js';
@@ -18,7 +18,7 @@ import '../types-Croy5h7V.js';
 import '@tangle-network/tcloud';
 import '../llm-client-DbjLfz-K.js';
 import '../raw-provider-sink-C46HDghv.js';
-import '../summary-report-ByiOUrHj.js';
+import '../summary-report-BTaXq1TS.js';
 import '../judge-calibration-DilmB3Ml.js';
 import '../control-runtime-DuFBYg7A.js';
 import '../emitter-DEZwY14K.js';

package/dist/workflow/index.js CHANGED Viewed

@@ -7,7 +7,7 @@ import {
 } from "../chunk-GGE4NNQT.js";
 import {
   validateRunRecord
-} from "../chunk-F3SRAAZO.js";
+} from "../chunk-KWRRMR3J.js";
 import "../chunk-VSMTAMNK.js";
 import {
   ValidationError

package/docs/auto-research-loop-end-to-end.md CHANGED Viewed

@@ -152,7 +152,7 @@ async function runAutoResearchLoop(opts: {
 Two cases:
 1. **Trajectory-shaped optimization with steering.** Use
-   `runMultiShotOptimization` directly — it already runs the inner
+   `runImprovementLoop` directly — it already runs the inner
    search-vs-holdout loop. Wrap with `analyzeOptimizationResult` after
    for the RL bridge.

package/docs/feature-guide.md CHANGED Viewed

@@ -33,8 +33,8 @@ trying, and whether a change made them better or worse.
 | “Human feedback should become reusable eval data.” | `FeedbackTrajectory` | Captures approvals, rejections, edits, choices, metrics, and policy blocks. |
 | “Can this action run, or does it need approval?” | `evaluateActionPolicy` | Generic preflight for side effects, budgets, and required evidence. |
 | “I need train/dev/test/holdout examples.” | `Dataset` plus feedback trajectory conversion | Stable splits and contamination control. |
-| “Which prompt or signature wins?” | `runMultiShotOptimization`, steering optimizers | Runs variants on scenarios and compares scores. |
-| “Improve a multi-turn agent over real task traces.” | `runMultiShotOptimization` | GEPA-style trajectory optimization with ASI and held-out promotion. |
+| “Which prompt or signature wins?” | `runImprovementLoop`, steering optimizers | Runs variants on scenarios and compares scores. |
+| “Improve a multi-turn agent over real task traces.” | `runImprovementLoop` | GEPA-style trajectory optimization with ASI and held-out promotion. |
 | “Improve prompts, then code if prompts plateau.” | `runPromptEvolution`, composite mutator, code mutator | Bounded evolution with telemetry and lineage. |
 | “Find why a regression happened.” | bisector, traces, run records | Narrows changes and preserves evidence. |
 | “Expose evals to another language.” | Wire protocol and Python client | HTTP/RPC boundary for non-TypeScript apps. |
@@ -105,7 +105,7 @@ generated code -> build/test/runtime gates -> score -> ship or revise
 Use when you want Ax/GEPA-style improvement.
-1. For variable-length agent tasks, use `runMultiShotOptimization`.
+1. For variable-length agent tasks, use `runImprovementLoop`.
 2. Build search/dev/test/holdout splits from the real product loop.
 3. Score full trajectories, not just final text.
 4. Emit actionable side information for failures the mutator can fix.
@@ -156,7 +156,7 @@ Store as `FeedbackTrajectory`, then derive:
 | Feedback data | `FeedbackTrajectory`, stores, converters | Human/environment labels | Domain adapters live in downstream repos. |
 | Action policy | `evaluateActionPolicy` | Approval/budget preflight | Blocks or labels actions before `act()`. |
 | Datasets | `Dataset`, holdout tools, canaries | Train/dev/test/holdout corpora | Keeps optimization honest. |
-| Optimization | `runMultiShotOptimization`, steering optimizers | Prompt/signature comparison | Use held-out gates before promotion. |
+| Optimization | `runImprovementLoop`, steering optimizers | Prompt/signature comparison | Use held-out gates before promotion. |
 | Evolution | prompt/code mutators, sandbox pool, telemetry | Autoresearch and mutation loops | Use budgets and lineage; do not run unbounded. |
 | Telemetry | `TraceStore`, OTLP, file sinks | Audit and replay | Treat traces as evidence, not just logs. |
 | Reporting | summaries, pareto, cost tracker | Decision support | Useful for PRs, launch gates, research notes. |

package/docs/multi-shot-optimization.md CHANGED Viewed

@@ -1,129 +1,75 @@
 # Multi-Shot Optimization
-`runMultiShotOptimization` is the public adapter for GEPA-style optimization over
-variable-length agent conversations.
+> **Renamed.** `runMultiShotOptimization` was retired. The live API is
+> `runImprovementLoop` (driver-agnostic, gated promotion) driven by `gepaDriver`,
+> with `compareDrivers` for head-to-head driver lift. This doc was rewritten to the
+> live API; see also [feature-guide.md](./feature-guide.md) and [concepts.md](./concepts.md).
-Use it when the thing you want to improve is not a single model call. Typical
-targets are agent system prompts, tool descriptions, routing policies, retrieval
-plans, or app-specific scaffolding that affects an entire task trajectory.
+`runImprovementLoop` is the public entry for GEPA-style optimization over a whole
+task trajectory — the thing you improve is not a single model call but an agent
+system prompt, tool descriptions, a routing policy, or any scaffolding that affects
+the entire run. It is the OUTER loop: it improves the SURFACE the inner workers run.
-The primitive is intentionally small. Your app owns the domain logic:
+## The shape
-- `seedVariants`: prompt/config/tool-policy candidates
-- `runner`: executes one complete task trajectory for one variant
-- `scorer`: scores the trajectory and emits actionable side information
-- `mutateAdapter`: proposes new variants from top and bottom trials
+You own a few seams; the loop owns the release-critical glue (paired seeds, the
+held-out re-score, the promotion gate, provenance):
-`agent-eval` owns the release-critical glue:
+- **`baselineSurface`** — the current surface (a prompt string, or a `CodeSurface`).
+- **`dispatchWithSurface(surface, scenario, ctx)`** — run one task to completion
+  under a candidate surface; return the artifact the judges score.
+- **`judges`** — score the artifact (`{ composite, dimensions }`).
+- **`driver`** — proposes candidate surfaces each generation: `gepaDriver`
+  (reflective + Pareto frontier) or `evolutionaryDriver` (mutator).
+- **`gate`** — `defaultProductionGate` (held-out significance + red-team +
+  reward-hacking + canary). Ships ONLY on a CI-lower-bound held-out lift.
-- stable paired seeds
-- search-split prompt evolution
-- cost/score Pareto objectives
-- failed-run conversion into failed trials
-- ASI projection into reflection traces and numeric metrics
-- optional paired holdout gating through `HeldOutGate`
-- validated `RunRecord` rows for promotion evidence
-## Result Contract
-The return shape separates discovery from promotion:
-- `searchBestVariant`: best variant on the optimizer-visible search scenarios
-- `searchBestAggregate`: aggregate for that search winner
-- `promotedVariant`: variant callers should ship
-- `promotedAggregate`: aggregate for the promoted variant
-- `gate`: holdout decision and evidence, or `null` when no gate ran
-If a holdout gate is configured and rejects the search winner,
-`promotedVariant` is the baseline. Do not ship `searchBestVariant` directly
-unless you intentionally run without a holdout gate.
-## Actionable Side Information
-The scorer should return `asi` rows for concrete failure modes:
-```ts
-{
-  expectationId: 'used-primary-sources',
-  message: 'The final answer cited secondary summaries instead of primary sources.',
-  severity: 'error',
-  responsibleSurface: 'retrieval-policy',
-  suggestion: 'Prefer primary-source domains during source-gathering turns.',
-}
-```
-Standard knowledge-related responsible surfaces are:
-- `knowledge-requirements`
-- `data-acquisition`
-- `retrieval-policy`
-- `user-question-policy`
-These rows become:
-- reflection expectations via `trialTraceFromMultiShotTrial`
-- aggregate metrics like `asi.error` and `surface.retrieval-policy`
-- trace evidence available to downstream reports
-This is the main reason to use this primitive instead of reducing each run to a
-single scalar reward.
-## Holdout Discipline
-For release gates, configure `gate`. The first seed variant is the baseline and
-`gate.gate.baselineKey` must match its id.
-Holdout scenarios must be disjoint from `searchScenarioIds`. The adapter runs
-baseline and candidate with the same `(scenarioId, rep)` seed, validates every
-row with `validateRunRecord`, then asks `HeldOutGate` whether to promote.
-When `gate.searchScenarioIds` is omitted, the adapter reuses
-`searchScenarioIds` for the overfit-gap check.
-## Minimal Shape
+## Minimal example
 ```ts
 import {
-  runMultiShotOptimization,
-  trialTraceFromMultiShotTrial,
-  type MultiShotVariant,
-} from '@tangle-network/agent-eval'
-type Payload = { systemPrompt: string }
-const baseline: MultiShotVariant<Payload> = {
-  id: 'baseline',
-  label: 'baseline',
-  generation: 0,
-  payload: { systemPrompt: currentPrompt },
-}
-const result = await runMultiShotOptimization<Payload>({
-  runId: `research-agent-${Date.now()}`,
-  target: 'research-agent-system-prompt',
-  seedVariants: [baseline],
-  searchScenarioIds: searchScenarios.map((s) => s.id),
-  reps: 2,
-  generations: 4,
+  runImprovementLoop,
+  gepaDriver,
+  defaultProductionGate,
+} from '@tangle-network/agent-eval/contract'
+const result = await runImprovementLoop({
+  baselineSurface: currentSystemPrompt,
+  scenarios: trainScenarios, // optimizer-visible
+  holdoutScenarios, // DISJOINT — only the gate sees these
+  dispatchWithSurface: async (surface, scenario) =>
+    runYourAgentToCompletion({ scenario, prompt: String(surface) }),
+  judges: [myJudge],
+  driver: gepaDriver({
+    llm: { apiKey, baseUrl },
+    model: 'gpt-5',
+    target: 'enforce a strict output schema',
+  }),
   populationSize: 4,
-  scoreConcurrency: 4,
-  runner: {
-    async run({ variant, scenarioId, seed }) {
-      return runYourAgentToCompletion({ scenarioId, seed, prompt: variant.payload.systemPrompt })
-    },
-  },
-  scorer: {
-    async score({ run }) {
-      return scoreFullTrajectory(run.trace)
-    },
-  },
-  mutateAdapter: {
-    async mutate({ parent, bottomTrials, childCount, generation }) {
-      const traces = bottomTrials.map((t) => trialTraceFromMultiShotTrial(t))
-      return proposePromptMutations({ parent, traces, childCount, generation })
-    },
-  },
+  maxGenerations: 4,
+  gate: defaultProductionGate({ holdoutScenarios, deltaThreshold: 0 }),
+  autoOnPromote: 'none', // or 'pr' (+ ghOwner/ghRepo) to open a PR on ship
+  runDir,
 })
-deploy(result.promotedVariant.payload)
+if (result.gateResult.decision === 'ship') {
+  deploy(result.winnerSurface) // the driver's proposal, gated on a real held-out lift
+}
 ```
+## Discipline (what makes it trustworthy)
+- **Holdout is disjoint + gated.** `holdoutScenarios` must not overlap the training
+  pool. The gate re-scores baseline vs winner on the holdout and ships only when the
+  paired-bootstrap CI lower bound clears `deltaThreshold`; a few-instance swing at
+  thin `n` is held (`few_runs`), not promoted.
+- **No-op never ships.** If no candidate beats the baseline, the winner IS the
+  baseline (empty diff) and the loop forces `hold` — it does not score
+  baseline-vs-itself and read model noise as lift.
+- **Provenance falls out.** `result.promotedDiff` + `emitLoopProvenance` give the
+  auditable candidate→gate→promote chain (rationale, content hashes, a held-out lift
+  recomputable from the emitted record).
+Reach for `compareDrivers` when the question is "which DRIVER wins" rather than
+"improve this surface", and see `tests/campaign/presets.test.ts` for the executable
+contract (no-op guard, fail-loud holdout, gate promotion).

package/docs/product-eval-adoption.md CHANGED Viewed

@@ -152,7 +152,7 @@ set with a signed note.
 ## Optimization
-Use `runMultiShotOptimization()` when the system is a multi-step agent, not a
+Use `runImprovementLoop()` when the system is a multi-step agent, not a
 single prompt.
 Good optimization targets: