npm - @tangle-network/agent-eval - Versions diffs - 0.77.0 → 0.79.0 - Mend

@tangle-network/agent-eval 0.77.0 → 0.79.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (72)

package/dist/adapters/http.d.ts +2 -2
package/dist/adapters/langchain.d.ts +2 -2
package/dist/adapters/otel.d.ts +4 -4
package/dist/{agent-profile-DYRboYWu.d.ts → agent-profile-aSEaJ9Pl.d.ts} +1 -1
package/dist/analyst/index.d.ts +42 -8
package/dist/analyst/index.js +32 -2
package/dist/analyst/index.js.map +1 -1
package/dist/authenticity/index.d.ts +54 -1
package/dist/authenticity/index.js +88 -1
package/dist/authenticity/index.js.map +1 -1
package/dist/benchmarks/index.d.ts +2 -2
package/dist/campaign/index.d.ts +11 -11
package/dist/campaign/index.js +4 -4
package/dist/{chunk-7W4SM7FD.js → chunk-5LVWPNS5.js} +91 -91
package/dist/chunk-5LVWPNS5.js.map +1 -0
package/dist/{chunk-WYIHD6EB.js → chunk-CF67I6QY.js} +1 -1
package/dist/chunk-CF67I6QY.js.map +1 -0
package/dist/{chunk-XPILG2CA.js → chunk-GXHLRXDI.js} +2 -2
package/dist/{chunk-F3SRAAZO.js → chunk-KWRRMR3J.js} +15 -1
package/dist/chunk-KWRRMR3J.js.map +1 -0
package/dist/{chunk-JYE3WOTE.js → chunk-RPLZ4OIB.js} +10 -1
package/dist/chunk-RPLZ4OIB.js.map +1 -0
package/dist/{chunk-6EKXFFGQ.js → chunk-RTWFUK6A.js} +2 -2
package/dist/{chunk-XGNCBAVZ.js → chunk-XQL22JDG.js} +2 -2
package/dist/{chunk-GJJNJVIR.js → chunk-XXNIODOM.js} +2 -2
package/dist/contract/index.d.ts +12 -12
package/dist/contract/index.js +2 -2
package/dist/{control-BgA6BYTm.d.ts → control-CehLtoET.d.ts} +1 -1
package/dist/control.d.ts +2 -2
package/dist/control.js +2 -2
package/dist/hosted/index.d.ts +4 -4
package/dist/{index-DsnOpCO6.d.ts → index-B1RKber3.d.ts} +1 -1
package/dist/index.d.ts +126 -25
package/dist/index.js +32 -7
package/dist/index.js.map +1 -1
package/dist/{insight-report-Df3lxYXM.d.ts → insight-report-dlpEzQDi.d.ts} +1 -1
package/dist/{kind-factory-DW9XWPvM.d.ts → kind-factory-DqV2t1Xk.d.ts} +1 -1
package/dist/meta-eval/index.d.ts +2 -2
package/dist/openapi.json +1 -1
package/dist/{provenance-B-TFszPW.d.ts → provenance-CEAJI9rm.d.ts} +3 -3
package/dist/{registry-DuVYiTvw.d.ts → registry-BmEuU94S.d.ts} +2 -2
package/dist/{release-report-CN8hJlhk.d.ts → release-report-CXXZlR8g.d.ts} +2 -2
package/dist/reporting.d.ts +4 -4
package/dist/{researcher-C_KJyIGg.d.ts → researcher-rInLj9De.d.ts} +2 -2
package/dist/rl.d.ts +6 -6
package/dist/rl.js +2 -2
package/dist/{rubric-predictive-validity-D_4BSXGV.d.ts → rubric-predictive-validity-CWyWWLBg.d.ts} +1 -1
package/dist/{run-improvement-loop-BqYH2vCR.d.ts → run-improvement-loop-Bgu4C59E.d.ts} +2 -4
package/dist/{run-record-BgTFzO2r.d.ts → run-record-sItO5ftF.d.ts} +11 -0
package/dist/{semantic-concept-judge-CV9Wlx4t.d.ts → semantic-concept-judge-Du4ZVyef.d.ts} +3 -3
package/dist/{summary-report-ByiOUrHj.d.ts → summary-report-BTaXq1TS.d.ts} +1 -1
package/dist/traces.d.ts +1 -1
package/dist/traces.js +2 -2
package/dist/{types-CRD68aH7.d.ts → types-DRvV0zRo.d.ts} +10 -1
package/dist/{types-Bba0vl1V.d.ts → types-QHG0KnkF.d.ts} +11 -3
package/dist/workflow/index.d.ts +4 -4
package/dist/workflow/index.js +1 -1
package/docs/auto-research-loop-end-to-end.md +1 -1
package/docs/feature-guide.md +4 -4
package/docs/multi-shot-optimization.md +61 -115
package/docs/product-eval-adoption.md +1 -1
package/docs/three-package-architecture.md +1 -1
package/docs/trace-analysis.md +19 -0
package/package.json +1 -1
package/dist/chunk-7W4SM7FD.js.map +0 -1
package/dist/chunk-F3SRAAZO.js.map +0 -1
package/dist/chunk-JYE3WOTE.js.map +0 -1
package/dist/chunk-WYIHD6EB.js.map +0 -1
package/dist/{chunk-XPILG2CA.js.map → chunk-GXHLRXDI.js.map} +0 -0
package/dist/{chunk-6EKXFFGQ.js.map → chunk-RTWFUK6A.js.map} +0 -0
package/dist/{chunk-XGNCBAVZ.js.map → chunk-XQL22JDG.js.map} +0 -0
package/dist/{chunk-GJJNJVIR.js.map → chunk-XXNIODOM.js.map} +0 -0

package/dist/{insight-report-Df3lxYXM.d.ts → insight-report-dlpEzQDi.d.ts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { G as GainDistributionBin, P as ParetoFigureSpec } from './summary-report-ByiOUrHj.js';
+import { G as GainDistributionBin, P as ParetoFigureSpec } from './summary-report-BTaXq1TS.js';
 import { a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
 /**

package/dist/{kind-factory-DW9XWPvM.d.ts → kind-factory-DqV2t1Xk.d.ts} RENAMED Viewed

@@ -1,7 +1,7 @@
 import { AxAIService, AxFunction } from '@ax-llm/ax';
 import { T as TraceAnalysisStore } from './store-GmBE2pZZ.js';
 import { z } from 'zod';
-import { g as AnalystCost, a as AnalystContext, A as Analyst } from './types-CRD68aH7.js';
+import { g as AnalystCost, a as AnalystContext, A as Analyst } from './types-DRvV0zRo.js';
 /**
  * Typed Ax output for analyst findings.

package/dist/meta-eval/index.d.ts CHANGED Viewed

@@ -2,8 +2,8 @@ import { T as TraceStore } from '../store-CKUAgsJz.js';
 import { R as Run } from '../schema-m0gsnbt3.js';
 import { a as OutcomeFilter, O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
 export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from '../outcome-store-D6KWmYvj.js';
-export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-D_4BSXGV.js';
-import '../run-record-BgTFzO2r.js';
+export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-CWyWWLBg.js';
+import '../run-record-sItO5ftF.js';
 import '../errors-Dwqw-T_m.js';
 /**

package/dist/openapi.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "openapi": "3.1.0",
   "info": {
     "title": "@tangle-network/agent-eval — wire protocol",
-    "version": "0.77.0",
+    "version": "0.79.0",
     "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
     "contact": {
       "name": "Tangle Network",

package/dist/{provenance-B-TFszPW.d.ts → provenance-CEAJI9rm.d.ts} RENAMED Viewed

@@ -1,9 +1,9 @@
-import { o as Mutator, I as ImprovementDriver, S as Scenario, G as Gate, k as GateResult, i as GateContext, f as CampaignResult, M as MutableSurface, j as GateDecision } from './types-Bba0vl1V.js';
+import { o as Mutator, I as ImprovementDriver, S as Scenario, G as Gate, k as GateResult, i as GateContext, f as CampaignResult, M as MutableSurface, j as GateDecision } from './types-QHG0KnkF.js';
 import { R as RedTeamCase } from './red-team-DW9Ca_tj.js';
-import { R as RunRecord } from './run-record-BgTFzO2r.js';
+import { R as RunRecord } from './run-record-sItO5ftF.js';
 import { D as Direction } from './pareto-E-pembql.js';
 import { a as PairedBootstrapResult } from './statistics-B7yCbi9i.js';
-import { a as RunCampaignOptions, C as CampaignStorage } from './run-improvement-loop-BqYH2vCR.js';
+import { a as RunCampaignOptions, C as CampaignStorage } from './run-improvement-loop-Bgu4C59E.js';
 import { HostedClient, TraceSpanEvent } from './hosted/index.js';
 /**

package/dist/{registry-DuVYiTvw.d.ts → registry-BmEuU94S.d.ts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { A as Analyst, a as AnalystContext, b as AnalystRunSummary, c as AnalystFinding, d as AnalystRunResult, C as ChatClient, e as AnalystRunInputs, f as AnalystRunEvent } from './types-CRD68aH7.js';
+import { A as Analyst, a as AnalystContext, b as AnalystRunSummary, c as AnalystFinding, d as AnalystRunResult, C as ChatClient, e as AnalystRunInputs, f as AnalystRunEvent } from './types-DRvV0zRo.js';
 /**
  * AnalystRegistry — orchestrate N analysts against one run.
@@ -125,4 +125,4 @@ declare class AnalystRegistry {
     private routeInput;
 }
-export { type AnalystHooks as A, type BudgetPolicy as B, type RegistryRunOpts as R, AnalystRegistry as a, type AnalystRegistryOptions as b };
+export { AnalystRegistry as A, type BudgetPolicy as B, type RegistryRunOpts as R, type AnalystHooks as a, type AnalystRegistryOptions as b };

package/dist/{release-report-CN8hJlhk.d.ts → release-report-CXXZlR8g.d.ts} RENAMED Viewed

@@ -1,6 +1,6 @@
 import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-B2kL-fSM.js';
-import { m as GateDecision } from './summary-report-ByiOUrHj.js';
-import { R as RunRecord, b as RunSplitTag } from './run-record-BgTFzO2r.js';
+import { m as GateDecision } from './summary-report-BTaXq1TS.js';
+import { R as RunRecord, b as RunSplitTag } from './run-record-sItO5ftF.js';
 /**
  * Release confidence gate.

package/dist/reporting.d.ts CHANGED Viewed

@@ -1,9 +1,9 @@
-export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-D_4BSXGV.js';
-export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, R as ReleaseConfidenceAxis, b as ReleaseConfidenceAxisName, c as ReleaseConfidenceInput, d as ReleaseConfidenceIssue, e as ReleaseConfidenceMetrics, f as ReleaseConfidenceScorecard, g as ReleaseConfidenceStatus, h as ReleaseConfidenceThresholds, i as ReleaseTraceEvidence, j as RenderReleaseReportOptions, V as Verdict, k as assertReleaseConfidence, l as bootstrapCi, m as evaluateReleaseConfidence, n as judgeReplayGate, r as renderReleaseReport } from './release-report-CN8hJlhk.js';
+export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-CWyWWLBg.js';
+export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, R as ReleaseConfidenceAxis, b as ReleaseConfidenceAxisName, c as ReleaseConfidenceInput, d as ReleaseConfidenceIssue, e as ReleaseConfidenceMetrics, f as ReleaseConfidenceScorecard, g as ReleaseConfidenceStatus, h as ReleaseConfidenceThresholds, i as ReleaseTraceEvidence, j as RenderReleaseReportOptions, V as Verdict, k as assertReleaseConfidence, l as bootstrapCi, m as evaluateReleaseConfidence, n as judgeReplayGate, r as renderReleaseReport } from './release-report-CXXZlR8g.js';
 export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
 export { P as PairedBootstrapOptions, a as PairedBootstrapResult, b as benjaminiHochberg, p as pairedBootstrap, w as wilcoxonSignedRank } from './statistics-B7yCbi9i.js';
-export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-ByiOUrHj.js';
-import './run-record-BgTFzO2r.js';
+export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-BTaXq1TS.js';
+import './run-record-sItO5ftF.js';
 import './errors-Dwqw-T_m.js';
 import './schema-m0gsnbt3.js';
 import './outcome-store-D6KWmYvj.js';

package/dist/{researcher-C_KJyIGg.d.ts → researcher-rInLj9De.d.ts} RENAMED Viewed

@@ -1,6 +1,6 @@
-import { b as RunSplitTag, a as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-BgTFzO2r.js';
+import { b as RunSplitTag, a as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-sItO5ftF.js';
 import { L as LlmClientOptions, a as LlmRouteRequirements } from './llm-client-DbjLfz-K.js';
-import { h as ResearchReportOptions, d as ResearchReport, m as GateDecision } from './summary-report-ByiOUrHj.js';
+import { h as ResearchReportOptions, d as ResearchReport, m as GateDecision } from './summary-report-BTaXq1TS.js';
 import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DEZwY14K.js';
 import { R as RunIntegrityExpectations, a as RunIntegrityReport } from './integrity-CJzrpUua.js';
 import { R as RawProviderSink } from './raw-provider-sink-C46HDghv.js';

package/dist/rl.d.ts CHANGED Viewed

@@ -1,18 +1,18 @@
-import { R as RunRecord, b as RunSplitTag } from './run-record-BgTFzO2r.js';
-import { f as CampaignResult } from './types-Bba0vl1V.js';
+import { R as RunRecord, b as RunSplitTag } from './run-record-sItO5ftF.js';
+import { f as CampaignResult } from './types-QHG0KnkF.js';
 import { a as VerificationReport } from './multi-layer-verifier-DlWCXuxL.js';
 import { S as Span } from './schema-m0gsnbt3.js';
 import { T as TraceStore } from './store-CKUAgsJz.js';
 import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
 export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from './outcome-store-D6KWmYvj.js';
-import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-D_4BSXGV.js';
-import { R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-C_KJyIGg.js';
-export { r as runEvalCampaign } from './researcher-C_KJyIGg.js';
+import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-CWyWWLBg.js';
+import { R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-rInLj9De.js';
+export { r as runEvalCampaign } from './researcher-rInLj9De.js';
 import { I as InterimReleaseConfidence } from './sequential-5iSVfzl2.js';
 import './errors-Dwqw-T_m.js';
 import './llm-client-DbjLfz-K.js';
 import './raw-provider-sink-C46HDghv.js';
-import './summary-report-ByiOUrHj.js';
+import './summary-report-BTaXq1TS.js';
 import './failure-cluster-CL7IVgkJ.js';
 import './emitter-DEZwY14K.js';
 import './integrity-CJzrpUua.js';

package/dist/rl.js CHANGED Viewed

@@ -10,7 +10,7 @@ import {
 } from "./chunk-3RF76KTD.js";
 import {
   runEvalCampaign
-} from "./chunk-GJJNJVIR.js";
+} from "./chunk-XXNIODOM.js";
 import "./chunk-IHDHUN2X.js";
 import {
   rubricPredictiveValidity
@@ -25,7 +25,7 @@ import {
 } from "./chunk-ITBRCT73.js";
 import "./chunk-SBCB6VZY.js";
 import "./chunk-PC4UYEBM.js";
-import "./chunk-F3SRAAZO.js";
+import "./chunk-KWRRMR3J.js";
 import "./chunk-TVVP3ZZQ.js";
 import "./chunk-VSMTAMNK.js";
 import {

package/dist/{rubric-predictive-validity-D_4BSXGV.d.ts → rubric-predictive-validity-CWyWWLBg.d.ts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { R as RunRecord } from './run-record-BgTFzO2r.js';
+import { R as RunRecord } from './run-record-sItO5ftF.js';
 import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
 /**

package/dist/{run-improvement-loop-BqYH2vCR.d.ts → run-improvement-loop-Bgu4C59E.d.ts} RENAMED Viewed

@@ -1,5 +1,5 @@
 import { L as LlmClientOptions } from './llm-client-DbjLfz-K.js';
-import { I as ImprovementDriver, S as Scenario, f as CampaignResult, k as GateResult, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, g as CampaignTraceWriter, m as GenerationRecord, M as MutableSurface, P as ParetoParent, G as Gate } from './types-Bba0vl1V.js';
+import { I as ImprovementDriver, S as Scenario, f as CampaignResult, k as GateResult, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, g as CampaignTraceWriter, m as GenerationRecord, M as MutableSurface, P as ParetoParent, G as Gate } from './types-QHG0KnkF.js';
 /**
  * @experimental
@@ -28,9 +28,7 @@ import { I as ImprovementDriver, S as Scenario, f as CampaignResult, k as GateRe
  *
  * The driver is surface-agnostic — any string surface in any consumer opts
  * in by selecting it. Reuses the generic reflection primitive
- * (`buildReflectionPrompt` / `parseReflectionResponse`) and the router
- * client; no dependency on the legacy `runMultiShotOptimization` /
- * `prompt-evolution` orchestration.
+ * (`buildReflectionPrompt` / `parseReflectionResponse`) and the router client.
  *
  * Earns its keep where there is real per-instance signal (which the
  * dimensional + per-scenario evidence + the `LabeledScenarioStore` flywheel

package/dist/{run-record-BgTFzO2r.d.ts → run-record-sItO5ftF.d.ts} RENAMED Viewed

@@ -200,6 +200,17 @@ interface RunOutcome {
      *  these records as input. Optional — single-judge or scalar-only
      *  runs leave it unset. */
     judgeScores?: JudgeScoresRecord;
+    /** Authenticity / realness verdict — did the run build the REAL thing on the
+     *  intended infra, or fake it (see `./authenticity`)? Optional: only domains
+     *  with an authenticity config populate it. Carried in the corpus so the
+     *  flywheel / off-policy learning can optimize for real completion, not gamed
+     *  pass-rate. `score` is 0-1; `gated` is the anti-Goodhart flag — a gated run
+     *  must not count as a real success regardless of `score`. */
+    realness?: {
+        score: number;
+        gated: boolean;
+        reason?: string;
+    };
 }
 /**
  * Mandatory paper-grade fields for a single evaluation run. Optional

package/dist/{semantic-concept-judge-CV9Wlx4t.d.ts → semantic-concept-judge-Du4ZVyef.d.ts} RENAMED Viewed

@@ -1,8 +1,8 @@
 import { AxAIService } from '@ax-llm/ax';
-import { c as TraceAnalystKindSpec } from './kind-factory-DW9XWPvM.js';
-import { b as AnalystRegistryOptions, a as AnalystRegistry } from './registry-DuVYiTvw.js';
+import { c as TraceAnalystKindSpec } from './kind-factory-DqV2t1Xk.js';
+import { b as AnalystRegistryOptions, A as AnalystRegistry } from './registry-BmEuU94S.js';
 import { z } from 'zod';
-import { c as AnalystFinding, A as Analyst, a as AnalystContext } from './types-CRD68aH7.js';
+import { c as AnalystFinding, A as Analyst, a as AnalystContext } from './types-DRvV0zRo.js';
 import { a as TraceAnalystSpan } from './store-GmBE2pZZ.js';
 import { L as LlmClientOptions } from './llm-client-DbjLfz-K.js';
 import { S as Severity } from './multi-layer-verifier-DlWCXuxL.js';

package/dist/{summary-report-ByiOUrHj.d.ts → summary-report-BTaXq1TS.d.ts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { R as RunRecord } from './run-record-BgTFzO2r.js';
+import { R as RunRecord } from './run-record-sItO5ftF.js';
 import { F as FailureClusterReport } from './failure-cluster-CL7IVgkJ.js';
 /**

package/dist/traces.d.ts CHANGED Viewed

@@ -14,7 +14,7 @@ import { A as AnalyzeTracesOptions, b as AnalyzeTracesResult } from './analyst-t
 export { a as AnalyzeTracesInput, c as AnalyzeTracesTurnSnapshot, d as analyzeTraces } from './analyst-t7zZS3TV.js';
 import { h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, T as TraceAnalysisStore, g as TraceAnalystFilters, b as DatasetOverview, Q as QueryTracesPage, l as ViewTraceResult, V as ViewSpansResult, c as SearchTraceResult, S as SearchSpanResult } from './store-GmBE2pZZ.js';
 export { D as DEFAULT_TRACE_ANALYST_BUDGETS, d as SpanMatchRecord, e as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, f as TraceAnalystByteBudgets, a as TraceAnalystSpan, j as TraceAnalystTraceSummary, k as ViewTraceOversized } from './store-GmBE2pZZ.js';
-import { b as RunSplitTag, a as RunTokenUsage, R as RunRecord } from './run-record-BgTFzO2r.js';
+import { b as RunSplitTag, a as RunTokenUsage, R as RunRecord } from './run-record-sItO5ftF.js';
 import { AxFunction } from '@ax-llm/ax';
 /**

package/dist/traces.js CHANGED Viewed

@@ -25,7 +25,7 @@ import {
   scoreTraceInsightReadiness,
   tokenizeDomainWords,
   traceAnalystOnRunComplete
-} from "./chunk-XGNCBAVZ.js";
+} from "./chunk-XQL22JDG.js";
 import {
   DEFAULT_REDACTION_RULES,
   REDACTION_VERSION,
@@ -86,7 +86,7 @@ import {
   defaultProviderRedactor,
   providerFromBaseUrl
 } from "./chunk-PC4UYEBM.js";
-import "./chunk-F3SRAAZO.js";
+import "./chunk-KWRRMR3J.js";
 import {
   TraceEmitter,
   llmSpanFromProvider

package/dist/{types-CRD68aH7.d.ts → types-DRvV0zRo.d.ts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { R as RunRecord } from './run-record-BgTFzO2r.js';
+import { R as RunRecord } from './run-record-sItO5ftF.js';
 import { T as TraceAnalysisStore } from './store-GmBE2pZZ.js';
 import { a as JudgeInput } from './types-Croy5h7V.js';
 import { b as LlmCallRequest, c as LlmCallResult } from './llm-client-DbjLfz-K.js';
@@ -146,6 +146,15 @@ interface AnalystFinding {
      * diff cleanly across runs.
      */
     subject?: string;
+    /** FIREWALL provenance (docs/learning-flywheel.md): true iff this finding was
+     *  lifted from a JUDGE verdict (an acceptance score), not OBSERVED from the
+     *  agent's behavior. A judge-derived finding must NEVER be admitted as a
+     *  steering input — that is the held-out judge leaking into the loop. Set at
+     *  the lift site (createJudgeAdapter); checked by `assertNoJudgeVerdict`.
+     *  Provenance, not evidence presence, is the correct discriminator: an
+     *  evidence-less trace-analyst observation legitimately steers, while a judge
+     *  verdict that happens to cite an artifact must not. */
+    derived_from_judge?: boolean;
     /** Analyst-private extras; renderers ignore unless they know the analyst. */
     metadata?: Record<string, unknown>;
 }

package/dist/{types-Bba0vl1V.d.ts → types-QHG0KnkF.d.ts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { a as RunTokenUsage } from './run-record-BgTFzO2r.js';
+import { a as RunTokenUsage } from './run-record-sItO5ftF.js';
 /**
  * @experimental
@@ -163,8 +163,8 @@ interface ParetoParent {
 }
 /** @experimental Stateless surface mutation — given findings + current
  *  surface, return N candidate surfaces. Pure transform, no generation
- *  awareness. Reflective-mutation, `runMultiShotOptimization`, `AxGEPA`
- *  conform. Wrapped by `evolutionaryDriver` to become an `ImprovementDriver`. */
+ *  awareness. Reflective-mutation and `AxGEPA` mutators conform. Wrapped by
+ *  `evolutionaryDriver` to become an `ImprovementDriver`. */
 interface Mutator<TFindings = unknown> {
     kind: string;
     mutate(args: {
@@ -206,6 +206,14 @@ interface ProposeContext<TFindings = unknown> {
      *  scenarios) into a merged candidate. Drivers doing pure single-parent
      *  reflection may ignore it. See {@link ParetoParent}. */
     paretoParents?: ParetoParent[];
+    /** FIREWALL (non-negotiable): the held-out judge is write-only — its verdicts
+     *  score the chosen output and gate promotion, and are NEVER an input to
+     *  proposal/steering (else the optimizer games the acceptance axis = an
+     *  oracle). This `never`-typed field makes that a compile-time tripwire: a
+     *  driver that tries to thread judge verdicts into the proposal will not type.
+     *  Steering may consume TRACE-OBSERVABLE signals (what the agent did) via
+     *  `findings`/`report`; it may NOT consume the judge's held-out verdict. */
+    judgeScores?: never;
 }
 /** @experimental A surface-improvement strategy — the DRIVER of the
  *  improvement loop. Given the current best surface, the history of what's

package/dist/workflow/index.d.ts CHANGED Viewed

@@ -1,7 +1,7 @@
 import { W as WorkflowTopology } from '../harness-optimizer-EnEnQPsr.js';
-import { b as RunSplitTag, a as RunTokenUsage, R as RunRecord } from '../run-record-BgTFzO2r.js';
-import { c as AnalystFinding, h as AnalystSeverity, E as EvidenceRef } from '../types-CRD68aH7.js';
-import { F as FailureClusterInsight } from '../insight-report-Df3lxYXM.js';
+import { b as RunSplitTag, a as RunTokenUsage, R as RunRecord } from '../run-record-sItO5ftF.js';
+import { c as AnalystFinding, h as AnalystSeverity, E as EvidenceRef } from '../types-DRvV0zRo.js';
+import { F as FailureClusterInsight } from '../insight-report-dlpEzQDi.js';
 import { a as VerificationReport, L as LayerResult } from '../multi-layer-verifier-DlWCXuxL.js';
 import { F as FailureClusterReport } from '../failure-cluster-CL7IVgkJ.js';
 import { R as RedactionRule, a as RedactionReport } from '../redact-B40YG2M_.js';
@@ -18,7 +18,7 @@ import '../types-Croy5h7V.js';
 import '@tangle-network/tcloud';
 import '../llm-client-DbjLfz-K.js';
 import '../raw-provider-sink-C46HDghv.js';
-import '../summary-report-ByiOUrHj.js';
+import '../summary-report-BTaXq1TS.js';
 import '../judge-calibration-DilmB3Ml.js';
 import '../control-runtime-DuFBYg7A.js';
 import '../emitter-DEZwY14K.js';

package/dist/workflow/index.js CHANGED Viewed

@@ -7,7 +7,7 @@ import {
 } from "../chunk-GGE4NNQT.js";
 import {
   validateRunRecord
-} from "../chunk-F3SRAAZO.js";
+} from "../chunk-KWRRMR3J.js";
 import "../chunk-VSMTAMNK.js";
 import {
   ValidationError

package/docs/auto-research-loop-end-to-end.md CHANGED Viewed

@@ -152,7 +152,7 @@ async function runAutoResearchLoop(opts: {
 Two cases:
 1. **Trajectory-shaped optimization with steering.** Use
-   `runMultiShotOptimization` directly — it already runs the inner
+   `runImprovementLoop` directly — it already runs the inner
    search-vs-holdout loop. Wrap with `analyzeOptimizationResult` after
    for the RL bridge.

package/docs/feature-guide.md CHANGED Viewed

@@ -33,8 +33,8 @@ trying, and whether a change made them better or worse.
 | “Human feedback should become reusable eval data.” | `FeedbackTrajectory` | Captures approvals, rejections, edits, choices, metrics, and policy blocks. |
 | “Can this action run, or does it need approval?” | `evaluateActionPolicy` | Generic preflight for side effects, budgets, and required evidence. |
 | “I need train/dev/test/holdout examples.” | `Dataset` plus feedback trajectory conversion | Stable splits and contamination control. |
-| “Which prompt or signature wins?” | `runMultiShotOptimization`, steering optimizers | Runs variants on scenarios and compares scores. |
-| “Improve a multi-turn agent over real task traces.” | `runMultiShotOptimization` | GEPA-style trajectory optimization with ASI and held-out promotion. |
+| “Which prompt or signature wins?” | `runImprovementLoop`, steering optimizers | Runs variants on scenarios and compares scores. |
+| “Improve a multi-turn agent over real task traces.” | `runImprovementLoop` | GEPA-style trajectory optimization with ASI and held-out promotion. |
 | “Improve prompts, then code if prompts plateau.” | `runPromptEvolution`, composite mutator, code mutator | Bounded evolution with telemetry and lineage. |
 | “Find why a regression happened.” | bisector, traces, run records | Narrows changes and preserves evidence. |
 | “Expose evals to another language.” | Wire protocol and Python client | HTTP/RPC boundary for non-TypeScript apps. |
@@ -105,7 +105,7 @@ generated code -> build/test/runtime gates -> score -> ship or revise
 Use when you want Ax/GEPA-style improvement.
-1. For variable-length agent tasks, use `runMultiShotOptimization`.
+1. For variable-length agent tasks, use `runImprovementLoop`.
 2. Build search/dev/test/holdout splits from the real product loop.
 3. Score full trajectories, not just final text.
 4. Emit actionable side information for failures the mutator can fix.
@@ -156,7 +156,7 @@ Store as `FeedbackTrajectory`, then derive:
 | Feedback data | `FeedbackTrajectory`, stores, converters | Human/environment labels | Domain adapters live in downstream repos. |
 | Action policy | `evaluateActionPolicy` | Approval/budget preflight | Blocks or labels actions before `act()`. |
 | Datasets | `Dataset`, holdout tools, canaries | Train/dev/test/holdout corpora | Keeps optimization honest. |
-| Optimization | `runMultiShotOptimization`, steering optimizers | Prompt/signature comparison | Use held-out gates before promotion. |
+| Optimization | `runImprovementLoop`, steering optimizers | Prompt/signature comparison | Use held-out gates before promotion. |
 | Evolution | prompt/code mutators, sandbox pool, telemetry | Autoresearch and mutation loops | Use budgets and lineage; do not run unbounded. |
 | Telemetry | `TraceStore`, OTLP, file sinks | Audit and replay | Treat traces as evidence, not just logs. |
 | Reporting | summaries, pareto, cost tracker | Decision support | Useful for PRs, launch gates, research notes. |

package/docs/multi-shot-optimization.md CHANGED Viewed

@@ -1,129 +1,75 @@
 # Multi-Shot Optimization
-`runMultiShotOptimization` is the public adapter for GEPA-style optimization over
-variable-length agent conversations.
+> **Renamed.** `runMultiShotOptimization` was retired. The live API is
+> `runImprovementLoop` (driver-agnostic, gated promotion) driven by `gepaDriver`,
+> with `compareDrivers` for head-to-head driver lift. This doc was rewritten for the
+> live API; see also [feature-guide.md](./feature-guide.md) and [concepts.md](./concepts.md).
-Use it when the thing you want to improve is not a single model call. Typical
-targets are agent system prompts, tool descriptions, routing policies, retrieval
-plans, or app-specific scaffolding that affects an entire task trajectory.
+`runImprovementLoop` is the public entry for GEPA-style optimization over a whole
+task trajectory — the thing you improve is not a single model call but an agent
+system prompt, tool descriptions, a routing policy, or any scaffolding that affects
+the entire run. It is the OUTER loop: it improves the SURFACE the inner workers run.
-The primitive is intentionally small. Your app owns the domain logic:
+## The shape
-- `seedVariants`: prompt/config/tool-policy candidates
-- `runner`: executes one complete task trajectory for one variant
-- `scorer`: scores the trajectory and emits actionable side information
-- `mutateAdapter`: proposes new variants from top and bottom trials
+You own a few seams; the loop owns the release-critical glue (paired seeds, the
+held-out re-score, the promotion gate, provenance):
-`agent-eval` owns the release-critical glue:
+- **`baselineSurface`** — the current surface (a prompt string, or a `CodeSurface`).
+- **`dispatchWithSurface(surface, scenario, ctx)`** — run one task to completion
+  under a candidate surface; return the artifact the judges score.
+- **`judges`** — score the artifact (`{ composite, dimensions }`).
+- **`driver`** — proposes candidate surfaces each generation: `gepaDriver`
+  (reflective + Pareto frontier) or `evolutionaryDriver` (mutator).
+- **`gate`** — `defaultProductionGate` (held-out significance + red-team +
+  reward-hacking + canary). Ships ONLY on a CI-lower-bound held-out lift.
-- stable paired seeds
-- search-split prompt evolution
-- cost/score Pareto objectives
-- failed-run conversion into failed trials
-- ASI projection into reflection traces and numeric metrics
-- optional paired holdout gating through `HeldOutGate`
-- validated `RunRecord` rows for promotion evidence
-## Result Contract
-The return shape separates discovery from promotion:
-- `searchBestVariant`: best variant on the optimizer-visible search scenarios
-- `searchBestAggregate`: aggregate for that search winner
-- `promotedVariant`: variant callers should ship
-- `promotedAggregate`: aggregate for the promoted variant
-- `gate`: holdout decision and evidence, or `null` when no gate ran
-If a holdout gate is configured and rejects the search winner,
-`promotedVariant` is the baseline. Do not ship `searchBestVariant` directly
-unless you intentionally run without a holdout gate.
-## Actionable Side Information
-The scorer should return `asi` rows for concrete failure modes:
-```ts
-{
-  expectationId: 'used-primary-sources',
-  message: 'The final answer cited secondary summaries instead of primary sources.',
-  severity: 'error',
-  responsibleSurface: 'retrieval-policy',
-  suggestion: 'Prefer primary-source domains during source-gathering turns.',
-}
-```
-Standard knowledge-related responsible surfaces are:
-- `knowledge-requirements`
-- `data-acquisition`
-- `retrieval-policy`
-- `user-question-policy`
-These rows become:
-- reflection expectations via `trialTraceFromMultiShotTrial`
-- aggregate metrics like `asi.error` and `surface.retrieval-policy`
-- trace evidence available to downstream reports
-This is the main reason to use this primitive instead of reducing each run to a
-single scalar reward.
-## Holdout Discipline
-For release gates, configure `gate`. The first seed variant is the baseline and
-`gate.gate.baselineKey` must match its id.
-Holdout scenarios must be disjoint from `searchScenarioIds`. The adapter runs
-baseline and candidate with the same `(scenarioId, rep)` seed, validates every
-row with `validateRunRecord`, then asks `HeldOutGate` whether to promote.
-When `gate.searchScenarioIds` is omitted, the adapter reuses
-`searchScenarioIds` for the overfit-gap check.
-## Minimal Shape
+## Minimal example
 ```ts
 import {
-  runMultiShotOptimization,
-  trialTraceFromMultiShotTrial,
-  type MultiShotVariant,
-} from '@tangle-network/agent-eval'
-type Payload = { systemPrompt: string }
-const baseline: MultiShotVariant<Payload> = {
-  id: 'baseline',
-  label: 'baseline',
-  generation: 0,
-  payload: { systemPrompt: currentPrompt },
-}
-const result = await runMultiShotOptimization<Payload>({
-  runId: `research-agent-${Date.now()}`,
-  target: 'research-agent-system-prompt',
-  seedVariants: [baseline],
-  searchScenarioIds: searchScenarios.map((s) => s.id),
-  reps: 2,
-  generations: 4,
+  runImprovementLoop,
+  gepaDriver,
+  defaultProductionGate,
+} from '@tangle-network/agent-eval/contract'
+const result = await runImprovementLoop({
+  baselineSurface: currentSystemPrompt,
+  scenarios: trainScenarios, // optimizer-visible
+  holdoutScenarios, // DISJOINT — only the gate sees these
+  dispatchWithSurface: async (surface, scenario) =>
+    runYourAgentToCompletion({ scenario, prompt: String(surface) }),
+  judges: [myJudge],
+  driver: gepaDriver({
+    llm: { apiKey, baseUrl },
+    model: 'gpt-5',
+    target: 'enforce a strict output schema',
+  }),
   populationSize: 4,
-  scoreConcurrency: 4,
-  runner: {
-    async run({ variant, scenarioId, seed }) {
-      return runYourAgentToCompletion({ scenarioId, seed, prompt: variant.payload.systemPrompt })
-    },
-  },
-  scorer: {
-    async score({ run }) {
-      return scoreFullTrajectory(run.trace)
-    },
-  },
-  mutateAdapter: {
-    async mutate({ parent, bottomTrials, childCount, generation }) {
-      const traces = bottomTrials.map((t) => trialTraceFromMultiShotTrial(t))
-      return proposePromptMutations({ parent, traces, childCount, generation })
-    },
-  },
+  maxGenerations: 4,
+  gate: defaultProductionGate({ holdoutScenarios, deltaThreshold: 0 }),
+  autoOnPromote: 'none', // or 'pr' (+ ghOwner/ghRepo) to open a PR on ship
+  runDir,
 })
-deploy(result.promotedVariant.payload)
+if (result.gateResult.decision === 'ship') {
+  deploy(result.winnerSurface) // the driver's proposal, gated on a real held-out lift
+}
 ```
+## Discipline (what makes it trustworthy)
+- **Holdout is disjoint + gated.** `holdoutScenarios` must not overlap the training
+  pool. The gate re-scores baseline vs winner on the holdout and ships only when the
+  paired-bootstrap CI lower bound clears `deltaThreshold`; a swing driven by only a
+  few instances at small `n` is held (`few_runs`), not promoted.
+- **No-op never ships.** If no candidate beats the baseline, the winner IS the
+  baseline (empty diff) and the loop forces `hold` — it does not score
+  baseline-vs-itself and read model noise as lift.
+- **Provenance falls out.** `result.promotedDiff` + `emitLoopProvenance` give the
+  auditable candidate→gate→promote chain (rationale, content hashes, a held-out lift
+  recomputable from the emitted record).
+Reach for `compareDrivers` when the question is "which DRIVER wins" rather than
+"improve this surface", and see `tests/campaign/presets.test.ts` for the executable
+contract (no-op guard, fail-loud holdout, gate promotion).

package/docs/product-eval-adoption.md CHANGED Viewed

@@ -152,7 +152,7 @@ set with a signed note.
 ## Optimization
-Use `runMultiShotOptimization()` when the system is a multi-step agent, not a
+Use `runImprovementLoop()` when the system is a multi-step agent, not a
 single prompt.
 Good optimization targets:

package/docs/three-package-architecture.md CHANGED Viewed

@@ -136,7 +136,7 @@ report, RL bridge).
 | From → To | Type | What it carries |
 |---|---|---|
-| agent-knowledge → agent-eval | `RunRecord` | (consumed via `runMultiShotOptimization` for knowledge-base optimization) |
+| agent-knowledge → agent-eval | `RunRecord` | (consumed via `runImprovementLoop` for knowledge-base optimization) |
 | agent-knowledge → agent-eval | `KnowledgeReadinessReport`, `KnowledgeBundle`, `KnowledgeRequirement` | (re-exported from agent-eval; agent-knowledge populates) |
 | agent-knowledge → agent-eval | `ControlRuntimeConfig<KnowledgeBaseCandidate>` | (knowledge research adapter) |
 | agent-runtime → agent-eval | `runAgentControlLoop`, `scoreKnowledgeReadiness`, `blockingKnowledgeEval` | (consumed; agent-runtime calls these in its task lifecycle) |

package/docs/trace-analysis.md CHANGED Viewed

@@ -44,6 +44,25 @@ console.log(result.findings)
 Products can pass any `TraceAnalysisStore`; they do not need to use the file
 store in production.
+## Deterministic failure coverage (no LLM)
+Before (or alongside) running the LLM analyst, call `OtlpFileTraceStore.getOverview()`: it returns a
+`DatasetOverview` whose `error_clusters` are computed deterministically — error
+spans are grouped by a normalized failure signature (uuids / hex ids / numbers /
+absolute paths / durations collapsed), each cluster carrying its prevalence,
+exemplar `trace_id`/`span_id`, and a verbatim sample. This is a zero-LLM,
+reproducible failure checklist the analyst then explains and closes:
+```ts
+const overview = await store.getOverview()
+for (const c of overview.error_clusters) {
+  console.log(`${c.trace_count}× ${c.signature} — e.g. trace ${c.exemplar_trace_ids[0]}`)
+}
+```
+See `failureClusters` in [insight-report.md](./insight-report.md) and the
+`ErrorCluster` type doc-comments for the field-level contract.
 ## Required Trace Shape
 Every serious product run should include:

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-eval",
-  "version": "0.77.0",
+  "version": "0.79.0",
   "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
   "homepage": "https://github.com/tangle-network/agent-eval#readme",
   "repository": {