@tangle-network/agent-eval 0.77.0 → 0.79.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. package/dist/adapters/http.d.ts +2 -2
  2. package/dist/adapters/langchain.d.ts +2 -2
  3. package/dist/adapters/otel.d.ts +4 -4
  4. package/dist/{agent-profile-DYRboYWu.d.ts → agent-profile-aSEaJ9Pl.d.ts} +1 -1
  5. package/dist/analyst/index.d.ts +42 -8
  6. package/dist/analyst/index.js +32 -2
  7. package/dist/analyst/index.js.map +1 -1
  8. package/dist/authenticity/index.d.ts +54 -1
  9. package/dist/authenticity/index.js +88 -1
  10. package/dist/authenticity/index.js.map +1 -1
  11. package/dist/benchmarks/index.d.ts +2 -2
  12. package/dist/campaign/index.d.ts +11 -11
  13. package/dist/campaign/index.js +4 -4
  14. package/dist/{chunk-7W4SM7FD.js → chunk-5LVWPNS5.js} +91 -91
  15. package/dist/chunk-5LVWPNS5.js.map +1 -0
  16. package/dist/{chunk-WYIHD6EB.js → chunk-CF67I6QY.js} +1 -1
  17. package/dist/chunk-CF67I6QY.js.map +1 -0
  18. package/dist/{chunk-XPILG2CA.js → chunk-GXHLRXDI.js} +2 -2
  19. package/dist/{chunk-F3SRAAZO.js → chunk-KWRRMR3J.js} +15 -1
  20. package/dist/chunk-KWRRMR3J.js.map +1 -0
  21. package/dist/{chunk-JYE3WOTE.js → chunk-RPLZ4OIB.js} +10 -1
  22. package/dist/chunk-RPLZ4OIB.js.map +1 -0
  23. package/dist/{chunk-6EKXFFGQ.js → chunk-RTWFUK6A.js} +2 -2
  24. package/dist/{chunk-XGNCBAVZ.js → chunk-XQL22JDG.js} +2 -2
  25. package/dist/{chunk-GJJNJVIR.js → chunk-XXNIODOM.js} +2 -2
  26. package/dist/contract/index.d.ts +12 -12
  27. package/dist/contract/index.js +2 -2
  28. package/dist/{control-BgA6BYTm.d.ts → control-CehLtoET.d.ts} +1 -1
  29. package/dist/control.d.ts +2 -2
  30. package/dist/control.js +2 -2
  31. package/dist/hosted/index.d.ts +4 -4
  32. package/dist/{index-DsnOpCO6.d.ts → index-B1RKber3.d.ts} +1 -1
  33. package/dist/index.d.ts +126 -25
  34. package/dist/index.js +32 -7
  35. package/dist/index.js.map +1 -1
  36. package/dist/{insight-report-Df3lxYXM.d.ts → insight-report-dlpEzQDi.d.ts} +1 -1
  37. package/dist/{kind-factory-DW9XWPvM.d.ts → kind-factory-DqV2t1Xk.d.ts} +1 -1
  38. package/dist/meta-eval/index.d.ts +2 -2
  39. package/dist/openapi.json +1 -1
  40. package/dist/{provenance-B-TFszPW.d.ts → provenance-CEAJI9rm.d.ts} +3 -3
  41. package/dist/{registry-DuVYiTvw.d.ts → registry-BmEuU94S.d.ts} +2 -2
  42. package/dist/{release-report-CN8hJlhk.d.ts → release-report-CXXZlR8g.d.ts} +2 -2
  43. package/dist/reporting.d.ts +4 -4
  44. package/dist/{researcher-C_KJyIGg.d.ts → researcher-rInLj9De.d.ts} +2 -2
  45. package/dist/rl.d.ts +6 -6
  46. package/dist/rl.js +2 -2
  47. package/dist/{rubric-predictive-validity-D_4BSXGV.d.ts → rubric-predictive-validity-CWyWWLBg.d.ts} +1 -1
  48. package/dist/{run-improvement-loop-BqYH2vCR.d.ts → run-improvement-loop-Bgu4C59E.d.ts} +2 -4
  49. package/dist/{run-record-BgTFzO2r.d.ts → run-record-sItO5ftF.d.ts} +11 -0
  50. package/dist/{semantic-concept-judge-CV9Wlx4t.d.ts → semantic-concept-judge-Du4ZVyef.d.ts} +3 -3
  51. package/dist/{summary-report-ByiOUrHj.d.ts → summary-report-BTaXq1TS.d.ts} +1 -1
  52. package/dist/traces.d.ts +1 -1
  53. package/dist/traces.js +2 -2
  54. package/dist/{types-CRD68aH7.d.ts → types-DRvV0zRo.d.ts} +10 -1
  55. package/dist/{types-Bba0vl1V.d.ts → types-QHG0KnkF.d.ts} +11 -3
  56. package/dist/workflow/index.d.ts +4 -4
  57. package/dist/workflow/index.js +1 -1
  58. package/docs/auto-research-loop-end-to-end.md +1 -1
  59. package/docs/feature-guide.md +4 -4
  60. package/docs/multi-shot-optimization.md +61 -115
  61. package/docs/product-eval-adoption.md +1 -1
  62. package/docs/three-package-architecture.md +1 -1
  63. package/docs/trace-analysis.md +19 -0
  64. package/package.json +1 -1
  65. package/dist/chunk-7W4SM7FD.js.map +0 -1
  66. package/dist/chunk-F3SRAAZO.js.map +0 -1
  67. package/dist/chunk-JYE3WOTE.js.map +0 -1
  68. package/dist/chunk-WYIHD6EB.js.map +0 -1
  69. /package/dist/{chunk-XPILG2CA.js.map → chunk-GXHLRXDI.js.map} +0 -0
  70. /package/dist/{chunk-6EKXFFGQ.js.map → chunk-RTWFUK6A.js.map} +0 -0
  71. /package/dist/{chunk-XGNCBAVZ.js.map → chunk-XQL22JDG.js.map} +0 -0
  72. /package/dist/{chunk-GJJNJVIR.js.map → chunk-XXNIODOM.js.map} +0 -0
@@ -1,4 +1,4 @@
1
- import { G as GainDistributionBin, P as ParetoFigureSpec } from './summary-report-ByiOUrHj.js';
1
+ import { G as GainDistributionBin, P as ParetoFigureSpec } from './summary-report-BTaXq1TS.js';
2
2
  import { a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
3
3
 
4
4
  /**
@@ -1,7 +1,7 @@
1
1
  import { AxAIService, AxFunction } from '@ax-llm/ax';
2
2
  import { T as TraceAnalysisStore } from './store-GmBE2pZZ.js';
3
3
  import { z } from 'zod';
4
- import { g as AnalystCost, a as AnalystContext, A as Analyst } from './types-CRD68aH7.js';
4
+ import { g as AnalystCost, a as AnalystContext, A as Analyst } from './types-DRvV0zRo.js';
5
5
 
6
6
  /**
7
7
  * Typed Ax output for analyst findings.
@@ -2,8 +2,8 @@ import { T as TraceStore } from '../store-CKUAgsJz.js';
2
2
  import { R as Run } from '../schema-m0gsnbt3.js';
3
3
  import { a as OutcomeFilter, O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
4
4
  export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from '../outcome-store-D6KWmYvj.js';
5
- export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-D_4BSXGV.js';
6
- import '../run-record-BgTFzO2r.js';
5
+ export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-CWyWWLBg.js';
6
+ import '../run-record-sItO5ftF.js';
7
7
  import '../errors-Dwqw-T_m.js';
8
8
 
9
9
  /**
package/dist/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "@tangle-network/agent-eval — wire protocol",
5
- "version": "0.77.0",
5
+ "version": "0.79.0",
6
6
  "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
7
7
  "contact": {
8
8
  "name": "Tangle Network",
@@ -1,9 +1,9 @@
1
- import { o as Mutator, I as ImprovementDriver, S as Scenario, G as Gate, k as GateResult, i as GateContext, f as CampaignResult, M as MutableSurface, j as GateDecision } from './types-Bba0vl1V.js';
1
+ import { o as Mutator, I as ImprovementDriver, S as Scenario, G as Gate, k as GateResult, i as GateContext, f as CampaignResult, M as MutableSurface, j as GateDecision } from './types-QHG0KnkF.js';
2
2
  import { R as RedTeamCase } from './red-team-DW9Ca_tj.js';
3
- import { R as RunRecord } from './run-record-BgTFzO2r.js';
3
+ import { R as RunRecord } from './run-record-sItO5ftF.js';
4
4
  import { D as Direction } from './pareto-E-pembql.js';
5
5
  import { a as PairedBootstrapResult } from './statistics-B7yCbi9i.js';
6
- import { a as RunCampaignOptions, C as CampaignStorage } from './run-improvement-loop-BqYH2vCR.js';
6
+ import { a as RunCampaignOptions, C as CampaignStorage } from './run-improvement-loop-Bgu4C59E.js';
7
7
  import { HostedClient, TraceSpanEvent } from './hosted/index.js';
8
8
 
9
9
  /**
@@ -1,4 +1,4 @@
1
- import { A as Analyst, a as AnalystContext, b as AnalystRunSummary, c as AnalystFinding, d as AnalystRunResult, C as ChatClient, e as AnalystRunInputs, f as AnalystRunEvent } from './types-CRD68aH7.js';
1
+ import { A as Analyst, a as AnalystContext, b as AnalystRunSummary, c as AnalystFinding, d as AnalystRunResult, C as ChatClient, e as AnalystRunInputs, f as AnalystRunEvent } from './types-DRvV0zRo.js';
2
2
 
3
3
  /**
4
4
  * AnalystRegistry — orchestrate N analysts against one run.
@@ -125,4 +125,4 @@ declare class AnalystRegistry {
125
125
  private routeInput;
126
126
  }
127
127
 
128
- export { type AnalystHooks as A, type BudgetPolicy as B, type RegistryRunOpts as R, AnalystRegistry as a, type AnalystRegistryOptions as b };
128
+ export { AnalystRegistry as A, type BudgetPolicy as B, type RegistryRunOpts as R, type AnalystHooks as a, type AnalystRegistryOptions as b };
@@ -1,6 +1,6 @@
1
1
  import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-B2kL-fSM.js';
2
- import { m as GateDecision } from './summary-report-ByiOUrHj.js';
3
- import { R as RunRecord, b as RunSplitTag } from './run-record-BgTFzO2r.js';
2
+ import { m as GateDecision } from './summary-report-BTaXq1TS.js';
3
+ import { R as RunRecord, b as RunSplitTag } from './run-record-sItO5ftF.js';
4
4
 
5
5
  /**
6
6
  * Release confidence gate.
@@ -1,9 +1,9 @@
1
- export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-D_4BSXGV.js';
2
- export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, R as ReleaseConfidenceAxis, b as ReleaseConfidenceAxisName, c as ReleaseConfidenceInput, d as ReleaseConfidenceIssue, e as ReleaseConfidenceMetrics, f as ReleaseConfidenceScorecard, g as ReleaseConfidenceStatus, h as ReleaseConfidenceThresholds, i as ReleaseTraceEvidence, j as RenderReleaseReportOptions, V as Verdict, k as assertReleaseConfidence, l as bootstrapCi, m as evaluateReleaseConfidence, n as judgeReplayGate, r as renderReleaseReport } from './release-report-CN8hJlhk.js';
1
+ export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-CWyWWLBg.js';
2
+ export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, R as ReleaseConfidenceAxis, b as ReleaseConfidenceAxisName, c as ReleaseConfidenceInput, d as ReleaseConfidenceIssue, e as ReleaseConfidenceMetrics, f as ReleaseConfidenceScorecard, g as ReleaseConfidenceStatus, h as ReleaseConfidenceThresholds, i as ReleaseTraceEvidence, j as RenderReleaseReportOptions, V as Verdict, k as assertReleaseConfidence, l as bootstrapCi, m as evaluateReleaseConfidence, n as judgeReplayGate, r as renderReleaseReport } from './release-report-CXXZlR8g.js';
3
3
  export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
4
4
  export { P as PairedBootstrapOptions, a as PairedBootstrapResult, b as benjaminiHochberg, p as pairedBootstrap, w as wilcoxonSignedRank } from './statistics-B7yCbi9i.js';
5
- export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-ByiOUrHj.js';
6
- import './run-record-BgTFzO2r.js';
5
+ export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-BTaXq1TS.js';
6
+ import './run-record-sItO5ftF.js';
7
7
  import './errors-Dwqw-T_m.js';
8
8
  import './schema-m0gsnbt3.js';
9
9
  import './outcome-store-D6KWmYvj.js';
@@ -1,6 +1,6 @@
1
- import { b as RunSplitTag, a as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-BgTFzO2r.js';
1
+ import { b as RunSplitTag, a as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-sItO5ftF.js';
2
2
  import { L as LlmClientOptions, a as LlmRouteRequirements } from './llm-client-DbjLfz-K.js';
3
- import { h as ResearchReportOptions, d as ResearchReport, m as GateDecision } from './summary-report-ByiOUrHj.js';
3
+ import { h as ResearchReportOptions, d as ResearchReport, m as GateDecision } from './summary-report-BTaXq1TS.js';
4
4
  import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DEZwY14K.js';
5
5
  import { R as RunIntegrityExpectations, a as RunIntegrityReport } from './integrity-CJzrpUua.js';
6
6
  import { R as RawProviderSink } from './raw-provider-sink-C46HDghv.js';
package/dist/rl.d.ts CHANGED
@@ -1,18 +1,18 @@
1
- import { R as RunRecord, b as RunSplitTag } from './run-record-BgTFzO2r.js';
2
- import { f as CampaignResult } from './types-Bba0vl1V.js';
1
+ import { R as RunRecord, b as RunSplitTag } from './run-record-sItO5ftF.js';
2
+ import { f as CampaignResult } from './types-QHG0KnkF.js';
3
3
  import { a as VerificationReport } from './multi-layer-verifier-DlWCXuxL.js';
4
4
  import { S as Span } from './schema-m0gsnbt3.js';
5
5
  import { T as TraceStore } from './store-CKUAgsJz.js';
6
6
  import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
7
7
  export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from './outcome-store-D6KWmYvj.js';
8
- import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-D_4BSXGV.js';
9
- import { R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-C_KJyIGg.js';
10
- export { r as runEvalCampaign } from './researcher-C_KJyIGg.js';
8
+ import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-CWyWWLBg.js';
9
+ import { R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-rInLj9De.js';
10
+ export { r as runEvalCampaign } from './researcher-rInLj9De.js';
11
11
  import { I as InterimReleaseConfidence } from './sequential-5iSVfzl2.js';
12
12
  import './errors-Dwqw-T_m.js';
13
13
  import './llm-client-DbjLfz-K.js';
14
14
  import './raw-provider-sink-C46HDghv.js';
15
- import './summary-report-ByiOUrHj.js';
15
+ import './summary-report-BTaXq1TS.js';
16
16
  import './failure-cluster-CL7IVgkJ.js';
17
17
  import './emitter-DEZwY14K.js';
18
18
  import './integrity-CJzrpUua.js';
package/dist/rl.js CHANGED
@@ -10,7 +10,7 @@ import {
10
10
  } from "./chunk-3RF76KTD.js";
11
11
  import {
12
12
  runEvalCampaign
13
- } from "./chunk-GJJNJVIR.js";
13
+ } from "./chunk-XXNIODOM.js";
14
14
  import "./chunk-IHDHUN2X.js";
15
15
  import {
16
16
  rubricPredictiveValidity
@@ -25,7 +25,7 @@ import {
25
25
  } from "./chunk-ITBRCT73.js";
26
26
  import "./chunk-SBCB6VZY.js";
27
27
  import "./chunk-PC4UYEBM.js";
28
- import "./chunk-F3SRAAZO.js";
28
+ import "./chunk-KWRRMR3J.js";
29
29
  import "./chunk-TVVP3ZZQ.js";
30
30
  import "./chunk-VSMTAMNK.js";
31
31
  import {
@@ -1,4 +1,4 @@
1
- import { R as RunRecord } from './run-record-BgTFzO2r.js';
1
+ import { R as RunRecord } from './run-record-sItO5ftF.js';
2
2
  import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
3
3
 
4
4
  /**
@@ -1,5 +1,5 @@
1
1
  import { L as LlmClientOptions } from './llm-client-DbjLfz-K.js';
2
- import { I as ImprovementDriver, S as Scenario, f as CampaignResult, k as GateResult, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, g as CampaignTraceWriter, m as GenerationRecord, M as MutableSurface, P as ParetoParent, G as Gate } from './types-Bba0vl1V.js';
2
+ import { I as ImprovementDriver, S as Scenario, f as CampaignResult, k as GateResult, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, g as CampaignTraceWriter, m as GenerationRecord, M as MutableSurface, P as ParetoParent, G as Gate } from './types-QHG0KnkF.js';
3
3
 
4
4
  /**
5
5
  * @experimental
@@ -28,9 +28,7 @@ import { I as ImprovementDriver, S as Scenario, f as CampaignResult, k as GateRe
28
28
  *
29
29
  * The driver is surface-agnostic — any string surface in any consumer opts
30
30
  * in by selecting it. Reuses the generic reflection primitive
31
- * (`buildReflectionPrompt` / `parseReflectionResponse`) and the router
32
- * client; no dependency on the legacy `runMultiShotOptimization` /
33
- * `prompt-evolution` orchestration.
31
+ * (`buildReflectionPrompt` / `parseReflectionResponse`) and the router client.
34
32
  *
35
33
  * Earns its keep where there is real per-instance signal (which the
36
34
  * dimensional + per-scenario evidence + the `LabeledScenarioStore` flywheel
@@ -200,6 +200,17 @@ interface RunOutcome {
200
200
  * these records as input. Optional — single-judge or scalar-only
201
201
  * runs leave it unset. */
202
202
  judgeScores?: JudgeScoresRecord;
203
+ /** Authenticity / realness verdict — did the run build the REAL thing on the
204
+ * intended infra, or fake it (see `./authenticity`)? Optional: only domains
205
+ * with an authenticity config populate it. Carried in the corpus so the
206
+ * flywheel / off-policy learning can optimize for real completion, not gamed
207
+ * pass-rate. `score` is 0-1; `gated` is the anti-Goodhart flag — a gated run
208
+ * must not count as a real success regardless of `score`. */
209
+ realness?: {
210
+ score: number;
211
+ gated: boolean;
212
+ reason?: string;
213
+ };
203
214
  }
204
215
  /**
205
216
  * Mandatory paper-grade fields for a single evaluation run. Optional
@@ -1,8 +1,8 @@
1
1
  import { AxAIService } from '@ax-llm/ax';
2
- import { c as TraceAnalystKindSpec } from './kind-factory-DW9XWPvM.js';
3
- import { b as AnalystRegistryOptions, a as AnalystRegistry } from './registry-DuVYiTvw.js';
2
+ import { c as TraceAnalystKindSpec } from './kind-factory-DqV2t1Xk.js';
3
+ import { b as AnalystRegistryOptions, A as AnalystRegistry } from './registry-BmEuU94S.js';
4
4
  import { z } from 'zod';
5
- import { c as AnalystFinding, A as Analyst, a as AnalystContext } from './types-CRD68aH7.js';
5
+ import { c as AnalystFinding, A as Analyst, a as AnalystContext } from './types-DRvV0zRo.js';
6
6
  import { a as TraceAnalystSpan } from './store-GmBE2pZZ.js';
7
7
  import { L as LlmClientOptions } from './llm-client-DbjLfz-K.js';
8
8
  import { S as Severity } from './multi-layer-verifier-DlWCXuxL.js';
@@ -1,4 +1,4 @@
1
- import { R as RunRecord } from './run-record-BgTFzO2r.js';
1
+ import { R as RunRecord } from './run-record-sItO5ftF.js';
2
2
  import { F as FailureClusterReport } from './failure-cluster-CL7IVgkJ.js';
3
3
 
4
4
  /**
package/dist/traces.d.ts CHANGED
@@ -14,7 +14,7 @@ import { A as AnalyzeTracesOptions, b as AnalyzeTracesResult } from './analyst-t
14
14
  export { a as AnalyzeTracesInput, c as AnalyzeTracesTurnSnapshot, d as analyzeTraces } from './analyst-t7zZS3TV.js';
15
15
  import { h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, T as TraceAnalysisStore, g as TraceAnalystFilters, b as DatasetOverview, Q as QueryTracesPage, l as ViewTraceResult, V as ViewSpansResult, c as SearchTraceResult, S as SearchSpanResult } from './store-GmBE2pZZ.js';
16
16
  export { D as DEFAULT_TRACE_ANALYST_BUDGETS, d as SpanMatchRecord, e as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, f as TraceAnalystByteBudgets, a as TraceAnalystSpan, j as TraceAnalystTraceSummary, k as ViewTraceOversized } from './store-GmBE2pZZ.js';
17
- import { b as RunSplitTag, a as RunTokenUsage, R as RunRecord } from './run-record-BgTFzO2r.js';
17
+ import { b as RunSplitTag, a as RunTokenUsage, R as RunRecord } from './run-record-sItO5ftF.js';
18
18
  import { AxFunction } from '@ax-llm/ax';
19
19
 
20
20
  /**
package/dist/traces.js CHANGED
@@ -25,7 +25,7 @@ import {
25
25
  scoreTraceInsightReadiness,
26
26
  tokenizeDomainWords,
27
27
  traceAnalystOnRunComplete
28
- } from "./chunk-XGNCBAVZ.js";
28
+ } from "./chunk-XQL22JDG.js";
29
29
  import {
30
30
  DEFAULT_REDACTION_RULES,
31
31
  REDACTION_VERSION,
@@ -86,7 +86,7 @@ import {
86
86
  defaultProviderRedactor,
87
87
  providerFromBaseUrl
88
88
  } from "./chunk-PC4UYEBM.js";
89
- import "./chunk-F3SRAAZO.js";
89
+ import "./chunk-KWRRMR3J.js";
90
90
  import {
91
91
  TraceEmitter,
92
92
  llmSpanFromProvider
@@ -1,4 +1,4 @@
1
- import { R as RunRecord } from './run-record-BgTFzO2r.js';
1
+ import { R as RunRecord } from './run-record-sItO5ftF.js';
2
2
  import { T as TraceAnalysisStore } from './store-GmBE2pZZ.js';
3
3
  import { a as JudgeInput } from './types-Croy5h7V.js';
4
4
  import { b as LlmCallRequest, c as LlmCallResult } from './llm-client-DbjLfz-K.js';
@@ -146,6 +146,15 @@ interface AnalystFinding {
146
146
  * diff cleanly across runs.
147
147
  */
148
148
  subject?: string;
149
+ /** FIREWALL provenance (docs/learning-flywheel.md): true iff this finding was
150
+ * lifted from a JUDGE verdict (an acceptance score), not OBSERVED from the
151
+ * agent's behavior. A judge-derived finding must NEVER be admitted as a
152
+ * steering input — that is the held-out judge leaking into the loop. Set at
153
+ * the lift site (createJudgeAdapter); checked by `assertNoJudgeVerdict`.
154
+ * Provenance, not evidence presence, is the correct discriminator: an
155
+ * evidence-less trace-analyst observation legitimately steers, while a judge
156
+ * verdict that happens to cite an artifact must not. */
157
+ derived_from_judge?: boolean;
149
158
  /** Analyst-private extras; renderers ignore unless they know the analyst. */
150
159
  metadata?: Record<string, unknown>;
151
160
  }
@@ -1,4 +1,4 @@
1
- import { a as RunTokenUsage } from './run-record-BgTFzO2r.js';
1
+ import { a as RunTokenUsage } from './run-record-sItO5ftF.js';
2
2
 
3
3
  /**
4
4
  * @experimental
@@ -163,8 +163,8 @@ interface ParetoParent {
163
163
  }
164
164
  /** @experimental Stateless surface mutation — given findings + current
165
165
  * surface, return N candidate surfaces. Pure transform, no generation
166
- * awareness. Reflective-mutation, `runMultiShotOptimization`, `AxGEPA`
167
- * conform. Wrapped by `evolutionaryDriver` to become an `ImprovementDriver`. */
166
+ * awareness. Reflective-mutation and `AxGEPA` mutators conform. Wrapped by
167
+ * `evolutionaryDriver` to become an `ImprovementDriver`. */
168
168
  interface Mutator<TFindings = unknown> {
169
169
  kind: string;
170
170
  mutate(args: {
@@ -206,6 +206,14 @@ interface ProposeContext<TFindings = unknown> {
206
206
  * scenarios) into a merged candidate. Drivers doing pure single-parent
207
207
  * reflection may ignore it. See {@link ParetoParent}. */
208
208
  paretoParents?: ParetoParent[];
209
+ /** FIREWALL (non-negotiable): the held-out judge is write-only — its verdicts
210
+ * score the chosen output and gate promotion, and are NEVER an input to
211
+ * proposal/steering (else the optimizer games the acceptance axis = an
212
+ * oracle). This `never`-typed field makes that a compile-time tripwire: a
213
+ * driver that tries to thread judge verdicts into the proposal will not type-check.
214
+ * Steering may consume TRACE-OBSERVABLE signals (what the agent did) via
215
+ * `findings`/`report`; it may NOT consume the judge's held-out verdict. */
216
+ judgeScores?: never;
209
217
  }
210
218
  /** @experimental A surface-improvement strategy — the DRIVER of the
211
219
  * improvement loop. Given the current best surface, the history of what's
@@ -1,7 +1,7 @@
1
1
  import { W as WorkflowTopology } from '../harness-optimizer-EnEnQPsr.js';
2
- import { b as RunSplitTag, a as RunTokenUsage, R as RunRecord } from '../run-record-BgTFzO2r.js';
3
- import { c as AnalystFinding, h as AnalystSeverity, E as EvidenceRef } from '../types-CRD68aH7.js';
4
- import { F as FailureClusterInsight } from '../insight-report-Df3lxYXM.js';
2
+ import { b as RunSplitTag, a as RunTokenUsage, R as RunRecord } from '../run-record-sItO5ftF.js';
3
+ import { c as AnalystFinding, h as AnalystSeverity, E as EvidenceRef } from '../types-DRvV0zRo.js';
4
+ import { F as FailureClusterInsight } from '../insight-report-dlpEzQDi.js';
5
5
  import { a as VerificationReport, L as LayerResult } from '../multi-layer-verifier-DlWCXuxL.js';
6
6
  import { F as FailureClusterReport } from '../failure-cluster-CL7IVgkJ.js';
7
7
  import { R as RedactionRule, a as RedactionReport } from '../redact-B40YG2M_.js';
@@ -18,7 +18,7 @@ import '../types-Croy5h7V.js';
18
18
  import '@tangle-network/tcloud';
19
19
  import '../llm-client-DbjLfz-K.js';
20
20
  import '../raw-provider-sink-C46HDghv.js';
21
- import '../summary-report-ByiOUrHj.js';
21
+ import '../summary-report-BTaXq1TS.js';
22
22
  import '../judge-calibration-DilmB3Ml.js';
23
23
  import '../control-runtime-DuFBYg7A.js';
24
24
  import '../emitter-DEZwY14K.js';
@@ -7,7 +7,7 @@ import {
7
7
  } from "../chunk-GGE4NNQT.js";
8
8
  import {
9
9
  validateRunRecord
10
- } from "../chunk-F3SRAAZO.js";
10
+ } from "../chunk-KWRRMR3J.js";
11
11
  import "../chunk-VSMTAMNK.js";
12
12
  import {
13
13
  ValidationError
@@ -152,7 +152,7 @@ async function runAutoResearchLoop(opts: {
152
152
  Two cases:
153
153
 
154
154
  1. **Trajectory-shaped optimization with steering.** Use
155
- `runMultiShotOptimization` directly — it already runs the inner
155
+ `runImprovementLoop` directly — it already runs the inner
156
156
  search-vs-holdout loop. Wrap with `analyzeOptimizationResult` after
157
157
  for the RL bridge.
158
158
 
@@ -33,8 +33,8 @@ trying, and whether a change made them better or worse.
33
33
  | “Human feedback should become reusable eval data.” | `FeedbackTrajectory` | Captures approvals, rejections, edits, choices, metrics, and policy blocks. |
34
34
  | “Can this action run, or does it need approval?” | `evaluateActionPolicy` | Generic preflight for side effects, budgets, and required evidence. |
35
35
  | “I need train/dev/test/holdout examples.” | `Dataset` plus feedback trajectory conversion | Stable splits and contamination control. |
36
- | “Which prompt or signature wins?” | `runMultiShotOptimization`, steering optimizers | Runs variants on scenarios and compares scores. |
37
- | “Improve a multi-turn agent over real task traces.” | `runMultiShotOptimization` | GEPA-style trajectory optimization with ASI and held-out promotion. |
36
+ | “Which prompt or signature wins?” | `runImprovementLoop`, steering optimizers | Runs variants on scenarios and compares scores. |
37
+ | “Improve a multi-turn agent over real task traces.” | `runImprovementLoop` | GEPA-style trajectory optimization with ASI and held-out promotion. |
38
38
  | “Improve prompts, then code if prompts plateau.” | `runPromptEvolution`, composite mutator, code mutator | Bounded evolution with telemetry and lineage. |
39
39
  | “Find why a regression happened.” | bisector, traces, run records | Narrows changes and preserves evidence. |
40
40
  | “Expose evals to another language.” | Wire protocol and Python client | HTTP/RPC boundary for non-TypeScript apps. |
@@ -105,7 +105,7 @@ generated code -> build/test/runtime gates -> score -> ship or revise
105
105
 
106
106
  Use when you want Ax/GEPA-style improvement.
107
107
 
108
- 1. For variable-length agent tasks, use `runMultiShotOptimization`.
108
+ 1. For variable-length agent tasks, use `runImprovementLoop`.
109
109
  2. Build search/dev/test/holdout splits from the real product loop.
110
110
  3. Score full trajectories, not just final text.
111
111
  4. Emit actionable side information for failures the mutator can fix.
@@ -156,7 +156,7 @@ Store as `FeedbackTrajectory`, then derive:
156
156
  | Feedback data | `FeedbackTrajectory`, stores, converters | Human/environment labels | Domain adapters live in downstream repos. |
157
157
  | Action policy | `evaluateActionPolicy` | Approval/budget preflight | Blocks or labels actions before `act()`. |
158
158
  | Datasets | `Dataset`, holdout tools, canaries | Train/dev/test/holdout corpora | Keeps optimization honest. |
159
- | Optimization | `runMultiShotOptimization`, steering optimizers | Prompt/signature comparison | Use held-out gates before promotion. |
159
+ | Optimization | `runImprovementLoop`, steering optimizers | Prompt/signature comparison | Use held-out gates before promotion. |
160
160
  | Evolution | prompt/code mutators, sandbox pool, telemetry | Autoresearch and mutation loops | Use budgets and lineage; do not run unbounded. |
161
161
  | Telemetry | `TraceStore`, OTLP, file sinks | Audit and replay | Treat traces as evidence, not just logs. |
162
162
  | Reporting | summaries, pareto, cost tracker | Decision support | Useful for PRs, launch gates, research notes. |
@@ -1,129 +1,75 @@
1
1
  # Multi-Shot Optimization
2
2
 
3
- `runMultiShotOptimization` is the public adapter for GEPA-style optimization over
4
- variable-length agent conversations.
3
+ > **Renamed.** `runMultiShotOptimization` was retired. The live API is
4
+ > `runImprovementLoop` (driver-agnostic, gated promotion) driven by `gepaDriver`,
5
+ > with `compareDrivers` for head-to-head driver lift. This doc was rewritten to the
6
+ > live API; see also [feature-guide.md](./feature-guide.md) and [concepts.md](./concepts.md).
5
7
 
6
- Use it when the thing you want to improve is not a single model call. Typical
7
- targets are agent system prompts, tool descriptions, routing policies, retrieval
8
- plans, or app-specific scaffolding that affects an entire task trajectory.
8
+ `runImprovementLoop` is the public entry for GEPA-style optimization over a whole
9
+ task trajectory — the thing you improve is not a single model call but an agent
10
+ system prompt, tool descriptions, a routing policy, or any scaffolding that affects
11
+ the entire run. It is the OUTER loop: it improves the SURFACE the inner workers run.
9
12
 
10
- The primitive is intentionally small. Your app owns the domain logic:
13
+ ## The shape
11
14
 
12
- - `seedVariants`: prompt/config/tool-policy candidates
13
- - `runner`: executes one complete task trajectory for one variant
14
- - `scorer`: scores the trajectory and emits actionable side information
15
- - `mutateAdapter`: proposes new variants from top and bottom trials
15
+ You own a few seams; the loop owns the release-critical glue (paired seeds, the
16
+ held-out re-score, the promotion gate, provenance):
16
17
 
17
- `agent-eval` owns the release-critical glue:
18
+ - **`baselineSurface`** — the current surface (a prompt string, or a `CodeSurface`).
19
+ - **`dispatchWithSurface(surface, scenario, ctx)`** — run one task to completion
20
+ under a candidate surface; return the artifact the judges score.
21
+ - **`judges`** — score the artifact (`{ composite, dimensions }`).
22
+ - **`driver`** — proposes candidate surfaces each generation: `gepaDriver`
23
+ (reflective + Pareto frontier) or `evolutionaryDriver` (mutator).
24
+ - **`gate`** — `defaultProductionGate` (held-out significance + red-team +
25
+ reward-hacking + canary). Ships ONLY on a CI-lower-bound held-out lift.
18
26
 
19
- - stable paired seeds
20
- - search-split prompt evolution
21
- - cost/score Pareto objectives
22
- - failed-run conversion into failed trials
23
- - ASI projection into reflection traces and numeric metrics
24
- - optional paired holdout gating through `HeldOutGate`
25
- - validated `RunRecord` rows for promotion evidence
26
-
27
- ## Result Contract
28
-
29
- The return shape separates discovery from promotion:
30
-
31
- - `searchBestVariant`: best variant on the optimizer-visible search scenarios
32
- - `searchBestAggregate`: aggregate for that search winner
33
- - `promotedVariant`: variant callers should ship
34
- - `promotedAggregate`: aggregate for the promoted variant
35
- - `gate`: holdout decision and evidence, or `null` when no gate ran
36
-
37
- If a holdout gate is configured and rejects the search winner,
38
- `promotedVariant` is the baseline. Do not ship `searchBestVariant` directly
39
- unless you intentionally run without a holdout gate.
40
-
41
- ## Actionable Side Information
42
-
43
- The scorer should return `asi` rows for concrete failure modes:
44
-
45
- ```ts
46
- {
47
- expectationId: 'used-primary-sources',
48
- message: 'The final answer cited secondary summaries instead of primary sources.',
49
- severity: 'error',
50
- responsibleSurface: 'retrieval-policy',
51
- suggestion: 'Prefer primary-source domains during source-gathering turns.',
52
- }
53
- ```
54
-
55
- Standard knowledge-related responsible surfaces are:
56
-
57
- - `knowledge-requirements`
58
- - `data-acquisition`
59
- - `retrieval-policy`
60
- - `user-question-policy`
61
-
62
- These rows become:
63
-
64
- - reflection expectations via `trialTraceFromMultiShotTrial`
65
- - aggregate metrics like `asi.error` and `surface.retrieval-policy`
66
- - trace evidence available to downstream reports
67
-
68
- This is the main reason to use this primitive instead of reducing each run to a
69
- single scalar reward.
70
-
71
- ## Holdout Discipline
72
-
73
- For release gates, configure `gate`. The first seed variant is the baseline and
74
- `gate.gate.baselineKey` must match its id.
75
-
76
- Holdout scenarios must be disjoint from `searchScenarioIds`. The adapter runs
77
- baseline and candidate with the same `(scenarioId, rep)` seed, validates every
78
- row with `validateRunRecord`, then asks `HeldOutGate` whether to promote.
79
-
80
- When `gate.searchScenarioIds` is omitted, the adapter reuses
81
- `searchScenarioIds` for the overfit-gap check.
82
-
83
- ## Minimal Shape
27
+ ## Minimal example
84
28
 
85
29
  ```ts
86
30
  import {
87
- runMultiShotOptimization,
88
- trialTraceFromMultiShotTrial,
89
- type MultiShotVariant,
90
- } from '@tangle-network/agent-eval'
91
-
92
- type Payload = { systemPrompt: string }
93
-
94
- const baseline: MultiShotVariant<Payload> = {
95
- id: 'baseline',
96
- label: 'baseline',
97
- generation: 0,
98
- payload: { systemPrompt: currentPrompt },
99
- }
100
-
101
- const result = await runMultiShotOptimization<Payload>({
102
- runId: `research-agent-${Date.now()}`,
103
- target: 'research-agent-system-prompt',
104
- seedVariants: [baseline],
105
- searchScenarioIds: searchScenarios.map((s) => s.id),
106
- reps: 2,
107
- generations: 4,
31
+ runImprovementLoop,
32
+ gepaDriver,
33
+ defaultProductionGate,
34
+ } from '@tangle-network/agent-eval/contract'
35
+
36
+ const result = await runImprovementLoop({
37
+ baselineSurface: currentSystemPrompt,
38
+ scenarios: trainScenarios, // optimizer-visible
39
+ holdoutScenarios, // DISJOINT — only the gate sees these
40
+ dispatchWithSurface: async (surface, scenario) =>
41
+ runYourAgentToCompletion({ scenario, prompt: String(surface) }),
42
+ judges: [myJudge],
43
+ driver: gepaDriver({
44
+ llm: { apiKey, baseUrl },
45
+ model: 'gpt-5',
46
+ target: 'enforce a strict output schema',
47
+ }),
108
48
  populationSize: 4,
109
- scoreConcurrency: 4,
110
- runner: {
111
- async run({ variant, scenarioId, seed }) {
112
- return runYourAgentToCompletion({ scenarioId, seed, prompt: variant.payload.systemPrompt })
113
- },
114
- },
115
- scorer: {
116
- async score({ run }) {
117
- return scoreFullTrajectory(run.trace)
118
- },
119
- },
120
- mutateAdapter: {
121
- async mutate({ parent, bottomTrials, childCount, generation }) {
122
- const traces = bottomTrials.map((t) => trialTraceFromMultiShotTrial(t))
123
- return proposePromptMutations({ parent, traces, childCount, generation })
124
- },
125
- },
49
+ maxGenerations: 4,
50
+ gate: defaultProductionGate({ holdoutScenarios, deltaThreshold: 0 }),
51
+ autoOnPromote: 'none', // or 'pr' (+ ghOwner/ghRepo) to open a PR on ship
52
+ runDir,
126
53
  })
127
54
 
128
- deploy(result.promotedVariant.payload)
55
+ if (result.gateResult.decision === 'ship') {
56
+ deploy(result.winnerSurface) // the driver's proposal, gated on a real held-out lift
57
+ }
129
58
  ```
59
+
60
+ ## Discipline (what makes it trustworthy)
61
+
62
+ - **Holdout is disjoint + gated.** `holdoutScenarios` must not overlap the training
63
+ pool. The gate re-scores baseline vs winner on the holdout and ships only when the
64
+ paired-bootstrap CI lower bound clears `deltaThreshold`; a few-instance swing at
65
+ thin `n` is held (`few_runs`), not promoted.
66
+ - **No-op never ships.** If no candidate beats the baseline, the winner IS the
67
+ baseline (empty diff) and the loop forces `hold` — it does not score
68
+ baseline-vs-itself and read model noise as lift.
69
+ - **Provenance falls out.** `result.promotedDiff` + `emitLoopProvenance` give the
70
+ auditable candidate→gate→promote chain (rationale, content hashes, a held-out lift
71
+ recomputable from the emitted record).
72
+
73
+ Reach for `compareDrivers` when the question is "which DRIVER wins" rather than
74
+ "improve this surface", and see `tests/campaign/presets.test.ts` for the executable
75
+ contract (no-op guard, fail-loud holdout, gate promotion).
@@ -152,7 +152,7 @@ set with a signed note.
152
152
 
153
153
  ## Optimization
154
154
 
155
- Use `runMultiShotOptimization()` when the system is a multi-step agent, not a
155
+ Use `runImprovementLoop()` when the system is a multi-step agent, not a
156
156
  single prompt.
157
157
 
158
158
  Good optimization targets:
@@ -136,7 +136,7 @@ report, RL bridge).
136
136
 
137
137
  | From → To | Type | What it carries |
138
138
  |---|---|---|
139
- | agent-knowledge → agent-eval | `RunRecord` | (consumed via `runMultiShotOptimization` for knowledge-base optimization) |
139
+ | agent-knowledge → agent-eval | `RunRecord` | (consumed via `runImprovementLoop` for knowledge-base optimization) |
140
140
  | agent-knowledge → agent-eval | `KnowledgeReadinessReport`, `KnowledgeBundle`, `KnowledgeRequirement` | (re-exported from agent-eval; agent-knowledge populates) |
141
141
  | agent-knowledge → agent-eval | `ControlRuntimeConfig<KnowledgeBaseCandidate>` | (knowledge research adapter) |
142
142
  | agent-runtime → agent-eval | `runAgentControlLoop`, `scoreKnowledgeReadiness`, `blockingKnowledgeEval` | (consumed; agent-runtime calls these in its task lifecycle) |
@@ -44,6 +44,25 @@ console.log(result.findings)
44
44
  Products can pass any `TraceAnalysisStore`; they do not need to use the file
45
45
  store in production.
46
46
 
47
+ ## Deterministic failure coverage (no LLM)
48
+
49
+ Before (or alongside) the LLM analyst, `OtlpFileTraceStore.getOverview()` returns a
50
+ `DatasetOverview` whose `error_clusters` are computed deterministically — error
51
+ spans are grouped by a normalized failure signature (uuids / hex ids / numbers /
52
+ absolute paths / durations collapsed), each cluster carrying its prevalence,
53
+ exemplar `trace_id`/`span_id`, and a verbatim sample. This is a zero-LLM,
54
+ reproducible failure checklist the analyst then explains and closes:
55
+
56
+ ```ts
57
+ const overview = await store.getOverview()
58
+ for (const c of overview.error_clusters) {
59
+ console.log(`${c.trace_count}× ${c.signature} — e.g. trace ${c.exemplar_trace_ids[0]}`)
60
+ }
61
+ ```
62
+
63
+ See `failureClusters` in [insight-report.md](./insight-report.md) and the
64
+ `ErrorCluster` type doc-comments for the field-level contract.
65
+
47
66
  ## Required Trace Shape
48
67
 
49
68
  Every serious product run should include:
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tangle-network/agent-eval",
3
- "version": "0.77.0",
3
+ "version": "0.79.0",
4
4
  "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
5
5
  "homepage": "https://github.com/tangle-network/agent-eval#readme",
6
6
  "repository": {