@tangle-network/agent-eval 0.27.0 → 0.27.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. package/CHANGELOG.md +72 -0
  2. package/README.md +4 -5
  3. package/dist/builder-eval/index.js +1 -1
  4. package/dist/{chunk-WWYCWKUM.js → chunk-3CKU6VGU.js} +2 -2
  5. package/dist/{chunk-K2TPS5LB.js → chunk-4U4BKCXK.js} +2 -2
  6. package/dist/chunk-4U4BKCXK.js.map +1 -0
  7. package/dist/{chunk-2A5XJB43.js → chunk-5AKPEK5L.js} +3 -3
  8. package/dist/chunk-5AKPEK5L.js.map +1 -0
  9. package/dist/{chunk-RAF443UI.js → chunk-DBIGN5MJ.js} +2 -2
  10. package/dist/{chunk-JLZQWFV3.js → chunk-K33INZHH.js} +2 -2
  11. package/dist/chunk-K33INZHH.js.map +1 -0
  12. package/dist/{chunk-NU65VQ7M.js → chunk-MAZ26DC7.js} +1 -1
  13. package/dist/chunk-MAZ26DC7.js.map +1 -0
  14. package/dist/{chunk-LSH4MMOZ.js → chunk-NCRFYPS3.js} +1 -1
  15. package/dist/chunk-NCRFYPS3.js.map +1 -0
  16. package/dist/{chunk-ZN274SWR.js → chunk-PALJO75S.js} +2 -2
  17. package/dist/{chunk-OWLAAMME.js → chunk-QHF6EQKK.js} +3 -2
  18. package/dist/chunk-QHF6EQKK.js.map +1 -0
  19. package/dist/chunk-R5UQJNKC.js +722 -0
  20. package/dist/chunk-R5UQJNKC.js.map +1 -0
  21. package/dist/{chunk-SESZDQPX.js → chunk-RUI6SIHY.js} +3 -3
  22. package/dist/chunk-RUI6SIHY.js.map +1 -0
  23. package/dist/{chunk-WHZMVFUV.js → chunk-SZSBQUIJ.js} +2 -2
  24. package/dist/chunk-SZSBQUIJ.js.map +1 -0
  25. package/dist/{chunk-4F5DQN55.js → chunk-VSMTAMNK.js} +1 -1
  26. package/dist/chunk-VSMTAMNK.js.map +1 -0
  27. package/dist/{chunk-5LBB5B3Z.js → chunk-XFZCM5Z3.js} +1 -1
  28. package/dist/chunk-XFZCM5Z3.js.map +1 -0
  29. package/dist/cli.js +1 -1
  30. package/dist/{control-CBShYYA6.d.ts → control-BT4qnXiS.d.ts} +2 -2
  31. package/dist/{control-runtime-BuJHoLg0.d.ts → control-runtime-BZ_lVLYW.d.ts} +1 -0
  32. package/dist/control.d.ts +3 -3
  33. package/dist/control.js +2 -2
  34. package/dist/{failure-cluster-C2EGSDiT.d.ts → failure-cluster-Cw65_5FY.d.ts} +1 -2
  35. package/dist/{feedback-trajectory-DfFdrraJ.d.ts → feedback-trajectory-D1aGKusy.d.ts} +1 -1
  36. package/dist/governance/index.d.ts +1 -1
  37. package/dist/{index-D3iBCjdF.d.ts → index-BhLlu-qO.d.ts} +1 -1
  38. package/dist/index.d.ts +157 -167
  39. package/dist/index.js +25 -335
  40. package/dist/index.js.map +1 -1
  41. package/dist/knowledge/index.d.ts +1 -1
  42. package/dist/knowledge/index.js +2 -2
  43. package/dist/{multi-layer-verifier-LkP3LVKj.d.ts → multi-layer-verifier-U-c8ge1k.d.ts} +1 -1
  44. package/dist/openapi.json +1 -1
  45. package/dist/optimization.d.ts +5 -5
  46. package/dist/optimization.js +5 -5
  47. package/dist/pipelines/index.d.ts +1 -1
  48. package/dist/pipelines/index.js +2 -2
  49. package/dist/{release-report-wfUySN5F.d.ts → release-report-CCQqnK46.d.ts} +1 -1
  50. package/dist/{replay-BL96gCEP.d.ts → replay-D7z0J43-.d.ts} +4 -5
  51. package/dist/reporting.d.ts +4 -4
  52. package/dist/reporting.js +5 -5
  53. package/dist/{researcher-bGkI7vCl.d.ts → researcher-G81CWc0q.d.ts} +9 -10
  54. package/dist/rl.d.ts +26 -44
  55. package/dist/rl.js +5 -5
  56. package/dist/rl.js.map +1 -1
  57. package/dist/{sequential-Dgz1n51-.d.ts → sequential-5iSVfzl2.d.ts} +2 -2
  58. package/dist/{summary-report-DZVXOCK_.d.ts → summary-report-Dl4akLKX.d.ts} +5 -5
  59. package/dist/traces.d.ts +1 -1
  60. package/dist/traces.js +2 -2
  61. package/dist/wire/index.d.ts +2 -2
  62. package/dist/wire/index.js +1 -1
  63. package/docs/research-report-methodology.md +4 -4
  64. package/docs/three-package-architecture.md +12 -24
  65. package/package.json +1 -1
  66. package/dist/chunk-2A5XJB43.js.map +0 -1
  67. package/dist/chunk-4F5DQN55.js.map +0 -1
  68. package/dist/chunk-5LBB5B3Z.js.map +0 -1
  69. package/dist/chunk-I4MBDTY5.js +0 -272
  70. package/dist/chunk-I4MBDTY5.js.map +0 -1
  71. package/dist/chunk-JLZQWFV3.js.map +0 -1
  72. package/dist/chunk-K2TPS5LB.js.map +0 -1
  73. package/dist/chunk-LSH4MMOZ.js.map +0 -1
  74. package/dist/chunk-NU65VQ7M.js.map +0 -1
  75. package/dist/chunk-OWLAAMME.js.map +0 -1
  76. package/dist/chunk-SESZDQPX.js.map +0 -1
  77. package/dist/chunk-WHZMVFUV.js.map +0 -1
  78. /package/dist/{chunk-WWYCWKUM.js.map → chunk-3CKU6VGU.js.map} +0 -0
  79. /package/dist/{chunk-RAF443UI.js.map → chunk-DBIGN5MJ.js.map} +0 -0
  80. /package/dist/{chunk-ZN274SWR.js.map → chunk-PALJO75S.js.map} +0 -0
@@ -1,4 +1,4 @@
1
- import { j as ControlSeverity, C as ControlEvalResult } from '../control-runtime-BuJHoLg0.js';
1
+ import { j as ControlSeverity, C as ControlEvalResult } from '../control-runtime-BZ_lVLYW.js';
2
2
  import { T as TraceEmitter } from '../emitter-DP_cSSiw.js';
3
3
  import '../store-Db2Bv8Cf.js';
4
4
 
@@ -4,8 +4,8 @@ import {
4
4
  knowledgeReadinessTracePayload,
5
5
  scoreKnowledgeReadiness,
6
6
  userQuestionsForKnowledgeGaps
7
- } from "../chunk-WWYCWKUM.js";
8
- import "../chunk-LSH4MMOZ.js";
7
+ } from "../chunk-3CKU6VGU.js";
8
+ import "../chunk-NCRFYPS3.js";
9
9
  import "../chunk-TVVP3ZZQ.js";
10
10
  import "../chunk-PZ5AY32C.js";
11
11
  export {
@@ -51,7 +51,7 @@ interface LayerResult {
51
51
  * diagnostic name; null = "diagnostic not applicable / not measured."
52
52
  * Renderers that know the keys can display them; ones that don't,
53
53
  * ignore. Free-form on purpose — consumers type the value shape in
54
- * their own namespace. Added in 0.10.
54
+ * their own namespace.
55
55
  */
56
56
  diagnostics?: Record<string, number | null>;
57
57
  /** Any rich per-layer detail — rendered as-is by consumers that know the layer. */
package/dist/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "@tangle-network/agent-eval — wire protocol",
5
- "version": "0.27.0",
5
+ "version": "0.27.2",
6
6
  "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
7
7
  "contact": {
8
8
  "name": "Tangle Network",
@@ -1,11 +1,11 @@
1
- export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-bGkI7vCl.js';
2
- export { F as FeedbackArtifactType, a as FeedbackAttempt, b as FeedbackLabel, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, l as FeedbackTrajectory, m as FeedbackTrajectoryFilter, n as FeedbackTrajectoryStore, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-DfFdrraJ.js';
3
- export { A as ActionableSideInfo, a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, G as GenerationReport, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, o as MutateAdapter, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, R as ReflectionContext, r as ReflectionProposal, S as ScenarioAggregate, s as ScoreAdapter, T as TrialCache, t as TrialResult, u as TrialTrace, V as VariantAggregate, v as buildReflectionPrompt, w as defaultMultiShotObjectives, x as parseReflectionResponse, y as runMultiShotOptimization, z as runPromptEvolution, B as trialTraceFromMultiShotTrial } from './summary-report-DZVXOCK_.js';
1
+ export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-G81CWc0q.js';
2
+ export { F as FeedbackArtifactType, a as FeedbackAttempt, b as FeedbackLabel, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, l as FeedbackTrajectory, m as FeedbackTrajectoryFilter, n as FeedbackTrajectoryStore, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-D1aGKusy.js';
3
+ export { A as ActionableSideInfo, a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, G as GenerationReport, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, o as MutateAdapter, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, R as ReflectionContext, r as ReflectionProposal, S as ScenarioAggregate, s as ScoreAdapter, T as TrialCache, t as TrialResult, u as TrialTrace, V as VariantAggregate, v as buildReflectionPrompt, w as defaultMultiShotObjectives, x as parseReflectionResponse, y as runMultiShotOptimization, z as runPromptEvolution, B as trialTraceFromMultiShotTrial } from './summary-report-Dl4akLKX.js';
4
4
  import './errors-BZ9sTdz7.js';
5
5
  import './integrity-DK2EBVZC.js';
6
6
  import './store-Db2Bv8Cf.js';
7
7
  import './run-record-CqzahIbx.js';
8
8
  import './emitter-DP_cSSiw.js';
9
- import './control-runtime-BuJHoLg0.js';
9
+ import './control-runtime-BZ_lVLYW.js';
10
10
  import './dataset-CiK_3LDr.js';
11
- import './failure-cluster-C2EGSDiT.js';
11
+ import './failure-cluster-Cw65_5FY.js';
@@ -25,18 +25,18 @@ import {
25
25
  summarizePreferenceMemory,
26
26
  trialTraceFromMultiShotTrial,
27
27
  withAssignedFeedbackSplit
28
- } from "./chunk-WHZMVFUV.js";
28
+ } from "./chunk-SZSBQUIJ.js";
29
29
  import "./chunk-NLMNWKVM.js";
30
30
  import {
31
31
  runEvalCampaign
32
- } from "./chunk-SESZDQPX.js";
32
+ } from "./chunk-RUI6SIHY.js";
33
33
  import "./chunk-4S4BM3QQ.js";
34
- import "./chunk-2A5XJB43.js";
35
- import "./chunk-I4MBDTY5.js";
34
+ import "./chunk-5AKPEK5L.js";
35
+ import "./chunk-R5UQJNKC.js";
36
36
  import "./chunk-KTGTIOFD.js";
37
37
  import "./chunk-PC4UYEBM.js";
38
38
  import "./chunk-TVVP3ZZQ.js";
39
- import "./chunk-4F5DQN55.js";
39
+ import "./chunk-VSMTAMNK.js";
40
40
  import "./chunk-NG236HPC.js";
41
41
  import "./chunk-PZ5AY32C.js";
42
42
  export {
@@ -1,5 +1,5 @@
1
1
  import { g as BudgetSpec, T as TraceStore, h as RunFilter, R as Run, a as ToolSpan } from '../store-Db2Bv8Cf.js';
2
- export { a as FailureCluster, F as FailureClusterReport, f as failureClusterView } from '../failure-cluster-C2EGSDiT.js';
2
+ export { a as FailureCluster, F as FailureClusterReport, f as failureClusterView } from '../failure-cluster-Cw65_5FY.js';
3
3
  import { a as TrajectoryStep } from '../trajectory-CnoBo-JY.js';
4
4
  import { B as BaselineOptions, a as BaselineReport } from '../baseline-4R5deP0N.js';
5
5
  export { c as computeToolUseMetrics } from '../baseline-4R5deP0N.js';
@@ -2,13 +2,13 @@ import {
2
2
  compareToBaseline,
3
3
  computeToolUseMetrics,
4
4
  failureClusterView
5
- } from "../chunk-JLZQWFV3.js";
5
+ } from "../chunk-K33INZHH.js";
6
6
  import {
7
7
  buildTrajectory
8
8
  } from "../chunk-RZTMDUO7.js";
9
9
  import {
10
10
  interRaterReliability
11
- } from "../chunk-I4MBDTY5.js";
11
+ } from "../chunk-R5UQJNKC.js";
12
12
  import {
13
13
  aggregateLlm,
14
14
  argHash,
@@ -1,5 +1,5 @@
1
1
  import { D as DatasetSplit, b as DatasetManifest, a as DatasetScenario } from './dataset-CiK_3LDr.js';
2
- import { a3 as GateDecision, A as ActionableSideInfo, m as MultiShotTrialResult } from './summary-report-DZVXOCK_.js';
2
+ import { a3 as GateDecision, A as ActionableSideInfo, m as MultiShotTrialResult } from './summary-report-Dl4akLKX.js';
3
3
  import { R as RunRecord, a as RunSplitTag } from './run-record-CqzahIbx.js';
4
4
 
5
5
  /**
@@ -107,11 +107,10 @@ declare function redactValue(value: unknown, rules?: RedactionRule[], report?: R
107
107
  * Replay-from-raw-events — turn every captured campaign run into a
108
108
  * re-runnable artifact.
109
109
  *
110
- * The premise: 0.21 made `RawProviderSink` capture every provider HTTP
111
- * envelope. 0.22's `runEvalCampaign` makes capture the default. Together
112
- * they mean every past run is a complete fingerprint of what happened on
113
- * the wire and that fingerprint is enough to replay the run without
114
- * burning new LLM cost.
110
+ * `RawProviderSink` captures every provider HTTP envelope; `runEvalCampaign`
111
+ * makes that capture the default. Together they make every past run a
112
+ * complete fingerprint of what happened on the wire — enough to replay
113
+ * the run without burning new LLM cost.
115
114
  *
116
115
  * Three use cases this primitive enables:
117
116
  *
@@ -1,10 +1,10 @@
1
1
  export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-C0uDYwG6.js';
2
- export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as bhAdjust, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as pairedWilcoxon, s as releaseTraceEvidenceFromMultiShotTrials, t as renderReleaseReport } from './release-report-wfUySN5F.js';
3
- export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-Dgz1n51-.js';
4
- export { C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, J as ParetoFigureSpec, K as ParetoPoint, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, $ as gainHistogram, a0 as paretoChart, a1 as researchReport, a2 as summaryTable } from './summary-report-DZVXOCK_.js';
2
+ export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as bhAdjust, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as pairedWilcoxon, s as releaseTraceEvidenceFromMultiShotTrials, t as renderReleaseReport } from './release-report-CCQqnK46.js';
3
+ export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
4
+ export { C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, J as ParetoFigureSpec, K as ParetoPoint, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, $ as gainHistogram, a0 as paretoChart, a1 as researchReport, a2 as summaryTable } from './summary-report-Dl4akLKX.js';
5
5
  import './run-record-CqzahIbx.js';
6
6
  import './errors-BZ9sTdz7.js';
7
7
  import './outcome-store-D6KWmYvj.js';
8
8
  import './dataset-CiK_3LDr.js';
9
- import './failure-cluster-C2EGSDiT.js';
9
+ import './failure-cluster-Cw65_5FY.js';
10
10
  import './store-Db2Bv8Cf.js';
package/dist/reporting.js CHANGED
@@ -5,14 +5,14 @@ import {
5
5
  judgeReplayGate,
6
6
  releaseTraceEvidenceFromMultiShotTrials,
7
7
  renderReleaseReport
8
- } from "./chunk-RAF443UI.js";
8
+ } from "./chunk-DBIGN5MJ.js";
9
9
  import {
10
10
  rubricPredictiveValidity
11
11
  } from "./chunk-YRZ4M5GS.js";
12
12
  import {
13
13
  evaluateInterimReleaseConfidence,
14
14
  pairedEvalueSequence
15
- } from "./chunk-NU65VQ7M.js";
15
+ } from "./chunk-MAZ26DC7.js";
16
16
  import {
17
17
  RESEARCH_REPORT_HARD_PAIR_FLOOR,
18
18
  bhAdjust,
@@ -22,9 +22,9 @@ import {
22
22
  paretoChart,
23
23
  researchReport,
24
24
  summaryTable
25
- } from "./chunk-2A5XJB43.js";
26
- import "./chunk-I4MBDTY5.js";
27
- import "./chunk-4F5DQN55.js";
25
+ } from "./chunk-5AKPEK5L.js";
26
+ import "./chunk-R5UQJNKC.js";
27
+ import "./chunk-VSMTAMNK.js";
28
28
  import "./chunk-NG236HPC.js";
29
29
  import "./chunk-PZ5AY32C.js";
30
30
  export {
@@ -1,7 +1,7 @@
1
1
  import { A as AgentEvalError, C as CaptureIntegrityError } from './errors-BZ9sTdz7.js';
2
2
  import { R as RawProviderSink, P as ProviderRedactor, a as RunIntegrityExpectations, b as RunIntegrityReport } from './integrity-DK2EBVZC.js';
3
3
  import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, R as RunRecord } from './run-record-CqzahIbx.js';
4
- import { W as ResearchReportOptions, N as ResearchReport, a3 as GateDecision } from './summary-report-DZVXOCK_.js';
4
+ import { W as ResearchReportOptions, N as ResearchReport, a3 as GateDecision } from './summary-report-Dl4akLKX.js';
5
5
  import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DP_cSSiw.js';
6
6
  import { T as TraceStore } from './store-Db2Bv8Cf.js';
7
7
 
@@ -224,16 +224,15 @@ declare class LlmClient {
224
224
  * EvalCampaign — opinionated matrix runner that wires the four
225
225
  * capture-integrity directives by construction.
226
226
  *
227
- * Every consumer that ran a launch-grade benchmark before 0.22 reinvented
228
- * the same shape: matrix runner → for each (variant, scenario, seed) →
229
- * start a TraceEmitter → call LLMs → end the run → maybe analyze.
230
- * The bug class blueprint-agent reported (raw events not captured, route
231
- * silently wrong, integrity not asserted, analyst never ran) lives at the
232
- * integration boundary — not the agent-eval API surface. The four
233
- * directives in `SKILL.md § Capture integrity` are mitigations.
227
+ * The canonical benchmark shape — matrix runner → for each
228
+ * (variant, scenario, seed) → start a TraceEmitter → call LLMs → end the
229
+ * run → analyze — has a bug class at the integration boundary: raw
230
+ * events not captured, route silently wrong, integrity not asserted,
231
+ * analyst never run. The directives in `SKILL.md § Capture integrity`
232
+ * are the mitigations.
234
233
  *
235
- * `EvalCampaign` is the structural fix. Consumers don't wire the integrity
236
- * surface anymore; the campaign owns it. Specifically, the campaign:
234
+ * `EvalCampaign` is the structural fix — consumers don't wire the
235
+ * integrity surface themselves; the campaign owns it. Specifically:
237
236
  *
238
237
  * - calls `assertLlmRoute` once at preflight before any work runs
239
238
  * - constructs a per-run `TraceStore` and `RawProviderSink` via factories
package/dist/rl.d.ts CHANGED
@@ -1,14 +1,14 @@
1
1
  import { R as RunRecord, a as RunSplitTag } from './run-record-CqzahIbx.js';
2
- import { V as VerificationReport } from './multi-layer-verifier-LkP3LVKj.js';
3
- import { t as TrialResult, V as VariantAggregate, q as PromptEvolutionResult, e as MultiShotOptimizationResult } from './summary-report-DZVXOCK_.js';
2
+ import { V as VerificationReport } from './multi-layer-verifier-U-c8ge1k.js';
3
+ import { t as TrialResult, V as VariantAggregate, q as PromptEvolutionResult, e as MultiShotOptimizationResult } from './summary-report-Dl4akLKX.js';
4
4
  import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
5
5
  import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-C0uDYwG6.js';
6
- import { I as InterimReleaseConfidence } from './sequential-Dgz1n51-.js';
6
+ import { I as InterimReleaseConfidence } from './sequential-5iSVfzl2.js';
7
7
  import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
8
- import { R as Researcher, l as FailureMode, S as SteeringChange, j as ExperimentPlan, k as ExperimentResult, i as EvalCampaignResult, E as EvalCampaignOptions } from './researcher-bGkI7vCl.js';
9
- export { r as runEvalCampaign } from './researcher-bGkI7vCl.js';
8
+ import { R as Researcher, l as FailureMode, S as SteeringChange, j as ExperimentPlan, k as ExperimentResult, i as EvalCampaignResult, E as EvalCampaignOptions } from './researcher-G81CWc0q.js';
9
+ export { r as runEvalCampaign } from './researcher-G81CWc0q.js';
10
10
  import './errors-BZ9sTdz7.js';
11
- import './failure-cluster-C2EGSDiT.js';
11
+ import './failure-cluster-Cw65_5FY.js';
12
12
  import './integrity-DK2EBVZC.js';
13
13
  import './emitter-DP_cSSiw.js';
14
14
 
@@ -529,17 +529,12 @@ declare function toAnthropicFormat(triples: PreferenceTriple[]): Array<{
529
529
  }>;
530
530
 
531
531
  /**
532
- * Adapters: convert legacy optimization outputs into the canonical
533
- * `RunRecord[]` artifact that 0.22+ primitives consume.
532
+ * Adapters: convert `TrialResult[]` (from `runMultiShotOptimization`,
533
+ * `runPromptEvolution`) into the canonical `RunRecord[]` artifact that
534
+ * `replayCache`, `pairedEvalueSequence`, and `rubricPredictiveValidity`
535
+ * consume.
534
536
  *
535
- * The 0.22 release standardized the campaign artifact: every cell of an
536
- * eval matrix produces one `RunRecord`. The pre-0.22 optimization
537
- * primitives (`runMultiShotOptimization`, `runPromptEvolution`) produce
538
- * `TrialResult[]` with a different shape. This file bridges the two so
539
- * the new primitives (`replayCache`, `pairedEvalueSequence`,
540
- * `rubricPredictiveValidity`) compose cleanly with the existing RL stack.
541
- *
542
- * The adapters are thin and explicit — every mandatory `RunRecord` field
537
+ * Adapters are thin and explicit — every mandatory `RunRecord` field
543
538
  * comes from a caller-supplied context (`commitSha`, `model`,
544
539
  * `promptHash`, `configHash`) plus the trial's runtime data. Defaults
545
540
  * exist for fields the trial doesn't carry (`tokenUsage`, `costUsd`),
@@ -1505,18 +1500,16 @@ interface DetectRewardHackingInput {
1505
1500
  declare function detectRewardHacking(input: DetectRewardHackingInput): RewardHackingReport;
1506
1501
 
1507
1502
  /**
1508
- * `analyzeOptimizationResult` — unifies the pre-0.22 auto-research stack
1503
+ * `analyzeOptimizationResult` — unifies the auto-research stack
1509
1504
  * (`runPromptEvolution`, `runMultiShotOptimization`, reflective-mutation,
1510
- * Ax/AxRLM trace analyst) with the 0.23 RL bridge in a single call.
1505
+ * Ax/AxRLM trace analyst) with the RL bridge in a single call.
1511
1506
  *
1512
- * What this fixes: until 0.23 the optimization stack and the RL bridge
1513
- * lived in parallel namespaces. The optimization primitives produced
1514
- * `TrialResult[]`; the RL bridge consumed `RunRecord[]`. Trace-analyst
1515
- * was decoupled from both. `analyzeOptimizationResult` does the wiring
1516
- * once so consumers don't have to:
1507
+ * The optimization primitives produce `TrialResult[]`; the RL bridge
1508
+ * consumes `RunRecord[]`. Trace-analyst is independent of both. This
1509
+ * function does the wiring once so consumers don't have to:
1517
1510
  *
1518
- * Optimization (existing primitives) RL bridge (0.23)
1519
- * ────────────────────────────────── ────────────────
1511
+ * Optimization (existing primitives) RL bridge
1512
+ * ────────────────────────────────── ────────
1520
1513
  * runPromptEvolution → TrialResult[] →
1521
1514
  * runMultiShotOptimization → MSTrial[] → analyzeOptimizationResult →
1522
1515
  * reflective-mutation → mutations.jsonl → ↓
@@ -1527,10 +1520,10 @@ declare function detectRewardHacking(input: DetectRewardHackingInput): RewardHac
1527
1520
  * ↓ │
1528
1521
  * TraceAnalyst.analyze(progressLog) ←─────────────────────────┘
1529
1522
  *
1530
- * The output of this function is the canonical RL artifact set:
1531
- * `RunRecord[]` (so every other 0.22+ primitive composes), preference
1532
- * triples, verifiable reward signals, reward-hacking diagnosis,
1533
- * sequential interim verdict, and (when wired) trace-analyst summary.
1523
+ * The output is the canonical RL artifact set: `RunRecord[]` (so every
1524
+ * other RL primitive composes), preference triples, verifiable reward
1525
+ * signals, reward-hacking diagnosis, sequential interim verdict, and
1526
+ * (when wired) trace-analyst summary.
1534
1527
  *
1535
1528
  * What this primitive does NOT do: it does not modify the optimization
1536
1529
  * primitives' internals. They keep producing `TrialResult` and emitting
@@ -1609,11 +1602,7 @@ declare function analyzeOptimizationResult(opts: AnalyzeOptimizationResultOption
1609
1602
  * `PredictiveValidityResearcher` — concrete `Researcher` implementation
1610
1603
  * that drives selection from outcome-anchored predictive validity.
1611
1604
  *
1612
- * `Researcher` was a placeholder interface plus `NoopResearcher` until
1613
- * 0.23. The 0.23 panel critique called this out: shipping the interface
1614
- * without a default implementation that drives the loop is incomplete.
1615
- *
1616
- * This researcher answers each method:
1605
+ * Each method:
1617
1606
  *
1618
1607
  * - `inspectFailures(runs)` — synthesizes failure modes from the
1619
1608
  * bottom-quartile of `RunRecord`s on the configured proxy reward.
@@ -1676,14 +1665,10 @@ declare class PredictiveValidityResearcher implements Researcher {
1676
1665
  }
1677
1666
 
1678
1667
  /**
1679
- * `runRLCampaign` — the missing top-level orchestrator.
1668
+ * `runRLCampaign` — top-level orchestrator that runs the matrix and
1669
+ * produces every RL-ready artifact in one call.
1680
1670
  *
1681
- * `runEvalCampaign` runs the matrix and produces `RunRecord[]`. The 0.23
1682
- * RL primitives consume that artifact in different ways. Until 0.24 they
1683
- * had to be wired together by hand at every consumer; that defeats the
1684
- * cohesion the package is supposed to provide.
1685
- *
1686
- * `runRLCampaign` wires:
1671
+ * Wires:
1687
1672
  * 1. `runEvalCampaign` for the matrix run (capture, integrity, hooks)
1688
1673
  * 2. `extractVerifiableReward` over each run, separating deterministic
1689
1674
  * from probabilistic reward sources for the trainer
@@ -1697,9 +1682,6 @@ declare class PredictiveValidityResearcher implements Researcher {
1697
1682
  * stage's output is in there. The consumer's downstream fits in a single
1698
1683
  * line: pass `result.preferences` to their DPO trainer, `result.grpoRows`
1699
1684
  * to GRPO, `result.runs` plus `result.rewardSignals` to a custom RL loop.
1700
- *
1701
- * This is what the 0.23 panel critique called the "missing top-level
1702
- * primitive." Now shipped.
1703
1685
  */
1704
1686
 
1705
1687
  interface RunRLCampaignOptions<V> extends EvalCampaignOptions<V> {
package/dist/rl.js CHANGED
@@ -1,23 +1,23 @@
1
1
  import {
2
2
  runEvalCampaign
3
- } from "./chunk-SESZDQPX.js";
3
+ } from "./chunk-RUI6SIHY.js";
4
4
  import "./chunk-4S4BM3QQ.js";
5
5
  import {
6
6
  rubricPredictiveValidity
7
7
  } from "./chunk-YRZ4M5GS.js";
8
8
  import {
9
9
  evaluateInterimReleaseConfidence
10
- } from "./chunk-NU65VQ7M.js";
10
+ } from "./chunk-MAZ26DC7.js";
11
11
  import {
12
12
  benjaminiHochberg
13
- } from "./chunk-2A5XJB43.js";
13
+ } from "./chunk-5AKPEK5L.js";
14
14
  import {
15
15
  wilcoxonSignedRank
16
- } from "./chunk-I4MBDTY5.js";
16
+ } from "./chunk-R5UQJNKC.js";
17
17
  import "./chunk-KTGTIOFD.js";
18
18
  import "./chunk-PC4UYEBM.js";
19
19
  import "./chunk-TVVP3ZZQ.js";
20
- import "./chunk-4F5DQN55.js";
20
+ import "./chunk-VSMTAMNK.js";
21
21
  import {
22
22
  ValidationError
23
23
  } from "./chunk-NG236HPC.js";