@tangle-network/agent-eval 0.30.0 → 0.31.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. package/CHANGELOG.md +79 -0
  2. package/dist/{baseline-BwdCXUS8.d.ts → baseline-4R5deP0N.d.ts} +1 -1
  3. package/dist/benchmarks/index.d.ts +3 -3
  4. package/dist/builder-eval/index.d.ts +3 -3
  5. package/dist/builder-eval/index.js +2 -2
  6. package/dist/{chunk-R5UQJNKC.js → chunk-4L3WJXQJ.js} +2 -2
  7. package/dist/{chunk-SZSBQUIJ.js → chunk-B73G44OH.js} +3 -3
  8. package/dist/{chunk-5AKPEK5L.js → chunk-CXJOVDJR.js} +2 -2
  9. package/dist/{chunk-RUI6SIHY.js → chunk-DTEJNZYK.js} +5 -4
  10. package/dist/chunk-DTEJNZYK.js.map +1 -0
  11. package/dist/{chunk-K33INZHH.js → chunk-GVQT44CS.js} +2 -2
  12. package/dist/{chunk-UW4NOOZI.js → chunk-HIO4UIS5.js} +308 -2
  13. package/dist/chunk-HIO4UIS5.js.map +1 -0
  14. package/dist/{chunk-4S4BM3QQ.js → chunk-M6RZ5LJN.js} +2 -2
  15. package/dist/{chunk-NG236HPC.js → chunk-QYJT52YW.js} +1 -1
  16. package/dist/chunk-QYJT52YW.js.map +1 -0
  17. package/dist/{chunk-PALJO75S.js → chunk-S4Y5VXMS.js} +2 -2
  18. package/dist/{chunk-XFZCM5Z3.js → chunk-SMSGXM74.js} +2 -2
  19. package/dist/{chunk-KTGTIOFD.js → chunk-UBPIXOC4.js} +2 -2
  20. package/dist/{chunk-DBIGN5MJ.js → chunk-WGXZAQLR.js} +3 -3
  21. package/dist/{chunk-QHF6EQKK.js → chunk-YTMXBHFM.js} +2 -2
  22. package/dist/{chunk-NLMNWKVM.js → chunk-ZN2CMQIW.js} +54 -2
  23. package/dist/chunk-ZN2CMQIW.js.map +1 -0
  24. package/dist/cli.js +3 -3
  25. package/dist/{control-rJhEDdpy.d.ts → control-p2ns7elI.d.ts} +5 -5
  26. package/dist/{control-runtime-BRdQ0wrx.d.ts → control-runtime-BZ_lVLYW.d.ts} +2 -2
  27. package/dist/control.d.ts +8 -8
  28. package/dist/control.js +3 -3
  29. package/dist/{dataset-CiK_3LDr.d.ts → dataset-ueRVTUoY.d.ts} +1 -1
  30. package/dist/{emitter-BqjeOvJh.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
  31. package/dist/{errors-BZ9sTdz7.d.ts → errors-mje_cKOs.d.ts} +1 -1
  32. package/dist/{failure-cluster-D1NZKqYu.d.ts → failure-cluster-Cw65_5FY.d.ts} +1 -1
  33. package/dist/{feedback-trajectory-j0nJFgC6.d.ts → feedback-trajectory-iATEAHmc.d.ts} +2 -2
  34. package/dist/governance/index.d.ts +4 -4
  35. package/dist/{index--fVrWDiR.d.ts → index-BTqhGHJT.d.ts} +1 -1
  36. package/dist/{index-Cgt3DKXr.d.ts → index-DPILdKbP.d.ts} +2 -2
  37. package/dist/index.d.ts +108 -38
  38. package/dist/index.js +159 -14
  39. package/dist/index.js.map +1 -1
  40. package/dist/{integrity-BAxLGJ9I.d.ts → integrity-DYR5gWlb.d.ts} +2 -2
  41. package/dist/knowledge/index.d.ts +3 -3
  42. package/dist/meta-eval/index.d.ts +4 -4
  43. package/dist/openapi.json +1 -1
  44. package/dist/optimization.d.ts +11 -11
  45. package/dist/optimization.js +8 -8
  46. package/dist/pipelines/index.d.ts +6 -6
  47. package/dist/pipelines/index.js +3 -3
  48. package/dist/prm/index.d.ts +4 -4
  49. package/dist/{query-BFDT0kX_.d.ts → query-DODUYdPg.d.ts} +1 -1
  50. package/dist/{release-report-PWhGlpfO.d.ts → release-report-DLWbBPtH.d.ts} +3 -3
  51. package/dist/reporting.d.ts +8 -8
  52. package/dist/reporting.js +4 -4
  53. package/dist/{researcher-ClDX3KZx.d.ts → researcher-BRHa5Jxo.d.ts} +12 -6
  54. package/dist/rl.d.ts +10 -10
  55. package/dist/rl.js +6 -6
  56. package/dist/{rubric-DgSqjqqj.d.ts → rubric-D5tjHNJQ.d.ts} +2 -2
  57. package/dist/{rubric-predictive-validity-C0uDYwG6.d.ts → rubric-predictive-validity-CMHypZ_M.d.ts} +1 -1
  58. package/dist/{run-record-CqzahIbx.d.ts → run-record-BfX5y68A.d.ts} +43 -2
  59. package/dist/{store-BP5be6s7.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
  60. package/dist/{summary-report-jrSGb2xZ.d.ts → summary-report-D7AQS7eB.d.ts} +2 -2
  61. package/dist/{test-graded-scenario-BJ54PDan.d.ts → test-graded-scenario-B2kWEdh9.d.ts} +2 -2
  62. package/dist/traces.d.ts +533 -10
  63. package/dist/traces.js +14 -300
  64. package/dist/traces.js.map +1 -1
  65. package/dist/{trajectory-BFmveYZt.d.ts → trajectory-CnoBo-JY.d.ts} +1 -1
  66. package/dist/wire/index.d.ts +6 -6
  67. package/dist/wire/index.js +3 -3
  68. package/package.json +12 -21
  69. package/dist/chunk-NG236HPC.js.map +0 -1
  70. package/dist/chunk-NLMNWKVM.js.map +0 -1
  71. package/dist/chunk-RUI6SIHY.js.map +0 -1
  72. package/dist/chunk-UW4NOOZI.js.map +0 -1
  73. package/dist/replay-BX5Fm8en.d.ts +0 -529
  74. /package/dist/{chunk-R5UQJNKC.js.map → chunk-4L3WJXQJ.js.map} +0 -0
  75. /package/dist/{chunk-SZSBQUIJ.js.map → chunk-B73G44OH.js.map} +0 -0
  76. /package/dist/{chunk-5AKPEK5L.js.map → chunk-CXJOVDJR.js.map} +0 -0
  77. /package/dist/{chunk-K33INZHH.js.map → chunk-GVQT44CS.js.map} +0 -0
  78. /package/dist/{chunk-4S4BM3QQ.js.map → chunk-M6RZ5LJN.js.map} +0 -0
  79. /package/dist/{chunk-PALJO75S.js.map → chunk-S4Y5VXMS.js.map} +0 -0
  80. /package/dist/{chunk-XFZCM5Z3.js.map → chunk-SMSGXM74.js.map} +0 -0
  81. /package/dist/{chunk-KTGTIOFD.js.map → chunk-UBPIXOC4.js.map} +0 -0
  82. /package/dist/{chunk-DBIGN5MJ.js.map → chunk-WGXZAQLR.js.map} +0 -0
  83. /package/dist/{chunk-QHF6EQKK.js.map → chunk-YTMXBHFM.js.map} +0 -0
@@ -1,5 +1,5 @@
1
- import { C as CaptureIntegrityError } from './errors-BZ9sTdz7.js';
2
- import { T as TraceStore } from './store-BP5be6s7.js';
1
+ import { C as CaptureIntegrityError } from './errors-mje_cKOs.js';
2
+ import { T as TraceStore } from './store-Db2Bv8Cf.js';
3
3
 
4
4
  /**
5
5
  * RawProviderSink — first-class persistence for the actual HTTP-level
@@ -1,6 +1,6 @@
1
- import { j as ControlSeverity, C as ControlEvalResult } from '../control-runtime-BRdQ0wrx.js';
2
- import { T as TraceEmitter } from '../emitter-BqjeOvJh.js';
3
- import '../store-BP5be6s7.js';
1
+ import { j as ControlSeverity, C as ControlEvalResult } from '../control-runtime-BZ_lVLYW.js';
2
+ import { T as TraceEmitter } from '../emitter-DP_cSSiw.js';
3
+ import '../store-Db2Bv8Cf.js';
4
4
 
5
5
  type KnowledgeRequirementCategory = 'user_specific' | 'company_specific' | 'domain_specific' | 'codebase_specific' | 'market_specific' | 'regulatory' | 'tool_api' | 'credential_or_secret' | 'runtime_environment' | 'preference' | 'historical_context';
6
6
  type KnowledgeAcquisitionMode = 'ask_user' | 'search_web' | 'query_connector' | 'inspect_repo' | 'run_command' | 'infer_low_confidence' | 'not_available';
@@ -1,9 +1,9 @@
1
- import { R as Run, T as TraceStore } from '../store-BP5be6s7.js';
1
+ import { R as Run, T as TraceStore } from '../store-Db2Bv8Cf.js';
2
2
  import { a as OutcomeFilter, O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
3
3
  export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from '../outcome-store-D6KWmYvj.js';
4
- export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-C0uDYwG6.js';
5
- import '../run-record-CqzahIbx.js';
6
- import '../errors-BZ9sTdz7.js';
4
+ export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-CMHypZ_M.js';
5
+ import '../run-record-BfX5y68A.js';
6
+ import '../errors-mje_cKOs.js';
7
7
 
8
8
  /**
9
9
  * Correlation study — "does our eval score predict real-world outcomes?"
package/dist/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "@tangle-network/agent-eval — wire protocol",
5
- "version": "0.30.0",
5
+ "version": "0.31.1",
6
6
  "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
7
7
  "contact": {
8
8
  "name": "Tangle Network",
@@ -1,11 +1,11 @@
1
- export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-ClDX3KZx.js';
2
- export { F as FeedbackArtifactType, a as FeedbackAttempt, b as FeedbackLabel, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, l as FeedbackTrajectory, m as FeedbackTrajectoryFilter, n as FeedbackTrajectoryStore, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-j0nJFgC6.js';
3
- export { A as ActionableSideInfo, a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, G as GenerationReport, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, o as MutateAdapter, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, R as ReflectionContext, r as ReflectionProposal, S as ScenarioAggregate, s as ScoreAdapter, T as TrialCache, t as TrialResult, u as TrialTrace, V as VariantAggregate, v as buildReflectionPrompt, w as defaultMultiShotObjectives, x as parseReflectionResponse, y as runMultiShotOptimization, z as runPromptEvolution, B as trialTraceFromMultiShotTrial } from './summary-report-jrSGb2xZ.js';
4
- import './errors-BZ9sTdz7.js';
5
- import './integrity-BAxLGJ9I.js';
6
- import './store-BP5be6s7.js';
7
- import './run-record-CqzahIbx.js';
8
- import './emitter-BqjeOvJh.js';
9
- import './control-runtime-BRdQ0wrx.js';
10
- import './dataset-CiK_3LDr.js';
11
- import './failure-cluster-D1NZKqYu.js';
1
+ export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-BRHa5Jxo.js';
2
+ export { F as FeedbackArtifactType, a as FeedbackAttempt, b as FeedbackLabel, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, l as FeedbackTrajectory, m as FeedbackTrajectoryFilter, n as FeedbackTrajectoryStore, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-iATEAHmc.js';
3
+ export { A as ActionableSideInfo, a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, G as GenerationReport, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, o as MutateAdapter, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, R as ReflectionContext, r as ReflectionProposal, S as ScenarioAggregate, s as ScoreAdapter, T as TrialCache, t as TrialResult, u as TrialTrace, V as VariantAggregate, v as buildReflectionPrompt, w as defaultMultiShotObjectives, x as parseReflectionResponse, y as runMultiShotOptimization, z as runPromptEvolution, B as trialTraceFromMultiShotTrial } from './summary-report-D7AQS7eB.js';
4
+ import './errors-mje_cKOs.js';
5
+ import './integrity-DYR5gWlb.js';
6
+ import './store-Db2Bv8Cf.js';
7
+ import './run-record-BfX5y68A.js';
8
+ import './emitter-DP_cSSiw.js';
9
+ import './control-runtime-BZ_lVLYW.js';
10
+ import './dataset-ueRVTUoY.js';
11
+ import './failure-cluster-Cw65_5FY.js';
@@ -25,19 +25,19 @@ import {
25
25
  summarizePreferenceMemory,
26
26
  trialTraceFromMultiShotTrial,
27
27
  withAssignedFeedbackSplit
28
- } from "./chunk-SZSBQUIJ.js";
29
- import "./chunk-NLMNWKVM.js";
28
+ } from "./chunk-B73G44OH.js";
29
+ import "./chunk-ZN2CMQIW.js";
30
30
  import {
31
31
  runEvalCampaign
32
- } from "./chunk-RUI6SIHY.js";
33
- import "./chunk-4S4BM3QQ.js";
34
- import "./chunk-5AKPEK5L.js";
35
- import "./chunk-R5UQJNKC.js";
36
- import "./chunk-KTGTIOFD.js";
32
+ } from "./chunk-DTEJNZYK.js";
33
+ import "./chunk-M6RZ5LJN.js";
34
+ import "./chunk-CXJOVDJR.js";
35
+ import "./chunk-4L3WJXQJ.js";
36
+ import "./chunk-UBPIXOC4.js";
37
37
  import "./chunk-PC4UYEBM.js";
38
38
  import "./chunk-TVVP3ZZQ.js";
39
39
  import "./chunk-VSMTAMNK.js";
40
- import "./chunk-NG236HPC.js";
40
+ import "./chunk-QYJT52YW.js";
41
41
  import "./chunk-PZ5AY32C.js";
42
42
  export {
43
43
  CallbackResearcher,
@@ -1,9 +1,9 @@
1
- import { g as BudgetSpec, T as TraceStore, l as RunFilter, R as Run, a as ToolSpan } from '../store-BP5be6s7.js';
2
- export { a as FailureCluster, F as FailureClusterReport, f as failureClusterView } from '../failure-cluster-D1NZKqYu.js';
3
- import { a as TrajectoryStep } from '../trajectory-BFmveYZt.js';
4
- import { B as BaselineOptions, a as BaselineReport } from '../baseline-BwdCXUS8.js';
5
- export { c as computeToolUseMetrics } from '../baseline-BwdCXUS8.js';
6
- import { l as llmSpans } from '../query-BFDT0kX_.js';
1
+ import { g as BudgetSpec, T as TraceStore, h as RunFilter, R as Run, a as ToolSpan } from '../store-Db2Bv8Cf.js';
2
+ export { a as FailureCluster, F as FailureClusterReport, f as failureClusterView } from '../failure-cluster-Cw65_5FY.js';
3
+ import { a as TrajectoryStep } from '../trajectory-CnoBo-JY.js';
4
+ import { B as BaselineOptions, a as BaselineReport } from '../baseline-4R5deP0N.js';
5
+ export { c as computeToolUseMetrics } from '../baseline-4R5deP0N.js';
6
+ import { l as llmSpans } from '../query-DODUYdPg.js';
7
7
 
8
8
  /**
9
9
  * BudgetBreachView — aggregates breach events across the corpus.
@@ -2,13 +2,13 @@ import {
2
2
  compareToBaseline,
3
3
  computeToolUseMetrics,
4
4
  failureClusterView
5
- } from "../chunk-K33INZHH.js";
5
+ } from "../chunk-GVQT44CS.js";
6
6
  import {
7
7
  buildTrajectory
8
8
  } from "../chunk-RZTMDUO7.js";
9
9
  import {
10
10
  interRaterReliability
11
- } from "../chunk-R5UQJNKC.js";
11
+ } from "../chunk-4L3WJXQJ.js";
12
12
  import {
13
13
  aggregateLlm,
14
14
  argHash,
@@ -17,7 +17,7 @@ import {
17
17
  toolSpans
18
18
  } from "../chunk-47X6LRCE.js";
19
19
  import "../chunk-5BKGXME7.js";
20
- import "../chunk-NG236HPC.js";
20
+ import "../chunk-QYJT52YW.js";
21
21
  import "../chunk-PZ5AY32C.js";
22
22
 
23
23
  // src/pipelines/budget-breach.ts
@@ -1,7 +1,7 @@
1
- import { P as PrmGradedTrace, S as StepRubric, a as PrmGrader } from '../rubric-DgSqjqqj.js';
2
- export { G as GradedStep, b as StepContext, i as isPrmVerdict } from '../rubric-DgSqjqqj.js';
3
- import { S as Span, T as TraceStore } from '../store-BP5be6s7.js';
4
- import '../trajectory-BFmveYZt.js';
1
+ import { P as PrmGradedTrace, S as StepRubric, a as PrmGrader } from '../rubric-D5tjHNJQ.js';
2
+ export { G as GradedStep, b as StepContext, i as isPrmVerdict } from '../rubric-D5tjHNJQ.js';
3
+ import { S as Span, T as TraceStore } from '../store-Db2Bv8Cf.js';
4
+ import '../trajectory-CnoBo-JY.js';
5
5
 
6
6
  /**
7
7
  * Export PRM-graded traces as training data for downstream reward-model
@@ -1,4 +1,4 @@
1
- import { L as LlmSpan, T as TraceStore, J as JudgeSpan, R as Run, F as FailureClass, a as ToolSpan } from './store-BP5be6s7.js';
1
+ import { L as LlmSpan, T as TraceStore, J as JudgeSpan, R as Run, F as FailureClass, a as ToolSpan } from './store-Db2Bv8Cf.js';
2
2
 
3
3
  /**
4
4
  * Typed query helpers over TraceStore.
@@ -1,6 +1,6 @@
1
- import { D as DatasetSplit, b as DatasetManifest, a as DatasetScenario } from './dataset-CiK_3LDr.js';
2
- import { a3 as GateDecision, A as ActionableSideInfo, m as MultiShotTrialResult } from './summary-report-jrSGb2xZ.js';
3
- import { R as RunRecord, a as RunSplitTag } from './run-record-CqzahIbx.js';
1
+ import { D as DatasetSplit, b as DatasetManifest, a as DatasetScenario } from './dataset-ueRVTUoY.js';
2
+ import { a3 as GateDecision, A as ActionableSideInfo, m as MultiShotTrialResult } from './summary-report-D7AQS7eB.js';
3
+ import { R as RunRecord, a as RunSplitTag } from './run-record-BfX5y68A.js';
4
4
 
5
5
  /**
6
6
  * Release confidence gate.
@@ -1,10 +1,10 @@
1
- export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-C0uDYwG6.js';
2
- export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as bhAdjust, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as pairedWilcoxon, s as releaseTraceEvidenceFromMultiShotTrials, t as renderReleaseReport } from './release-report-PWhGlpfO.js';
1
+ export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-CMHypZ_M.js';
2
+ export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as bhAdjust, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as pairedWilcoxon, s as releaseTraceEvidenceFromMultiShotTrials, t as renderReleaseReport } from './release-report-DLWbBPtH.js';
3
3
  export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
4
- export { C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, J as ParetoFigureSpec, K as ParetoPoint, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, $ as gainHistogram, a0 as paretoChart, a1 as researchReport, a2 as summaryTable } from './summary-report-jrSGb2xZ.js';
5
- import './run-record-CqzahIbx.js';
6
- import './errors-BZ9sTdz7.js';
4
+ export { C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, J as ParetoFigureSpec, K as ParetoPoint, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, $ as gainHistogram, a0 as paretoChart, a1 as researchReport, a2 as summaryTable } from './summary-report-D7AQS7eB.js';
5
+ import './run-record-BfX5y68A.js';
6
+ import './errors-mje_cKOs.js';
7
7
  import './outcome-store-D6KWmYvj.js';
8
- import './dataset-CiK_3LDr.js';
9
- import './failure-cluster-D1NZKqYu.js';
10
- import './store-BP5be6s7.js';
8
+ import './dataset-ueRVTUoY.js';
9
+ import './failure-cluster-Cw65_5FY.js';
10
+ import './store-Db2Bv8Cf.js';
package/dist/reporting.js CHANGED
@@ -5,7 +5,7 @@ import {
5
5
  judgeReplayGate,
6
6
  releaseTraceEvidenceFromMultiShotTrials,
7
7
  renderReleaseReport
8
- } from "./chunk-DBIGN5MJ.js";
8
+ } from "./chunk-WGXZAQLR.js";
9
9
  import {
10
10
  rubricPredictiveValidity
11
11
  } from "./chunk-YRZ4M5GS.js";
@@ -22,10 +22,10 @@ import {
22
22
  paretoChart,
23
23
  researchReport,
24
24
  summaryTable
25
- } from "./chunk-5AKPEK5L.js";
26
- import "./chunk-R5UQJNKC.js";
25
+ } from "./chunk-CXJOVDJR.js";
26
+ import "./chunk-4L3WJXQJ.js";
27
27
  import "./chunk-VSMTAMNK.js";
28
- import "./chunk-NG236HPC.js";
28
+ import "./chunk-QYJT52YW.js";
29
29
  import "./chunk-PZ5AY32C.js";
30
30
  export {
31
31
  RESEARCH_REPORT_HARD_PAIR_FLOOR,
@@ -1,9 +1,9 @@
1
- import { A as AgentEvalError, C as CaptureIntegrityError } from './errors-BZ9sTdz7.js';
2
- import { R as RawProviderSink, P as ProviderRedactor, a as RunIntegrityExpectations, b as RunIntegrityReport } from './integrity-BAxLGJ9I.js';
3
- import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, R as RunRecord } from './run-record-CqzahIbx.js';
4
- import { W as ResearchReportOptions, N as ResearchReport, a3 as GateDecision } from './summary-report-jrSGb2xZ.js';
5
- import { T as TraceEmitter, R as RunCompleteHook } from './emitter-BqjeOvJh.js';
6
- import { T as TraceStore } from './store-BP5be6s7.js';
1
+ import { A as AgentEvalError, C as CaptureIntegrityError } from './errors-mje_cKOs.js';
2
+ import { R as RawProviderSink, P as ProviderRedactor, a as RunIntegrityExpectations, b as RunIntegrityReport } from './integrity-DYR5gWlb.js';
3
+ import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, R as RunRecord } from './run-record-BfX5y68A.js';
4
+ import { W as ResearchReportOptions, N as ResearchReport, a3 as GateDecision } from './summary-report-D7AQS7eB.js';
5
+ import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DP_cSSiw.js';
6
+ import { T as TraceStore } from './store-Db2Bv8Cf.js';
7
7
 
8
8
  /**
9
9
  * LLM client with graceful degrade.
@@ -316,6 +316,12 @@ interface CampaignRunOutcome {
316
316
  failureMode?: string;
317
317
  /** Optional judge metadata when a judge was used. */
318
318
  judgeMetadata?: RunJudgeMetadata;
319
+ /**
320
+ * Optional per-judge / per-dim breakdown for ensemble-judged runs.
321
+ * Propagated to `outcome.judgeScores` on the resulting `RunRecord`.
322
+ * Single-judge or scalar-only runs leave this unset.
323
+ */
324
+ judgeScores?: JudgeScoresRecord;
319
325
  }
320
326
  type CampaignRunner<V> = (ctx: CampaignRunContext<V>) => Promise<CampaignRunOutcome>;
321
327
  type CampaignIntegrityPolicy = 'throw' | 'mark_failed' | 'log';
package/dist/rl.d.ts CHANGED
@@ -1,16 +1,16 @@
1
- import { R as RunRecord, a as RunSplitTag } from './run-record-CqzahIbx.js';
1
+ import { R as RunRecord, a as RunSplitTag } from './run-record-BfX5y68A.js';
2
2
  import { V as VerificationReport } from './multi-layer-verifier-BNi4-8lR.js';
3
- import { t as TrialResult, V as VariantAggregate, q as PromptEvolutionResult, e as MultiShotOptimizationResult } from './summary-report-jrSGb2xZ.js';
3
+ import { t as TrialResult, V as VariantAggregate, q as PromptEvolutionResult, e as MultiShotOptimizationResult } from './summary-report-D7AQS7eB.js';
4
4
  import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
5
- import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-C0uDYwG6.js';
5
+ import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-CMHypZ_M.js';
6
6
  import { I as InterimReleaseConfidence } from './sequential-5iSVfzl2.js';
7
- import { S as Span, T as TraceStore } from './store-BP5be6s7.js';
8
- import { R as Researcher, l as FailureMode, S as SteeringChange, j as ExperimentPlan, k as ExperimentResult, i as EvalCampaignResult, E as EvalCampaignOptions } from './researcher-ClDX3KZx.js';
9
- export { r as runEvalCampaign } from './researcher-ClDX3KZx.js';
10
- import './errors-BZ9sTdz7.js';
11
- import './failure-cluster-D1NZKqYu.js';
12
- import './integrity-BAxLGJ9I.js';
13
- import './emitter-BqjeOvJh.js';
7
+ import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
8
+ import { R as Researcher, l as FailureMode, S as SteeringChange, j as ExperimentPlan, k as ExperimentResult, i as EvalCampaignResult, E as EvalCampaignOptions } from './researcher-BRHa5Jxo.js';
9
+ export { r as runEvalCampaign } from './researcher-BRHa5Jxo.js';
10
+ import './errors-mje_cKOs.js';
11
+ import './failure-cluster-Cw65_5FY.js';
12
+ import './integrity-DYR5gWlb.js';
13
+ import './emitter-DP_cSSiw.js';
14
14
 
15
15
  /**
16
16
  * Test-time compute scaling curves.
package/dist/rl.js CHANGED
@@ -1,7 +1,7 @@
1
1
  import {
2
2
  runEvalCampaign
3
- } from "./chunk-RUI6SIHY.js";
4
- import "./chunk-4S4BM3QQ.js";
3
+ } from "./chunk-DTEJNZYK.js";
4
+ import "./chunk-M6RZ5LJN.js";
5
5
  import {
6
6
  rubricPredictiveValidity
7
7
  } from "./chunk-YRZ4M5GS.js";
@@ -10,17 +10,17 @@ import {
10
10
  } from "./chunk-MAZ26DC7.js";
11
11
  import {
12
12
  benjaminiHochberg
13
- } from "./chunk-5AKPEK5L.js";
13
+ } from "./chunk-CXJOVDJR.js";
14
14
  import {
15
15
  wilcoxonSignedRank
16
- } from "./chunk-R5UQJNKC.js";
17
- import "./chunk-KTGTIOFD.js";
16
+ } from "./chunk-4L3WJXQJ.js";
17
+ import "./chunk-UBPIXOC4.js";
18
18
  import "./chunk-PC4UYEBM.js";
19
19
  import "./chunk-TVVP3ZZQ.js";
20
20
  import "./chunk-VSMTAMNK.js";
21
21
  import {
22
22
  ValidationError
23
- } from "./chunk-NG236HPC.js";
23
+ } from "./chunk-QYJT52YW.js";
24
24
  import "./chunk-PZ5AY32C.js";
25
25
 
26
26
  // src/rl/compute-curves.ts
@@ -1,5 +1,5 @@
1
- import { S as Span, T as TraceStore, J as JudgeSpan } from './store-BP5be6s7.js';
2
- import { T as Trajectory, a as TrajectoryStep } from './trajectory-BFmveYZt.js';
1
+ import { S as Span, T as TraceStore, J as JudgeSpan } from './store-Db2Bv8Cf.js';
2
+ import { T as Trajectory, a as TrajectoryStep } from './trajectory-CnoBo-JY.js';
3
3
 
4
4
  /**
5
5
  * Process Reward Modeling — per-step rubric grading.
@@ -1,4 +1,4 @@
1
- import { R as RunRecord } from './run-record-CqzahIbx.js';
1
+ import { R as RunRecord } from './run-record-BfX5y68A.js';
2
2
  import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
3
3
 
4
4
  /**
@@ -1,4 +1,4 @@
1
- import { V as ValidationError } from './errors-BZ9sTdz7.js';
1
+ import { V as ValidationError } from './errors-mje_cKOs.js';
2
2
 
3
3
  /**
4
4
  * Paper-grade RunRecord schema + runtime validator.
@@ -42,6 +42,41 @@ interface RunJudgeMetadata {
42
42
  * prior-call cache, etc.). The canary uses this to alert. */
43
43
  fallback: boolean;
44
44
  }
45
+ /**
46
+ * Per-judge / per-dimension breakdown for runs scored by an ensemble of
47
+ * judges over a multi-dimensional rubric.
48
+ *
49
+ * The collapsed `outcome.searchScore` / `holdoutScore` carries the
50
+ * composite the gate uses. The full breakdown belongs here so consumers
51
+ * can answer "which judge disagreed?", "which dimension dragged the
52
+ * composite down?", and "did half the panel fail?" without re-running.
53
+ *
54
+ * `perJudge[judgeId][dim]` is the canonical source; `perDimMean` and
55
+ * `composite` are convenience projections — derivable but precomputed so
56
+ * downstream IRR primitives (`interRaterReliability`,
57
+ * `corpusInterRaterAgreement`) and reporters don't pay the same
58
+ * aggregation twice.
59
+ *
60
+ * Fail-loud discipline: judges that errored out land in `failedJudges`
61
+ * by id. A missing key in `perJudge` is ambiguous (silent zero vs not
62
+ * run); the explicit list makes a partial-failure recorded as such.
63
+ */
64
+ interface JudgeScoresRecord {
65
+ /** Per-judge per-dimension scores. `{ "kimi-k2.6": { helpfulness: 0.8, clarity: 0.7 }, ... }`. */
66
+ perJudge: Record<string, Record<string, number>>;
67
+ /** Per-dim mean across judges. Convenience — derivable from `perJudge`. */
68
+ perDimMean: Record<string, number>;
69
+ /** Composite mean across all dims and judges. Mirrors the score
70
+ * the gate sees on `outcome.searchScore` / `holdoutScore`. */
71
+ composite: number;
72
+ /** Judges that errored or returned an unparseable verdict. Recorded
73
+ * by id (e.g. `['glm-5.1']`) so a partial-failure case is explicit,
74
+ * not inferred from missing keys in `perJudge`. */
75
+ failedJudges?: string[];
76
+ /** Free-form notes the judges emitted (joined across judges or
77
+ * first-judge only — consumer's choice). */
78
+ notes?: string;
79
+ }
45
80
  interface RunOutcome {
46
81
  /** Score on the search/optimization split. Optional because a
47
82
  * holdout-only evaluation only fills `holdoutScore`. */
@@ -53,6 +88,12 @@ interface RunOutcome {
53
88
  * pass/fail counters, latency stats, etc. Numeric only — keeps
54
89
  * reporters honest. */
55
90
  raw: Record<string, number>;
91
+ /** Per-judge / per-dim breakdown. Consumers writing ensemble
92
+ * judgements populate this; substrate primitives like
93
+ * `interRaterReliability` and `corpusInterRaterAgreement` accept
94
+ * these records as input. Optional — single-judge or scalar-only
95
+ * runs leave it unset. */
96
+ judgeScores?: JudgeScoresRecord;
56
97
  }
57
98
  /**
58
99
  * Mandatory paper-grade fields for a single evaluation run. Optional
@@ -143,4 +184,4 @@ declare function parseRunRecordSafe(input: unknown): {
143
184
  /** Round-trip helper — `JSON.parse(JSON.stringify(record))` then validate. */
144
185
  declare function roundTripRunRecord(record: RunRecord): RunRecord;
145
186
 
146
- export { type RunRecord as R, type RunSplitTag as a, type RunTokenUsage as b, type RunJudgeMetadata as c, type RunOutcome as d, RunRecordValidationError as e, isRunRecord as i, parseRunRecordSafe as p, roundTripRunRecord as r, validateRunRecord as v };
187
+ export { type JudgeScoresRecord as J, type RunRecord as R, type RunSplitTag as a, type RunTokenUsage as b, type RunJudgeMetadata as c, type RunOutcome as d, RunRecordValidationError as e, isRunRecord as i, parseRunRecordSafe as p, roundTripRunRecord as r, validateRunRecord as v };
@@ -294,4 +294,4 @@ declare class FileSystemTraceStore implements TraceStore {
294
294
  artifacts(runId: string): Promise<Artifact[]>;
295
295
  }
296
296
 
297
- export { type Artifact as A, type BudgetLedgerEntry as B, type EventKind as E, type FailureClass as F, type GenericSpan as G, InMemoryTraceStore as I, type JudgeSpan as J, type LlmSpan as L, type Message as M, type Run as R, type Span as S, type TraceStore as T, type ToolSpan as a, type TraceEvent as b, type RunOutcome as c, type SpanKind as d, type RetrievalSpan as e, type SandboxSpan as f, type BudgetSpec as g, type EventFilter as h, FAILURE_CLASSES as i, FileSystemTraceStore as j, type FileSystemTraceStoreOptions as k, type RunFilter as l, type RunLayer as m, type RunStatus as n, type SpanBase as o, type SpanFilter as p, type SpanStatus as q, TRACE_SCHEMA_VERSION as r, isJudgeSpan as s, isLlmSpan as t, isRetrievalSpan as u, isSandboxSpan as v, isToolSpan as w };
297
+ export { type Artifact as A, type BudgetLedgerEntry as B, type EventKind as E, type FailureClass as F, type GenericSpan as G, InMemoryTraceStore as I, type JudgeSpan as J, type LlmSpan as L, type Message as M, type Run as R, type Span as S, type TraceStore as T, type ToolSpan as a, type TraceEvent as b, type RunOutcome as c, type SpanKind as d, type RetrievalSpan as e, type SandboxSpan as f, type BudgetSpec as g, type RunFilter as h, type EventFilter as i, FAILURE_CLASSES as j, FileSystemTraceStore as k, type FileSystemTraceStoreOptions as l, type RunLayer as m, type RunStatus as n, type SpanBase as o, type SpanFilter as p, type SpanStatus as q, TRACE_SCHEMA_VERSION as r, isJudgeSpan as s, isLlmSpan as t, isRetrievalSpan as u, isSandboxSpan as v, isToolSpan as w };
@@ -1,5 +1,5 @@
1
- import { R as RunRecord, a as RunSplitTag } from './run-record-CqzahIbx.js';
2
- import { F as FailureClusterReport } from './failure-cluster-D1NZKqYu.js';
1
+ import { R as RunRecord, a as RunSplitTag } from './run-record-BfX5y68A.js';
2
+ import { F as FailureClusterReport } from './failure-cluster-Cw65_5FY.js';
3
3
 
4
4
  /**
5
5
  * HeldOutGate — first-class held-out paired-delta promotion gate.
@@ -1,5 +1,5 @@
1
- import { T as TraceEmitter } from './emitter-BqjeOvJh.js';
2
- import { R as Run, F as FailureClass, T as TraceStore } from './store-BP5be6s7.js';
1
+ import { T as TraceEmitter } from './emitter-DP_cSSiw.js';
2
+ import { R as Run, F as FailureClass, T as TraceStore } from './store-Db2Bv8Cf.js';
3
3
 
4
4
  /**
5
5
  * SandboxHarness — executes a scenario in an isolated environment and