@tangle-network/agent-eval 0.29.1 → 0.31.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. package/dist/{baseline-BwdCXUS8.d.ts → baseline-4R5deP0N.d.ts} +1 -1
  2. package/dist/benchmarks/index.d.ts +3 -3
  3. package/dist/builder-eval/index.d.ts +3 -3
  4. package/dist/builder-eval/index.js +2 -2
  5. package/dist/{chunk-R5UQJNKC.js → chunk-4L3WJXQJ.js} +2 -2
  6. package/dist/{chunk-RUI6SIHY.js → chunk-75ZREHD7.js} +4 -4
  7. package/dist/{chunk-5AKPEK5L.js → chunk-CXJOVDJR.js} +2 -2
  8. package/dist/{chunk-K33INZHH.js → chunk-GVQT44CS.js} +2 -2
  9. package/dist/{chunk-UW4NOOZI.js → chunk-HIO4UIS5.js} +308 -2
  10. package/dist/chunk-HIO4UIS5.js.map +1 -0
  11. package/dist/{chunk-4S4BM3QQ.js → chunk-M6RZ5LJN.js} +2 -2
  12. package/dist/{chunk-NG236HPC.js → chunk-QYJT52YW.js} +1 -1
  13. package/dist/chunk-QYJT52YW.js.map +1 -0
  14. package/dist/{chunk-XFZCM5Z3.js → chunk-SMSGXM74.js} +2 -2
  15. package/dist/{chunk-KTGTIOFD.js → chunk-UBPIXOC4.js} +2 -2
  16. package/dist/{chunk-DBIGN5MJ.js → chunk-WGXZAQLR.js} +3 -3
  17. package/dist/{chunk-NLMNWKVM.js → chunk-WSI4K3WB.js} +2 -2
  18. package/dist/{chunk-PALJO75S.js → chunk-XEL6UP7C.js} +2 -2
  19. package/dist/{chunk-SZSBQUIJ.js → chunk-Y2CPBYKH.js} +3 -3
  20. package/dist/{chunk-QHF6EQKK.js → chunk-YTMXBHFM.js} +2 -2
  21. package/dist/cli.js +3 -3
  22. package/dist/{control-rJhEDdpy.d.ts → control-BFpqHFV2.d.ts} +5 -5
  23. package/dist/{control-runtime-BRdQ0wrx.d.ts → control-runtime-BZ_lVLYW.d.ts} +2 -2
  24. package/dist/control.d.ts +8 -8
  25. package/dist/control.js +3 -3
  26. package/dist/{dataset-CiK_3LDr.d.ts → dataset-ueRVTUoY.d.ts} +1 -1
  27. package/dist/{emitter-BqjeOvJh.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
  28. package/dist/{errors-BZ9sTdz7.d.ts → errors-mje_cKOs.d.ts} +1 -1
  29. package/dist/{failure-cluster-D1NZKqYu.d.ts → failure-cluster-Cw65_5FY.d.ts} +1 -1
  30. package/dist/{feedback-trajectory-j0nJFgC6.d.ts → feedback-trajectory-iATEAHmc.d.ts} +2 -2
  31. package/dist/governance/index.d.ts +4 -4
  32. package/dist/{index-Cgt3DKXr.d.ts → index-DPILdKbP.d.ts} +2 -2
  33. package/dist/{index--fVrWDiR.d.ts → index-TVjRYWRm.d.ts} +1 -1
  34. package/dist/index.d.ts +254 -38
  35. package/dist/index.js +378 -26
  36. package/dist/index.js.map +1 -1
  37. package/dist/{integrity-BAxLGJ9I.d.ts → integrity-DYR5gWlb.d.ts} +2 -2
  38. package/dist/knowledge/index.d.ts +3 -3
  39. package/dist/meta-eval/index.d.ts +4 -4
  40. package/dist/openapi.json +1 -1
  41. package/dist/optimization.d.ts +11 -11
  42. package/dist/optimization.js +8 -8
  43. package/dist/pipelines/index.d.ts +6 -6
  44. package/dist/pipelines/index.js +3 -3
  45. package/dist/prm/index.d.ts +4 -4
  46. package/dist/{query-BFDT0kX_.d.ts → query-DODUYdPg.d.ts} +1 -1
  47. package/dist/{release-report-PWhGlpfO.d.ts → release-report-C8r4Vben.d.ts} +3 -3
  48. package/dist/reporting.d.ts +8 -8
  49. package/dist/reporting.js +4 -4
  50. package/dist/{researcher-ClDX3KZx.d.ts → researcher-BmgJ_901.d.ts} +6 -6
  51. package/dist/rl.d.ts +10 -10
  52. package/dist/rl.js +6 -6
  53. package/dist/{rubric-DgSqjqqj.d.ts → rubric-D5tjHNJQ.d.ts} +2 -2
  54. package/dist/{rubric-predictive-validity-C0uDYwG6.d.ts → rubric-predictive-validity-Bm-CbN46.d.ts} +1 -1
  55. package/dist/{run-record-CqzahIbx.d.ts → run-record-nYf9x2hU.d.ts} +1 -1
  56. package/dist/{store-BP5be6s7.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
  57. package/dist/{summary-report-jrSGb2xZ.d.ts → summary-report-dir7A-eQ.d.ts} +2 -2
  58. package/dist/{test-graded-scenario-BJ54PDan.d.ts → test-graded-scenario-B2kWEdh9.d.ts} +2 -2
  59. package/dist/traces.d.ts +533 -10
  60. package/dist/traces.js +14 -300
  61. package/dist/traces.js.map +1 -1
  62. package/dist/{trajectory-BFmveYZt.d.ts → trajectory-CnoBo-JY.d.ts} +1 -1
  63. package/dist/wire/index.d.ts +6 -6
  64. package/dist/wire/index.js +3 -3
  65. package/package.json +1 -1
  66. package/dist/chunk-NG236HPC.js.map +0 -1
  67. package/dist/chunk-UW4NOOZI.js.map +0 -1
  68. package/dist/replay-BX5Fm8en.d.ts +0 -529
  69. /package/dist/{chunk-R5UQJNKC.js.map → chunk-4L3WJXQJ.js.map} +0 -0
  70. /package/dist/{chunk-RUI6SIHY.js.map → chunk-75ZREHD7.js.map} +0 -0
  71. /package/dist/{chunk-5AKPEK5L.js.map → chunk-CXJOVDJR.js.map} +0 -0
  72. /package/dist/{chunk-K33INZHH.js.map → chunk-GVQT44CS.js.map} +0 -0
  73. /package/dist/{chunk-4S4BM3QQ.js.map → chunk-M6RZ5LJN.js.map} +0 -0
  74. /package/dist/{chunk-XFZCM5Z3.js.map → chunk-SMSGXM74.js.map} +0 -0
  75. /package/dist/{chunk-KTGTIOFD.js.map → chunk-UBPIXOC4.js.map} +0 -0
  76. /package/dist/{chunk-DBIGN5MJ.js.map → chunk-WGXZAQLR.js.map} +0 -0
  77. /package/dist/{chunk-NLMNWKVM.js.map → chunk-WSI4K3WB.js.map} +0 -0
  78. /package/dist/{chunk-PALJO75S.js.map → chunk-XEL6UP7C.js.map} +0 -0
  79. /package/dist/{chunk-SZSBQUIJ.js.map → chunk-Y2CPBYKH.js.map} +0 -0
  80. /package/dist/{chunk-QHF6EQKK.js.map → chunk-YTMXBHFM.js.map} +0 -0
@@ -1,5 +1,5 @@
1
- import { C as CaptureIntegrityError } from './errors-BZ9sTdz7.js';
2
- import { T as TraceStore } from './store-BP5be6s7.js';
1
+ import { C as CaptureIntegrityError } from './errors-mje_cKOs.js';
2
+ import { T as TraceStore } from './store-Db2Bv8Cf.js';
3
3
 
4
4
  /**
5
5
  * RawProviderSink — first-class persistence for the actual HTTP-level
@@ -1,6 +1,6 @@
1
- import { j as ControlSeverity, C as ControlEvalResult } from '../control-runtime-BRdQ0wrx.js';
2
- import { T as TraceEmitter } from '../emitter-BqjeOvJh.js';
3
- import '../store-BP5be6s7.js';
1
+ import { j as ControlSeverity, C as ControlEvalResult } from '../control-runtime-BZ_lVLYW.js';
2
+ import { T as TraceEmitter } from '../emitter-DP_cSSiw.js';
3
+ import '../store-Db2Bv8Cf.js';
4
4
 
5
5
  type KnowledgeRequirementCategory = 'user_specific' | 'company_specific' | 'domain_specific' | 'codebase_specific' | 'market_specific' | 'regulatory' | 'tool_api' | 'credential_or_secret' | 'runtime_environment' | 'preference' | 'historical_context';
6
6
  type KnowledgeAcquisitionMode = 'ask_user' | 'search_web' | 'query_connector' | 'inspect_repo' | 'run_command' | 'infer_low_confidence' | 'not_available';
@@ -1,9 +1,9 @@
1
- import { R as Run, T as TraceStore } from '../store-BP5be6s7.js';
1
+ import { R as Run, T as TraceStore } from '../store-Db2Bv8Cf.js';
2
2
  import { a as OutcomeFilter, O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
3
3
  export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from '../outcome-store-D6KWmYvj.js';
4
- export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-C0uDYwG6.js';
5
- import '../run-record-CqzahIbx.js';
6
- import '../errors-BZ9sTdz7.js';
4
+ export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-Bm-CbN46.js';
5
+ import '../run-record-nYf9x2hU.js';
6
+ import '../errors-mje_cKOs.js';
7
7
 
8
8
  /**
9
9
  * Correlation study — "does our eval score predict real-world outcomes?"
package/dist/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "@tangle-network/agent-eval — wire protocol",
5
- "version": "0.29.1",
5
+ "version": "0.31.0",
6
6
  "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
7
7
  "contact": {
8
8
  "name": "Tangle Network",
@@ -1,11 +1,11 @@
1
- export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-ClDX3KZx.js';
2
- export { F as FeedbackArtifactType, a as FeedbackAttempt, b as FeedbackLabel, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, l as FeedbackTrajectory, m as FeedbackTrajectoryFilter, n as FeedbackTrajectoryStore, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-j0nJFgC6.js';
3
- export { A as ActionableSideInfo, a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, G as GenerationReport, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, o as MutateAdapter, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, R as ReflectionContext, r as ReflectionProposal, S as ScenarioAggregate, s as ScoreAdapter, T as TrialCache, t as TrialResult, u as TrialTrace, V as VariantAggregate, v as buildReflectionPrompt, w as defaultMultiShotObjectives, x as parseReflectionResponse, y as runMultiShotOptimization, z as runPromptEvolution, B as trialTraceFromMultiShotTrial } from './summary-report-jrSGb2xZ.js';
4
- import './errors-BZ9sTdz7.js';
5
- import './integrity-BAxLGJ9I.js';
6
- import './store-BP5be6s7.js';
7
- import './run-record-CqzahIbx.js';
8
- import './emitter-BqjeOvJh.js';
9
- import './control-runtime-BRdQ0wrx.js';
10
- import './dataset-CiK_3LDr.js';
11
- import './failure-cluster-D1NZKqYu.js';
1
+ export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-BmgJ_901.js';
2
+ export { F as FeedbackArtifactType, a as FeedbackAttempt, b as FeedbackLabel, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, l as FeedbackTrajectory, m as FeedbackTrajectoryFilter, n as FeedbackTrajectoryStore, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-iATEAHmc.js';
3
+ export { A as ActionableSideInfo, a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, G as GenerationReport, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, o as MutateAdapter, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, R as ReflectionContext, r as ReflectionProposal, S as ScenarioAggregate, s as ScoreAdapter, T as TrialCache, t as TrialResult, u as TrialTrace, V as VariantAggregate, v as buildReflectionPrompt, w as defaultMultiShotObjectives, x as parseReflectionResponse, y as runMultiShotOptimization, z as runPromptEvolution, B as trialTraceFromMultiShotTrial } from './summary-report-dir7A-eQ.js';
4
+ import './errors-mje_cKOs.js';
5
+ import './integrity-DYR5gWlb.js';
6
+ import './store-Db2Bv8Cf.js';
7
+ import './run-record-nYf9x2hU.js';
8
+ import './emitter-DP_cSSiw.js';
9
+ import './control-runtime-BZ_lVLYW.js';
10
+ import './dataset-ueRVTUoY.js';
11
+ import './failure-cluster-Cw65_5FY.js';
@@ -25,19 +25,19 @@ import {
25
25
  summarizePreferenceMemory,
26
26
  trialTraceFromMultiShotTrial,
27
27
  withAssignedFeedbackSplit
28
- } from "./chunk-SZSBQUIJ.js";
29
- import "./chunk-NLMNWKVM.js";
28
+ } from "./chunk-Y2CPBYKH.js";
29
+ import "./chunk-WSI4K3WB.js";
30
30
  import {
31
31
  runEvalCampaign
32
- } from "./chunk-RUI6SIHY.js";
33
- import "./chunk-4S4BM3QQ.js";
34
- import "./chunk-5AKPEK5L.js";
35
- import "./chunk-R5UQJNKC.js";
36
- import "./chunk-KTGTIOFD.js";
32
+ } from "./chunk-75ZREHD7.js";
33
+ import "./chunk-M6RZ5LJN.js";
34
+ import "./chunk-CXJOVDJR.js";
35
+ import "./chunk-4L3WJXQJ.js";
36
+ import "./chunk-UBPIXOC4.js";
37
37
  import "./chunk-PC4UYEBM.js";
38
38
  import "./chunk-TVVP3ZZQ.js";
39
39
  import "./chunk-VSMTAMNK.js";
40
- import "./chunk-NG236HPC.js";
40
+ import "./chunk-QYJT52YW.js";
41
41
  import "./chunk-PZ5AY32C.js";
42
42
  export {
43
43
  CallbackResearcher,
@@ -1,9 +1,9 @@
1
- import { g as BudgetSpec, T as TraceStore, l as RunFilter, R as Run, a as ToolSpan } from '../store-BP5be6s7.js';
2
- export { a as FailureCluster, F as FailureClusterReport, f as failureClusterView } from '../failure-cluster-D1NZKqYu.js';
3
- import { a as TrajectoryStep } from '../trajectory-BFmveYZt.js';
4
- import { B as BaselineOptions, a as BaselineReport } from '../baseline-BwdCXUS8.js';
5
- export { c as computeToolUseMetrics } from '../baseline-BwdCXUS8.js';
6
- import { l as llmSpans } from '../query-BFDT0kX_.js';
1
+ import { g as BudgetSpec, T as TraceStore, h as RunFilter, R as Run, a as ToolSpan } from '../store-Db2Bv8Cf.js';
2
+ export { a as FailureCluster, F as FailureClusterReport, f as failureClusterView } from '../failure-cluster-Cw65_5FY.js';
3
+ import { a as TrajectoryStep } from '../trajectory-CnoBo-JY.js';
4
+ import { B as BaselineOptions, a as BaselineReport } from '../baseline-4R5deP0N.js';
5
+ export { c as computeToolUseMetrics } from '../baseline-4R5deP0N.js';
6
+ import { l as llmSpans } from '../query-DODUYdPg.js';
7
7
 
8
8
  /**
9
9
  * BudgetBreachView — aggregates breach events across the corpus.
@@ -2,13 +2,13 @@ import {
2
2
  compareToBaseline,
3
3
  computeToolUseMetrics,
4
4
  failureClusterView
5
- } from "../chunk-K33INZHH.js";
5
+ } from "../chunk-GVQT44CS.js";
6
6
  import {
7
7
  buildTrajectory
8
8
  } from "../chunk-RZTMDUO7.js";
9
9
  import {
10
10
  interRaterReliability
11
- } from "../chunk-R5UQJNKC.js";
11
+ } from "../chunk-4L3WJXQJ.js";
12
12
  import {
13
13
  aggregateLlm,
14
14
  argHash,
@@ -17,7 +17,7 @@ import {
17
17
  toolSpans
18
18
  } from "../chunk-47X6LRCE.js";
19
19
  import "../chunk-5BKGXME7.js";
20
- import "../chunk-NG236HPC.js";
20
+ import "../chunk-QYJT52YW.js";
21
21
  import "../chunk-PZ5AY32C.js";
22
22
 
23
23
  // src/pipelines/budget-breach.ts
@@ -1,7 +1,7 @@
1
- import { P as PrmGradedTrace, S as StepRubric, a as PrmGrader } from '../rubric-DgSqjqqj.js';
2
- export { G as GradedStep, b as StepContext, i as isPrmVerdict } from '../rubric-DgSqjqqj.js';
3
- import { S as Span, T as TraceStore } from '../store-BP5be6s7.js';
4
- import '../trajectory-BFmveYZt.js';
1
+ import { P as PrmGradedTrace, S as StepRubric, a as PrmGrader } from '../rubric-D5tjHNJQ.js';
2
+ export { G as GradedStep, b as StepContext, i as isPrmVerdict } from '../rubric-D5tjHNJQ.js';
3
+ import { S as Span, T as TraceStore } from '../store-Db2Bv8Cf.js';
4
+ import '../trajectory-CnoBo-JY.js';
5
5
 
6
6
  /**
7
7
  * Export PRM-graded traces as training data for downstream reward-model
@@ -1,4 +1,4 @@
1
- import { L as LlmSpan, T as TraceStore, J as JudgeSpan, R as Run, F as FailureClass, a as ToolSpan } from './store-BP5be6s7.js';
1
+ import { L as LlmSpan, T as TraceStore, J as JudgeSpan, R as Run, F as FailureClass, a as ToolSpan } from './store-Db2Bv8Cf.js';
2
2
 
3
3
  /**
4
4
  * Typed query helpers over TraceStore.
@@ -1,6 +1,6 @@
1
- import { D as DatasetSplit, b as DatasetManifest, a as DatasetScenario } from './dataset-CiK_3LDr.js';
2
- import { a3 as GateDecision, A as ActionableSideInfo, m as MultiShotTrialResult } from './summary-report-jrSGb2xZ.js';
3
- import { R as RunRecord, a as RunSplitTag } from './run-record-CqzahIbx.js';
1
+ import { D as DatasetSplit, b as DatasetManifest, a as DatasetScenario } from './dataset-ueRVTUoY.js';
2
+ import { a3 as GateDecision, A as ActionableSideInfo, m as MultiShotTrialResult } from './summary-report-dir7A-eQ.js';
3
+ import { R as RunRecord, a as RunSplitTag } from './run-record-nYf9x2hU.js';
4
4
 
5
5
  /**
6
6
  * Release confidence gate.
@@ -1,10 +1,10 @@
1
- export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-C0uDYwG6.js';
2
- export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as bhAdjust, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as pairedWilcoxon, s as releaseTraceEvidenceFromMultiShotTrials, t as renderReleaseReport } from './release-report-PWhGlpfO.js';
1
+ export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-Bm-CbN46.js';
2
+ export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as bhAdjust, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as pairedWilcoxon, s as releaseTraceEvidenceFromMultiShotTrials, t as renderReleaseReport } from './release-report-C8r4Vben.js';
3
3
  export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
4
- export { C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, J as ParetoFigureSpec, K as ParetoPoint, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, $ as gainHistogram, a0 as paretoChart, a1 as researchReport, a2 as summaryTable } from './summary-report-jrSGb2xZ.js';
5
- import './run-record-CqzahIbx.js';
6
- import './errors-BZ9sTdz7.js';
4
+ export { C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, J as ParetoFigureSpec, K as ParetoPoint, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, $ as gainHistogram, a0 as paretoChart, a1 as researchReport, a2 as summaryTable } from './summary-report-dir7A-eQ.js';
5
+ import './run-record-nYf9x2hU.js';
6
+ import './errors-mje_cKOs.js';
7
7
  import './outcome-store-D6KWmYvj.js';
8
- import './dataset-CiK_3LDr.js';
9
- import './failure-cluster-D1NZKqYu.js';
10
- import './store-BP5be6s7.js';
8
+ import './dataset-ueRVTUoY.js';
9
+ import './failure-cluster-Cw65_5FY.js';
10
+ import './store-Db2Bv8Cf.js';
package/dist/reporting.js CHANGED
@@ -5,7 +5,7 @@ import {
5
5
  judgeReplayGate,
6
6
  releaseTraceEvidenceFromMultiShotTrials,
7
7
  renderReleaseReport
8
- } from "./chunk-DBIGN5MJ.js";
8
+ } from "./chunk-WGXZAQLR.js";
9
9
  import {
10
10
  rubricPredictiveValidity
11
11
  } from "./chunk-YRZ4M5GS.js";
@@ -22,10 +22,10 @@ import {
22
22
  paretoChart,
23
23
  researchReport,
24
24
  summaryTable
25
- } from "./chunk-5AKPEK5L.js";
26
- import "./chunk-R5UQJNKC.js";
25
+ } from "./chunk-CXJOVDJR.js";
26
+ import "./chunk-4L3WJXQJ.js";
27
27
  import "./chunk-VSMTAMNK.js";
28
- import "./chunk-NG236HPC.js";
28
+ import "./chunk-QYJT52YW.js";
29
29
  import "./chunk-PZ5AY32C.js";
30
30
  export {
31
31
  RESEARCH_REPORT_HARD_PAIR_FLOOR,
@@ -1,9 +1,9 @@
1
- import { A as AgentEvalError, C as CaptureIntegrityError } from './errors-BZ9sTdz7.js';
2
- import { R as RawProviderSink, P as ProviderRedactor, a as RunIntegrityExpectations, b as RunIntegrityReport } from './integrity-BAxLGJ9I.js';
3
- import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, R as RunRecord } from './run-record-CqzahIbx.js';
4
- import { W as ResearchReportOptions, N as ResearchReport, a3 as GateDecision } from './summary-report-jrSGb2xZ.js';
5
- import { T as TraceEmitter, R as RunCompleteHook } from './emitter-BqjeOvJh.js';
6
- import { T as TraceStore } from './store-BP5be6s7.js';
1
+ import { A as AgentEvalError, C as CaptureIntegrityError } from './errors-mje_cKOs.js';
2
+ import { R as RawProviderSink, P as ProviderRedactor, a as RunIntegrityExpectations, b as RunIntegrityReport } from './integrity-DYR5gWlb.js';
3
+ import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, R as RunRecord } from './run-record-nYf9x2hU.js';
4
+ import { W as ResearchReportOptions, N as ResearchReport, a3 as GateDecision } from './summary-report-dir7A-eQ.js';
5
+ import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DP_cSSiw.js';
6
+ import { T as TraceStore } from './store-Db2Bv8Cf.js';
7
7
 
8
8
  /**
9
9
  * LLM client with graceful degrade.
package/dist/rl.d.ts CHANGED
@@ -1,16 +1,16 @@
1
- import { R as RunRecord, a as RunSplitTag } from './run-record-CqzahIbx.js';
1
+ import { R as RunRecord, a as RunSplitTag } from './run-record-nYf9x2hU.js';
2
2
  import { V as VerificationReport } from './multi-layer-verifier-BNi4-8lR.js';
3
- import { t as TrialResult, V as VariantAggregate, q as PromptEvolutionResult, e as MultiShotOptimizationResult } from './summary-report-jrSGb2xZ.js';
3
+ import { t as TrialResult, V as VariantAggregate, q as PromptEvolutionResult, e as MultiShotOptimizationResult } from './summary-report-dir7A-eQ.js';
4
4
  import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
5
- import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-C0uDYwG6.js';
5
+ import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-Bm-CbN46.js';
6
6
  import { I as InterimReleaseConfidence } from './sequential-5iSVfzl2.js';
7
- import { S as Span, T as TraceStore } from './store-BP5be6s7.js';
8
- import { R as Researcher, l as FailureMode, S as SteeringChange, j as ExperimentPlan, k as ExperimentResult, i as EvalCampaignResult, E as EvalCampaignOptions } from './researcher-ClDX3KZx.js';
9
- export { r as runEvalCampaign } from './researcher-ClDX3KZx.js';
10
- import './errors-BZ9sTdz7.js';
11
- import './failure-cluster-D1NZKqYu.js';
12
- import './integrity-BAxLGJ9I.js';
13
- import './emitter-BqjeOvJh.js';
7
+ import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
8
+ import { R as Researcher, l as FailureMode, S as SteeringChange, j as ExperimentPlan, k as ExperimentResult, i as EvalCampaignResult, E as EvalCampaignOptions } from './researcher-BmgJ_901.js';
9
+ export { r as runEvalCampaign } from './researcher-BmgJ_901.js';
10
+ import './errors-mje_cKOs.js';
11
+ import './failure-cluster-Cw65_5FY.js';
12
+ import './integrity-DYR5gWlb.js';
13
+ import './emitter-DP_cSSiw.js';
14
14
 
15
15
  /**
16
16
  * Test-time compute scaling curves.
package/dist/rl.js CHANGED
@@ -1,7 +1,7 @@
1
1
  import {
2
2
  runEvalCampaign
3
- } from "./chunk-RUI6SIHY.js";
4
- import "./chunk-4S4BM3QQ.js";
3
+ } from "./chunk-75ZREHD7.js";
4
+ import "./chunk-M6RZ5LJN.js";
5
5
  import {
6
6
  rubricPredictiveValidity
7
7
  } from "./chunk-YRZ4M5GS.js";
@@ -10,17 +10,17 @@ import {
10
10
  } from "./chunk-MAZ26DC7.js";
11
11
  import {
12
12
  benjaminiHochberg
13
- } from "./chunk-5AKPEK5L.js";
13
+ } from "./chunk-CXJOVDJR.js";
14
14
  import {
15
15
  wilcoxonSignedRank
16
- } from "./chunk-R5UQJNKC.js";
17
- import "./chunk-KTGTIOFD.js";
16
+ } from "./chunk-4L3WJXQJ.js";
17
+ import "./chunk-UBPIXOC4.js";
18
18
  import "./chunk-PC4UYEBM.js";
19
19
  import "./chunk-TVVP3ZZQ.js";
20
20
  import "./chunk-VSMTAMNK.js";
21
21
  import {
22
22
  ValidationError
23
- } from "./chunk-NG236HPC.js";
23
+ } from "./chunk-QYJT52YW.js";
24
24
  import "./chunk-PZ5AY32C.js";
25
25
 
26
26
  // src/rl/compute-curves.ts
@@ -1,5 +1,5 @@
1
- import { S as Span, T as TraceStore, J as JudgeSpan } from './store-BP5be6s7.js';
2
- import { T as Trajectory, a as TrajectoryStep } from './trajectory-BFmveYZt.js';
1
+ import { S as Span, T as TraceStore, J as JudgeSpan } from './store-Db2Bv8Cf.js';
2
+ import { T as Trajectory, a as TrajectoryStep } from './trajectory-CnoBo-JY.js';
3
3
 
4
4
  /**
5
5
  * Process Reward Modeling — per-step rubric grading.
@@ -1,4 +1,4 @@
1
- import { R as RunRecord } from './run-record-CqzahIbx.js';
1
+ import { R as RunRecord } from './run-record-nYf9x2hU.js';
2
2
  import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
3
3
 
4
4
  /**
@@ -1,4 +1,4 @@
1
- import { V as ValidationError } from './errors-BZ9sTdz7.js';
1
+ import { V as ValidationError } from './errors-mje_cKOs.js';
2
2
 
3
3
  /**
4
4
  * Paper-grade RunRecord schema + runtime validator.
@@ -294,4 +294,4 @@ declare class FileSystemTraceStore implements TraceStore {
294
294
  artifacts(runId: string): Promise<Artifact[]>;
295
295
  }
296
296
 
297
- export { type Artifact as A, type BudgetLedgerEntry as B, type EventKind as E, type FailureClass as F, type GenericSpan as G, InMemoryTraceStore as I, type JudgeSpan as J, type LlmSpan as L, type Message as M, type Run as R, type Span as S, type TraceStore as T, type ToolSpan as a, type TraceEvent as b, type RunOutcome as c, type SpanKind as d, type RetrievalSpan as e, type SandboxSpan as f, type BudgetSpec as g, type EventFilter as h, FAILURE_CLASSES as i, FileSystemTraceStore as j, type FileSystemTraceStoreOptions as k, type RunFilter as l, type RunLayer as m, type RunStatus as n, type SpanBase as o, type SpanFilter as p, type SpanStatus as q, TRACE_SCHEMA_VERSION as r, isJudgeSpan as s, isLlmSpan as t, isRetrievalSpan as u, isSandboxSpan as v, isToolSpan as w };
297
+ export { type Artifact as A, type BudgetLedgerEntry as B, type EventKind as E, type FailureClass as F, type GenericSpan as G, InMemoryTraceStore as I, type JudgeSpan as J, type LlmSpan as L, type Message as M, type Run as R, type Span as S, type TraceStore as T, type ToolSpan as a, type TraceEvent as b, type RunOutcome as c, type SpanKind as d, type RetrievalSpan as e, type SandboxSpan as f, type BudgetSpec as g, type RunFilter as h, type EventFilter as i, FAILURE_CLASSES as j, FileSystemTraceStore as k, type FileSystemTraceStoreOptions as l, type RunLayer as m, type RunStatus as n, type SpanBase as o, type SpanFilter as p, type SpanStatus as q, TRACE_SCHEMA_VERSION as r, isJudgeSpan as s, isLlmSpan as t, isRetrievalSpan as u, isSandboxSpan as v, isToolSpan as w };
@@ -1,5 +1,5 @@
1
- import { R as RunRecord, a as RunSplitTag } from './run-record-CqzahIbx.js';
2
- import { F as FailureClusterReport } from './failure-cluster-D1NZKqYu.js';
1
+ import { R as RunRecord, a as RunSplitTag } from './run-record-nYf9x2hU.js';
2
+ import { F as FailureClusterReport } from './failure-cluster-Cw65_5FY.js';
3
3
 
4
4
  /**
5
5
  * HeldOutGate — first-class held-out paired-delta promotion gate.
@@ -1,5 +1,5 @@
1
- import { T as TraceEmitter } from './emitter-BqjeOvJh.js';
2
- import { R as Run, F as FailureClass, T as TraceStore } from './store-BP5be6s7.js';
1
+ import { T as TraceEmitter } from './emitter-DP_cSSiw.js';
2
+ import { R as Run, F as FailureClass, T as TraceStore } from './store-Db2Bv8Cf.js';
3
3
 
4
4
  /**
5
5
  * SandboxHarness — executes a scenario in an isolated environment and