npm - @tangle-network/agent-eval - Versions diffs - 0.30.0 → 0.31.1 - Mend

@tangle-network/agent-eval 0.30.0 → 0.31.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (83)

package/dist/{integrity-BAxLGJ9I.d.ts → integrity-DYR5gWlb.d.ts} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { C as CaptureIntegrityError } from './errors-BZ9sTdz7.js';
-import { T as TraceStore } from './store-BP5be6s7.js';
+import { C as CaptureIntegrityError } from './errors-mje_cKOs.js';
+import { T as TraceStore } from './store-Db2Bv8Cf.js';
 /**
  * RawProviderSink — first-class persistence for the actual HTTP-level

package/dist/knowledge/index.d.ts CHANGED Viewed

@@ -1,6 +1,6 @@
-import { j as ControlSeverity, C as ControlEvalResult } from '../control-runtime-BRdQ0wrx.js';
-import { T as TraceEmitter } from '../emitter-BqjeOvJh.js';
-import '../store-BP5be6s7.js';
+import { j as ControlSeverity, C as ControlEvalResult } from '../control-runtime-BZ_lVLYW.js';
+import { T as TraceEmitter } from '../emitter-DP_cSSiw.js';
+import '../store-Db2Bv8Cf.js';
 type KnowledgeRequirementCategory = 'user_specific' | 'company_specific' | 'domain_specific' | 'codebase_specific' | 'market_specific' | 'regulatory' | 'tool_api' | 'credential_or_secret' | 'runtime_environment' | 'preference' | 'historical_context';
 type KnowledgeAcquisitionMode = 'ask_user' | 'search_web' | 'query_connector' | 'inspect_repo' | 'run_command' | 'infer_low_confidence' | 'not_available';

package/dist/meta-eval/index.d.ts CHANGED Viewed

@@ -1,9 +1,9 @@
-import { R as Run, T as TraceStore } from '../store-BP5be6s7.js';
+import { R as Run, T as TraceStore } from '../store-Db2Bv8Cf.js';
 import { a as OutcomeFilter, O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
 export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from '../outcome-store-D6KWmYvj.js';
-export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-C0uDYwG6.js';
-import '../run-record-CqzahIbx.js';
-import '../errors-BZ9sTdz7.js';
+export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-CMHypZ_M.js';
+import '../run-record-BfX5y68A.js';
+import '../errors-mje_cKOs.js';
 /**
  * Correlation study — "does our eval score predict real-world outcomes?"

package/dist/openapi.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "openapi": "3.1.0",
   "info": {
     "title": "@tangle-network/agent-eval — wire protocol",
-    "version": "0.30.0",
+    "version": "0.31.1",
     "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
     "contact": {
       "name": "Tangle Network",

package/dist/optimization.d.ts CHANGED Viewed

@@ -1,11 +1,11 @@
-export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-ClDX3KZx.js';
-export { F as FeedbackArtifactType, a as FeedbackAttempt, b as FeedbackLabel, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, l as FeedbackTrajectory, m as FeedbackTrajectoryFilter, n as FeedbackTrajectoryStore, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-j0nJFgC6.js';
-export { A as ActionableSideInfo, a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, G as GenerationReport, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, o as MutateAdapter, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, R as ReflectionContext, r as ReflectionProposal, S as ScenarioAggregate, s as ScoreAdapter, T as TrialCache, t as TrialResult, u as TrialTrace, V as VariantAggregate, v as buildReflectionPrompt, w as defaultMultiShotObjectives, x as parseReflectionResponse, y as runMultiShotOptimization, z as runPromptEvolution, B as trialTraceFromMultiShotTrial } from './summary-report-jrSGb2xZ.js';
-import './errors-BZ9sTdz7.js';
-import './integrity-BAxLGJ9I.js';
-import './store-BP5be6s7.js';
-import './run-record-CqzahIbx.js';
-import './emitter-BqjeOvJh.js';
-import './control-runtime-BRdQ0wrx.js';
-import './dataset-CiK_3LDr.js';
-import './failure-cluster-D1NZKqYu.js';
+export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-BRHa5Jxo.js';
+export { F as FeedbackArtifactType, a as FeedbackAttempt, b as FeedbackLabel, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, l as FeedbackTrajectory, m as FeedbackTrajectoryFilter, n as FeedbackTrajectoryStore, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-iATEAHmc.js';
+export { A as ActionableSideInfo, a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, G as GenerationReport, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, o as MutateAdapter, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, R as ReflectionContext, r as ReflectionProposal, S as ScenarioAggregate, s as ScoreAdapter, T as TrialCache, t as TrialResult, u as TrialTrace, V as VariantAggregate, v as buildReflectionPrompt, w as defaultMultiShotObjectives, x as parseReflectionResponse, y as runMultiShotOptimization, z as runPromptEvolution, B as trialTraceFromMultiShotTrial } from './summary-report-D7AQS7eB.js';
+import './errors-mje_cKOs.js';
+import './integrity-DYR5gWlb.js';
+import './store-Db2Bv8Cf.js';
+import './run-record-BfX5y68A.js';
+import './emitter-DP_cSSiw.js';
+import './control-runtime-BZ_lVLYW.js';
+import './dataset-ueRVTUoY.js';
+import './failure-cluster-Cw65_5FY.js';

package/dist/optimization.js CHANGED Viewed

@@ -25,19 +25,19 @@ import {
   summarizePreferenceMemory,
   trialTraceFromMultiShotTrial,
   withAssignedFeedbackSplit
-} from "./chunk-SZSBQUIJ.js";
-import "./chunk-NLMNWKVM.js";
+} from "./chunk-B73G44OH.js";
+import "./chunk-ZN2CMQIW.js";
 import {
   runEvalCampaign
-} from "./chunk-RUI6SIHY.js";
-import "./chunk-4S4BM3QQ.js";
-import "./chunk-5AKPEK5L.js";
-import "./chunk-R5UQJNKC.js";
-import "./chunk-KTGTIOFD.js";
+} from "./chunk-DTEJNZYK.js";
+import "./chunk-M6RZ5LJN.js";
+import "./chunk-CXJOVDJR.js";
+import "./chunk-4L3WJXQJ.js";
+import "./chunk-UBPIXOC4.js";
 import "./chunk-PC4UYEBM.js";
 import "./chunk-TVVP3ZZQ.js";
 import "./chunk-VSMTAMNK.js";
-import "./chunk-NG236HPC.js";
+import "./chunk-QYJT52YW.js";
 import "./chunk-PZ5AY32C.js";
 export {
   CallbackResearcher,

package/dist/pipelines/index.d.ts CHANGED Viewed

@@ -1,9 +1,9 @@
-import { g as BudgetSpec, T as TraceStore, l as RunFilter, R as Run, a as ToolSpan } from '../store-BP5be6s7.js';
-export { a as FailureCluster, F as FailureClusterReport, f as failureClusterView } from '../failure-cluster-D1NZKqYu.js';
-import { a as TrajectoryStep } from '../trajectory-BFmveYZt.js';
-import { B as BaselineOptions, a as BaselineReport } from '../baseline-BwdCXUS8.js';
-export { c as computeToolUseMetrics } from '../baseline-BwdCXUS8.js';
-import { l as llmSpans } from '../query-BFDT0kX_.js';
+import { g as BudgetSpec, T as TraceStore, h as RunFilter, R as Run, a as ToolSpan } from '../store-Db2Bv8Cf.js';
+export { a as FailureCluster, F as FailureClusterReport, f as failureClusterView } from '../failure-cluster-Cw65_5FY.js';
+import { a as TrajectoryStep } from '../trajectory-CnoBo-JY.js';
+import { B as BaselineOptions, a as BaselineReport } from '../baseline-4R5deP0N.js';
+export { c as computeToolUseMetrics } from '../baseline-4R5deP0N.js';
+import { l as llmSpans } from '../query-DODUYdPg.js';
 /**
  * BudgetBreachView — aggregates breach events across the corpus.

package/dist/pipelines/index.js CHANGED Viewed

@@ -2,13 +2,13 @@ import {
   compareToBaseline,
   computeToolUseMetrics,
   failureClusterView
-} from "../chunk-K33INZHH.js";
+} from "../chunk-GVQT44CS.js";
 import {
   buildTrajectory
 } from "../chunk-RZTMDUO7.js";
 import {
   interRaterReliability
-} from "../chunk-R5UQJNKC.js";
+} from "../chunk-4L3WJXQJ.js";
 import {
   aggregateLlm,
   argHash,
@@ -17,7 +17,7 @@ import {
   toolSpans
 } from "../chunk-47X6LRCE.js";
 import "../chunk-5BKGXME7.js";
-import "../chunk-NG236HPC.js";
+import "../chunk-QYJT52YW.js";
 import "../chunk-PZ5AY32C.js";
 // src/pipelines/budget-breach.ts

package/dist/prm/index.d.ts CHANGED Viewed

@@ -1,7 +1,7 @@
-import { P as PrmGradedTrace, S as StepRubric, a as PrmGrader } from '../rubric-DgSqjqqj.js';
-export { G as GradedStep, b as StepContext, i as isPrmVerdict } from '../rubric-DgSqjqqj.js';
-import { S as Span, T as TraceStore } from '../store-BP5be6s7.js';
-import '../trajectory-BFmveYZt.js';
+import { P as PrmGradedTrace, S as StepRubric, a as PrmGrader } from '../rubric-D5tjHNJQ.js';
+export { G as GradedStep, b as StepContext, i as isPrmVerdict } from '../rubric-D5tjHNJQ.js';
+import { S as Span, T as TraceStore } from '../store-Db2Bv8Cf.js';
+import '../trajectory-CnoBo-JY.js';
 /**
  * Export PRM-graded traces as training data for downstream reward-model

package/dist/{query-BFDT0kX_.d.ts → query-DODUYdPg.d.ts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { L as LlmSpan, T as TraceStore, J as JudgeSpan, R as Run, F as FailureClass, a as ToolSpan } from './store-BP5be6s7.js';
+import { L as LlmSpan, T as TraceStore, J as JudgeSpan, R as Run, F as FailureClass, a as ToolSpan } from './store-Db2Bv8Cf.js';
 /**
  * Typed query helpers over TraceStore.

package/dist/{release-report-PWhGlpfO.d.ts → release-report-DLWbBPtH.d.ts} RENAMED Viewed

@@ -1,6 +1,6 @@
-import { D as DatasetSplit, b as DatasetManifest, a as DatasetScenario } from './dataset-CiK_3LDr.js';
-import { a3 as GateDecision, A as ActionableSideInfo, m as MultiShotTrialResult } from './summary-report-jrSGb2xZ.js';
-import { R as RunRecord, a as RunSplitTag } from './run-record-CqzahIbx.js';
+import { D as DatasetSplit, b as DatasetManifest, a as DatasetScenario } from './dataset-ueRVTUoY.js';
+import { a3 as GateDecision, A as ActionableSideInfo, m as MultiShotTrialResult } from './summary-report-D7AQS7eB.js';
+import { R as RunRecord, a as RunSplitTag } from './run-record-BfX5y68A.js';
 /**
  * Release confidence gate.

package/dist/reporting.d.ts CHANGED Viewed

@@ -1,10 +1,10 @@
-export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-C0uDYwG6.js';
-export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as bhAdjust, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as pairedWilcoxon, s as releaseTraceEvidenceFromMultiShotTrials, t as renderReleaseReport } from './release-report-PWhGlpfO.js';
+export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-CMHypZ_M.js';
+export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as bhAdjust, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as pairedWilcoxon, s as releaseTraceEvidenceFromMultiShotTrials, t as renderReleaseReport } from './release-report-DLWbBPtH.js';
 export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
-export { C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, J as ParetoFigureSpec, K as ParetoPoint, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, $ as gainHistogram, a0 as paretoChart, a1 as researchReport, a2 as summaryTable } from './summary-report-jrSGb2xZ.js';
-import './run-record-CqzahIbx.js';
-import './errors-BZ9sTdz7.js';
+export { C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, J as ParetoFigureSpec, K as ParetoPoint, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, $ as gainHistogram, a0 as paretoChart, a1 as researchReport, a2 as summaryTable } from './summary-report-D7AQS7eB.js';
+import './run-record-BfX5y68A.js';
+import './errors-mje_cKOs.js';
 import './outcome-store-D6KWmYvj.js';
-import './dataset-CiK_3LDr.js';
-import './failure-cluster-D1NZKqYu.js';
-import './store-BP5be6s7.js';
+import './dataset-ueRVTUoY.js';
+import './failure-cluster-Cw65_5FY.js';
+import './store-Db2Bv8Cf.js';

package/dist/reporting.js CHANGED Viewed

@@ -5,7 +5,7 @@ import {
   judgeReplayGate,
   releaseTraceEvidenceFromMultiShotTrials,
   renderReleaseReport
-} from "./chunk-DBIGN5MJ.js";
+} from "./chunk-WGXZAQLR.js";
 import {
   rubricPredictiveValidity
 } from "./chunk-YRZ4M5GS.js";
@@ -22,10 +22,10 @@ import {
   paretoChart,
   researchReport,
   summaryTable
-} from "./chunk-5AKPEK5L.js";
-import "./chunk-R5UQJNKC.js";
+} from "./chunk-CXJOVDJR.js";
+import "./chunk-4L3WJXQJ.js";
 import "./chunk-VSMTAMNK.js";
-import "./chunk-NG236HPC.js";
+import "./chunk-QYJT52YW.js";
 import "./chunk-PZ5AY32C.js";
 export {
   RESEARCH_REPORT_HARD_PAIR_FLOOR,

package/dist/{researcher-ClDX3KZx.d.ts → researcher-BRHa5Jxo.d.ts} RENAMED Viewed

@@ -1,9 +1,9 @@
-import { A as AgentEvalError, C as CaptureIntegrityError } from './errors-BZ9sTdz7.js';
-import { R as RawProviderSink, P as ProviderRedactor, a as RunIntegrityExpectations, b as RunIntegrityReport } from './integrity-BAxLGJ9I.js';
-import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, R as RunRecord } from './run-record-CqzahIbx.js';
-import { W as ResearchReportOptions, N as ResearchReport, a3 as GateDecision } from './summary-report-jrSGb2xZ.js';
-import { T as TraceEmitter, R as RunCompleteHook } from './emitter-BqjeOvJh.js';
-import { T as TraceStore } from './store-BP5be6s7.js';
+import { A as AgentEvalError, C as CaptureIntegrityError } from './errors-mje_cKOs.js';
+import { R as RawProviderSink, P as ProviderRedactor, a as RunIntegrityExpectations, b as RunIntegrityReport } from './integrity-DYR5gWlb.js';
+import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, R as RunRecord } from './run-record-BfX5y68A.js';
+import { W as ResearchReportOptions, N as ResearchReport, a3 as GateDecision } from './summary-report-D7AQS7eB.js';
+import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DP_cSSiw.js';
+import { T as TraceStore } from './store-Db2Bv8Cf.js';
 /**
  * LLM client with graceful degrade.
@@ -316,6 +316,12 @@ interface CampaignRunOutcome {
     failureMode?: string;
     /** Optional judge metadata when a judge was used. */
     judgeMetadata?: RunJudgeMetadata;
+    /**
+     * Optional per-judge / per-dim breakdown for ensemble-judged runs.
+     * Propagated to `outcome.judgeScores` on the resulting `RunRecord`.
+     * Single-judge or scalar-only runs leave this unset.
+     */
+    judgeScores?: JudgeScoresRecord;
 }
 type CampaignRunner<V> = (ctx: CampaignRunContext<V>) => Promise<CampaignRunOutcome>;
 type CampaignIntegrityPolicy = 'throw' | 'mark_failed' | 'log';

package/dist/rl.d.ts CHANGED Viewed

@@ -1,16 +1,16 @@
-import { R as RunRecord, a as RunSplitTag } from './run-record-CqzahIbx.js';
+import { R as RunRecord, a as RunSplitTag } from './run-record-BfX5y68A.js';
 import { V as VerificationReport } from './multi-layer-verifier-BNi4-8lR.js';
-import { t as TrialResult, V as VariantAggregate, q as PromptEvolutionResult, e as MultiShotOptimizationResult } from './summary-report-jrSGb2xZ.js';
+import { t as TrialResult, V as VariantAggregate, q as PromptEvolutionResult, e as MultiShotOptimizationResult } from './summary-report-D7AQS7eB.js';
 import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
-import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-C0uDYwG6.js';
+import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-CMHypZ_M.js';
 import { I as InterimReleaseConfidence } from './sequential-5iSVfzl2.js';
-import { S as Span, T as TraceStore } from './store-BP5be6s7.js';
-import { R as Researcher, l as FailureMode, S as SteeringChange, j as ExperimentPlan, k as ExperimentResult, i as EvalCampaignResult, E as EvalCampaignOptions } from './researcher-ClDX3KZx.js';
-export { r as runEvalCampaign } from './researcher-ClDX3KZx.js';
-import './errors-BZ9sTdz7.js';
-import './failure-cluster-D1NZKqYu.js';
-import './integrity-BAxLGJ9I.js';
-import './emitter-BqjeOvJh.js';
+import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
+import { R as Researcher, l as FailureMode, S as SteeringChange, j as ExperimentPlan, k as ExperimentResult, i as EvalCampaignResult, E as EvalCampaignOptions } from './researcher-BRHa5Jxo.js';
+export { r as runEvalCampaign } from './researcher-BRHa5Jxo.js';
+import './errors-mje_cKOs.js';
+import './failure-cluster-Cw65_5FY.js';
+import './integrity-DYR5gWlb.js';
+import './emitter-DP_cSSiw.js';
 /**
  * Test-time compute scaling curves.

package/dist/rl.js CHANGED Viewed

@@ -1,7 +1,7 @@
 import {
   runEvalCampaign
-} from "./chunk-RUI6SIHY.js";
-import "./chunk-4S4BM3QQ.js";
+} from "./chunk-DTEJNZYK.js";
+import "./chunk-M6RZ5LJN.js";
 import {
   rubricPredictiveValidity
 } from "./chunk-YRZ4M5GS.js";
@@ -10,17 +10,17 @@ import {
 } from "./chunk-MAZ26DC7.js";
 import {
   benjaminiHochberg
-} from "./chunk-5AKPEK5L.js";
+} from "./chunk-CXJOVDJR.js";
 import {
   wilcoxonSignedRank
-} from "./chunk-R5UQJNKC.js";
-import "./chunk-KTGTIOFD.js";
+} from "./chunk-4L3WJXQJ.js";
+import "./chunk-UBPIXOC4.js";
 import "./chunk-PC4UYEBM.js";
 import "./chunk-TVVP3ZZQ.js";
 import "./chunk-VSMTAMNK.js";
 import {
   ValidationError
-} from "./chunk-NG236HPC.js";
+} from "./chunk-QYJT52YW.js";
 import "./chunk-PZ5AY32C.js";
 // src/rl/compute-curves.ts

package/dist/{rubric-DgSqjqqj.d.ts → rubric-D5tjHNJQ.d.ts} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { S as Span, T as TraceStore, J as JudgeSpan } from './store-BP5be6s7.js';
-import { T as Trajectory, a as TrajectoryStep } from './trajectory-BFmveYZt.js';
+import { S as Span, T as TraceStore, J as JudgeSpan } from './store-Db2Bv8Cf.js';
+import { T as Trajectory, a as TrajectoryStep } from './trajectory-CnoBo-JY.js';
 /**
  * Process Reward Modeling — per-step rubric grading.

package/dist/{rubric-predictive-validity-C0uDYwG6.d.ts → rubric-predictive-validity-CMHypZ_M.d.ts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { R as RunRecord } from './run-record-CqzahIbx.js';
+import { R as RunRecord } from './run-record-BfX5y68A.js';
 import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
 /**

package/dist/{run-record-CqzahIbx.d.ts → run-record-BfX5y68A.d.ts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { V as ValidationError } from './errors-BZ9sTdz7.js';
+import { V as ValidationError } from './errors-mje_cKOs.js';
 /**
  * Paper-grade RunRecord schema + runtime validator.
@@ -42,6 +42,41 @@ interface RunJudgeMetadata {
      *  prior-call cache, etc.). The canary uses this to alert. */
     fallback: boolean;
 }
+/**
+ * Per-judge / per-dimension breakdown for runs scored by an ensemble of
+ * judges over a multi-dimensional rubric.
+ *
+ * The collapsed `outcome.searchScore` / `holdoutScore` carries the
+ * composite the gate uses. The full breakdown belongs here so consumers
+ * can answer "which judge disagreed?", "which dimension dragged the
+ * composite down?", and "did half the panel fail?" without re-running.
+ *
+ * `perJudge[judgeId][dim]` is the canonical source; `perDimMean` and
+ * `composite` are convenience projections — derivable but precomputed so
+ * downstream IRR primitives (`interRaterReliability`,
+ * `corpusInterRaterAgreement`) and reporters don't pay the same
+ * aggregation twice.
+ *
+ * Fail-loud discipline: judges that errored out land in `failedJudges`
+ * by id. A missing key in `perJudge` is ambiguous (silent zero vs not
+ * run); the explicit list makes a partial-failure recorded as such.
+ */
+interface JudgeScoresRecord {
+    /** Per-judge per-dimension scores. `{ "kimi-k2.6": { helpfulness: 0.8, clarity: 0.7 }, ... }`. */
+    perJudge: Record<string, Record<string, number>>;
+    /** Per-dim mean across judges. Convenience — derivable from `perJudge`. */
+    perDimMean: Record<string, number>;
+    /** Composite mean across all dims and judges. Mirrors the score
+     *  the gate sees on `outcome.searchScore` / `holdoutScore`. */
+    composite: number;
+    /** Judges that errored or returned an unparseable verdict. Recorded
+     *  by id (e.g. `['glm-5.1']`) so a partial-failure case is explicit,
+     *  not inferred from missing keys in `perJudge`. */
+    failedJudges?: string[];
+    /** Free-form notes the judges emitted (joined across judges or
+     *  first-judge only — consumer's choice). */
+    notes?: string;
+}
 interface RunOutcome {
     /** Score on the search/optimization split. Optional because a
      *  holdout-only evaluation only fills `holdoutScore`. */
@@ -53,6 +88,12 @@ interface RunOutcome {
      *  pass/fail counters, latency stats, etc. Numeric only — keeps
      *  reporters honest. */
     raw: Record<string, number>;
+    /** Per-judge / per-dim breakdown. Consumers writing ensemble
+     *  judgements populate this; substrate primitives like
+     *  `interRaterReliability` and `corpusInterRaterAgreement` accept
+     *  these records as input. Optional — single-judge or scalar-only
+     *  runs leave it unset. */
+    judgeScores?: JudgeScoresRecord;
 }
 /**
  * Mandatory paper-grade fields for a single evaluation run. Optional
@@ -143,4 +184,4 @@ declare function parseRunRecordSafe(input: unknown): {
 /** Round-trip helper — `JSON.parse(JSON.stringify(record))` then validate. */
 declare function roundTripRunRecord(record: RunRecord): RunRecord;
-export { type RunRecord as R, type RunSplitTag as a, type RunTokenUsage as b, type RunJudgeMetadata as c, type RunOutcome as d, RunRecordValidationError as e, isRunRecord as i, parseRunRecordSafe as p, roundTripRunRecord as r, validateRunRecord as v };
+export { type JudgeScoresRecord as J, type RunRecord as R, type RunSplitTag as a, type RunTokenUsage as b, type RunJudgeMetadata as c, type RunOutcome as d, RunRecordValidationError as e, isRunRecord as i, parseRunRecordSafe as p, roundTripRunRecord as r, validateRunRecord as v };

package/dist/{store-BP5be6s7.d.ts → store-Db2Bv8Cf.d.ts} RENAMED Viewed

@@ -294,4 +294,4 @@ declare class FileSystemTraceStore implements TraceStore {
     artifacts(runId: string): Promise<Artifact[]>;
 }
-export { type Artifact as A, type BudgetLedgerEntry as B, type EventKind as E, type FailureClass as F, type GenericSpan as G, InMemoryTraceStore as I, type JudgeSpan as J, type LlmSpan as L, type Message as M, type Run as R, type Span as S, type TraceStore as T, type ToolSpan as a, type TraceEvent as b, type RunOutcome as c, type SpanKind as d, type RetrievalSpan as e, type SandboxSpan as f, type BudgetSpec as g, type EventFilter as h, FAILURE_CLASSES as i, FileSystemTraceStore as j, type FileSystemTraceStoreOptions as k, type RunFilter as l, type RunLayer as m, type RunStatus as n, type SpanBase as o, type SpanFilter as p, type SpanStatus as q, TRACE_SCHEMA_VERSION as r, isJudgeSpan as s, isLlmSpan as t, isRetrievalSpan as u, isSandboxSpan as v, isToolSpan as w };
+export { type Artifact as A, type BudgetLedgerEntry as B, type EventKind as E, type FailureClass as F, type GenericSpan as G, InMemoryTraceStore as I, type JudgeSpan as J, type LlmSpan as L, type Message as M, type Run as R, type Span as S, type TraceStore as T, type ToolSpan as a, type TraceEvent as b, type RunOutcome as c, type SpanKind as d, type RetrievalSpan as e, type SandboxSpan as f, type BudgetSpec as g, type RunFilter as h, type EventFilter as i, FAILURE_CLASSES as j, FileSystemTraceStore as k, type FileSystemTraceStoreOptions as l, type RunLayer as m, type RunStatus as n, type SpanBase as o, type SpanFilter as p, type SpanStatus as q, TRACE_SCHEMA_VERSION as r, isJudgeSpan as s, isLlmSpan as t, isRetrievalSpan as u, isSandboxSpan as v, isToolSpan as w };

package/dist/{summary-report-jrSGb2xZ.d.ts → summary-report-D7AQS7eB.d.ts} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { R as RunRecord, a as RunSplitTag } from './run-record-CqzahIbx.js';
-import { F as FailureClusterReport } from './failure-cluster-D1NZKqYu.js';
+import { R as RunRecord, a as RunSplitTag } from './run-record-BfX5y68A.js';
+import { F as FailureClusterReport } from './failure-cluster-Cw65_5FY.js';
 /**
  * HeldOutGate — first-class held-out paired-delta promotion gate.

package/dist/{test-graded-scenario-BJ54PDan.d.ts → test-graded-scenario-B2kWEdh9.d.ts} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { T as TraceEmitter } from './emitter-BqjeOvJh.js';
-import { R as Run, F as FailureClass, T as TraceStore } from './store-BP5be6s7.js';
+import { T as TraceEmitter } from './emitter-DP_cSSiw.js';
+import { R as Run, F as FailureClass, T as TraceStore } from './store-Db2Bv8Cf.js';
 /**
  * SandboxHarness — executes a scenario in an isolated environment and