@tangle-network/agent-eval 0.30.0 → 0.31.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +79 -0
- package/dist/{baseline-BwdCXUS8.d.ts → baseline-4R5deP0N.d.ts} +1 -1
- package/dist/benchmarks/index.d.ts +3 -3
- package/dist/builder-eval/index.d.ts +3 -3
- package/dist/builder-eval/index.js +2 -2
- package/dist/{chunk-R5UQJNKC.js → chunk-4L3WJXQJ.js} +2 -2
- package/dist/{chunk-SZSBQUIJ.js → chunk-B73G44OH.js} +3 -3
- package/dist/{chunk-5AKPEK5L.js → chunk-CXJOVDJR.js} +2 -2
- package/dist/{chunk-RUI6SIHY.js → chunk-DTEJNZYK.js} +5 -4
- package/dist/chunk-DTEJNZYK.js.map +1 -0
- package/dist/{chunk-K33INZHH.js → chunk-GVQT44CS.js} +2 -2
- package/dist/{chunk-UW4NOOZI.js → chunk-HIO4UIS5.js} +308 -2
- package/dist/chunk-HIO4UIS5.js.map +1 -0
- package/dist/{chunk-4S4BM3QQ.js → chunk-M6RZ5LJN.js} +2 -2
- package/dist/{chunk-NG236HPC.js → chunk-QYJT52YW.js} +1 -1
- package/dist/chunk-QYJT52YW.js.map +1 -0
- package/dist/{chunk-PALJO75S.js → chunk-S4Y5VXMS.js} +2 -2
- package/dist/{chunk-XFZCM5Z3.js → chunk-SMSGXM74.js} +2 -2
- package/dist/{chunk-KTGTIOFD.js → chunk-UBPIXOC4.js} +2 -2
- package/dist/{chunk-DBIGN5MJ.js → chunk-WGXZAQLR.js} +3 -3
- package/dist/{chunk-QHF6EQKK.js → chunk-YTMXBHFM.js} +2 -2
- package/dist/{chunk-NLMNWKVM.js → chunk-ZN2CMQIW.js} +54 -2
- package/dist/chunk-ZN2CMQIW.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{control-rJhEDdpy.d.ts → control-p2ns7elI.d.ts} +5 -5
- package/dist/{control-runtime-BRdQ0wrx.d.ts → control-runtime-BZ_lVLYW.d.ts} +2 -2
- package/dist/control.d.ts +8 -8
- package/dist/control.js +3 -3
- package/dist/{dataset-CiK_3LDr.d.ts → dataset-ueRVTUoY.d.ts} +1 -1
- package/dist/{emitter-BqjeOvJh.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
- package/dist/{errors-BZ9sTdz7.d.ts → errors-mje_cKOs.d.ts} +1 -1
- package/dist/{failure-cluster-D1NZKqYu.d.ts → failure-cluster-Cw65_5FY.d.ts} +1 -1
- package/dist/{feedback-trajectory-j0nJFgC6.d.ts → feedback-trajectory-iATEAHmc.d.ts} +2 -2
- package/dist/governance/index.d.ts +4 -4
- package/dist/{index--fVrWDiR.d.ts → index-BTqhGHJT.d.ts} +1 -1
- package/dist/{index-Cgt3DKXr.d.ts → index-DPILdKbP.d.ts} +2 -2
- package/dist/index.d.ts +108 -38
- package/dist/index.js +159 -14
- package/dist/index.js.map +1 -1
- package/dist/{integrity-BAxLGJ9I.d.ts → integrity-DYR5gWlb.d.ts} +2 -2
- package/dist/knowledge/index.d.ts +3 -3
- package/dist/meta-eval/index.d.ts +4 -4
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +11 -11
- package/dist/optimization.js +8 -8
- package/dist/pipelines/index.d.ts +6 -6
- package/dist/pipelines/index.js +3 -3
- package/dist/prm/index.d.ts +4 -4
- package/dist/{query-BFDT0kX_.d.ts → query-DODUYdPg.d.ts} +1 -1
- package/dist/{release-report-PWhGlpfO.d.ts → release-report-DLWbBPtH.d.ts} +3 -3
- package/dist/reporting.d.ts +8 -8
- package/dist/reporting.js +4 -4
- package/dist/{researcher-ClDX3KZx.d.ts → researcher-BRHa5Jxo.d.ts} +12 -6
- package/dist/rl.d.ts +10 -10
- package/dist/rl.js +6 -6
- package/dist/{rubric-DgSqjqqj.d.ts → rubric-D5tjHNJQ.d.ts} +2 -2
- package/dist/{rubric-predictive-validity-C0uDYwG6.d.ts → rubric-predictive-validity-CMHypZ_M.d.ts} +1 -1
- package/dist/{run-record-CqzahIbx.d.ts → run-record-BfX5y68A.d.ts} +43 -2
- package/dist/{store-BP5be6s7.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
- package/dist/{summary-report-jrSGb2xZ.d.ts → summary-report-D7AQS7eB.d.ts} +2 -2
- package/dist/{test-graded-scenario-BJ54PDan.d.ts → test-graded-scenario-B2kWEdh9.d.ts} +2 -2
- package/dist/traces.d.ts +533 -10
- package/dist/traces.js +14 -300
- package/dist/traces.js.map +1 -1
- package/dist/{trajectory-BFmveYZt.d.ts → trajectory-CnoBo-JY.d.ts} +1 -1
- package/dist/wire/index.d.ts +6 -6
- package/dist/wire/index.js +3 -3
- package/package.json +12 -21
- package/dist/chunk-NG236HPC.js.map +0 -1
- package/dist/chunk-NLMNWKVM.js.map +0 -1
- package/dist/chunk-RUI6SIHY.js.map +0 -1
- package/dist/chunk-UW4NOOZI.js.map +0 -1
- package/dist/replay-BX5Fm8en.d.ts +0 -529
- /package/dist/{chunk-R5UQJNKC.js.map → chunk-4L3WJXQJ.js.map} +0 -0
- /package/dist/{chunk-SZSBQUIJ.js.map → chunk-B73G44OH.js.map} +0 -0
- /package/dist/{chunk-5AKPEK5L.js.map → chunk-CXJOVDJR.js.map} +0 -0
- /package/dist/{chunk-K33INZHH.js.map → chunk-GVQT44CS.js.map} +0 -0
- /package/dist/{chunk-4S4BM3QQ.js.map → chunk-M6RZ5LJN.js.map} +0 -0
- /package/dist/{chunk-PALJO75S.js.map → chunk-S4Y5VXMS.js.map} +0 -0
- /package/dist/{chunk-XFZCM5Z3.js.map → chunk-SMSGXM74.js.map} +0 -0
- /package/dist/{chunk-KTGTIOFD.js.map → chunk-UBPIXOC4.js.map} +0 -0
- /package/dist/{chunk-DBIGN5MJ.js.map → chunk-WGXZAQLR.js.map} +0 -0
- /package/dist/{chunk-QHF6EQKK.js.map → chunk-YTMXBHFM.js.map} +0 -0
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { C as CaptureIntegrityError } from './errors-BZ9sTdz7.js';
|
|
2
|
-
import { T as TraceStore } from './store-BP5be6s7.js';
|
|
1
|
+
import { C as CaptureIntegrityError } from './errors-mje_cKOs.js';
|
|
2
|
+
import { T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
3
3
|
|
|
4
4
|
/**
|
|
5
5
|
* RawProviderSink — first-class persistence for the actual HTTP-level
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import { j as ControlSeverity, C as ControlEvalResult } from '../control-runtime-BRdQ0wrx.js';
|
|
2
|
-
import { T as TraceEmitter } from '../emitter-BqjeOvJh.js';
|
|
3
|
-
import '../store-BP5be6s7.js';
|
|
1
|
+
import { j as ControlSeverity, C as ControlEvalResult } from '../control-runtime-BZ_lVLYW.js';
|
|
2
|
+
import { T as TraceEmitter } from '../emitter-DP_cSSiw.js';
|
|
3
|
+
import '../store-Db2Bv8Cf.js';
|
|
4
4
|
|
|
5
5
|
type KnowledgeRequirementCategory = 'user_specific' | 'company_specific' | 'domain_specific' | 'codebase_specific' | 'market_specific' | 'regulatory' | 'tool_api' | 'credential_or_secret' | 'runtime_environment' | 'preference' | 'historical_context';
|
|
6
6
|
type KnowledgeAcquisitionMode = 'ask_user' | 'search_web' | 'query_connector' | 'inspect_repo' | 'run_command' | 'infer_low_confidence' | 'not_available';
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
import { R as Run, T as TraceStore } from '../store-BP5be6s7.js';
|
|
1
|
+
import { R as Run, T as TraceStore } from '../store-Db2Bv8Cf.js';
|
|
2
2
|
import { a as OutcomeFilter, O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
|
|
3
3
|
export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from '../outcome-store-D6KWmYvj.js';
|
|
4
|
-
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-C0uDYwG6.js';
|
|
5
|
-
import '../run-record-CqzahIbx.js';
|
|
6
|
-
import '../errors-BZ9sTdz7.js';
|
|
4
|
+
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-CMHypZ_M.js';
|
|
5
|
+
import '../run-record-BfX5y68A.js';
|
|
6
|
+
import '../errors-mje_cKOs.js';
|
|
7
7
|
|
|
8
8
|
/**
|
|
9
9
|
* Correlation study — "does our eval score predict real-world outcomes?"
|
package/dist/openapi.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"openapi": "3.1.0",
|
|
3
3
|
"info": {
|
|
4
4
|
"title": "@tangle-network/agent-eval — wire protocol",
|
|
5
|
-
"version": "0.
|
|
5
|
+
"version": "0.31.1",
|
|
6
6
|
"description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
|
|
7
7
|
"contact": {
|
|
8
8
|
"name": "Tangle Network",
|
package/dist/optimization.d.ts
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-ClDX3KZx.js';
|
|
2
|
-
export { F as FeedbackArtifactType, a as FeedbackAttempt, b as FeedbackLabel, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, l as FeedbackTrajectory, m as FeedbackTrajectoryFilter, n as FeedbackTrajectoryStore, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-
|
|
3
|
-
export { A as ActionableSideInfo, a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, G as GenerationReport, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, o as MutateAdapter, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, R as ReflectionContext, r as ReflectionProposal, S as ScenarioAggregate, s as ScoreAdapter, T as TrialCache, t as TrialResult, u as TrialTrace, V as VariantAggregate, v as buildReflectionPrompt, w as defaultMultiShotObjectives, x as parseReflectionResponse, y as runMultiShotOptimization, z as runPromptEvolution, B as trialTraceFromMultiShotTrial } from './summary-report-
|
|
4
|
-
import './errors-BZ9sTdz7.js';
|
|
5
|
-
import './integrity-BAxLGJ9I.js';
|
|
6
|
-
import './store-BP5be6s7.js';
|
|
7
|
-
import './run-record-CqzahIbx.js';
|
|
8
|
-
import './emitter-BqjeOvJh.js';
|
|
9
|
-
import './control-runtime-BRdQ0wrx.js';
|
|
10
|
-
import './dataset-CiK_3LDr.js';
|
|
11
|
-
import './failure-cluster-D1NZKqYu.js';
|
|
1
|
+
export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-BRHa5Jxo.js';
|
|
2
|
+
export { F as FeedbackArtifactType, a as FeedbackAttempt, b as FeedbackLabel, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, l as FeedbackTrajectory, m as FeedbackTrajectoryFilter, n as FeedbackTrajectoryStore, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-iATEAHmc.js';
|
|
3
|
+
export { A as ActionableSideInfo, a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, G as GenerationReport, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, o as MutateAdapter, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, R as ReflectionContext, r as ReflectionProposal, S as ScenarioAggregate, s as ScoreAdapter, T as TrialCache, t as TrialResult, u as TrialTrace, V as VariantAggregate, v as buildReflectionPrompt, w as defaultMultiShotObjectives, x as parseReflectionResponse, y as runMultiShotOptimization, z as runPromptEvolution, B as trialTraceFromMultiShotTrial } from './summary-report-D7AQS7eB.js';
|
|
4
|
+
import './errors-mje_cKOs.js';
|
|
5
|
+
import './integrity-DYR5gWlb.js';
|
|
6
|
+
import './store-Db2Bv8Cf.js';
|
|
7
|
+
import './run-record-BfX5y68A.js';
|
|
8
|
+
import './emitter-DP_cSSiw.js';
|
|
9
|
+
import './control-runtime-BZ_lVLYW.js';
|
|
10
|
+
import './dataset-ueRVTUoY.js';
|
|
11
|
+
import './failure-cluster-Cw65_5FY.js';
|
package/dist/optimization.js
CHANGED
|
@@ -25,19 +25,19 @@ import {
|
|
|
25
25
|
summarizePreferenceMemory,
|
|
26
26
|
trialTraceFromMultiShotTrial,
|
|
27
27
|
withAssignedFeedbackSplit
|
|
28
|
-
} from "./chunk-SZSBQUIJ.js";
|
|
29
|
-
import "./chunk-NLMNWKVM.js";
|
|
28
|
+
} from "./chunk-B73G44OH.js";
|
|
29
|
+
import "./chunk-ZN2CMQIW.js";
|
|
30
30
|
import {
|
|
31
31
|
runEvalCampaign
|
|
32
|
-
} from "./chunk-RUI6SIHY.js";
|
|
33
|
-
import "./chunk-4S4BM3QQ.js";
|
|
34
|
-
import "./chunk-5AKPEK5L.js";
|
|
35
|
-
import "./chunk-R5UQJNKC.js";
|
|
36
|
-
import "./chunk-KTGTIOFD.js";
|
|
32
|
+
} from "./chunk-DTEJNZYK.js";
|
|
33
|
+
import "./chunk-M6RZ5LJN.js";
|
|
34
|
+
import "./chunk-CXJOVDJR.js";
|
|
35
|
+
import "./chunk-4L3WJXQJ.js";
|
|
36
|
+
import "./chunk-UBPIXOC4.js";
|
|
37
37
|
import "./chunk-PC4UYEBM.js";
|
|
38
38
|
import "./chunk-TVVP3ZZQ.js";
|
|
39
39
|
import "./chunk-VSMTAMNK.js";
|
|
40
|
-
import "./chunk-NG236HPC.js";
|
|
40
|
+
import "./chunk-QYJT52YW.js";
|
|
41
41
|
import "./chunk-PZ5AY32C.js";
|
|
42
42
|
export {
|
|
43
43
|
CallbackResearcher,
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
import { g as BudgetSpec, T as TraceStore,
|
|
2
|
-
export { a as FailureCluster, F as FailureClusterReport, f as failureClusterView } from '../failure-cluster-D1NZKqYu.js';
|
|
3
|
-
import { a as TrajectoryStep } from '../trajectory-BFmveYZt.js';
|
|
4
|
-
import { B as BaselineOptions, a as BaselineReport } from '../baseline-BwdCXUS8.js';
|
|
5
|
-
export { c as computeToolUseMetrics } from '../baseline-BwdCXUS8.js';
|
|
6
|
-
import { l as llmSpans } from '../query-BFDT0kX_.js';
|
|
1
|
+
import { g as BudgetSpec, T as TraceStore, h as RunFilter, R as Run, a as ToolSpan } from '../store-Db2Bv8Cf.js';
|
|
2
|
+
export { a as FailureCluster, F as FailureClusterReport, f as failureClusterView } from '../failure-cluster-Cw65_5FY.js';
|
|
3
|
+
import { a as TrajectoryStep } from '../trajectory-CnoBo-JY.js';
|
|
4
|
+
import { B as BaselineOptions, a as BaselineReport } from '../baseline-4R5deP0N.js';
|
|
5
|
+
export { c as computeToolUseMetrics } from '../baseline-4R5deP0N.js';
|
|
6
|
+
import { l as llmSpans } from '../query-DODUYdPg.js';
|
|
7
7
|
|
|
8
8
|
/**
|
|
9
9
|
* BudgetBreachView — aggregates breach events across the corpus.
|
package/dist/pipelines/index.js
CHANGED
|
@@ -2,13 +2,13 @@ import {
|
|
|
2
2
|
compareToBaseline,
|
|
3
3
|
computeToolUseMetrics,
|
|
4
4
|
failureClusterView
|
|
5
|
-
} from "../chunk-K33INZHH.js";
|
|
5
|
+
} from "../chunk-GVQT44CS.js";
|
|
6
6
|
import {
|
|
7
7
|
buildTrajectory
|
|
8
8
|
} from "../chunk-RZTMDUO7.js";
|
|
9
9
|
import {
|
|
10
10
|
interRaterReliability
|
|
11
|
-
} from "../chunk-R5UQJNKC.js";
|
|
11
|
+
} from "../chunk-4L3WJXQJ.js";
|
|
12
12
|
import {
|
|
13
13
|
aggregateLlm,
|
|
14
14
|
argHash,
|
|
@@ -17,7 +17,7 @@ import {
|
|
|
17
17
|
toolSpans
|
|
18
18
|
} from "../chunk-47X6LRCE.js";
|
|
19
19
|
import "../chunk-5BKGXME7.js";
|
|
20
|
-
import "../chunk-NG236HPC.js";
|
|
20
|
+
import "../chunk-QYJT52YW.js";
|
|
21
21
|
import "../chunk-PZ5AY32C.js";
|
|
22
22
|
|
|
23
23
|
// src/pipelines/budget-breach.ts
|
package/dist/prm/index.d.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
import { P as PrmGradedTrace, S as StepRubric, a as PrmGrader } from '../rubric-DgSqjqqj.js';
|
|
2
|
-
export { G as GradedStep, b as StepContext, i as isPrmVerdict } from '../rubric-DgSqjqqj.js';
|
|
3
|
-
import { S as Span, T as TraceStore } from '../store-BP5be6s7.js';
|
|
4
|
-
import '../trajectory-BFmveYZt.js';
|
|
1
|
+
import { P as PrmGradedTrace, S as StepRubric, a as PrmGrader } from '../rubric-D5tjHNJQ.js';
|
|
2
|
+
export { G as GradedStep, b as StepContext, i as isPrmVerdict } from '../rubric-D5tjHNJQ.js';
|
|
3
|
+
import { S as Span, T as TraceStore } from '../store-Db2Bv8Cf.js';
|
|
4
|
+
import '../trajectory-CnoBo-JY.js';
|
|
5
5
|
|
|
6
6
|
/**
|
|
7
7
|
* Export PRM-graded traces as training data for downstream reward-model
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { L as LlmSpan, T as TraceStore, J as JudgeSpan, R as Run, F as FailureClass, a as ToolSpan } from './store-BP5be6s7.js';
|
|
1
|
+
import { L as LlmSpan, T as TraceStore, J as JudgeSpan, R as Run, F as FailureClass, a as ToolSpan } from './store-Db2Bv8Cf.js';
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
4
|
* Typed query helpers over TraceStore.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import { D as DatasetSplit, b as DatasetManifest, a as DatasetScenario } from './dataset-CiK_3LDr.js';
|
|
2
|
-
import { a3 as GateDecision, A as ActionableSideInfo, m as MultiShotTrialResult } from './summary-report-jrSGb2xZ.js';
|
|
3
|
-
import { R as RunRecord, a as RunSplitTag } from './run-record-CqzahIbx.js';
|
|
1
|
+
import { D as DatasetSplit, b as DatasetManifest, a as DatasetScenario } from './dataset-ueRVTUoY.js';
|
|
2
|
+
import { a3 as GateDecision, A as ActionableSideInfo, m as MultiShotTrialResult } from './summary-report-D7AQS7eB.js';
|
|
3
|
+
import { R as RunRecord, a as RunSplitTag } from './run-record-BfX5y68A.js';
|
|
4
4
|
|
|
5
5
|
/**
|
|
6
6
|
* Release confidence gate.
|
package/dist/reporting.d.ts
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-C0uDYwG6.js';
|
|
2
|
-
export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as bhAdjust, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as pairedWilcoxon, s as releaseTraceEvidenceFromMultiShotTrials, t as renderReleaseReport } from './release-report-
|
|
1
|
+
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-CMHypZ_M.js';
|
|
2
|
+
export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as bhAdjust, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as pairedWilcoxon, s as releaseTraceEvidenceFromMultiShotTrials, t as renderReleaseReport } from './release-report-DLWbBPtH.js';
|
|
3
3
|
export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
|
|
4
|
-
export { C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, J as ParetoFigureSpec, K as ParetoPoint, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, $ as gainHistogram, a0 as paretoChart, a1 as researchReport, a2 as summaryTable } from './summary-report-
|
|
5
|
-
import './run-record-CqzahIbx.js';
|
|
6
|
-
import './errors-BZ9sTdz7.js';
|
|
4
|
+
export { C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, J as ParetoFigureSpec, K as ParetoPoint, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, $ as gainHistogram, a0 as paretoChart, a1 as researchReport, a2 as summaryTable } from './summary-report-D7AQS7eB.js';
|
|
5
|
+
import './run-record-BfX5y68A.js';
|
|
6
|
+
import './errors-mje_cKOs.js';
|
|
7
7
|
import './outcome-store-D6KWmYvj.js';
|
|
8
|
-
import './dataset-CiK_3LDr.js';
|
|
9
|
-
import './failure-cluster-D1NZKqYu.js';
|
|
10
|
-
import './store-BP5be6s7.js';
|
|
8
|
+
import './dataset-ueRVTUoY.js';
|
|
9
|
+
import './failure-cluster-Cw65_5FY.js';
|
|
10
|
+
import './store-Db2Bv8Cf.js';
|
package/dist/reporting.js
CHANGED
|
@@ -5,7 +5,7 @@ import {
|
|
|
5
5
|
judgeReplayGate,
|
|
6
6
|
releaseTraceEvidenceFromMultiShotTrials,
|
|
7
7
|
renderReleaseReport
|
|
8
|
-
} from "./chunk-DBIGN5MJ.js";
|
|
8
|
+
} from "./chunk-WGXZAQLR.js";
|
|
9
9
|
import {
|
|
10
10
|
rubricPredictiveValidity
|
|
11
11
|
} from "./chunk-YRZ4M5GS.js";
|
|
@@ -22,10 +22,10 @@ import {
|
|
|
22
22
|
paretoChart,
|
|
23
23
|
researchReport,
|
|
24
24
|
summaryTable
|
|
25
|
-
} from "./chunk-5AKPEK5L.js";
|
|
26
|
-
import "./chunk-R5UQJNKC.js";
|
|
25
|
+
} from "./chunk-CXJOVDJR.js";
|
|
26
|
+
import "./chunk-4L3WJXQJ.js";
|
|
27
27
|
import "./chunk-VSMTAMNK.js";
|
|
28
|
-
import "./chunk-NG236HPC.js";
|
|
28
|
+
import "./chunk-QYJT52YW.js";
|
|
29
29
|
import "./chunk-PZ5AY32C.js";
|
|
30
30
|
export {
|
|
31
31
|
RESEARCH_REPORT_HARD_PAIR_FLOOR,
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
import { A as AgentEvalError, C as CaptureIntegrityError } from './errors-BZ9sTdz7.js';
|
|
2
|
-
import { R as RawProviderSink, P as ProviderRedactor, a as RunIntegrityExpectations, b as RunIntegrityReport } from './integrity-BAxLGJ9I.js';
|
|
3
|
-
import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, R as RunRecord } from './run-record-CqzahIbx.js';
|
|
4
|
-
import { W as ResearchReportOptions, N as ResearchReport, a3 as GateDecision } from './summary-report-jrSGb2xZ.js';
|
|
5
|
-
import { T as TraceEmitter, R as RunCompleteHook } from './emitter-BqjeOvJh.js';
|
|
6
|
-
import { T as TraceStore } from './store-BP5be6s7.js';
|
|
1
|
+
import { A as AgentEvalError, C as CaptureIntegrityError } from './errors-mje_cKOs.js';
|
|
2
|
+
import { R as RawProviderSink, P as ProviderRedactor, a as RunIntegrityExpectations, b as RunIntegrityReport } from './integrity-DYR5gWlb.js';
|
|
3
|
+
import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, R as RunRecord } from './run-record-BfX5y68A.js';
|
|
4
|
+
import { W as ResearchReportOptions, N as ResearchReport, a3 as GateDecision } from './summary-report-D7AQS7eB.js';
|
|
5
|
+
import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DP_cSSiw.js';
|
|
6
|
+
import { T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
7
7
|
|
|
8
8
|
/**
|
|
9
9
|
* LLM client with graceful degrade.
|
|
@@ -316,6 +316,12 @@ interface CampaignRunOutcome {
|
|
|
316
316
|
failureMode?: string;
|
|
317
317
|
/** Optional judge metadata when a judge was used. */
|
|
318
318
|
judgeMetadata?: RunJudgeMetadata;
|
|
319
|
+
/**
|
|
320
|
+
* Optional per-judge / per-dim breakdown for ensemble-judged runs.
|
|
321
|
+
* Propagated to `outcome.judgeScores` on the resulting `RunRecord`.
|
|
322
|
+
* Single-judge or scalar-only runs leave this unset.
|
|
323
|
+
*/
|
|
324
|
+
judgeScores?: JudgeScoresRecord;
|
|
319
325
|
}
|
|
320
326
|
type CampaignRunner<V> = (ctx: CampaignRunContext<V>) => Promise<CampaignRunOutcome>;
|
|
321
327
|
type CampaignIntegrityPolicy = 'throw' | 'mark_failed' | 'log';
|
package/dist/rl.d.ts
CHANGED
|
@@ -1,16 +1,16 @@
|
|
|
1
|
-
import { R as RunRecord, a as RunSplitTag } from './run-record-CqzahIbx.js';
|
|
1
|
+
import { R as RunRecord, a as RunSplitTag } from './run-record-BfX5y68A.js';
|
|
2
2
|
import { V as VerificationReport } from './multi-layer-verifier-BNi4-8lR.js';
|
|
3
|
-
import { t as TrialResult, V as VariantAggregate, q as PromptEvolutionResult, e as MultiShotOptimizationResult } from './summary-report-jrSGb2xZ.js';
|
|
3
|
+
import { t as TrialResult, V as VariantAggregate, q as PromptEvolutionResult, e as MultiShotOptimizationResult } from './summary-report-D7AQS7eB.js';
|
|
4
4
|
import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
|
|
5
|
-
import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-C0uDYwG6.js';
|
|
5
|
+
import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-CMHypZ_M.js';
|
|
6
6
|
import { I as InterimReleaseConfidence } from './sequential-5iSVfzl2.js';
|
|
7
|
-
import { S as Span, T as TraceStore } from './store-BP5be6s7.js';
|
|
8
|
-
import { R as Researcher, l as FailureMode, S as SteeringChange, j as ExperimentPlan, k as ExperimentResult, i as EvalCampaignResult, E as EvalCampaignOptions } from './researcher-ClDX3KZx.js';
|
|
9
|
-
export { r as runEvalCampaign } from './researcher-ClDX3KZx.js';
|
|
10
|
-
import './errors-BZ9sTdz7.js';
|
|
11
|
-
import './failure-cluster-D1NZKqYu.js';
|
|
12
|
-
import './integrity-BAxLGJ9I.js';
|
|
13
|
-
import './emitter-BqjeOvJh.js';
|
|
7
|
+
import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
8
|
+
import { R as Researcher, l as FailureMode, S as SteeringChange, j as ExperimentPlan, k as ExperimentResult, i as EvalCampaignResult, E as EvalCampaignOptions } from './researcher-BRHa5Jxo.js';
|
|
9
|
+
export { r as runEvalCampaign } from './researcher-BRHa5Jxo.js';
|
|
10
|
+
import './errors-mje_cKOs.js';
|
|
11
|
+
import './failure-cluster-Cw65_5FY.js';
|
|
12
|
+
import './integrity-DYR5gWlb.js';
|
|
13
|
+
import './emitter-DP_cSSiw.js';
|
|
14
14
|
|
|
15
15
|
/**
|
|
16
16
|
* Test-time compute scaling curves.
|
package/dist/rl.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import {
|
|
2
2
|
runEvalCampaign
|
|
3
|
-
} from "./chunk-RUI6SIHY.js";
|
|
4
|
-
import "./chunk-4S4BM3QQ.js";
|
|
3
|
+
} from "./chunk-DTEJNZYK.js";
|
|
4
|
+
import "./chunk-M6RZ5LJN.js";
|
|
5
5
|
import {
|
|
6
6
|
rubricPredictiveValidity
|
|
7
7
|
} from "./chunk-YRZ4M5GS.js";
|
|
@@ -10,17 +10,17 @@ import {
|
|
|
10
10
|
} from "./chunk-MAZ26DC7.js";
|
|
11
11
|
import {
|
|
12
12
|
benjaminiHochberg
|
|
13
|
-
} from "./chunk-5AKPEK5L.js";
|
|
13
|
+
} from "./chunk-CXJOVDJR.js";
|
|
14
14
|
import {
|
|
15
15
|
wilcoxonSignedRank
|
|
16
|
-
} from "./chunk-R5UQJNKC.js";
|
|
17
|
-
import "./chunk-KTGTIOFD.js";
|
|
16
|
+
} from "./chunk-4L3WJXQJ.js";
|
|
17
|
+
import "./chunk-UBPIXOC4.js";
|
|
18
18
|
import "./chunk-PC4UYEBM.js";
|
|
19
19
|
import "./chunk-TVVP3ZZQ.js";
|
|
20
20
|
import "./chunk-VSMTAMNK.js";
|
|
21
21
|
import {
|
|
22
22
|
ValidationError
|
|
23
|
-
} from "./chunk-NG236HPC.js";
|
|
23
|
+
} from "./chunk-QYJT52YW.js";
|
|
24
24
|
import "./chunk-PZ5AY32C.js";
|
|
25
25
|
|
|
26
26
|
// src/rl/compute-curves.ts
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { S as Span, T as TraceStore, J as JudgeSpan } from './store-BP5be6s7.js';
|
|
2
|
-
import { T as Trajectory, a as TrajectoryStep } from './trajectory-BFmveYZt.js';
|
|
1
|
+
import { S as Span, T as TraceStore, J as JudgeSpan } from './store-Db2Bv8Cf.js';
|
|
2
|
+
import { T as Trajectory, a as TrajectoryStep } from './trajectory-CnoBo-JY.js';
|
|
3
3
|
|
|
4
4
|
/**
|
|
5
5
|
* Process Reward Modeling — per-step rubric grading.
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { V as ValidationError } from './errors-BZ9sTdz7.js';
|
|
1
|
+
import { V as ValidationError } from './errors-mje_cKOs.js';
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
4
|
* Paper-grade RunRecord schema + runtime validator.
|
|
@@ -42,6 +42,41 @@ interface RunJudgeMetadata {
|
|
|
42
42
|
* prior-call cache, etc.). The canary uses this to alert. */
|
|
43
43
|
fallback: boolean;
|
|
44
44
|
}
|
|
45
|
+
/**
|
|
46
|
+
* Per-judge / per-dimension breakdown for runs scored by an ensemble of
|
|
47
|
+
* judges over a multi-dimensional rubric.
|
|
48
|
+
*
|
|
49
|
+
* The collapsed `outcome.searchScore` / `holdoutScore` carries the
|
|
50
|
+
* composite the gate uses. The full breakdown belongs here so consumers
|
|
51
|
+
* can answer "which judge disagreed?", "which dimension dragged the
|
|
52
|
+
* composite down?", and "did half the panel fail?" without re-running.
|
|
53
|
+
*
|
|
54
|
+
* `perJudge[judgeId][dim]` is the canonical source; `perDimMean` and
|
|
55
|
+
* `composite` are convenience projections — derivable but precomputed so
|
|
56
|
+
* downstream IRR primitives (`interRaterReliability`,
|
|
57
|
+
* `corpusInterRaterAgreement`) and reporters don't pay the same
|
|
58
|
+
* aggregation twice.
|
|
59
|
+
*
|
|
60
|
+
* Fail-loud discipline: judges that errored out land in `failedJudges`
|
|
61
|
+
* by id. A missing key in `perJudge` is ambiguous (silent zero vs not
|
|
62
|
+
* run); the explicit list makes a partial-failure recorded as such.
|
|
63
|
+
*/
|
|
64
|
+
interface JudgeScoresRecord {
|
|
65
|
+
/** Per-judge per-dimension scores. `{ "kimi-k2.6": { helpfulness: 0.8, clarity: 0.7 }, ... }`. */
|
|
66
|
+
perJudge: Record<string, Record<string, number>>;
|
|
67
|
+
/** Per-dim mean across judges. Convenience — derivable from `perJudge`. */
|
|
68
|
+
perDimMean: Record<string, number>;
|
|
69
|
+
/** Composite mean across all dims and judges. Mirrors the score
|
|
70
|
+
* the gate sees on `outcome.searchScore` / `holdoutScore`. */
|
|
71
|
+
composite: number;
|
|
72
|
+
/** Judges that errored or returned an unparseable verdict. Recorded
|
|
73
|
+
* by id (e.g. `['glm-5.1']`) so a partial-failure case is explicit,
|
|
74
|
+
* not inferred from missing keys in `perJudge`. */
|
|
75
|
+
failedJudges?: string[];
|
|
76
|
+
/** Free-form notes the judges emitted (joined across judges or
|
|
77
|
+
* first-judge only — consumer's choice). */
|
|
78
|
+
notes?: string;
|
|
79
|
+
}
|
|
45
80
|
interface RunOutcome {
|
|
46
81
|
/** Score on the search/optimization split. Optional because a
|
|
47
82
|
* holdout-only evaluation only fills `holdoutScore`. */
|
|
@@ -53,6 +88,12 @@ interface RunOutcome {
|
|
|
53
88
|
* pass/fail counters, latency stats, etc. Numeric only — keeps
|
|
54
89
|
* reporters honest. */
|
|
55
90
|
raw: Record<string, number>;
|
|
91
|
+
/** Per-judge / per-dim breakdown. Consumers writing ensemble
|
|
92
|
+
* judgements populate this; substrate primitives like
|
|
93
|
+
* `interRaterReliability` and `corpusInterRaterAgreement` accept
|
|
94
|
+
* these records as input. Optional — single-judge or scalar-only
|
|
95
|
+
* runs leave it unset. */
|
|
96
|
+
judgeScores?: JudgeScoresRecord;
|
|
56
97
|
}
|
|
57
98
|
/**
|
|
58
99
|
* Mandatory paper-grade fields for a single evaluation run. Optional
|
|
@@ -143,4 +184,4 @@ declare function parseRunRecordSafe(input: unknown): {
|
|
|
143
184
|
/** Round-trip helper — `JSON.parse(JSON.stringify(record))` then validate. */
|
|
144
185
|
declare function roundTripRunRecord(record: RunRecord): RunRecord;
|
|
145
186
|
|
|
146
|
-
export { type RunRecord as R, type RunSplitTag as a, type RunTokenUsage as b, type RunJudgeMetadata as c, type RunOutcome as d, RunRecordValidationError as e, isRunRecord as i, parseRunRecordSafe as p, roundTripRunRecord as r, validateRunRecord as v };
|
|
187
|
+
export { type JudgeScoresRecord as J, type RunRecord as R, type RunSplitTag as a, type RunTokenUsage as b, type RunJudgeMetadata as c, type RunOutcome as d, RunRecordValidationError as e, isRunRecord as i, parseRunRecordSafe as p, roundTripRunRecord as r, validateRunRecord as v };
|
|
@@ -294,4 +294,4 @@ declare class FileSystemTraceStore implements TraceStore {
|
|
|
294
294
|
artifacts(runId: string): Promise<Artifact[]>;
|
|
295
295
|
}
|
|
296
296
|
|
|
297
|
-
export { type Artifact as A, type BudgetLedgerEntry as B, type EventKind as E, type FailureClass as F, type GenericSpan as G, InMemoryTraceStore as I, type JudgeSpan as J, type LlmSpan as L, type Message as M, type Run as R, type Span as S, type TraceStore as T, type ToolSpan as a, type TraceEvent as b, type RunOutcome as c, type SpanKind as d, type RetrievalSpan as e, type SandboxSpan as f, type BudgetSpec as g, type
|
|
297
|
+
export { type Artifact as A, type BudgetLedgerEntry as B, type EventKind as E, type FailureClass as F, type GenericSpan as G, InMemoryTraceStore as I, type JudgeSpan as J, type LlmSpan as L, type Message as M, type Run as R, type Span as S, type TraceStore as T, type ToolSpan as a, type TraceEvent as b, type RunOutcome as c, type SpanKind as d, type RetrievalSpan as e, type SandboxSpan as f, type BudgetSpec as g, type RunFilter as h, type EventFilter as i, FAILURE_CLASSES as j, FileSystemTraceStore as k, type FileSystemTraceStoreOptions as l, type RunLayer as m, type RunStatus as n, type SpanBase as o, type SpanFilter as p, type SpanStatus as q, TRACE_SCHEMA_VERSION as r, isJudgeSpan as s, isLlmSpan as t, isRetrievalSpan as u, isSandboxSpan as v, isToolSpan as w };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { R as RunRecord, a as RunSplitTag } from './run-record-CqzahIbx.js';
|
|
2
|
-
import { F as FailureClusterReport } from './failure-cluster-D1NZKqYu.js';
|
|
1
|
+
import { R as RunRecord, a as RunSplitTag } from './run-record-BfX5y68A.js';
|
|
2
|
+
import { F as FailureClusterReport } from './failure-cluster-Cw65_5FY.js';
|
|
3
3
|
|
|
4
4
|
/**
|
|
5
5
|
* HeldOutGate — first-class held-out paired-delta promotion gate.
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { T as TraceEmitter } from './emitter-BqjeOvJh.js';
|
|
2
|
-
import { R as Run, F as FailureClass, T as TraceStore } from './store-BP5be6s7.js';
|
|
1
|
+
import { T as TraceEmitter } from './emitter-DP_cSSiw.js';
|
|
2
|
+
import { R as Run, F as FailureClass, T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
3
3
|
|
|
4
4
|
/**
|
|
5
5
|
* SandboxHarness — executes a scenario in an isolated environment and
|