@tangle-network/agent-eval 0.30.0 → 0.31.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{baseline-BwdCXUS8.d.ts → baseline-4R5deP0N.d.ts} +1 -1
- package/dist/benchmarks/index.d.ts +3 -3
- package/dist/builder-eval/index.d.ts +3 -3
- package/dist/builder-eval/index.js +2 -2
- package/dist/{chunk-R5UQJNKC.js → chunk-4L3WJXQJ.js} +2 -2
- package/dist/{chunk-RUI6SIHY.js → chunk-75ZREHD7.js} +4 -4
- package/dist/{chunk-5AKPEK5L.js → chunk-CXJOVDJR.js} +2 -2
- package/dist/{chunk-K33INZHH.js → chunk-GVQT44CS.js} +2 -2
- package/dist/{chunk-UW4NOOZI.js → chunk-HIO4UIS5.js} +308 -2
- package/dist/chunk-HIO4UIS5.js.map +1 -0
- package/dist/{chunk-4S4BM3QQ.js → chunk-M6RZ5LJN.js} +2 -2
- package/dist/{chunk-NG236HPC.js → chunk-QYJT52YW.js} +1 -1
- package/dist/chunk-QYJT52YW.js.map +1 -0
- package/dist/{chunk-XFZCM5Z3.js → chunk-SMSGXM74.js} +2 -2
- package/dist/{chunk-KTGTIOFD.js → chunk-UBPIXOC4.js} +2 -2
- package/dist/{chunk-DBIGN5MJ.js → chunk-WGXZAQLR.js} +3 -3
- package/dist/{chunk-NLMNWKVM.js → chunk-WSI4K3WB.js} +2 -2
- package/dist/{chunk-PALJO75S.js → chunk-XEL6UP7C.js} +2 -2
- package/dist/{chunk-SZSBQUIJ.js → chunk-Y2CPBYKH.js} +3 -3
- package/dist/{chunk-QHF6EQKK.js → chunk-YTMXBHFM.js} +2 -2
- package/dist/cli.js +3 -3
- package/dist/{control-rJhEDdpy.d.ts → control-BFpqHFV2.d.ts} +5 -5
- package/dist/{control-runtime-BRdQ0wrx.d.ts → control-runtime-BZ_lVLYW.d.ts} +2 -2
- package/dist/control.d.ts +8 -8
- package/dist/control.js +3 -3
- package/dist/{dataset-CiK_3LDr.d.ts → dataset-ueRVTUoY.d.ts} +1 -1
- package/dist/{emitter-BqjeOvJh.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
- package/dist/{errors-BZ9sTdz7.d.ts → errors-mje_cKOs.d.ts} +1 -1
- package/dist/{failure-cluster-D1NZKqYu.d.ts → failure-cluster-Cw65_5FY.d.ts} +1 -1
- package/dist/{feedback-trajectory-j0nJFgC6.d.ts → feedback-trajectory-iATEAHmc.d.ts} +2 -2
- package/dist/governance/index.d.ts +4 -4
- package/dist/{index-Cgt3DKXr.d.ts → index-DPILdKbP.d.ts} +2 -2
- package/dist/{index--fVrWDiR.d.ts → index-TVjRYWRm.d.ts} +1 -1
- package/dist/index.d.ts +108 -38
- package/dist/index.js +159 -14
- package/dist/index.js.map +1 -1
- package/dist/{integrity-BAxLGJ9I.d.ts → integrity-DYR5gWlb.d.ts} +2 -2
- package/dist/knowledge/index.d.ts +3 -3
- package/dist/meta-eval/index.d.ts +4 -4
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +11 -11
- package/dist/optimization.js +8 -8
- package/dist/pipelines/index.d.ts +6 -6
- package/dist/pipelines/index.js +3 -3
- package/dist/prm/index.d.ts +4 -4
- package/dist/{query-BFDT0kX_.d.ts → query-DODUYdPg.d.ts} +1 -1
- package/dist/{release-report-PWhGlpfO.d.ts → release-report-C8r4Vben.d.ts} +3 -3
- package/dist/reporting.d.ts +8 -8
- package/dist/reporting.js +4 -4
- package/dist/{researcher-ClDX3KZx.d.ts → researcher-BmgJ_901.d.ts} +6 -6
- package/dist/rl.d.ts +10 -10
- package/dist/rl.js +6 -6
- package/dist/{rubric-DgSqjqqj.d.ts → rubric-D5tjHNJQ.d.ts} +2 -2
- package/dist/{rubric-predictive-validity-C0uDYwG6.d.ts → rubric-predictive-validity-Bm-CbN46.d.ts} +1 -1
- package/dist/{run-record-CqzahIbx.d.ts → run-record-nYf9x2hU.d.ts} +1 -1
- package/dist/{store-BP5be6s7.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
- package/dist/{summary-report-jrSGb2xZ.d.ts → summary-report-dir7A-eQ.d.ts} +2 -2
- package/dist/{test-graded-scenario-BJ54PDan.d.ts → test-graded-scenario-B2kWEdh9.d.ts} +2 -2
- package/dist/traces.d.ts +533 -10
- package/dist/traces.js +14 -300
- package/dist/traces.js.map +1 -1
- package/dist/{trajectory-BFmveYZt.d.ts → trajectory-CnoBo-JY.d.ts} +1 -1
- package/dist/wire/index.d.ts +6 -6
- package/dist/wire/index.js +3 -3
- package/package.json +12 -21
- package/dist/chunk-NG236HPC.js.map +0 -1
- package/dist/chunk-UW4NOOZI.js.map +0 -1
- package/dist/replay-BX5Fm8en.d.ts +0 -529
- /package/dist/{chunk-R5UQJNKC.js.map → chunk-4L3WJXQJ.js.map} +0 -0
- /package/dist/{chunk-RUI6SIHY.js.map → chunk-75ZREHD7.js.map} +0 -0
- /package/dist/{chunk-5AKPEK5L.js.map → chunk-CXJOVDJR.js.map} +0 -0
- /package/dist/{chunk-K33INZHH.js.map → chunk-GVQT44CS.js.map} +0 -0
- /package/dist/{chunk-4S4BM3QQ.js.map → chunk-M6RZ5LJN.js.map} +0 -0
- /package/dist/{chunk-XFZCM5Z3.js.map → chunk-SMSGXM74.js.map} +0 -0
- /package/dist/{chunk-KTGTIOFD.js.map → chunk-UBPIXOC4.js.map} +0 -0
- /package/dist/{chunk-DBIGN5MJ.js.map → chunk-WGXZAQLR.js.map} +0 -0
- /package/dist/{chunk-NLMNWKVM.js.map → chunk-WSI4K3WB.js.map} +0 -0
- /package/dist/{chunk-PALJO75S.js.map → chunk-XEL6UP7C.js.map} +0 -0
- /package/dist/{chunk-SZSBQUIJ.js.map → chunk-Y2CPBYKH.js.map} +0 -0
- /package/dist/{chunk-QHF6EQKK.js.map → chunk-YTMXBHFM.js.map} +0 -0
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { C as CaptureIntegrityError } from './errors-BZ9sTdz7.js';
|
|
2
|
-
import { T as TraceStore } from './store-BP5be6s7.js';
|
|
1
|
+
import { C as CaptureIntegrityError } from './errors-mje_cKOs.js';
|
|
2
|
+
import { T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
3
3
|
|
|
4
4
|
/**
|
|
5
5
|
* RawProviderSink — first-class persistence for the actual HTTP-level
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import { j as ControlSeverity, C as ControlEvalResult } from '../control-runtime-BRdQ0wrx.js';
|
|
2
|
-
import { T as TraceEmitter } from '../emitter-BqjeOvJh.js';
|
|
3
|
-
import '../store-BP5be6s7.js';
|
|
1
|
+
import { j as ControlSeverity, C as ControlEvalResult } from '../control-runtime-BZ_lVLYW.js';
|
|
2
|
+
import { T as TraceEmitter } from '../emitter-DP_cSSiw.js';
|
|
3
|
+
import '../store-Db2Bv8Cf.js';
|
|
4
4
|
|
|
5
5
|
type KnowledgeRequirementCategory = 'user_specific' | 'company_specific' | 'domain_specific' | 'codebase_specific' | 'market_specific' | 'regulatory' | 'tool_api' | 'credential_or_secret' | 'runtime_environment' | 'preference' | 'historical_context';
|
|
6
6
|
type KnowledgeAcquisitionMode = 'ask_user' | 'search_web' | 'query_connector' | 'inspect_repo' | 'run_command' | 'infer_low_confidence' | 'not_available';
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
import { R as Run, T as TraceStore } from '../store-BP5be6s7.js';
|
|
1
|
+
import { R as Run, T as TraceStore } from '../store-Db2Bv8Cf.js';
|
|
2
2
|
import { a as OutcomeFilter, O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
|
|
3
3
|
export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from '../outcome-store-D6KWmYvj.js';
|
|
4
|
-
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-C0uDYwG6.js';
|
|
5
|
-
import '../run-record-CqzahIbx.js';
|
|
6
|
-
import '../errors-BZ9sTdz7.js';
|
|
4
|
+
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-Bm-CbN46.js';
|
|
5
|
+
import '../run-record-nYf9x2hU.js';
|
|
6
|
+
import '../errors-mje_cKOs.js';
|
|
7
7
|
|
|
8
8
|
/**
|
|
9
9
|
* Correlation study — "does our eval score predict real-world outcomes?"
|
package/dist/openapi.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"openapi": "3.1.0",
|
|
3
3
|
"info": {
|
|
4
4
|
"title": "@tangle-network/agent-eval — wire protocol",
|
|
5
|
-
"version": "0.30.0",
|
|
5
|
+
"version": "0.31.0",
|
|
6
6
|
"description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
|
|
7
7
|
"contact": {
|
|
8
8
|
"name": "Tangle Network",
|
package/dist/optimization.d.ts
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-
|
|
2
|
-
export { F as FeedbackArtifactType, a as FeedbackAttempt, b as FeedbackLabel, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, l as FeedbackTrajectory, m as FeedbackTrajectoryFilter, n as FeedbackTrajectoryStore, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-
|
|
3
|
-
export { A as ActionableSideInfo, a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, G as GenerationReport, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, o as MutateAdapter, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, R as ReflectionContext, r as ReflectionProposal, S as ScenarioAggregate, s as ScoreAdapter, T as TrialCache, t as TrialResult, u as TrialTrace, V as VariantAggregate, v as buildReflectionPrompt, w as defaultMultiShotObjectives, x as parseReflectionResponse, y as runMultiShotOptimization, z as runPromptEvolution, B as trialTraceFromMultiShotTrial } from './summary-report-
|
|
4
|
-
import './errors-BZ9sTdz7.js';
|
|
5
|
-
import './integrity-BAxLGJ9I.js';
|
|
6
|
-
import './store-BP5be6s7.js';
|
|
7
|
-
import './run-record-CqzahIbx.js';
|
|
8
|
-
import './emitter-BqjeOvJh.js';
|
|
9
|
-
import './control-runtime-BRdQ0wrx.js';
|
|
10
|
-
import './dataset-CiK_3LDr.js';
|
|
11
|
-
import './failure-cluster-D1NZKqYu.js';
|
|
1
|
+
export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-BmgJ_901.js';
|
|
2
|
+
export { F as FeedbackArtifactType, a as FeedbackAttempt, b as FeedbackLabel, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, l as FeedbackTrajectory, m as FeedbackTrajectoryFilter, n as FeedbackTrajectoryStore, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-iATEAHmc.js';
|
|
3
|
+
export { A as ActionableSideInfo, a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, G as GenerationReport, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, o as MutateAdapter, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, R as ReflectionContext, r as ReflectionProposal, S as ScenarioAggregate, s as ScoreAdapter, T as TrialCache, t as TrialResult, u as TrialTrace, V as VariantAggregate, v as buildReflectionPrompt, w as defaultMultiShotObjectives, x as parseReflectionResponse, y as runMultiShotOptimization, z as runPromptEvolution, B as trialTraceFromMultiShotTrial } from './summary-report-dir7A-eQ.js';
|
|
4
|
+
import './errors-mje_cKOs.js';
|
|
5
|
+
import './integrity-DYR5gWlb.js';
|
|
6
|
+
import './store-Db2Bv8Cf.js';
|
|
7
|
+
import './run-record-nYf9x2hU.js';
|
|
8
|
+
import './emitter-DP_cSSiw.js';
|
|
9
|
+
import './control-runtime-BZ_lVLYW.js';
|
|
10
|
+
import './dataset-ueRVTUoY.js';
|
|
11
|
+
import './failure-cluster-Cw65_5FY.js';
|
package/dist/optimization.js
CHANGED
|
@@ -25,19 +25,19 @@ import {
|
|
|
25
25
|
summarizePreferenceMemory,
|
|
26
26
|
trialTraceFromMultiShotTrial,
|
|
27
27
|
withAssignedFeedbackSplit
|
|
28
|
-
} from "./chunk-SZSBQUIJ.js";
|
|
29
|
-
import "./chunk-NLMNWKVM.js";
|
|
28
|
+
} from "./chunk-Y2CPBYKH.js";
|
|
29
|
+
import "./chunk-WSI4K3WB.js";
|
|
30
30
|
import {
|
|
31
31
|
runEvalCampaign
|
|
32
|
-
} from "./chunk-RUI6SIHY.js";
|
|
33
|
-
import "./chunk-4S4BM3QQ.js";
|
|
34
|
-
import "./chunk-5AKPEK5L.js";
|
|
35
|
-
import "./chunk-R5UQJNKC.js";
|
|
36
|
-
import "./chunk-KTGTIOFD.js";
|
|
32
|
+
} from "./chunk-75ZREHD7.js";
|
|
33
|
+
import "./chunk-M6RZ5LJN.js";
|
|
34
|
+
import "./chunk-CXJOVDJR.js";
|
|
35
|
+
import "./chunk-4L3WJXQJ.js";
|
|
36
|
+
import "./chunk-UBPIXOC4.js";
|
|
37
37
|
import "./chunk-PC4UYEBM.js";
|
|
38
38
|
import "./chunk-TVVP3ZZQ.js";
|
|
39
39
|
import "./chunk-VSMTAMNK.js";
|
|
40
|
-
import "./chunk-NG236HPC.js";
|
|
40
|
+
import "./chunk-QYJT52YW.js";
|
|
41
41
|
import "./chunk-PZ5AY32C.js";
|
|
42
42
|
export {
|
|
43
43
|
CallbackResearcher,
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
import { g as BudgetSpec, T as TraceStore,
|
|
2
|
-
export { a as FailureCluster, F as FailureClusterReport, f as failureClusterView } from '../failure-cluster-D1NZKqYu.js';
|
|
3
|
-
import { a as TrajectoryStep } from '../trajectory-BFmveYZt.js';
|
|
4
|
-
import { B as BaselineOptions, a as BaselineReport } from '../baseline-BwdCXUS8.js';
|
|
5
|
-
export { c as computeToolUseMetrics } from '../baseline-BwdCXUS8.js';
|
|
6
|
-
import { l as llmSpans } from '../query-BFDT0kX_.js';
|
|
1
|
+
import { g as BudgetSpec, T as TraceStore, h as RunFilter, R as Run, a as ToolSpan } from '../store-Db2Bv8Cf.js';
|
|
2
|
+
export { a as FailureCluster, F as FailureClusterReport, f as failureClusterView } from '../failure-cluster-Cw65_5FY.js';
|
|
3
|
+
import { a as TrajectoryStep } from '../trajectory-CnoBo-JY.js';
|
|
4
|
+
import { B as BaselineOptions, a as BaselineReport } from '../baseline-4R5deP0N.js';
|
|
5
|
+
export { c as computeToolUseMetrics } from '../baseline-4R5deP0N.js';
|
|
6
|
+
import { l as llmSpans } from '../query-DODUYdPg.js';
|
|
7
7
|
|
|
8
8
|
/**
|
|
9
9
|
* BudgetBreachView — aggregates breach events across the corpus.
|
package/dist/pipelines/index.js
CHANGED
|
@@ -2,13 +2,13 @@ import {
|
|
|
2
2
|
compareToBaseline,
|
|
3
3
|
computeToolUseMetrics,
|
|
4
4
|
failureClusterView
|
|
5
|
-
} from "../chunk-
|
|
5
|
+
} from "../chunk-GVQT44CS.js";
|
|
6
6
|
import {
|
|
7
7
|
buildTrajectory
|
|
8
8
|
} from "../chunk-RZTMDUO7.js";
|
|
9
9
|
import {
|
|
10
10
|
interRaterReliability
|
|
11
|
-
} from "../chunk-
|
|
11
|
+
} from "../chunk-4L3WJXQJ.js";
|
|
12
12
|
import {
|
|
13
13
|
aggregateLlm,
|
|
14
14
|
argHash,
|
|
@@ -17,7 +17,7 @@ import {
|
|
|
17
17
|
toolSpans
|
|
18
18
|
} from "../chunk-47X6LRCE.js";
|
|
19
19
|
import "../chunk-5BKGXME7.js";
|
|
20
|
-
import "../chunk-
|
|
20
|
+
import "../chunk-QYJT52YW.js";
|
|
21
21
|
import "../chunk-PZ5AY32C.js";
|
|
22
22
|
|
|
23
23
|
// src/pipelines/budget-breach.ts
|
package/dist/prm/index.d.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
import { P as PrmGradedTrace, S as StepRubric, a as PrmGrader } from '../rubric-
|
|
2
|
-
export { G as GradedStep, b as StepContext, i as isPrmVerdict } from '../rubric-
|
|
3
|
-
import { S as Span, T as TraceStore } from '../store-
|
|
4
|
-
import '../trajectory-
|
|
1
|
+
import { P as PrmGradedTrace, S as StepRubric, a as PrmGrader } from '../rubric-D5tjHNJQ.js';
|
|
2
|
+
export { G as GradedStep, b as StepContext, i as isPrmVerdict } from '../rubric-D5tjHNJQ.js';
|
|
3
|
+
import { S as Span, T as TraceStore } from '../store-Db2Bv8Cf.js';
|
|
4
|
+
import '../trajectory-CnoBo-JY.js';
|
|
5
5
|
|
|
6
6
|
/**
|
|
7
7
|
* Export PRM-graded traces as training data for downstream reward-model
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { L as LlmSpan, T as TraceStore, J as JudgeSpan, R as Run, F as FailureClass, a as ToolSpan } from './store-
|
|
1
|
+
import { L as LlmSpan, T as TraceStore, J as JudgeSpan, R as Run, F as FailureClass, a as ToolSpan } from './store-Db2Bv8Cf.js';
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
4
|
* Typed query helpers over TraceStore.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import { D as DatasetSplit, b as DatasetManifest, a as DatasetScenario } from './dataset-
|
|
2
|
-
import { a3 as GateDecision, A as ActionableSideInfo, m as MultiShotTrialResult } from './summary-report-
|
|
3
|
-
import { R as RunRecord, a as RunSplitTag } from './run-record-
|
|
1
|
+
import { D as DatasetSplit, b as DatasetManifest, a as DatasetScenario } from './dataset-ueRVTUoY.js';
|
|
2
|
+
import { a3 as GateDecision, A as ActionableSideInfo, m as MultiShotTrialResult } from './summary-report-dir7A-eQ.js';
|
|
3
|
+
import { R as RunRecord, a as RunSplitTag } from './run-record-nYf9x2hU.js';
|
|
4
4
|
|
|
5
5
|
/**
|
|
6
6
|
* Release confidence gate.
|
package/dist/reporting.d.ts
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-
|
|
2
|
-
export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as bhAdjust, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as pairedWilcoxon, s as releaseTraceEvidenceFromMultiShotTrials, t as renderReleaseReport } from './release-report-
|
|
1
|
+
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-Bm-CbN46.js';
|
|
2
|
+
export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as bhAdjust, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as pairedWilcoxon, s as releaseTraceEvidenceFromMultiShotTrials, t as renderReleaseReport } from './release-report-C8r4Vben.js';
|
|
3
3
|
export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
|
|
4
|
-
export { C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, J as ParetoFigureSpec, K as ParetoPoint, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, $ as gainHistogram, a0 as paretoChart, a1 as researchReport, a2 as summaryTable } from './summary-report-
|
|
5
|
-
import './run-record-
|
|
6
|
-
import './errors-
|
|
4
|
+
export { C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, J as ParetoFigureSpec, K as ParetoPoint, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, $ as gainHistogram, a0 as paretoChart, a1 as researchReport, a2 as summaryTable } from './summary-report-dir7A-eQ.js';
|
|
5
|
+
import './run-record-nYf9x2hU.js';
|
|
6
|
+
import './errors-mje_cKOs.js';
|
|
7
7
|
import './outcome-store-D6KWmYvj.js';
|
|
8
|
-
import './dataset-
|
|
9
|
-
import './failure-cluster-
|
|
10
|
-
import './store-
|
|
8
|
+
import './dataset-ueRVTUoY.js';
|
|
9
|
+
import './failure-cluster-Cw65_5FY.js';
|
|
10
|
+
import './store-Db2Bv8Cf.js';
|
package/dist/reporting.js
CHANGED
|
@@ -5,7 +5,7 @@ import {
|
|
|
5
5
|
judgeReplayGate,
|
|
6
6
|
releaseTraceEvidenceFromMultiShotTrials,
|
|
7
7
|
renderReleaseReport
|
|
8
|
-
} from "./chunk-
|
|
8
|
+
} from "./chunk-WGXZAQLR.js";
|
|
9
9
|
import {
|
|
10
10
|
rubricPredictiveValidity
|
|
11
11
|
} from "./chunk-YRZ4M5GS.js";
|
|
@@ -22,10 +22,10 @@ import {
|
|
|
22
22
|
paretoChart,
|
|
23
23
|
researchReport,
|
|
24
24
|
summaryTable
|
|
25
|
-
} from "./chunk-
|
|
26
|
-
import "./chunk-
|
|
25
|
+
} from "./chunk-CXJOVDJR.js";
|
|
26
|
+
import "./chunk-4L3WJXQJ.js";
|
|
27
27
|
import "./chunk-VSMTAMNK.js";
|
|
28
|
-
import "./chunk-
|
|
28
|
+
import "./chunk-QYJT52YW.js";
|
|
29
29
|
import "./chunk-PZ5AY32C.js";
|
|
30
30
|
export {
|
|
31
31
|
RESEARCH_REPORT_HARD_PAIR_FLOOR,
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
import { A as AgentEvalError, C as CaptureIntegrityError } from './errors-
|
|
2
|
-
import { R as RawProviderSink, P as ProviderRedactor, a as RunIntegrityExpectations, b as RunIntegrityReport } from './integrity-
|
|
3
|
-
import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, R as RunRecord } from './run-record-
|
|
4
|
-
import { W as ResearchReportOptions, N as ResearchReport, a3 as GateDecision } from './summary-report-
|
|
5
|
-
import { T as TraceEmitter, R as RunCompleteHook } from './emitter-
|
|
6
|
-
import { T as TraceStore } from './store-
|
|
1
|
+
import { A as AgentEvalError, C as CaptureIntegrityError } from './errors-mje_cKOs.js';
|
|
2
|
+
import { R as RawProviderSink, P as ProviderRedactor, a as RunIntegrityExpectations, b as RunIntegrityReport } from './integrity-DYR5gWlb.js';
|
|
3
|
+
import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, R as RunRecord } from './run-record-nYf9x2hU.js';
|
|
4
|
+
import { W as ResearchReportOptions, N as ResearchReport, a3 as GateDecision } from './summary-report-dir7A-eQ.js';
|
|
5
|
+
import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DP_cSSiw.js';
|
|
6
|
+
import { T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
7
7
|
|
|
8
8
|
/**
|
|
9
9
|
* LLM client with graceful degrade.
|
package/dist/rl.d.ts
CHANGED
|
@@ -1,16 +1,16 @@
|
|
|
1
|
-
import { R as RunRecord, a as RunSplitTag } from './run-record-
|
|
1
|
+
import { R as RunRecord, a as RunSplitTag } from './run-record-nYf9x2hU.js';
|
|
2
2
|
import { V as VerificationReport } from './multi-layer-verifier-BNi4-8lR.js';
|
|
3
|
-
import { t as TrialResult, V as VariantAggregate, q as PromptEvolutionResult, e as MultiShotOptimizationResult } from './summary-report-
|
|
3
|
+
import { t as TrialResult, V as VariantAggregate, q as PromptEvolutionResult, e as MultiShotOptimizationResult } from './summary-report-dir7A-eQ.js';
|
|
4
4
|
import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
|
|
5
|
-
import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-
|
|
5
|
+
import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-Bm-CbN46.js';
|
|
6
6
|
import { I as InterimReleaseConfidence } from './sequential-5iSVfzl2.js';
|
|
7
|
-
import { S as Span, T as TraceStore } from './store-
|
|
8
|
-
import { R as Researcher, l as FailureMode, S as SteeringChange, j as ExperimentPlan, k as ExperimentResult, i as EvalCampaignResult, E as EvalCampaignOptions } from './researcher-
|
|
9
|
-
export { r as runEvalCampaign } from './researcher-
|
|
10
|
-
import './errors-
|
|
11
|
-
import './failure-cluster-
|
|
12
|
-
import './integrity-
|
|
13
|
-
import './emitter-
|
|
7
|
+
import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
8
|
+
import { R as Researcher, l as FailureMode, S as SteeringChange, j as ExperimentPlan, k as ExperimentResult, i as EvalCampaignResult, E as EvalCampaignOptions } from './researcher-BmgJ_901.js';
|
|
9
|
+
export { r as runEvalCampaign } from './researcher-BmgJ_901.js';
|
|
10
|
+
import './errors-mje_cKOs.js';
|
|
11
|
+
import './failure-cluster-Cw65_5FY.js';
|
|
12
|
+
import './integrity-DYR5gWlb.js';
|
|
13
|
+
import './emitter-DP_cSSiw.js';
|
|
14
14
|
|
|
15
15
|
/**
|
|
16
16
|
* Test-time compute scaling curves.
|
package/dist/rl.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import {
|
|
2
2
|
runEvalCampaign
|
|
3
|
-
} from "./chunk-
|
|
4
|
-
import "./chunk-
|
|
3
|
+
} from "./chunk-75ZREHD7.js";
|
|
4
|
+
import "./chunk-M6RZ5LJN.js";
|
|
5
5
|
import {
|
|
6
6
|
rubricPredictiveValidity
|
|
7
7
|
} from "./chunk-YRZ4M5GS.js";
|
|
@@ -10,17 +10,17 @@ import {
|
|
|
10
10
|
} from "./chunk-MAZ26DC7.js";
|
|
11
11
|
import {
|
|
12
12
|
benjaminiHochberg
|
|
13
|
-
} from "./chunk-
|
|
13
|
+
} from "./chunk-CXJOVDJR.js";
|
|
14
14
|
import {
|
|
15
15
|
wilcoxonSignedRank
|
|
16
|
-
} from "./chunk-
|
|
17
|
-
import "./chunk-
|
|
16
|
+
} from "./chunk-4L3WJXQJ.js";
|
|
17
|
+
import "./chunk-UBPIXOC4.js";
|
|
18
18
|
import "./chunk-PC4UYEBM.js";
|
|
19
19
|
import "./chunk-TVVP3ZZQ.js";
|
|
20
20
|
import "./chunk-VSMTAMNK.js";
|
|
21
21
|
import {
|
|
22
22
|
ValidationError
|
|
23
|
-
} from "./chunk-
|
|
23
|
+
} from "./chunk-QYJT52YW.js";
|
|
24
24
|
import "./chunk-PZ5AY32C.js";
|
|
25
25
|
|
|
26
26
|
// src/rl/compute-curves.ts
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { S as Span, T as TraceStore, J as JudgeSpan } from './store-
|
|
2
|
-
import { T as Trajectory, a as TrajectoryStep } from './trajectory-
|
|
1
|
+
import { S as Span, T as TraceStore, J as JudgeSpan } from './store-Db2Bv8Cf.js';
|
|
2
|
+
import { T as Trajectory, a as TrajectoryStep } from './trajectory-CnoBo-JY.js';
|
|
3
3
|
|
|
4
4
|
/**
|
|
5
5
|
* Process Reward Modeling — per-step rubric grading.
|
|
@@ -294,4 +294,4 @@ declare class FileSystemTraceStore implements TraceStore {
|
|
|
294
294
|
artifacts(runId: string): Promise<Artifact[]>;
|
|
295
295
|
}
|
|
296
296
|
|
|
297
|
-
export { type Artifact as A, type BudgetLedgerEntry as B, type EventKind as E, type FailureClass as F, type GenericSpan as G, InMemoryTraceStore as I, type JudgeSpan as J, type LlmSpan as L, type Message as M, type Run as R, type Span as S, type TraceStore as T, type ToolSpan as a, type TraceEvent as b, type RunOutcome as c, type SpanKind as d, type RetrievalSpan as e, type SandboxSpan as f, type BudgetSpec as g, type
|
|
297
|
+
export { type Artifact as A, type BudgetLedgerEntry as B, type EventKind as E, type FailureClass as F, type GenericSpan as G, InMemoryTraceStore as I, type JudgeSpan as J, type LlmSpan as L, type Message as M, type Run as R, type Span as S, type TraceStore as T, type ToolSpan as a, type TraceEvent as b, type RunOutcome as c, type SpanKind as d, type RetrievalSpan as e, type SandboxSpan as f, type BudgetSpec as g, type RunFilter as h, type EventFilter as i, FAILURE_CLASSES as j, FileSystemTraceStore as k, type FileSystemTraceStoreOptions as l, type RunLayer as m, type RunStatus as n, type SpanBase as o, type SpanFilter as p, type SpanStatus as q, TRACE_SCHEMA_VERSION as r, isJudgeSpan as s, isLlmSpan as t, isRetrievalSpan as u, isSandboxSpan as v, isToolSpan as w };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { R as RunRecord, a as RunSplitTag } from './run-record-
|
|
2
|
-
import { F as FailureClusterReport } from './failure-cluster-
|
|
1
|
+
import { R as RunRecord, a as RunSplitTag } from './run-record-nYf9x2hU.js';
|
|
2
|
+
import { F as FailureClusterReport } from './failure-cluster-Cw65_5FY.js';
|
|
3
3
|
|
|
4
4
|
/**
|
|
5
5
|
* HeldOutGate — first-class held-out paired-delta promotion gate.
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { T as TraceEmitter } from './emitter-
|
|
2
|
-
import { R as Run, F as FailureClass, T as TraceStore } from './store-
|
|
1
|
+
import { T as TraceEmitter } from './emitter-DP_cSSiw.js';
|
|
2
|
+
import { R as Run, F as FailureClass, T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
3
3
|
|
|
4
4
|
/**
|
|
5
5
|
* SandboxHarness — executes a scenario in an isolated environment and
|