@tangle-network/agent-eval 0.25.0 → 0.27.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +145 -0
- package/README.md +5 -5
- package/dist/builder-eval/index.js +1 -1
- package/dist/{chunk-WWYCWKUM.js → chunk-3CKU6VGU.js} +2 -2
- package/dist/{chunk-K2TPS5LB.js → chunk-4U4BKCXK.js} +2 -2
- package/dist/chunk-4U4BKCXK.js.map +1 -0
- package/dist/{chunk-2A5XJB43.js → chunk-5AKPEK5L.js} +3 -3
- package/dist/chunk-5AKPEK5L.js.map +1 -0
- package/dist/{chunk-RAF443UI.js → chunk-DBIGN5MJ.js} +2 -2
- package/dist/{chunk-JLZQWFV3.js → chunk-K33INZHH.js} +2 -2
- package/dist/chunk-K33INZHH.js.map +1 -0
- package/dist/{chunk-NU65VQ7M.js → chunk-MAZ26DC7.js} +1 -1
- package/dist/chunk-MAZ26DC7.js.map +1 -0
- package/dist/{chunk-LSH4MMOZ.js → chunk-NCRFYPS3.js} +1 -1
- package/dist/chunk-NCRFYPS3.js.map +1 -0
- package/dist/{chunk-ZN274SWR.js → chunk-PALJO75S.js} +2 -2
- package/dist/{chunk-OWLAAMME.js → chunk-QHF6EQKK.js} +3 -2
- package/dist/chunk-QHF6EQKK.js.map +1 -0
- package/dist/chunk-R5UQJNKC.js +722 -0
- package/dist/chunk-R5UQJNKC.js.map +1 -0
- package/dist/{chunk-SESZDQPX.js → chunk-RUI6SIHY.js} +3 -3
- package/dist/chunk-RUI6SIHY.js.map +1 -0
- package/dist/{chunk-EDUKQ5AM.js → chunk-SZSBQUIJ.js} +2 -2
- package/dist/chunk-SZSBQUIJ.js.map +1 -0
- package/dist/{chunk-4F5DQN55.js → chunk-VSMTAMNK.js} +1 -1
- package/dist/chunk-VSMTAMNK.js.map +1 -0
- package/dist/{chunk-5LBB5B3Z.js → chunk-XFZCM5Z3.js} +1 -1
- package/dist/chunk-XFZCM5Z3.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/{control-CBShYYA6.d.ts → control-BT4qnXiS.d.ts} +2 -2
- package/dist/{control-runtime-BuJHoLg0.d.ts → control-runtime-BZ_lVLYW.d.ts} +1 -0
- package/dist/control.d.ts +3 -3
- package/dist/control.js +2 -2
- package/dist/{failure-cluster-C2EGSDiT.d.ts → failure-cluster-Cw65_5FY.d.ts} +1 -2
- package/dist/{feedback-trajectory-DfFdrraJ.d.ts → feedback-trajectory-D1aGKusy.d.ts} +1 -1
- package/dist/governance/index.d.ts +1 -1
- package/dist/{index-Oj9fAPPN.d.ts → index-BhLlu-qO.d.ts} +63 -2
- package/dist/index.d.ts +279 -72
- package/dist/index.js +222 -136
- package/dist/index.js.map +1 -1
- package/dist/knowledge/index.d.ts +1 -1
- package/dist/knowledge/index.js +2 -2
- package/dist/{multi-layer-verifier-LkP3LVKj.d.ts → multi-layer-verifier-U-c8ge1k.d.ts} +1 -1
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +5 -5
- package/dist/optimization.js +5 -5
- package/dist/pipelines/index.d.ts +1 -1
- package/dist/pipelines/index.js +2 -2
- package/dist/{release-report-BNgMdqPF.d.ts → release-report-CCQqnK46.d.ts} +1 -1
- package/dist/{replay-BL96gCEP.d.ts → replay-D7z0J43-.d.ts} +4 -5
- package/dist/reporting.d.ts +4 -4
- package/dist/reporting.js +5 -5
- package/dist/{researcher-BPT8x_NT.d.ts → researcher-G81CWc0q.d.ts} +9 -10
- package/dist/rl.d.ts +26 -44
- package/dist/rl.js +5 -5
- package/dist/rl.js.map +1 -1
- package/dist/{sequential-Dgz1n51-.d.ts → sequential-5iSVfzl2.d.ts} +2 -2
- package/dist/{summary-report-C7VPYEj2.d.ts → summary-report-Dl4akLKX.d.ts} +13 -1
- package/dist/traces.d.ts +1 -1
- package/dist/traces.js +2 -2
- package/dist/wire/index.d.ts +2 -2
- package/dist/wire/index.js +1 -1
- package/docs/concepts.md +11 -0
- package/docs/research-report-methodology.md +4 -4
- package/docs/three-package-architecture.md +12 -24
- package/package.json +1 -1
- package/dist/chunk-2A5XJB43.js.map +0 -1
- package/dist/chunk-4F5DQN55.js.map +0 -1
- package/dist/chunk-5LBB5B3Z.js.map +0 -1
- package/dist/chunk-EDUKQ5AM.js.map +0 -1
- package/dist/chunk-I4MBDTY5.js +0 -272
- package/dist/chunk-I4MBDTY5.js.map +0 -1
- package/dist/chunk-JLZQWFV3.js.map +0 -1
- package/dist/chunk-K2TPS5LB.js.map +0 -1
- package/dist/chunk-LSH4MMOZ.js.map +0 -1
- package/dist/chunk-NU65VQ7M.js.map +0 -1
- package/dist/chunk-OWLAAMME.js.map +0 -1
- package/dist/chunk-SESZDQPX.js.map +0 -1
- /package/dist/{chunk-WWYCWKUM.js.map → chunk-3CKU6VGU.js.map} +0 -0
- /package/dist/{chunk-RAF443UI.js.map → chunk-DBIGN5MJ.js.map} +0 -0
- /package/dist/{chunk-ZN274SWR.js.map → chunk-PALJO75S.js.map} +0 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { j as ControlSeverity, C as ControlEvalResult } from '../control-runtime-BuJHoLg0.js';
|
|
1
|
+
import { j as ControlSeverity, C as ControlEvalResult } from '../control-runtime-BZ_lVLYW.js';
|
|
2
2
|
import { T as TraceEmitter } from '../emitter-DP_cSSiw.js';
|
|
3
3
|
import '../store-Db2Bv8Cf.js';
|
|
4
4
|
|
package/dist/knowledge/index.js
CHANGED
|
@@ -4,8 +4,8 @@ import {
|
|
|
4
4
|
knowledgeReadinessTracePayload,
|
|
5
5
|
scoreKnowledgeReadiness,
|
|
6
6
|
userQuestionsForKnowledgeGaps
|
|
7
|
-
} from "../chunk-WWYCWKUM.js";
|
|
8
|
-
import "../chunk-LSH4MMOZ.js";
|
|
7
|
+
} from "../chunk-3CKU6VGU.js";
|
|
8
|
+
import "../chunk-NCRFYPS3.js";
|
|
9
9
|
import "../chunk-TVVP3ZZQ.js";
|
|
10
10
|
import "../chunk-PZ5AY32C.js";
|
|
11
11
|
export {
|
|
@@ -51,7 +51,7 @@ interface LayerResult {
|
|
|
51
51
|
* diagnostic name; null = "diagnostic not applicable / not measured."
|
|
52
52
|
* Renderers that know the keys can display them; ones that don't,
|
|
53
53
|
* ignore. Free-form on purpose — consumers type the value shape in
|
|
54
|
-
* their own namespace.
|
|
54
|
+
* their own namespace.
|
|
55
55
|
*/
|
|
56
56
|
diagnostics?: Record<string, number | null>;
|
|
57
57
|
/** Any rich per-layer detail — rendered as-is by consumers that know the layer. */
|
package/dist/openapi.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"openapi": "3.1.0",
|
|
3
3
|
"info": {
|
|
4
4
|
"title": "@tangle-network/agent-eval — wire protocol",
|
|
5
|
-
"version": "0.25.0",
|
|
5
|
+
"version": "0.27.2",
|
|
6
6
|
"description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
|
|
7
7
|
"contact": {
|
|
8
8
|
"name": "Tangle Network",
|
package/dist/optimization.d.ts
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-BPT8x_NT.js';
|
|
2
|
-
export { F as FeedbackArtifactType, a as FeedbackAttempt, b as FeedbackLabel, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, l as FeedbackTrajectory, m as FeedbackTrajectoryFilter, n as FeedbackTrajectoryStore, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-DfFdrraJ.js';
|
|
3
|
-
export { A as ActionableSideInfo, a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, G as GenerationReport, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, o as MutateAdapter, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, R as ReflectionContext, r as ReflectionProposal, S as ScenarioAggregate, s as ScoreAdapter, T as TrialCache, t as TrialResult, u as TrialTrace, V as VariantAggregate, v as buildReflectionPrompt, w as defaultMultiShotObjectives, x as parseReflectionResponse, y as runMultiShotOptimization, z as runPromptEvolution, B as trialTraceFromMultiShotTrial } from './summary-report-C7VPYEj2.js';
|
|
1
|
+
export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-G81CWc0q.js';
|
|
2
|
+
export { F as FeedbackArtifactType, a as FeedbackAttempt, b as FeedbackLabel, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, l as FeedbackTrajectory, m as FeedbackTrajectoryFilter, n as FeedbackTrajectoryStore, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-D1aGKusy.js';
|
|
3
|
+
export { A as ActionableSideInfo, a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, G as GenerationReport, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, o as MutateAdapter, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, R as ReflectionContext, r as ReflectionProposal, S as ScenarioAggregate, s as ScoreAdapter, T as TrialCache, t as TrialResult, u as TrialTrace, V as VariantAggregate, v as buildReflectionPrompt, w as defaultMultiShotObjectives, x as parseReflectionResponse, y as runMultiShotOptimization, z as runPromptEvolution, B as trialTraceFromMultiShotTrial } from './summary-report-Dl4akLKX.js';
|
|
4
4
|
import './errors-BZ9sTdz7.js';
|
|
5
5
|
import './integrity-DK2EBVZC.js';
|
|
6
6
|
import './store-Db2Bv8Cf.js';
|
|
7
7
|
import './run-record-CqzahIbx.js';
|
|
8
8
|
import './emitter-DP_cSSiw.js';
|
|
9
|
-
import './control-runtime-BuJHoLg0.js';
|
|
9
|
+
import './control-runtime-BZ_lVLYW.js';
|
|
10
10
|
import './dataset-CiK_3LDr.js';
|
|
11
|
-
import './failure-cluster-C2EGSDiT.js';
|
|
11
|
+
import './failure-cluster-Cw65_5FY.js';
|
package/dist/optimization.js
CHANGED
|
@@ -25,18 +25,18 @@ import {
|
|
|
25
25
|
summarizePreferenceMemory,
|
|
26
26
|
trialTraceFromMultiShotTrial,
|
|
27
27
|
withAssignedFeedbackSplit
|
|
28
|
-
} from "./chunk-EDUKQ5AM.js";
|
|
28
|
+
} from "./chunk-SZSBQUIJ.js";
|
|
29
29
|
import "./chunk-NLMNWKVM.js";
|
|
30
30
|
import {
|
|
31
31
|
runEvalCampaign
|
|
32
|
-
} from "./chunk-SESZDQPX.js";
|
|
32
|
+
} from "./chunk-RUI6SIHY.js";
|
|
33
33
|
import "./chunk-4S4BM3QQ.js";
|
|
34
|
-
import "./chunk-2A5XJB43.js";
|
|
35
|
-
import "./chunk-
|
|
34
|
+
import "./chunk-5AKPEK5L.js";
|
|
35
|
+
import "./chunk-R5UQJNKC.js";
|
|
36
36
|
import "./chunk-KTGTIOFD.js";
|
|
37
37
|
import "./chunk-PC4UYEBM.js";
|
|
38
38
|
import "./chunk-TVVP3ZZQ.js";
|
|
39
|
-
import "./chunk-4F5DQN55.js";
|
|
39
|
+
import "./chunk-VSMTAMNK.js";
|
|
40
40
|
import "./chunk-NG236HPC.js";
|
|
41
41
|
import "./chunk-PZ5AY32C.js";
|
|
42
42
|
export {
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { g as BudgetSpec, T as TraceStore, h as RunFilter, R as Run, a as ToolSpan } from '../store-Db2Bv8Cf.js';
|
|
2
|
-
export { a as FailureCluster, F as FailureClusterReport, f as failureClusterView } from '../failure-cluster-C2EGSDiT.js';
|
|
2
|
+
export { a as FailureCluster, F as FailureClusterReport, f as failureClusterView } from '../failure-cluster-Cw65_5FY.js';
|
|
3
3
|
import { a as TrajectoryStep } from '../trajectory-CnoBo-JY.js';
|
|
4
4
|
import { B as BaselineOptions, a as BaselineReport } from '../baseline-4R5deP0N.js';
|
|
5
5
|
export { c as computeToolUseMetrics } from '../baseline-4R5deP0N.js';
|
package/dist/pipelines/index.js
CHANGED
|
@@ -2,13 +2,13 @@ import {
|
|
|
2
2
|
compareToBaseline,
|
|
3
3
|
computeToolUseMetrics,
|
|
4
4
|
failureClusterView
|
|
5
|
-
} from "../chunk-JLZQWFV3.js";
|
|
5
|
+
} from "../chunk-K33INZHH.js";
|
|
6
6
|
import {
|
|
7
7
|
buildTrajectory
|
|
8
8
|
} from "../chunk-RZTMDUO7.js";
|
|
9
9
|
import {
|
|
10
10
|
interRaterReliability
|
|
11
|
-
} from "../chunk-
|
|
11
|
+
} from "../chunk-R5UQJNKC.js";
|
|
12
12
|
import {
|
|
13
13
|
aggregateLlm,
|
|
14
14
|
argHash,
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { D as DatasetSplit, b as DatasetManifest, a as DatasetScenario } from './dataset-CiK_3LDr.js';
|
|
2
|
-
import { a3 as GateDecision, A as ActionableSideInfo, m as MultiShotTrialResult } from './summary-report-C7VPYEj2.js';
|
|
2
|
+
import { a3 as GateDecision, A as ActionableSideInfo, m as MultiShotTrialResult } from './summary-report-Dl4akLKX.js';
|
|
3
3
|
import { R as RunRecord, a as RunSplitTag } from './run-record-CqzahIbx.js';
|
|
4
4
|
|
|
5
5
|
/**
|
|
@@ -107,11 +107,10 @@ declare function redactValue(value: unknown, rules?: RedactionRule[], report?: R
|
|
|
107
107
|
* Replay-from-raw-events — turn every captured campaign run into a
|
|
108
108
|
* re-runnable artifact.
|
|
109
109
|
*
|
|
110
|
-
*
|
|
111
|
-
*
|
|
112
|
-
*
|
|
113
|
-
* the
|
|
114
|
-
* burning new LLM cost.
|
|
110
|
+
* `RawProviderSink` captures every provider HTTP envelope; `runEvalCampaign`
|
|
111
|
+
* makes that capture the default. Together they make every past run a
|
|
112
|
+
* complete fingerprint of what happened on the wire — enough to replay
|
|
113
|
+
* the run without burning new LLM cost.
|
|
115
114
|
*
|
|
116
115
|
* Three use cases this primitive enables:
|
|
117
116
|
*
|
package/dist/reporting.d.ts
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-C0uDYwG6.js';
|
|
2
|
-
export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as bhAdjust, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as pairedWilcoxon, s as releaseTraceEvidenceFromMultiShotTrials, t as renderReleaseReport } from './release-report-BNgMdqPF.js';
|
|
3
|
-
export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-Dgz1n51-.js';
|
|
4
|
-
export { C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, J as ParetoFigureSpec, K as ParetoPoint, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, $ as gainHistogram, a0 as paretoChart, a1 as researchReport, a2 as summaryTable } from './summary-report-C7VPYEj2.js';
|
|
2
|
+
export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as bhAdjust, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as pairedWilcoxon, s as releaseTraceEvidenceFromMultiShotTrials, t as renderReleaseReport } from './release-report-CCQqnK46.js';
|
|
3
|
+
export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
|
|
4
|
+
export { C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, J as ParetoFigureSpec, K as ParetoPoint, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, $ as gainHistogram, a0 as paretoChart, a1 as researchReport, a2 as summaryTable } from './summary-report-Dl4akLKX.js';
|
|
5
5
|
import './run-record-CqzahIbx.js';
|
|
6
6
|
import './errors-BZ9sTdz7.js';
|
|
7
7
|
import './outcome-store-D6KWmYvj.js';
|
|
8
8
|
import './dataset-CiK_3LDr.js';
|
|
9
|
-
import './failure-cluster-C2EGSDiT.js';
|
|
9
|
+
import './failure-cluster-Cw65_5FY.js';
|
|
10
10
|
import './store-Db2Bv8Cf.js';
|
package/dist/reporting.js
CHANGED
|
@@ -5,14 +5,14 @@ import {
|
|
|
5
5
|
judgeReplayGate,
|
|
6
6
|
releaseTraceEvidenceFromMultiShotTrials,
|
|
7
7
|
renderReleaseReport
|
|
8
|
-
} from "./chunk-RAF443UI.js";
|
|
8
|
+
} from "./chunk-DBIGN5MJ.js";
|
|
9
9
|
import {
|
|
10
10
|
rubricPredictiveValidity
|
|
11
11
|
} from "./chunk-YRZ4M5GS.js";
|
|
12
12
|
import {
|
|
13
13
|
evaluateInterimReleaseConfidence,
|
|
14
14
|
pairedEvalueSequence
|
|
15
|
-
} from "./chunk-NU65VQ7M.js";
|
|
15
|
+
} from "./chunk-MAZ26DC7.js";
|
|
16
16
|
import {
|
|
17
17
|
RESEARCH_REPORT_HARD_PAIR_FLOOR,
|
|
18
18
|
bhAdjust,
|
|
@@ -22,9 +22,9 @@ import {
|
|
|
22
22
|
paretoChart,
|
|
23
23
|
researchReport,
|
|
24
24
|
summaryTable
|
|
25
|
-
} from "./chunk-2A5XJB43.js";
|
|
26
|
-
import "./chunk-
|
|
27
|
-
import "./chunk-4F5DQN55.js";
|
|
25
|
+
} from "./chunk-5AKPEK5L.js";
|
|
26
|
+
import "./chunk-R5UQJNKC.js";
|
|
27
|
+
import "./chunk-VSMTAMNK.js";
|
|
28
28
|
import "./chunk-NG236HPC.js";
|
|
29
29
|
import "./chunk-PZ5AY32C.js";
|
|
30
30
|
export {
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { A as AgentEvalError, C as CaptureIntegrityError } from './errors-BZ9sTdz7.js';
|
|
2
2
|
import { R as RawProviderSink, P as ProviderRedactor, a as RunIntegrityExpectations, b as RunIntegrityReport } from './integrity-DK2EBVZC.js';
|
|
3
3
|
import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, R as RunRecord } from './run-record-CqzahIbx.js';
|
|
4
|
-
import { W as ResearchReportOptions, N as ResearchReport, a3 as GateDecision } from './summary-report-C7VPYEj2.js';
|
|
4
|
+
import { W as ResearchReportOptions, N as ResearchReport, a3 as GateDecision } from './summary-report-Dl4akLKX.js';
|
|
5
5
|
import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DP_cSSiw.js';
|
|
6
6
|
import { T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
7
7
|
|
|
@@ -224,16 +224,15 @@ declare class LlmClient {
|
|
|
224
224
|
* EvalCampaign — opinionated matrix runner that wires the four
|
|
225
225
|
* capture-integrity directives by construction.
|
|
226
226
|
*
|
|
227
|
-
*
|
|
228
|
-
*
|
|
229
|
-
*
|
|
230
|
-
*
|
|
231
|
-
*
|
|
232
|
-
*
|
|
233
|
-
* directives in `SKILL.md § Capture integrity` are mitigations.
|
|
227
|
+
* The canonical benchmark shape — matrix runner → for each
|
|
228
|
+
* (variant, scenario, seed) → start a TraceEmitter → call LLMs → end the
|
|
229
|
+
* run → analyze — has a bug class at the integration boundary: raw
|
|
230
|
+
* events not captured, route silently wrong, integrity not asserted,
|
|
231
|
+
* analyst never run. The directives in `SKILL.md § Capture integrity`
|
|
232
|
+
* are the mitigations.
|
|
234
233
|
*
|
|
235
|
-
* `EvalCampaign` is the structural fix
|
|
236
|
-
* surface
|
|
234
|
+
* `EvalCampaign` is the structural fix — consumers don't wire the
|
|
235
|
+
* integrity surface themselves; the campaign owns it. Specifically:
|
|
237
236
|
*
|
|
238
237
|
* - calls `assertLlmRoute` once at preflight before any work runs
|
|
239
238
|
* - constructs a per-run `TraceStore` and `RawProviderSink` via factories
|
package/dist/rl.d.ts
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
import { R as RunRecord, a as RunSplitTag } from './run-record-CqzahIbx.js';
|
|
2
|
-
import { V as VerificationReport } from './multi-layer-verifier-LkP3LVKj.js';
|
|
3
|
-
import { t as TrialResult, V as VariantAggregate, q as PromptEvolutionResult, e as MultiShotOptimizationResult } from './summary-report-C7VPYEj2.js';
|
|
2
|
+
import { V as VerificationReport } from './multi-layer-verifier-U-c8ge1k.js';
|
|
3
|
+
import { t as TrialResult, V as VariantAggregate, q as PromptEvolutionResult, e as MultiShotOptimizationResult } from './summary-report-Dl4akLKX.js';
|
|
4
4
|
import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
|
|
5
5
|
import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-C0uDYwG6.js';
|
|
6
|
-
import { I as InterimReleaseConfidence } from './sequential-Dgz1n51-.js';
|
|
6
|
+
import { I as InterimReleaseConfidence } from './sequential-5iSVfzl2.js';
|
|
7
7
|
import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
8
|
-
import { R as Researcher, l as FailureMode, S as SteeringChange, j as ExperimentPlan, k as ExperimentResult, i as EvalCampaignResult, E as EvalCampaignOptions } from './researcher-BPT8x_NT.js';
|
|
9
|
-
export { r as runEvalCampaign } from './researcher-BPT8x_NT.js';
|
|
8
|
+
import { R as Researcher, l as FailureMode, S as SteeringChange, j as ExperimentPlan, k as ExperimentResult, i as EvalCampaignResult, E as EvalCampaignOptions } from './researcher-G81CWc0q.js';
|
|
9
|
+
export { r as runEvalCampaign } from './researcher-G81CWc0q.js';
|
|
10
10
|
import './errors-BZ9sTdz7.js';
|
|
11
|
-
import './failure-cluster-C2EGSDiT.js';
|
|
11
|
+
import './failure-cluster-Cw65_5FY.js';
|
|
12
12
|
import './integrity-DK2EBVZC.js';
|
|
13
13
|
import './emitter-DP_cSSiw.js';
|
|
14
14
|
|
|
@@ -529,17 +529,12 @@ declare function toAnthropicFormat(triples: PreferenceTriple[]): Array<{
|
|
|
529
529
|
}>;
|
|
530
530
|
|
|
531
531
|
/**
|
|
532
|
-
* Adapters: convert
|
|
533
|
-
* `RunRecord[]` artifact that
|
|
532
|
+
* Adapters: convert `TrialResult[]` (from `runMultiShotOptimization`,
|
|
533
|
+
* `runPromptEvolution`) into the canonical `RunRecord[]` artifact that
|
|
534
|
+
* `replayCache`, `pairedEvalueSequence`, and `rubricPredictiveValidity`
|
|
535
|
+
* consume.
|
|
534
536
|
*
|
|
535
|
-
*
|
|
536
|
-
* eval matrix produces one `RunRecord`. The pre-0.22 optimization
|
|
537
|
-
* primitives (`runMultiShotOptimization`, `runPromptEvolution`) produce
|
|
538
|
-
* `TrialResult[]` with a different shape. This file bridges the two so
|
|
539
|
-
* the new primitives (`replayCache`, `pairedEvalueSequence`,
|
|
540
|
-
* `rubricPredictiveValidity`) compose cleanly with the existing RL stack.
|
|
541
|
-
*
|
|
542
|
-
* The adapters are thin and explicit — every mandatory `RunRecord` field
|
|
537
|
+
* Adapters are thin and explicit — every mandatory `RunRecord` field
|
|
543
538
|
* comes from a caller-supplied context (`commitSha`, `model`,
|
|
544
539
|
* `promptHash`, `configHash`) plus the trial's runtime data. Defaults
|
|
545
540
|
* exist for fields the trial doesn't carry (`tokenUsage`, `costUsd`),
|
|
@@ -1505,18 +1500,16 @@ interface DetectRewardHackingInput {
|
|
|
1505
1500
|
declare function detectRewardHacking(input: DetectRewardHackingInput): RewardHackingReport;
|
|
1506
1501
|
|
|
1507
1502
|
/**
|
|
1508
|
-
* `analyzeOptimizationResult` — unifies the
|
|
1503
|
+
* `analyzeOptimizationResult` — unifies the auto-research stack
|
|
1509
1504
|
* (`runPromptEvolution`, `runMultiShotOptimization`, reflective-mutation,
|
|
1510
|
-
* Ax/AxRLM trace analyst) with the
|
|
1505
|
+
* Ax/AxRLM trace analyst) with the RL bridge in a single call.
|
|
1511
1506
|
*
|
|
1512
|
-
*
|
|
1513
|
-
*
|
|
1514
|
-
*
|
|
1515
|
-
* was decoupled from both. `analyzeOptimizationResult` does the wiring
|
|
1516
|
-
* once so consumers don't have to:
|
|
1507
|
+
* The optimization primitives produce `TrialResult[]`; the RL bridge
|
|
1508
|
+
* consumes `RunRecord[]`. Trace-analyst is independent of both. This
|
|
1509
|
+
* function does the wiring once so consumers don't have to:
|
|
1517
1510
|
*
|
|
1518
|
-
* Optimization (existing primitives) RL bridge
|
|
1519
|
-
* ──────────────────────────────────
|
|
1511
|
+
* Optimization (existing primitives) RL bridge
|
|
1512
|
+
* ────────────────────────────────── ────────
|
|
1520
1513
|
* runPromptEvolution → TrialResult[] →
|
|
1521
1514
|
* runMultiShotOptimization → MSTrial[] → analyzeOptimizationResult →
|
|
1522
1515
|
* reflective-mutation → mutations.jsonl → ↓
|
|
@@ -1527,10 +1520,10 @@ declare function detectRewardHacking(input: DetectRewardHackingInput): RewardHac
|
|
|
1527
1520
|
* ↓ │
|
|
1528
1521
|
* TraceAnalyst.analyze(progressLog) ←─────────────────────────┘
|
|
1529
1522
|
*
|
|
1530
|
-
* The output
|
|
1531
|
-
*
|
|
1532
|
-
*
|
|
1533
|
-
*
|
|
1523
|
+
* The output is the canonical RL artifact set: `RunRecord[]` (so every
|
|
1524
|
+
* other RL primitive composes), preference triples, verifiable reward
|
|
1525
|
+
* signals, reward-hacking diagnosis, sequential interim verdict, and
|
|
1526
|
+
* (when wired) trace-analyst summary.
|
|
1534
1527
|
*
|
|
1535
1528
|
* What this primitive does NOT do: it does not modify the optimization
|
|
1536
1529
|
* primitives' internals. They keep producing `TrialResult` and emitting
|
|
@@ -1609,11 +1602,7 @@ declare function analyzeOptimizationResult(opts: AnalyzeOptimizationResultOption
|
|
|
1609
1602
|
* `PredictiveValidityResearcher` — concrete `Researcher` implementation
|
|
1610
1603
|
* that drives selection from outcome-anchored predictive validity.
|
|
1611
1604
|
*
|
|
1612
|
-
*
|
|
1613
|
-
* 0.23. The 0.23 panel critique called this out: shipping the interface
|
|
1614
|
-
* without a default implementation that drives the loop is incomplete.
|
|
1615
|
-
*
|
|
1616
|
-
* This researcher answers each method:
|
|
1605
|
+
* Each method:
|
|
1617
1606
|
*
|
|
1618
1607
|
* - `inspectFailures(runs)` — synthesizes failure modes from the
|
|
1619
1608
|
* bottom-quartile of `RunRecord`s on the configured proxy reward.
|
|
@@ -1676,14 +1665,10 @@ declare class PredictiveValidityResearcher implements Researcher {
|
|
|
1676
1665
|
}
|
|
1677
1666
|
|
|
1678
1667
|
/**
|
|
1679
|
-
* `runRLCampaign` —
|
|
1668
|
+
* `runRLCampaign` — top-level orchestrator that runs the matrix and
|
|
1669
|
+
* produces every RL-ready artifact in one call.
|
|
1680
1670
|
*
|
|
1681
|
-
*
|
|
1682
|
-
* RL primitives consume that artifact in different ways. Until 0.24 they
|
|
1683
|
-
* had to be wired together by hand at every consumer; that defeats the
|
|
1684
|
-
* cohesion the package is supposed to provide.
|
|
1685
|
-
*
|
|
1686
|
-
* `runRLCampaign` wires:
|
|
1671
|
+
* Wires:
|
|
1687
1672
|
* 1. `runEvalCampaign` for the matrix run (capture, integrity, hooks)
|
|
1688
1673
|
* 2. `extractVerifiableReward` over each run, separating deterministic
|
|
1689
1674
|
* from probabilistic reward sources for the trainer
|
|
@@ -1697,9 +1682,6 @@ declare class PredictiveValidityResearcher implements Researcher {
|
|
|
1697
1682
|
* stage's output is in there. The consumer's downstream fits in a single
|
|
1698
1683
|
* line: pass `result.preferences` to their DPO trainer, `result.grpoRows`
|
|
1699
1684
|
* to GRPO, `result.runs` plus `result.rewardSignals` to a custom RL loop.
|
|
1700
|
-
*
|
|
1701
|
-
* This is what the 0.23 panel critique called the "missing top-level
|
|
1702
|
-
* primitive." Now shipped.
|
|
1703
1685
|
*/
|
|
1704
1686
|
|
|
1705
1687
|
interface RunRLCampaignOptions<V> extends EvalCampaignOptions<V> {
|
package/dist/rl.js
CHANGED
|
@@ -1,23 +1,23 @@
|
|
|
1
1
|
import {
|
|
2
2
|
runEvalCampaign
|
|
3
|
-
} from "./chunk-SESZDQPX.js";
|
|
3
|
+
} from "./chunk-RUI6SIHY.js";
|
|
4
4
|
import "./chunk-4S4BM3QQ.js";
|
|
5
5
|
import {
|
|
6
6
|
rubricPredictiveValidity
|
|
7
7
|
} from "./chunk-YRZ4M5GS.js";
|
|
8
8
|
import {
|
|
9
9
|
evaluateInterimReleaseConfidence
|
|
10
|
-
} from "./chunk-NU65VQ7M.js";
|
|
10
|
+
} from "./chunk-MAZ26DC7.js";
|
|
11
11
|
import {
|
|
12
12
|
benjaminiHochberg
|
|
13
|
-
} from "./chunk-2A5XJB43.js";
|
|
13
|
+
} from "./chunk-5AKPEK5L.js";
|
|
14
14
|
import {
|
|
15
15
|
wilcoxonSignedRank
|
|
16
|
-
} from "./chunk-
|
|
16
|
+
} from "./chunk-R5UQJNKC.js";
|
|
17
17
|
import "./chunk-KTGTIOFD.js";
|
|
18
18
|
import "./chunk-PC4UYEBM.js";
|
|
19
19
|
import "./chunk-TVVP3ZZQ.js";
|
|
20
|
-
import "./chunk-4F5DQN55.js";
|
|
20
|
+
import "./chunk-VSMTAMNK.js";
|
|
21
21
|
import {
|
|
22
22
|
ValidationError
|
|
23
23
|
} from "./chunk-NG236HPC.js";
|