@tangle-network/agent-eval 0.23.1 → 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +80 -0
- package/README.md +141 -79
- package/dist/baseline-4R5deP0N.d.ts +108 -0
- package/dist/benchmarks/index.d.ts +3 -2
- package/dist/benchmarks/index.js +1 -1
- package/dist/builder-eval/index.d.ts +249 -0
- package/dist/builder-eval/index.js +391 -0
- package/dist/builder-eval/index.js.map +1 -0
- package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
- package/dist/chunk-2A5XJB43.js.map +1 -0
- package/dist/chunk-47X6LRCE.js +76 -0
- package/dist/chunk-47X6LRCE.js.map +1 -0
- package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
- package/dist/chunk-4F5DQN55.js.map +1 -0
- package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
- package/dist/chunk-4S4BM3QQ.js.map +1 -0
- package/dist/chunk-5BKGXME7.js +65 -0
- package/dist/chunk-5BKGXME7.js.map +1 -0
- package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
- package/dist/chunk-6QDKWHLS.js.map +1 -0
- package/dist/chunk-I4MBDTY5.js +272 -0
- package/dist/chunk-I4MBDTY5.js.map +1 -0
- package/dist/chunk-K2TPS5LB.js +569 -0
- package/dist/chunk-K2TPS5LB.js.map +1 -0
- package/dist/chunk-KKHDIONI.js +414 -0
- package/dist/chunk-KKHDIONI.js.map +1 -0
- package/dist/chunk-KMPRBJK4.js +74 -0
- package/dist/chunk-KMPRBJK4.js.map +1 -0
- package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
- package/dist/chunk-KTGTIOFD.js.map +1 -0
- package/dist/chunk-LSH4MMOZ.js +838 -0
- package/dist/chunk-LSH4MMOZ.js.map +1 -0
- package/dist/chunk-NG236HPC.js +57 -0
- package/dist/chunk-NG236HPC.js.map +1 -0
- package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
- package/dist/chunk-NLMNWKVM.js.map +1 -0
- package/dist/chunk-NU65VQ7M.js +99 -0
- package/dist/chunk-NU65VQ7M.js.map +1 -0
- package/dist/chunk-OHEPNJQN.js +554 -0
- package/dist/chunk-OHEPNJQN.js.map +1 -0
- package/dist/chunk-OWLAAMME.js +250 -0
- package/dist/chunk-OWLAAMME.js.map +1 -0
- package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
- package/dist/chunk-PC4UYEBM.js.map +1 -0
- package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
- package/dist/chunk-RAF443UI.js.map +1 -0
- package/dist/chunk-RZTMDUO7.js +49 -0
- package/dist/chunk-RZTMDUO7.js.map +1 -0
- package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
- package/dist/chunk-SESZDQPX.js.map +1 -0
- package/dist/{chunk-6KQG5HAH.js → chunk-SY6WAAAD.js} +84 -71
- package/dist/chunk-SY6WAAAD.js.map +1 -0
- package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
- package/dist/chunk-TVVP3ZZQ.js.map +1 -0
- package/dist/{chunk-VQQSPGSM.js → chunk-VRJVTXRV.js} +169 -111
- package/dist/chunk-VRJVTXRV.js.map +1 -0
- package/dist/chunk-WWYCWKUM.js +196 -0
- package/dist/chunk-WWYCWKUM.js.map +1 -0
- package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
- package/dist/chunk-YRZ4M5GS.js.map +1 -0
- package/dist/chunk-ZN274SWR.js +613 -0
- package/dist/chunk-ZN274SWR.js.map +1 -0
- package/dist/cli.js +10 -6
- package/dist/cli.js.map +1 -1
- package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
- package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
- package/dist/control.d.ts +8 -6
- package/dist/control.js +10 -7
- package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
- package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
- package/dist/errors-BZ9sTdz7.d.ts +70 -0
- package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
- package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
- package/dist/governance/index.d.ts +5 -0
- package/dist/governance/index.js +18 -0
- package/dist/governance/index.js.map +1 -0
- package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
- package/dist/index-Oj9fAPPN.d.ts +270 -0
- package/dist/index.d.ts +1866 -3151
- package/dist/index.js +5457 -7809
- package/dist/index.js.map +1 -1
- package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
- package/dist/knowledge/index.d.ts +102 -0
- package/dist/knowledge/index.js +18 -0
- package/dist/knowledge/index.js.map +1 -0
- package/dist/meta-eval/index.d.ts +99 -0
- package/dist/meta-eval/index.js +324 -0
- package/dist/meta-eval/index.js.map +1 -0
- package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +11 -8
- package/dist/optimization.js +11 -9
- package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
- package/dist/pipelines/index.d.ts +172 -0
- package/dist/pipelines/index.js +409 -0
- package/dist/pipelines/index.js.map +1 -0
- package/dist/prm/index.d.ts +99 -0
- package/dist/prm/index.js +222 -0
- package/dist/prm/index.js.map +1 -0
- package/dist/query-DODUYdPg.d.ts +30 -0
- package/dist/release-report-TDPn1cxq.d.ts +292 -0
- package/dist/replay-BL96gCEP.d.ts +226 -0
- package/dist/reporting.d.ts +10 -295
- package/dist/reporting.js +10 -6
- package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-CUOiGcGv.d.ts} +148 -146
- package/dist/rl.d.ts +1762 -8
- package/dist/rl.js +2035 -58
- package/dist/rl.js.map +1 -1
- package/dist/rubric-D5tjHNJQ.d.ts +72 -0
- package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
- package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
- package/dist/sequential-Dgz1n51-.d.ts +139 -0
- package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
- package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-BXGs_9V0.d.ts} +3 -76
- package/dist/telemetry/file.js +4 -1
- package/dist/telemetry/file.js.map +1 -1
- package/dist/telemetry/index.js +57 -57
- package/dist/telemetry/index.js.map +1 -1
- package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
- package/dist/traces.d.ts +142 -387
- package/dist/traces.js +1302 -40
- package/dist/traces.js.map +1 -1
- package/dist/trajectory-CnoBo-JY.d.ts +32 -0
- package/dist/wire/index.d.ts +22 -22
- package/dist/wire/index.js +4 -3
- package/package.json +44 -18
- package/dist/chunk-42I2QC2L.js.map +0 -1
- package/dist/chunk-5IIQKMD5.js.map +0 -1
- package/dist/chunk-6KQG5HAH.js.map +0 -1
- package/dist/chunk-6M774GY6.js.map +0 -1
- package/dist/chunk-7EAUOUQS.js.map +0 -1
- package/dist/chunk-AXHNWLIX.js.map +0 -1
- package/dist/chunk-EXGR4XEM.js.map +0 -1
- package/dist/chunk-IOXMGMHQ.js.map +0 -1
- package/dist/chunk-KAO3Q65R.js.map +0 -1
- package/dist/chunk-LZKIOBG2.js +0 -2026
- package/dist/chunk-LZKIOBG2.js.map +0 -1
- package/dist/chunk-QBW3YBTR.js.map +0 -1
- package/dist/chunk-QUKKGHTZ.js.map +0 -1
- package/dist/chunk-SQQLHODJ.js.map +0 -1
- package/dist/chunk-V5QSWN7L.js +0 -1310
- package/dist/chunk-V5QSWN7L.js.map +0 -1
- package/dist/chunk-VQQSPGSM.js.map +0 -1
- package/dist/chunk-XPHOZPOM.js +0 -1947
- package/dist/chunk-XPHOZPOM.js.map +0 -1
- package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
- package/dist/index-ekBXweiQ.d.ts +0 -1894
- package/dist/sequential-DgU2mFsE.d.ts +0 -304
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Multi-layer verifier — ordered pipeline of verification layers.
|
|
3
|
+
*
|
|
4
|
+
* Different contract from {@link JudgeRunner} (which runs parallel
|
|
5
|
+
* specs against a sandbox). MultiLayerVerifier is a DAG of layers
|
|
6
|
+
* (install → typecheck → build → lint → serve → semantic → …) with
|
|
7
|
+
* dependency-based skip, per-layer findings, soft-fail semantics, and
|
|
8
|
+
* an aggregated `blendedScore` across all passed layers.
|
|
9
|
+
*
|
|
10
|
+
* Use when you want:
|
|
11
|
+
* - ordered stages where a failing upstream stage skips downstream ones
|
|
12
|
+
* - each stage produces rich `findings` (severity + message + evidence)
|
|
13
|
+
* - a single composite score across stages with per-stage weights
|
|
14
|
+
* - soft-fail stages whose failure doesn't abort the pipeline
|
|
15
|
+
*
|
|
16
|
+
* Use {@link JudgeRunner} when you want:
|
|
17
|
+
* - N independent judges running in parallel against the same artifact
|
|
18
|
+
* - no inter-judge dependencies
|
|
19
|
+
* - boolean `passed` per judge + overall
|
|
20
|
+
*
|
|
21
|
+
* Both primitives compose — JudgeRunner can be invoked as a single
|
|
22
|
+
* layer inside a MultiLayerVerifier if that suits the caller.
|
|
23
|
+
*/
|
|
24
|
+
type LayerStatus = 'pass' | 'fail' | 'skipped' | 'error' | 'timeout';
|
|
25
|
+
type Severity = 'critical' | 'major' | 'minor' | 'info';
|
|
26
|
+
interface Finding {
|
|
27
|
+
severity: Severity;
|
|
28
|
+
message: string;
|
|
29
|
+
evidence?: string;
|
|
30
|
+
/** Optional layer name the finding belongs to (set by the verifier if omitted). */
|
|
31
|
+
layer?: string;
|
|
32
|
+
/**
|
|
33
|
+
* Free-form structured payload — used by `multiToolchainLayer` to attach
|
|
34
|
+
* `{ adapter: 'pnpm' }`, by judges to attach evidence pointers, etc.
|
|
35
|
+
* Renderers MAY interrogate; agent-eval primitives never assume shape.
|
|
36
|
+
*/
|
|
37
|
+
detail?: Record<string, unknown>;
|
|
38
|
+
}
|
|
39
|
+
interface LayerResult {
|
|
40
|
+
layer: string;
|
|
41
|
+
status: LayerStatus;
|
|
42
|
+
/** 0..1 score, optional — layers that don't produce a numeric score omit. */
|
|
43
|
+
score?: number;
|
|
44
|
+
durationMs: number;
|
|
45
|
+
findings: Finding[];
|
|
46
|
+
/** Short human-readable summary (one line). */
|
|
47
|
+
reason?: string;
|
|
48
|
+
/**
|
|
49
|
+
* Numeric layer-level diagnostics: error counts, warning counts,
|
|
50
|
+
* cyclomatic complexity, total adapter wall-time, etc. Keyed by
|
|
51
|
+
* diagnostic name; null = "diagnostic not applicable / not measured."
|
|
52
|
+
* Renderers that know the keys can display them; ones that don't,
|
|
53
|
+
* ignore. Free-form on purpose — consumers type the value shape in
|
|
54
|
+
* their own namespace. Added in 0.10.
|
|
55
|
+
*/
|
|
56
|
+
diagnostics?: Record<string, number | null>;
|
|
57
|
+
/** Any rich per-layer detail — rendered as-is by consumers that know the layer. */
|
|
58
|
+
detail?: Record<string, unknown>;
|
|
59
|
+
}
|
|
60
|
+
interface VerifyContext<Env = unknown> {
|
|
61
|
+
/** Per-run opaque context the caller provides. Layers destructure what they need. */
|
|
62
|
+
env: Env;
|
|
63
|
+
/** Previously-computed results from layers that already ran. */
|
|
64
|
+
prior: Record<string, LayerResult>;
|
|
65
|
+
/** Signal — if aborted, layers MUST bail within reasonable wall. */
|
|
66
|
+
signal: AbortSignal;
|
|
67
|
+
}
|
|
68
|
+
interface Layer<Env = unknown> {
|
|
69
|
+
name: string;
|
|
70
|
+
/** Stages that must have `status: 'pass'` before this layer runs. */
|
|
71
|
+
dependsOn?: string[];
|
|
72
|
+
/**
|
|
73
|
+
* Weight in the composite `blendedScore`. Default 1.0. Layers with weight 0
|
|
74
|
+
* contribute findings but not score.
|
|
75
|
+
*/
|
|
76
|
+
weight?: number;
|
|
77
|
+
/**
|
|
78
|
+
* If true, a `fail` status contributes to `blendedScore` (as 0) instead of
|
|
79
|
+
* being dropped — use for layers whose failure is a real signal. Default:
|
|
80
|
+
* fail drops from numerator + denominator, matching VB's existing semantics.
|
|
81
|
+
*/
|
|
82
|
+
failContributesToScore?: boolean;
|
|
83
|
+
/** Optional per-layer wall-cap in ms. Honored by the verifier (AbortSignal). */
|
|
84
|
+
capMs?: number;
|
|
85
|
+
run: (ctx: VerifyContext<Env>) => Promise<LayerResult> | LayerResult;
|
|
86
|
+
}
|
|
87
|
+
interface VerifyOptions<Env = unknown> {
|
|
88
|
+
env: Env;
|
|
89
|
+
/**
|
|
90
|
+
* Overall wall cap. Default: sum of layer capMs, or Infinity if any layer
|
|
91
|
+
* omits a cap. The verifier short-circuits remaining layers on overall cap.
|
|
92
|
+
*/
|
|
93
|
+
overallCapMs?: number;
|
|
94
|
+
/** Called with each layer result as it completes. */
|
|
95
|
+
onLayer?: (result: LayerResult) => void;
|
|
96
|
+
}
|
|
97
|
+
interface VerificationReport {
|
|
98
|
+
layers: LayerResult[];
|
|
99
|
+
passCount: number;
|
|
100
|
+
failCount: number;
|
|
101
|
+
skippedCount: number;
|
|
102
|
+
errorCount: number;
|
|
103
|
+
/** True iff at least one scored layer ran AND every scored layer passed. */
|
|
104
|
+
allPass: boolean;
|
|
105
|
+
/**
|
|
106
|
+
* Weighted mean of `score` across contributing layers. 0 when no layers
|
|
107
|
+
* contributed. See {@link Layer.failContributesToScore} for fail semantics.
|
|
108
|
+
*/
|
|
109
|
+
blendedScore: number;
|
|
110
|
+
durationMs: number;
|
|
111
|
+
startedAt: string;
|
|
112
|
+
finishedAt: string;
|
|
113
|
+
}
|
|
114
|
+
/**
|
|
115
|
+
* Grade a semantic-concept-style judge result into a single layer status.
|
|
116
|
+
*
|
|
117
|
+
* Pass when overall score >= threshold AND no critical-severity concept gap.
|
|
118
|
+
* Fail otherwise. Use inside a `Layer.run` when wrapping a concept judge.
|
|
119
|
+
*
|
|
120
|
+
* Generalized from VerticalBench H3 fix: `failingConcepts.length === 0` was
|
|
121
|
+
* too strict — a single concept at 6/10 failed the entire layer despite
|
|
122
|
+
* overall score being >= 0.7. Now we trust the judge's own `severity` field:
|
|
123
|
+
* `critical` findings veto; `major`/`minor` reduce the score but don't veto.
|
|
124
|
+
*/
|
|
125
|
+
declare function gradeSemanticStatus(input: {
|
|
126
|
+
score: number;
|
|
127
|
+
findings: Array<{
|
|
128
|
+
severity: Severity;
|
|
129
|
+
present?: boolean;
|
|
130
|
+
score?: number;
|
|
131
|
+
}>;
|
|
132
|
+
available: boolean;
|
|
133
|
+
threshold?: number;
|
|
134
|
+
}): LayerStatus;
|
|
135
|
+
declare class MultiLayerVerifier<Env = unknown> {
|
|
136
|
+
private readonly layers;
|
|
137
|
+
constructor(layers: Layer<Env>[]);
|
|
138
|
+
run(opts: VerifyOptions<Env>): Promise<VerificationReport>;
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
export { type Finding as F, type Layer as L, MultiLayerVerifier as M, type Severity as S, type VerificationReport as V, type LayerResult as a, type VerifyContext as b, type LayerStatus as c, type VerifyOptions as d, gradeSemanticStatus as g };
|
package/dist/openapi.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"openapi": "3.1.0",
|
|
3
3
|
"info": {
|
|
4
4
|
"title": "@tangle-network/agent-eval — wire protocol",
|
|
5
|
-
"version": "0.23.1",
|
|
5
|
+
"version": "0.24.0",
|
|
6
6
|
"description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
|
|
7
7
|
"contact": {
|
|
8
8
|
"name": "Tangle Network",
|
package/dist/optimization.d.ts
CHANGED
|
@@ -1,8 +1,11 @@
|
|
|
1
|
-
export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './eval-campaign-Ds5QljIh.js';
|
|
2
|
-
export {
|
|
3
|
-
export {
|
|
4
|
-
import './
|
|
5
|
-
import './integrity-Cr5YodSY.js';
|
|
6
|
-
import './store-u47QaJ9G.js';
|
|
7
|
-
import './
|
|
8
|
-
import './
|
|
1
|
+
export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-CUOiGcGv.js';
|
|
2
|
+
export { F as FeedbackArtifactType, a as FeedbackAttempt, b as FeedbackLabel, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, l as FeedbackTrajectory, m as FeedbackTrajectoryFilter, n as FeedbackTrajectoryStore, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-DfFdrraJ.js';
|
|
3
|
+
export { A as ActionableSideInfo, a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, G as GenerationReport, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, o as MutateAdapter, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, R as ReflectionContext, r as ReflectionProposal, S as ScenarioAggregate, s as ScoreAdapter, T as TrialCache, t as TrialResult, u as TrialTrace, V as VariantAggregate, v as buildReflectionPrompt, w as defaultMultiShotObjectives, x as parseReflectionResponse, y as runMultiShotOptimization, z as runPromptEvolution, B as trialTraceFromMultiShotTrial } from './summary-report-BXGs_9V0.js';
|
|
4
|
+
import './errors-BZ9sTdz7.js';
|
|
5
|
+
import './integrity-DK2EBVZC.js';
|
|
6
|
+
import './store-Db2Bv8Cf.js';
|
|
7
|
+
import './run-record-CqzahIbx.js';
|
|
8
|
+
import './emitter-DP_cSSiw.js';
|
|
9
|
+
import './control-runtime-BuJHoLg0.js';
|
|
10
|
+
import './dataset-CiK_3LDr.js';
|
|
11
|
+
import './failure-cluster-C2EGSDiT.js';
|
package/dist/optimization.js
CHANGED
|
@@ -25,17 +25,19 @@ import {
|
|
|
25
25
|
summarizePreferenceMemory,
|
|
26
26
|
trialTraceFromMultiShotTrial,
|
|
27
27
|
withAssignedFeedbackSplit
|
|
28
|
-
} from "./chunk-VQQSPGSM.js";
|
|
29
|
-
import "./chunk-
|
|
28
|
+
} from "./chunk-VRJVTXRV.js";
|
|
29
|
+
import "./chunk-NLMNWKVM.js";
|
|
30
30
|
import {
|
|
31
31
|
runEvalCampaign
|
|
32
|
-
} from "./chunk-EXGR4XEM.js";
|
|
33
|
-
import "./chunk-
|
|
34
|
-
import "./chunk-
|
|
35
|
-
import "./chunk-
|
|
36
|
-
import "./chunk-
|
|
37
|
-
import "./chunk-
|
|
38
|
-
import "./chunk-
|
|
32
|
+
} from "./chunk-SESZDQPX.js";
|
|
33
|
+
import "./chunk-4S4BM3QQ.js";
|
|
34
|
+
import "./chunk-2A5XJB43.js";
|
|
35
|
+
import "./chunk-I4MBDTY5.js";
|
|
36
|
+
import "./chunk-KTGTIOFD.js";
|
|
37
|
+
import "./chunk-PC4UYEBM.js";
|
|
38
|
+
import "./chunk-TVVP3ZZQ.js";
|
|
39
|
+
import "./chunk-4F5DQN55.js";
|
|
40
|
+
import "./chunk-NG236HPC.js";
|
|
39
41
|
import "./chunk-PZ5AY32C.js";
|
|
40
42
|
export {
|
|
41
43
|
CallbackResearcher,
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OutcomeStore — deployment outcomes attached to Run IDs.
|
|
3
|
+
*
|
|
4
|
+
* Outcomes arrive asynchronously from production telemetry after the
|
|
5
|
+
* eval run completed: user ratings, retention flags, conversion events,
|
|
6
|
+
* revenue, support-ticket rate, anything a product team can measure.
|
|
7
|
+
* The store is a peer to TraceStore — separate lifecycle, same runId
|
|
8
|
+
* foreign key.
|
|
9
|
+
*
|
|
10
|
+
* The whole point of this module is to make the meta-eval correlation
|
|
11
|
+
* question computable: `correlate(evalMetric, outcomeMetric) → r, ρ, n, CI`.
|
|
12
|
+
*/
|
|
13
|
+
interface DeploymentOutcome {
|
|
14
|
+
runId: string;
|
|
15
|
+
capturedAt: number;
|
|
16
|
+
/** Numeric outcomes keyed by name — retention_7d, csat, revenue_usd, etc. */
|
|
17
|
+
metrics: Record<string, number>;
|
|
18
|
+
/** Dimensions for stratified analysis — cohort, region, user_segment. */
|
|
19
|
+
labels?: Record<string, string>;
|
|
20
|
+
/** Free-form provenance (source system, pipeline version). */
|
|
21
|
+
source?: string;
|
|
22
|
+
}
|
|
23
|
+
interface OutcomeFilter {
|
|
24
|
+
runIds?: string[];
|
|
25
|
+
since?: number;
|
|
26
|
+
until?: number;
|
|
27
|
+
label?: {
|
|
28
|
+
key: string;
|
|
29
|
+
value: string;
|
|
30
|
+
};
|
|
31
|
+
source?: string;
|
|
32
|
+
}
|
|
33
|
+
interface OutcomeStore {
|
|
34
|
+
append(outcome: DeploymentOutcome): Promise<void>;
|
|
35
|
+
/** All outcomes attached to this run (a single run can have many — multiple
|
|
36
|
+
* capture windows over deployment time). */
|
|
37
|
+
forRun(runId: string): Promise<DeploymentOutcome[]>;
|
|
38
|
+
list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
|
|
39
|
+
}
|
|
40
|
+
declare class InMemoryOutcomeStore implements OutcomeStore {
|
|
41
|
+
private items;
|
|
42
|
+
append(outcome: DeploymentOutcome): Promise<void>;
|
|
43
|
+
forRun(runId: string): Promise<DeploymentOutcome[]>;
|
|
44
|
+
list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
|
|
45
|
+
}
|
|
46
|
+
interface FileSystemOutcomeStoreOptions {
|
|
47
|
+
dir: string;
|
|
48
|
+
maxBytes?: number;
|
|
49
|
+
}
|
|
50
|
+
declare class FileSystemOutcomeStore implements OutcomeStore {
|
|
51
|
+
private dir;
|
|
52
|
+
private maxBytes;
|
|
53
|
+
private memo?;
|
|
54
|
+
private loaded;
|
|
55
|
+
constructor(options: FileSystemOutcomeStoreOptions);
|
|
56
|
+
private ensureDir;
|
|
57
|
+
append(outcome: DeploymentOutcome): Promise<void>;
|
|
58
|
+
private load;
|
|
59
|
+
forRun(runId: string): Promise<DeploymentOutcome[]>;
|
|
60
|
+
list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
export { type DeploymentOutcome as D, FileSystemOutcomeStore as F, InMemoryOutcomeStore as I, type OutcomeStore as O, type OutcomeFilter as a, type FileSystemOutcomeStoreOptions as b };
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
import { g as BudgetSpec, T as TraceStore, h as RunFilter, R as Run, a as ToolSpan } from '../store-Db2Bv8Cf.js';
|
|
2
|
+
export { a as FailureCluster, F as FailureClusterReport, f as failureClusterView } from '../failure-cluster-C2EGSDiT.js';
|
|
3
|
+
import { a as TrajectoryStep } from '../trajectory-CnoBo-JY.js';
|
|
4
|
+
import { B as BaselineOptions, a as BaselineReport } from '../baseline-4R5deP0N.js';
|
|
5
|
+
export { c as computeToolUseMetrics } from '../baseline-4R5deP0N.js';
|
|
6
|
+
import { l as llmSpans } from '../query-DODUYdPg.js';
|
|
7
|
+
|
|
8
|
+
/**
|
|
9
|
+
* BudgetBreachView — aggregates breach events across the corpus.
|
|
10
|
+
*
|
|
11
|
+
* Answers: which dimensions get hit most often? Which scenarios are
|
|
12
|
+
* underbudgeted? Which variants trigger the most breaches?
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
interface BudgetBreachFinding {
|
|
16
|
+
runId: string;
|
|
17
|
+
scenarioId: string;
|
|
18
|
+
variantId?: string;
|
|
19
|
+
dimension: keyof BudgetSpec;
|
|
20
|
+
limit: number;
|
|
21
|
+
consumed: number;
|
|
22
|
+
excessRatio: number;
|
|
23
|
+
timestamp: number;
|
|
24
|
+
}
|
|
25
|
+
interface BudgetBreachReport {
|
|
26
|
+
findings: BudgetBreachFinding[];
|
|
27
|
+
byDimension: Record<string, number>;
|
|
28
|
+
byScenario: Record<string, number>;
|
|
29
|
+
byVariant: Record<string, number>;
|
|
30
|
+
totalRuns: number;
|
|
31
|
+
breachedRunRatio: number;
|
|
32
|
+
}
|
|
33
|
+
declare function budgetBreachView(store: TraceStore, options?: {
|
|
34
|
+
scenarioId?: string;
|
|
35
|
+
variantId?: string;
|
|
36
|
+
}): Promise<BudgetBreachReport>;
|
|
37
|
+
|
|
38
|
+
/**
|
|
39
|
+
* FirstDivergenceView — aligns two trajectories by step index, reports
|
|
40
|
+
* the first step where they differ.
|
|
41
|
+
*
|
|
42
|
+
* "Differ" is configurable — default is (kind, toolName if tool, model
|
|
43
|
+
* if llm). Use this view to attribute "why is variant B better?" to a
|
|
44
|
+
* specific step rather than an aggregate mean delta.
|
|
45
|
+
*/
|
|
46
|
+
|
|
47
|
+
interface DivergenceReport {
|
|
48
|
+
runA: string;
|
|
49
|
+
runB: string;
|
|
50
|
+
firstDivergenceIndex: number | null;
|
|
51
|
+
aStep?: TrajectoryStep;
|
|
52
|
+
bStep?: TrajectoryStep;
|
|
53
|
+
reason?: string;
|
|
54
|
+
/** Common prefix length (steps that matched). */
|
|
55
|
+
commonPrefixLen: number;
|
|
56
|
+
}
|
|
57
|
+
interface DivergenceOptions {
|
|
58
|
+
/** Returns true if two steps are considered equal. Default: kind + tool/model match. */
|
|
59
|
+
stepEquals?: (a: TrajectoryStep, b: TrajectoryStep) => boolean;
|
|
60
|
+
}
|
|
61
|
+
declare function firstDivergenceView(store: TraceStore, runA: string, runB: string, options?: DivergenceOptions): Promise<DivergenceReport>;
|
|
62
|
+
|
|
63
|
+
/**
|
|
64
|
+
* JudgeAgreementView — pairwise agreement between judges across the
|
|
65
|
+
* corpus, grouped by dimension.
|
|
66
|
+
*
|
|
67
|
+
* Output drives two workflows:
|
|
68
|
+
* - Judge robustness audit: "does Claude agree with GPT at κ ≥ 0.6?"
|
|
69
|
+
* - Calibration tracking: κ vs golden human labels over time (by
|
|
70
|
+
* providing a `humanGoldenJudgeId`).
|
|
71
|
+
*/
|
|
72
|
+
|
|
73
|
+
interface JudgePair {
|
|
74
|
+
judgeA: string;
|
|
75
|
+
judgeB: string;
|
|
76
|
+
dimension: string;
|
|
77
|
+
/** Number of (targetSpanId, dimension) tuples both judges scored. */
|
|
78
|
+
commonItems: number;
|
|
79
|
+
pearson: number;
|
|
80
|
+
krippendorff: number;
|
|
81
|
+
}
|
|
82
|
+
interface JudgeAgreementReport {
|
|
83
|
+
pairs: JudgePair[];
|
|
84
|
+
dimensions: string[];
|
|
85
|
+
judgeIds: string[];
|
|
86
|
+
}
|
|
87
|
+
declare function judgeAgreementView(store: TraceStore): Promise<JudgeAgreementReport>;
|
|
88
|
+
|
|
89
|
+
/**
|
|
90
|
+
* RegressionView — compares a candidate slice to a baseline slice on a
|
|
91
|
+
* named metric. Delegates the statistics (Welch's t-test, Cohen's d,
|
|
92
|
+
* IQR stability) to `baseline.ts`.
|
|
93
|
+
*
|
|
94
|
+
* This is the entry point for CI regression gates: "given runs tagged
|
|
95
|
+
* release=A and release=B, did any metric regress?"
|
|
96
|
+
*/
|
|
97
|
+
|
|
98
|
+
interface RegressionSpec {
|
|
99
|
+
metric: string;
|
|
100
|
+
higherIsBetter: boolean;
|
|
101
|
+
/** Extract a scalar from a run. Default extractors handle common metrics. */
|
|
102
|
+
extract?: (run: Run, store: TraceStore) => Promise<number | null>;
|
|
103
|
+
}
|
|
104
|
+
interface RegressionOptions extends BaselineOptions {
|
|
105
|
+
baseline: RunFilter;
|
|
106
|
+
candidate: RunFilter;
|
|
107
|
+
}
|
|
108
|
+
declare function regressionView(store: TraceStore, metrics: RegressionSpec[], options: RegressionOptions): Promise<BaselineReport>;
|
|
109
|
+
|
|
110
|
+
/**
|
|
111
|
+
* StuckLoopView — detects when an agent calls the same tool with the
|
|
112
|
+
* same (or structurally similar) arguments ≥ N times in a short window.
|
|
113
|
+
*
|
|
114
|
+
* Rationale: agents that loop are the number-one production failure
|
|
115
|
+
* mode on long-horizon flows. The view returns (runId, toolName,
|
|
116
|
+
* argHash, occurrences, windowMs) for each detected loop plus a
|
|
117
|
+
* fraction of runs affected.
|
|
118
|
+
*/
|
|
119
|
+
|
|
120
|
+
interface StuckLoopFinding {
|
|
121
|
+
runId: string;
|
|
122
|
+
toolName: string;
|
|
123
|
+
argHash: string;
|
|
124
|
+
occurrences: number;
|
|
125
|
+
spanIds: string[];
|
|
126
|
+
/** Milliseconds between first and last call in the loop. */
|
|
127
|
+
windowMs: number;
|
|
128
|
+
}
|
|
129
|
+
interface StuckLoopReport {
|
|
130
|
+
findings: StuckLoopFinding[];
|
|
131
|
+
affectedRunRatio: number;
|
|
132
|
+
totalRuns: number;
|
|
133
|
+
}
|
|
134
|
+
interface StuckLoopOptions {
|
|
135
|
+
/** Minimum call count to flag a loop (default 3). */
|
|
136
|
+
minOccurrences?: number;
|
|
137
|
+
/** Filter to a specific runId; omit to scan the entire corpus. */
|
|
138
|
+
runId?: string;
|
|
139
|
+
}
|
|
140
|
+
declare function stuckLoopView(store: TraceStore, options?: StuckLoopOptions): Promise<StuckLoopReport>;
|
|
141
|
+
|
|
142
|
+
/**
|
|
143
|
+
* ToolWasteView — fraction of tool calls whose results weren't used
|
|
144
|
+
* downstream. Without a "used" signal we fall back to structural
|
|
145
|
+
* proxies: error calls, duplicate calls, and tool calls followed by
|
|
146
|
+
* zero subsequent LLM spans are all considered waste.
|
|
147
|
+
*
|
|
148
|
+
* Consumers can pass a `usageOracle` that inspects a tool span and
|
|
149
|
+
* returns true iff the tool's result appears in a later LLM message,
|
|
150
|
+
* artifact, or state mutation — that's the canonical definition; the
|
|
151
|
+
* default heuristic is a reasonable fallback.
|
|
152
|
+
*/
|
|
153
|
+
|
|
154
|
+
interface ToolWasteFinding {
|
|
155
|
+
runId: string;
|
|
156
|
+
wastedCalls: number;
|
|
157
|
+
totalCalls: number;
|
|
158
|
+
wasteRate: number;
|
|
159
|
+
}
|
|
160
|
+
interface ToolWasteReport {
|
|
161
|
+
byRun: ToolWasteFinding[];
|
|
162
|
+
overallWasteRate: number;
|
|
163
|
+
}
|
|
164
|
+
interface ToolWasteOptions {
|
|
165
|
+
runId?: string;
|
|
166
|
+
usageOracle?: (tool: ToolSpan, later: {
|
|
167
|
+
llm: Awaited<ReturnType<typeof llmSpans>>;
|
|
168
|
+
}) => boolean;
|
|
169
|
+
}
|
|
170
|
+
declare function toolWasteView(store: TraceStore, options?: ToolWasteOptions): Promise<ToolWasteReport>;
|
|
171
|
+
|
|
172
|
+
export { type BudgetBreachFinding, type BudgetBreachReport, type DivergenceOptions, type DivergenceReport, type JudgeAgreementReport, type JudgePair, type RegressionOptions, type RegressionSpec, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, budgetBreachView, firstDivergenceView, judgeAgreementView, regressionView, stuckLoopView, toolWasteView };
|