@tangle-network/agent-eval 0.40.5 → 0.42.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/dist/campaign/index.d.ts +48 -355
  2. package/dist/campaign/index.js +106 -6
  3. package/dist/campaign/index.js.map +1 -1
  4. package/dist/{chunk-AU2JLNSZ.js → chunk-H4TOS272.js} +1 -65
  5. package/dist/chunk-H4TOS272.js.map +1 -0
  6. package/dist/{chunk-NKLGKF2Q.js → chunk-KQ26DYTQ.js} +2 -18
  7. package/dist/chunk-KQ26DYTQ.js.map +1 -0
  8. package/dist/{chunk-EGIPWXHL.js → chunk-MNL6LXGQ.js} +98 -2
  9. package/dist/chunk-MNL6LXGQ.js.map +1 -0
  10. package/dist/{chunk-5U2DOJU4.js → chunk-N4SBKEPJ.js} +199 -2
  11. package/dist/chunk-N4SBKEPJ.js.map +1 -0
  12. package/dist/{chunk-LCIDRYGP.js → chunk-PD3MH6WU.js} +8 -8
  13. package/dist/{control-CmLJk3IG.d.ts → control-ojEWkMfJ.d.ts} +1 -1
  14. package/dist/control.d.ts +2 -2
  15. package/dist/{feedback-trajectory-Dvy-bt7x.d.ts → feedback-trajectory-BSxqEpu7.d.ts} +1 -1
  16. package/dist/index.d.ts +227 -687
  17. package/dist/index.js +753 -1237
  18. package/dist/index.js.map +1 -1
  19. package/dist/integrity-CTDhR1Sg.d.ts +81 -0
  20. package/dist/llm-client-BXVRUZyX.d.ts +234 -0
  21. package/dist/openapi.json +1 -1
  22. package/dist/pipelines/index.js +67 -3
  23. package/dist/pipelines/index.js.map +1 -1
  24. package/dist/{integrity-DYR5gWlb.d.ts → raw-provider-sink-C46HDghv.d.ts} +1 -80
  25. package/dist/{release-report-Di84bXD7.d.ts → release-report-BtpgWRI0.d.ts} +21 -3
  26. package/dist/reporting.d.ts +2 -3
  27. package/dist/reporting.js +4 -8
  28. package/dist/{researcher-DeZ_EArp.d.ts → researcher-CoJMs2Iz.d.ts} +116 -205
  29. package/dist/rl.d.ts +103 -221
  30. package/dist/rl.js +44 -199
  31. package/dist/rl.js.map +1 -1
  32. package/dist/sequential-DdV5ShjT.d.ts +561 -0
  33. package/dist/traces.d.ts +3 -2
  34. package/dist/traces.js +5 -5
  35. package/dist/types-BLbRTxoc.d.ts +367 -0
  36. package/dist/wire/index.d.ts +1 -1
  37. package/package.json +1 -6
  38. package/dist/chunk-5U2DOJU4.js.map +0 -1
  39. package/dist/chunk-AU2JLNSZ.js.map +0 -1
  40. package/dist/chunk-DMW5VENN.js +0 -1412
  41. package/dist/chunk-DMW5VENN.js.map +0 -1
  42. package/dist/chunk-EGIPWXHL.js.map +0 -1
  43. package/dist/chunk-MAZ26DC7.js +0 -99
  44. package/dist/chunk-MAZ26DC7.js.map +0 -1
  45. package/dist/chunk-NKLGKF2Q.js.map +0 -1
  46. package/dist/multi-layer-verifier-BNi4-8lR.d.ts +0 -141
  47. package/dist/optimization.d.ts +0 -11
  48. package/dist/optimization.js +0 -71
  49. package/dist/optimization.js.map +0 -1
  50. package/dist/sequential-5iSVfzl2.d.ts +0 -139
  51. package/dist/summary-report-DuZXOk7K.d.ts +0 -917
  52. /package/dist/{chunk-LCIDRYGP.js.map → chunk-PD3MH6WU.js.map} +0 -0
@@ -0,0 +1,81 @@
1
+ import { C as CaptureIntegrityError } from './errors-mje_cKOs.js';
2
+ import { R as RawProviderSink } from './raw-provider-sink-C46HDghv.js';
3
+ import { T as TraceStore } from './store-Db2Bv8Cf.js';
4
+
5
+ /**
6
+ * Run-completion integrity check — at end of run, verify the expected event
7
+ * types were actually captured. The point is the launch-review failure mode:
8
+ * a run *appears* successful but the raw provider events were never written,
9
+ * so a downstream reviewer can't reconstruct what happened.
10
+ *
11
+ * Pattern:
12
+ *
13
+ * const report = await assertRunCaptured(store, runId, {
14
+ * llmSpansMin: 1,
15
+ * judgeSpansMin: 1,
16
+ * rawSink: providerSink, // must have ≥ 1 event for this run
17
+ * requireRawCoverageOfLlmSpans: true, // every llm span has matching raw events
18
+ * })
19
+ * if (!report.ok) throwIfRunIncomplete(report) // or mark run failed and continue
20
+ *
21
+ * The function is read-only on the store and returns a structured report;
22
+ * the caller chooses the failure mode (throw, mark run failed, log warning).
23
+ * `throwIfRunIncomplete` is the convenient strict mode.
24
+ */
25
+
26
+ interface RunIntegrityExpectations {
27
+ /** Minimum LLM span count. Default 0 (no requirement). */
28
+ llmSpansMin?: number;
29
+ /** Minimum judge span count. Default 0. */
30
+ judgeSpansMin?: number;
31
+ /** Minimum tool span count. Default 0. */
32
+ toolSpansMin?: number;
33
+ /**
34
+ * Raw provider sink to consult for capture verification. When present,
35
+ * the check requires at least one raw event for the run.
36
+ */
37
+ rawSink?: RawProviderSink;
38
+ /** Minimum raw provider event count. Default 0; ignored when `rawSink` absent. */
39
+ rawProviderEventsMin?: number;
40
+ /**
41
+ * Every LLM span must have at least one matching raw `request` event
42
+ * (matched by spanId). Catches the common bug where the structured span
43
+ * was emitted but the raw HTTP capture was wired to a different sink.
44
+ */
45
+ requireRawCoverageOfLlmSpans?: boolean;
46
+ /** Run outcome must be set (not null/undefined). Default false. */
47
+ requireOutcome?: boolean;
48
+ }
49
+ type RunIntegrityIssueCode = 'no_run' | 'missing_llm_spans' | 'missing_judge_spans' | 'missing_tool_spans' | 'missing_raw_events' | 'no_raw_sink' | 'orphan_llm_span' | 'missing_outcome';
50
+ interface RunIntegrityIssue {
51
+ code: RunIntegrityIssueCode;
52
+ message: string;
53
+ detail?: Record<string, unknown>;
54
+ }
55
+ interface RunIntegrityReport {
56
+ ok: boolean;
57
+ runId: string;
58
+ llmSpanCount: number;
59
+ judgeSpanCount: number;
60
+ toolSpanCount: number;
61
+ rawProviderEventCount: number;
62
+ /**
63
+ * Coverage of LLM spans by raw provider events keyed on spanId.
64
+ * `total` is the number of LLM spans; `covered` is the count with at
65
+ * least one matching `request` raw event.
66
+ */
67
+ rawSpanCoverage: {
68
+ covered: number;
69
+ total: number;
70
+ };
71
+ issues: RunIntegrityIssue[];
72
+ }
73
+ declare class RunIntegrityError extends CaptureIntegrityError {
74
+ readonly report: RunIntegrityReport;
75
+ constructor(report: RunIntegrityReport);
76
+ }
77
+ declare function assertRunCaptured(store: TraceStore, runId: string, expectations?: RunIntegrityExpectations): Promise<RunIntegrityReport>;
78
+ /** Strict mode: throws `RunIntegrityError` when the report isn't ok. */
79
+ declare function throwIfRunIncomplete(report: RunIntegrityReport): void;
80
+
81
+ export { type RunIntegrityExpectations as R, type RunIntegrityReport as a, RunIntegrityError as b, type RunIntegrityIssue as c, type RunIntegrityIssueCode as d, assertRunCaptured as e, throwIfRunIncomplete as t };
@@ -0,0 +1,234 @@
1
+ import { A as AgentEvalError, C as CaptureIntegrityError } from './errors-mje_cKOs.js';
2
+ import { R as RawProviderSink, P as ProviderRedactor } from './raw-provider-sink-C46HDghv.js';
3
+
4
+ /**
5
+ * LLM client with graceful degrade.
6
+ *
7
+ * OpenAI-compatible `/v1/chat/completions` client with:
8
+ * - Exponential-backoff retry on 429 + 5xx gateway errors (502/503/504).
9
+ * - Retry on transient network errors (fetch failed, AbortError, ECONNRESET).
10
+ * - Graceful json_schema → json_object degrade on 400 with schema-reject body.
11
+ * - Fenced-JSON stripping (```json ... ```) for models that wrap structured output.
12
+ * - Configurable base URL + api key / bearer, works with LiteLLM proxies, OpenAI
13
+ * directly, cli-bridge subscriptions, and any router that speaks the spec.
14
+ *
15
+ * Usage:
16
+ * const { value, result } = await callLlmJson<MyType>(
17
+ * { model: 'gpt-4o', messages: [...], jsonSchema: { name: 'x', schema: {...} } },
18
+ * { baseUrl: 'https://router.tangle.tools/v1', apiKey: process.env.KEY },
19
+ * )
20
+ *
21
+ * This is THE llm-calling seam for agent-eval primitives that need structured
22
+ * output (semantic concept judge, reviewer directives, critic scores). Primitives
23
+ * that need free-form text use `callLlm` and parse output themselves.
24
+ */
25
+
26
+ interface LlmMessage {
27
+ role: 'system' | 'user' | 'assistant';
28
+ /**
29
+ * Either a plain text content string OR a multimodal content array
30
+ * (text + image_url parts) for vision-capable models.
31
+ */
32
+ content: string | Array<{
33
+ type: 'text';
34
+ text: string;
35
+ } | {
36
+ type: 'image_url';
37
+ image_url: {
38
+ url: string;
39
+ detail?: 'auto' | 'low' | 'high';
40
+ };
41
+ }>;
42
+ }
43
+ interface LlmCallRequest {
44
+ model: string;
45
+ messages: LlmMessage[];
46
+ /** Optional JSON-mode response format (response_format: json_object). */
47
+ jsonMode?: boolean;
48
+ /** Optional structured output via JSON Schema. Falls back to json_object on 400. */
49
+ jsonSchema?: {
50
+ name: string;
51
+ schema: Record<string, unknown>;
52
+ };
53
+ temperature?: number;
54
+ maxTokens?: number;
55
+ /** Per-call timeout, default 60s. */
56
+ timeoutMs?: number;
57
+ }
58
+ interface LlmUsage {
59
+ promptTokens: number;
60
+ completionTokens: number;
61
+ totalTokens: number;
62
+ /** Proxies populate this when prompt caching is on. */
63
+ cachedPromptTokens?: number;
64
+ }
65
+ interface LlmCallResult {
66
+ /** The text content of the first choice. Empty string if none. */
67
+ content: string;
68
+ usage: LlmUsage;
69
+ /**
70
+ * Cost in USD. Pulled from proxy's `_response_cost` field when present;
71
+ * `null` when neither the proxy nor the caller can derive it.
72
+ */
73
+ costUsd: number | null;
74
+ /** Model name actually used (echoed from response). */
75
+ model: string;
76
+ /** Wall-clock duration of the HTTP call (last attempt, if retried). */
77
+ durationMs: number;
78
+ /** Raw response body. */
79
+ raw: Record<string, unknown>;
80
+ }
81
+ declare class LlmCallError extends AgentEvalError {
82
+ readonly status: number;
83
+ readonly body: string;
84
+ readonly model: string;
85
+ constructor(message: string, status: number, body: string, model: string);
86
+ }
87
+ interface LlmClientOptions {
88
+ /** Base URL (without trailing slash). Must end at the `/v1` prefix. */
89
+ baseUrl?: string;
90
+ /** Bearer token — either `apiKey` or `bearer` populates `Authorization: Bearer ...`. */
91
+ apiKey?: string;
92
+ bearer?: string;
93
+ /** Override for the `Authorization` header (e.g. `X-Auth: ...`). Takes precedence over apiKey/bearer. */
94
+ authHeader?: {
95
+ name: string;
96
+ value: string;
97
+ };
98
+ /** Default timeout in ms. Per-call can override. */
99
+ defaultTimeoutMs?: number;
100
+ /** Max retry attempts on retriable errors. Default 3 (1 initial + 2 retries). */
101
+ maxRetries?: number;
102
+ /** Fetch implementation — defaults to global `fetch`. Override for custom transport (e.g. tests). */
103
+ fetch?: typeof fetch;
104
+ /**
105
+ * Optional raw HTTP capture sink. When provided, every request, response,
106
+ * and error (across all retry attempts) is recorded to the sink, with auth
107
+ * headers and credential-shaped body fields redacted by default. This is
108
+ * the layer-1 forensics primitive: structured `LlmSpan`s record intent,
109
+ * raw events record what actually crossed the wire.
110
+ */
111
+ rawSink?: RawProviderSink;
112
+ /**
113
+ * Logical provider id attached to raw events. When omitted, derived from
114
+ * `baseUrl` via `providerFromBaseUrl`.
115
+ */
116
+ provider?: string;
117
+ /** Trace context attached to raw events; populated by emitter-aware callers. */
118
+ traceContext?: {
119
+ runId?: string;
120
+ spanId?: string;
121
+ };
122
+ /** Override the redaction strategy for this call. Defaults to `defaultProviderRedactor`. */
123
+ redactor?: ProviderRedactor;
124
+ }
125
+ /**
126
+ * True when an error is a transient transport/network fault worth retrying,
127
+ * as opposed to a deterministic failure (4xx schema reject, JSON parse) that
128
+ * a retry cannot fix. Inspects `LlmCallError.status`, then the error's
129
+ * name/message/code, then recurses into `error.cause` — undici nests the
130
+ * real socket fault one or more levels under `.cause`.
131
+ *
132
+ * This is THE retry classifier for the package: `callLlm` and
133
+ * `withJudgeRetry` both route through it, so a connection-class error is
134
+ * treated identically whether it surfaces in the HTTP client or a
135
+ * TCloud-backed judge.
136
+ */
137
+ declare function isTransientLlmError(err: unknown): boolean;
138
+ /** Exponential backoff: 500ms, 1s, 2s, 4s, ... capped at 16s. Attempt is 0-indexed. */
139
+ declare function backoffMs(attempt: number): number;
140
+ /**
141
+ * Strip a ```json / ``` code fence if the model emitted one.
142
+ * Idempotent for naked JSON. Some models (claude-code via router, certain
143
+ * deepseek models) wrap output even under json_object.
144
+ */
145
+ declare function stripFencedJson(raw: string): string;
146
+ /**
147
+ * Low-level call. Returns raw content + usage + cost. Retries on transient
148
+ * failures; does NOT degrade schema here — callers that want graceful
149
+ * degrade use `callLlmJson`.
150
+ */
151
+ declare function callLlm(req: LlmCallRequest, opts?: LlmClientOptions): Promise<LlmCallResult>;
152
+ /**
153
+ * Structured-output call. Returns parsed JSON plus the raw result envelope.
154
+ * Degrades `jsonSchema` → `jsonMode` on a 400 that names the schema param —
155
+ * critical for deepseek-v3/v4, kimi-k2.6, and other models that don't accept
156
+ * the `response_format.json_schema` shape but DO accept `json_object`.
157
+ */
158
+ declare function callLlmJson<T = unknown>(req: LlmCallRequest, opts?: LlmClientOptions): Promise<{
159
+ value: T;
160
+ result: LlmCallResult;
161
+ }>;
162
+ type LlmRouteAssertionReason = 'no_explicit_base_url' | 'base_url_blocked' | 'base_url_not_allowed' | 'no_auth' | 'wrong_provider';
163
+ declare class LlmRouteAssertionError extends CaptureIntegrityError {
164
+ readonly reason: LlmRouteAssertionReason;
165
+ readonly baseUrl: string;
166
+ constructor(message: string, reason: LlmRouteAssertionReason, baseUrl: string);
167
+ }
168
+ interface LlmRouteRequirements {
169
+ /**
170
+ * Throw if `opts.baseUrl` is undefined, i.e. the call would fall back to
171
+ * `DEFAULT_BASE_URL`. Set this for evaluation runs where silently using
172
+ * the public/free-tier router is a defect — the launch reviewer needs to
173
+ * know exactly which provider answered.
174
+ */
175
+ requireExplicitBaseUrl?: boolean;
176
+ /**
177
+ * Allowlist of acceptable base URLs. Strings match by prefix
178
+ * (case-insensitive); RegExps test against the full base URL.
179
+ */
180
+ allowedBaseUrls?: Array<string | RegExp>;
181
+ /** Blocklist that takes precedence over `allowedBaseUrls`. */
182
+ blockedBaseUrls?: Array<string | RegExp>;
183
+ /** Throw if no auth header / api key is configured. */
184
+ requireAuth?: boolean;
185
+ /**
186
+ * Logical provider id the configured `baseUrl` is expected to match (via
187
+ * `providerFromBaseUrl`). Mainly useful when paired with `requireExplicitBaseUrl`.
188
+ */
189
+ expectedProvider?: string;
190
+ }
191
+ /**
192
+ * Fail-loud assertion that the configured LLM client points at the route
193
+ * the caller intends. Designed for the matrix-runner preflight: invoke
194
+ * once before any LLM call to catch misconfiguration before a sweep burns
195
+ * dollars on the wrong provider.
196
+ *
197
+ * Throws `LlmRouteAssertionError`. Pure — no I/O — so it's safe to call
198
+ * from constructors and CI gates.
199
+ */
200
+ declare function assertLlmRoute(opts: LlmClientOptions, req?: LlmRouteRequirements): void;
201
+ /**
202
+ * Probe whether a model is reachable. Returns latency + null error on
203
+ * success; `ok=false` + error message on any failure (HTTP, timeout,
204
+ * network, parse). Designed for sweep preflights — fail loud at the
205
+ * boundary before burning a 30-leaf run on a misconfigured router.
206
+ *
207
+ * Sends a tiny `ping` message with `maxTokens=64`. Reasoning models
208
+ * (glm-5.1, deepseek-v4) can burn the entire budget on internal reasoning
209
+ * for short prompts, so don't tighten this further. We don't validate
210
+ * content; HTTP 200 means reachable.
211
+ */
212
+ declare function probeLlm(model: string, opts?: LlmClientOptions & {
213
+ timeoutMs?: number;
214
+ }): Promise<{
215
+ ok: boolean;
216
+ latencyMs: number;
217
+ error: string | null;
218
+ }>;
219
+ /**
220
+ * Stateful client — construct once with defaults, call many times.
221
+ * Thin wrapper around the free functions; exists for callers that want
222
+ * to inject a single configured instance into multiple primitives.
223
+ */
224
+ declare class LlmClient {
225
+ private readonly opts;
226
+ constructor(opts?: LlmClientOptions);
227
+ call(req: LlmCallRequest, per?: LlmClientOptions): Promise<LlmCallResult>;
228
+ callJson<T = unknown>(req: LlmCallRequest, per?: LlmClientOptions): Promise<{
229
+ value: T;
230
+ result: LlmCallResult;
231
+ }>;
232
+ }
233
+
234
+ export { type LlmClientOptions as L, type LlmRouteRequirements as a, type LlmCallRequest as b, type LlmCallResult as c, LlmCallError as d, LlmClient as e, type LlmMessage as f, LlmRouteAssertionError as g, type LlmUsage as h, assertLlmRoute as i, backoffMs as j, callLlm as k, callLlmJson as l, isTransientLlmError as m, probeLlm as p, stripFencedJson as s };
package/dist/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "@tangle-network/agent-eval — wire protocol",
5
- "version": "0.40.5",
5
+ "version": "0.42.0",
6
6
  "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
7
7
  "contact": {
8
8
  "name": "Tangle Network",
@@ -1,8 +1,9 @@
1
1
  import {
2
+ DEFAULT_RULES,
3
+ classifyFailure,
2
4
  compareToBaseline,
3
- computeToolUseMetrics,
4
- failureClusterView
5
- } from "../chunk-AU2JLNSZ.js";
5
+ computeToolUseMetrics
6
+ } from "../chunk-H4TOS272.js";
6
7
  import {
7
8
  buildTrajectory
8
9
  } from "../chunk-RZTMDUO7.js";
@@ -61,6 +62,69 @@ async function budgetBreachView(store, options = {}) {
61
62
  };
62
63
  }
63
64
 
65
+ // src/pipelines/failure-cluster.ts
66
+ async function failureClusterView(store, options = {}) {
67
+ const rules = options.rules ?? DEFAULT_RULES;
68
+ const minSize = options.minClusterSize ?? 1;
69
+ const runs = await store.listRuns();
70
+ const clusters = /* @__PURE__ */ new Map();
71
+ let totalFailures = 0;
72
+ for (const run of runs) {
73
+ if (run.status === "completed" && run.outcome?.pass !== false) continue;
74
+ totalFailures++;
75
+ const spans = await store.spans({ runId: run.runId });
76
+ const events = await store.events({ runId: run.runId });
77
+ const cls = classifyFailure({ run, spans, events }, rules);
78
+ let toolName;
79
+ let argPrefix;
80
+ let dimension;
81
+ if (cls.triggerSpanId) {
82
+ const trig = spans.find((s) => s.spanId === cls.triggerSpanId);
83
+ if (trig?.kind === "tool") {
84
+ toolName = trig.toolName;
85
+ argPrefix = argHash(trig.args).slice(0, 16);
86
+ } else if (trig?.kind === "judge") {
87
+ dimension = trig.dimension;
88
+ }
89
+ }
90
+ if (!toolName) {
91
+ const ts = await toolSpans(store, run.runId);
92
+ const errored = ts.filter((t) => t.status === "error").pop();
93
+ if (errored) {
94
+ toolName = errored.toolName;
95
+ argPrefix = argHash(errored.args).slice(0, 16);
96
+ }
97
+ }
98
+ if (!dimension) {
99
+ const judge = spans.find((s) => s.kind === "judge" && typeof s.dimension === "string");
100
+ if (judge?.kind === "judge") dimension = judge.dimension;
101
+ }
102
+ const key = `${cls.failureClass}|${toolName ?? ""}|${argPrefix ?? ""}|${dimension ?? ""}`;
103
+ let cluster = clusters.get(key);
104
+ if (!cluster) {
105
+ cluster = {
106
+ failureClass: cls.failureClass,
107
+ toolName,
108
+ argPrefix,
109
+ dimension,
110
+ runCount: 0,
111
+ scenarioIds: [],
112
+ exampleRunId: run.runId,
113
+ exampleError: firstErrorMessage(spans) ?? cls.reason
114
+ };
115
+ clusters.set(key, cluster);
116
+ }
117
+ cluster.runCount++;
118
+ if (!cluster.scenarioIds.includes(run.scenarioId)) cluster.scenarioIds.push(run.scenarioId);
119
+ }
120
+ const arr = [...clusters.values()].filter((c) => c.runCount >= minSize).sort((a, b) => b.runCount - a.runCount);
121
+ return { clusters: arr, totalFailures, totalRuns: runs.length };
122
+ }
123
+ function firstErrorMessage(spans) {
124
+ const errored = spans.find((s) => s.status === "error");
125
+ return errored?.error;
126
+ }
127
+
64
128
  // src/pipelines/first-divergence.ts
65
129
  async function firstDivergenceView(store, runA, runB, options = {}) {
66
130
  const [a, b] = await Promise.all([buildTrajectory(store, runA), buildTrajectory(store, runB)]);
@@ -1 +1 @@
1
- {"version":3,"sources":["../../src/pipelines/budget-breach.ts","../../src/pipelines/first-divergence.ts","../../src/pipelines/judge-agreement.ts","../../src/pipelines/regression.ts","../../src/pipelines/stuck-loop.ts","../../src/pipelines/tool-waste.ts"],"sourcesContent":["/**\n * BudgetBreachView — aggregates breach events across the corpus.\n *\n * Answers: which dimensions get hit most often? Which scenarios are\n * underbudgeted? Which variants trigger the most breaches?\n */\n\nimport type { BudgetSpec } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\n\nexport interface BudgetBreachFinding {\n runId: string\n scenarioId: string\n variantId?: string\n dimension: keyof BudgetSpec\n limit: number\n consumed: number\n excessRatio: number\n timestamp: number\n}\n\nexport interface BudgetBreachReport {\n findings: BudgetBreachFinding[]\n byDimension: Record<string, number>\n byScenario: Record<string, number>\n byVariant: Record<string, number>\n totalRuns: number\n breachedRunRatio: number\n}\n\nexport async function budgetBreachView(\n store: TraceStore,\n options: { scenarioId?: string; variantId?: string } = {},\n): Promise<BudgetBreachReport> {\n const runs = await store.listRuns({\n scenarioId: options.scenarioId,\n variantId: options.variantId,\n })\n const findings: BudgetBreachFinding[] = []\n const byDimension: Record<string, number> = {}\n const byScenario: Record<string, number> = {}\n const byVariant: Record<string, number> = {}\n\n for (const run of runs) {\n const entries = await store.budget(run.runId)\n for (const e of entries) {\n if (!e.breached) continue\n const excessRatio = e.limit > 0 ? e.consumed / e.limit : Infinity\n findings.push({\n runId: run.runId,\n scenarioId: run.scenarioId,\n variantId: run.variantId,\n dimension: e.dimension,\n limit: e.limit,\n consumed: e.consumed,\n excessRatio,\n timestamp: e.timestamp,\n })\n byDimension[e.dimension] = (byDimension[e.dimension] ?? 0) + 1\n byScenario[run.scenarioId] = (byScenario[run.scenarioId] ?? 0) + 1\n if (run.variantId) byVariant[run.variantId] = (byVariant[run.variantId] ?? 0) + 1\n }\n }\n\n const breachedRuns = new Set(findings.map((f) => f.runId))\n return {\n findings,\n byDimension,\n byScenario,\n byVariant,\n totalRuns: runs.length,\n breachedRunRatio: runs.length > 0 ? breachedRuns.size / runs.length : 0,\n }\n}\n","/**\n * FirstDivergenceView — aligns two trajectories by step index, reports\n * the first step where they differ.\n *\n * \"Differ\" is configurable — default is (kind, toolName if tool, model\n * if llm). Use this view to attribute \"why is variant B better?\" to a\n * specific step rather than an aggregate mean delta.\n */\n\nimport type { TraceStore } from '../trace/store'\nimport { buildTrajectory, type Trajectory, type TrajectoryStep } from '../trajectory'\n\nexport interface DivergenceReport {\n runA: string\n runB: string\n firstDivergenceIndex: number | null\n aStep?: TrajectoryStep\n bStep?: TrajectoryStep\n reason?: string\n /** Common prefix length (steps that matched). */\n commonPrefixLen: number\n}\n\nexport interface DivergenceOptions {\n /** Returns true if two steps are considered equal. Default: kind + tool/model match. */\n stepEquals?: (a: TrajectoryStep, b: TrajectoryStep) => boolean\n}\n\nexport async function firstDivergenceView(\n store: TraceStore,\n runA: string,\n runB: string,\n options: DivergenceOptions = {},\n): Promise<DivergenceReport> {\n const [a, b] = await Promise.all([buildTrajectory(store, runA), buildTrajectory(store, runB)])\n const eq = options.stepEquals ?? defaultStepEquals\n const minLen = Math.min(a.steps.length, b.steps.length)\n for (let i = 0; i < minLen; i++) {\n const aStep = a.steps[i]!\n const bStep = b.steps[i]!\n if (!eq(aStep, bStep)) {\n return {\n runA,\n runB,\n firstDivergenceIndex: i,\n aStep,\n bStep,\n reason: describeDifference(aStep, bStep),\n commonPrefixLen: i,\n }\n }\n }\n if (a.steps.length === b.steps.length) {\n return { runA, runB, firstDivergenceIndex: null, commonPrefixLen: minLen }\n }\n const longer: Trajectory = a.steps.length > b.steps.length ? a : b\n return {\n runA,\n runB,\n firstDivergenceIndex: minLen,\n aStep: a.steps[minLen],\n bStep: b.steps[minLen],\n reason: `one trajectory has ${longer.steps.length - minLen} more step(s) after index ${minLen - 1}`,\n commonPrefixLen: minLen,\n }\n}\n\nfunction defaultStepEquals(a: TrajectoryStep, b: TrajectoryStep): boolean {\n if (a.span.kind !== b.span.kind) return false\n if (a.span.kind === 'tool' && b.span.kind === 'tool') return a.span.toolName === b.span.toolName\n if (a.span.kind === 'llm' && b.span.kind === 'llm') return a.span.model === b.span.model\n if (a.span.kind === 'judge' && b.span.kind === 'judge')\n return a.span.dimension === b.span.dimension\n return a.span.name === b.span.name\n}\n\nfunction describeDifference(a: TrajectoryStep, b: TrajectoryStep): string {\n if (a.span.kind !== b.span.kind) return `kind ${a.span.kind} vs ${b.span.kind}`\n if (a.span.kind === 'tool' && b.span.kind === 'tool' && a.span.toolName !== b.span.toolName) {\n return `tool ${a.span.toolName} vs ${b.span.toolName}`\n }\n if (a.span.kind === 'llm' && b.span.kind === 'llm' && a.span.model !== b.span.model) {\n return `model ${a.span.model} vs ${b.span.model}`\n }\n return `name \"${a.span.name}\" vs \"${b.span.name}\"`\n}\n","/**\n * JudgeAgreementView — pairwise agreement between judges across the\n * corpus, grouped by dimension.\n *\n * Output drives two workflows:\n * - Judge robustness audit: \"does Claude agree with GPT at κ ≥ 0.6?\"\n * - Calibration tracking: κ vs golden human labels over time (by\n * providing a `humanGoldenJudgeId`).\n */\n\nimport { interRaterReliability } from '../statistics'\nimport type { JudgeSpan } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\n\nexport interface JudgePair {\n judgeA: string\n judgeB: string\n dimension: string\n /** Number of (targetSpanId, dimension) tuples both judges scored. */\n commonItems: number\n pearson: number\n krippendorff: number\n}\n\nexport interface JudgeAgreementReport {\n pairs: JudgePair[]\n dimensions: string[]\n judgeIds: string[]\n}\n\nexport async function judgeAgreementView(store: TraceStore): Promise<JudgeAgreementReport> {\n const all = (await store.spans({ kind: 'judge' })).filter(\n (s): s is JudgeSpan => s.kind === 'judge',\n )\n if (all.length === 0) return { pairs: [], dimensions: [], judgeIds: [] }\n\n const byDimension = new Map<string, JudgeSpan[]>()\n for (const s of all) {\n const arr = byDimension.get(s.dimension) ?? []\n arr.push(s)\n byDimension.set(s.dimension, arr)\n }\n\n const judgeIds = [...new Set(all.map((s) => s.judgeId))].sort()\n const pairs: JudgePair[] = []\n for (const [dim, spans] of byDimension) {\n const byJudge = new Map<string, Map<string, number>>()\n for (const s of spans) {\n const m = byJudge.get(s.judgeId) ?? new Map<string, number>()\n m.set(s.targetSpanId, s.score)\n byJudge.set(s.judgeId, m)\n }\n const judgesHere = [...byJudge.keys()]\n for (let i = 0; i < judgesHere.length; i++) {\n for (let j = i + 1; j < judgesHere.length; j++) {\n const judgeI = judgesHere[i]!\n const judgeJ = judgesHere[j]!\n const a = byJudge.get(judgeI)!\n const b = byJudge.get(judgeJ)!\n const common: Array<[number, number]> = []\n for (const [target, scoreA] of a) {\n const scoreB = b.get(target)\n if (scoreB !== undefined) common.push([scoreA, scoreB])\n }\n if (common.length < 2) continue\n const judgeScores = common.map(\n ([scoreA, scoreB]) =>\n [\n { judgeName: judgeI, dimension: dim, score: scoreA, reasoning: '' },\n { judgeName: judgeJ, dimension: dim, score: scoreB, reasoning: '' },\n ] as const,\n )\n const k = interRaterReliability(\n judgeScores[0]!.map((_, k2) => judgeScores.map((pair) => pair[k2]!)),\n )\n pairs.push({\n judgeA: judgeI,\n judgeB: judgeJ,\n dimension: dim,\n commonItems: common.length,\n pearson: pearson(\n common.map((c) => c[0]),\n common.map((c) => c[1]),\n ),\n krippendorff: k,\n })\n }\n }\n }\n\n return {\n pairs: pairs.sort((a, b) => b.commonItems - a.commonItems),\n dimensions: [...byDimension.keys()].sort(),\n judgeIds,\n }\n}\n\nfunction pearson(a: number[], b: number[]): number {\n if (a.length !== b.length || a.length < 2) return NaN\n const mA = a.reduce((s, v) => s + v, 0) / a.length\n const mB = b.reduce((s, v) => s + v, 0) / b.length\n let num = 0,\n denA = 0,\n denB = 0\n for (let i = 0; i < a.length; i++) {\n const dA = a[i]! - mA\n const dB = b[i]! - mB\n num += dA * dB\n denA += dA * dA\n denB += dB * dB\n }\n if (denA === 0 || denB === 0) return denA === 0 && denB === 0 ? 1 : 0\n return num / Math.sqrt(denA * denB)\n}\n","/**\n * RegressionView — compares a candidate slice to a baseline slice on a\n * named metric. Delegates the statistics (Welch's t-test, Cohen's d,\n * IQR stability) to `baseline.ts`.\n *\n * This is the entry point for CI regression gates: \"given runs tagged\n * release=A and release=B, did any metric regress?\"\n */\n\nimport { type BaselineOptions, type BaselineReport, compareToBaseline } from '../baseline'\nimport { aggregateLlm, llmSpans, runFailureClass } from '../trace/query'\nimport type { Run } from '../trace/schema'\nimport type { RunFilter, TraceStore } from '../trace/store'\n\nexport interface RegressionSpec {\n metric: string\n higherIsBetter: boolean\n /** Extract a scalar from a run. Default extractors handle common metrics. */\n extract?: (run: Run, store: TraceStore) => Promise<number | null>\n}\n\nexport interface RegressionOptions extends BaselineOptions {\n baseline: RunFilter\n candidate: RunFilter\n}\n\nexport async function regressionView(\n store: TraceStore,\n metrics: RegressionSpec[],\n options: RegressionOptions,\n): Promise<BaselineReport> {\n const baselineRuns = await store.listRuns(options.baseline)\n const candidateRuns = await store.listRuns(options.candidate)\n const samples = await Promise.all(\n metrics.map(async (m) => {\n const extract = m.extract ?? defaultExtract(m.metric)\n const baseline = await extractAll(baselineRuns, extract, store)\n const candidate = await extractAll(candidateRuns, extract, store)\n return { metric: m.metric, higherIsBetter: m.higherIsBetter, baseline, candidate }\n }),\n )\n return compareToBaseline(samples, options)\n}\n\nasync function extractAll(\n runs: Run[],\n extract: (r: Run, s: TraceStore) => Promise<number | null>,\n store: TraceStore,\n): Promise<number[]> {\n const out: number[] = []\n for (const r of runs) {\n const v = await extract(r, store)\n if (v !== null && Number.isFinite(v)) out.push(v)\n }\n return out\n}\n\nfunction defaultExtract(metric: string): (run: Run, store: TraceStore) => Promise<number | null> {\n return async (run, store) => {\n switch (metric) {\n case 'score':\n case 'overallScore':\n return run.outcome?.score ?? null\n case 'pass':\n return run.outcome?.pass === true ? 1 : 0\n case 'durationMs':\n return run.endedAt && run.startedAt ? run.endedAt - run.startedAt : null\n case 'costUsd': {\n const llm = await llmSpans(store, run.runId)\n return aggregateLlm(llm).costUsd\n }\n case 'inputTokens': {\n const llm = await llmSpans(store, run.runId)\n return aggregateLlm(llm).inputTokens\n }\n case 'outputTokens': {\n const llm = await llmSpans(store, run.runId)\n return aggregateLlm(llm).outputTokens\n }\n case 'failureClass': {\n return runFailureClass(run) === 'success' ? 1 : 0\n }\n default:\n return null\n }\n }\n}\n","/**\n * StuckLoopView — detects when an agent calls the same tool with the\n * same (or structurally similar) arguments ≥ N times in a short window.\n *\n * Rationale: agents that loop are the number-one production failure\n * mode on long-horizon flows. The view returns (runId, toolName,\n * argHash, occurrences, windowMs) for each detected loop plus a\n * fraction of runs affected.\n */\n\nimport { argHash, toolSpans } from '../trace/query'\nimport type { TraceStore } from '../trace/store'\n\nexport interface StuckLoopFinding {\n runId: string\n toolName: string\n argHash: string\n occurrences: number\n spanIds: string[]\n /** Milliseconds between first and last call in the loop. */\n windowMs: number\n}\n\nexport interface StuckLoopReport {\n findings: StuckLoopFinding[]\n affectedRunRatio: number\n totalRuns: number\n}\n\nexport interface StuckLoopOptions {\n /** Minimum call count to flag a loop (default 3). */\n minOccurrences?: number\n /** Filter to a specific runId; omit to scan the entire corpus. */\n runId?: string\n}\n\nexport async function stuckLoopView(\n store: TraceStore,\n options: StuckLoopOptions = {},\n): Promise<StuckLoopReport> {\n const minOccurrences = options.minOccurrences ?? 3\n const runs = options.runId\n ? [{ runId: options.runId }]\n : (await store.listRuns()).map((r) => ({ runId: r.runId }))\n\n const findings: StuckLoopFinding[] = []\n for (const { runId } of runs) {\n const tools = await toolSpans(store, runId)\n const byKey = new Map<string, { spans: typeof tools; argHash: string }>()\n for (const t of tools) {\n const h = argHash(t.args)\n const key = `${t.toolName}|${h}`\n const bucket = byKey.get(key) ?? { spans: [], argHash: h }\n bucket.spans.push(t)\n byKey.set(key, bucket)\n }\n for (const [key, { spans, argHash: h }] of byKey) {\n if (spans.length < minOccurrences) continue\n const sorted = [...spans].sort((a, b) => a.startedAt - b.startedAt)\n const first = sorted[0]!.startedAt\n const last = sorted[sorted.length - 1]!.startedAt\n findings.push({\n runId,\n toolName: key.split('|')[0]!,\n argHash: h,\n occurrences: sorted.length,\n spanIds: sorted.map((s) => s.spanId),\n windowMs: last - first,\n })\n }\n }\n\n const affectedRuns = new Set(findings.map((f) => f.runId))\n return {\n findings,\n affectedRunRatio: runs.length > 0 ? affectedRuns.size / runs.length : 0,\n totalRuns: runs.length,\n }\n}\n","/**\n * ToolWasteView — fraction of tool calls whose results weren't used\n * downstream. Without a \"used\" signal we fall back to structural\n * proxies: error calls, duplicate calls, and tool calls followed by\n * zero subsequent LLM spans are all considered waste.\n *\n * Consumers can pass a `usageOracle` that inspects a tool span and\n * returns true iff the tool's result appears in a later LLM message,\n * artifact, or state mutation — that's the canonical definition; the\n * default heuristic is a reasonable fallback.\n */\n\nimport { computeToolUseMetrics } from '../tool-use-metrics'\nimport { llmSpans, toolSpans } from '../trace/query'\nimport type { ToolSpan } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\n\nexport interface ToolWasteFinding {\n runId: string\n wastedCalls: number\n totalCalls: number\n wasteRate: number\n}\n\nexport interface ToolWasteReport {\n byRun: ToolWasteFinding[]\n overallWasteRate: number\n}\n\nexport interface ToolWasteOptions {\n runId?: string\n usageOracle?: (tool: ToolSpan, later: { llm: Awaited<ReturnType<typeof llmSpans>> }) => boolean\n}\n\nexport async function toolWasteView(\n store: TraceStore,\n options: ToolWasteOptions = {},\n): Promise<ToolWasteReport> {\n const runs = options.runId ? [options.runId] : (await store.listRuns()).map((r) => r.runId)\n\n const byRun: ToolWasteFinding[] = []\n let totalCalls = 0\n let totalWasted = 0\n for (const runId of runs) {\n const tools = await toolSpans(store, runId)\n if (tools.length === 0) {\n byRun.push({ runId, wastedCalls: 0, totalCalls: 0, wasteRate: 0 })\n continue\n }\n const llms = await llmSpans(store, runId)\n let wasted = 0\n for (const t of tools) {\n if (t.status === 'error') {\n wasted++\n continue\n }\n const laterLlm = llms.filter((l) => l.startedAt > t.startedAt)\n if (options.usageOracle) {\n if (!options.usageOracle(t, { llm: laterLlm })) wasted++\n } else {\n // Default heuristic: a tool whose result is NOT mentioned in any\n // later LLM input message is likely wasted.\n const resultStr = stringify(t.result)\n const used = laterLlm.some((l) =>\n l.messages.some(\n (m) =>\n typeof m.content === 'string' &&\n resultStr &&\n m.content.includes(resultStr.slice(0, 120)),\n ),\n )\n if (!used) wasted++\n }\n }\n const wasteRate = wasted / tools.length\n byRun.push({ runId, wastedCalls: wasted, totalCalls: tools.length, wasteRate })\n totalCalls += tools.length\n totalWasted += wasted\n }\n return { byRun, overallWasteRate: totalCalls > 0 ? totalWasted / totalCalls : 0 }\n}\n\nfunction stringify(v: unknown): string {\n if (v === null || v === undefined) return ''\n if (typeof v === 'string') return v\n try {\n return JSON.stringify(v)\n } catch {\n return String(v)\n }\n}\n\n// Re-export for convenience in consumers that want both descriptive and usage metrics.\nexport { computeToolUseMetrics }\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;AA8BA,eAAsB,iBACpB,OACA,UAAuD,CAAC,GAC3B;AAC7B,QAAM,OAAO,MAAM,MAAM,SAAS;AAAA,IAChC,YAAY,QAAQ;AAAA,IACpB,WAAW,QAAQ;AAAA,EACrB,CAAC;AACD,QAAM,WAAkC,CAAC;AACzC,QAAM,cAAsC,CAAC;AAC7C,QAAM,aAAqC,CAAC;AAC5C,QAAM,YAAoC,CAAC;AAE3C,aAAW,OAAO,MAAM;AACtB,UAAM,UAAU,MAAM,MAAM,OAAO,IAAI,KAAK;AAC5C,eAAW,KAAK,SAAS;AACvB,UAAI,CAAC,EAAE,SAAU;AACjB,YAAM,cAAc,EAAE,QAAQ,IAAI,EAAE,WAAW,EAAE,QAAQ;AACzD,eAAS,KAAK;AAAA,QACZ,OAAO,IAAI;AAAA,QACX,YAAY,IAAI;AAAA,QAChB,WAAW,IAAI;AAAA,QACf,WAAW,EAAE;AAAA,QACb,OAAO,EAAE;AAAA,QACT,UAAU,EAAE;AAAA,QACZ;AAAA,QACA,WAAW,EAAE;AAAA,MACf,CAAC;AACD,kBAAY,EAAE,SAAS,KAAK,YAAY,EAAE,SAAS,KAAK,KAAK;AAC7D,iBAAW,IAAI,UAAU,KAAK,WAAW,IAAI,UAAU,KAAK,KAAK;AACjE,UAAI,IAAI,UAAW,WAAU,IAAI,SAAS,KAAK,UAAU,IAAI,SAAS,KAAK,KAAK;AAAA,IAClF;AAAA,EACF;AAEA,QAAM,eAAe,IAAI,IAAI,SAAS,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC;AACzD,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,WAAW,KAAK;AAAA,IAChB,kBAAkB,KAAK,SAAS,IAAI,aAAa,OAAO,KAAK,SAAS;AAAA,EACxE;AACF;;;AC7CA,eAAsB,oBACpB,OACA,MACA,MACA,UAA6B,CAAC,GACH;AAC3B,QAAM,CAAC,GAAG,CAAC,IAAI,MAAM,QAAQ,IAAI,CAAC,gBAAgB,OAAO,IAAI,GAAG,gBAAgB,OAAO,IAAI,CAAC,CAAC;AAC7F,QAAM,KAAK,QAAQ,cAAc;AACjC,QAAM,SAAS,KAAK,IAAI,EAAE,MAAM,QAAQ,EAAE,MAAM,MAAM;AACtD,WAAS,IAAI,GAAG,IAAI,QAAQ,KAAK;AAC/B,UAAM,QAAQ,EAAE,MAAM,CAAC;AACvB,UAAM,QAAQ,EAAE,MAAM,CAAC;AACvB,QAAI,CAAC,GAAG,OAAO,KAAK,GAAG;AACrB,aAAO;AAAA,QACL;AAAA,QACA;AAAA,QACA,sBAAsB;AAAA,QACtB;AAAA,QACA;AAAA,QACA,QAAQ,mBAAmB,OAAO,KAAK;AAAA,QACvC,iBAAiB;AAAA,MACnB;AAAA,IACF;AAAA,EACF;AACA,MAAI,EAAE,MAAM,WAAW,EAAE,MAAM,QAAQ;AACrC,WAAO,EAAE,MAAM,MAAM,sBAAsB,MAAM,iBAAiB,OAAO;AAAA,EAC3E;AACA,QAAM,SAAqB,EAAE,MAAM,SAAS,EAAE,MAAM,SAAS,IAAI;AACjE,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,sBAAsB;AAAA,IACtB,OAAO,EAAE,MAAM,MAAM;AAAA,IACrB,OAAO,EAAE,MAAM,MAAM;AAAA,IACrB,QAAQ,sBAAsB,OAAO,MAAM,SAAS,MAAM,6BAA6B,SAAS,CAAC;AAAA,IACjG,iBAAiB;AAAA,EACnB;AACF;AAEA,SAAS,kBAAkB,GAAmB,GAA4B;AACxE,MAAI,EAAE,KAAK,SAAS,EAAE,KAAK,KAAM,QAAO;AACxC,MAAI,EAAE,KAAK,SAAS,UAAU,EAAE,KAAK,SAAS,OAAQ,QAAO,EAAE,KAAK,aAAa,EAAE,KAAK;AACxF,MAAI,EAAE,KAAK,SAAS,SAAS,EAAE,KAAK,SAAS,MAAO,QAAO,EAAE,KAAK,UAAU,EAAE,KAAK;AACnF,MAAI,EAAE,KAAK,SAAS,WAAW,EAAE,KAAK,SAAS;AAC7C,WAAO,EAAE,KAAK,cAAc,EAAE,KAAK;AACrC,SAAO,EAAE,KAAK,SAAS,EAAE,KAAK;AAChC;AAEA,SAAS,mBAAmB,GAAmB,GAA2B;AACxE,MAAI,EAAE,KAAK,SAAS,EAAE,KAAK,KAAM,QAAO,QAAQ,EAAE,KAAK,IAAI,OAAO,EAAE,KAAK,IAAI;AAC7E,MAAI,EAAE,KAAK,SAAS,UAAU,EAAE,KAAK,SAAS,UAAU,EAAE,KAAK,aAAa,EAAE,KAAK,UAAU;AAC3F,WAAO,QAAQ,EAAE,KAAK,QAAQ,OAAO,EAAE,KAAK,QAAQ;AAAA,EACtD;AACA,MAAI,EAAE,KAAK,SAAS,SAAS,EAAE,KAAK,SAAS,SAAS,EAAE,KAAK,UAAU,EAAE,KAAK,OAAO;AACnF,WAAO,SAAS,EAAE,KAAK,KAAK,OAAO,EAAE,KAAK,KAAK;AAAA,EACjD;AACA,SAAO,SAAS,EAAE,KAAK,IAAI,SAAS,EAAE,KAAK,IAAI;AACjD;;;ACvDA,eAAsB,mBAAmB,OAAkD;AACzF,QAAM,OAAO,MAAM,MAAM,MAAM,EAAE,MAAM,QAAQ,CAAC,GAAG;AAAA,IACjD,CAAC,MAAsB,EAAE,SAAS;AAAA,EACpC;AACA,MAAI,IAAI,WAAW,EAAG,QAAO,EAAE,OAAO,CAAC,GAAG,YAAY,CAAC,GAAG,UAAU,CAAC,EAAE;AAEvE,QAAM,cAAc,oBAAI,IAAyB;AACjD,aAAW,KAAK,KAAK;AACnB,UAAM,MAAM,YAAY,IAAI,EAAE,SAAS,KAAK,CAAC;AAC7C,QAAI,KAAK,CAAC;AACV,gBAAY,IAAI,EAAE,WAAW,GAAG;AAAA,EAClC;AAEA,QAAM,WAAW,CAAC,GAAG,IAAI,IAAI,IAAI,IAAI,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,EAAE,KAAK;AAC9D,QAAM,QAAqB,CAAC;AAC5B,aAAW,CAAC,KAAK,KAAK,KAAK,aAAa;AACtC,UAAM,UAAU,oBAAI,IAAiC;AACrD,eAAW,KAAK,OAAO;AACrB,YAAM,IAAI,QAAQ,IAAI,EAAE,OAAO,KAAK,oBAAI,IAAoB;AAC5D,QAAE,IAAI,EAAE,cAAc,EAAE,KAAK;AAC7B,cAAQ,IAAI,EAAE,SAAS,CAAC;AAAA,IAC1B;AACA,UAAM,aAAa,CAAC,GAAG,QAAQ,KAAK,CAAC;AACrC,aAAS,IAAI,GAAG,IAAI,WAAW,QAAQ,KAAK;AAC1C,eAAS,IAAI,IAAI,GAAG,IAAI,WAAW,QAAQ,KAAK;AAC9C,cAAM,SAAS,WAAW,CAAC;AAC3B,cAAM,SAAS,WAAW,CAAC;AAC3B,cAAM,IAAI,QAAQ,IAAI,MAAM;AAC5B,cAAM,IAAI,QAAQ,IAAI,MAAM;AAC5B,cAAM,SAAkC,CAAC;AACzC,mBAAW,CAAC,QAAQ,MAAM,KAAK,GAAG;AAChC,gBAAM,SAAS,EAAE,IAAI,MAAM;AAC3B,cAAI,WAAW,OAAW,QAAO,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,QACxD;AACA,YAAI,OAAO,SAAS,EAAG;AACvB,cAAM,cAAc,OAAO;AAAA,UACzB,CAAC,CAAC,QAAQ,MAAM,MACd;AAAA,YACE,EAAE,WAAW,QAAQ,WAAW,KAAK,OAAO,QAAQ,WAAW,GAAG;AAAA,YAClE,EAAE,WAAW,QAAQ,WAAW,KAAK,OAAO,QAAQ,WAAW,GAAG;AAAA,UACpE;AAAA,QACJ;AACA,cAAM,IAAI;AAAA,UACR,YAAY,CAAC,EAAG,IAAI,CAAC,GAAG,OAAO,YAAY,IAAI,CAAC,SAAS,KAAK,EAAE,CAAE,CAAC;AAAA,QACrE;AACA,cAAM,KAAK;AAAA,UACT,QAAQ;AAAA,UACR,QAAQ;AAAA,UACR,WAAW;AAAA,UACX,aAAa,OAAO;AAAA,UACpB,SAAS;AAAA,YACP,OAAO,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC;AAAA,YACtB,OAAO,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC;AAAA,UACxB;AAAA,UACA,cAAc;AAAA,QAChB,CAAC;AAAA,MACH;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AAAA,IACL,OAAO,MAAM,KAAK,CAAC,GAAG,MAAM,EAAE,cAAc,EAAE,WAAW;AAAA,IACzD,YAAY,CAAC,GAAG,YAAY,KAAK,CAAC,EAAE,KAAK;AAAA,IACzC;AAAA,EACF;AACF;AAEA,SAAS,QAAQ,GAAa,GAAqB;AACjD,MAAI,EAAE,WAAW,EAAE,UAAU,EAAE,SAAS,EAAG,QAAO;AAClD,QAAM,KAAK,EAAE,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,EAAE;AAC5C,QAAM,KAAK,EAAE,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,EAAE;AAC5C,MAAI,MAAM,GACR,OAAO,GACP,OAAO;AACT,WAAS,IAAI,GAAG,IAAI,EAAE,QAAQ,KAAK;AACjC,UAAM,KAAK,EAAE,CAAC,IAAK;AACnB,UAAM,KAAK,EAAE,CAAC,IAAK;AACnB,WAAO,KAAK;AACZ,YAAQ,KAAK;AACb,YAAQ,KAAK;AAAA,EACf;AACA,MAAI,SAAS,KAAK,SAAS,EAAG,QAAO,SAAS,KAAK,SAAS,IAAI,IAAI;AACpE,SAAO,MAAM,KAAK,KAAK,OAAO,IAAI;AACpC;;;ACvFA,eAAsB,eACpB,OACA,SACA,SACyB;AACzB,QAAM,eAAe,MAAM,MAAM,SAAS,QAAQ,QAAQ;AAC1D,QAAM,gBAAgB,MAAM,MAAM,SAAS,QAAQ,SAAS;AAC5D,QAAM,UAAU,MAAM,QAAQ;AAAA,IAC5B,QAAQ,IAAI,OAAO,MAAM;AACvB,YAAM,UAAU,EAAE,WAAW,eAAe,EAAE,MAAM;AACpD,YAAM,WAAW,MAAM,WAAW,cAAc,SAAS,KAAK;AAC9D,YAAM,YAAY,MAAM,WAAW,eAAe,SAAS,KAAK;AAChE,aAAO,EAAE,QAAQ,EAAE,QAAQ,gBAAgB,EAAE,gBAAgB,UAAU,UAAU;AAAA,IACnF,CAAC;AAAA,EACH;AACA,SAAO,kBAAkB,SAAS,OAAO;AAC3C;AAEA,eAAe,WACb,MACA,SACA,OACmB;AACnB,QAAM,MAAgB,CAAC;AACvB,aAAW,KAAK,MAAM;AACpB,UAAM,IAAI,MAAM,QAAQ,GAAG,KAAK;AAChC,QAAI,MAAM,QAAQ,OAAO,SAAS,CAAC,EAAG,KAAI,KAAK,CAAC;AAAA,EAClD;AACA,SAAO;AACT;AAEA,SAAS,eAAe,QAAyE;AAC/F,SAAO,OAAO,KAAK,UAAU;AAC3B,YAAQ,QAAQ;AAAA,MACd,KAAK;AAAA,MACL,KAAK;AACH,eAAO,IAAI,SAAS,SAAS;AAAA,MAC/B,KAAK;AACH,eAAO,IAAI,SAAS,SAAS,OAAO,IAAI;AAAA,MAC1C,KAAK;AACH,eAAO,IAAI,WAAW,IAAI,YAAY,IAAI,UAAU,IAAI,YAAY;AAAA,MACtE,KAAK,WAAW;AACd,cAAM,MAAM,MAAM,SAAS,OAAO,IAAI,KAAK;AAC3C,eAAO,aAAa,GAAG,EAAE;AAAA,MAC3B;AAAA,MACA,KAAK,eAAe;AAClB,cAAM,MAAM,MAAM,SAAS,OAAO,IAAI,KAAK;AAC3C,eAAO,aAAa,GAAG,EAAE;AAAA,MAC3B;AAAA,MACA,KAAK,gBAAgB;AACnB,cAAM,MAAM,MAAM,SAAS,OAAO,IAAI,KAAK;AAC3C,eAAO,aAAa,GAAG,EAAE;AAAA,MAC3B;AAAA,MACA,KAAK,gBAAgB;AACnB,eAAO,gBAAgB,GAAG,MAAM,YAAY,IAAI;AAAA,MAClD;AAAA,MACA;AACE,eAAO;AAAA,IACX;AAAA,EACF;AACF;;;AClDA,eAAsB,cACpB,OACA,UAA4B,CAAC,GACH;AAC1B,QAAM,iBAAiB,QAAQ,kBAAkB;AACjD,QAAM,OAAO,QAAQ,QACjB,CAAC,EAAE,OAAO,QAAQ,MAAM,CAAC,KACxB,MAAM,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE;AAE5D,QAAM,WAA+B,CAAC;AACtC,aAAW,EAAE,MAAM,KAAK,MAAM;AAC5B,UAAM,QAAQ,MAAM,UAAU,OAAO,KAAK;AAC1C,UAAM,QAAQ,oBAAI,IAAsD;AACxE,eAAW,KAAK,OAAO;AACrB,YAAM,IAAI,QAAQ,EAAE,IAAI;AACxB,YAAM,MAAM,GAAG,EAAE,QAAQ,IAAI,CAAC;AAC9B,YAAM,SAAS,MAAM,IAAI,GAAG,KAAK,EAAE,OAAO,CAAC,GAAG,SAAS,EAAE;AACzD,aAAO,MAAM,KAAK,CAAC;AACnB,YAAM,IAAI,KAAK,MAAM;AAAA,IACvB;AACA,eAAW,CAAC,KAAK,EAAE,OAAO,SAAS,EAAE,CAAC,KAAK,OAAO;AAChD,UAAI,MAAM,SAAS,eAAgB;AACnC,YAAM,SAAS,CAAC,GAAG,KAAK,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,YAAY,EAAE,SAAS;AAClE,YAAM,QAAQ,OAAO,CAAC,EAAG;AACzB,YAAM,OAAO,OAAO,OAAO,SAAS,CAAC,EAAG;AACxC,eAAS,KAAK;AAAA,QACZ;AAAA,QACA,UAAU,IAAI,MAAM,GAAG,EAAE,CAAC;AAAA,QAC1B,SAAS;AAAA,QACT,aAAa,OAAO;AAAA,QACpB,SAAS,OAAO,IAAI,CAAC,MAAM,EAAE,MAAM;AAAA,QACnC,UAAU,OAAO;AAAA,MACnB,CAAC;AAAA,IACH;AAAA,EACF;AAEA,QAAM,eAAe,IAAI,IAAI,SAAS,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC;AACzD,SAAO;AAAA,IACL;AAAA,IACA,kBAAkB,KAAK,SAAS,IAAI,aAAa,OAAO,KAAK,SAAS;AAAA,IACtE,WAAW,KAAK;AAAA,EAClB;AACF;;;AC5CA,eAAsB,cACpB,OACA,UAA4B,CAAC,GACH;AAC1B,QAAM,OAAO,QAAQ,QAAQ,CAAC,QAAQ,KAAK,KAAK,MAAM,MAAM,SAAS,GAAG,IAAI,CAAC,MAAM,EAAE,KAAK;AAE1F,QAAM,QAA4B,CAAC;AACnC,MAAI,aAAa;AACjB,MAAI,cAAc;AAClB,aAAW,SAAS,MAAM;AACxB,UAAM,QAAQ,MAAM,UAAU,OAAO,KAAK;AAC1C,QAAI,MAAM,WAAW,GAAG;AACtB,YAAM,KAAK,EAAE,OAAO,aAAa,GAAG,YAAY,GAAG,WAAW,EAAE,CAAC;AACjE;AAAA,IACF;AACA,UAAM,OAAO,MAAM,SAAS,OAAO,KAAK;AACxC,QAAI,SAAS;AACb,eAAW,KAAK,OAAO;AACrB,UAAI,EAAE,WAAW,SAAS;AACxB;AACA;AAAA,MACF;AACA,YAAM,WAAW,KAAK,OAAO,CAAC,MAAM,EAAE,YAAY,EAAE,SAAS;AAC7D,UAAI,QAAQ,aAAa;AACvB,YAAI,CAAC,QAAQ,YAAY,GAAG,EAAE,KAAK,SAAS,CAAC,EAAG;AAAA,MAClD,OAAO;AAGL,cAAM,YAAY,UAAU,EAAE,MAAM;AACpC,cAAM,OAAO,SAAS;AAAA,UAAK,CAAC,MAC1B,EAAE,SAAS;AAAA,YACT,CAAC,MACC,OAAO,EAAE,YAAY,YACrB,aACA,EAAE,QAAQ,SAAS,UAAU,MAAM,GAAG,GAAG,CAAC;AAAA,UAC9C;AAAA,QACF;AACA,YAAI,CAAC,KAAM;AAAA,MACb;AAAA,IACF;AACA,UAAM,YAAY,SAAS,MAAM;AACjC,UAAM,KAAK,EAAE,OAAO,aAAa,QAAQ,YAAY,MAAM,QAAQ,UAAU,CAAC;AAC9E,kBAAc,MAAM;AACpB,mBAAe;AAAA,EACjB;AACA,SAAO,EAAE,OAAO,kBAAkB,aAAa,IAAI,cAAc,aAAa,EAAE;AAClF;AAEA,SAAS,UAAU,GAAoB;AACrC,MAAI,MAAM,QAAQ,MAAM,OAAW,QAAO;AAC1C,MAAI,OAAO,MAAM,SAAU,QAAO;AAClC,MAAI;AACF,WAAO,KAAK,UAAU,CAAC;AAAA,EACzB,QAAQ;AACN,WAAO,OAAO,CAAC;AAAA,EACjB;AACF;","names":[]}
1
+ {"version":3,"sources":["../../src/pipelines/budget-breach.ts","../../src/pipelines/failure-cluster.ts","../../src/pipelines/first-divergence.ts","../../src/pipelines/judge-agreement.ts","../../src/pipelines/regression.ts","../../src/pipelines/stuck-loop.ts","../../src/pipelines/tool-waste.ts"],"sourcesContent":["/**\n * BudgetBreachView — aggregates breach events across the corpus.\n *\n * Answers: which dimensions get hit most often? Which scenarios are\n * underbudgeted? Which variants trigger the most breaches?\n */\n\nimport type { BudgetSpec } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\n\nexport interface BudgetBreachFinding {\n runId: string\n scenarioId: string\n variantId?: string\n dimension: keyof BudgetSpec\n limit: number\n consumed: number\n excessRatio: number\n timestamp: number\n}\n\nexport interface BudgetBreachReport {\n findings: BudgetBreachFinding[]\n byDimension: Record<string, number>\n byScenario: Record<string, number>\n byVariant: Record<string, number>\n totalRuns: number\n breachedRunRatio: number\n}\n\nexport async function budgetBreachView(\n store: TraceStore,\n options: { scenarioId?: string; variantId?: string } = {},\n): Promise<BudgetBreachReport> {\n const runs = await store.listRuns({\n scenarioId: options.scenarioId,\n variantId: options.variantId,\n })\n const findings: BudgetBreachFinding[] = []\n const byDimension: Record<string, number> = {}\n const byScenario: Record<string, number> = {}\n const byVariant: Record<string, number> = {}\n\n for (const run of runs) {\n const entries = await store.budget(run.runId)\n for (const e of entries) {\n if (!e.breached) continue\n const excessRatio = e.limit > 0 ? e.consumed / e.limit : Infinity\n findings.push({\n runId: run.runId,\n scenarioId: run.scenarioId,\n variantId: run.variantId,\n dimension: e.dimension,\n limit: e.limit,\n consumed: e.consumed,\n excessRatio,\n timestamp: e.timestamp,\n })\n byDimension[e.dimension] = (byDimension[e.dimension] ?? 0) + 1\n byScenario[run.scenarioId] = (byScenario[run.scenarioId] ?? 0) + 1\n if (run.variantId) byVariant[run.variantId] = (byVariant[run.variantId] ?? 0) + 1\n }\n }\n\n const breachedRuns = new Set(findings.map((f) => f.runId))\n return {\n findings,\n byDimension,\n byScenario,\n byVariant,\n totalRuns: runs.length,\n breachedRunRatio: runs.length > 0 ? breachedRuns.size / runs.length : 0,\n }\n}\n","/**\n * FailureClusterView — groups failed runs by (failureClass, triggerTool,\n * argHash-prefix) so weekly reviews can prioritize the top-N clusters.\n *\n * Each cluster includes: N runs, scenarios affected, representative\n * error message, a proposed mitigation hint (rule → action table).\n */\n\nimport { classifyFailure, DEFAULT_RULES, type FailureRule } from '../failure-taxonomy'\nimport { argHash, toolSpans } from '../trace/query'\nimport type { FailureClass, Span } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\n\nexport interface FailureCluster {\n failureClass: FailureClass\n /** Tool name when the trigger was a tool span, else undefined. */\n toolName?: string\n /** First 16 chars of argHash — clusters similar args. */\n argPrefix?: string\n /**\n * Source dimension when the trigger was a judge span (e.g. `'format'`,\n * `'safety'`, `'correctness'`). Lets cross-template aggregators\n * group failures by the dimension that fired without overloading\n * `argPrefix`. Optional — clusters without this field deserialize cleanly.\n */\n dimension?: string\n runCount: number\n scenarioIds: string[]\n exampleError?: string\n exampleRunId: string\n}\n\nexport interface FailureClusterReport {\n clusters: FailureCluster[]\n totalFailures: number\n totalRuns: number\n}\n\nexport async function failureClusterView(\n store: TraceStore,\n options: { rules?: FailureRule[]; minClusterSize?: number } = {},\n): Promise<FailureClusterReport> {\n const rules = options.rules ?? DEFAULT_RULES\n const minSize = options.minClusterSize ?? 1\n const runs = await store.listRuns()\n\n type Key = string\n const clusters = new Map<Key, FailureCluster>()\n let totalFailures = 0\n\n for (const run of runs) {\n if (run.status === 'completed' && run.outcome?.pass !== false) continue\n totalFailures++\n const spans = await store.spans({ runId: run.runId })\n const events = await store.events({ runId: run.runId })\n const cls = classifyFailure({ run, spans, events }, rules)\n\n let toolName: string | undefined\n let argPrefix: string | undefined\n let dimension: string | undefined\n if (cls.triggerSpanId) {\n const trig = spans.find((s) => s.spanId === cls.triggerSpanId)\n if (trig?.kind === 'tool') {\n toolName = trig.toolName\n argPrefix = argHash(trig.args).slice(0, 16)\n } else if (trig?.kind === 'judge') {\n dimension = trig.dimension\n }\n }\n // Fallback: look at the last errored tool span\n if (!toolName) {\n const ts = await toolSpans(store, run.runId)\n const errored = ts.filter((t) => t.status === 'error').pop()\n if (errored) {\n toolName = errored.toolName\n argPrefix = argHash(errored.args).slice(0, 16)\n }\n }\n // Secondary signal: any judge span on the failed run carries a\n // dimension. Useful when the rule classified by judge score but\n // didn't surface the trigger span (or surfaced a non-judge span).\n if (!dimension) {\n const judge = spans.find((s) => s.kind === 'judge' && typeof s.dimension === 'string')\n if (judge?.kind === 'judge') dimension = judge.dimension\n }\n\n const key = `${cls.failureClass}|${toolName ?? ''}|${argPrefix ?? ''}|${dimension ?? ''}`\n let cluster = clusters.get(key)\n if (!cluster) {\n cluster = {\n failureClass: cls.failureClass,\n toolName,\n argPrefix,\n dimension,\n runCount: 0,\n scenarioIds: [],\n exampleRunId: run.runId,\n exampleError: firstErrorMessage(spans) ?? cls.reason,\n }\n clusters.set(key, cluster)\n }\n cluster.runCount++\n if (!cluster.scenarioIds.includes(run.scenarioId)) cluster.scenarioIds.push(run.scenarioId)\n }\n\n const arr = [...clusters.values()]\n .filter((c) => c.runCount >= minSize)\n .sort((a, b) => b.runCount - a.runCount)\n\n return { clusters: arr, totalFailures, totalRuns: runs.length }\n}\n\nfunction firstErrorMessage(spans: Span[]): string | undefined {\n const errored = spans.find((s) => s.status === 'error')\n return errored?.error\n}\n","/**\n * FirstDivergenceView — aligns two trajectories by step index, reports\n * the first step where they differ.\n *\n * \"Differ\" is configurable — default is (kind, toolName if tool, model\n * if llm). Use this view to attribute \"why is variant B better?\" to a\n * specific step rather than an aggregate mean delta.\n */\n\nimport type { TraceStore } from '../trace/store'\nimport { buildTrajectory, type Trajectory, type TrajectoryStep } from '../trajectory'\n\nexport interface DivergenceReport {\n runA: string\n runB: string\n firstDivergenceIndex: number | null\n aStep?: TrajectoryStep\n bStep?: TrajectoryStep\n reason?: string\n /** Common prefix length (steps that matched). */\n commonPrefixLen: number\n}\n\nexport interface DivergenceOptions {\n /** Returns true if two steps are considered equal. Default: kind + tool/model match. */\n stepEquals?: (a: TrajectoryStep, b: TrajectoryStep) => boolean\n}\n\nexport async function firstDivergenceView(\n store: TraceStore,\n runA: string,\n runB: string,\n options: DivergenceOptions = {},\n): Promise<DivergenceReport> {\n const [a, b] = await Promise.all([buildTrajectory(store, runA), buildTrajectory(store, runB)])\n const eq = options.stepEquals ?? defaultStepEquals\n const minLen = Math.min(a.steps.length, b.steps.length)\n for (let i = 0; i < minLen; i++) {\n const aStep = a.steps[i]!\n const bStep = b.steps[i]!\n if (!eq(aStep, bStep)) {\n return {\n runA,\n runB,\n firstDivergenceIndex: i,\n aStep,\n bStep,\n reason: describeDifference(aStep, bStep),\n commonPrefixLen: i,\n }\n }\n }\n if (a.steps.length === b.steps.length) {\n return { runA, runB, firstDivergenceIndex: null, commonPrefixLen: minLen }\n }\n const longer: Trajectory = a.steps.length > b.steps.length ? a : b\n return {\n runA,\n runB,\n firstDivergenceIndex: minLen,\n aStep: a.steps[minLen],\n bStep: b.steps[minLen],\n reason: `one trajectory has ${longer.steps.length - minLen} more step(s) after index ${minLen - 1}`,\n commonPrefixLen: minLen,\n }\n}\n\nfunction defaultStepEquals(a: TrajectoryStep, b: TrajectoryStep): boolean {\n if (a.span.kind !== b.span.kind) return false\n if (a.span.kind === 'tool' && b.span.kind === 'tool') return a.span.toolName === b.span.toolName\n if (a.span.kind === 'llm' && b.span.kind === 'llm') return a.span.model === b.span.model\n if (a.span.kind === 'judge' && b.span.kind === 'judge')\n return a.span.dimension === b.span.dimension\n return a.span.name === b.span.name\n}\n\nfunction describeDifference(a: TrajectoryStep, b: TrajectoryStep): string {\n if (a.span.kind !== b.span.kind) return `kind ${a.span.kind} vs ${b.span.kind}`\n if (a.span.kind === 'tool' && b.span.kind === 'tool' && a.span.toolName !== b.span.toolName) {\n return `tool ${a.span.toolName} vs ${b.span.toolName}`\n }\n if (a.span.kind === 'llm' && b.span.kind === 'llm' && a.span.model !== b.span.model) {\n return `model ${a.span.model} vs ${b.span.model}`\n }\n return `name \"${a.span.name}\" vs \"${b.span.name}\"`\n}\n","/**\n * JudgeAgreementView — pairwise agreement between judges across the\n * corpus, grouped by dimension.\n *\n * Output drives two workflows:\n * - Judge robustness audit: \"does Claude agree with GPT at κ ≥ 0.6?\"\n * - Calibration tracking: κ vs golden human labels over time (by\n * providing a `humanGoldenJudgeId`).\n */\n\nimport { interRaterReliability } from '../statistics'\nimport type { JudgeSpan } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\n\nexport interface JudgePair {\n judgeA: string\n judgeB: string\n dimension: string\n /** Number of (targetSpanId, dimension) tuples both judges scored. */\n commonItems: number\n pearson: number\n krippendorff: number\n}\n\nexport interface JudgeAgreementReport {\n pairs: JudgePair[]\n dimensions: string[]\n judgeIds: string[]\n}\n\nexport async function judgeAgreementView(store: TraceStore): Promise<JudgeAgreementReport> {\n const all = (await store.spans({ kind: 'judge' })).filter(\n (s): s is JudgeSpan => s.kind === 'judge',\n )\n if (all.length === 0) return { pairs: [], dimensions: [], judgeIds: [] }\n\n const byDimension = new Map<string, JudgeSpan[]>()\n for (const s of all) {\n const arr = byDimension.get(s.dimension) ?? []\n arr.push(s)\n byDimension.set(s.dimension, arr)\n }\n\n const judgeIds = [...new Set(all.map((s) => s.judgeId))].sort()\n const pairs: JudgePair[] = []\n for (const [dim, spans] of byDimension) {\n const byJudge = new Map<string, Map<string, number>>()\n for (const s of spans) {\n const m = byJudge.get(s.judgeId) ?? new Map<string, number>()\n m.set(s.targetSpanId, s.score)\n byJudge.set(s.judgeId, m)\n }\n const judgesHere = [...byJudge.keys()]\n for (let i = 0; i < judgesHere.length; i++) {\n for (let j = i + 1; j < judgesHere.length; j++) {\n const judgeI = judgesHere[i]!\n const judgeJ = judgesHere[j]!\n const a = byJudge.get(judgeI)!\n const b = byJudge.get(judgeJ)!\n const common: Array<[number, number]> = []\n for (const [target, scoreA] of a) {\n const scoreB = b.get(target)\n if (scoreB !== undefined) common.push([scoreA, scoreB])\n }\n if (common.length < 2) continue\n const judgeScores = common.map(\n ([scoreA, scoreB]) =>\n [\n { judgeName: judgeI, dimension: dim, score: scoreA, reasoning: '' },\n { judgeName: judgeJ, dimension: dim, score: scoreB, reasoning: '' },\n ] as const,\n )\n const k = interRaterReliability(\n judgeScores[0]!.map((_, k2) => judgeScores.map((pair) => pair[k2]!)),\n )\n pairs.push({\n judgeA: judgeI,\n judgeB: judgeJ,\n dimension: dim,\n commonItems: common.length,\n pearson: pearson(\n common.map((c) => c[0]),\n common.map((c) => c[1]),\n ),\n krippendorff: k,\n })\n }\n }\n }\n\n return {\n pairs: pairs.sort((a, b) => b.commonItems - a.commonItems),\n dimensions: [...byDimension.keys()].sort(),\n judgeIds,\n }\n}\n\nfunction pearson(a: number[], b: number[]): number {\n if (a.length !== b.length || a.length < 2) return NaN\n const mA = a.reduce((s, v) => s + v, 0) / a.length\n const mB = b.reduce((s, v) => s + v, 0) / b.length\n let num = 0,\n denA = 0,\n denB = 0\n for (let i = 0; i < a.length; i++) {\n const dA = a[i]! - mA\n const dB = b[i]! - mB\n num += dA * dB\n denA += dA * dA\n denB += dB * dB\n }\n if (denA === 0 || denB === 0) return denA === 0 && denB === 0 ? 1 : 0\n return num / Math.sqrt(denA * denB)\n}\n","/**\n * RegressionView — compares a candidate slice to a baseline slice on a\n * named metric. Delegates the statistics (Welch's t-test, Cohen's d,\n * IQR stability) to `baseline.ts`.\n *\n * This is the entry point for CI regression gates: \"given runs tagged\n * release=A and release=B, did any metric regress?\"\n */\n\nimport { type BaselineOptions, type BaselineReport, compareToBaseline } from '../baseline'\nimport { aggregateLlm, llmSpans, runFailureClass } from '../trace/query'\nimport type { Run } from '../trace/schema'\nimport type { RunFilter, TraceStore } from '../trace/store'\n\nexport interface RegressionSpec {\n metric: string\n higherIsBetter: boolean\n /** Extract a scalar from a run. Default extractors handle common metrics. */\n extract?: (run: Run, store: TraceStore) => Promise<number | null>\n}\n\nexport interface RegressionOptions extends BaselineOptions {\n baseline: RunFilter\n candidate: RunFilter\n}\n\nexport async function regressionView(\n store: TraceStore,\n metrics: RegressionSpec[],\n options: RegressionOptions,\n): Promise<BaselineReport> {\n const baselineRuns = await store.listRuns(options.baseline)\n const candidateRuns = await store.listRuns(options.candidate)\n const samples = await Promise.all(\n metrics.map(async (m) => {\n const extract = m.extract ?? defaultExtract(m.metric)\n const baseline = await extractAll(baselineRuns, extract, store)\n const candidate = await extractAll(candidateRuns, extract, store)\n return { metric: m.metric, higherIsBetter: m.higherIsBetter, baseline, candidate }\n }),\n )\n return compareToBaseline(samples, options)\n}\n\nasync function extractAll(\n runs: Run[],\n extract: (r: Run, s: TraceStore) => Promise<number | null>,\n store: TraceStore,\n): Promise<number[]> {\n const out: number[] = []\n for (const r of runs) {\n const v = await extract(r, store)\n if (v !== null && Number.isFinite(v)) out.push(v)\n }\n return out\n}\n\nfunction defaultExtract(metric: string): (run: Run, store: TraceStore) => Promise<number | null> {\n return async (run, store) => {\n switch (metric) {\n case 'score':\n case 'overallScore':\n return run.outcome?.score ?? null\n case 'pass':\n return run.outcome?.pass === true ? 1 : 0\n case 'durationMs':\n return run.endedAt && run.startedAt ? run.endedAt - run.startedAt : null\n case 'costUsd': {\n const llm = await llmSpans(store, run.runId)\n return aggregateLlm(llm).costUsd\n }\n case 'inputTokens': {\n const llm = await llmSpans(store, run.runId)\n return aggregateLlm(llm).inputTokens\n }\n case 'outputTokens': {\n const llm = await llmSpans(store, run.runId)\n return aggregateLlm(llm).outputTokens\n }\n case 'failureClass': {\n return runFailureClass(run) === 'success' ? 1 : 0\n }\n default:\n return null\n }\n }\n}\n","/**\n * StuckLoopView — detects when an agent calls the same tool with the\n * same (or structurally similar) arguments ≥ N times in a short window.\n *\n * Rationale: agents that loop are the number-one production failure\n * mode on long-horizon flows. The view returns (runId, toolName,\n * argHash, occurrences, windowMs) for each detected loop plus a\n * fraction of runs affected.\n */\n\nimport { argHash, toolSpans } from '../trace/query'\nimport type { TraceStore } from '../trace/store'\n\nexport interface StuckLoopFinding {\n runId: string\n toolName: string\n argHash: string\n occurrences: number\n spanIds: string[]\n /** Milliseconds between first and last call in the loop. */\n windowMs: number\n}\n\nexport interface StuckLoopReport {\n findings: StuckLoopFinding[]\n affectedRunRatio: number\n totalRuns: number\n}\n\nexport interface StuckLoopOptions {\n /** Minimum call count to flag a loop (default 3). */\n minOccurrences?: number\n /** Filter to a specific runId; omit to scan the entire corpus. */\n runId?: string\n}\n\nexport async function stuckLoopView(\n store: TraceStore,\n options: StuckLoopOptions = {},\n): Promise<StuckLoopReport> {\n const minOccurrences = options.minOccurrences ?? 3\n const runs = options.runId\n ? [{ runId: options.runId }]\n : (await store.listRuns()).map((r) => ({ runId: r.runId }))\n\n const findings: StuckLoopFinding[] = []\n for (const { runId } of runs) {\n const tools = await toolSpans(store, runId)\n const byKey = new Map<string, { spans: typeof tools; argHash: string }>()\n for (const t of tools) {\n const h = argHash(t.args)\n const key = `${t.toolName}|${h}`\n const bucket = byKey.get(key) ?? { spans: [], argHash: h }\n bucket.spans.push(t)\n byKey.set(key, bucket)\n }\n for (const [key, { spans, argHash: h }] of byKey) {\n if (spans.length < minOccurrences) continue\n const sorted = [...spans].sort((a, b) => a.startedAt - b.startedAt)\n const first = sorted[0]!.startedAt\n const last = sorted[sorted.length - 1]!.startedAt\n findings.push({\n runId,\n toolName: key.split('|')[0]!,\n argHash: h,\n occurrences: sorted.length,\n spanIds: sorted.map((s) => s.spanId),\n windowMs: last - first,\n })\n }\n }\n\n const affectedRuns = new Set(findings.map((f) => f.runId))\n return {\n findings,\n affectedRunRatio: runs.length > 0 ? affectedRuns.size / runs.length : 0,\n totalRuns: runs.length,\n }\n}\n","/**\n * ToolWasteView — fraction of tool calls whose results weren't used\n * downstream. Without a \"used\" signal we fall back to structural\n * proxies: error calls, duplicate calls, and tool calls followed by\n * zero subsequent LLM spans are all considered waste.\n *\n * Consumers can pass a `usageOracle` that inspects a tool span and\n * returns true iff the tool's result appears in a later LLM message,\n * artifact, or state mutation — that's the canonical definition; the\n * default heuristic is a reasonable fallback.\n */\n\nimport { computeToolUseMetrics } from '../tool-use-metrics'\nimport { llmSpans, toolSpans } from '../trace/query'\nimport type { ToolSpan } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\n\nexport interface ToolWasteFinding {\n runId: string\n wastedCalls: number\n totalCalls: number\n wasteRate: number\n}\n\nexport interface ToolWasteReport {\n byRun: ToolWasteFinding[]\n overallWasteRate: number\n}\n\nexport interface ToolWasteOptions {\n runId?: string\n usageOracle?: (tool: ToolSpan, later: { llm: Awaited<ReturnType<typeof llmSpans>> }) => boolean\n}\n\nexport async function toolWasteView(\n store: TraceStore,\n options: ToolWasteOptions = {},\n): Promise<ToolWasteReport> {\n const runs = options.runId ? [options.runId] : (await store.listRuns()).map((r) => r.runId)\n\n const byRun: ToolWasteFinding[] = []\n let totalCalls = 0\n let totalWasted = 0\n for (const runId of runs) {\n const tools = await toolSpans(store, runId)\n if (tools.length === 0) {\n byRun.push({ runId, wastedCalls: 0, totalCalls: 0, wasteRate: 0 })\n continue\n }\n const llms = await llmSpans(store, runId)\n let wasted = 0\n for (const t of tools) {\n if (t.status === 'error') {\n wasted++\n continue\n }\n const laterLlm = llms.filter((l) => l.startedAt > t.startedAt)\n if (options.usageOracle) {\n if (!options.usageOracle(t, { llm: laterLlm })) wasted++\n } else {\n // Default heuristic: a tool whose result is NOT mentioned in any\n // later LLM input message is likely wasted.\n const resultStr = stringify(t.result)\n const used = laterLlm.some((l) =>\n l.messages.some(\n (m) =>\n typeof m.content === 'string' &&\n resultStr &&\n m.content.includes(resultStr.slice(0, 120)),\n ),\n )\n if (!used) wasted++\n }\n }\n const wasteRate = wasted / tools.length\n byRun.push({ runId, wastedCalls: wasted, totalCalls: tools.length, wasteRate })\n totalCalls += tools.length\n totalWasted += wasted\n }\n return { byRun, overallWasteRate: totalCalls > 0 ? totalWasted / totalCalls : 0 }\n}\n\nfunction stringify(v: unknown): string {\n if (v === null || v === undefined) return ''\n if (typeof v === 'string') return v\n try {\n return JSON.stringify(v)\n } catch {\n return String(v)\n }\n}\n\n// Re-export for convenience in consumers that want both descriptive and usage metrics.\nexport { computeToolUseMetrics }\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;AA8BA,eAAsB,iBACpB,OACA,UAAuD,CAAC,GAC3B;AAC7B,QAAM,OAAO,MAAM,MAAM,SAAS;AAAA,IAChC,YAAY,QAAQ;AAAA,IACpB,WAAW,QAAQ;AAAA,EACrB,CAAC;AACD,QAAM,WAAkC,CAAC;AACzC,QAAM,cAAsC,CAAC;AAC7C,QAAM,aAAqC,CAAC;AAC5C,QAAM,YAAoC,CAAC;AAE3C,aAAW,OAAO,MAAM;AACtB,UAAM,UAAU,MAAM,MAAM,OAAO,IAAI,KAAK;AAC5C,eAAW,KAAK,SAAS;AACvB,UAAI,CAAC,EAAE,SAAU;AACjB,YAAM,cAAc,EAAE,QAAQ,IAAI,EAAE,WAAW,EAAE,QAAQ;AACzD,eAAS,KAAK;AAAA,QACZ,OAAO,IAAI;AAAA,QACX,YAAY,IAAI;AAAA,QAChB,WAAW,IAAI;AAAA,QACf,WAAW,EAAE;AAAA,QACb,OAAO,EAAE;AAAA,QACT,UAAU,EAAE;AAAA,QACZ;AAAA,QACA,WAAW,EAAE;AAAA,MACf,CAAC;AACD,kBAAY,EAAE,SAAS,KAAK,YAAY,EAAE,SAAS,KAAK,KAAK;AAC7D,iBAAW,IAAI,UAAU,KAAK,WAAW,IAAI,UAAU,KAAK,KAAK;AACjE,UAAI,IAAI,UAAW,WAAU,IAAI,SAAS,KAAK,UAAU,IAAI,SAAS,KAAK,KAAK;AAAA,IAClF;AAAA,EACF;AAEA,QAAM,eAAe,IAAI,IAAI,SAAS,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC;AACzD,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,WAAW,KAAK;AAAA,IAChB,kBAAkB,KAAK,SAAS,IAAI,aAAa,OAAO,KAAK,SAAS;AAAA,EACxE;AACF;;;ACnCA,eAAsB,mBACpB,OACA,UAA8D,CAAC,GAChC;AAC/B,QAAM,QAAQ,QAAQ,SAAS;AAC/B,QAAM,UAAU,QAAQ,kBAAkB;AAC1C,QAAM,OAAO,MAAM,MAAM,SAAS;AAGlC,QAAM,WAAW,oBAAI,IAAyB;AAC9C,MAAI,gBAAgB;AAEpB,aAAW,OAAO,MAAM;AACtB,QAAI,IAAI,WAAW,eAAe,IAAI,SAAS,SAAS,MAAO;AAC/D;AACA,UAAM,QAAQ,MAAM,MAAM,MAAM,EAAE,OAAO,IAAI,MAAM,CAAC;AACpD,UAAM,SAAS,MAAM,MAAM,OAAO,EAAE,OAAO,IAAI,MAAM,CAAC;AACtD,UAAM,MAAM,gBAAgB,EAAE,KAAK,OAAO,OAAO,GAAG,KAAK;AAEzD,QAAI;AACJ,QAAI;AACJ,QAAI;AACJ,QAAI,IAAI,eAAe;AACrB,YAAM,OAAO,MAAM,KAAK,CAAC,MAAM,EAAE,WAAW,IAAI,aAAa;AAC7D,UAAI,MAAM,SAAS,QAAQ;AACzB,mBAAW,KAAK;AAChB,oBAAY,QAAQ,KAAK,IAAI,EAAE,MAAM,GAAG,EAAE;AAAA,MAC5C,WAAW,MAAM,SAAS,SAAS;AACjC,oBAAY,KAAK;AAAA,MACnB;AAAA,IACF;AAEA,QAAI,CAAC,UAAU;AACb,YAAM,KAAK,MAAM,UAAU,OAAO,IAAI,KAAK;AAC3C,YAAM,UAAU,GAAG,OAAO,CAAC,MAAM,EAAE,WAAW,OAAO,EAAE,IAAI;AAC3D,UAAI,SAAS;AACX,mBAAW,QAAQ;AACnB,oBAAY,QAAQ,QAAQ,IAAI,EAAE,MAAM,GAAG,EAAE;AAAA,MAC/C;AAAA,IACF;AAIA,QAAI,CAAC,WAAW;AACd,YAAM,QAAQ,MAAM,KAAK,CAAC,MAAM,EAAE,SAAS,WAAW,OAAO,EAAE,cAAc,QAAQ;AACrF,UAAI,OAAO,SAAS,QAAS,aAAY,MAAM;AAAA,IACjD;AAEA,UAAM,MAAM,GAAG,IAAI,YAAY,IAAI,YAAY,EAAE,IAAI,aAAa,EAAE,IAAI,aAAa,EAAE;AACvF,QAAI,UAAU,SAAS,IAAI,GAAG;AAC9B,QAAI,CAAC,SAAS;AACZ,gBAAU;AAAA,QACR,cAAc,IAAI;AAAA,QAClB;AAAA,QACA;AAAA,QACA;AAAA,QACA,UAAU;AAAA,QACV,aAAa,CAAC;AAAA,QACd,cAAc,IAAI;AAAA,QAClB,cAAc,kBAAkB,KAAK,KAAK,IAAI;AAAA,MAChD;AACA,eAAS,IAAI,KAAK,OAAO;AAAA,IAC3B;AACA,YAAQ;AACR,QAAI,CAAC,QAAQ,YAAY,SAAS,IAAI,UAAU,EAAG,SAAQ,YAAY,KAAK,IAAI,UAAU;AAAA,EAC5F;AAEA,QAAM,MAAM,CAAC,GAAG,SAAS,OAAO,CAAC,EAC9B,OAAO,CAAC,MAAM,EAAE,YAAY,OAAO,EACnC,KAAK,CAAC,GAAG,MAAM,EAAE,WAAW,EAAE,QAAQ;AAEzC,SAAO,EAAE,UAAU,KAAK,eAAe,WAAW,KAAK,OAAO;AAChE;AAEA,SAAS,kBAAkB,OAAmC;AAC5D,QAAM,UAAU,MAAM,KAAK,CAAC,MAAM,EAAE,WAAW,OAAO;AACtD,SAAO,SAAS;AAClB;;;ACvFA,eAAsB,oBACpB,OACA,MACA,MACA,UAA6B,CAAC,GACH;AAC3B,QAAM,CAAC,GAAG,CAAC,IAAI,MAAM,QAAQ,IAAI,CAAC,gBAAgB,OAAO,IAAI,GAAG,gBAAgB,OAAO,IAAI,CAAC,CAAC;AAC7F,QAAM,KAAK,QAAQ,cAAc;AACjC,QAAM,SAAS,KAAK,IAAI,EAAE,MAAM,QAAQ,EAAE,MAAM,MAAM;AACtD,WAAS,IAAI,GAAG,IAAI,QAAQ,KAAK;AAC/B,UAAM,QAAQ,EAAE,MAAM,CAAC;AACvB,UAAM,QAAQ,EAAE,MAAM,CAAC;AACvB,QAAI,CAAC,GAAG,OAAO,KAAK,GAAG;AACrB,aAAO;AAAA,QACL;AAAA,QACA;AAAA,QACA,sBAAsB;AAAA,QACtB;AAAA,QACA;AAAA,QACA,QAAQ,mBAAmB,OAAO,KAAK;AAAA,QACvC,iBAAiB;AAAA,MACnB;AAAA,IACF;AAAA,EACF;AACA,MAAI,EAAE,MAAM,WAAW,EAAE,MAAM,QAAQ;AACrC,WAAO,EAAE,MAAM,MAAM,sBAAsB,MAAM,iBAAiB,OAAO;AAAA,EAC3E;AACA,QAAM,SAAqB,EAAE,MAAM,SAAS,EAAE,MAAM,SAAS,IAAI;AACjE,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,sBAAsB;AAAA,IACtB,OAAO,EAAE,MAAM,MAAM;AAAA,IACrB,OAAO,EAAE,MAAM,MAAM;AAAA,IACrB,QAAQ,sBAAsB,OAAO,MAAM,SAAS,MAAM,6BAA6B,SAAS,CAAC;AAAA,IACjG,iBAAiB;AAAA,EACnB;AACF;AAEA,SAAS,kBAAkB,GAAmB,GAA4B;AACxE,MAAI,EAAE,KAAK,SAAS,EAAE,KAAK,KAAM,QAAO;AACxC,MAAI,EAAE,KAAK,SAAS,UAAU,EAAE,KAAK,SAAS,OAAQ,QAAO,EAAE,KAAK,aAAa,EAAE,KAAK;AACxF,MAAI,EAAE,KAAK,SAAS,SAAS,EAAE,KAAK,SAAS,MAAO,QAAO,EAAE,KAAK,UAAU,EAAE,KAAK;AACnF,MAAI,EAAE,KAAK,SAAS,WAAW,EAAE,KAAK,SAAS;AAC7C,WAAO,EAAE,KAAK,cAAc,EAAE,KAAK;AACrC,SAAO,EAAE,KAAK,SAAS,EAAE,KAAK;AAChC;AAEA,SAAS,mBAAmB,GAAmB,GAA2B;AACxE,MAAI,EAAE,KAAK,SAAS,EAAE,KAAK,KAAM,QAAO,QAAQ,EAAE,KAAK,IAAI,OAAO,EAAE,KAAK,IAAI;AAC7E,MAAI,EAAE,KAAK,SAAS,UAAU,EAAE,KAAK,SAAS,UAAU,EAAE,KAAK,aAAa,EAAE,KAAK,UAAU;AAC3F,WAAO,QAAQ,EAAE,KAAK,QAAQ,OAAO,EAAE,KAAK,QAAQ;AAAA,EACtD;AACA,MAAI,EAAE,KAAK,SAAS,SAAS,EAAE,KAAK,SAAS,SAAS,EAAE,KAAK,UAAU,EAAE,KAAK,OAAO;AACnF,WAAO,SAAS,EAAE,KAAK,KAAK,OAAO,EAAE,KAAK,KAAK;AAAA,EACjD;AACA,SAAO,SAAS,EAAE,KAAK,IAAI,SAAS,EAAE,KAAK,IAAI;AACjD;;;ACvDA,eAAsB,mBAAmB,OAAkD;AACzF,QAAM,OAAO,MAAM,MAAM,MAAM,EAAE,MAAM,QAAQ,CAAC,GAAG;AAAA,IACjD,CAAC,MAAsB,EAAE,SAAS;AAAA,EACpC;AACA,MAAI,IAAI,WAAW,EAAG,QAAO,EAAE,OAAO,CAAC,GAAG,YAAY,CAAC,GAAG,UAAU,CAAC,EAAE;AAEvE,QAAM,cAAc,oBAAI,IAAyB;AACjD,aAAW,KAAK,KAAK;AACnB,UAAM,MAAM,YAAY,IAAI,EAAE,SAAS,KAAK,CAAC;AAC7C,QAAI,KAAK,CAAC;AACV,gBAAY,IAAI,EAAE,WAAW,GAAG;AAAA,EAClC;AAEA,QAAM,WAAW,CAAC,GAAG,IAAI,IAAI,IAAI,IAAI,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,EAAE,KAAK;AAC9D,QAAM,QAAqB,CAAC;AAC5B,aAAW,CAAC,KAAK,KAAK,KAAK,aAAa;AACtC,UAAM,UAAU,oBAAI,IAAiC;AACrD,eAAW,KAAK,OAAO;AACrB,YAAM,IAAI,QAAQ,IAAI,EAAE,OAAO,KAAK,oBAAI,IAAoB;AAC5D,QAAE,IAAI,EAAE,cAAc,EAAE,KAAK;AAC7B,cAAQ,IAAI,EAAE,SAAS,CAAC;AAAA,IAC1B;AACA,UAAM,aAAa,CAAC,GAAG,QAAQ,KAAK,CAAC;AACrC,aAAS,IAAI,GAAG,IAAI,WAAW,QAAQ,KAAK;AAC1C,eAAS,IAAI,IAAI,GAAG,IAAI,WAAW,QAAQ,KAAK;AAC9C,cAAM,SAAS,WAAW,CAAC;AAC3B,cAAM,SAAS,WAAW,CAAC;AAC3B,cAAM,IAAI,QAAQ,IAAI,MAAM;AAC5B,cAAM,IAAI,QAAQ,IAAI,MAAM;AAC5B,cAAM,SAAkC,CAAC;AACzC,mBAAW,CAAC,QAAQ,MAAM,KAAK,GAAG;AAChC,gBAAM,SAAS,EAAE,IAAI,MAAM;AAC3B,cAAI,WAAW,OAAW,QAAO,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,QACxD;AACA,YAAI,OAAO,SAAS,EAAG;AACvB,cAAM,cAAc,OAAO;AAAA,UACzB,CAAC,CAAC,QAAQ,MAAM,MACd;AAAA,YACE,EAAE,WAAW,QAAQ,WAAW,KAAK,OAAO,QAAQ,WAAW,GAAG;AAAA,YAClE,EAAE,WAAW,QAAQ,WAAW,KAAK,OAAO,QAAQ,WAAW,GAAG;AAAA,UACpE;AAAA,QACJ;AACA,cAAM,IAAI;AAAA,UACR,YAAY,CAAC,EAAG,IAAI,CAAC,GAAG,OAAO,YAAY,IAAI,CAAC,SAAS,KAAK,EAAE,CAAE,CAAC;AAAA,QACrE;AACA,cAAM,KAAK;AAAA,UACT,QAAQ;AAAA,UACR,QAAQ;AAAA,UACR,WAAW;AAAA,UACX,aAAa,OAAO;AAAA,UACpB,SAAS;AAAA,YACP,OAAO,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC;AAAA,YACtB,OAAO,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC;AAAA,UACxB;AAAA,UACA,cAAc;AAAA,QAChB,CAAC;AAAA,MACH;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AAAA,IACL,OAAO,MAAM,KAAK,CAAC,GAAG,MAAM,EAAE,cAAc,EAAE,WAAW;AAAA,IACzD,YAAY,CAAC,GAAG,YAAY,KAAK,CAAC,EAAE,KAAK;AAAA,IACzC;AAAA,EACF;AACF;AAEA,SAAS,QAAQ,GAAa,GAAqB;AACjD,MAAI,EAAE,WAAW,EAAE,UAAU,EAAE,SAAS,EAAG,QAAO;AAClD,QAAM,KAAK,EAAE,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,EAAE;AAC5C,QAAM,KAAK,EAAE,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,EAAE;AAC5C,MAAI,MAAM,GACR,OAAO,GACP,OAAO;AACT,WAAS,IAAI,GAAG,IAAI,EAAE,QAAQ,KAAK;AACjC,UAAM,KAAK,EAAE,CAAC,IAAK;AACnB,UAAM,KAAK,EAAE,CAAC,IAAK;AACnB,WAAO,KAAK;AACZ,YAAQ,KAAK;AACb,YAAQ,KAAK;AAAA,EACf;AACA,MAAI,SAAS,KAAK,SAAS,EAAG,QAAO,SAAS,KAAK,SAAS,IAAI,IAAI;AACpE,SAAO,MAAM,KAAK,KAAK,OAAO,IAAI;AACpC;;;ACvFA,eAAsB,eACpB,OACA,SACA,SACyB;AACzB,QAAM,eAAe,MAAM,MAAM,SAAS,QAAQ,QAAQ;AAC1D,QAAM,gBAAgB,MAAM,MAAM,SAAS,QAAQ,SAAS;AAC5D,QAAM,UAAU,MAAM,QAAQ;AAAA,IAC5B,QAAQ,IAAI,OAAO,MAAM;AACvB,YAAM,UAAU,EAAE,WAAW,eAAe,EAAE,MAAM;AACpD,YAAM,WAAW,MAAM,WAAW,cAAc,SAAS,KAAK;AAC9D,YAAM,YAAY,MAAM,WAAW,eAAe,SAAS,KAAK;AAChE,aAAO,EAAE,QAAQ,EAAE,QAAQ,gBAAgB,EAAE,gBAAgB,UAAU,UAAU;AAAA,IACnF,CAAC;AAAA,EACH;AACA,SAAO,kBAAkB,SAAS,OAAO;AAC3C;AAEA,eAAe,WACb,MACA,SACA,OACmB;AACnB,QAAM,MAAgB,CAAC;AACvB,aAAW,KAAK,MAAM;AACpB,UAAM,IAAI,MAAM,QAAQ,GAAG,KAAK;AAChC,QAAI,MAAM,QAAQ,OAAO,SAAS,CAAC,EAAG,KAAI,KAAK,CAAC;AAAA,EAClD;AACA,SAAO;AACT;AAEA,SAAS,eAAe,QAAyE;AAC/F,SAAO,OAAO,KAAK,UAAU;AAC3B,YAAQ,QAAQ;AAAA,MACd,KAAK;AAAA,MACL,KAAK;AACH,eAAO,IAAI,SAAS,SAAS;AAAA,MAC/B,KAAK;AACH,eAAO,IAAI,SAAS,SAAS,OAAO,IAAI;AAAA,MAC1C,KAAK;AACH,eAAO,IAAI,WAAW,IAAI,YAAY,IAAI,UAAU,IAAI,YAAY;AAAA,MACtE,KAAK,WAAW;AACd,cAAM,MAAM,MAAM,SAAS,OAAO,IAAI,KAAK;AAC3C,eAAO,aAAa,GAAG,EAAE;AAAA,MAC3B;AAAA,MACA,KAAK,eAAe;AAClB,cAAM,MAAM,MAAM,SAAS,OAAO,IAAI,KAAK;AAC3C,eAAO,aAAa,GAAG,EAAE;AAAA,MAC3B;AAAA,MACA,KAAK,gBAAgB;AACnB,cAAM,MAAM,MAAM,SAAS,OAAO,IAAI,KAAK;AAC3C,eAAO,aAAa,GAAG,EAAE;AAAA,MAC3B;AAAA,MACA,KAAK,gBAAgB;AACnB,eAAO,gBAAgB,GAAG,MAAM,YAAY,IAAI;AAAA,MAClD;AAAA,MACA;AACE,eAAO;AAAA,IACX;AAAA,EACF;AACF;;;AClDA,eAAsB,cACpB,OACA,UAA4B,CAAC,GACH;AAC1B,QAAM,iBAAiB,QAAQ,kBAAkB;AACjD,QAAM,OAAO,QAAQ,QACjB,CAAC,EAAE,OAAO,QAAQ,MAAM,CAAC,KACxB,MAAM,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE;AAE5D,QAAM,WAA+B,CAAC;AACtC,aAAW,EAAE,MAAM,KAAK,MAAM;AAC5B,UAAM,QAAQ,MAAM,UAAU,OAAO,KAAK;AAC1C,UAAM,QAAQ,oBAAI,IAAsD;AACxE,eAAW,KAAK,OAAO;AACrB,YAAM,IAAI,QAAQ,EAAE,IAAI;AACxB,YAAM,MAAM,GAAG,EAAE,QAAQ,IAAI,CAAC;AAC9B,YAAM,SAAS,MAAM,IAAI,GAAG,KAAK,EAAE,OAAO,CAAC,GAAG,SAAS,EAAE;AACzD,aAAO,MAAM,KAAK,CAAC;AACnB,YAAM,IAAI,KAAK,MAAM;AAAA,IACvB;AACA,eAAW,CAAC,KAAK,EAAE,OAAO,SAAS,EAAE,CAAC,KAAK,OAAO;AAChD,UAAI,MAAM,SAAS,eAAgB;AACnC,YAAM,SAAS,CAAC,GAAG,KAAK,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,YAAY,EAAE,SAAS;AAClE,YAAM,QAAQ,OAAO,CAAC,EAAG;AACzB,YAAM,OAAO,OAAO,OAAO,SAAS,CAAC,EAAG;AACxC,eAAS,KAAK;AAAA,QACZ;AAAA,QACA,UAAU,IAAI,MAAM,GAAG,EAAE,CAAC;AAAA,QAC1B,SAAS;AAAA,QACT,aAAa,OAAO;AAAA,QACpB,SAAS,OAAO,IAAI,CAAC,MAAM,EAAE,MAAM;AAAA,QACnC,UAAU,OAAO;AAAA,MACnB,CAAC;AAAA,IACH;AAAA,EACF;AAEA,QAAM,eAAe,IAAI,IAAI,SAAS,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC;AACzD,SAAO;AAAA,IACL;AAAA,IACA,kBAAkB,KAAK,SAAS,IAAI,aAAa,OAAO,KAAK,SAAS;AAAA,IACtE,WAAW,KAAK;AAAA,EAClB;AACF;;;AC5CA,eAAsB,cACpB,OACA,UAA4B,CAAC,GACH;AAC1B,QAAM,OAAO,QAAQ,QAAQ,CAAC,QAAQ,KAAK,KAAK,MAAM,MAAM,SAAS,GAAG,IAAI,CAAC,MAAM,EAAE,KAAK;AAE1F,QAAM,QAA4B,CAAC;AACnC,MAAI,aAAa;AACjB,MAAI,cAAc;AAClB,aAAW,SAAS,MAAM;AACxB,UAAM,QAAQ,MAAM,UAAU,OAAO,KAAK;AAC1C,QAAI,MAAM,WAAW,GAAG;AACtB,YAAM,KAAK,EAAE,OAAO,aAAa,GAAG,YAAY,GAAG,WAAW,EAAE,CAAC;AACjE;AAAA,IACF;AACA,UAAM,OAAO,MAAM,SAAS,OAAO,KAAK;AACxC,QAAI,SAAS;AACb,eAAW,KAAK,OAAO;AACrB,UAAI,EAAE,WAAW,SAAS;AACxB;AACA;AAAA,MACF;AACA,YAAM,WAAW,KAAK,OAAO,CAAC,MAAM,EAAE,YAAY,EAAE,SAAS;AAC7D,UAAI,QAAQ,aAAa;AACvB,YAAI,CAAC,QAAQ,YAAY,GAAG,EAAE,KAAK,SAAS,CAAC,EAAG;AAAA,MAClD,OAAO;AAGL,cAAM,YAAY,UAAU,EAAE,MAAM;AACpC,cAAM,OAAO,SAAS;AAAA,UAAK,CAAC,MAC1B,EAAE,SAAS;AAAA,YACT,CAAC,MACC,OAAO,EAAE,YAAY,YACrB,aACA,EAAE,QAAQ,SAAS,UAAU,MAAM,GAAG,GAAG,CAAC;AAAA,UAC9C;AAAA,QACF;AACA,YAAI,CAAC,KAAM;AAAA,MACb;AAAA,IACF;AACA,UAAM,YAAY,SAAS,MAAM;AACjC,UAAM,KAAK,EAAE,OAAO,aAAa,QAAQ,YAAY,MAAM,QAAQ,UAAU,CAAC;AAC9E,kBAAc,MAAM;AACpB,mBAAe;AAAA,EACjB;AACA,SAAO,EAAE,OAAO,kBAAkB,aAAa,IAAI,cAAc,aAAa,EAAE;AAClF;AAEA,SAAS,UAAU,GAAoB;AACrC,MAAI,MAAM,QAAQ,MAAM,OAAW,QAAO;AAC1C,MAAI,OAAO,MAAM,SAAU,QAAO;AAClC,MAAI;AACF,WAAO,KAAK,UAAU,CAAC;AAAA,EACzB,QAAQ;AACN,WAAO,OAAO,CAAC;AAAA,EACjB;AACF;","names":[]}
@@ -1,6 +1,3 @@
1
- import { C as CaptureIntegrityError } from './errors-mje_cKOs.js';
2
- import { T as TraceStore } from './store-Db2Bv8Cf.js';
3
-
4
1
  /**
5
2
  * RawProviderSink — first-class persistence for the actual HTTP-level
6
3
  * request/response bodies of every LLM provider call.
@@ -132,80 +129,4 @@ declare class FileSystemRawProviderSink implements RawProviderSink {
132
129
  */
133
130
  declare function providerFromBaseUrl(baseUrl: string): string;
134
131
 
135
- /**
136
- * Run-completion integrity check — at end of run, verify the expected event
137
- * types were actually captured. The point is the launch-review failure mode:
138
- * a run *appears* successful but the raw provider events were never written,
139
- * so a downstream reviewer can't reconstruct what happened.
140
- *
141
- * Pattern:
142
- *
143
- * const report = await assertRunCaptured(store, runId, {
144
- * llmSpansMin: 1,
145
- * judgeSpansMin: 1,
146
- * rawSink: providerSink, // must have ≥ 1 event for this run
147
- * requireRawCoverageOfLlmSpans: true, // every llm span has matching raw events
148
- * })
149
- * if (!report.ok) throwIfRunIncomplete(report) // or mark run failed and continue
150
- *
151
- * The function is read-only on the store and returns a structured report;
152
- * the caller chooses the failure mode (throw, mark run failed, log warning).
153
- * `throwIfRunIncomplete` is the convenient strict mode.
154
- */
155
-
156
- interface RunIntegrityExpectations {
157
- /** Minimum LLM span count. Default 0 (no requirement). */
158
- llmSpansMin?: number;
159
- /** Minimum judge span count. Default 0. */
160
- judgeSpansMin?: number;
161
- /** Minimum tool span count. Default 0. */
162
- toolSpansMin?: number;
163
- /**
164
- * Raw provider sink to consult for capture verification. When present,
165
- * the check requires at least one raw event for the run.
166
- */
167
- rawSink?: RawProviderSink;
168
- /** Minimum raw provider event count. Default 0; ignored when `rawSink` absent. */
169
- rawProviderEventsMin?: number;
170
- /**
171
- * Every LLM span must have at least one matching raw `request` event
172
- * (matched by spanId). Catches the common bug where the structured span
173
- * was emitted but the raw HTTP capture was wired to a different sink.
174
- */
175
- requireRawCoverageOfLlmSpans?: boolean;
176
- /** Run outcome must be set (not null/undefined). Default false. */
177
- requireOutcome?: boolean;
178
- }
179
- type RunIntegrityIssueCode = 'no_run' | 'missing_llm_spans' | 'missing_judge_spans' | 'missing_tool_spans' | 'missing_raw_events' | 'no_raw_sink' | 'orphan_llm_span' | 'missing_outcome';
180
- interface RunIntegrityIssue {
181
- code: RunIntegrityIssueCode;
182
- message: string;
183
- detail?: Record<string, unknown>;
184
- }
185
- interface RunIntegrityReport {
186
- ok: boolean;
187
- runId: string;
188
- llmSpanCount: number;
189
- judgeSpanCount: number;
190
- toolSpanCount: number;
191
- rawProviderEventCount: number;
192
- /**
193
- * Coverage of LLM spans by raw provider events keyed on spanId.
194
- * `total` is the number of LLM spans; `covered` is the count with at
195
- * least one matching `request` raw event.
196
- */
197
- rawSpanCoverage: {
198
- covered: number;
199
- total: number;
200
- };
201
- issues: RunIntegrityIssue[];
202
- }
203
- declare class RunIntegrityError extends CaptureIntegrityError {
204
- readonly report: RunIntegrityReport;
205
- constructor(report: RunIntegrityReport);
206
- }
207
- declare function assertRunCaptured(store: TraceStore, runId: string, expectations?: RunIntegrityExpectations): Promise<RunIntegrityReport>;
208
- /** Strict mode: throws `RunIntegrityError` when the report isn't ok. */
209
- declare function throwIfRunIncomplete(report: RunIntegrityReport): void;
210
-
211
- export { FileSystemRawProviderSink as F, InMemoryRawProviderSink as I, NoopRawProviderSink as N, type ProviderRedactor as P, type RawProviderSink as R, type RunIntegrityExpectations as a, type RunIntegrityReport as b, type FileSystemRawProviderSinkOptions as c, type InMemoryRawProviderSinkOptions as d, type RawProviderDirection as e, type RawProviderEvent as f, type RawProviderSinkFilter as g, RunIntegrityError as h, type RunIntegrityIssue as i, type RunIntegrityIssueCode as j, assertRunCaptured as k, defaultProviderRedactor as l, providerFromBaseUrl as p, throwIfRunIncomplete as t };
132
+ export { FileSystemRawProviderSink as F, InMemoryRawProviderSink as I, NoopRawProviderSink as N, type ProviderRedactor as P, type RawProviderSink as R, type FileSystemRawProviderSinkOptions as a, type InMemoryRawProviderSinkOptions as b, type RawProviderDirection as c, type RawProviderEvent as d, type RawProviderSinkFilter as e, defaultProviderRedactor as f, providerFromBaseUrl as p };