@tangle-network/agent-eval 0.41.0 → 0.42.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/campaign/index.d.ts +48 -368
- package/dist/campaign/index.js +67 -1
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-AU2JLNSZ.js → chunk-H4TOS272.js} +1 -65
- package/dist/chunk-H4TOS272.js.map +1 -0
- package/dist/{chunk-NKLGKF2Q.js → chunk-KQ26DYTQ.js} +2 -18
- package/dist/chunk-KQ26DYTQ.js.map +1 -0
- package/dist/{chunk-EGIPWXHL.js → chunk-MNL6LXGQ.js} +98 -2
- package/dist/chunk-MNL6LXGQ.js.map +1 -0
- package/dist/{chunk-5U2DOJU4.js → chunk-N4SBKEPJ.js} +199 -2
- package/dist/chunk-N4SBKEPJ.js.map +1 -0
- package/dist/{chunk-LCIDRYGP.js → chunk-PD3MH6WU.js} +8 -8
- package/dist/{control-CmLJk3IG.d.ts → control-ojEWkMfJ.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/{feedback-trajectory-Dvy-bt7x.d.ts → feedback-trajectory-BSxqEpu7.d.ts} +1 -1
- package/dist/index.d.ts +227 -687
- package/dist/index.js +753 -1237
- package/dist/index.js.map +1 -1
- package/dist/integrity-CTDhR1Sg.d.ts +81 -0
- package/dist/llm-client-BXVRUZyX.d.ts +234 -0
- package/dist/openapi.json +1 -1
- package/dist/pipelines/index.js +67 -3
- package/dist/pipelines/index.js.map +1 -1
- package/dist/{integrity-DYR5gWlb.d.ts → raw-provider-sink-C46HDghv.d.ts} +1 -80
- package/dist/{release-report-Di84bXD7.d.ts → release-report-BtpgWRI0.d.ts} +21 -3
- package/dist/reporting.d.ts +2 -3
- package/dist/reporting.js +4 -8
- package/dist/{researcher-DeZ_EArp.d.ts → researcher-CoJMs2Iz.d.ts} +116 -205
- package/dist/rl.d.ts +103 -221
- package/dist/rl.js +44 -199
- package/dist/rl.js.map +1 -1
- package/dist/sequential-DdV5ShjT.d.ts +561 -0
- package/dist/traces.d.ts +3 -2
- package/dist/traces.js +5 -5
- package/dist/types-BLbRTxoc.d.ts +367 -0
- package/dist/wire/index.d.ts +1 -1
- package/package.json +1 -6
- package/dist/chunk-5U2DOJU4.js.map +0 -1
- package/dist/chunk-AU2JLNSZ.js.map +0 -1
- package/dist/chunk-DMW5VENN.js +0 -1412
- package/dist/chunk-DMW5VENN.js.map +0 -1
- package/dist/chunk-EGIPWXHL.js.map +0 -1
- package/dist/chunk-MAZ26DC7.js +0 -99
- package/dist/chunk-MAZ26DC7.js.map +0 -1
- package/dist/chunk-NKLGKF2Q.js.map +0 -1
- package/dist/multi-layer-verifier-BNi4-8lR.d.ts +0 -141
- package/dist/optimization.d.ts +0 -11
- package/dist/optimization.js +0 -71
- package/dist/optimization.js.map +0 -1
- package/dist/sequential-5iSVfzl2.d.ts +0 -139
- package/dist/summary-report-DuZXOk7K.d.ts +0 -917
- package/dist/{chunk-LCIDRYGP.js.map → chunk-PD3MH6WU.js.map} +0 -0
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import { C as CaptureIntegrityError } from './errors-mje_cKOs.js';
|
|
2
|
+
import { R as RawProviderSink } from './raw-provider-sink-C46HDghv.js';
|
|
3
|
+
import { T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Run-completion integrity check — at end of run, verify the expected event
|
|
7
|
+
* types were actually captured. The point is the launch-review failure mode:
|
|
8
|
+
* a run *appears* successful but the raw provider events were never written,
|
|
9
|
+
* so a downstream reviewer can't reconstruct what happened.
|
|
10
|
+
*
|
|
11
|
+
* Pattern:
|
|
12
|
+
*
|
|
13
|
+
* const report = await assertRunCaptured(store, runId, {
|
|
14
|
+
* llmSpansMin: 1,
|
|
15
|
+
* judgeSpansMin: 1,
|
|
16
|
+
* rawSink: providerSink, // must have ≥ 1 event for this run
|
|
17
|
+
* requireRawCoverageOfLlmSpans: true, // every llm span has matching raw events
|
|
18
|
+
* })
|
|
19
|
+
* if (!report.ok) throwIfRunIncomplete(report) // or mark run failed and continue
|
|
20
|
+
*
|
|
21
|
+
* The function is read-only on the store and returns a structured report;
|
|
22
|
+
* the caller chooses the failure mode (throw, mark run failed, log warning).
|
|
23
|
+
* `throwIfRunIncomplete` is the convenient strict mode.
|
|
24
|
+
*/
|
|
25
|
+
|
|
26
|
+
interface RunIntegrityExpectations {
|
|
27
|
+
/** Minimum LLM span count. Default 0 (no requirement). */
|
|
28
|
+
llmSpansMin?: number;
|
|
29
|
+
/** Minimum judge span count. Default 0. */
|
|
30
|
+
judgeSpansMin?: number;
|
|
31
|
+
/** Minimum tool span count. Default 0. */
|
|
32
|
+
toolSpansMin?: number;
|
|
33
|
+
/**
|
|
34
|
+
* Raw provider sink to consult for capture verification. When present,
|
|
35
|
+
* the check requires at least one raw event for the run.
|
|
36
|
+
*/
|
|
37
|
+
rawSink?: RawProviderSink;
|
|
38
|
+
/** Minimum raw provider event count. Default 0; ignored when `rawSink` absent. */
|
|
39
|
+
rawProviderEventsMin?: number;
|
|
40
|
+
/**
|
|
41
|
+
* Every LLM span must have at least one matching raw `request` event
|
|
42
|
+
* (matched by spanId). Catches the common bug where the structured span
|
|
43
|
+
* was emitted but the raw HTTP capture was wired to a different sink.
|
|
44
|
+
*/
|
|
45
|
+
requireRawCoverageOfLlmSpans?: boolean;
|
|
46
|
+
/** Run outcome must be set (not null/undefined). Default false. */
|
|
47
|
+
requireOutcome?: boolean;
|
|
48
|
+
}
|
|
49
|
+
type RunIntegrityIssueCode = 'no_run' | 'missing_llm_spans' | 'missing_judge_spans' | 'missing_tool_spans' | 'missing_raw_events' | 'no_raw_sink' | 'orphan_llm_span' | 'missing_outcome';
|
|
50
|
+
interface RunIntegrityIssue {
|
|
51
|
+
code: RunIntegrityIssueCode;
|
|
52
|
+
message: string;
|
|
53
|
+
detail?: Record<string, unknown>;
|
|
54
|
+
}
|
|
55
|
+
interface RunIntegrityReport {
|
|
56
|
+
ok: boolean;
|
|
57
|
+
runId: string;
|
|
58
|
+
llmSpanCount: number;
|
|
59
|
+
judgeSpanCount: number;
|
|
60
|
+
toolSpanCount: number;
|
|
61
|
+
rawProviderEventCount: number;
|
|
62
|
+
/**
|
|
63
|
+
* Coverage of LLM spans by raw provider events keyed on spanId.
|
|
64
|
+
* `total` is the number of LLM spans; `covered` is the count with at
|
|
65
|
+
* least one matching `request` raw event.
|
|
66
|
+
*/
|
|
67
|
+
rawSpanCoverage: {
|
|
68
|
+
covered: number;
|
|
69
|
+
total: number;
|
|
70
|
+
};
|
|
71
|
+
issues: RunIntegrityIssue[];
|
|
72
|
+
}
|
|
73
|
+
declare class RunIntegrityError extends CaptureIntegrityError {
|
|
74
|
+
readonly report: RunIntegrityReport;
|
|
75
|
+
constructor(report: RunIntegrityReport);
|
|
76
|
+
}
|
|
77
|
+
declare function assertRunCaptured(store: TraceStore, runId: string, expectations?: RunIntegrityExpectations): Promise<RunIntegrityReport>;
|
|
78
|
+
/** Strict mode: throws `RunIntegrityError` when the report isn't ok. */
|
|
79
|
+
declare function throwIfRunIncomplete(report: RunIntegrityReport): void;
|
|
80
|
+
|
|
81
|
+
export { type RunIntegrityExpectations as R, type RunIntegrityReport as a, RunIntegrityError as b, type RunIntegrityIssue as c, type RunIntegrityIssueCode as d, assertRunCaptured as e, throwIfRunIncomplete as t };
|
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
import { A as AgentEvalError, C as CaptureIntegrityError } from './errors-mje_cKOs.js';
|
|
2
|
+
import { R as RawProviderSink, P as ProviderRedactor } from './raw-provider-sink-C46HDghv.js';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* LLM client with graceful degrade.
|
|
6
|
+
*
|
|
7
|
+
* OpenAI-compatible `/v1/chat/completions` client with:
|
|
8
|
+
* - Exponential-backoff retry on 429 + 5xx gateway errors (502/503/504).
|
|
9
|
+
* - Retry on transient network errors (fetch failed, AbortError, ECONNRESET).
|
|
10
|
+
* - Graceful json_schema → json_object degrade on 400 with schema-reject body.
|
|
11
|
+
* - Fenced-JSON stripping (```json ... ```) for models that wrap structured output.
|
|
12
|
+
* - Configurable base URL + api key / bearer, works with LiteLLM proxies, OpenAI
|
|
13
|
+
* directly, cli-bridge subscriptions, and any router that speaks the spec.
|
|
14
|
+
*
|
|
15
|
+
* Usage:
|
|
16
|
+
* const { value, result } = await callLlmJson<MyType>(
|
|
17
|
+
* { model: 'gpt-4o', messages: [...], jsonSchema: { name: 'x', schema: {...} } },
|
|
18
|
+
* { baseUrl: 'https://router.tangle.tools/v1', apiKey: process.env.KEY },
|
|
19
|
+
* )
|
|
20
|
+
*
|
|
21
|
+
* This is THE llm-calling seam for agent-eval primitives that need structured
|
|
22
|
+
* output (semantic concept judge, reviewer directives, critic scores). Primitives
|
|
23
|
+
* that need free-form text use `callLlm` and parse output themselves.
|
|
24
|
+
*/
|
|
25
|
+
|
|
26
|
+
interface LlmMessage {
|
|
27
|
+
role: 'system' | 'user' | 'assistant';
|
|
28
|
+
/**
|
|
29
|
+
* Either a plain text content string OR a multimodal content array
|
|
30
|
+
* (text + image_url parts) for vision-capable models.
|
|
31
|
+
*/
|
|
32
|
+
content: string | Array<{
|
|
33
|
+
type: 'text';
|
|
34
|
+
text: string;
|
|
35
|
+
} | {
|
|
36
|
+
type: 'image_url';
|
|
37
|
+
image_url: {
|
|
38
|
+
url: string;
|
|
39
|
+
detail?: 'auto' | 'low' | 'high';
|
|
40
|
+
};
|
|
41
|
+
}>;
|
|
42
|
+
}
|
|
43
|
+
interface LlmCallRequest {
|
|
44
|
+
model: string;
|
|
45
|
+
messages: LlmMessage[];
|
|
46
|
+
/** Optional JSON-mode response format (response_format: json_object). */
|
|
47
|
+
jsonMode?: boolean;
|
|
48
|
+
/** Optional structured output via JSON Schema. Falls back to json_object on 400. */
|
|
49
|
+
jsonSchema?: {
|
|
50
|
+
name: string;
|
|
51
|
+
schema: Record<string, unknown>;
|
|
52
|
+
};
|
|
53
|
+
temperature?: number;
|
|
54
|
+
maxTokens?: number;
|
|
55
|
+
/** Per-call timeout, default 60s. */
|
|
56
|
+
timeoutMs?: number;
|
|
57
|
+
}
|
|
58
|
+
interface LlmUsage {
|
|
59
|
+
promptTokens: number;
|
|
60
|
+
completionTokens: number;
|
|
61
|
+
totalTokens: number;
|
|
62
|
+
/** Proxies populate this when prompt caching is on. */
|
|
63
|
+
cachedPromptTokens?: number;
|
|
64
|
+
}
|
|
65
|
+
interface LlmCallResult {
|
|
66
|
+
/** The text content of the first choice. Empty string if none. */
|
|
67
|
+
content: string;
|
|
68
|
+
usage: LlmUsage;
|
|
69
|
+
/**
|
|
70
|
+
* Cost in USD. Pulled from proxy's `_response_cost` field when present;
|
|
71
|
+
* `null` when neither the proxy nor the caller can derive it.
|
|
72
|
+
*/
|
|
73
|
+
costUsd: number | null;
|
|
74
|
+
/** Model name actually used (echoed from response). */
|
|
75
|
+
model: string;
|
|
76
|
+
/** Wall-clock duration of the HTTP call (last attempt, if retried). */
|
|
77
|
+
durationMs: number;
|
|
78
|
+
/** Raw response body. */
|
|
79
|
+
raw: Record<string, unknown>;
|
|
80
|
+
}
|
|
81
|
+
declare class LlmCallError extends AgentEvalError {
|
|
82
|
+
readonly status: number;
|
|
83
|
+
readonly body: string;
|
|
84
|
+
readonly model: string;
|
|
85
|
+
constructor(message: string, status: number, body: string, model: string);
|
|
86
|
+
}
|
|
87
|
+
interface LlmClientOptions {
|
|
88
|
+
/** Base URL (without trailing slash). Must end at the `/v1` prefix. */
|
|
89
|
+
baseUrl?: string;
|
|
90
|
+
/** Bearer token — either `apiKey` or `bearer` populates `Authorization: Bearer ...`. */
|
|
91
|
+
apiKey?: string;
|
|
92
|
+
bearer?: string;
|
|
93
|
+
/** Override for the `Authorization` header (e.g. `X-Auth: ...`). Takes precedence over apiKey/bearer. */
|
|
94
|
+
authHeader?: {
|
|
95
|
+
name: string;
|
|
96
|
+
value: string;
|
|
97
|
+
};
|
|
98
|
+
/** Default timeout in ms. Per-call can override. */
|
|
99
|
+
defaultTimeoutMs?: number;
|
|
100
|
+
/** Max retry attempts on retriable errors. Default 3 (1 initial + 2 retries). */
|
|
101
|
+
maxRetries?: number;
|
|
102
|
+
/** Fetch implementation — defaults to global `fetch`. Override for custom transport (e.g. tests). */
|
|
103
|
+
fetch?: typeof fetch;
|
|
104
|
+
/**
|
|
105
|
+
* Optional raw HTTP capture sink. When provided, every request, response,
|
|
106
|
+
* and error (across all retry attempts) is recorded to the sink, with auth
|
|
107
|
+
* headers and credential-shaped body fields redacted by default. This is
|
|
108
|
+
* the layer-1 forensics primitive: structured `LlmSpan`s record intent,
|
|
109
|
+
* raw events record what actually crossed the wire.
|
|
110
|
+
*/
|
|
111
|
+
rawSink?: RawProviderSink;
|
|
112
|
+
/**
|
|
113
|
+
* Logical provider id attached to raw events. When omitted, derived from
|
|
114
|
+
* `baseUrl` via `providerFromBaseUrl`.
|
|
115
|
+
*/
|
|
116
|
+
provider?: string;
|
|
117
|
+
/** Trace context attached to raw events; populated by emitter-aware callers. */
|
|
118
|
+
traceContext?: {
|
|
119
|
+
runId?: string;
|
|
120
|
+
spanId?: string;
|
|
121
|
+
};
|
|
122
|
+
/** Override the redaction strategy for this call. Defaults to `defaultProviderRedactor`. */
|
|
123
|
+
redactor?: ProviderRedactor;
|
|
124
|
+
}
|
|
125
|
+
/**
|
|
126
|
+
* True when an error is a transient transport/network fault worth retrying,
|
|
127
|
+
* as opposed to a deterministic failure (4xx schema reject, JSON parse) that
|
|
128
|
+
* a retry cannot fix. Inspects `LlmCallError.status`, then the error's
|
|
129
|
+
* name/message/code, then recurses into `error.cause` — undici nests the
|
|
130
|
+
* real socket fault one or more levels under `.cause`.
|
|
131
|
+
*
|
|
132
|
+
* This is THE retry classifier for the package: `callLlm` and
|
|
133
|
+
* `withJudgeRetry` both route through it, so a connection-class error is
|
|
134
|
+
* treated identically whether it surfaces in the HTTP client or a
|
|
135
|
+
* TCloud-backed judge.
|
|
136
|
+
*/
|
|
137
|
+
declare function isTransientLlmError(err: unknown): boolean;
|
|
138
|
+
/** Exponential backoff: 500ms, 1s, 2s, 4s, ... capped at 16s. Attempt is 0-indexed. */
|
|
139
|
+
declare function backoffMs(attempt: number): number;
|
|
140
|
+
/**
|
|
141
|
+
* Strip a ```json / ``` code fence if the model emitted one.
|
|
142
|
+
* Idempotent for naked JSON. Some models (claude-code via router, certain
|
|
143
|
+
* deepseek models) wrap output even under json_object.
|
|
144
|
+
*/
|
|
145
|
+
declare function stripFencedJson(raw: string): string;
|
|
146
|
+
/**
|
|
147
|
+
* Low-level call. Returns raw content + usage + cost. Retries on transient
|
|
148
|
+
* failures; does NOT degrade schema here — callers that want graceful
|
|
149
|
+
* degrade use `callLlmJson`.
|
|
150
|
+
*/
|
|
151
|
+
declare function callLlm(req: LlmCallRequest, opts?: LlmClientOptions): Promise<LlmCallResult>;
|
|
152
|
+
/**
|
|
153
|
+
* Structured-output call. Returns parsed JSON plus the raw result envelope.
|
|
154
|
+
* Degrades `jsonSchema` → `jsonMode` on a 400 that names the schema param —
|
|
155
|
+
* critical for deepseek-v3/v4, kimi-k2.6, and other models that don't accept
|
|
156
|
+
* the `response_format.json_schema` shape but DO accept `json_object`.
|
|
157
|
+
*/
|
|
158
|
+
declare function callLlmJson<T = unknown>(req: LlmCallRequest, opts?: LlmClientOptions): Promise<{
|
|
159
|
+
value: T;
|
|
160
|
+
result: LlmCallResult;
|
|
161
|
+
}>;
|
|
162
|
+
type LlmRouteAssertionReason = 'no_explicit_base_url' | 'base_url_blocked' | 'base_url_not_allowed' | 'no_auth' | 'wrong_provider';
|
|
163
|
+
declare class LlmRouteAssertionError extends CaptureIntegrityError {
|
|
164
|
+
readonly reason: LlmRouteAssertionReason;
|
|
165
|
+
readonly baseUrl: string;
|
|
166
|
+
constructor(message: string, reason: LlmRouteAssertionReason, baseUrl: string);
|
|
167
|
+
}
|
|
168
|
+
interface LlmRouteRequirements {
|
|
169
|
+
/**
|
|
170
|
+
* Throw if `opts.baseUrl` is undefined, i.e. the call would fall back to
|
|
171
|
+
* `DEFAULT_BASE_URL`. Set this for evaluation runs where silently using
|
|
172
|
+
* the public/free-tier router is a defect — the launch reviewer needs to
|
|
173
|
+
* know exactly which provider answered.
|
|
174
|
+
*/
|
|
175
|
+
requireExplicitBaseUrl?: boolean;
|
|
176
|
+
/**
|
|
177
|
+
* Allowlist of acceptable base URLs. Strings match by prefix
|
|
178
|
+
* (case-insensitive); RegExps test against the full base URL.
|
|
179
|
+
*/
|
|
180
|
+
allowedBaseUrls?: Array<string | RegExp>;
|
|
181
|
+
/** Blocklist that takes precedence over `allowedBaseUrls`. */
|
|
182
|
+
blockedBaseUrls?: Array<string | RegExp>;
|
|
183
|
+
/** Throw if no auth header / api key is configured. */
|
|
184
|
+
requireAuth?: boolean;
|
|
185
|
+
/**
|
|
186
|
+
* Logical provider id the configured `baseUrl` is expected to match (via
|
|
187
|
+
* `providerFromBaseUrl`). Mainly useful when paired with `requireExplicitBaseUrl`.
|
|
188
|
+
*/
|
|
189
|
+
expectedProvider?: string;
|
|
190
|
+
}
|
|
191
|
+
/**
|
|
192
|
+
* Fail-loud assertion that the configured LLM client points at the route
|
|
193
|
+
* the caller intends. Designed for the matrix-runner preflight: invoke
|
|
194
|
+
* once before any LLM call to catch misconfiguration before a sweep burns
|
|
195
|
+
* dollars on the wrong provider.
|
|
196
|
+
*
|
|
197
|
+
* Throws `LlmRouteAssertionError`. Pure — no I/O — so it's safe to call
|
|
198
|
+
* from constructors and CI gates.
|
|
199
|
+
*/
|
|
200
|
+
declare function assertLlmRoute(opts: LlmClientOptions, req?: LlmRouteRequirements): void;
|
|
201
|
+
/**
|
|
202
|
+
* Probe whether a model is reachable. Returns latency + null error on
|
|
203
|
+
* success; `ok=false` + error message on any failure (HTTP, timeout,
|
|
204
|
+
* network, parse). Designed for sweep preflights — fail loud at the
|
|
205
|
+
* boundary before burning a 30-leaf run on a misconfigured router.
|
|
206
|
+
*
|
|
207
|
+
* Sends a tiny `ping` message with `maxTokens=64`. Reasoning models
|
|
208
|
+
* (glm-5.1, deepseek-v4) can burn the entire budget on internal reasoning
|
|
209
|
+
* for short prompts, so don't tighten this further. We don't validate
|
|
210
|
+
* content; HTTP 200 means reachable.
|
|
211
|
+
*/
|
|
212
|
+
declare function probeLlm(model: string, opts?: LlmClientOptions & {
|
|
213
|
+
timeoutMs?: number;
|
|
214
|
+
}): Promise<{
|
|
215
|
+
ok: boolean;
|
|
216
|
+
latencyMs: number;
|
|
217
|
+
error: string | null;
|
|
218
|
+
}>;
|
|
219
|
+
/**
|
|
220
|
+
* Stateful client — construct once with defaults, call many times.
|
|
221
|
+
* Thin wrapper around the free functions; exists for callers that want
|
|
222
|
+
* to inject a single configured instance into multiple primitives.
|
|
223
|
+
*/
|
|
224
|
+
declare class LlmClient {
|
|
225
|
+
private readonly opts;
|
|
226
|
+
constructor(opts?: LlmClientOptions);
|
|
227
|
+
call(req: LlmCallRequest, per?: LlmClientOptions): Promise<LlmCallResult>;
|
|
228
|
+
callJson<T = unknown>(req: LlmCallRequest, per?: LlmClientOptions): Promise<{
|
|
229
|
+
value: T;
|
|
230
|
+
result: LlmCallResult;
|
|
231
|
+
}>;
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
export { type LlmClientOptions as L, type LlmRouteRequirements as a, type LlmCallRequest as b, type LlmCallResult as c, LlmCallError as d, LlmClient as e, type LlmMessage as f, LlmRouteAssertionError as g, type LlmUsage as h, assertLlmRoute as i, backoffMs as j, callLlm as k, callLlmJson as l, isTransientLlmError as m, probeLlm as p, stripFencedJson as s };
|
package/dist/openapi.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"openapi": "3.1.0",
|
|
3
3
|
"info": {
|
|
4
4
|
"title": "@tangle-network/agent-eval — wire protocol",
|
|
5
|
-
"version": "0.41.0",
|
|
5
|
+
"version": "0.42.0",
|
|
6
6
|
"description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
|
|
7
7
|
"contact": {
|
|
8
8
|
"name": "Tangle Network",
|
package/dist/pipelines/index.js
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
import {
|
|
2
|
+
DEFAULT_RULES,
|
|
3
|
+
classifyFailure,
|
|
2
4
|
compareToBaseline,
|
|
3
|
-
computeToolUseMetrics
|
|
4
|
-
|
|
5
|
-
} from "../chunk-AU2JLNSZ.js";
|
|
5
|
+
computeToolUseMetrics
|
|
6
|
+
} from "../chunk-H4TOS272.js";
|
|
6
7
|
import {
|
|
7
8
|
buildTrajectory
|
|
8
9
|
} from "../chunk-RZTMDUO7.js";
|
|
@@ -61,6 +62,69 @@ async function budgetBreachView(store, options = {}) {
|
|
|
61
62
|
};
|
|
62
63
|
}
|
|
63
64
|
|
|
65
|
+
// src/pipelines/failure-cluster.ts
|
|
66
|
+
async function failureClusterView(store, options = {}) {
|
|
67
|
+
const rules = options.rules ?? DEFAULT_RULES;
|
|
68
|
+
const minSize = options.minClusterSize ?? 1;
|
|
69
|
+
const runs = await store.listRuns();
|
|
70
|
+
const clusters = /* @__PURE__ */ new Map();
|
|
71
|
+
let totalFailures = 0;
|
|
72
|
+
for (const run of runs) {
|
|
73
|
+
if (run.status === "completed" && run.outcome?.pass !== false) continue;
|
|
74
|
+
totalFailures++;
|
|
75
|
+
const spans = await store.spans({ runId: run.runId });
|
|
76
|
+
const events = await store.events({ runId: run.runId });
|
|
77
|
+
const cls = classifyFailure({ run, spans, events }, rules);
|
|
78
|
+
let toolName;
|
|
79
|
+
let argPrefix;
|
|
80
|
+
let dimension;
|
|
81
|
+
if (cls.triggerSpanId) {
|
|
82
|
+
const trig = spans.find((s) => s.spanId === cls.triggerSpanId);
|
|
83
|
+
if (trig?.kind === "tool") {
|
|
84
|
+
toolName = trig.toolName;
|
|
85
|
+
argPrefix = argHash(trig.args).slice(0, 16);
|
|
86
|
+
} else if (trig?.kind === "judge") {
|
|
87
|
+
dimension = trig.dimension;
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
if (!toolName) {
|
|
91
|
+
const ts = await toolSpans(store, run.runId);
|
|
92
|
+
const errored = ts.filter((t) => t.status === "error").pop();
|
|
93
|
+
if (errored) {
|
|
94
|
+
toolName = errored.toolName;
|
|
95
|
+
argPrefix = argHash(errored.args).slice(0, 16);
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
if (!dimension) {
|
|
99
|
+
const judge = spans.find((s) => s.kind === "judge" && typeof s.dimension === "string");
|
|
100
|
+
if (judge?.kind === "judge") dimension = judge.dimension;
|
|
101
|
+
}
|
|
102
|
+
const key = `${cls.failureClass}|${toolName ?? ""}|${argPrefix ?? ""}|${dimension ?? ""}`;
|
|
103
|
+
let cluster = clusters.get(key);
|
|
104
|
+
if (!cluster) {
|
|
105
|
+
cluster = {
|
|
106
|
+
failureClass: cls.failureClass,
|
|
107
|
+
toolName,
|
|
108
|
+
argPrefix,
|
|
109
|
+
dimension,
|
|
110
|
+
runCount: 0,
|
|
111
|
+
scenarioIds: [],
|
|
112
|
+
exampleRunId: run.runId,
|
|
113
|
+
exampleError: firstErrorMessage(spans) ?? cls.reason
|
|
114
|
+
};
|
|
115
|
+
clusters.set(key, cluster);
|
|
116
|
+
}
|
|
117
|
+
cluster.runCount++;
|
|
118
|
+
if (!cluster.scenarioIds.includes(run.scenarioId)) cluster.scenarioIds.push(run.scenarioId);
|
|
119
|
+
}
|
|
120
|
+
const arr = [...clusters.values()].filter((c) => c.runCount >= minSize).sort((a, b) => b.runCount - a.runCount);
|
|
121
|
+
return { clusters: arr, totalFailures, totalRuns: runs.length };
|
|
122
|
+
}
|
|
123
|
+
function firstErrorMessage(spans) {
|
|
124
|
+
const errored = spans.find((s) => s.status === "error");
|
|
125
|
+
return errored?.error;
|
|
126
|
+
}
|
|
127
|
+
|
|
64
128
|
// src/pipelines/first-divergence.ts
|
|
65
129
|
async function firstDivergenceView(store, runA, runB, options = {}) {
|
|
66
130
|
const [a, b] = await Promise.all([buildTrajectory(store, runA), buildTrajectory(store, runB)]);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/pipelines/budget-breach.ts","../../src/pipelines/first-divergence.ts","../../src/pipelines/judge-agreement.ts","../../src/pipelines/regression.ts","../../src/pipelines/stuck-loop.ts","../../src/pipelines/tool-waste.ts"],"sourcesContent":["/**\n * BudgetBreachView — aggregates breach events across the corpus.\n *\n * Answers: which dimensions get hit most often? Which scenarios are\n * underbudgeted? Which variants trigger the most breaches?\n */\n\nimport type { BudgetSpec } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\n\nexport interface BudgetBreachFinding {\n runId: string\n scenarioId: string\n variantId?: string\n dimension: keyof BudgetSpec\n limit: number\n consumed: number\n excessRatio: number\n timestamp: number\n}\n\nexport interface BudgetBreachReport {\n findings: BudgetBreachFinding[]\n byDimension: Record<string, number>\n byScenario: Record<string, number>\n byVariant: Record<string, number>\n totalRuns: number\n breachedRunRatio: number\n}\n\nexport async function budgetBreachView(\n store: TraceStore,\n options: { scenarioId?: string; variantId?: string } = {},\n): Promise<BudgetBreachReport> {\n const runs = await store.listRuns({\n scenarioId: options.scenarioId,\n variantId: options.variantId,\n })\n const findings: BudgetBreachFinding[] = []\n const byDimension: Record<string, number> = {}\n const byScenario: Record<string, number> = {}\n const byVariant: Record<string, number> = {}\n\n for (const run of runs) {\n const entries = await store.budget(run.runId)\n for (const e of entries) {\n if (!e.breached) continue\n const excessRatio = e.limit > 0 ? e.consumed / e.limit : Infinity\n findings.push({\n runId: run.runId,\n scenarioId: run.scenarioId,\n variantId: run.variantId,\n dimension: e.dimension,\n limit: e.limit,\n consumed: e.consumed,\n excessRatio,\n timestamp: e.timestamp,\n })\n byDimension[e.dimension] = (byDimension[e.dimension] ?? 
0) + 1\n byScenario[run.scenarioId] = (byScenario[run.scenarioId] ?? 0) + 1\n if (run.variantId) byVariant[run.variantId] = (byVariant[run.variantId] ?? 0) + 1\n }\n }\n\n const breachedRuns = new Set(findings.map((f) => f.runId))\n return {\n findings,\n byDimension,\n byScenario,\n byVariant,\n totalRuns: runs.length,\n breachedRunRatio: runs.length > 0 ? breachedRuns.size / runs.length : 0,\n }\n}\n","/**\n * FirstDivergenceView — aligns two trajectories by step index, reports\n * the first step where they differ.\n *\n * \"Differ\" is configurable — default is (kind, toolName if tool, model\n * if llm). Use this view to attribute \"why is variant B better?\" to a\n * specific step rather than an aggregate mean delta.\n */\n\nimport type { TraceStore } from '../trace/store'\nimport { buildTrajectory, type Trajectory, type TrajectoryStep } from '../trajectory'\n\nexport interface DivergenceReport {\n runA: string\n runB: string\n firstDivergenceIndex: number | null\n aStep?: TrajectoryStep\n bStep?: TrajectoryStep\n reason?: string\n /** Common prefix length (steps that matched). */\n commonPrefixLen: number\n}\n\nexport interface DivergenceOptions {\n /** Returns true if two steps are considered equal. Default: kind + tool/model match. */\n stepEquals?: (a: TrajectoryStep, b: TrajectoryStep) => boolean\n}\n\nexport async function firstDivergenceView(\n store: TraceStore,\n runA: string,\n runB: string,\n options: DivergenceOptions = {},\n): Promise<DivergenceReport> {\n const [a, b] = await Promise.all([buildTrajectory(store, runA), buildTrajectory(store, runB)])\n const eq = options.stepEquals ?? 
defaultStepEquals\n const minLen = Math.min(a.steps.length, b.steps.length)\n for (let i = 0; i < minLen; i++) {\n const aStep = a.steps[i]!\n const bStep = b.steps[i]!\n if (!eq(aStep, bStep)) {\n return {\n runA,\n runB,\n firstDivergenceIndex: i,\n aStep,\n bStep,\n reason: describeDifference(aStep, bStep),\n commonPrefixLen: i,\n }\n }\n }\n if (a.steps.length === b.steps.length) {\n return { runA, runB, firstDivergenceIndex: null, commonPrefixLen: minLen }\n }\n const longer: Trajectory = a.steps.length > b.steps.length ? a : b\n return {\n runA,\n runB,\n firstDivergenceIndex: minLen,\n aStep: a.steps[minLen],\n bStep: b.steps[minLen],\n reason: `one trajectory has ${longer.steps.length - minLen} more step(s) after index ${minLen - 1}`,\n commonPrefixLen: minLen,\n }\n}\n\nfunction defaultStepEquals(a: TrajectoryStep, b: TrajectoryStep): boolean {\n if (a.span.kind !== b.span.kind) return false\n if (a.span.kind === 'tool' && b.span.kind === 'tool') return a.span.toolName === b.span.toolName\n if (a.span.kind === 'llm' && b.span.kind === 'llm') return a.span.model === b.span.model\n if (a.span.kind === 'judge' && b.span.kind === 'judge')\n return a.span.dimension === b.span.dimension\n return a.span.name === b.span.name\n}\n\nfunction describeDifference(a: TrajectoryStep, b: TrajectoryStep): string {\n if (a.span.kind !== b.span.kind) return `kind ${a.span.kind} vs ${b.span.kind}`\n if (a.span.kind === 'tool' && b.span.kind === 'tool' && a.span.toolName !== b.span.toolName) {\n return `tool ${a.span.toolName} vs ${b.span.toolName}`\n }\n if (a.span.kind === 'llm' && b.span.kind === 'llm' && a.span.model !== b.span.model) {\n return `model ${a.span.model} vs ${b.span.model}`\n }\n return `name \"${a.span.name}\" vs \"${b.span.name}\"`\n}\n","/**\n * JudgeAgreementView — pairwise agreement between judges across the\n * corpus, grouped by dimension.\n *\n * Output drives two workflows:\n * - Judge robustness audit: \"does Claude agree with GPT at κ ≥ 0.6?\"\n * 
- Calibration tracking: κ vs golden human labels over time (by\n * providing a `humanGoldenJudgeId`).\n */\n\nimport { interRaterReliability } from '../statistics'\nimport type { JudgeSpan } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\n\nexport interface JudgePair {\n judgeA: string\n judgeB: string\n dimension: string\n /** Number of (targetSpanId, dimension) tuples both judges scored. */\n commonItems: number\n pearson: number\n krippendorff: number\n}\n\nexport interface JudgeAgreementReport {\n pairs: JudgePair[]\n dimensions: string[]\n judgeIds: string[]\n}\n\nexport async function judgeAgreementView(store: TraceStore): Promise<JudgeAgreementReport> {\n const all = (await store.spans({ kind: 'judge' })).filter(\n (s): s is JudgeSpan => s.kind === 'judge',\n )\n if (all.length === 0) return { pairs: [], dimensions: [], judgeIds: [] }\n\n const byDimension = new Map<string, JudgeSpan[]>()\n for (const s of all) {\n const arr = byDimension.get(s.dimension) ?? []\n arr.push(s)\n byDimension.set(s.dimension, arr)\n }\n\n const judgeIds = [...new Set(all.map((s) => s.judgeId))].sort()\n const pairs: JudgePair[] = []\n for (const [dim, spans] of byDimension) {\n const byJudge = new Map<string, Map<string, number>>()\n for (const s of spans) {\n const m = byJudge.get(s.judgeId) ?? 
new Map<string, number>()\n m.set(s.targetSpanId, s.score)\n byJudge.set(s.judgeId, m)\n }\n const judgesHere = [...byJudge.keys()]\n for (let i = 0; i < judgesHere.length; i++) {\n for (let j = i + 1; j < judgesHere.length; j++) {\n const judgeI = judgesHere[i]!\n const judgeJ = judgesHere[j]!\n const a = byJudge.get(judgeI)!\n const b = byJudge.get(judgeJ)!\n const common: Array<[number, number]> = []\n for (const [target, scoreA] of a) {\n const scoreB = b.get(target)\n if (scoreB !== undefined) common.push([scoreA, scoreB])\n }\n if (common.length < 2) continue\n const judgeScores = common.map(\n ([scoreA, scoreB]) =>\n [\n { judgeName: judgeI, dimension: dim, score: scoreA, reasoning: '' },\n { judgeName: judgeJ, dimension: dim, score: scoreB, reasoning: '' },\n ] as const,\n )\n const k = interRaterReliability(\n judgeScores[0]!.map((_, k2) => judgeScores.map((pair) => pair[k2]!)),\n )\n pairs.push({\n judgeA: judgeI,\n judgeB: judgeJ,\n dimension: dim,\n commonItems: common.length,\n pearson: pearson(\n common.map((c) => c[0]),\n common.map((c) => c[1]),\n ),\n krippendorff: k,\n })\n }\n }\n }\n\n return {\n pairs: pairs.sort((a, b) => b.commonItems - a.commonItems),\n dimensions: [...byDimension.keys()].sort(),\n judgeIds,\n }\n}\n\nfunction pearson(a: number[], b: number[]): number {\n if (a.length !== b.length || a.length < 2) return NaN\n const mA = a.reduce((s, v) => s + v, 0) / a.length\n const mB = b.reduce((s, v) => s + v, 0) / b.length\n let num = 0,\n denA = 0,\n denB = 0\n for (let i = 0; i < a.length; i++) {\n const dA = a[i]! - mA\n const dB = b[i]! - mB\n num += dA * dB\n denA += dA * dA\n denB += dB * dB\n }\n if (denA === 0 || denB === 0) return denA === 0 && denB === 0 ? 1 : 0\n return num / Math.sqrt(denA * denB)\n}\n","/**\n * RegressionView — compares a candidate slice to a baseline slice on a\n * named metric. 
Delegates the statistics (Welch's t-test, Cohen's d,\n * IQR stability) to `baseline.ts`.\n *\n * This is the entry point for CI regression gates: \"given runs tagged\n * release=A and release=B, did any metric regress?\"\n */\n\nimport { type BaselineOptions, type BaselineReport, compareToBaseline } from '../baseline'\nimport { aggregateLlm, llmSpans, runFailureClass } from '../trace/query'\nimport type { Run } from '../trace/schema'\nimport type { RunFilter, TraceStore } from '../trace/store'\n\nexport interface RegressionSpec {\n metric: string\n higherIsBetter: boolean\n /** Extract a scalar from a run. Default extractors handle common metrics. */\n extract?: (run: Run, store: TraceStore) => Promise<number | null>\n}\n\nexport interface RegressionOptions extends BaselineOptions {\n baseline: RunFilter\n candidate: RunFilter\n}\n\nexport async function regressionView(\n store: TraceStore,\n metrics: RegressionSpec[],\n options: RegressionOptions,\n): Promise<BaselineReport> {\n const baselineRuns = await store.listRuns(options.baseline)\n const candidateRuns = await store.listRuns(options.candidate)\n const samples = await Promise.all(\n metrics.map(async (m) => {\n const extract = m.extract ?? 
defaultExtract(m.metric)\n const baseline = await extractAll(baselineRuns, extract, store)\n const candidate = await extractAll(candidateRuns, extract, store)\n return { metric: m.metric, higherIsBetter: m.higherIsBetter, baseline, candidate }\n }),\n )\n return compareToBaseline(samples, options)\n}\n\nasync function extractAll(\n runs: Run[],\n extract: (r: Run, s: TraceStore) => Promise<number | null>,\n store: TraceStore,\n): Promise<number[]> {\n const out: number[] = []\n for (const r of runs) {\n const v = await extract(r, store)\n if (v !== null && Number.isFinite(v)) out.push(v)\n }\n return out\n}\n\nfunction defaultExtract(metric: string): (run: Run, store: TraceStore) => Promise<number | null> {\n return async (run, store) => {\n switch (metric) {\n case 'score':\n case 'overallScore':\n return run.outcome?.score ?? null\n case 'pass':\n return run.outcome?.pass === true ? 1 : 0\n case 'durationMs':\n return run.endedAt && run.startedAt ? run.endedAt - run.startedAt : null\n case 'costUsd': {\n const llm = await llmSpans(store, run.runId)\n return aggregateLlm(llm).costUsd\n }\n case 'inputTokens': {\n const llm = await llmSpans(store, run.runId)\n return aggregateLlm(llm).inputTokens\n }\n case 'outputTokens': {\n const llm = await llmSpans(store, run.runId)\n return aggregateLlm(llm).outputTokens\n }\n case 'failureClass': {\n return runFailureClass(run) === 'success' ? 1 : 0\n }\n default:\n return null\n }\n }\n}\n","/**\n * StuckLoopView — detects when an agent calls the same tool with the\n * same (or structurally similar) arguments ≥ N times in a short window.\n *\n * Rationale: agents that loop are the number-one production failure\n * mode on long-horizon flows. 
The view returns (runId, toolName,\n * argHash, occurrences, windowMs) for each detected loop plus a\n * fraction of runs affected.\n */\n\nimport { argHash, toolSpans } from '../trace/query'\nimport type { TraceStore } from '../trace/store'\n\nexport interface StuckLoopFinding {\n runId: string\n toolName: string\n argHash: string\n occurrences: number\n spanIds: string[]\n /** Milliseconds between first and last call in the loop. */\n windowMs: number\n}\n\nexport interface StuckLoopReport {\n findings: StuckLoopFinding[]\n affectedRunRatio: number\n totalRuns: number\n}\n\nexport interface StuckLoopOptions {\n /** Minimum call count to flag a loop (default 3). */\n minOccurrences?: number\n /** Filter to a specific runId; omit to scan the entire corpus. */\n runId?: string\n}\n\nexport async function stuckLoopView(\n store: TraceStore,\n options: StuckLoopOptions = {},\n): Promise<StuckLoopReport> {\n const minOccurrences = options.minOccurrences ?? 3\n const runs = options.runId\n ? [{ runId: options.runId }]\n : (await store.listRuns()).map((r) => ({ runId: r.runId }))\n\n const findings: StuckLoopFinding[] = []\n for (const { runId } of runs) {\n const tools = await toolSpans(store, runId)\n const byKey = new Map<string, { spans: typeof tools; argHash: string }>()\n for (const t of tools) {\n const h = argHash(t.args)\n const key = `${t.toolName}|${h}`\n const bucket = byKey.get(key) ?? 
{ spans: [], argHash: h }\n bucket.spans.push(t)\n byKey.set(key, bucket)\n }\n for (const [key, { spans, argHash: h }] of byKey) {\n if (spans.length < minOccurrences) continue\n const sorted = [...spans].sort((a, b) => a.startedAt - b.startedAt)\n const first = sorted[0]!.startedAt\n const last = sorted[sorted.length - 1]!.startedAt\n findings.push({\n runId,\n toolName: key.split('|')[0]!,\n argHash: h,\n occurrences: sorted.length,\n spanIds: sorted.map((s) => s.spanId),\n windowMs: last - first,\n })\n }\n }\n\n const affectedRuns = new Set(findings.map((f) => f.runId))\n return {\n findings,\n affectedRunRatio: runs.length > 0 ? affectedRuns.size / runs.length : 0,\n totalRuns: runs.length,\n }\n}\n","/**\n * ToolWasteView — fraction of tool calls whose results weren't used\n * downstream. Without a \"used\" signal we fall back to structural\n * proxies: error calls, duplicate calls, and tool calls followed by\n * zero subsequent LLM spans are all considered waste.\n *\n * Consumers can pass a `usageOracle` that inspects a tool span and\n * returns true iff the tool's result appears in a later LLM message,\n * artifact, or state mutation — that's the canonical definition; the\n * default heuristic is a reasonable fallback.\n */\n\nimport { computeToolUseMetrics } from '../tool-use-metrics'\nimport { llmSpans, toolSpans } from '../trace/query'\nimport type { ToolSpan } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\n\nexport interface ToolWasteFinding {\n runId: string\n wastedCalls: number\n totalCalls: number\n wasteRate: number\n}\n\nexport interface ToolWasteReport {\n byRun: ToolWasteFinding[]\n overallWasteRate: number\n}\n\nexport interface ToolWasteOptions {\n runId?: string\n usageOracle?: (tool: ToolSpan, later: { llm: Awaited<ReturnType<typeof llmSpans>> }) => boolean\n}\n\nexport async function toolWasteView(\n store: TraceStore,\n options: ToolWasteOptions = {},\n): Promise<ToolWasteReport> {\n const runs = options.runId 
? [options.runId] : (await store.listRuns()).map((r) => r.runId)\n\n const byRun: ToolWasteFinding[] = []\n let totalCalls = 0\n let totalWasted = 0\n for (const runId of runs) {\n const tools = await toolSpans(store, runId)\n if (tools.length === 0) {\n byRun.push({ runId, wastedCalls: 0, totalCalls: 0, wasteRate: 0 })\n continue\n }\n const llms = await llmSpans(store, runId)\n let wasted = 0\n for (const t of tools) {\n if (t.status === 'error') {\n wasted++\n continue\n }\n const laterLlm = llms.filter((l) => l.startedAt > t.startedAt)\n if (options.usageOracle) {\n if (!options.usageOracle(t, { llm: laterLlm })) wasted++\n } else {\n // Default heuristic: a tool whose result is NOT mentioned in any\n // later LLM input message is likely wasted.\n const resultStr = stringify(t.result)\n const used = laterLlm.some((l) =>\n l.messages.some(\n (m) =>\n typeof m.content === 'string' &&\n resultStr &&\n m.content.includes(resultStr.slice(0, 120)),\n ),\n )\n if (!used) wasted++\n }\n }\n const wasteRate = wasted / tools.length\n byRun.push({ runId, wastedCalls: wasted, totalCalls: tools.length, wasteRate })\n totalCalls += tools.length\n totalWasted += wasted\n }\n return { byRun, overallWasteRate: totalCalls > 0 ? 
totalWasted / totalCalls : 0 }\n}\n\nfunction stringify(v: unknown): string {\n if (v === null || v === undefined) return ''\n if (typeof v === 'string') return v\n try {\n return JSON.stringify(v)\n } catch {\n return String(v)\n }\n}\n\n// Re-export for convenience in consumers that want both descriptive and usage metrics.\nexport { computeToolUseMetrics }\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;AA8BA,eAAsB,iBACpB,OACA,UAAuD,CAAC,GAC3B;AAC7B,QAAM,OAAO,MAAM,MAAM,SAAS;AAAA,IAChC,YAAY,QAAQ;AAAA,IACpB,WAAW,QAAQ;AAAA,EACrB,CAAC;AACD,QAAM,WAAkC,CAAC;AACzC,QAAM,cAAsC,CAAC;AAC7C,QAAM,aAAqC,CAAC;AAC5C,QAAM,YAAoC,CAAC;AAE3C,aAAW,OAAO,MAAM;AACtB,UAAM,UAAU,MAAM,MAAM,OAAO,IAAI,KAAK;AAC5C,eAAW,KAAK,SAAS;AACvB,UAAI,CAAC,EAAE,SAAU;AACjB,YAAM,cAAc,EAAE,QAAQ,IAAI,EAAE,WAAW,EAAE,QAAQ;AACzD,eAAS,KAAK;AAAA,QACZ,OAAO,IAAI;AAAA,QACX,YAAY,IAAI;AAAA,QAChB,WAAW,IAAI;AAAA,QACf,WAAW,EAAE;AAAA,QACb,OAAO,EAAE;AAAA,QACT,UAAU,EAAE;AAAA,QACZ;AAAA,QACA,WAAW,EAAE;AAAA,MACf,CAAC;AACD,kBAAY,EAAE,SAAS,KAAK,YAAY,EAAE,SAAS,KAAK,KAAK;AAC7D,iBAAW,IAAI,UAAU,KAAK,WAAW,IAAI,UAAU,KAAK,KAAK;AACjE,UAAI,IAAI,UAAW,WAAU,IAAI,SAAS,KAAK,UAAU,IAAI,SAAS,KAAK,KAAK;AAAA,IAClF;AAAA,EACF;AAEA,QAAM,eAAe,IAAI,IAAI,SAAS,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC;AACzD,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,WAAW,KAAK;AAAA,IAChB,kBAAkB,KAAK,SAAS,IAAI,aAAa,OAAO,KAAK,SAAS;AAAA,EACxE;AACF;;;AC7CA,eAAsB,oBACpB,OACA,MACA,MACA,UAA6B,CAAC,GACH;AAC3B,QAAM,CAAC,GAAG,CAAC,IAAI,MAAM,QAAQ,IAAI,CAAC,gBAAgB,OAAO,IAAI,GAAG,gBAAgB,OAAO,IAAI,CAAC,CAAC;AAC7F,QAAM,KAAK,QAAQ,cAAc;AACjC,QAAM,SAAS,KAAK,IAAI,EAAE,MAAM,QAAQ,EAAE,MAAM,MAAM;AACtD,WAAS,IAAI,GAAG,IAAI,QAAQ,KAAK;AAC/B,UAAM,QAAQ,EAAE,MAAM,CAAC;AACvB,UAAM,QAAQ,EAAE,MAAM,CAAC;AACvB,QAAI,CAAC,GAAG,OAAO,KAAK,GAAG;AACrB,aAAO;AAAA,QACL;AAAA,QACA;AAAA,QACA,sBAAsB;AAAA,QACtB;AAAA,QACA;AAAA,QACA,QAAQ,mBAAmB,OAAO,KAAK;AAAA,QACvC,iBAAiB;AAAA,MACnB;AAAA,IACF;AAAA,EACF;AACA,MAAI,EAAE,MAAM,WAAW,EAAE,MAAM,QAAQ;AACrC,WAAO,EAAE,MAAM,MAAM,sBAAsB,MAAM,iBAAiB,OAAO;AAAA,EAC3E;AACA,QAAM,SAAqB,EAAE,MAAM,SAAS,E
AAE,MAAM,SAAS,IAAI;AACjE,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,sBAAsB;AAAA,IACtB,OAAO,EAAE,MAAM,MAAM;AAAA,IACrB,OAAO,EAAE,MAAM,MAAM;AAAA,IACrB,QAAQ,sBAAsB,OAAO,MAAM,SAAS,MAAM,6BAA6B,SAAS,CAAC;AAAA,IACjG,iBAAiB;AAAA,EACnB;AACF;AAEA,SAAS,kBAAkB,GAAmB,GAA4B;AACxE,MAAI,EAAE,KAAK,SAAS,EAAE,KAAK,KAAM,QAAO;AACxC,MAAI,EAAE,KAAK,SAAS,UAAU,EAAE,KAAK,SAAS,OAAQ,QAAO,EAAE,KAAK,aAAa,EAAE,KAAK;AACxF,MAAI,EAAE,KAAK,SAAS,SAAS,EAAE,KAAK,SAAS,MAAO,QAAO,EAAE,KAAK,UAAU,EAAE,KAAK;AACnF,MAAI,EAAE,KAAK,SAAS,WAAW,EAAE,KAAK,SAAS;AAC7C,WAAO,EAAE,KAAK,cAAc,EAAE,KAAK;AACrC,SAAO,EAAE,KAAK,SAAS,EAAE,KAAK;AAChC;AAEA,SAAS,mBAAmB,GAAmB,GAA2B;AACxE,MAAI,EAAE,KAAK,SAAS,EAAE,KAAK,KAAM,QAAO,QAAQ,EAAE,KAAK,IAAI,OAAO,EAAE,KAAK,IAAI;AAC7E,MAAI,EAAE,KAAK,SAAS,UAAU,EAAE,KAAK,SAAS,UAAU,EAAE,KAAK,aAAa,EAAE,KAAK,UAAU;AAC3F,WAAO,QAAQ,EAAE,KAAK,QAAQ,OAAO,EAAE,KAAK,QAAQ;AAAA,EACtD;AACA,MAAI,EAAE,KAAK,SAAS,SAAS,EAAE,KAAK,SAAS,SAAS,EAAE,KAAK,UAAU,EAAE,KAAK,OAAO;AACnF,WAAO,SAAS,EAAE,KAAK,KAAK,OAAO,EAAE,KAAK,KAAK;AAAA,EACjD;AACA,SAAO,SAAS,EAAE,KAAK,IAAI,SAAS,EAAE,KAAK,IAAI;AACjD;;;ACvDA,eAAsB,mBAAmB,OAAkD;AACzF,QAAM,OAAO,MAAM,MAAM,MAAM,EAAE,MAAM,QAAQ,CAAC,GAAG;AAAA,IACjD,CAAC,MAAsB,EAAE,SAAS;AAAA,EACpC;AACA,MAAI,IAAI,WAAW,EAAG,QAAO,EAAE,OAAO,CAAC,GAAG,YAAY,CAAC,GAAG,UAAU,CAAC,EAAE;AAEvE,QAAM,cAAc,oBAAI,IAAyB;AACjD,aAAW,KAAK,KAAK;AACnB,UAAM,MAAM,YAAY,IAAI,EAAE,SAAS,KAAK,CAAC;AAC7C,QAAI,KAAK,CAAC;AACV,gBAAY,IAAI,EAAE,WAAW,GAAG;AAAA,EAClC;AAEA,QAAM,WAAW,CAAC,GAAG,IAAI,IAAI,IAAI,IAAI,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,EAAE,KAAK;AAC9D,QAAM,QAAqB,CAAC;AAC5B,aAAW,CAAC,KAAK,KAAK,KAAK,aAAa;AACtC,UAAM,UAAU,oBAAI,IAAiC;AACrD,eAAW,KAAK,OAAO;AACrB,YAAM,IAAI,QAAQ,IAAI,EAAE,OAAO,KAAK,oBAAI,IAAoB;AAC5D,QAAE,IAAI,EAAE,cAAc,EAAE,KAAK;AAC7B,cAAQ,IAAI,EAAE,SAAS,CAAC;AAAA,IAC1B;AACA,UAAM,aAAa,CAAC,GAAG,QAAQ,KAAK,CAAC;AACrC,aAAS,IAAI,GAAG,IAAI,WAAW,QAAQ,KAAK;AAC1C,eAAS,IAAI,IAAI,GAAG,IAAI,WAAW,QAAQ,KAAK;AAC9C,cAAM,SAAS,WAAW,CAAC;AAC3B,cAAM,SAAS,WAAW,CAAC;AAC3B,cAAM,IAAI,QAAQ,IAAI,MAAM;AAC5B,cAAM,IAAI,QAAQ,IAAI,MAAM;AAC5B,cAAM,SAAkC,CAAC;AACzC,mBAA
W,CAAC,QAAQ,MAAM,KAAK,GAAG;AAChC,gBAAM,SAAS,EAAE,IAAI,MAAM;AAC3B,cAAI,WAAW,OAAW,QAAO,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,QACxD;AACA,YAAI,OAAO,SAAS,EAAG;AACvB,cAAM,cAAc,OAAO;AAAA,UACzB,CAAC,CAAC,QAAQ,MAAM,MACd;AAAA,YACE,EAAE,WAAW,QAAQ,WAAW,KAAK,OAAO,QAAQ,WAAW,GAAG;AAAA,YAClE,EAAE,WAAW,QAAQ,WAAW,KAAK,OAAO,QAAQ,WAAW,GAAG;AAAA,UACpE;AAAA,QACJ;AACA,cAAM,IAAI;AAAA,UACR,YAAY,CAAC,EAAG,IAAI,CAAC,GAAG,OAAO,YAAY,IAAI,CAAC,SAAS,KAAK,EAAE,CAAE,CAAC;AAAA,QACrE;AACA,cAAM,KAAK;AAAA,UACT,QAAQ;AAAA,UACR,QAAQ;AAAA,UACR,WAAW;AAAA,UACX,aAAa,OAAO;AAAA,UACpB,SAAS;AAAA,YACP,OAAO,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC;AAAA,YACtB,OAAO,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC;AAAA,UACxB;AAAA,UACA,cAAc;AAAA,QAChB,CAAC;AAAA,MACH;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AAAA,IACL,OAAO,MAAM,KAAK,CAAC,GAAG,MAAM,EAAE,cAAc,EAAE,WAAW;AAAA,IACzD,YAAY,CAAC,GAAG,YAAY,KAAK,CAAC,EAAE,KAAK;AAAA,IACzC;AAAA,EACF;AACF;AAEA,SAAS,QAAQ,GAAa,GAAqB;AACjD,MAAI,EAAE,WAAW,EAAE,UAAU,EAAE,SAAS,EAAG,QAAO;AAClD,QAAM,KAAK,EAAE,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,EAAE;AAC5C,QAAM,KAAK,EAAE,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,EAAE;AAC5C,MAAI,MAAM,GACR,OAAO,GACP,OAAO;AACT,WAAS,IAAI,GAAG,IAAI,EAAE,QAAQ,KAAK;AACjC,UAAM,KAAK,EAAE,CAAC,IAAK;AACnB,UAAM,KAAK,EAAE,CAAC,IAAK;AACnB,WAAO,KAAK;AACZ,YAAQ,KAAK;AACb,YAAQ,KAAK;AAAA,EACf;AACA,MAAI,SAAS,KAAK,SAAS,EAAG,QAAO,SAAS,KAAK,SAAS,IAAI,IAAI;AACpE,SAAO,MAAM,KAAK,KAAK,OAAO,IAAI;AACpC;;;ACvFA,eAAsB,eACpB,OACA,SACA,SACyB;AACzB,QAAM,eAAe,MAAM,MAAM,SAAS,QAAQ,QAAQ;AAC1D,QAAM,gBAAgB,MAAM,MAAM,SAAS,QAAQ,SAAS;AAC5D,QAAM,UAAU,MAAM,QAAQ;AAAA,IAC5B,QAAQ,IAAI,OAAO,MAAM;AACvB,YAAM,UAAU,EAAE,WAAW,eAAe,EAAE,MAAM;AACpD,YAAM,WAAW,MAAM,WAAW,cAAc,SAAS,KAAK;AAC9D,YAAM,YAAY,MAAM,WAAW,eAAe,SAAS,KAAK;AAChE,aAAO,EAAE,QAAQ,EAAE,QAAQ,gBAAgB,EAAE,gBAAgB,UAAU,UAAU;AAAA,IACnF,CAAC;AAAA,EACH;AACA,SAAO,kBAAkB,SAAS,OAAO;AAC3C;AAEA,eAAe,WACb,MACA,SACA,OACmB;AACnB,QAAM,MAAgB,CAAC;AACvB,aAAW,KAAK,MAAM;AACpB,UAAM,IAAI,MAAM,QAAQ,GAAG,KAAK;AAChC,QAAI,MAAM,QAAQ,OAAO,SAAS,CAAC,EAAG,KAAI,KAAK,CAAC;AAAA,EAClD;AACA,SAAO;AACT;AAEA,SAAS,eAAe,QAAyE;AAC/F,SAAO,
OAAO,KAAK,UAAU;AAC3B,YAAQ,QAAQ;AAAA,MACd,KAAK;AAAA,MACL,KAAK;AACH,eAAO,IAAI,SAAS,SAAS;AAAA,MAC/B,KAAK;AACH,eAAO,IAAI,SAAS,SAAS,OAAO,IAAI;AAAA,MAC1C,KAAK;AACH,eAAO,IAAI,WAAW,IAAI,YAAY,IAAI,UAAU,IAAI,YAAY;AAAA,MACtE,KAAK,WAAW;AACd,cAAM,MAAM,MAAM,SAAS,OAAO,IAAI,KAAK;AAC3C,eAAO,aAAa,GAAG,EAAE;AAAA,MAC3B;AAAA,MACA,KAAK,eAAe;AAClB,cAAM,MAAM,MAAM,SAAS,OAAO,IAAI,KAAK;AAC3C,eAAO,aAAa,GAAG,EAAE;AAAA,MAC3B;AAAA,MACA,KAAK,gBAAgB;AACnB,cAAM,MAAM,MAAM,SAAS,OAAO,IAAI,KAAK;AAC3C,eAAO,aAAa,GAAG,EAAE;AAAA,MAC3B;AAAA,MACA,KAAK,gBAAgB;AACnB,eAAO,gBAAgB,GAAG,MAAM,YAAY,IAAI;AAAA,MAClD;AAAA,MACA;AACE,eAAO;AAAA,IACX;AAAA,EACF;AACF;;;AClDA,eAAsB,cACpB,OACA,UAA4B,CAAC,GACH;AAC1B,QAAM,iBAAiB,QAAQ,kBAAkB;AACjD,QAAM,OAAO,QAAQ,QACjB,CAAC,EAAE,OAAO,QAAQ,MAAM,CAAC,KACxB,MAAM,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE;AAE5D,QAAM,WAA+B,CAAC;AACtC,aAAW,EAAE,MAAM,KAAK,MAAM;AAC5B,UAAM,QAAQ,MAAM,UAAU,OAAO,KAAK;AAC1C,UAAM,QAAQ,oBAAI,IAAsD;AACxE,eAAW,KAAK,OAAO;AACrB,YAAM,IAAI,QAAQ,EAAE,IAAI;AACxB,YAAM,MAAM,GAAG,EAAE,QAAQ,IAAI,CAAC;AAC9B,YAAM,SAAS,MAAM,IAAI,GAAG,KAAK,EAAE,OAAO,CAAC,GAAG,SAAS,EAAE;AACzD,aAAO,MAAM,KAAK,CAAC;AACnB,YAAM,IAAI,KAAK,MAAM;AAAA,IACvB;AACA,eAAW,CAAC,KAAK,EAAE,OAAO,SAAS,EAAE,CAAC,KAAK,OAAO;AAChD,UAAI,MAAM,SAAS,eAAgB;AACnC,YAAM,SAAS,CAAC,GAAG,KAAK,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,YAAY,EAAE,SAAS;AAClE,YAAM,QAAQ,OAAO,CAAC,EAAG;AACzB,YAAM,OAAO,OAAO,OAAO,SAAS,CAAC,EAAG;AACxC,eAAS,KAAK;AAAA,QACZ;AAAA,QACA,UAAU,IAAI,MAAM,GAAG,EAAE,CAAC;AAAA,QAC1B,SAAS;AAAA,QACT,aAAa,OAAO;AAAA,QACpB,SAAS,OAAO,IAAI,CAAC,MAAM,EAAE,MAAM;AAAA,QACnC,UAAU,OAAO;AAAA,MACnB,CAAC;AAAA,IACH;AAAA,EACF;AAEA,QAAM,eAAe,IAAI,IAAI,SAAS,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC;AACzD,SAAO;AAAA,IACL;AAAA,IACA,kBAAkB,KAAK,SAAS,IAAI,aAAa,OAAO,KAAK,SAAS;AAAA,IACtE,WAAW,KAAK;AAAA,EAClB;AACF;;;AC5CA,eAAsB,cACpB,OACA,UAA4B,CAAC,GACH;AAC1B,QAAM,OAAO,QAAQ,QAAQ,CAAC,QAAQ,KAAK,KAAK,MAAM,MAAM,SAAS,GAAG,IAAI,CAAC,MAAM,EAAE,KAAK;AAE1F,QAAM,QAA4B,CAAC;AACnC,MAAI,aAAa;AACjB,MAAI,cAAc;AAClB,aAAW,SAAS,MAAM;AACxB,UAAM,QAAQ,MAAM,UAAU,OAAO,KAAK;AAC1
C,QAAI,MAAM,WAAW,GAAG;AACtB,YAAM,KAAK,EAAE,OAAO,aAAa,GAAG,YAAY,GAAG,WAAW,EAAE,CAAC;AACjE;AAAA,IACF;AACA,UAAM,OAAO,MAAM,SAAS,OAAO,KAAK;AACxC,QAAI,SAAS;AACb,eAAW,KAAK,OAAO;AACrB,UAAI,EAAE,WAAW,SAAS;AACxB;AACA;AAAA,MACF;AACA,YAAM,WAAW,KAAK,OAAO,CAAC,MAAM,EAAE,YAAY,EAAE,SAAS;AAC7D,UAAI,QAAQ,aAAa;AACvB,YAAI,CAAC,QAAQ,YAAY,GAAG,EAAE,KAAK,SAAS,CAAC,EAAG;AAAA,MAClD,OAAO;AAGL,cAAM,YAAY,UAAU,EAAE,MAAM;AACpC,cAAM,OAAO,SAAS;AAAA,UAAK,CAAC,MAC1B,EAAE,SAAS;AAAA,YACT,CAAC,MACC,OAAO,EAAE,YAAY,YACrB,aACA,EAAE,QAAQ,SAAS,UAAU,MAAM,GAAG,GAAG,CAAC;AAAA,UAC9C;AAAA,QACF;AACA,YAAI,CAAC,KAAM;AAAA,MACb;AAAA,IACF;AACA,UAAM,YAAY,SAAS,MAAM;AACjC,UAAM,KAAK,EAAE,OAAO,aAAa,QAAQ,YAAY,MAAM,QAAQ,UAAU,CAAC;AAC9E,kBAAc,MAAM;AACpB,mBAAe;AAAA,EACjB;AACA,SAAO,EAAE,OAAO,kBAAkB,aAAa,IAAI,cAAc,aAAa,EAAE;AAClF;AAEA,SAAS,UAAU,GAAoB;AACrC,MAAI,MAAM,QAAQ,MAAM,OAAW,QAAO;AAC1C,MAAI,OAAO,MAAM,SAAU,QAAO;AAClC,MAAI;AACF,WAAO,KAAK,UAAU,CAAC;AAAA,EACzB,QAAQ;AACN,WAAO,OAAO,CAAC;AAAA,EACjB;AACF;","names":[]}
|
|
1
|
+
{"version":3,"sources":["../../src/pipelines/budget-breach.ts","../../src/pipelines/failure-cluster.ts","../../src/pipelines/first-divergence.ts","../../src/pipelines/judge-agreement.ts","../../src/pipelines/regression.ts","../../src/pipelines/stuck-loop.ts","../../src/pipelines/tool-waste.ts"],"sourcesContent":["/**\n * BudgetBreachView — aggregates breach events across the corpus.\n *\n * Answers: which dimensions get hit most often? Which scenarios are\n * underbudgeted? Which variants trigger the most breaches?\n */\n\nimport type { BudgetSpec } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\n\nexport interface BudgetBreachFinding {\n runId: string\n scenarioId: string\n variantId?: string\n dimension: keyof BudgetSpec\n limit: number\n consumed: number\n excessRatio: number\n timestamp: number\n}\n\nexport interface BudgetBreachReport {\n findings: BudgetBreachFinding[]\n byDimension: Record<string, number>\n byScenario: Record<string, number>\n byVariant: Record<string, number>\n totalRuns: number\n breachedRunRatio: number\n}\n\nexport async function budgetBreachView(\n store: TraceStore,\n options: { scenarioId?: string; variantId?: string } = {},\n): Promise<BudgetBreachReport> {\n const runs = await store.listRuns({\n scenarioId: options.scenarioId,\n variantId: options.variantId,\n })\n const findings: BudgetBreachFinding[] = []\n const byDimension: Record<string, number> = {}\n const byScenario: Record<string, number> = {}\n const byVariant: Record<string, number> = {}\n\n for (const run of runs) {\n const entries = await store.budget(run.runId)\n for (const e of entries) {\n if (!e.breached) continue\n const excessRatio = e.limit > 0 ? 
e.consumed / e.limit : Infinity\n findings.push({\n runId: run.runId,\n scenarioId: run.scenarioId,\n variantId: run.variantId,\n dimension: e.dimension,\n limit: e.limit,\n consumed: e.consumed,\n excessRatio,\n timestamp: e.timestamp,\n })\n byDimension[e.dimension] = (byDimension[e.dimension] ?? 0) + 1\n byScenario[run.scenarioId] = (byScenario[run.scenarioId] ?? 0) + 1\n if (run.variantId) byVariant[run.variantId] = (byVariant[run.variantId] ?? 0) + 1\n }\n }\n\n const breachedRuns = new Set(findings.map((f) => f.runId))\n return {\n findings,\n byDimension,\n byScenario,\n byVariant,\n totalRuns: runs.length,\n breachedRunRatio: runs.length > 0 ? breachedRuns.size / runs.length : 0,\n }\n}\n","/**\n * FailureClusterView — groups failed runs by (failureClass, triggerTool,\n * argHash-prefix) so weekly reviews can prioritize the top-N clusters.\n *\n * Each cluster includes: N runs, scenarios affected, representative\n * error message, a proposed mitigation hint (rule → action table).\n */\n\nimport { classifyFailure, DEFAULT_RULES, type FailureRule } from '../failure-taxonomy'\nimport { argHash, toolSpans } from '../trace/query'\nimport type { FailureClass, Span } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\n\nexport interface FailureCluster {\n failureClass: FailureClass\n /** Tool name when the trigger was a tool span, else undefined. */\n toolName?: string\n /** First 16 chars of argHash — clusters similar args. */\n argPrefix?: string\n /**\n * Source dimension when the trigger was a judge span (e.g. `'format'`,\n * `'safety'`, `'correctness'`). Lets cross-template aggregators\n * group failures by the dimension that fired without overloading\n * `argPrefix`. 
Optional — clusters without this field deserialize cleanly.\n */\n dimension?: string\n runCount: number\n scenarioIds: string[]\n exampleError?: string\n exampleRunId: string\n}\n\nexport interface FailureClusterReport {\n clusters: FailureCluster[]\n totalFailures: number\n totalRuns: number\n}\n\nexport async function failureClusterView(\n store: TraceStore,\n options: { rules?: FailureRule[]; minClusterSize?: number } = {},\n): Promise<FailureClusterReport> {\n const rules = options.rules ?? DEFAULT_RULES\n const minSize = options.minClusterSize ?? 1\n const runs = await store.listRuns()\n\n type Key = string\n const clusters = new Map<Key, FailureCluster>()\n let totalFailures = 0\n\n for (const run of runs) {\n if (run.status === 'completed' && run.outcome?.pass !== false) continue\n totalFailures++\n const spans = await store.spans({ runId: run.runId })\n const events = await store.events({ runId: run.runId })\n const cls = classifyFailure({ run, spans, events }, rules)\n\n let toolName: string | undefined\n let argPrefix: string | undefined\n let dimension: string | undefined\n if (cls.triggerSpanId) {\n const trig = spans.find((s) => s.spanId === cls.triggerSpanId)\n if (trig?.kind === 'tool') {\n toolName = trig.toolName\n argPrefix = argHash(trig.args).slice(0, 16)\n } else if (trig?.kind === 'judge') {\n dimension = trig.dimension\n }\n }\n // Fallback: look at the last errored tool span\n if (!toolName) {\n const ts = await toolSpans(store, run.runId)\n const errored = ts.filter((t) => t.status === 'error').pop()\n if (errored) {\n toolName = errored.toolName\n argPrefix = argHash(errored.args).slice(0, 16)\n }\n }\n // Secondary signal: any judge span on the failed run carries a\n // dimension. 
Useful when the rule classified by judge score but\n // didn't surface the trigger span (or surfaced a non-judge span).\n if (!dimension) {\n const judge = spans.find((s) => s.kind === 'judge' && typeof s.dimension === 'string')\n if (judge?.kind === 'judge') dimension = judge.dimension\n }\n\n const key = `${cls.failureClass}|${toolName ?? ''}|${argPrefix ?? ''}|${dimension ?? ''}`\n let cluster = clusters.get(key)\n if (!cluster) {\n cluster = {\n failureClass: cls.failureClass,\n toolName,\n argPrefix,\n dimension,\n runCount: 0,\n scenarioIds: [],\n exampleRunId: run.runId,\n exampleError: firstErrorMessage(spans) ?? cls.reason,\n }\n clusters.set(key, cluster)\n }\n cluster.runCount++\n if (!cluster.scenarioIds.includes(run.scenarioId)) cluster.scenarioIds.push(run.scenarioId)\n }\n\n const arr = [...clusters.values()]\n .filter((c) => c.runCount >= minSize)\n .sort((a, b) => b.runCount - a.runCount)\n\n return { clusters: arr, totalFailures, totalRuns: runs.length }\n}\n\nfunction firstErrorMessage(spans: Span[]): string | undefined {\n const errored = spans.find((s) => s.status === 'error')\n return errored?.error\n}\n","/**\n * FirstDivergenceView — aligns two trajectories by step index, reports\n * the first step where they differ.\n *\n * \"Differ\" is configurable — default is (kind, toolName if tool, model\n * if llm). Use this view to attribute \"why is variant B better?\" to a\n * specific step rather than an aggregate mean delta.\n */\n\nimport type { TraceStore } from '../trace/store'\nimport { buildTrajectory, type Trajectory, type TrajectoryStep } from '../trajectory'\n\nexport interface DivergenceReport {\n runA: string\n runB: string\n firstDivergenceIndex: number | null\n aStep?: TrajectoryStep\n bStep?: TrajectoryStep\n reason?: string\n /** Common prefix length (steps that matched). */\n commonPrefixLen: number\n}\n\nexport interface DivergenceOptions {\n /** Returns true if two steps are considered equal. Default: kind + tool/model match. 
*/\n stepEquals?: (a: TrajectoryStep, b: TrajectoryStep) => boolean\n}\n\nexport async function firstDivergenceView(\n store: TraceStore,\n runA: string,\n runB: string,\n options: DivergenceOptions = {},\n): Promise<DivergenceReport> {\n const [a, b] = await Promise.all([buildTrajectory(store, runA), buildTrajectory(store, runB)])\n const eq = options.stepEquals ?? defaultStepEquals\n const minLen = Math.min(a.steps.length, b.steps.length)\n for (let i = 0; i < minLen; i++) {\n const aStep = a.steps[i]!\n const bStep = b.steps[i]!\n if (!eq(aStep, bStep)) {\n return {\n runA,\n runB,\n firstDivergenceIndex: i,\n aStep,\n bStep,\n reason: describeDifference(aStep, bStep),\n commonPrefixLen: i,\n }\n }\n }\n if (a.steps.length === b.steps.length) {\n return { runA, runB, firstDivergenceIndex: null, commonPrefixLen: minLen }\n }\n const longer: Trajectory = a.steps.length > b.steps.length ? a : b\n return {\n runA,\n runB,\n firstDivergenceIndex: minLen,\n aStep: a.steps[minLen],\n bStep: b.steps[minLen],\n reason: `one trajectory has ${longer.steps.length - minLen} more step(s) after index ${minLen - 1}`,\n commonPrefixLen: minLen,\n }\n}\n\nfunction defaultStepEquals(a: TrajectoryStep, b: TrajectoryStep): boolean {\n if (a.span.kind !== b.span.kind) return false\n if (a.span.kind === 'tool' && b.span.kind === 'tool') return a.span.toolName === b.span.toolName\n if (a.span.kind === 'llm' && b.span.kind === 'llm') return a.span.model === b.span.model\n if (a.span.kind === 'judge' && b.span.kind === 'judge')\n return a.span.dimension === b.span.dimension\n return a.span.name === b.span.name\n}\n\nfunction describeDifference(a: TrajectoryStep, b: TrajectoryStep): string {\n if (a.span.kind !== b.span.kind) return `kind ${a.span.kind} vs ${b.span.kind}`\n if (a.span.kind === 'tool' && b.span.kind === 'tool' && a.span.toolName !== b.span.toolName) {\n return `tool ${a.span.toolName} vs ${b.span.toolName}`\n }\n if (a.span.kind === 'llm' && b.span.kind === 'llm' && 
a.span.model !== b.span.model) {\n return `model ${a.span.model} vs ${b.span.model}`\n }\n return `name \"${a.span.name}\" vs \"${b.span.name}\"`\n}\n","/**\n * JudgeAgreementView — pairwise agreement between judges across the\n * corpus, grouped by dimension.\n *\n * Output drives two workflows:\n * - Judge robustness audit: \"does Claude agree with GPT at κ ≥ 0.6?\"\n * - Calibration tracking: κ vs golden human labels over time (by\n * providing a `humanGoldenJudgeId`).\n */\n\nimport { interRaterReliability } from '../statistics'\nimport type { JudgeSpan } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\n\nexport interface JudgePair {\n judgeA: string\n judgeB: string\n dimension: string\n /** Number of (targetSpanId, dimension) tuples both judges scored. */\n commonItems: number\n pearson: number\n krippendorff: number\n}\n\nexport interface JudgeAgreementReport {\n pairs: JudgePair[]\n dimensions: string[]\n judgeIds: string[]\n}\n\nexport async function judgeAgreementView(store: TraceStore): Promise<JudgeAgreementReport> {\n const all = (await store.spans({ kind: 'judge' })).filter(\n (s): s is JudgeSpan => s.kind === 'judge',\n )\n if (all.length === 0) return { pairs: [], dimensions: [], judgeIds: [] }\n\n const byDimension = new Map<string, JudgeSpan[]>()\n for (const s of all) {\n const arr = byDimension.get(s.dimension) ?? []\n arr.push(s)\n byDimension.set(s.dimension, arr)\n }\n\n const judgeIds = [...new Set(all.map((s) => s.judgeId))].sort()\n const pairs: JudgePair[] = []\n for (const [dim, spans] of byDimension) {\n const byJudge = new Map<string, Map<string, number>>()\n for (const s of spans) {\n const m = byJudge.get(s.judgeId) ?? 
new Map<string, number>()\n m.set(s.targetSpanId, s.score)\n byJudge.set(s.judgeId, m)\n }\n const judgesHere = [...byJudge.keys()]\n for (let i = 0; i < judgesHere.length; i++) {\n for (let j = i + 1; j < judgesHere.length; j++) {\n const judgeI = judgesHere[i]!\n const judgeJ = judgesHere[j]!\n const a = byJudge.get(judgeI)!\n const b = byJudge.get(judgeJ)!\n const common: Array<[number, number]> = []\n for (const [target, scoreA] of a) {\n const scoreB = b.get(target)\n if (scoreB !== undefined) common.push([scoreA, scoreB])\n }\n if (common.length < 2) continue\n const judgeScores = common.map(\n ([scoreA, scoreB]) =>\n [\n { judgeName: judgeI, dimension: dim, score: scoreA, reasoning: '' },\n { judgeName: judgeJ, dimension: dim, score: scoreB, reasoning: '' },\n ] as const,\n )\n const k = interRaterReliability(\n judgeScores[0]!.map((_, k2) => judgeScores.map((pair) => pair[k2]!)),\n )\n pairs.push({\n judgeA: judgeI,\n judgeB: judgeJ,\n dimension: dim,\n commonItems: common.length,\n pearson: pearson(\n common.map((c) => c[0]),\n common.map((c) => c[1]),\n ),\n krippendorff: k,\n })\n }\n }\n }\n\n return {\n pairs: pairs.sort((a, b) => b.commonItems - a.commonItems),\n dimensions: [...byDimension.keys()].sort(),\n judgeIds,\n }\n}\n\nfunction pearson(a: number[], b: number[]): number {\n if (a.length !== b.length || a.length < 2) return NaN\n const mA = a.reduce((s, v) => s + v, 0) / a.length\n const mB = b.reduce((s, v) => s + v, 0) / b.length\n let num = 0,\n denA = 0,\n denB = 0\n for (let i = 0; i < a.length; i++) {\n const dA = a[i]! - mA\n const dB = b[i]! - mB\n num += dA * dB\n denA += dA * dA\n denB += dB * dB\n }\n if (denA === 0 || denB === 0) return denA === 0 && denB === 0 ? 1 : 0\n return num / Math.sqrt(denA * denB)\n}\n","/**\n * RegressionView — compares a candidate slice to a baseline slice on a\n * named metric. 
Delegates the statistics (Welch's t-test, Cohen's d,\n * IQR stability) to `baseline.ts`.\n *\n * This is the entry point for CI regression gates: \"given runs tagged\n * release=A and release=B, did any metric regress?\"\n */\n\nimport { type BaselineOptions, type BaselineReport, compareToBaseline } from '../baseline'\nimport { aggregateLlm, llmSpans, runFailureClass } from '../trace/query'\nimport type { Run } from '../trace/schema'\nimport type { RunFilter, TraceStore } from '../trace/store'\n\nexport interface RegressionSpec {\n metric: string\n higherIsBetter: boolean\n /** Extract a scalar from a run. Default extractors handle common metrics. */\n extract?: (run: Run, store: TraceStore) => Promise<number | null>\n}\n\nexport interface RegressionOptions extends BaselineOptions {\n baseline: RunFilter\n candidate: RunFilter\n}\n\nexport async function regressionView(\n store: TraceStore,\n metrics: RegressionSpec[],\n options: RegressionOptions,\n): Promise<BaselineReport> {\n const baselineRuns = await store.listRuns(options.baseline)\n const candidateRuns = await store.listRuns(options.candidate)\n const samples = await Promise.all(\n metrics.map(async (m) => {\n const extract = m.extract ?? 
defaultExtract(m.metric)\n const baseline = await extractAll(baselineRuns, extract, store)\n const candidate = await extractAll(candidateRuns, extract, store)\n return { metric: m.metric, higherIsBetter: m.higherIsBetter, baseline, candidate }\n }),\n )\n return compareToBaseline(samples, options)\n}\n\nasync function extractAll(\n runs: Run[],\n extract: (r: Run, s: TraceStore) => Promise<number | null>,\n store: TraceStore,\n): Promise<number[]> {\n const out: number[] = []\n for (const r of runs) {\n const v = await extract(r, store)\n if (v !== null && Number.isFinite(v)) out.push(v)\n }\n return out\n}\n\nfunction defaultExtract(metric: string): (run: Run, store: TraceStore) => Promise<number | null> {\n return async (run, store) => {\n switch (metric) {\n case 'score':\n case 'overallScore':\n return run.outcome?.score ?? null\n case 'pass':\n return run.outcome?.pass === true ? 1 : 0\n case 'durationMs':\n return run.endedAt && run.startedAt ? run.endedAt - run.startedAt : null\n case 'costUsd': {\n const llm = await llmSpans(store, run.runId)\n return aggregateLlm(llm).costUsd\n }\n case 'inputTokens': {\n const llm = await llmSpans(store, run.runId)\n return aggregateLlm(llm).inputTokens\n }\n case 'outputTokens': {\n const llm = await llmSpans(store, run.runId)\n return aggregateLlm(llm).outputTokens\n }\n case 'failureClass': {\n return runFailureClass(run) === 'success' ? 1 : 0\n }\n default:\n return null\n }\n }\n}\n","/**\n * StuckLoopView — detects when an agent calls the same tool with the\n * same (or structurally similar) arguments ≥ N times in a short window.\n *\n * Rationale: agents that loop are the number-one production failure\n * mode on long-horizon flows. 
The view returns (runId, toolName,\n * argHash, occurrences, windowMs) for each detected loop plus a\n * fraction of runs affected.\n */\n\nimport { argHash, toolSpans } from '../trace/query'\nimport type { TraceStore } from '../trace/store'\n\nexport interface StuckLoopFinding {\n runId: string\n toolName: string\n argHash: string\n occurrences: number\n spanIds: string[]\n /** Milliseconds between first and last call in the loop. */\n windowMs: number\n}\n\nexport interface StuckLoopReport {\n findings: StuckLoopFinding[]\n affectedRunRatio: number\n totalRuns: number\n}\n\nexport interface StuckLoopOptions {\n /** Minimum call count to flag a loop (default 3). */\n minOccurrences?: number\n /** Filter to a specific runId; omit to scan the entire corpus. */\n runId?: string\n}\n\nexport async function stuckLoopView(\n store: TraceStore,\n options: StuckLoopOptions = {},\n): Promise<StuckLoopReport> {\n const minOccurrences = options.minOccurrences ?? 3\n const runs = options.runId\n ? [{ runId: options.runId }]\n : (await store.listRuns()).map((r) => ({ runId: r.runId }))\n\n const findings: StuckLoopFinding[] = []\n for (const { runId } of runs) {\n const tools = await toolSpans(store, runId)\n const byKey = new Map<string, { spans: typeof tools; argHash: string }>()\n for (const t of tools) {\n const h = argHash(t.args)\n const key = `${t.toolName}|${h}`\n const bucket = byKey.get(key) ?? 
{ spans: [], argHash: h }\n bucket.spans.push(t)\n byKey.set(key, bucket)\n }\n for (const [key, { spans, argHash: h }] of byKey) {\n if (spans.length < minOccurrences) continue\n const sorted = [...spans].sort((a, b) => a.startedAt - b.startedAt)\n const first = sorted[0]!.startedAt\n const last = sorted[sorted.length - 1]!.startedAt\n findings.push({\n runId,\n toolName: key.split('|')[0]!,\n argHash: h,\n occurrences: sorted.length,\n spanIds: sorted.map((s) => s.spanId),\n windowMs: last - first,\n })\n }\n }\n\n const affectedRuns = new Set(findings.map((f) => f.runId))\n return {\n findings,\n affectedRunRatio: runs.length > 0 ? affectedRuns.size / runs.length : 0,\n totalRuns: runs.length,\n }\n}\n","/**\n * ToolWasteView — fraction of tool calls whose results weren't used\n * downstream. Without a \"used\" signal we fall back to structural\n * proxies: error calls, duplicate calls, and tool calls followed by\n * zero subsequent LLM spans are all considered waste.\n *\n * Consumers can pass a `usageOracle` that inspects a tool span and\n * returns true iff the tool's result appears in a later LLM message,\n * artifact, or state mutation — that's the canonical definition; the\n * default heuristic is a reasonable fallback.\n */\n\nimport { computeToolUseMetrics } from '../tool-use-metrics'\nimport { llmSpans, toolSpans } from '../trace/query'\nimport type { ToolSpan } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\n\nexport interface ToolWasteFinding {\n runId: string\n wastedCalls: number\n totalCalls: number\n wasteRate: number\n}\n\nexport interface ToolWasteReport {\n byRun: ToolWasteFinding[]\n overallWasteRate: number\n}\n\nexport interface ToolWasteOptions {\n runId?: string\n usageOracle?: (tool: ToolSpan, later: { llm: Awaited<ReturnType<typeof llmSpans>> }) => boolean\n}\n\nexport async function toolWasteView(\n store: TraceStore,\n options: ToolWasteOptions = {},\n): Promise<ToolWasteReport> {\n const runs = options.runId 
? [options.runId] : (await store.listRuns()).map((r) => r.runId)\n\n const byRun: ToolWasteFinding[] = []\n let totalCalls = 0\n let totalWasted = 0\n for (const runId of runs) {\n const tools = await toolSpans(store, runId)\n if (tools.length === 0) {\n byRun.push({ runId, wastedCalls: 0, totalCalls: 0, wasteRate: 0 })\n continue\n }\n const llms = await llmSpans(store, runId)\n let wasted = 0\n for (const t of tools) {\n if (t.status === 'error') {\n wasted++\n continue\n }\n const laterLlm = llms.filter((l) => l.startedAt > t.startedAt)\n if (options.usageOracle) {\n if (!options.usageOracle(t, { llm: laterLlm })) wasted++\n } else {\n // Default heuristic: a tool whose result is NOT mentioned in any\n // later LLM input message is likely wasted.\n const resultStr = stringify(t.result)\n const used = laterLlm.some((l) =>\n l.messages.some(\n (m) =>\n typeof m.content === 'string' &&\n resultStr &&\n m.content.includes(resultStr.slice(0, 120)),\n ),\n )\n if (!used) wasted++\n }\n }\n const wasteRate = wasted / tools.length\n byRun.push({ runId, wastedCalls: wasted, totalCalls: tools.length, wasteRate })\n totalCalls += tools.length\n totalWasted += wasted\n }\n return { byRun, overallWasteRate: totalCalls > 0 ? 
totalWasted / totalCalls : 0 }\n}\n\nfunction stringify(v: unknown): string {\n if (v === null || v === undefined) return ''\n if (typeof v === 'string') return v\n try {\n return JSON.stringify(v)\n } catch {\n return String(v)\n }\n}\n\n// Re-export for convenience in consumers that want both descriptive and usage metrics.\nexport { computeToolUseMetrics }\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;AA8BA,eAAsB,iBACpB,OACA,UAAuD,CAAC,GAC3B;AAC7B,QAAM,OAAO,MAAM,MAAM,SAAS;AAAA,IAChC,YAAY,QAAQ;AAAA,IACpB,WAAW,QAAQ;AAAA,EACrB,CAAC;AACD,QAAM,WAAkC,CAAC;AACzC,QAAM,cAAsC,CAAC;AAC7C,QAAM,aAAqC,CAAC;AAC5C,QAAM,YAAoC,CAAC;AAE3C,aAAW,OAAO,MAAM;AACtB,UAAM,UAAU,MAAM,MAAM,OAAO,IAAI,KAAK;AAC5C,eAAW,KAAK,SAAS;AACvB,UAAI,CAAC,EAAE,SAAU;AACjB,YAAM,cAAc,EAAE,QAAQ,IAAI,EAAE,WAAW,EAAE,QAAQ;AACzD,eAAS,KAAK;AAAA,QACZ,OAAO,IAAI;AAAA,QACX,YAAY,IAAI;AAAA,QAChB,WAAW,IAAI;AAAA,QACf,WAAW,EAAE;AAAA,QACb,OAAO,EAAE;AAAA,QACT,UAAU,EAAE;AAAA,QACZ;AAAA,QACA,WAAW,EAAE;AAAA,MACf,CAAC;AACD,kBAAY,EAAE,SAAS,KAAK,YAAY,EAAE,SAAS,KAAK,KAAK;AAC7D,iBAAW,IAAI,UAAU,KAAK,WAAW,IAAI,UAAU,KAAK,KAAK;AACjE,UAAI,IAAI,UAAW,WAAU,IAAI,SAAS,KAAK,UAAU,IAAI,SAAS,KAAK,KAAK;AAAA,IAClF;AAAA,EACF;AAEA,QAAM,eAAe,IAAI,IAAI,SAAS,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC;AACzD,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,WAAW,KAAK;AAAA,IAChB,kBAAkB,KAAK,SAAS,IAAI,aAAa,OAAO,KAAK,SAAS;AAAA,EACxE;AACF;;;ACnCA,eAAsB,mBACpB,OACA,UAA8D,CAAC,GAChC;AAC/B,QAAM,QAAQ,QAAQ,SAAS;AAC/B,QAAM,UAAU,QAAQ,kBAAkB;AAC1C,QAAM,OAAO,MAAM,MAAM,SAAS;AAGlC,QAAM,WAAW,oBAAI,IAAyB;AAC9C,MAAI,gBAAgB;AAEpB,aAAW,OAAO,MAAM;AACtB,QAAI,IAAI,WAAW,eAAe,IAAI,SAAS,SAAS,MAAO;AAC/D;AACA,UAAM,QAAQ,MAAM,MAAM,MAAM,EAAE,OAAO,IAAI,MAAM,CAAC;AACpD,UAAM,SAAS,MAAM,MAAM,OAAO,EAAE,OAAO,IAAI,MAAM,CAAC;AACtD,UAAM,MAAM,gBAAgB,EAAE,KAAK,OAAO,OAAO,GAAG,KAAK;AAEzD,QAAI;AACJ,QAAI;AACJ,QAAI;AACJ,QAAI,IAAI,eAAe;AACrB,YAAM,OAAO,MAAM,KAAK,CAAC,MAAM,EAAE,WAAW,IAAI,aAAa;AAC7D,UAAI,MAAM,SAAS,QAAQ;AACzB,mBAAW,KAAK;AAChB,oBAAY,QAAQ,KAAK,IAAI,EAAE,MAAM,GAAG,EAAE;AAAA,MAC5C,WAAW,MAAM,SAAS,SAAS;AACjC,oBAAY
,KAAK;AAAA,MACnB;AAAA,IACF;AAEA,QAAI,CAAC,UAAU;AACb,YAAM,KAAK,MAAM,UAAU,OAAO,IAAI,KAAK;AAC3C,YAAM,UAAU,GAAG,OAAO,CAAC,MAAM,EAAE,WAAW,OAAO,EAAE,IAAI;AAC3D,UAAI,SAAS;AACX,mBAAW,QAAQ;AACnB,oBAAY,QAAQ,QAAQ,IAAI,EAAE,MAAM,GAAG,EAAE;AAAA,MAC/C;AAAA,IACF;AAIA,QAAI,CAAC,WAAW;AACd,YAAM,QAAQ,MAAM,KAAK,CAAC,MAAM,EAAE,SAAS,WAAW,OAAO,EAAE,cAAc,QAAQ;AACrF,UAAI,OAAO,SAAS,QAAS,aAAY,MAAM;AAAA,IACjD;AAEA,UAAM,MAAM,GAAG,IAAI,YAAY,IAAI,YAAY,EAAE,IAAI,aAAa,EAAE,IAAI,aAAa,EAAE;AACvF,QAAI,UAAU,SAAS,IAAI,GAAG;AAC9B,QAAI,CAAC,SAAS;AACZ,gBAAU;AAAA,QACR,cAAc,IAAI;AAAA,QAClB;AAAA,QACA;AAAA,QACA;AAAA,QACA,UAAU;AAAA,QACV,aAAa,CAAC;AAAA,QACd,cAAc,IAAI;AAAA,QAClB,cAAc,kBAAkB,KAAK,KAAK,IAAI;AAAA,MAChD;AACA,eAAS,IAAI,KAAK,OAAO;AAAA,IAC3B;AACA,YAAQ;AACR,QAAI,CAAC,QAAQ,YAAY,SAAS,IAAI,UAAU,EAAG,SAAQ,YAAY,KAAK,IAAI,UAAU;AAAA,EAC5F;AAEA,QAAM,MAAM,CAAC,GAAG,SAAS,OAAO,CAAC,EAC9B,OAAO,CAAC,MAAM,EAAE,YAAY,OAAO,EACnC,KAAK,CAAC,GAAG,MAAM,EAAE,WAAW,EAAE,QAAQ;AAEzC,SAAO,EAAE,UAAU,KAAK,eAAe,WAAW,KAAK,OAAO;AAChE;AAEA,SAAS,kBAAkB,OAAmC;AAC5D,QAAM,UAAU,MAAM,KAAK,CAAC,MAAM,EAAE,WAAW,OAAO;AACtD,SAAO,SAAS;AAClB;;;ACvFA,eAAsB,oBACpB,OACA,MACA,MACA,UAA6B,CAAC,GACH;AAC3B,QAAM,CAAC,GAAG,CAAC,IAAI,MAAM,QAAQ,IAAI,CAAC,gBAAgB,OAAO,IAAI,GAAG,gBAAgB,OAAO,IAAI,CAAC,CAAC;AAC7F,QAAM,KAAK,QAAQ,cAAc;AACjC,QAAM,SAAS,KAAK,IAAI,EAAE,MAAM,QAAQ,EAAE,MAAM,MAAM;AACtD,WAAS,IAAI,GAAG,IAAI,QAAQ,KAAK;AAC/B,UAAM,QAAQ,EAAE,MAAM,CAAC;AACvB,UAAM,QAAQ,EAAE,MAAM,CAAC;AACvB,QAAI,CAAC,GAAG,OAAO,KAAK,GAAG;AACrB,aAAO;AAAA,QACL;AAAA,QACA;AAAA,QACA,sBAAsB;AAAA,QACtB;AAAA,QACA;AAAA,QACA,QAAQ,mBAAmB,OAAO,KAAK;AAAA,QACvC,iBAAiB;AAAA,MACnB;AAAA,IACF;AAAA,EACF;AACA,MAAI,EAAE,MAAM,WAAW,EAAE,MAAM,QAAQ;AACrC,WAAO,EAAE,MAAM,MAAM,sBAAsB,MAAM,iBAAiB,OAAO;AAAA,EAC3E;AACA,QAAM,SAAqB,EAAE,MAAM,SAAS,EAAE,MAAM,SAAS,IAAI;AACjE,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,sBAAsB;AAAA,IACtB,OAAO,EAAE,MAAM,MAAM;AAAA,IACrB,OAAO,EAAE,MAAM,MAAM;AAAA,IACrB,QAAQ,sBAAsB,OAAO,MAAM,SAAS,MAAM,6BAA6B,SAAS,CAAC;AAAA,IACjG,iBAAiB;AAAA,EACnB;AACF;AAEA,SAAS,kBAAkB,GAAmB,GAA4B;AACxE,MAAI,EAAE,KAAK,S
AAS,EAAE,KAAK,KAAM,QAAO;AACxC,MAAI,EAAE,KAAK,SAAS,UAAU,EAAE,KAAK,SAAS,OAAQ,QAAO,EAAE,KAAK,aAAa,EAAE,KAAK;AACxF,MAAI,EAAE,KAAK,SAAS,SAAS,EAAE,KAAK,SAAS,MAAO,QAAO,EAAE,KAAK,UAAU,EAAE,KAAK;AACnF,MAAI,EAAE,KAAK,SAAS,WAAW,EAAE,KAAK,SAAS;AAC7C,WAAO,EAAE,KAAK,cAAc,EAAE,KAAK;AACrC,SAAO,EAAE,KAAK,SAAS,EAAE,KAAK;AAChC;AAEA,SAAS,mBAAmB,GAAmB,GAA2B;AACxE,MAAI,EAAE,KAAK,SAAS,EAAE,KAAK,KAAM,QAAO,QAAQ,EAAE,KAAK,IAAI,OAAO,EAAE,KAAK,IAAI;AAC7E,MAAI,EAAE,KAAK,SAAS,UAAU,EAAE,KAAK,SAAS,UAAU,EAAE,KAAK,aAAa,EAAE,KAAK,UAAU;AAC3F,WAAO,QAAQ,EAAE,KAAK,QAAQ,OAAO,EAAE,KAAK,QAAQ;AAAA,EACtD;AACA,MAAI,EAAE,KAAK,SAAS,SAAS,EAAE,KAAK,SAAS,SAAS,EAAE,KAAK,UAAU,EAAE,KAAK,OAAO;AACnF,WAAO,SAAS,EAAE,KAAK,KAAK,OAAO,EAAE,KAAK,KAAK;AAAA,EACjD;AACA,SAAO,SAAS,EAAE,KAAK,IAAI,SAAS,EAAE,KAAK,IAAI;AACjD;;;ACvDA,eAAsB,mBAAmB,OAAkD;AACzF,QAAM,OAAO,MAAM,MAAM,MAAM,EAAE,MAAM,QAAQ,CAAC,GAAG;AAAA,IACjD,CAAC,MAAsB,EAAE,SAAS;AAAA,EACpC;AACA,MAAI,IAAI,WAAW,EAAG,QAAO,EAAE,OAAO,CAAC,GAAG,YAAY,CAAC,GAAG,UAAU,CAAC,EAAE;AAEvE,QAAM,cAAc,oBAAI,IAAyB;AACjD,aAAW,KAAK,KAAK;AACnB,UAAM,MAAM,YAAY,IAAI,EAAE,SAAS,KAAK,CAAC;AAC7C,QAAI,KAAK,CAAC;AACV,gBAAY,IAAI,EAAE,WAAW,GAAG;AAAA,EAClC;AAEA,QAAM,WAAW,CAAC,GAAG,IAAI,IAAI,IAAI,IAAI,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,EAAE,KAAK;AAC9D,QAAM,QAAqB,CAAC;AAC5B,aAAW,CAAC,KAAK,KAAK,KAAK,aAAa;AACtC,UAAM,UAAU,oBAAI,IAAiC;AACrD,eAAW,KAAK,OAAO;AACrB,YAAM,IAAI,QAAQ,IAAI,EAAE,OAAO,KAAK,oBAAI,IAAoB;AAC5D,QAAE,IAAI,EAAE,cAAc,EAAE,KAAK;AAC7B,cAAQ,IAAI,EAAE,SAAS,CAAC;AAAA,IAC1B;AACA,UAAM,aAAa,CAAC,GAAG,QAAQ,KAAK,CAAC;AACrC,aAAS,IAAI,GAAG,IAAI,WAAW,QAAQ,KAAK;AAC1C,eAAS,IAAI,IAAI,GAAG,IAAI,WAAW,QAAQ,KAAK;AAC9C,cAAM,SAAS,WAAW,CAAC;AAC3B,cAAM,SAAS,WAAW,CAAC;AAC3B,cAAM,IAAI,QAAQ,IAAI,MAAM;AAC5B,cAAM,IAAI,QAAQ,IAAI,MAAM;AAC5B,cAAM,SAAkC,CAAC;AACzC,mBAAW,CAAC,QAAQ,MAAM,KAAK,GAAG;AAChC,gBAAM,SAAS,EAAE,IAAI,MAAM;AAC3B,cAAI,WAAW,OAAW,QAAO,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,QACxD;AACA,YAAI,OAAO,SAAS,EAAG;AACvB,cAAM,cAAc,OAAO;AAAA,UACzB,CAAC,CAAC,QAAQ,MAAM,MACd;AAAA,YACE,EAAE,WAAW,QAAQ,WAAW,KAAK,OAAO,QAAQ,WAAW,GAAG;AAAA,YAClE,EAAE,
WAAW,QAAQ,WAAW,KAAK,OAAO,QAAQ,WAAW,GAAG;AAAA,UACpE;AAAA,QACJ;AACA,cAAM,IAAI;AAAA,UACR,YAAY,CAAC,EAAG,IAAI,CAAC,GAAG,OAAO,YAAY,IAAI,CAAC,SAAS,KAAK,EAAE,CAAE,CAAC;AAAA,QACrE;AACA,cAAM,KAAK;AAAA,UACT,QAAQ;AAAA,UACR,QAAQ;AAAA,UACR,WAAW;AAAA,UACX,aAAa,OAAO;AAAA,UACpB,SAAS;AAAA,YACP,OAAO,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC;AAAA,YACtB,OAAO,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC;AAAA,UACxB;AAAA,UACA,cAAc;AAAA,QAChB,CAAC;AAAA,MACH;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AAAA,IACL,OAAO,MAAM,KAAK,CAAC,GAAG,MAAM,EAAE,cAAc,EAAE,WAAW;AAAA,IACzD,YAAY,CAAC,GAAG,YAAY,KAAK,CAAC,EAAE,KAAK;AAAA,IACzC;AAAA,EACF;AACF;AAEA,SAAS,QAAQ,GAAa,GAAqB;AACjD,MAAI,EAAE,WAAW,EAAE,UAAU,EAAE,SAAS,EAAG,QAAO;AAClD,QAAM,KAAK,EAAE,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,EAAE;AAC5C,QAAM,KAAK,EAAE,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,EAAE;AAC5C,MAAI,MAAM,GACR,OAAO,GACP,OAAO;AACT,WAAS,IAAI,GAAG,IAAI,EAAE,QAAQ,KAAK;AACjC,UAAM,KAAK,EAAE,CAAC,IAAK;AACnB,UAAM,KAAK,EAAE,CAAC,IAAK;AACnB,WAAO,KAAK;AACZ,YAAQ,KAAK;AACb,YAAQ,KAAK;AAAA,EACf;AACA,MAAI,SAAS,KAAK,SAAS,EAAG,QAAO,SAAS,KAAK,SAAS,IAAI,IAAI;AACpE,SAAO,MAAM,KAAK,KAAK,OAAO,IAAI;AACpC;;;ACvFA,eAAsB,eACpB,OACA,SACA,SACyB;AACzB,QAAM,eAAe,MAAM,MAAM,SAAS,QAAQ,QAAQ;AAC1D,QAAM,gBAAgB,MAAM,MAAM,SAAS,QAAQ,SAAS;AAC5D,QAAM,UAAU,MAAM,QAAQ;AAAA,IAC5B,QAAQ,IAAI,OAAO,MAAM;AACvB,YAAM,UAAU,EAAE,WAAW,eAAe,EAAE,MAAM;AACpD,YAAM,WAAW,MAAM,WAAW,cAAc,SAAS,KAAK;AAC9D,YAAM,YAAY,MAAM,WAAW,eAAe,SAAS,KAAK;AAChE,aAAO,EAAE,QAAQ,EAAE,QAAQ,gBAAgB,EAAE,gBAAgB,UAAU,UAAU;AAAA,IACnF,CAAC;AAAA,EACH;AACA,SAAO,kBAAkB,SAAS,OAAO;AAC3C;AAEA,eAAe,WACb,MACA,SACA,OACmB;AACnB,QAAM,MAAgB,CAAC;AACvB,aAAW,KAAK,MAAM;AACpB,UAAM,IAAI,MAAM,QAAQ,GAAG,KAAK;AAChC,QAAI,MAAM,QAAQ,OAAO,SAAS,CAAC,EAAG,KAAI,KAAK,CAAC;AAAA,EAClD;AACA,SAAO;AACT;AAEA,SAAS,eAAe,QAAyE;AAC/F,SAAO,OAAO,KAAK,UAAU;AAC3B,YAAQ,QAAQ;AAAA,MACd,KAAK;AAAA,MACL,KAAK;AACH,eAAO,IAAI,SAAS,SAAS;AAAA,MAC/B,KAAK;AACH,eAAO,IAAI,SAAS,SAAS,OAAO,IAAI;AAAA,MAC1C,KAAK;AACH,eAAO,IAAI,WAAW,IAAI,YAAY,IAAI,UAAU,IAAI,YAAY;AAAA,MACtE,KAAK,WAAW;AACd,cAAM,MAAM,MAAM,SAAS,OAAO,IAAI,KAAK;AAC3C,eAAO
,aAAa,GAAG,EAAE;AAAA,MAC3B;AAAA,MACA,KAAK,eAAe;AAClB,cAAM,MAAM,MAAM,SAAS,OAAO,IAAI,KAAK;AAC3C,eAAO,aAAa,GAAG,EAAE;AAAA,MAC3B;AAAA,MACA,KAAK,gBAAgB;AACnB,cAAM,MAAM,MAAM,SAAS,OAAO,IAAI,KAAK;AAC3C,eAAO,aAAa,GAAG,EAAE;AAAA,MAC3B;AAAA,MACA,KAAK,gBAAgB;AACnB,eAAO,gBAAgB,GAAG,MAAM,YAAY,IAAI;AAAA,MAClD;AAAA,MACA;AACE,eAAO;AAAA,IACX;AAAA,EACF;AACF;;;AClDA,eAAsB,cACpB,OACA,UAA4B,CAAC,GACH;AAC1B,QAAM,iBAAiB,QAAQ,kBAAkB;AACjD,QAAM,OAAO,QAAQ,QACjB,CAAC,EAAE,OAAO,QAAQ,MAAM,CAAC,KACxB,MAAM,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE;AAE5D,QAAM,WAA+B,CAAC;AACtC,aAAW,EAAE,MAAM,KAAK,MAAM;AAC5B,UAAM,QAAQ,MAAM,UAAU,OAAO,KAAK;AAC1C,UAAM,QAAQ,oBAAI,IAAsD;AACxE,eAAW,KAAK,OAAO;AACrB,YAAM,IAAI,QAAQ,EAAE,IAAI;AACxB,YAAM,MAAM,GAAG,EAAE,QAAQ,IAAI,CAAC;AAC9B,YAAM,SAAS,MAAM,IAAI,GAAG,KAAK,EAAE,OAAO,CAAC,GAAG,SAAS,EAAE;AACzD,aAAO,MAAM,KAAK,CAAC;AACnB,YAAM,IAAI,KAAK,MAAM;AAAA,IACvB;AACA,eAAW,CAAC,KAAK,EAAE,OAAO,SAAS,EAAE,CAAC,KAAK,OAAO;AAChD,UAAI,MAAM,SAAS,eAAgB;AACnC,YAAM,SAAS,CAAC,GAAG,KAAK,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,YAAY,EAAE,SAAS;AAClE,YAAM,QAAQ,OAAO,CAAC,EAAG;AACzB,YAAM,OAAO,OAAO,OAAO,SAAS,CAAC,EAAG;AACxC,eAAS,KAAK;AAAA,QACZ;AAAA,QACA,UAAU,IAAI,MAAM,GAAG,EAAE,CAAC;AAAA,QAC1B,SAAS;AAAA,QACT,aAAa,OAAO;AAAA,QACpB,SAAS,OAAO,IAAI,CAAC,MAAM,EAAE,MAAM;AAAA,QACnC,UAAU,OAAO;AAAA,MACnB,CAAC;AAAA,IACH;AAAA,EACF;AAEA,QAAM,eAAe,IAAI,IAAI,SAAS,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC;AACzD,SAAO;AAAA,IACL;AAAA,IACA,kBAAkB,KAAK,SAAS,IAAI,aAAa,OAAO,KAAK,SAAS;AAAA,IACtE,WAAW,KAAK;AAAA,EAClB;AACF;;;AC5CA,eAAsB,cACpB,OACA,UAA4B,CAAC,GACH;AAC1B,QAAM,OAAO,QAAQ,QAAQ,CAAC,QAAQ,KAAK,KAAK,MAAM,MAAM,SAAS,GAAG,IAAI,CAAC,MAAM,EAAE,KAAK;AAE1F,QAAM,QAA4B,CAAC;AACnC,MAAI,aAAa;AACjB,MAAI,cAAc;AAClB,aAAW,SAAS,MAAM;AACxB,UAAM,QAAQ,MAAM,UAAU,OAAO,KAAK;AAC1C,QAAI,MAAM,WAAW,GAAG;AACtB,YAAM,KAAK,EAAE,OAAO,aAAa,GAAG,YAAY,GAAG,WAAW,EAAE,CAAC;AACjE;AAAA,IACF;AACA,UAAM,OAAO,MAAM,SAAS,OAAO,KAAK;AACxC,QAAI,SAAS;AACb,eAAW,KAAK,OAAO;AACrB,UAAI,EAAE,WAAW,SAAS;AACxB;AACA;AAAA,MACF;AACA,YAAM,WAAW,KAAK,OAAO,CAAC,MAAM,EAAE,YAAY,EAAE,SAAS;AA
C7D,UAAI,QAAQ,aAAa;AACvB,YAAI,CAAC,QAAQ,YAAY,GAAG,EAAE,KAAK,SAAS,CAAC,EAAG;AAAA,MAClD,OAAO;AAGL,cAAM,YAAY,UAAU,EAAE,MAAM;AACpC,cAAM,OAAO,SAAS;AAAA,UAAK,CAAC,MAC1B,EAAE,SAAS;AAAA,YACT,CAAC,MACC,OAAO,EAAE,YAAY,YACrB,aACA,EAAE,QAAQ,SAAS,UAAU,MAAM,GAAG,GAAG,CAAC;AAAA,UAC9C;AAAA,QACF;AACA,YAAI,CAAC,KAAM;AAAA,MACb;AAAA,IACF;AACA,UAAM,YAAY,SAAS,MAAM;AACjC,UAAM,KAAK,EAAE,OAAO,aAAa,QAAQ,YAAY,MAAM,QAAQ,UAAU,CAAC;AAC9E,kBAAc,MAAM;AACpB,mBAAe;AAAA,EACjB;AACA,SAAO,EAAE,OAAO,kBAAkB,aAAa,IAAI,cAAc,aAAa,EAAE;AAClF;AAEA,SAAS,UAAU,GAAoB;AACrC,MAAI,MAAM,QAAQ,MAAM,OAAW,QAAO;AAC1C,MAAI,OAAO,MAAM,SAAU,QAAO;AAClC,MAAI;AACF,WAAO,KAAK,UAAU,CAAC;AAAA,EACzB,QAAQ;AACN,WAAO,OAAO,CAAC;AAAA,EACjB;AACF;","names":[]}
|
|
@@ -1,6 +1,3 @@
|
|
|
1
|
-
import { C as CaptureIntegrityError } from './errors-mje_cKOs.js';
|
|
2
|
-
import { T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
3
|
-
|
|
4
1
|
/**
|
|
5
2
|
* RawProviderSink — first-class persistence for the actual HTTP-level
|
|
6
3
|
* request/response bodies of every LLM provider call.
|
|
@@ -132,80 +129,4 @@ declare class FileSystemRawProviderSink implements RawProviderSink {
|
|
|
132
129
|
*/
|
|
133
130
|
declare function providerFromBaseUrl(baseUrl: string): string;
|
|
134
131
|
|
|
135
|
-
|
|
136
|
-
* Run-completion integrity check — at end of run, verify the expected event
|
|
137
|
-
* types were actually captured. The point is the launch-review failure mode:
|
|
138
|
-
* a run *appears* successful but the raw provider events were never written,
|
|
139
|
-
* so a downstream reviewer can't reconstruct what happened.
|
|
140
|
-
*
|
|
141
|
-
* Pattern:
|
|
142
|
-
*
|
|
143
|
-
* const report = await assertRunCaptured(store, runId, {
|
|
144
|
-
* llmSpansMin: 1,
|
|
145
|
-
* judgeSpansMin: 1,
|
|
146
|
-
* rawSink: providerSink, // must have ≥ 1 event for this run
|
|
147
|
-
* requireRawCoverageOfLlmSpans: true, // every llm span has matching raw events
|
|
148
|
-
* })
|
|
149
|
-
* if (!report.ok) throwIfRunIncomplete(report) // or mark run failed and continue
|
|
150
|
-
*
|
|
151
|
-
* The function is read-only on the store and returns a structured report;
|
|
152
|
-
* the caller chooses the failure mode (throw, mark run failed, log warning).
|
|
153
|
-
* `throwIfRunIncomplete` is the convenient strict mode.
|
|
154
|
-
*/
|
|
155
|
-
|
|
156
|
-
interface RunIntegrityExpectations {
|
|
157
|
-
/** Minimum LLM span count. Default 0 (no requirement). */
|
|
158
|
-
llmSpansMin?: number;
|
|
159
|
-
/** Minimum judge span count. Default 0. */
|
|
160
|
-
judgeSpansMin?: number;
|
|
161
|
-
/** Minimum tool span count. Default 0. */
|
|
162
|
-
toolSpansMin?: number;
|
|
163
|
-
/**
|
|
164
|
-
* Raw provider sink to consult for capture verification. When present,
|
|
165
|
-
* the check requires at least one raw event for the run.
|
|
166
|
-
*/
|
|
167
|
-
rawSink?: RawProviderSink;
|
|
168
|
-
/** Minimum raw provider event count. Default 0; ignored when `rawSink` absent. */
|
|
169
|
-
rawProviderEventsMin?: number;
|
|
170
|
-
/**
|
|
171
|
-
* Every LLM span must have at least one matching raw `request` event
|
|
172
|
-
* (matched by spanId). Catches the common bug where the structured span
|
|
173
|
-
* was emitted but the raw HTTP capture was wired to a different sink.
|
|
174
|
-
*/
|
|
175
|
-
requireRawCoverageOfLlmSpans?: boolean;
|
|
176
|
-
/** Run outcome must be set (not null/undefined). Default false. */
|
|
177
|
-
requireOutcome?: boolean;
|
|
178
|
-
}
|
|
179
|
-
type RunIntegrityIssueCode = 'no_run' | 'missing_llm_spans' | 'missing_judge_spans' | 'missing_tool_spans' | 'missing_raw_events' | 'no_raw_sink' | 'orphan_llm_span' | 'missing_outcome';
|
|
180
|
-
interface RunIntegrityIssue {
|
|
181
|
-
code: RunIntegrityIssueCode;
|
|
182
|
-
message: string;
|
|
183
|
-
detail?: Record<string, unknown>;
|
|
184
|
-
}
|
|
185
|
-
interface RunIntegrityReport {
|
|
186
|
-
ok: boolean;
|
|
187
|
-
runId: string;
|
|
188
|
-
llmSpanCount: number;
|
|
189
|
-
judgeSpanCount: number;
|
|
190
|
-
toolSpanCount: number;
|
|
191
|
-
rawProviderEventCount: number;
|
|
192
|
-
/**
|
|
193
|
-
* Coverage of LLM spans by raw provider events keyed on spanId.
|
|
194
|
-
* `total` is the number of LLM spans; `covered` is the count with at
|
|
195
|
-
* least one matching `request` raw event.
|
|
196
|
-
*/
|
|
197
|
-
rawSpanCoverage: {
|
|
198
|
-
covered: number;
|
|
199
|
-
total: number;
|
|
200
|
-
};
|
|
201
|
-
issues: RunIntegrityIssue[];
|
|
202
|
-
}
|
|
203
|
-
declare class RunIntegrityError extends CaptureIntegrityError {
|
|
204
|
-
readonly report: RunIntegrityReport;
|
|
205
|
-
constructor(report: RunIntegrityReport);
|
|
206
|
-
}
|
|
207
|
-
declare function assertRunCaptured(store: TraceStore, runId: string, expectations?: RunIntegrityExpectations): Promise<RunIntegrityReport>;
|
|
208
|
-
/** Strict mode: throws `RunIntegrityError` when the report isn't ok. */
|
|
209
|
-
declare function throwIfRunIncomplete(report: RunIntegrityReport): void;
|
|
210
|
-
|
|
211
|
-
export { FileSystemRawProviderSink as F, InMemoryRawProviderSink as I, NoopRawProviderSink as N, type ProviderRedactor as P, type RawProviderSink as R, type RunIntegrityExpectations as a, type RunIntegrityReport as b, type FileSystemRawProviderSinkOptions as c, type InMemoryRawProviderSinkOptions as d, type RawProviderDirection as e, type RawProviderEvent as f, type RawProviderSinkFilter as g, RunIntegrityError as h, type RunIntegrityIssue as i, type RunIntegrityIssueCode as j, assertRunCaptured as k, defaultProviderRedactor as l, providerFromBaseUrl as p, throwIfRunIncomplete as t };
|
|
132
|
+
export { FileSystemRawProviderSink as F, InMemoryRawProviderSink as I, NoopRawProviderSink as N, type ProviderRedactor as P, type RawProviderSink as R, type FileSystemRawProviderSinkOptions as a, type InMemoryRawProviderSinkOptions as b, type RawProviderDirection as c, type RawProviderEvent as d, type RawProviderSinkFilter as e, defaultProviderRedactor as f, providerFromBaseUrl as p };
|