@tangle-network/agent-eval 0.24.0 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1 +1 @@
1
- {"version":3,"sources":["../../src/pipelines/budget-breach.ts","../../src/pipelines/failure-cluster.ts","../../src/pipelines/first-divergence.ts","../../src/pipelines/judge-agreement.ts","../../src/pipelines/regression.ts","../../src/pipelines/stuck-loop.ts","../../src/pipelines/tool-waste.ts"],"sourcesContent":["/**\n * BudgetBreachView — aggregates breach events across the corpus.\n *\n * Answers: which dimensions get hit most often? Which scenarios are\n * underbudgeted? Which variants trigger the most breaches?\n */\n\nimport type { BudgetSpec } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\n\nexport interface BudgetBreachFinding {\n runId: string\n scenarioId: string\n variantId?: string\n dimension: keyof BudgetSpec\n limit: number\n consumed: number\n excessRatio: number\n timestamp: number\n}\n\nexport interface BudgetBreachReport {\n findings: BudgetBreachFinding[]\n byDimension: Record<string, number>\n byScenario: Record<string, number>\n byVariant: Record<string, number>\n totalRuns: number\n breachedRunRatio: number\n}\n\nexport async function budgetBreachView(\n store: TraceStore,\n options: { scenarioId?: string; variantId?: string } = {},\n): Promise<BudgetBreachReport> {\n const runs = await store.listRuns({\n scenarioId: options.scenarioId,\n variantId: options.variantId,\n })\n const findings: BudgetBreachFinding[] = []\n const byDimension: Record<string, number> = {}\n const byScenario: Record<string, number> = {}\n const byVariant: Record<string, number> = {}\n\n for (const run of runs) {\n const entries = await store.budget(run.runId)\n for (const e of entries) {\n if (!e.breached) continue\n const excessRatio = e.limit > 0 ? e.consumed / e.limit : Infinity\n findings.push({\n runId: run.runId,\n scenarioId: run.scenarioId,\n variantId: run.variantId,\n dimension: e.dimension,\n limit: e.limit,\n consumed: e.consumed,\n excessRatio,\n timestamp: e.timestamp,\n })\n byDimension[e.dimension] = (byDimension[e.dimension] ?? 0) + 1\n byScenario[run.scenarioId] = (byScenario[run.scenarioId] ?? 0) + 1\n if (run.variantId) byVariant[run.variantId] = (byVariant[run.variantId] ?? 0) + 1\n }\n }\n\n const breachedRuns = new Set(findings.map((f) => f.runId))\n return {\n findings,\n byDimension,\n byScenario,\n byVariant,\n totalRuns: runs.length,\n breachedRunRatio: runs.length > 0 ? breachedRuns.size / runs.length : 0,\n }\n}\n","/**\n * FailureClusterView — groups failed runs by (failureClass, triggerTool,\n * argHash-prefix) so weekly reviews can prioritize the top-N clusters.\n *\n * Each cluster includes: N runs, scenarios affected, representative\n * error message, a proposed mitigation hint (rule → action table).\n */\n\nimport { classifyFailure, DEFAULT_RULES, type FailureRule } from '../failure-taxonomy'\nimport { argHash, toolSpans } from '../trace/query'\nimport type { FailureClass, Span } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\n\nexport interface FailureCluster {\n failureClass: FailureClass\n /** Tool name when the trigger was a tool span, else undefined. */\n toolName?: string\n /** First 16 chars of argHash — clusters similar args. */\n argPrefix?: string\n /**\n * Source dimension when the trigger was a judge span (e.g. `'format'`,\n * `'safety'`, `'correctness'`). Lets cross-template aggregators\n * group failures by the dimension that fired without overloading\n * `argPrefix`. Optional — legacy clusters without this field\n * deserialize cleanly.\n */\n dimension?: string\n runCount: number\n scenarioIds: string[]\n exampleError?: string\n exampleRunId: string\n}\n\nexport interface FailureClusterReport {\n clusters: FailureCluster[]\n totalFailures: number\n totalRuns: number\n}\n\nexport async function failureClusterView(\n store: TraceStore,\n options: { rules?: FailureRule[]; minClusterSize?: number } = {},\n): Promise<FailureClusterReport> {\n const rules = options.rules ?? DEFAULT_RULES\n const minSize = options.minClusterSize ?? 1\n const runs = await store.listRuns()\n\n type Key = string\n const clusters = new Map<Key, FailureCluster>()\n let totalFailures = 0\n\n for (const run of runs) {\n if (run.status === 'completed' && run.outcome?.pass !== false) continue\n totalFailures++\n const spans = await store.spans({ runId: run.runId })\n const events = await store.events({ runId: run.runId })\n const cls = classifyFailure({ run, spans, events }, rules)\n\n let toolName: string | undefined\n let argPrefix: string | undefined\n let dimension: string | undefined\n if (cls.triggerSpanId) {\n const trig = spans.find((s) => s.spanId === cls.triggerSpanId)\n if (trig?.kind === 'tool') {\n toolName = trig.toolName\n argPrefix = argHash(trig.args).slice(0, 16)\n } else if (trig?.kind === 'judge') {\n dimension = trig.dimension\n }\n }\n // Fallback: look at the last errored tool span\n if (!toolName) {\n const ts = await toolSpans(store, run.runId)\n const errored = ts.filter((t) => t.status === 'error').pop()\n if (errored) {\n toolName = errored.toolName\n argPrefix = argHash(errored.args).slice(0, 16)\n }\n }\n // Secondary signal: any judge span on the failed run carries a\n // dimension. Useful when the rule classified by judge score but\n // didn't surface the trigger span (or surfaced a non-judge span).\n if (!dimension) {\n const judge = spans.find((s) => s.kind === 'judge' && typeof s.dimension === 'string')\n if (judge?.kind === 'judge') dimension = judge.dimension\n }\n\n const key = `${cls.failureClass}|${toolName ?? ''}|${argPrefix ?? ''}|${dimension ?? ''}`\n let cluster = clusters.get(key)\n if (!cluster) {\n cluster = {\n failureClass: cls.failureClass,\n toolName,\n argPrefix,\n dimension,\n runCount: 0,\n scenarioIds: [],\n exampleRunId: run.runId,\n exampleError: firstErrorMessage(spans) ?? cls.reason,\n }\n clusters.set(key, cluster)\n }\n cluster.runCount++\n if (!cluster.scenarioIds.includes(run.scenarioId)) cluster.scenarioIds.push(run.scenarioId)\n }\n\n const arr = [...clusters.values()]\n .filter((c) => c.runCount >= minSize)\n .sort((a, b) => b.runCount - a.runCount)\n\n return { clusters: arr, totalFailures, totalRuns: runs.length }\n}\n\nfunction firstErrorMessage(spans: Span[]): string | undefined {\n const errored = spans.find((s) => s.status === 'error')\n return errored?.error\n}\n","/**\n * FirstDivergenceView — aligns two trajectories by step index, reports\n * the first step where they differ.\n *\n * \"Differ\" is configurable — default is (kind, toolName if tool, model\n * if llm). Use this view to attribute \"why is variant B better?\" to a\n * specific step rather than an aggregate mean delta.\n */\n\nimport type { TraceStore } from '../trace/store'\nimport { buildTrajectory, type Trajectory, type TrajectoryStep } from '../trajectory'\n\nexport interface DivergenceReport {\n runA: string\n runB: string\n firstDivergenceIndex: number | null\n aStep?: TrajectoryStep\n bStep?: TrajectoryStep\n reason?: string\n /** Common prefix length (steps that matched). */\n commonPrefixLen: number\n}\n\nexport interface DivergenceOptions {\n /** Returns true if two steps are considered equal. Default: kind + tool/model match. */\n stepEquals?: (a: TrajectoryStep, b: TrajectoryStep) => boolean\n}\n\nexport async function firstDivergenceView(\n store: TraceStore,\n runA: string,\n runB: string,\n options: DivergenceOptions = {},\n): Promise<DivergenceReport> {\n const [a, b] = await Promise.all([buildTrajectory(store, runA), buildTrajectory(store, runB)])\n const eq = options.stepEquals ?? defaultStepEquals\n const minLen = Math.min(a.steps.length, b.steps.length)\n for (let i = 0; i < minLen; i++) {\n const aStep = a.steps[i]!\n const bStep = b.steps[i]!\n if (!eq(aStep, bStep)) {\n return {\n runA,\n runB,\n firstDivergenceIndex: i,\n aStep,\n bStep,\n reason: describeDifference(aStep, bStep),\n commonPrefixLen: i,\n }\n }\n }\n if (a.steps.length === b.steps.length) {\n return { runA, runB, firstDivergenceIndex: null, commonPrefixLen: minLen }\n }\n const longer: Trajectory = a.steps.length > b.steps.length ? a : b\n return {\n runA,\n runB,\n firstDivergenceIndex: minLen,\n aStep: a.steps[minLen],\n bStep: b.steps[minLen],\n reason: `one trajectory has ${longer.steps.length - minLen} more step(s) after index ${minLen - 1}`,\n commonPrefixLen: minLen,\n }\n}\n\nfunction defaultStepEquals(a: TrajectoryStep, b: TrajectoryStep): boolean {\n if (a.span.kind !== b.span.kind) return false\n if (a.span.kind === 'tool' && b.span.kind === 'tool') return a.span.toolName === b.span.toolName\n if (a.span.kind === 'llm' && b.span.kind === 'llm') return a.span.model === b.span.model\n if (a.span.kind === 'judge' && b.span.kind === 'judge')\n return a.span.dimension === b.span.dimension\n return a.span.name === b.span.name\n}\n\nfunction describeDifference(a: TrajectoryStep, b: TrajectoryStep): string {\n if (a.span.kind !== b.span.kind) return `kind ${a.span.kind} vs ${b.span.kind}`\n if (a.span.kind === 'tool' && b.span.kind === 'tool' && a.span.toolName !== b.span.toolName) {\n return `tool ${a.span.toolName} vs ${b.span.toolName}`\n }\n if (a.span.kind === 'llm' && b.span.kind === 'llm' && a.span.model !== b.span.model) {\n return `model ${a.span.model} vs ${b.span.model}`\n }\n return `name \"${a.span.name}\" vs \"${b.span.name}\"`\n}\n","/**\n * JudgeAgreementView — pairwise agreement between judges across the\n * corpus, grouped by dimension.\n *\n * Output drives two workflows:\n * - Judge robustness audit: \"does Claude agree with GPT at κ ≥ 0.6?\"\n * - Calibration tracking: κ vs golden human labels over time (by\n * providing a `humanGoldenJudgeId`).\n */\n\nimport { interRaterReliability } from '../statistics'\nimport type { JudgeSpan } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\n\nexport interface JudgePair {\n judgeA: string\n judgeB: string\n dimension: string\n /** Number of (targetSpanId, dimension) tuples both judges scored. */\n commonItems: number\n pearson: number\n krippendorff: number\n}\n\nexport interface JudgeAgreementReport {\n pairs: JudgePair[]\n dimensions: string[]\n judgeIds: string[]\n}\n\nexport async function judgeAgreementView(store: TraceStore): Promise<JudgeAgreementReport> {\n const all = (await store.spans({ kind: 'judge' })).filter(\n (s): s is JudgeSpan => s.kind === 'judge',\n )\n if (all.length === 0) return { pairs: [], dimensions: [], judgeIds: [] }\n\n const byDimension = new Map<string, JudgeSpan[]>()\n for (const s of all) {\n const arr = byDimension.get(s.dimension) ?? []\n arr.push(s)\n byDimension.set(s.dimension, arr)\n }\n\n const judgeIds = [...new Set(all.map((s) => s.judgeId))].sort()\n const pairs: JudgePair[] = []\n for (const [dim, spans] of byDimension) {\n const byJudge = new Map<string, Map<string, number>>()\n for (const s of spans) {\n const m = byJudge.get(s.judgeId) ?? new Map<string, number>()\n m.set(s.targetSpanId, s.score)\n byJudge.set(s.judgeId, m)\n }\n const judgesHere = [...byJudge.keys()]\n for (let i = 0; i < judgesHere.length; i++) {\n for (let j = i + 1; j < judgesHere.length; j++) {\n const judgeI = judgesHere[i]!\n const judgeJ = judgesHere[j]!\n const a = byJudge.get(judgeI)!\n const b = byJudge.get(judgeJ)!\n const common: Array<[number, number]> = []\n for (const [target, scoreA] of a) {\n const scoreB = b.get(target)\n if (scoreB !== undefined) common.push([scoreA, scoreB])\n }\n if (common.length < 2) continue\n const judgeScores = common.map(\n ([scoreA, scoreB]) =>\n [\n { judgeName: judgeI, dimension: dim, score: scoreA, reasoning: '' },\n { judgeName: judgeJ, dimension: dim, score: scoreB, reasoning: '' },\n ] as const,\n )\n const k = interRaterReliability(\n judgeScores[0]!.map((_, k2) => judgeScores.map((pair) => pair[k2]!)),\n )\n pairs.push({\n judgeA: judgeI,\n judgeB: judgeJ,\n dimension: dim,\n commonItems: common.length,\n pearson: pearson(\n common.map((c) => c[0]),\n common.map((c) => c[1]),\n ),\n krippendorff: k,\n })\n }\n }\n }\n\n return {\n pairs: pairs.sort((a, b) => b.commonItems - a.commonItems),\n dimensions: [...byDimension.keys()].sort(),\n judgeIds,\n }\n}\n\nfunction pearson(a: number[], b: number[]): number {\n if (a.length !== b.length || a.length < 2) return NaN\n const mA = a.reduce((s, v) => s + v, 0) / a.length\n const mB = b.reduce((s, v) => s + v, 0) / b.length\n let num = 0,\n denA = 0,\n denB = 0\n for (let i = 0; i < a.length; i++) {\n const dA = a[i]! - mA\n const dB = b[i]! - mB\n num += dA * dB\n denA += dA * dA\n denB += dB * dB\n }\n if (denA === 0 || denB === 0) return denA === 0 && denB === 0 ? 1 : 0\n return num / Math.sqrt(denA * denB)\n}\n","/**\n * RegressionView — compares a candidate slice to a baseline slice on a\n * named metric. Delegates the statistics (Welch's t-test, Cohen's d,\n * IQR stability) to `baseline.ts`.\n *\n * This is the entry point for CI regression gates: \"given runs tagged\n * release=A and release=B, did any metric regress?\"\n */\n\nimport { type BaselineOptions, type BaselineReport, compareToBaseline } from '../baseline'\nimport { aggregateLlm, llmSpans, runFailureClass } from '../trace/query'\nimport type { Run } from '../trace/schema'\nimport type { RunFilter, TraceStore } from '../trace/store'\n\nexport interface RegressionSpec {\n metric: string\n higherIsBetter: boolean\n /** Extract a scalar from a run. Default extractors handle common metrics. */\n extract?: (run: Run, store: TraceStore) => Promise<number | null>\n}\n\nexport interface RegressionOptions extends BaselineOptions {\n baseline: RunFilter\n candidate: RunFilter\n}\n\nexport async function regressionView(\n store: TraceStore,\n metrics: RegressionSpec[],\n options: RegressionOptions,\n): Promise<BaselineReport> {\n const baselineRuns = await store.listRuns(options.baseline)\n const candidateRuns = await store.listRuns(options.candidate)\n const samples = await Promise.all(\n metrics.map(async (m) => {\n const extract = m.extract ?? defaultExtract(m.metric)\n const baseline = await extractAll(baselineRuns, extract, store)\n const candidate = await extractAll(candidateRuns, extract, store)\n return { metric: m.metric, higherIsBetter: m.higherIsBetter, baseline, candidate }\n }),\n )\n return compareToBaseline(samples, options)\n}\n\nasync function extractAll(\n runs: Run[],\n extract: (r: Run, s: TraceStore) => Promise<number | null>,\n store: TraceStore,\n): Promise<number[]> {\n const out: number[] = []\n for (const r of runs) {\n const v = await extract(r, store)\n if (v !== null && Number.isFinite(v)) out.push(v)\n }\n return out\n}\n\nfunction defaultExtract(metric: string): (run: Run, store: TraceStore) => Promise<number | null> {\n return async (run, store) => {\n switch (metric) {\n case 'score':\n case 'overallScore':\n return run.outcome?.score ?? null\n case 'pass':\n return run.outcome?.pass === true ? 1 : 0\n case 'durationMs':\n return run.endedAt && run.startedAt ? run.endedAt - run.startedAt : null\n case 'costUsd': {\n const llm = await llmSpans(store, run.runId)\n return aggregateLlm(llm).costUsd\n }\n case 'inputTokens': {\n const llm = await llmSpans(store, run.runId)\n return aggregateLlm(llm).inputTokens\n }\n case 'outputTokens': {\n const llm = await llmSpans(store, run.runId)\n return aggregateLlm(llm).outputTokens\n }\n case 'failureClass': {\n return runFailureClass(run) === 'success' ? 1 : 0\n }\n default:\n return null\n }\n }\n}\n","/**\n * StuckLoopView — detects when an agent calls the same tool with the\n * same (or structurally similar) arguments ≥ N times in a short window.\n *\n * Rationale: agents that loop are the number-one production failure\n * mode on long-horizon flows. The view returns (runId, toolName,\n * argHash, occurrences, windowMs) for each detected loop plus a\n * fraction of runs affected.\n */\n\nimport { argHash, toolSpans } from '../trace/query'\nimport type { TraceStore } from '../trace/store'\n\nexport interface StuckLoopFinding {\n runId: string\n toolName: string\n argHash: string\n occurrences: number\n spanIds: string[]\n /** Milliseconds between first and last call in the loop. */\n windowMs: number\n}\n\nexport interface StuckLoopReport {\n findings: StuckLoopFinding[]\n affectedRunRatio: number\n totalRuns: number\n}\n\nexport interface StuckLoopOptions {\n /** Minimum call count to flag a loop (default 3). */\n minOccurrences?: number\n /** Filter to a specific runId; omit to scan the entire corpus. */\n runId?: string\n}\n\nexport async function stuckLoopView(\n store: TraceStore,\n options: StuckLoopOptions = {},\n): Promise<StuckLoopReport> {\n const minOccurrences = options.minOccurrences ?? 3\n const runs = options.runId\n ? [{ runId: options.runId }]\n : (await store.listRuns()).map((r) => ({ runId: r.runId }))\n\n const findings: StuckLoopFinding[] = []\n for (const { runId } of runs) {\n const tools = await toolSpans(store, runId)\n const byKey = new Map<string, { spans: typeof tools; argHash: string }>()\n for (const t of tools) {\n const h = argHash(t.args)\n const key = `${t.toolName}|${h}`\n const bucket = byKey.get(key) ?? { spans: [], argHash: h }\n bucket.spans.push(t)\n byKey.set(key, bucket)\n }\n for (const [key, { spans, argHash: h }] of byKey) {\n if (spans.length < minOccurrences) continue\n const sorted = [...spans].sort((a, b) => a.startedAt - b.startedAt)\n const first = sorted[0]!.startedAt\n const last = sorted[sorted.length - 1]!.startedAt\n findings.push({\n runId,\n toolName: key.split('|')[0]!,\n argHash: h,\n occurrences: sorted.length,\n spanIds: sorted.map((s) => s.spanId),\n windowMs: last - first,\n })\n }\n }\n\n const affectedRuns = new Set(findings.map((f) => f.runId))\n return {\n findings,\n affectedRunRatio: runs.length > 0 ? affectedRuns.size / runs.length : 0,\n totalRuns: runs.length,\n }\n}\n","/**\n * ToolWasteView — fraction of tool calls whose results weren't used\n * downstream. Without a \"used\" signal we fall back to structural\n * proxies: error calls, duplicate calls, and tool calls followed by\n * zero subsequent LLM spans are all considered waste.\n *\n * Consumers can pass a `usageOracle` that inspects a tool span and\n * returns true iff the tool's result appears in a later LLM message,\n * artifact, or state mutation — that's the canonical definition; the\n * default heuristic is a reasonable fallback.\n */\n\nimport { computeToolUseMetrics } from '../tool-use-metrics'\nimport { llmSpans, toolSpans } from '../trace/query'\nimport type { ToolSpan } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\n\nexport interface ToolWasteFinding {\n runId: string\n wastedCalls: number\n totalCalls: number\n wasteRate: number\n}\n\nexport interface ToolWasteReport {\n byRun: ToolWasteFinding[]\n overallWasteRate: number\n}\n\nexport interface ToolWasteOptions {\n runId?: string\n usageOracle?: (tool: ToolSpan, later: { llm: Awaited<ReturnType<typeof llmSpans>> }) => boolean\n}\n\nexport async function toolWasteView(\n store: TraceStore,\n options: ToolWasteOptions = {},\n): Promise<ToolWasteReport> {\n const runs = options.runId ? [options.runId] : (await store.listRuns()).map((r) => r.runId)\n\n const byRun: ToolWasteFinding[] = []\n let totalCalls = 0\n let totalWasted = 0\n for (const runId of runs) {\n const tools = await toolSpans(store, runId)\n if (tools.length === 0) {\n byRun.push({ runId, wastedCalls: 0, totalCalls: 0, wasteRate: 0 })\n continue\n }\n const llms = await llmSpans(store, runId)\n let wasted = 0\n for (const t of tools) {\n if (t.status === 'error') {\n wasted++\n continue\n }\n const laterLlm = llms.filter((l) => l.startedAt > t.startedAt)\n if (options.usageOracle) {\n if (!options.usageOracle(t, { llm: laterLlm })) wasted++\n } else {\n // Default heuristic: a tool whose result is NOT mentioned in any\n // later LLM input message is likely wasted.\n const resultStr = stringify(t.result)\n const used = laterLlm.some((l) =>\n l.messages.some(\n (m) =>\n typeof m.content === 'string' &&\n resultStr &&\n m.content.includes(resultStr.slice(0, 120)),\n ),\n )\n if (!used) wasted++\n }\n }\n const wasteRate = wasted / tools.length\n byRun.push({ runId, wastedCalls: wasted, totalCalls: tools.length, wasteRate })\n totalCalls += tools.length\n totalWasted += wasted\n }\n return { byRun, overallWasteRate: totalCalls > 0 ? totalWasted / totalCalls : 0 }\n}\n\nfunction stringify(v: unknown): string {\n if (v === null || v === undefined) return ''\n if (typeof v === 'string') return v\n try {\n return JSON.stringify(v)\n } catch {\n return String(v)\n }\n}\n\n// Re-export for convenience in consumers that want both descriptive and usage metrics.\nexport { computeToolUseMetrics }\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;AA8BA,eAAsB,iBACpB,OACA,UAAuD,CAAC,GAC3B;AAC7B,QAAM,OAAO,MAAM,MAAM,SAAS;AAAA,IAChC,YAAY,QAAQ;AAAA,IACpB,WAAW,QAAQ;AAAA,EACrB,CAAC;AACD,QAAM,WAAkC,CAAC;AACzC,QAAM,cAAsC,CAAC;AAC7C,QAAM,aAAqC,CAAC;AAC5C,QAAM,YAAoC,CAAC;AAE3C,aAAW,OAAO,MAAM;AACtB,UAAM,UAAU,MAAM,MAAM,OAAO,IAAI,KAAK;AAC5C,eAAW,KAAK,SAAS;AACvB,UAAI,CAAC,EAAE,SAAU;AACjB,YAAM,cAAc,EAAE,QAAQ,IAAI,EAAE,WAAW,EAAE,QAAQ;AACzD,eAAS,KAAK;AAAA,QACZ,OAAO,IAAI;AAAA,QACX,YAAY,IAAI;AAAA,QAChB,WAAW,IAAI;AAAA,QACf,WAAW,EAAE;AAAA,QACb,OAAO,EAAE;AAAA,QACT,UAAU,EAAE;AAAA,QACZ;AAAA,QACA,WAAW,EAAE;AAAA,MACf,CAAC;AACD,kBAAY,EAAE,SAAS,KAAK,YAAY,EAAE,SAAS,KAAK,KAAK;AAC7D,iBAAW,IAAI,UAAU,KAAK,WAAW,IAAI,UAAU,KAAK,KAAK;AACjE,UAAI,IAAI,UAAW,WAAU,IAAI,SAAS,KAAK,UAAU,IAAI,SAAS,KAAK,KAAK;AAAA,IAClF;AAAA,EACF;AAEA,QAAM,eAAe,IAAI,IAAI,SAAS,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC;AACzD,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,WAAW,KAAK;AAAA,IAChB,kBAAkB,KAAK,SAAS,IAAI,aAAa,OAAO,KAAK,SAAS;AAAA,EACxE;AACF;;;AClCA,eAAsB,mBACpB,OACA,UAA8D,CAAC,GAChC;AAC/B,QAAM,QAAQ,QAAQ,SAAS;AAC/B,QAAM,UAAU,QAAQ,kBAAkB;AAC1C,QAAM,OAAO,MAAM,MAAM,SAAS;AAGlC,QAAM,WAAW,oBAAI,IAAyB;AAC9C,MAAI,gBAAgB;AAEpB,aAAW,OAAO,MAAM;AACtB,QAAI,IAAI,WAAW,eAAe,IAAI,SAAS,SAAS,MAAO;AAC/D;AACA,UAAM,QAAQ,MAAM,MAAM,MAAM,EAAE,OAAO,IAAI,MAAM,CAAC;AACpD,UAAM,SAAS,MAAM,MAAM,OAAO,EAAE,OAAO,IAAI,MAAM,CAAC;AACtD,UAAM,MAAM,gBAAgB,EAAE,KAAK,OAAO,OAAO,GAAG,KAAK;AAEzD,QAAI;AACJ,QAAI;AACJ,QAAI;AACJ,QAAI,IAAI,eAAe;AACrB,YAAM,OAAO,MAAM,KAAK,CAAC,MAAM,EAAE,WAAW,IAAI,aAAa;AAC7D,UAAI,MAAM,SAAS,QAAQ;AACzB,mBAAW,KAAK;AAChB,oBAAY,QAAQ,KAAK,IAAI,EAAE,MAAM,GAAG,EAAE;AAAA,MAC5C,WAAW,MAAM,SAAS,SAAS;AACjC,oBAAY,KAAK;AAAA,MACnB;AAAA,IACF;AAEA,QAAI,CAAC,UAAU;AACb,YAAM,KAAK,MAAM,UAAU,OAAO,IAAI,KAAK;AAC3C,YAAM,UAAU,GAAG,OAAO,CAAC,MAAM,EAAE,WAAW,OAAO,EAAE,IAAI;AAC3D,UAAI,SAAS;AACX,mBAAW,QAAQ;AACnB,oBAAY,QAAQ,QAAQ,IAAI,EAAE,MAAM,GAAG,EAAE;AAAA,MAC/C;AAAA,IACF;AAIA,QAAI,CAAC,WAAW;AACd,YAAM,QAAQ,MAAM,KAAK,CAAC,MAAM,EAAE,SAAS,WAAW,OAAO,EAAE,cAAc,QAAQ;AACrF,UAAI,OAAO,SAAS,QAAS,aAAY,MAAM;AAAA,IACjD;AAEA,UAAM,MAAM,GAAG,IAAI,YAAY,IAAI,YAAY,EAAE,IAAI,aAAa,EAAE,IAAI,aAAa,EAAE;AACvF,QAAI,UAAU,SAAS,IAAI,GAAG;AAC9B,QAAI,CAAC,SAAS;AACZ,gBAAU;AAAA,QACR,cAAc,IAAI;AAAA,QAClB;AAAA,QACA;AAAA,QACA;AAAA,QACA,UAAU;AAAA,QACV,aAAa,CAAC;AAAA,QACd,cAAc,IAAI;AAAA,QAClB,cAAc,kBAAkB,KAAK,KAAK,IAAI;AAAA,MAChD;AACA,eAAS,IAAI,KAAK,OAAO;AAAA,IAC3B;AACA,YAAQ;AACR,QAAI,CAAC,QAAQ,YAAY,SAAS,IAAI,UAAU,EAAG,SAAQ,YAAY,KAAK,IAAI,UAAU;AAAA,EAC5F;AAEA,QAAM,MAAM,CAAC,GAAG,SAAS,OAAO,CAAC,EAC9B,OAAO,CAAC,MAAM,EAAE,YAAY,OAAO,EACnC,KAAK,CAAC,GAAG,MAAM,EAAE,WAAW,EAAE,QAAQ;AAEzC,SAAO,EAAE,UAAU,KAAK,eAAe,WAAW,KAAK,OAAO;AAChE;AAEA,SAAS,kBAAkB,OAAmC;AAC5D,QAAM,UAAU,MAAM,KAAK,CAAC,MAAM,EAAE,WAAW,OAAO;AACtD,SAAO,SAAS;AAClB;;;ACxFA,eAAsB,oBACpB,OACA,MACA,MACA,UAA6B,CAAC,GACH;AAC3B,QAAM,CAAC,GAAG,CAAC,IAAI,MAAM,QAAQ,IAAI,CAAC,gBAAgB,OAAO,IAAI,GAAG,gBAAgB,OAAO,IAAI,CAAC,CAAC;AAC7F,QAAM,KAAK,QAAQ,cAAc;AACjC,QAAM,SAAS,KAAK,IAAI,EAAE,MAAM,QAAQ,EAAE,MAAM,MAAM;AACtD,WAAS,IAAI,GAAG,IAAI,QAAQ,KAAK;AAC/B,UAAM,QAAQ,EAAE,MAAM,CAAC;AACvB,UAAM,QAAQ,EAAE,MAAM,CAAC;AACvB,QAAI,CAAC,GAAG,OAAO,KAAK,GAAG;AACrB,aAAO;AAAA,QACL;AAAA,QACA;AAAA,QACA,sBAAsB;AAAA,QACtB;AAAA,QACA;AAAA,QACA,QAAQ,mBAAmB,OAAO,KAAK;AAAA,QACvC,iBAAiB;AAAA,MACnB;AAAA,IACF;AAAA,EACF;AACA,MAAI,EAAE,MAAM,WAAW,EAAE,MAAM,QAAQ;AACrC,WAAO,EAAE,MAAM,MAAM,sBAAsB,MAAM,iBAAiB,OAAO;AAAA,EAC3E;AACA,QAAM,SAAqB,EAAE,MAAM,SAAS,EAAE,MAAM,SAAS,IAAI;AACjE,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,sBAAsB;AAAA,IACtB,OAAO,EAAE,MAAM,MAAM;AAAA,IACrB,OAAO,EAAE,MAAM,MAAM;AAAA,IACrB,QAAQ,sBAAsB,OAAO,MAAM,SAAS,MAAM,6BAA6B,SAAS,CAAC;AAAA,IACjG,iBAAiB;AAAA,EACnB;AACF;AAEA,SAAS,kBAAkB,GAAmB,GAA4B;AACxE,MAAI,EAAE,KAAK,SAAS,EAAE,KAAK,KAAM,QAAO;AACxC,MAAI,EAAE,KAAK,SAAS,UAAU,EAAE,KAAK,SAAS,OAAQ,QAAO,EAAE,KAAK,aAAa,EAAE,KAAK;AACxF,MAAI,EAAE,KAAK,SAAS,SAAS,EAAE,KAAK,SAAS,MAAO,QAAO,EAAE,KAAK,UAAU,EAAE,KAAK;AACnF,MAAI,EAAE,KAAK,SAAS,WAAW,EAAE,KAAK,SAAS;AAC7C,WAAO,EAAE,KAAK,cAAc,EAAE,KAAK;AACrC,SAAO,EAAE,KAAK,SAAS,EAAE,KAAK;AAChC;AAEA,SAAS,mBAAmB,GAAmB,GAA2B;AACxE,MAAI,EAAE,KAAK,SAAS,EAAE,KAAK,KAAM,QAAO,QAAQ,EAAE,KAAK,IAAI,OAAO,EAAE,KAAK,IAAI;AAC7E,MAAI,EAAE,KAAK,SAAS,UAAU,EAAE,KAAK,SAAS,UAAU,EAAE,KAAK,aAAa,EAAE,KAAK,UAAU;AAC3F,WAAO,QAAQ,EAAE,KAAK,QAAQ,OAAO,EAAE,KAAK,QAAQ;AAAA,EACtD;AACA,MAAI,EAAE,KAAK,SAAS,SAAS,EAAE,KAAK,SAAS,SAAS,EAAE,KAAK,UAAU,EAAE,KAAK,OAAO;AACnF,WAAO,SAAS,EAAE,KAAK,KAAK,OAAO,EAAE,KAAK,KAAK;AAAA,EACjD;AACA,SAAO,SAAS,EAAE,KAAK,IAAI,SAAS,EAAE,KAAK,IAAI;AACjD;;;ACvDA,eAAsB,mBAAmB,OAAkD;AACzF,QAAM,OAAO,MAAM,MAAM,MAAM,EAAE,MAAM,QAAQ,CAAC,GAAG;AAAA,IACjD,CAAC,MAAsB,EAAE,SAAS;AAAA,EACpC;AACA,MAAI,IAAI,WAAW,EAAG,QAAO,EAAE,OAAO,CAAC,GAAG,YAAY,CAAC,GAAG,UAAU,CAAC,EAAE;AAEvE,QAAM,cAAc,oBAAI,IAAyB;AACjD,aAAW,KAAK,KAAK;AACnB,UAAM,MAAM,YAAY,IAAI,EAAE,SAAS,KAAK,CAAC;AAC7C,QAAI,KAAK,CAAC;AACV,gBAAY,IAAI,EAAE,WAAW,GAAG;AAAA,EAClC;AAEA,QAAM,WAAW,CAAC,GAAG,IAAI,IAAI,IAAI,IAAI,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,EAAE,KAAK;AAC9D,QAAM,QAAqB,CAAC;AAC5B,aAAW,CAAC,KAAK,KAAK,KAAK,aAAa;AACtC,UAAM,UAAU,oBAAI,IAAiC;AACrD,eAAW,KAAK,OAAO;AACrB,YAAM,IAAI,QAAQ,IAAI,EAAE,OAAO,KAAK,oBAAI,IAAoB;AAC5D,QAAE,IAAI,EAAE,cAAc,EAAE,KAAK;AAC7B,cAAQ,IAAI,EAAE,SAAS,CAAC;AAAA,IAC1B;AACA,UAAM,aAAa,CAAC,GAAG,QAAQ,KAAK,CAAC;AACrC,aAAS,IAAI,GAAG,IAAI,WAAW,QAAQ,KAAK;AAC1C,eAAS,IAAI,IAAI,GAAG,IAAI,WAAW,QAAQ,KAAK;AAC9C,cAAM,SAAS,WAAW,CAAC;AAC3B,cAAM,SAAS,WAAW,CAAC;AAC3B,cAAM,IAAI,QAAQ,IAAI,MAAM;AAC5B,cAAM,IAAI,QAAQ,IAAI,MAAM;AAC5B,cAAM,SAAkC,CAAC;AACzC,mBAAW,CAAC,QAAQ,MAAM,KAAK,GAAG;AAChC,gBAAM,SAAS,EAAE,IAAI,MAAM;AAC3B,cAAI,WAAW,OAAW,QAAO,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,QACxD;AACA,YAAI,OAAO,SAAS,EAAG;AACvB,cAAM,cAAc,OAAO;AAAA,UACzB,CAAC,CAAC,QAAQ,MAAM,MACd;AAAA,YACE,EAAE,WAAW,QAAQ,WAAW,KAAK,OAAO,QAAQ,WAAW,GAAG;AAAA,YAClE,EAAE,WAAW,QAAQ,WAAW,KAAK,OAAO,QAAQ,WAAW,GAAG;AAAA,UACpE;AAAA,QACJ;AACA,cAAM,IAAI;AAAA,UACR,YAAY,CAAC,EAAG,IAAI,CAAC,GAAG,OAAO,YAAY,IAAI,CAAC,SAAS,KAAK,EAAE,CAAE,CAAC;AAAA,QACrE;AACA,cAAM,KAAK;AAAA,UACT,QAAQ;AAAA,UACR,QAAQ;AAAA,UACR,WAAW;AAAA,UACX,aAAa,OAAO;AAAA,UACpB,SAAS;AAAA,YACP,OAAO,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC;AAAA,YACtB,OAAO,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC;AAAA,UACxB;AAAA,UACA,cAAc;AAAA,QAChB,CAAC;AAAA,MACH;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AAAA,IACL,OAAO,MAAM,KAAK,CAAC,GAAG,MAAM,EAAE,cAAc,EAAE,WAAW;AAAA,IACzD,YAAY,CAAC,GAAG,YAAY,KAAK,CAAC,EAAE,KAAK;AAAA,IACzC;AAAA,EACF;AACF;AAEA,SAAS,QAAQ,GAAa,GAAqB;AACjD,MAAI,EAAE,WAAW,EAAE,UAAU,EAAE,SAAS,EAAG,QAAO;AAClD,QAAM,KAAK,EAAE,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,EAAE;AAC5C,QAAM,KAAK,EAAE,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,EAAE;AAC5C,MAAI,MAAM,GACR,OAAO,GACP,OAAO;AACT,WAAS,IAAI,GAAG,IAAI,EAAE,QAAQ,KAAK;AACjC,UAAM,KAAK,EAAE,CAAC,IAAK;AACnB,UAAM,KAAK,EAAE,CAAC,IAAK;AACnB,WAAO,KAAK;AACZ,YAAQ,KAAK;AACb,YAAQ,KAAK;AAAA,EACf;AACA,MAAI,SAAS,KAAK,SAAS,EAAG,QAAO,SAAS,KAAK,SAAS,IAAI,IAAI;AACpE,SAAO,MAAM,KAAK,KAAK,OAAO,IAAI;AACpC;;;ACvFA,eAAsB,eACpB,OACA,SACA,SACyB;AACzB,QAAM,eAAe,MAAM,MAAM,SAAS,QAAQ,QAAQ;AAC1D,QAAM,gBAAgB,MAAM,MAAM,SAAS,QAAQ,SAAS;AAC5D,QAAM,UAAU,MAAM,QAAQ;AAAA,IAC5B,QAAQ,IAAI,OAAO,MAAM;AACvB,YAAM,UAAU,EAAE,WAAW,eAAe,EAAE,MAAM;AACpD,YAAM,WAAW,MAAM,WAAW,cAAc,SAAS,KAAK;AAC9D,YAAM,YAAY,MAAM,WAAW,eAAe,SAAS,KAAK;AAChE,aAAO,EAAE,QAAQ,EAAE,QAAQ,gBAAgB,EAAE,gBAAgB,UAAU,UAAU;AAAA,IACnF,CAAC;AAAA,EACH;AACA,SAAO,kBAAkB,SAAS,OAAO;AAC3C;AAEA,eAAe,WACb,MACA,SACA,OACmB;AACnB,QAAM,MAAgB,CAAC;AACvB,aAAW,KAAK,MAAM;AACpB,UAAM,IAAI,MAAM,QAAQ,GAAG,KAAK;AAChC,QAAI,MAAM,QAAQ,OAAO,SAAS,CAAC,EAAG,KAAI,KAAK,CAAC;AAAA,EAClD;AACA,SAAO;AACT;AAEA,SAAS,eAAe,QAAyE;AAC/F,SAAO,OAAO,KAAK,UAAU;AAC3B,YAAQ,QAAQ;AAAA,MACd,KAAK;AAAA,MACL,KAAK;AACH,eAAO,IAAI,SAAS,SAAS;AAAA,MAC/B,KAAK;AACH,eAAO,IAAI,SAAS,SAAS,OAAO,IAAI;AAAA,MAC1C,KAAK;AACH,eAAO,IAAI,WAAW,IAAI,YAAY,IAAI,UAAU,IAAI,YAAY;AAAA,MACtE,KAAK,WAAW;AACd,cAAM,MAAM,MAAM,SAAS,OAAO,IAAI,KAAK;AAC3C,eAAO,aAAa,GAAG,EAAE;AAAA,MAC3B;AAAA,MACA,KAAK,eAAe;AAClB,cAAM,MAAM,MAAM,SAAS,OAAO,IAAI,KAAK;AAC3C,eAAO,aAAa,GAAG,EAAE;AAAA,MAC3B;AAAA,MACA,KAAK,gBAAgB;AACnB,cAAM,MAAM,MAAM,SAAS,OAAO,IAAI,KAAK;AAC3C,eAAO,aAAa,GAAG,EAAE;AAAA,MAC3B;AAAA,MACA,KAAK,gBAAgB;AACnB,eAAO,gBAAgB,GAAG,MAAM,YAAY,IAAI;AAAA,MAClD;AAAA,MACA;AACE,eAAO;AAAA,IACX;AAAA,EACF;AACF;;;AClDA,eAAsB,cACpB,OACA,UAA4B,CAAC,GACH;AAC1B,QAAM,iBAAiB,QAAQ,kBAAkB;AACjD,QAAM,OAAO,QAAQ,QACjB,CAAC,EAAE,OAAO,QAAQ,MAAM,CAAC,KACxB,MAAM,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE;AAE5D,QAAM,WAA+B,CAAC;AACtC,aAAW,EAAE,MAAM,KAAK,MAAM;AAC5B,UAAM,QAAQ,MAAM,UAAU,OAAO,KAAK;AAC1C,UAAM,QAAQ,oBAAI,IAAsD;AACxE,eAAW,KAAK,OAAO;AACrB,YAAM,IAAI,QAAQ,EAAE,IAAI;AACxB,YAAM,MAAM,GAAG,EAAE,QAAQ,IAAI,CAAC;AAC9B,YAAM,SAAS,MAAM,IAAI,GAAG,KAAK,EAAE,OAAO,CAAC,GAAG,SAAS,EAAE;AACzD,aAAO,MAAM,KAAK,CAAC;AACnB,YAAM,IAAI,KAAK,MAAM;AAAA,IACvB;AACA,eAAW,CAAC,KAAK,EAAE,OAAO,SAAS,EAAE,CAAC,KAAK,OAAO;AAChD,UAAI,MAAM,SAAS,eAAgB;AACnC,YAAM,SAAS,CAAC,GAAG,KAAK,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,YAAY,EAAE,SAAS;AAClE,YAAM,QAAQ,OAAO,CAAC,EAAG;AACzB,YAAM,OAAO,OAAO,OAAO,SAAS,CAAC,EAAG;AACxC,eAAS,KAAK;AAAA,QACZ;AAAA,QACA,UAAU,IAAI,MAAM,GAAG,EAAE,CAAC;AAAA,QAC1B,SAAS;AAAA,QACT,aAAa,OAAO;AAAA,QACpB,SAAS,OAAO,IAAI,CAAC,MAAM,EAAE,MAAM;AAAA,QACnC,UAAU,OAAO;AAAA,MACnB,CAAC;AAAA,IACH;AAAA,EACF;AAEA,QAAM,eAAe,IAAI,IAAI,SAAS,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC;AACzD,SAAO;AAAA,IACL;AAAA,IACA,kBAAkB,KAAK,SAAS,IAAI,aAAa,OAAO,KAAK,SAAS;AAAA,IACtE,WAAW,KAAK;AAAA,EAClB;AACF;;;AC5CA,eAAsB,cACpB,OACA,UAA4B,CAAC,GACH;AAC1B,QAAM,OAAO,QAAQ,QAAQ,CAAC,QAAQ,KAAK,KAAK,MAAM,MAAM,SAAS,GAAG,IAAI,CAAC,MAAM,EAAE,KAAK;AAE1F,QAAM,QAA4B,CAAC;AACnC,MAAI,aAAa;AACjB,MAAI,cAAc;AAClB,aAAW,SAAS,MAAM;AACxB,UAAM,QAAQ,MAAM,UAAU,OAAO,KAAK;AAC1C,QAAI,MAAM,WAAW,GAAG;AACtB,YAAM,KAAK,EAAE,OAAO,aAAa,GAAG,YAAY,GAAG,WAAW,EAAE,CAAC;AACjE;AAAA,IACF;AACA,UAAM,OAAO,MAAM,SAAS,OAAO,KAAK;AACxC,QAAI,SAAS;AACb,eAAW,KAAK,OAAO;AACrB,UAAI,EAAE,WAAW,SAAS;AACxB;AACA;AAAA,MACF;AACA,YAAM,WAAW,KAAK,OAAO,CAAC,MAAM,EAAE,YAAY,EAAE,SAAS;AAC7D,UAAI,QAAQ,aAAa;AACvB,YAAI,CAAC,QAAQ,YAAY,GAAG,EAAE,KAAK,SAAS,CAAC,EAAG;AAAA,MAClD,OAAO;AAGL,cAAM,YAAY,UAAU,EAAE,MAAM;AACpC,cAAM,OAAO,SAAS;AAAA,UAAK,CAAC,MAC1B,EAAE,SAAS;AAAA,YACT,CAAC,MACC,OAAO,EAAE,YAAY,YACrB,aACA,EAAE,QAAQ,SAAS,UAAU,MAAM,GAAG,GAAG,CAAC;AAAA,UAC9C;AAAA,QACF;AACA,YAAI,CAAC,KAAM;AAAA,MACb;AAAA,IACF;AACA,UAAM,YAAY,SAAS,MAAM;AACjC,UAAM,KAAK,EAAE,OAAO,aAAa,QAAQ,YAAY,MAAM,QAAQ,UAAU,CAAC;AAC9E,kBAAc,MAAM;AACpB,mBAAe;AAAA,EACjB;AACA,SAAO,EAAE,OAAO,kBAAkB,aAAa,IAAI,cAAc,aAAa,EAAE;AAClF;AAEA,SAAS,UAAU,GAAoB;AACrC,MAAI,MAAM,QAAQ,MAAM,OAAW,QAAO;AAC1C,MAAI,OAAO,MAAM,SAAU,QAAO;AAClC,MAAI;AACF,WAAO,KAAK,UAAU,CAAC;AAAA,EACzB,QAAQ;AACN,WAAO,OAAO,CAAC;AAAA,EACjB;AACF;","names":[]}
1
+ {"version":3,"sources":["../../src/pipelines/budget-breach.ts","../../src/pipelines/first-divergence.ts","../../src/pipelines/judge-agreement.ts","../../src/pipelines/regression.ts","../../src/pipelines/stuck-loop.ts","../../src/pipelines/tool-waste.ts"],"sourcesContent":["/**\n * BudgetBreachView — aggregates breach events across the corpus.\n *\n * Answers: which dimensions get hit most often? Which scenarios are\n * underbudgeted? Which variants trigger the most breaches?\n */\n\nimport type { BudgetSpec } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\n\nexport interface BudgetBreachFinding {\n runId: string\n scenarioId: string\n variantId?: string\n dimension: keyof BudgetSpec\n limit: number\n consumed: number\n excessRatio: number\n timestamp: number\n}\n\nexport interface BudgetBreachReport {\n findings: BudgetBreachFinding[]\n byDimension: Record<string, number>\n byScenario: Record<string, number>\n byVariant: Record<string, number>\n totalRuns: number\n breachedRunRatio: number\n}\n\nexport async function budgetBreachView(\n store: TraceStore,\n options: { scenarioId?: string; variantId?: string } = {},\n): Promise<BudgetBreachReport> {\n const runs = await store.listRuns({\n scenarioId: options.scenarioId,\n variantId: options.variantId,\n })\n const findings: BudgetBreachFinding[] = []\n const byDimension: Record<string, number> = {}\n const byScenario: Record<string, number> = {}\n const byVariant: Record<string, number> = {}\n\n for (const run of runs) {\n const entries = await store.budget(run.runId)\n for (const e of entries) {\n if (!e.breached) continue\n const excessRatio = e.limit > 0 ? e.consumed / e.limit : Infinity\n findings.push({\n runId: run.runId,\n scenarioId: run.scenarioId,\n variantId: run.variantId,\n dimension: e.dimension,\n limit: e.limit,\n consumed: e.consumed,\n excessRatio,\n timestamp: e.timestamp,\n })\n byDimension[e.dimension] = (byDimension[e.dimension] ?? 0) + 1\n byScenario[run.scenarioId] = (byScenario[run.scenarioId] ?? 0) + 1\n if (run.variantId) byVariant[run.variantId] = (byVariant[run.variantId] ?? 0) + 1\n }\n }\n\n const breachedRuns = new Set(findings.map((f) => f.runId))\n return {\n findings,\n byDimension,\n byScenario,\n byVariant,\n totalRuns: runs.length,\n breachedRunRatio: runs.length > 0 ? breachedRuns.size / runs.length : 0,\n }\n}\n","/**\n * FirstDivergenceView — aligns two trajectories by step index, reports\n * the first step where they differ.\n *\n * \"Differ\" is configurable — default is (kind, toolName if tool, model\n * if llm). Use this view to attribute \"why is variant B better?\" to a\n * specific step rather than an aggregate mean delta.\n */\n\nimport type { TraceStore } from '../trace/store'\nimport { buildTrajectory, type Trajectory, type TrajectoryStep } from '../trajectory'\n\nexport interface DivergenceReport {\n runA: string\n runB: string\n firstDivergenceIndex: number | null\n aStep?: TrajectoryStep\n bStep?: TrajectoryStep\n reason?: string\n /** Common prefix length (steps that matched). */\n commonPrefixLen: number\n}\n\nexport interface DivergenceOptions {\n /** Returns true if two steps are considered equal. Default: kind + tool/model match. */\n stepEquals?: (a: TrajectoryStep, b: TrajectoryStep) => boolean\n}\n\nexport async function firstDivergenceView(\n store: TraceStore,\n runA: string,\n runB: string,\n options: DivergenceOptions = {},\n): Promise<DivergenceReport> {\n const [a, b] = await Promise.all([buildTrajectory(store, runA), buildTrajectory(store, runB)])\n const eq = options.stepEquals ?? defaultStepEquals\n const minLen = Math.min(a.steps.length, b.steps.length)\n for (let i = 0; i < minLen; i++) {\n const aStep = a.steps[i]!\n const bStep = b.steps[i]!\n if (!eq(aStep, bStep)) {\n return {\n runA,\n runB,\n firstDivergenceIndex: i,\n aStep,\n bStep,\n reason: describeDifference(aStep, bStep),\n commonPrefixLen: i,\n }\n }\n }\n if (a.steps.length === b.steps.length) {\n return { runA, runB, firstDivergenceIndex: null, commonPrefixLen: minLen }\n }\n const longer: Trajectory = a.steps.length > b.steps.length ? a : b\n return {\n runA,\n runB,\n firstDivergenceIndex: minLen,\n aStep: a.steps[minLen],\n bStep: b.steps[minLen],\n reason: `one trajectory has ${longer.steps.length - minLen} more step(s) after index ${minLen - 1}`,\n commonPrefixLen: minLen,\n }\n}\n\nfunction defaultStepEquals(a: TrajectoryStep, b: TrajectoryStep): boolean {\n if (a.span.kind !== b.span.kind) return false\n if (a.span.kind === 'tool' && b.span.kind === 'tool') return a.span.toolName === b.span.toolName\n if (a.span.kind === 'llm' && b.span.kind === 'llm') return a.span.model === b.span.model\n if (a.span.kind === 'judge' && b.span.kind === 'judge')\n return a.span.dimension === b.span.dimension\n return a.span.name === b.span.name\n}\n\nfunction describeDifference(a: TrajectoryStep, b: TrajectoryStep): string {\n if (a.span.kind !== b.span.kind) return `kind ${a.span.kind} vs ${b.span.kind}`\n if (a.span.kind === 'tool' && b.span.kind === 'tool' && a.span.toolName !== b.span.toolName) {\n return `tool ${a.span.toolName} vs ${b.span.toolName}`\n }\n if (a.span.kind === 'llm' && b.span.kind === 'llm' && a.span.model !== b.span.model) {\n return `model ${a.span.model} vs ${b.span.model}`\n }\n return `name \"${a.span.name}\" vs \"${b.span.name}\"`\n}\n","/**\n * JudgeAgreementView — pairwise agreement between judges across the\n * corpus, grouped by dimension.\n *\n * Output drives two workflows:\n * - Judge robustness audit: \"does Claude agree with GPT at κ ≥ 0.6?\"\n * - Calibration tracking: κ vs golden human labels over time (by\n * providing a `humanGoldenJudgeId`).\n */\n\nimport { interRaterReliability } from '../statistics'\nimport type { JudgeSpan } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\n\nexport interface JudgePair {\n judgeA: string\n judgeB: string\n dimension: string\n /** Number of (targetSpanId, dimension) tuples both judges scored. */\n commonItems: number\n pearson: number\n krippendorff: number\n}\n\nexport interface JudgeAgreementReport {\n pairs: JudgePair[]\n dimensions: string[]\n judgeIds: string[]\n}\n\nexport async function judgeAgreementView(store: TraceStore): Promise<JudgeAgreementReport> {\n const all = (await store.spans({ kind: 'judge' })).filter(\n (s): s is JudgeSpan => s.kind === 'judge',\n )\n if (all.length === 0) return { pairs: [], dimensions: [], judgeIds: [] }\n\n const byDimension = new Map<string, JudgeSpan[]>()\n for (const s of all) {\n const arr = byDimension.get(s.dimension) ?? []\n arr.push(s)\n byDimension.set(s.dimension, arr)\n }\n\n const judgeIds = [...new Set(all.map((s) => s.judgeId))].sort()\n const pairs: JudgePair[] = []\n for (const [dim, spans] of byDimension) {\n const byJudge = new Map<string, Map<string, number>>()\n for (const s of spans) {\n const m = byJudge.get(s.judgeId) ?? new Map<string, number>()\n m.set(s.targetSpanId, s.score)\n byJudge.set(s.judgeId, m)\n }\n const judgesHere = [...byJudge.keys()]\n for (let i = 0; i < judgesHere.length; i++) {\n for (let j = i + 1; j < judgesHere.length; j++) {\n const judgeI = judgesHere[i]!\n const judgeJ = judgesHere[j]!\n const a = byJudge.get(judgeI)!\n const b = byJudge.get(judgeJ)!\n const common: Array<[number, number]> = []\n for (const [target, scoreA] of a) {\n const scoreB = b.get(target)\n if (scoreB !== undefined) common.push([scoreA, scoreB])\n }\n if (common.length < 2) continue\n const judgeScores = common.map(\n ([scoreA, scoreB]) =>\n [\n { judgeName: judgeI, dimension: dim, score: scoreA, reasoning: '' },\n { judgeName: judgeJ, dimension: dim, score: scoreB, reasoning: '' },\n ] as const,\n )\n const k = interRaterReliability(\n judgeScores[0]!.map((_, k2) => judgeScores.map((pair) => pair[k2]!)),\n )\n pairs.push({\n judgeA: judgeI,\n judgeB: judgeJ,\n dimension: dim,\n commonItems: common.length,\n pearson: pearson(\n common.map((c) => c[0]),\n common.map((c) => c[1]),\n ),\n krippendorff: k,\n })\n }\n }\n }\n\n return {\n pairs: pairs.sort((a, b) => b.commonItems - a.commonItems),\n dimensions: [...byDimension.keys()].sort(),\n judgeIds,\n }\n}\n\nfunction pearson(a: number[], b: number[]): number {\n if (a.length !== b.length || a.length < 2) return NaN\n const mA = a.reduce((s, v) => s + v, 0) / a.length\n const mB = b.reduce((s, v) => s + v, 0) / b.length\n let num = 0,\n denA = 0,\n denB = 0\n for (let i = 0; i < a.length; i++) {\n const dA = a[i]! - mA\n const dB = b[i]! - mB\n num += dA * dB\n denA += dA * dA\n denB += dB * dB\n }\n if (denA === 0 || denB === 0) return denA === 0 && denB === 0 ? 1 : 0\n return num / Math.sqrt(denA * denB)\n}\n","/**\n * RegressionView — compares a candidate slice to a baseline slice on a\n * named metric. Delegates the statistics (Welch's t-test, Cohen's d,\n * IQR stability) to `baseline.ts`.\n *\n * This is the entry point for CI regression gates: \"given runs tagged\n * release=A and release=B, did any metric regress?\"\n */\n\nimport { type BaselineOptions, type BaselineReport, compareToBaseline } from '../baseline'\nimport { aggregateLlm, llmSpans, runFailureClass } from '../trace/query'\nimport type { Run } from '../trace/schema'\nimport type { RunFilter, TraceStore } from '../trace/store'\n\nexport interface RegressionSpec {\n metric: string\n higherIsBetter: boolean\n /** Extract a scalar from a run. Default extractors handle common metrics. */\n extract?: (run: Run, store: TraceStore) => Promise<number | null>\n}\n\nexport interface RegressionOptions extends BaselineOptions {\n baseline: RunFilter\n candidate: RunFilter\n}\n\nexport async function regressionView(\n store: TraceStore,\n metrics: RegressionSpec[],\n options: RegressionOptions,\n): Promise<BaselineReport> {\n const baselineRuns = await store.listRuns(options.baseline)\n const candidateRuns = await store.listRuns(options.candidate)\n const samples = await Promise.all(\n metrics.map(async (m) => {\n const extract = m.extract ?? defaultExtract(m.metric)\n const baseline = await extractAll(baselineRuns, extract, store)\n const candidate = await extractAll(candidateRuns, extract, store)\n return { metric: m.metric, higherIsBetter: m.higherIsBetter, baseline, candidate }\n }),\n )\n return compareToBaseline(samples, options)\n}\n\nasync function extractAll(\n runs: Run[],\n extract: (r: Run, s: TraceStore) => Promise<number | null>,\n store: TraceStore,\n): Promise<number[]> {\n const out: number[] = []\n for (const r of runs) {\n const v = await extract(r, store)\n if (v !== null && Number.isFinite(v)) out.push(v)\n }\n return out\n}\n\nfunction defaultExtract(metric: string): (run: Run, store: TraceStore) => Promise<number | null> {\n return async (run, store) => {\n switch (metric) {\n case 'score':\n case 'overallScore':\n return run.outcome?.score ?? null\n case 'pass':\n return run.outcome?.pass === true ? 1 : 0\n case 'durationMs':\n return run.endedAt && run.startedAt ? run.endedAt - run.startedAt : null\n case 'costUsd': {\n const llm = await llmSpans(store, run.runId)\n return aggregateLlm(llm).costUsd\n }\n case 'inputTokens': {\n const llm = await llmSpans(store, run.runId)\n return aggregateLlm(llm).inputTokens\n }\n case 'outputTokens': {\n const llm = await llmSpans(store, run.runId)\n return aggregateLlm(llm).outputTokens\n }\n case 'failureClass': {\n return runFailureClass(run) === 'success' ? 1 : 0\n }\n default:\n return null\n }\n }\n}\n","/**\n * StuckLoopView — detects when an agent calls the same tool with the\n * same (or structurally similar) arguments ≥ N times in a short window.\n *\n * Rationale: agents that loop are the number-one production failure\n * mode on long-horizon flows. The view returns (runId, toolName,\n * argHash, occurrences, windowMs) for each detected loop plus a\n * fraction of runs affected.\n */\n\nimport { argHash, toolSpans } from '../trace/query'\nimport type { TraceStore } from '../trace/store'\n\nexport interface StuckLoopFinding {\n runId: string\n toolName: string\n argHash: string\n occurrences: number\n spanIds: string[]\n /** Milliseconds between first and last call in the loop. */\n windowMs: number\n}\n\nexport interface StuckLoopReport {\n findings: StuckLoopFinding[]\n affectedRunRatio: number\n totalRuns: number\n}\n\nexport interface StuckLoopOptions {\n /** Minimum call count to flag a loop (default 3). */\n minOccurrences?: number\n /** Filter to a specific runId; omit to scan the entire corpus. */\n runId?: string\n}\n\nexport async function stuckLoopView(\n store: TraceStore,\n options: StuckLoopOptions = {},\n): Promise<StuckLoopReport> {\n const minOccurrences = options.minOccurrences ?? 3\n const runs = options.runId\n ? [{ runId: options.runId }]\n : (await store.listRuns()).map((r) => ({ runId: r.runId }))\n\n const findings: StuckLoopFinding[] = []\n for (const { runId } of runs) {\n const tools = await toolSpans(store, runId)\n const byKey = new Map<string, { spans: typeof tools; argHash: string }>()\n for (const t of tools) {\n const h = argHash(t.args)\n const key = `${t.toolName}|${h}`\n const bucket = byKey.get(key) ?? { spans: [], argHash: h }\n bucket.spans.push(t)\n byKey.set(key, bucket)\n }\n for (const [key, { spans, argHash: h }] of byKey) {\n if (spans.length < minOccurrences) continue\n const sorted = [...spans].sort((a, b) => a.startedAt - b.startedAt)\n const first = sorted[0]!.startedAt\n const last = sorted[sorted.length - 1]!.startedAt\n findings.push({\n runId,\n toolName: key.split('|')[0]!,\n argHash: h,\n occurrences: sorted.length,\n spanIds: sorted.map((s) => s.spanId),\n windowMs: last - first,\n })\n }\n }\n\n const affectedRuns = new Set(findings.map((f) => f.runId))\n return {\n findings,\n affectedRunRatio: runs.length > 0 ? affectedRuns.size / runs.length : 0,\n totalRuns: runs.length,\n }\n}\n","/**\n * ToolWasteView — fraction of tool calls whose results weren't used\n * downstream. Without a \"used\" signal we fall back to structural\n * proxies: error calls, duplicate calls, and tool calls followed by\n * zero subsequent LLM spans are all considered waste.\n *\n * Consumers can pass a `usageOracle` that inspects a tool span and\n * returns true iff the tool's result appears in a later LLM message,\n * artifact, or state mutation — that's the canonical definition; the\n * default heuristic is a reasonable fallback.\n */\n\nimport { computeToolUseMetrics } from '../tool-use-metrics'\nimport { llmSpans, toolSpans } from '../trace/query'\nimport type { ToolSpan } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\n\nexport interface ToolWasteFinding {\n runId: string\n wastedCalls: number\n totalCalls: number\n wasteRate: number\n}\n\nexport interface ToolWasteReport {\n byRun: ToolWasteFinding[]\n overallWasteRate: number\n}\n\nexport interface ToolWasteOptions {\n runId?: string\n usageOracle?: (tool: ToolSpan, later: { llm: Awaited<ReturnType<typeof llmSpans>> }) => boolean\n}\n\nexport async function toolWasteView(\n store: TraceStore,\n options: ToolWasteOptions = {},\n): Promise<ToolWasteReport> {\n const runs = options.runId ? [options.runId] : (await store.listRuns()).map((r) => r.runId)\n\n const byRun: ToolWasteFinding[] = []\n let totalCalls = 0\n let totalWasted = 0\n for (const runId of runs) {\n const tools = await toolSpans(store, runId)\n if (tools.length === 0) {\n byRun.push({ runId, wastedCalls: 0, totalCalls: 0, wasteRate: 0 })\n continue\n }\n const llms = await llmSpans(store, runId)\n let wasted = 0\n for (const t of tools) {\n if (t.status === 'error') {\n wasted++\n continue\n }\n const laterLlm = llms.filter((l) => l.startedAt > t.startedAt)\n if (options.usageOracle) {\n if (!options.usageOracle(t, { llm: laterLlm })) wasted++\n } else {\n // Default heuristic: a tool whose result is NOT mentioned in any\n // later LLM input message is likely wasted.\n const resultStr = stringify(t.result)\n const used = laterLlm.some((l) =>\n l.messages.some(\n (m) =>\n typeof m.content === 'string' &&\n resultStr &&\n m.content.includes(resultStr.slice(0, 120)),\n ),\n )\n if (!used) wasted++\n }\n }\n const wasteRate = wasted / tools.length\n byRun.push({ runId, wastedCalls: wasted, totalCalls: tools.length, wasteRate })\n totalCalls += tools.length\n totalWasted += wasted\n }\n return { byRun, overallWasteRate: totalCalls > 0 ? totalWasted / totalCalls : 0 }\n}\n\nfunction stringify(v: unknown): string {\n if (v === null || v === undefined) return ''\n if (typeof v === 'string') return v\n try {\n return JSON.stringify(v)\n } catch {\n return String(v)\n }\n}\n\n// Re-export for convenience in consumers that want both descriptive and usage metrics.\nexport { computeToolUseMetrics }\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;AA8BA,eAAsB,iBACpB,OACA,UAAuD,CAAC,GAC3B;AAC7B,QAAM,OAAO,MAAM,MAAM,SAAS;AAAA,IAChC,YAAY,QAAQ;AAAA,IACpB,WAAW,QAAQ;AAAA,EACrB,CAAC;AACD,QAAM,WAAkC,CAAC;AACzC,QAAM,cAAsC,CAAC;AAC7C,QAAM,aAAqC,CAAC;AAC5C,QAAM,YAAoC,CAAC;AAE3C,aAAW,OAAO,MAAM;AACtB,UAAM,UAAU,MAAM,MAAM,OAAO,IAAI,KAAK;AAC5C,eAAW,KAAK,SAAS;AACvB,UAAI,CAAC,EAAE,SAAU;AACjB,YAAM,cAAc,EAAE,QAAQ,IAAI,EAAE,WAAW,EAAE,QAAQ;AACzD,eAAS,KAAK;AAAA,QACZ,OAAO,IAAI;AAAA,QACX,YAAY,IAAI;AAAA,QAChB,WAAW,IAAI;AAAA,QACf,WAAW,EAAE;AAAA,QACb,OAAO,EAAE;AAAA,QACT,UAAU,EAAE;AAAA,QACZ;AAAA,QACA,WAAW,EAAE;AAAA,MACf,CAAC;AACD,kBAAY,EAAE,SAAS,KAAK,YAAY,EAAE,SAAS,KAAK,KAAK;AAC7D,iBAAW,IAAI,UAAU,KAAK,WAAW,IAAI,UAAU,KAAK,KAAK;AACjE,UAAI,IAAI,UAAW,WAAU,IAAI,SAAS,KAAK,UAAU,IAAI,SAAS,KAAK,KAAK;AAAA,IAClF;AAAA,EACF;AAEA,QAAM,eAAe,IAAI,IAAI,SAAS,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC;AACzD,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,WAAW,KAAK;AAAA,IAChB,kBAAkB,KAAK,SAAS,IAAI,aAAa,OAAO,KAAK,SAAS;AAAA,EACxE;AACF;;;AC7CA,eAAsB,oBACpB,OACA,MACA,MACA,UAA6B,CAAC,GACH;AAC3B,QAAM,CAAC,GAAG,CAAC,IAAI,MAAM,QAAQ,IAAI,CAAC,gBAAgB,OAAO,IAAI,GAAG,gBAAgB,OAAO,IAAI,CAAC,CAAC;AAC7F,QAAM,KAAK,QAAQ,cAAc;AACjC,QAAM,SAAS,KAAK,IAAI,EAAE,MAAM,QAAQ,EAAE,MAAM,MAAM;AACtD,WAAS,IAAI,GAAG,IAAI,QAAQ,KAAK;AAC/B,UAAM,QAAQ,EAAE,MAAM,CAAC;AACvB,UAAM,QAAQ,EAAE,MAAM,CAAC;AACvB,QAAI,CAAC,GAAG,OAAO,KAAK,GAAG;AACrB,aAAO;AAAA,QACL;AAAA,QACA;AAAA,QACA,sBAAsB;AAAA,QACtB;AAAA,QACA;AAAA,QACA,QAAQ,mBAAmB,OAAO,KAAK;AAAA,QACvC,iBAAiB;AAAA,MACnB;AAAA,IACF;AAAA,EACF;AACA,MAAI,EAAE,MAAM,WAAW,EAAE,MAAM,QAAQ;AACrC,WAAO,EAAE,MAAM,MAAM,sBAAsB,MAAM,iBAAiB,OAAO;AAAA,EAC3E;AACA,QAAM,SAAqB,EAAE,MAAM,SAAS,EAAE,MAAM,SAAS,IAAI;AACjE,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,sBAAsB;AAAA,IACtB,OAAO,EAAE,MAAM,MAAM;AAAA,IACrB,OAAO,EAAE,MAAM,MAAM;AAAA,IACrB,QAAQ,sBAAsB,OAAO,MAAM,SAAS,MAAM,6BAA6B,SAAS,CAAC;AAAA,IACjG,iBAAiB;AAAA,EACnB;AACF;AAEA,SAAS,kBAAkB,GAAmB,GAA4B;AACxE,MAAI,EAAE,KAAK,SAAS,EAAE,KAAK,KAAM,QAAO;AACxC,MAAI,EAAE,KAAK,SAAS,UAAU,EAAE,KAAK,SAAS,OAAQ,QAAO,EAAE,KAAK,aAAa,EAAE,KAAK;AACxF,MAAI,EAAE,KAAK,SAAS,SAAS,EAAE,KAAK,SAAS,MAAO,QAAO,EAAE,KAAK,UAAU,EAAE,KAAK;AACnF,MAAI,EAAE,KAAK,SAAS,WAAW,EAAE,KAAK,SAAS;AAC7C,WAAO,EAAE,KAAK,cAAc,EAAE,KAAK;AACrC,SAAO,EAAE,KAAK,SAAS,EAAE,KAAK;AAChC;AAEA,SAAS,mBAAmB,GAAmB,GAA2B;AACxE,MAAI,EAAE,KAAK,SAAS,EAAE,KAAK,KAAM,QAAO,QAAQ,EAAE,KAAK,IAAI,OAAO,EAAE,KAAK,IAAI;AAC7E,MAAI,EAAE,KAAK,SAAS,UAAU,EAAE,KAAK,SAAS,UAAU,EAAE,KAAK,aAAa,EAAE,KAAK,UAAU;AAC3F,WAAO,QAAQ,EAAE,KAAK,QAAQ,OAAO,EAAE,KAAK,QAAQ;AAAA,EACtD;AACA,MAAI,EAAE,KAAK,SAAS,SAAS,EAAE,KAAK,SAAS,SAAS,EAAE,KAAK,UAAU,EAAE,KAAK,OAAO;AACnF,WAAO,SAAS,EAAE,KAAK,KAAK,OAAO,EAAE,KAAK,KAAK;AAAA,EACjD;AACA,SAAO,SAAS,EAAE,KAAK,IAAI,SAAS,EAAE,KAAK,IAAI;AACjD;;;ACvDA,eAAsB,mBAAmB,OAAkD;AACzF,QAAM,OAAO,MAAM,MAAM,MAAM,EAAE,MAAM,QAAQ,CAAC,GAAG;AAAA,IACjD,CAAC,MAAsB,EAAE,SAAS;AAAA,EACpC;AACA,MAAI,IAAI,WAAW,EAAG,QAAO,EAAE,OAAO,CAAC,GAAG,YAAY,CAAC,GAAG,UAAU,CAAC,EAAE;AAEvE,QAAM,cAAc,oBAAI,IAAyB;AACjD,aAAW,KAAK,KAAK;AACnB,UAAM,MAAM,YAAY,IAAI,EAAE,SAAS,KAAK,CAAC;AAC7C,QAAI,KAAK,CAAC;AACV,gBAAY,IAAI,EAAE,WAAW,GAAG;AAAA,EAClC;AAEA,QAAM,WAAW,CAAC,GAAG,IAAI,IAAI,IAAI,IAAI,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,EAAE,KAAK;AAC9D,QAAM,QAAqB,CAAC;AAC5B,aAAW,CAAC,KAAK,KAAK,KAAK,aAAa;AACtC,UAAM,UAAU,oBAAI,IAAiC;AACrD,eAAW,KAAK,OAAO;AACrB,YAAM,IAAI,QAAQ,IAAI,EAAE,OAAO,KAAK,oBAAI,IAAoB;AAC5D,QAAE,IAAI,EAAE,cAAc,EAAE,KAAK;AAC7B,cAAQ,IAAI,EAAE,SAAS,CAAC;AAAA,IAC1B;AACA,UAAM,aAAa,CAAC,GAAG,QAAQ,KAAK,CAAC;AACrC,aAAS,IAAI,GAAG,IAAI,WAAW,QAAQ,KAAK;AAC1C,eAAS,IAAI,IAAI,GAAG,IAAI,WAAW,QAAQ,KAAK;AAC9C,cAAM,SAAS,WAAW,CAAC;AAC3B,cAAM,SAAS,WAAW,CAAC;AAC3B,cAAM,IAAI,QAAQ,IAAI,MAAM;AAC5B,cAAM,IAAI,QAAQ,IAAI,MAAM;AAC5B,cAAM,SAAkC,CAAC;AACzC,mBAAW,CAAC,QAAQ,MAAM,KAAK,GAAG;AAChC,gBAAM,SAAS,EAAE,IAAI,MAAM;AAC3B,cAAI,WAAW,OAAW,QAAO,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,QACxD;AACA,YAAI,OAAO,SAAS,EAAG;AACvB,cAAM,cAAc,OAAO;AAAA,UACzB,CAAC,CAAC,QAAQ,MAAM,MACd;AAAA,YACE,EAAE,WAAW,QAAQ,WAAW,KAAK,OAAO,QAAQ,WAAW,GAAG;AAAA,YAClE,EAAE,WAAW,QAAQ,WAAW,KAAK,OAAO,QAAQ,WAAW,GAAG;AAAA,UACpE;AAAA,QACJ;AACA,cAAM,IAAI;AAAA,UACR,YAAY,CAAC,EAAG,IAAI,CAAC,GAAG,OAAO,YAAY,IAAI,CAAC,SAAS,KAAK,EAAE,CAAE,CAAC;AAAA,QACrE;AACA,cAAM,KAAK;AAAA,UACT,QAAQ;AAAA,UACR,QAAQ;AAAA,UACR,WAAW;AAAA,UACX,aAAa,OAAO;AAAA,UACpB,SAAS;AAAA,YACP,OAAO,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC;AAAA,YACtB,OAAO,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC;AAAA,UACxB;AAAA,UACA,cAAc;AAAA,QAChB,CAAC;AAAA,MACH;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AAAA,IACL,OAAO,MAAM,KAAK,CAAC,GAAG,MAAM,EAAE,cAAc,EAAE,WAAW;AAAA,IACzD,YAAY,CAAC,GAAG,YAAY,KAAK,CAAC,EAAE,KAAK;AAAA,IACzC;AAAA,EACF;AACF;AAEA,SAAS,QAAQ,GAAa,GAAqB;AACjD,MAAI,EAAE,WAAW,EAAE,UAAU,EAAE,SAAS,EAAG,QAAO;AAClD,QAAM,KAAK,EAAE,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,EAAE;AAC5C,QAAM,KAAK,EAAE,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,EAAE;AAC5C,MAAI,MAAM,GACR,OAAO,GACP,OAAO;AACT,WAAS,IAAI,GAAG,IAAI,EAAE,QAAQ,KAAK;AACjC,UAAM,KAAK,EAAE,CAAC,IAAK;AACnB,UAAM,KAAK,EAAE,CAAC,IAAK;AACnB,WAAO,KAAK;AACZ,YAAQ,KAAK;AACb,YAAQ,KAAK;AAAA,EACf;AACA,MAAI,SAAS,KAAK,SAAS,EAAG,QAAO,SAAS,KAAK,SAAS,IAAI,IAAI;AACpE,SAAO,MAAM,KAAK,KAAK,OAAO,IAAI;AACpC;;;ACvFA,eAAsB,eACpB,OACA,SACA,SACyB;AACzB,QAAM,eAAe,MAAM,MAAM,SAAS,QAAQ,QAAQ;AAC1D,QAAM,gBAAgB,MAAM,MAAM,SAAS,QAAQ,SAAS;AAC5D,QAAM,UAAU,MAAM,QAAQ;AAAA,IAC5B,QAAQ,IAAI,OAAO,MAAM;AACvB,YAAM,UAAU,EAAE,WAAW,eAAe,EAAE,MAAM;AACpD,YAAM,WAAW,MAAM,WAAW,cAAc,SAAS,KAAK;AAC9D,YAAM,YAAY,MAAM,WAAW,eAAe,SAAS,KAAK;AAChE,aAAO,EAAE,QAAQ,EAAE,QAAQ,gBAAgB,EAAE,gBAAgB,UAAU,UAAU;AAAA,IACnF,CAAC;AAAA,EACH;AACA,SAAO,kBAAkB,SAAS,OAAO;AAC3C;AAEA,eAAe,WACb,MACA,SACA,OACmB;AACnB,QAAM,MAAgB,CAAC;AACvB,aAAW,KAAK,MAAM;AACpB,UAAM,IAAI,MAAM,QAAQ,GAAG,KAAK;AAChC,QAAI,MAAM,QAAQ,OAAO,SAAS,CAAC,EAAG,KAAI,KAAK,CAAC;AAAA,EAClD;AACA,SAAO;AACT;AAEA,SAAS,eAAe,QAAyE;AAC/F,SAAO,OAAO,KAAK,UAAU;AAC3B,YAAQ,QAAQ;AAAA,MACd,KAAK;AAAA,MACL,KAAK;AACH,eAAO,IAAI,SAAS,SAAS;AAAA,MAC/B,KAAK;AACH,eAAO,IAAI,SAAS,SAAS,OAAO,IAAI;AAAA,MAC1C,KAAK;AACH,eAAO,IAAI,WAAW,IAAI,YAAY,IAAI,UAAU,IAAI,YAAY;AAAA,MACtE,KAAK,WAAW;AACd,cAAM,MAAM,MAAM,SAAS,OAAO,IAAI,KAAK;AAC3C,eAAO,aAAa,GAAG,EAAE;AAAA,MAC3B;AAAA,MACA,KAAK,eAAe;AAClB,cAAM,MAAM,MAAM,SAAS,OAAO,IAAI,KAAK;AAC3C,eAAO,aAAa,GAAG,EAAE;AAAA,MAC3B;AAAA,MACA,KAAK,gBAAgB;AACnB,cAAM,MAAM,MAAM,SAAS,OAAO,IAAI,KAAK;AAC3C,eAAO,aAAa,GAAG,EAAE;AAAA,MAC3B;AAAA,MACA,KAAK,gBAAgB;AACnB,eAAO,gBAAgB,GAAG,MAAM,YAAY,IAAI;AAAA,MAClD;AAAA,MACA;AACE,eAAO;AAAA,IACX;AAAA,EACF;AACF;;;AClDA,eAAsB,cACpB,OACA,UAA4B,CAAC,GACH;AAC1B,QAAM,iBAAiB,QAAQ,kBAAkB;AACjD,QAAM,OAAO,QAAQ,QACjB,CAAC,EAAE,OAAO,QAAQ,MAAM,CAAC,KACxB,MAAM,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE;AAE5D,QAAM,WAA+B,CAAC;AACtC,aAAW,EAAE,MAAM,KAAK,MAAM;AAC5B,UAAM,QAAQ,MAAM,UAAU,OAAO,KAAK;AAC1C,UAAM,QAAQ,oBAAI,IAAsD;AACxE,eAAW,KAAK,OAAO;AACrB,YAAM,IAAI,QAAQ,EAAE,IAAI;AACxB,YAAM,MAAM,GAAG,EAAE,QAAQ,IAAI,CAAC;AAC9B,YAAM,SAAS,MAAM,IAAI,GAAG,KAAK,EAAE,OAAO,CAAC,GAAG,SAAS,EAAE;AACzD,aAAO,MAAM,KAAK,CAAC;AACnB,YAAM,IAAI,KAAK,MAAM;AAAA,IACvB;AACA,eAAW,CAAC,KAAK,EAAE,OAAO,SAAS,EAAE,CAAC,KAAK,OAAO;AAChD,UAAI,MAAM,SAAS,eAAgB;AACnC,YAAM,SAAS,CAAC,GAAG,KAAK,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,YAAY,EAAE,SAAS;AAClE,YAAM,QAAQ,OAAO,CAAC,EAAG;AACzB,YAAM,OAAO,OAAO,OAAO,SAAS,CAAC,EAAG;AACxC,eAAS,KAAK;AAAA,QACZ;AAAA,QACA,UAAU,IAAI,MAAM,GAAG,EAAE,CAAC;AAAA,QAC1B,SAAS;AAAA,QACT,aAAa,OAAO;AAAA,QACpB,SAAS,OAAO,IAAI,CAAC,MAAM,EAAE,MAAM;AAAA,QACnC,UAAU,OAAO;AAAA,MACnB,CAAC;AAAA,IACH;AAAA,EACF;AAEA,QAAM,eAAe,IAAI,IAAI,SAAS,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC;AACzD,SAAO;AAAA,IACL;AAAA,IACA,kBAAkB,KAAK,SAAS,IAAI,aAAa,OAAO,KAAK,SAAS;AAAA,IACtE,WAAW,KAAK;AAAA,EAClB;AACF;;;AC5CA,eAAsB,cACpB,OACA,UAA4B,CAAC,GACH;AAC1B,QAAM,OAAO,QAAQ,QAAQ,CAAC,QAAQ,KAAK,KAAK,MAAM,MAAM,SAAS,GAAG,IAAI,CAAC,MAAM,EAAE,KAAK;AAE1F,QAAM,QAA4B,CAAC;AACnC,MAAI,aAAa;AACjB,MAAI,cAAc;AAClB,aAAW,SAAS,MAAM;AACxB,UAAM,QAAQ,MAAM,UAAU,OAAO,KAAK;AAC1C,QAAI,MAAM,WAAW,GAAG;AACtB,YAAM,KAAK,EAAE,OAAO,aAAa,GAAG,YAAY,GAAG,WAAW,EAAE,CAAC;AACjE;AAAA,IACF;AACA,UAAM,OAAO,MAAM,SAAS,OAAO,KAAK;AACxC,QAAI,SAAS;AACb,eAAW,KAAK,OAAO;AACrB,UAAI,EAAE,WAAW,SAAS;AACxB;AACA;AAAA,MACF;AACA,YAAM,WAAW,KAAK,OAAO,CAAC,MAAM,EAAE,YAAY,EAAE,SAAS;AAC7D,UAAI,QAAQ,aAAa;AACvB,YAAI,CAAC,QAAQ,YAAY,GAAG,EAAE,KAAK,SAAS,CAAC,EAAG;AAAA,MAClD,OAAO;AAGL,cAAM,YAAY,UAAU,EAAE,MAAM;AACpC,cAAM,OAAO,SAAS;AAAA,UAAK,CAAC,MAC1B,EAAE,SAAS;AAAA,YACT,CAAC,MACC,OAAO,EAAE,YAAY,YACrB,aACA,EAAE,QAAQ,SAAS,UAAU,MAAM,GAAG,GAAG,CAAC;AAAA,UAC9C;AAAA,QACF;AACA,YAAI,CAAC,KAAM;AAAA,MACb;AAAA,IACF;AACA,UAAM,YAAY,SAAS,MAAM;AACjC,UAAM,KAAK,EAAE,OAAO,aAAa,QAAQ,YAAY,MAAM,QAAQ,UAAU,CAAC;AAC9E,kBAAc,MAAM;AACpB,mBAAe;AAAA,EACjB;AACA,SAAO,EAAE,OAAO,kBAAkB,aAAa,IAAI,cAAc,aAAa,EAAE;AAClF;AAEA,SAAS,UAAU,GAAoB;AACrC,MAAI,MAAM,QAAQ,MAAM,OAAW,QAAO;AAC1C,MAAI,OAAO,MAAM,SAAU,QAAO;AAClC,MAAI;AACF,WAAO,KAAK,UAAU,CAAC;AAAA,EACzB,QAAQ;AACN,WAAO,OAAO,CAAC;AAAA,EACjB;AACF;","names":[]}
@@ -1,5 +1,5 @@
1
1
  import { D as DatasetSplit, b as DatasetManifest, a as DatasetScenario } from './dataset-CiK_3LDr.js';
2
- import { a3 as GateDecision, A as ActionableSideInfo, m as MultiShotTrialResult } from './summary-report-BXGs_9V0.js';
2
+ import { a3 as GateDecision, A as ActionableSideInfo, m as MultiShotTrialResult } from './summary-report-C7VPYEj2.js';
3
3
  import { R as RunRecord, a as RunSplitTag } from './run-record-CqzahIbx.js';
4
4
 
5
5
  /**
@@ -1,7 +1,7 @@
1
1
  export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-C0uDYwG6.js';
2
- export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as bhAdjust, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as pairedWilcoxon, s as releaseTraceEvidenceFromMultiShotTrials, t as renderReleaseReport } from './release-report-TDPn1cxq.js';
2
+ export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as bhAdjust, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as pairedWilcoxon, s as releaseTraceEvidenceFromMultiShotTrials, t as renderReleaseReport } from './release-report-BNgMdqPF.js';
3
3
  export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-Dgz1n51-.js';
4
- export { C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, J as ParetoFigureSpec, K as ParetoPoint, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, $ as gainHistogram, a0 as paretoChart, a1 as researchReport, a2 as summaryTable } from './summary-report-BXGs_9V0.js';
4
+ export { C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, J as ParetoFigureSpec, K as ParetoPoint, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, $ as gainHistogram, a0 as paretoChart, a1 as researchReport, a2 as summaryTable } from './summary-report-C7VPYEj2.js';
5
5
  import './run-record-CqzahIbx.js';
6
6
  import './errors-BZ9sTdz7.js';
7
7
  import './outcome-store-D6KWmYvj.js';
@@ -1,7 +1,7 @@
1
1
  import { A as AgentEvalError, C as CaptureIntegrityError } from './errors-BZ9sTdz7.js';
2
2
  import { R as RawProviderSink, P as ProviderRedactor, a as RunIntegrityExpectations, b as RunIntegrityReport } from './integrity-DK2EBVZC.js';
3
3
  import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, R as RunRecord } from './run-record-CqzahIbx.js';
4
- import { W as ResearchReportOptions, N as ResearchReport, a3 as GateDecision } from './summary-report-BXGs_9V0.js';
4
+ import { W as ResearchReportOptions, N as ResearchReport, a3 as GateDecision } from './summary-report-C7VPYEj2.js';
5
5
  import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DP_cSSiw.js';
6
6
  import { T as TraceStore } from './store-Db2Bv8Cf.js';
7
7
 
package/dist/rl.d.ts CHANGED
@@ -1,12 +1,12 @@
1
1
  import { R as RunRecord, a as RunSplitTag } from './run-record-CqzahIbx.js';
2
2
  import { V as VerificationReport } from './multi-layer-verifier-LkP3LVKj.js';
3
- import { t as TrialResult, V as VariantAggregate, q as PromptEvolutionResult, e as MultiShotOptimizationResult } from './summary-report-BXGs_9V0.js';
3
+ import { t as TrialResult, V as VariantAggregate, q as PromptEvolutionResult, e as MultiShotOptimizationResult } from './summary-report-C7VPYEj2.js';
4
4
  import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
5
5
  import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-C0uDYwG6.js';
6
6
  import { I as InterimReleaseConfidence } from './sequential-Dgz1n51-.js';
7
7
  import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
8
- import { R as Researcher, l as FailureMode, S as SteeringChange, j as ExperimentPlan, k as ExperimentResult, i as EvalCampaignResult, E as EvalCampaignOptions } from './researcher-CUOiGcGv.js';
9
- export { r as runEvalCampaign } from './researcher-CUOiGcGv.js';
8
+ import { R as Researcher, l as FailureMode, S as SteeringChange, j as ExperimentPlan, k as ExperimentResult, i as EvalCampaignResult, E as EvalCampaignOptions } from './researcher-BPT8x_NT.js';
9
+ export { r as runEvalCampaign } from './researcher-BPT8x_NT.js';
10
10
  import './errors-BZ9sTdz7.js';
11
11
  import './failure-cluster-C2EGSDiT.js';
12
12
  import './integrity-DK2EBVZC.js';
@@ -902,4 +902,4 @@ interface ResearchReport {
902
902
  */
903
903
  declare function researchReport(runs: RunRecord[], opts?: ResearchReportOptions): Promise<ResearchReport>;
904
904
 
905
- export { gainHistogram as $, type ActionableSideInfo as A, trialTraceFromMultiShotTrial as B, type GainDistributionBin as C, DEFAULT_MUTATION_PRIMITIVES as D, type EvolvableVariant as E, type GainDistributionFigureSpec as F, type GenerationReport as G, type GainDistributionOptions as H, InMemoryTrialCache as I, type ParetoFigureSpec as J, type ParetoPoint as K, RESEARCH_REPORT_HARD_PAIR_FLOOR as L, type MultiShotGateConfig as M, type ResearchReport as N, type ResearchReportCandidate as O, type PromptEvolutionConfig as P, type ResearchReportDecision as Q, type ReflectionContext as R, type ScenarioAggregate as S, type TrialCache as T, type ResearchReportMethodology as U, type VariantAggregate as V, type ResearchReportOptions as W, type ResearchReportRecommendation as X, type SummaryTable as Y, type SummaryTableOptions as Z, type SummaryTableRow as _, type AsiSeverity as a, paretoChart as a0, researchReport as a1, summaryTable as a2, type GateDecision as a3, type Objective as a4, type ParetoResult as a5, type Direction as a6, type GateEvidence as a7, HeldOutGate as a8, type HeldOutGateConfig as a9, type HeldOutGateRejectionCode as aa, crowdingDistance as ab, dominates as ac, paretoFrontier as ad, paretoFrontierWithCrowding as ae, scalarScore as af, type MultiShotGateResult as b, type MultiShotMutateAdapter as c, type MultiShotOptimizationConfig as d, type MultiShotOptimizationResult as e, type MultiShotRun as f, type MultiShotRunInput as g, type MultiShotRunner as h, type MultiShotScore as i, type MultiShotScorer as j, type MultiShotSplit as k, type MultiShotTrace as l, type MultiShotTrialResult as m, type MultiShotVariant as n, type MutateAdapter as o, type PromptEvolutionEvent as p, type PromptEvolutionResult as q, type ReflectionProposal as r, type ScoreAdapter as s, type TrialResult as t, type TrialTrace as u, buildReflectionPrompt as v, defaultMultiShotObjectives as w, parseReflectionResponse as x, runMultiShotOptimization as y, runPromptEvolution as z };
905
+ export { gainHistogram as $, type ActionableSideInfo as A, trialTraceFromMultiShotTrial as B, type GainDistributionBin as C, DEFAULT_MUTATION_PRIMITIVES as D, type EvolvableVariant as E, type GainDistributionFigureSpec as F, type GenerationReport as G, type GainDistributionOptions as H, InMemoryTrialCache as I, type ParetoFigureSpec as J, type ParetoPoint as K, RESEARCH_REPORT_HARD_PAIR_FLOOR as L, type MultiShotGateConfig as M, type ResearchReport as N, type ResearchReportCandidate as O, type PromptEvolutionConfig as P, type ResearchReportDecision as Q, type ReflectionContext as R, type ScenarioAggregate as S, type TrialCache as T, type ResearchReportMethodology as U, type VariantAggregate as V, type ResearchReportOptions as W, type ResearchReportRecommendation as X, type SummaryTable as Y, type SummaryTableOptions as Z, type SummaryTableRow as _, type AsiSeverity as a, paretoChart as a0, researchReport as a1, summaryTable as a2, type GateDecision as a3, type HeldOutGateConfig as a4, type Objective as a5, type ParetoResult as a6, type Direction as a7, type GateEvidence as a8, HeldOutGate as a9, type HeldOutGateRejectionCode as aa, crowdingDistance as ab, dominates as ac, paretoFrontier as ad, paretoFrontierWithCrowding as ae, scalarScore as af, type MultiShotGateResult as b, type MultiShotMutateAdapter as c, type MultiShotOptimizationConfig as d, type MultiShotOptimizationResult as e, type MultiShotRun as f, type MultiShotRunInput as g, type MultiShotRunner as h, type MultiShotScore as i, type MultiShotScorer as j, type MultiShotSplit as k, type MultiShotTrace as l, type MultiShotTrialResult as m, type MultiShotVariant as n, type MutateAdapter as o, type PromptEvolutionEvent as p, type PromptEvolutionResult as q, type ReflectionProposal as r, type ScoreAdapter as s, type TrialResult as t, type TrialTrace as u, buildReflectionPrompt as v, defaultMultiShotObjectives as w, parseReflectionResponse as x, runMultiShotOptimization as y, runPromptEvolution as z };
@@ -1,8 +1,14 @@
1
+ import { n as FeedbackTrajectoryStore } from '../feedback-trajectory-DfFdrraJ.js';
2
+ import { T as TraceStore } from '../store-Db2Bv8Cf.js';
1
3
  import { z } from 'zod';
2
4
  import { OpenAPIObject } from 'openapi3-ts/oas31';
3
5
  import * as hono_types from 'hono/types';
4
6
  import { ServerType } from '@hono/node-server';
5
7
  import { Hono } from 'hono';
8
+ import '../control-runtime-BuJHoLg0.js';
9
+ import '../emitter-DP_cSSiw.js';
10
+ import '../dataset-CiK_3LDr.js';
11
+ import '../errors-BZ9sTdz7.js';
6
12
 
7
13
  declare const RubricDimensionSchema: z.ZodObject<{
8
14
  id: z.ZodString;
@@ -105,6 +111,287 @@ declare const HealthResponseSchema: z.ZodObject<{
105
111
  status: z.ZodLiteral<"ok">;
106
112
  uptimeSec: z.ZodNumber;
107
113
  }, z.core.$strip>;
114
+ /**
115
+ * Minimal `TraceEvent` shape that the production runtime emits.
116
+ * Matches `trace/schema.ts` `TraceEvent` but is duplicated here as a
117
+ * wire schema so non-TypeScript clients can validate without depending
118
+ * on internal types.
119
+ */
120
+ declare const TraceEventSchema: z.ZodObject<{
121
+ eventId: z.ZodString;
122
+ runId: z.ZodString;
123
+ spanId: z.ZodOptional<z.ZodString>;
124
+ kind: z.ZodEnum<{
125
+ policy_violation: "policy_violation";
126
+ custom: "custom";
127
+ error: "error";
128
+ log: "log";
129
+ budget_decrement: "budget_decrement";
130
+ budget_breach: "budget_breach";
131
+ state_mutation: "state_mutation";
132
+ redaction_applied: "redaction_applied";
133
+ }>;
134
+ timestamp: z.ZodNumber;
135
+ payload: z.ZodRecord<z.ZodString, z.ZodUnknown>;
136
+ }, z.core.$strip>;
137
+ declare const TracesIngestRequestSchema: z.ZodObject<{
138
+ events: z.ZodArray<z.ZodObject<{
139
+ eventId: z.ZodString;
140
+ runId: z.ZodString;
141
+ spanId: z.ZodOptional<z.ZodString>;
142
+ kind: z.ZodEnum<{
143
+ policy_violation: "policy_violation";
144
+ custom: "custom";
145
+ error: "error";
146
+ log: "log";
147
+ budget_decrement: "budget_decrement";
148
+ budget_breach: "budget_breach";
149
+ state_mutation: "state_mutation";
150
+ redaction_applied: "redaction_applied";
151
+ }>;
152
+ timestamp: z.ZodNumber;
153
+ payload: z.ZodRecord<z.ZodString, z.ZodUnknown>;
154
+ }, z.core.$strip>>;
155
+ }, z.core.$strip>;
156
+ declare const TracesIngestResponseSchema: z.ZodObject<{
157
+ accepted: z.ZodNumber;
158
+ rejected: z.ZodNumber;
159
+ errors: z.ZodDefault<z.ZodArray<z.ZodObject<{
160
+ eventId: z.ZodString;
161
+ message: z.ZodString;
162
+ }, z.core.$strip>>>;
163
+ }, z.core.$strip>;
164
+ declare const FeedbackLabelSchema: z.ZodObject<{
165
+ id: z.ZodOptional<z.ZodString>;
166
+ source: z.ZodEnum<{
167
+ judge: "judge";
168
+ system: "system";
169
+ user: "user";
170
+ policy: "policy";
171
+ environment: "environment";
172
+ metric: "metric";
173
+ }>;
174
+ kind: z.ZodEnum<{
175
+ approve: "approve";
176
+ reject: "reject";
177
+ select: "select";
178
+ edit: "edit";
179
+ rank: "rank";
180
+ rate: "rate";
181
+ comment: "comment";
182
+ metric_outcome: "metric_outcome";
183
+ policy_block: "policy_block";
184
+ revision_request: "revision_request";
185
+ }>;
186
+ value: z.ZodUnknown;
187
+ reason: z.ZodOptional<z.ZodString>;
188
+ severity: z.ZodOptional<z.ZodEnum<{
189
+ error: "error";
190
+ info: "info";
191
+ warning: "warning";
192
+ critical: "critical";
193
+ }>>;
194
+ createdAt: z.ZodString;
195
+ metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
196
+ }, z.core.$strip>;
197
+ declare const FeedbackAttemptSchema: z.ZodObject<{
198
+ id: z.ZodString;
199
+ stepIndex: z.ZodNumber;
200
+ artifactType: z.ZodEnum<{
201
+ action: "action";
202
+ decision: "decision";
203
+ text: "text";
204
+ code: "code";
205
+ plan: "plan";
206
+ research: "research";
207
+ ui: "ui";
208
+ data: "data";
209
+ other: "other";
210
+ }>;
211
+ artifact: z.ZodUnknown;
212
+ options: z.ZodOptional<z.ZodArray<z.ZodUnknown>>;
213
+ proposedAction: z.ZodOptional<z.ZodObject<{
214
+ type: z.ZodString;
215
+ risk: z.ZodOptional<z.ZodEnum<{
216
+ medium: "medium";
217
+ low: "low";
218
+ high: "high";
219
+ }>>;
220
+ costUsd: z.ZodOptional<z.ZodNumber>;
221
+ externalSideEffect: z.ZodOptional<z.ZodBoolean>;
222
+ requiresApproval: z.ZodOptional<z.ZodBoolean>;
223
+ metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
224
+ }, z.core.$strip>>;
225
+ feedback: z.ZodOptional<z.ZodArray<z.ZodObject<{
226
+ id: z.ZodOptional<z.ZodString>;
227
+ source: z.ZodEnum<{
228
+ judge: "judge";
229
+ system: "system";
230
+ user: "user";
231
+ policy: "policy";
232
+ environment: "environment";
233
+ metric: "metric";
234
+ }>;
235
+ kind: z.ZodEnum<{
236
+ approve: "approve";
237
+ reject: "reject";
238
+ select: "select";
239
+ edit: "edit";
240
+ rank: "rank";
241
+ rate: "rate";
242
+ comment: "comment";
243
+ metric_outcome: "metric_outcome";
244
+ policy_block: "policy_block";
245
+ revision_request: "revision_request";
246
+ }>;
247
+ value: z.ZodUnknown;
248
+ reason: z.ZodOptional<z.ZodString>;
249
+ severity: z.ZodOptional<z.ZodEnum<{
250
+ error: "error";
251
+ info: "info";
252
+ warning: "warning";
253
+ critical: "critical";
254
+ }>>;
255
+ createdAt: z.ZodString;
256
+ metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
257
+ }, z.core.$strip>>>;
258
+ createdAt: z.ZodString;
259
+ metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
260
+ }, z.core.$strip>;
261
+ declare const FeedbackTrajectorySchema: z.ZodObject<{
262
+ id: z.ZodString;
263
+ projectId: z.ZodOptional<z.ZodString>;
264
+ scenarioId: z.ZodOptional<z.ZodString>;
265
+ task: z.ZodObject<{
266
+ intent: z.ZodString;
267
+ context: z.ZodOptional<z.ZodUnknown>;
268
+ }, z.core.$strip>;
269
+ attempts: z.ZodDefault<z.ZodArray<z.ZodObject<{
270
+ id: z.ZodString;
271
+ stepIndex: z.ZodNumber;
272
+ artifactType: z.ZodEnum<{
273
+ action: "action";
274
+ decision: "decision";
275
+ text: "text";
276
+ code: "code";
277
+ plan: "plan";
278
+ research: "research";
279
+ ui: "ui";
280
+ data: "data";
281
+ other: "other";
282
+ }>;
283
+ artifact: z.ZodUnknown;
284
+ options: z.ZodOptional<z.ZodArray<z.ZodUnknown>>;
285
+ proposedAction: z.ZodOptional<z.ZodObject<{
286
+ type: z.ZodString;
287
+ risk: z.ZodOptional<z.ZodEnum<{
288
+ medium: "medium";
289
+ low: "low";
290
+ high: "high";
291
+ }>>;
292
+ costUsd: z.ZodOptional<z.ZodNumber>;
293
+ externalSideEffect: z.ZodOptional<z.ZodBoolean>;
294
+ requiresApproval: z.ZodOptional<z.ZodBoolean>;
295
+ metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
296
+ }, z.core.$strip>>;
297
+ feedback: z.ZodOptional<z.ZodArray<z.ZodObject<{
298
+ id: z.ZodOptional<z.ZodString>;
299
+ source: z.ZodEnum<{
300
+ judge: "judge";
301
+ system: "system";
302
+ user: "user";
303
+ policy: "policy";
304
+ environment: "environment";
305
+ metric: "metric";
306
+ }>;
307
+ kind: z.ZodEnum<{
308
+ approve: "approve";
309
+ reject: "reject";
310
+ select: "select";
311
+ edit: "edit";
312
+ rank: "rank";
313
+ rate: "rate";
314
+ comment: "comment";
315
+ metric_outcome: "metric_outcome";
316
+ policy_block: "policy_block";
317
+ revision_request: "revision_request";
318
+ }>;
319
+ value: z.ZodUnknown;
320
+ reason: z.ZodOptional<z.ZodString>;
321
+ severity: z.ZodOptional<z.ZodEnum<{
322
+ error: "error";
323
+ info: "info";
324
+ warning: "warning";
325
+ critical: "critical";
326
+ }>>;
327
+ createdAt: z.ZodString;
328
+ metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
329
+ }, z.core.$strip>>>;
330
+ createdAt: z.ZodString;
331
+ metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
332
+ }, z.core.$strip>>>;
333
+ labels: z.ZodDefault<z.ZodArray<z.ZodObject<{
334
+ id: z.ZodOptional<z.ZodString>;
335
+ source: z.ZodEnum<{
336
+ judge: "judge";
337
+ system: "system";
338
+ user: "user";
339
+ policy: "policy";
340
+ environment: "environment";
341
+ metric: "metric";
342
+ }>;
343
+ kind: z.ZodEnum<{
344
+ approve: "approve";
345
+ reject: "reject";
346
+ select: "select";
347
+ edit: "edit";
348
+ rank: "rank";
349
+ rate: "rate";
350
+ comment: "comment";
351
+ metric_outcome: "metric_outcome";
352
+ policy_block: "policy_block";
353
+ revision_request: "revision_request";
354
+ }>;
355
+ value: z.ZodUnknown;
356
+ reason: z.ZodOptional<z.ZodString>;
357
+ severity: z.ZodOptional<z.ZodEnum<{
358
+ error: "error";
359
+ info: "info";
360
+ warning: "warning";
361
+ critical: "critical";
362
+ }>>;
363
+ createdAt: z.ZodString;
364
+ metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
365
+ }, z.core.$strip>>>;
366
+ outcome: z.ZodOptional<z.ZodObject<{
367
+ success: z.ZodOptional<z.ZodBoolean>;
368
+ score: z.ZodOptional<z.ZodNumber>;
369
+ metrics: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodNumber>>;
370
+ costUsd: z.ZodOptional<z.ZodNumber>;
371
+ detail: z.ZodOptional<z.ZodString>;
372
+ observedAt: z.ZodOptional<z.ZodString>;
373
+ metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
374
+ }, z.core.$strip>>;
375
+ split: z.ZodOptional<z.ZodEnum<{
376
+ train: "train";
377
+ dev: "dev";
378
+ test: "test";
379
+ holdout: "holdout";
380
+ }>>;
381
+ tags: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
382
+ createdAt: z.ZodString;
383
+ updatedAt: z.ZodOptional<z.ZodString>;
384
+ metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
385
+ }, z.core.$strip>;
386
+ declare const FeedbackIngestResponseSchema: z.ZodObject<{
387
+ id: z.ZodString;
388
+ persisted: z.ZodBoolean;
389
+ }, z.core.$strip>;
390
+ type TraceEvent = z.infer<typeof TraceEventSchema>;
391
+ type TracesIngestRequest = z.infer<typeof TracesIngestRequestSchema>;
392
+ type TracesIngestResponse = z.infer<typeof TracesIngestResponseSchema>;
393
+ type FeedbackTrajectory = z.infer<typeof FeedbackTrajectorySchema>;
394
+ type FeedbackIngestResponse = z.infer<typeof FeedbackIngestResponseSchema>;
108
395
  declare const ErrorResponseSchema: z.ZodObject<{
109
396
  error: z.ZodObject<{
110
397
  code: z.ZodString;
@@ -132,6 +419,18 @@ declare const WIRE_VERSION = "1.0.0";
132
419
  */
133
420
  declare function hashRubric(rubric: Rubric): string;
134
421
 
422
+ /**
423
+ * Pure handler functions — the "business logic" behind every wire-protocol
424
+ * method. The HTTP server (`server.ts`) and the stdio RPC (`rpc.ts`) both
425
+ * call these. Tests call these directly without spinning a server.
426
+ *
427
+ * Each handler:
428
+ * - Takes a parsed request (already Zod-validated by the transport).
429
+ * - Returns a result that matches the response schema.
430
+ * - Throws `WireError` for caller-fixable errors (404, 400, 422).
431
+ * - Lets unexpected errors bubble — the transport maps them to 500.
432
+ */
433
+
135
434
  /** Caller-fixable error. The transport renders this to 4xx + ErrorResponse. */
136
435
  declare class WireError extends Error {
137
436
  readonly code: string;
@@ -142,6 +441,35 @@ declare class WireError extends Error {
142
441
  declare function handleJudge(req: JudgeRequest): Promise<JudgeResult>;
143
442
  declare function handleListRubrics(): ListRubricsResponse;
144
443
  declare function handleVersion(): VersionResponse;
444
+ /**
445
+ * Pluggable stores the wire layer routes ingestion writes into. Both
446
+ * are optional — when omitted, the corresponding endpoint returns 503.
447
+ *
448
+ * Production deployments wire a `FileSystemTraceStore` and
449
+ * `FileSystemFeedbackTrajectoryStore` here. Tests substitute in-memory
450
+ * stores.
451
+ */
452
+ interface IngestionStores {
453
+ traceStore?: TraceStore;
454
+ feedbackStore?: FeedbackTrajectoryStore;
455
+ }
456
+ /**
457
+ * `POST /v1/traces/ingest` — accept a batch of `TraceEvent`s from the
458
+ * production runtime. Best-effort: each event is appended independently;
459
+ * one bad event does not poison the batch.
460
+ *
461
+ * Idempotency: the underlying store is append-only; consumers retrying
462
+ * the same payload will get duplicate events. Consumers should
463
+ * de-duplicate by `eventId` downstream — production traces frequently
464
+ * land via at-least-once buses (Kafka, SQS) where dedup is unavoidable.
465
+ */
466
+ declare function handleTracesIngest(req: TracesIngestRequest, stores: IngestionStores): Promise<TracesIngestResponse>;
467
+ /**
468
+ * `POST /v1/feedback` — accept a single `FeedbackTrajectory` from the
469
+ * production runtime. Idempotent on `id`: re-posting the same trajectory
470
+ * replaces the prior record.
471
+ */
472
+ declare function handleFeedbackIngest(req: FeedbackTrajectory, stores: IngestionStores): Promise<FeedbackIngestResponse>;
145
473
 
146
474
  declare function buildOpenApi(packageVersion: string): OpenAPIObject;
147
475
 
@@ -199,8 +527,24 @@ declare function listBuiltinRubrics(): {
199
527
  rubricVersion: string;
200
528
  }[];
201
529
 
202
- declare function createApp(): Hono<hono_types.BlankEnv, hono_types.BlankSchema, "/">;
203
- interface ServeOptions {
530
+ interface CreateAppOptions {
531
+ /** Stores wired to the ingestion endpoints. */
532
+ stores?: IngestionStores;
533
+ /**
534
+ * Bearer-token auth. When provided, every endpoint EXCEPT `/healthz`
535
+ * and `/v1/version` requires `Authorization: Bearer <token>`. The
536
+ * token may be a static string OR a function for time-bounded /
537
+ * rotating tokens.
538
+ *
539
+ * Recommended for any server that accepts ingestion writes from the
540
+ * public internet. Read-only deployments may omit it.
541
+ */
542
+ auth?: {
543
+ bearer: string | ((token: string) => boolean | Promise<boolean>);
544
+ };
545
+ }
546
+ declare function createApp(opts?: CreateAppOptions): Hono<hono_types.BlankEnv, hono_types.BlankSchema, "/">;
547
+ interface ServeOptions extends CreateAppOptions {
204
548
  /** Default 5005. */
205
549
  port?: number;
206
550
  /** Default '127.0.0.1'. Set to '0.0.0.0' to listen on all interfaces. */
@@ -208,4 +552,4 @@ interface ServeOptions {
208
552
  }
209
553
  declare function startServer(opts?: ServeOptions): ServerType;
210
554
 
211
- export { BUILTIN_RUBRICS, type ErrorResponse, ErrorResponseSchema, type FailureMode, FailureModeSchema, HealthResponseSchema, type JudgeRequest, JudgeRequestSchema, type JudgeResult, JudgeResultSchema, type ListRubricsResponse, ListRubricsResponseSchema, type Rubric, type RubricDimension, RubricDimensionSchema, type RubricInfo, RubricInfoSchema, RubricSchema, type ServeOptions, type VersionResponse, VersionResponseSchema, WIRE_VERSION, WireError, buildOpenApi, createApp, dispatchRpc, getBuiltinRubric, handleJudge, handleListRubrics, handleVersion, hashRubric, listBuiltinRubrics, runRpcBatch, runRpcOnce, startServer };
555
+ export { BUILTIN_RUBRICS, type ErrorResponse, ErrorResponseSchema, type FailureMode, FailureModeSchema, FeedbackAttemptSchema, type FeedbackIngestResponse, FeedbackIngestResponseSchema, FeedbackLabelSchema, type FeedbackTrajectory, FeedbackTrajectorySchema, HealthResponseSchema, type IngestionStores, type JudgeRequest, JudgeRequestSchema, type JudgeResult, JudgeResultSchema, type ListRubricsResponse, ListRubricsResponseSchema, type Rubric, type RubricDimension, RubricDimensionSchema, type RubricInfo, RubricInfoSchema, RubricSchema, type ServeOptions, type TraceEvent, TraceEventSchema, type TracesIngestRequest, TracesIngestRequestSchema, type TracesIngestResponse, TracesIngestResponseSchema, type VersionResponse, VersionResponseSchema, WIRE_VERSION, WireError, buildOpenApi, createApp, dispatchRpc, getBuiltinRubric, handleFeedbackIngest, handleJudge, handleListRubrics, handleTracesIngest, handleVersion, hashRubric, listBuiltinRubrics, runRpcBatch, runRpcOnce, startServer };