npm - @tangle-network/agent-eval - Versions diffs - 0.71.0 → 0.72.3 - Mend

@tangle-network/agent-eval 0.71.0 → 0.72.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (73) hide show

package/CHANGELOG.md +63 -0
package/dist/adapters/http.d.ts +1 -1
package/dist/adapters/langchain.d.ts +1 -1
package/dist/adapters/otel.d.ts +3 -2
package/dist/agent-profile-DYRboYWu.d.ts +364 -0
package/dist/analyst/index.d.ts +221 -0
package/dist/analyst/index.js +371 -0
package/dist/analyst/index.js.map +1 -0
package/dist/analyst-t7zZS3TV.d.ts +88 -0
package/dist/campaign/index.d.ts +485 -9
package/dist/campaign/index.js +618 -30
package/dist/campaign/index.js.map +1 -1
package/dist/chunk-7W4SM7FD.js +1075 -0
package/dist/chunk-7W4SM7FD.js.map +1 -0
package/dist/{chunk-AIWHLG7J.js → chunk-GJJNJVIR.js} +11 -11
package/dist/chunk-JHA3ZGSO.js +1496 -0
package/dist/chunk-JHA3ZGSO.js.map +1 -0
package/dist/{chunk-VMAYE3LM.js → chunk-JYE3WOTE.js} +57 -9
package/dist/{chunk-VMAYE3LM.js.map → chunk-JYE3WOTE.js.map} +1 -1
package/dist/chunk-LB2UOI5F.js +412 -0
package/dist/chunk-LB2UOI5F.js.map +1 -0
package/dist/{chunk-ODGETRTM.js → chunk-VUINJM5M.js} +234 -1415
package/dist/chunk-VUINJM5M.js.map +1 -0
package/dist/chunk-WYIHD6EB.js +1044 -0
package/dist/chunk-WYIHD6EB.js.map +1 -0
package/dist/{chunk-6QZUCFKM.js → chunk-XPILG2CA.js} +120 -3
package/dist/chunk-XPILG2CA.js.map +1 -0
package/dist/{chunk-6XQIEUQ2.js → chunk-ZPSKPT3V.js} +5 -3
package/dist/{chunk-6XQIEUQ2.js.map → chunk-ZPSKPT3V.js.map} +1 -1
package/dist/contract/index.d.ts +17 -13
package/dist/contract/index.js +14 -8
package/dist/contract/index.js.map +1 -1
package/dist/{control-DxvZeV5X.d.ts → control-BgA6BYTm.d.ts} +1 -1
package/dist/control.d.ts +2 -2
package/dist/{feedback-trajectory-8hKC5EOb.d.ts → feedback-trajectory-B3rErRsh.d.ts} +1 -1
package/dist/harness-optimizer-EnEnQPsr.d.ts +106 -0
package/dist/hosted/index.d.ts +223 -2
package/dist/index.d.ts +49 -1323
package/dist/index.js +339 -2627
package/dist/index.js.map +1 -1
package/dist/{index-BGBrVS24.d.ts → insight-report-Df3lxYXM.d.ts} +1 -221
package/dist/kind-factory-DW9XWPvM.d.ts +172 -0
package/dist/multi-layer-verifier-DlWCXuxL.d.ts +141 -0
package/dist/openapi.json +1 -1
package/dist/pareto-E-pembql.d.ts +81 -0
package/dist/{provenance-C69gLUXH.d.ts → provenance-B-TFszPW.d.ts} +131 -4
package/dist/redact-B40YG2M_.d.ts +45 -0
package/dist/registry-DuVYiTvw.d.ts +128 -0
package/dist/{researcher-WJvIpX3L.d.ts → researcher-C_KJyIGg.d.ts} +1 -141
package/dist/rl.d.ts +4 -3
package/dist/rl.js +4 -4
package/dist/{run-campaign-BVY3RGAZ.js → run-campaign-OVEZF24D.js} +2 -2
package/dist/run-critic-BAIjX99r.d.ts +56 -0
package/dist/{run-improvement-loop-Bzamo6GB.d.ts → run-improvement-loop-BqYH2vCR.d.ts} +25 -1
package/dist/semantic-concept-judge-CV9Wlx4t.d.ts +650 -0
package/dist/{store-jzKpMl16.d.ts → store-GmBE2pZZ.d.ts} +1 -1
package/dist/traces.d.ts +371 -308
package/dist/traces.js +43 -18
package/dist/{types-CnmZ2bkP.d.ts → types-Bba0vl1V.d.ts} +1 -1
package/dist/{registry-BGKyX6bw.d.ts → types-CRD68aH7.d.ts} +3 -128
package/dist/wire/index.d.ts +1 -1
package/dist/workflow/index.d.ts +494 -0
package/dist/workflow/index.js +2177 -0
package/dist/workflow/index.js.map +1 -0
package/docs/design/self-improvement-roadmap.md +106 -0
package/package.json +36 -12
package/dist/agent-profile-DzcPHR1Z.d.ts +0 -114
package/dist/chunk-6QZUCFKM.js.map +0 -1
package/dist/chunk-ODGETRTM.js.map +0 -1
package/dist/chunk-PQV2TKC3.js +0 -27
package/dist/chunk-PQV2TKC3.js.map +0 -1
/package/dist/{chunk-AIWHLG7J.js.map → chunk-GJJNJVIR.js.map} +0 -0
/package/dist/{run-campaign-BVY3RGAZ.js.map → run-campaign-OVEZF24D.js.map} +0 -0

package/dist/chunk-WYIHD6EB.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"sources":["../src/analyst/types.ts","../src/analyst/finding-subject.ts","../src/analyst/parse-tolerant.ts","../src/analyst/finding-signature.ts","../src/analyst/structure-findings.ts","../src/analyst/kind-factory.ts","../src/analyst/tool-groups.ts","../src/analyst/kinds/failure-mode.ts","../src/analyst/kinds/improvement.ts","../src/analyst/kinds/knowledge-gap.ts","../src/analyst/kinds/knowledge-poisoning.ts","../src/analyst/kinds/index.ts","../src/analyst/registry.ts"],"sourcesContent":["/**\n * Analyst contract — the missing orchestration layer over agent-eval's\n * existing analyzers (analyzeTraces, MultiLayerVerifier, RunCritic,\n * SemanticConceptJudge, JudgeFn, ...).\n *\n * Each existing primitive returns its own output shape. The Analyst\n * contract is the single envelope every primitive lifts into, so a\n * registry can run N analysts against a run and a single renderer can\n * compose findings without knowing which analyzer produced them.\n *\n * The contract is intentionally domain-agnostic: nothing here knows\n * about code, voice, RAG, or any particular agent stack. Analysts\n * declare what INPUT KIND they need (a trace store, an artifact dir,\n * a RunRecord, a JudgeInput, or `custom`), and the registry routes\n * the matching input from `AnalystRunInputs`.\n */\n\nimport { createHash } from 'node:crypto'\nimport type { RunRecord } from '../run-record'\nimport type { TraceAnalysisStore } from '../trace-analyst/store'\nimport type { JudgeInput } from '../types'\nimport type { ChatClient } from './chat-client'\n\n/**\n * Unified envelope every analyst emits. Schema-versioned so renderers\n * and time-series diffs survive future field additions.\n */\nexport interface AnalystFinding {\n schema_version: '1.0.0'\n /**\n * Stable hash over identity-defining fields (analyst_id + canonical\n * claim + area + optional subject). 
Two findings from two runs that\n * \"are the same finding\" share this id — that's what `diffFindings`\n * uses to compute appeared/disappeared sets across runs.\n */\n finding_id: string\n analyst_id: string\n produced_at: string\n severity: AnalystSeverity\n /**\n * Coarse classification. Renderers group by this. Free-form so\n * domain-specific analysts can introduce categories without a\n * schema change ('agent-reasoning', 'verification', 'cost',\n * 'tool-use', 'safety', 'latency', 'data-quality', ...).\n */\n area: string\n claim: string\n rationale?: string\n evidence_refs: EvidenceRef[]\n recommended_action?: string\n validation_plan?: string\n /** 0..1 — the analyst's own confidence. Not calibrated across analysts. */\n confidence: number\n /**\n * Optional subject the finding is about — leaf id, agent id, request\n * id. Included in finding_id when present so per-subject findings\n * diff cleanly across runs.\n */\n subject?: string\n /** Analyst-private extras; renderers ignore unless they know the analyst. */\n metadata?: Record<string, unknown>\n}\n\nexport type AnalystSeverity = 'critical' | 'high' | 'medium' | 'low' | 'info'\n\nexport interface EvidenceRef {\n /**\n * Where the evidence lives. `span` and `event` refer to OTLP trace\n * elements; `artifact` to a file inside the run's artifact tree;\n * `finding` to another AnalystFinding (cross-analyst chaining);\n * `metric` to a named scalar reading the renderer knows how to read.\n */\n kind: 'span' | 'event' | 'artifact' | 'finding' | 'metric'\n uri: string\n excerpt?: string\n}\n\n// ── Analyst contract ─────────────────────────────────────────────────\n\n/**\n * The discriminator the registry uses to pass the right input.\n * `custom` is the escape hatch — analysts that need something else\n * (e.g. 
an embedding cache, a partner SDK handle) read it from\n * `AnalystRunInputs.custom[<analyst id>]`.\n */\nexport type AnalystInputKind =\n | 'trace-store'\n | 'artifact-dir'\n | 'run-record'\n | 'judge-input'\n | 'custom'\n\nexport interface AnalystCost {\n /** `deterministic` analysts MUST NOT call the LLM. */\n kind: 'deterministic' | 'llm'\n /** Optional declared upper bound; the registry can enforce a budget. */\n est_usd_per_run?: number\n /** Models the analyst expects to use (informational). */\n models?: string[]\n}\n\nexport interface AnalystRequirements {\n /** Min number of shots / samples the analyst needs to produce signal. */\n min_shots?: number\n /** Capabilities the runtime must supply (e.g. ['network', 'gpu']). */\n capabilities?: string[]\n}\n\n/**\n * What's passed to every analyst call. The registry resolves which\n * field the analyst's `inputKind` selects and asserts it's present.\n */\nexport interface AnalystRunInputs {\n traceStore?: TraceAnalysisStore\n artifactDir?: string\n runRecord?: RunRecord\n judgeInput?: JudgeInput\n /** Keyed by analyst id; populated by callers that registered custom analysts. */\n custom?: Record<string, unknown>\n}\n\nexport interface AnalystContext {\n runId: string\n /** Stable correlation id so logs from a single registry.run() share a tag. */\n correlationId: string\n /** Wall-clock deadline (epoch ms). Analysts SHOULD honor for graceful cancel. */\n deadlineMs?: number\n /** Per-analyst USD budget. Analysts MAY check before issuing LLM calls. */\n budgetUsd?: number\n /**\n * Shared chat client. Analysts that call an LLM go through this so\n * the operator picks transport (sandbox-sdk | router | cli-bridge |\n * direct-provider | mock) at the registry boundary without touching\n * analyst code.\n */\n chat?: ChatClient\n /**\n * Findings from a prior run the operator wants the analyst to see as\n * retrieval context. 
Kinds that take advantage of cross-run memory\n * (failure-mode \"I saw this cluster last run\", knowledge-gap \"the wiki\n * page I asked for is still missing\") render these into the actor's\n * working set. Filtering is the operator's job: pass the slice that\n * matches the analyst's id, or pass everything and let the kind\n * filter. Empty / absent means no cross-run context.\n */\n priorFindings?: ReadonlyArray<AnalystFinding>\n /** Free-form runtime tags (env, host, op). Findings can echo these into metadata. */\n tags?: Record<string, string>\n /** Logger callback — analysts SHOULD prefer this over console.* for testability. */\n log?: (msg: string, fields?: Record<string, unknown>) => void\n /** Optional abort signal. Analysts SHOULD pass it through to LLM calls. */\n signal?: AbortSignal\n}\n\n/**\n * The minimal contract. Concrete analysts can refine `TInput` so\n * implementations stay type-safe (e.g. a trace analyst's `TInput` is\n * `TraceAnalysisStore`); the registry passes the right field from\n * `AnalystRunInputs` based on `inputKind`.\n */\nexport interface Analyst<TInput = unknown> {\n /** Stable identifier — appears in finding_id, telemetry, and registry exclusion lists. */\n readonly id: string\n /** Human-readable. One sentence. */\n readonly description: string\n readonly inputKind: AnalystInputKind\n readonly cost: AnalystCost\n readonly requires?: AnalystRequirements\n /** Bump on breaking changes to claim wording or area so old finding_ids don't collide. 
*/\n readonly version: string\n analyze(input: TInput, ctx: AnalystContext): Promise<AnalystFinding[]>\n}\n\n// ── finding_id stability ─────────────────────────────────────────────\n\n/**\n * Compute the stable finding_id from the identity-defining fields.\n * Default implementation hashes {analyst_id, area, subject, normalized claim}.\n * Analysts that emit findings whose claim text varies per run (timestamps,\n * counts) SHOULD either: (a) pass an explicit `id_basis` to fix the hash,\n * or (b) move the variable part into `rationale`/`metadata` and keep the\n * `claim` static.\n */\nexport function computeFindingId(input: {\n analyst_id: string\n area: string\n subject?: string\n claim: string\n /** Override the claim for hashing — use when the displayed claim has run-specific bits. */\n id_basis?: string\n}): string {\n const basis = JSON.stringify({\n a: input.analyst_id,\n r: input.area,\n s: input.subject ?? '',\n c: normalizeClaim(input.id_basis ?? input.claim),\n })\n return `f_${createHash('sha256').update(basis).digest('hex').slice(0, 20)}`\n}\n\nfunction normalizeClaim(c: string): string {\n // Lowercase, collapse whitespace, strip trailing punctuation. Goal:\n // \"Leaf X failed install\" and \"Leaf X failed install.\" hash the same.\n return c\n .toLowerCase()\n .replace(/\\s+/g, ' ')\n .replace(/[.!?;:,]+$/g, '')\n .trim()\n}\n\n/**\n * Convenience factory: produce a fully-formed AnalystFinding with the\n * id computed automatically. Analyst code stays terse.\n */\nexport function makeFinding(\n init: Omit<AnalystFinding, 'schema_version' | 'finding_id' | 'produced_at'> & {\n id_basis?: string\n produced_at?: string\n },\n): AnalystFinding {\n const { id_basis, produced_at, ...rest } = init\n return {\n schema_version: '1.0.0',\n finding_id: computeFindingId({\n analyst_id: rest.analyst_id,\n area: rest.area,\n subject: rest.subject,\n claim: rest.claim,\n id_basis,\n }),\n produced_at: produced_at ?? 
new Date().toISOString(),\n ...rest,\n }\n}\n\n// ── Registry result envelope ────────────────────────────────────────\n\nexport interface AnalystRunSummary {\n analyst_id: string\n status: 'ok' | 'skipped' | 'failed'\n /** Why skipped — missing input, budget exceeded, capability unmet. */\n reason?: string\n findings_count: number\n latency_ms: number\n cost_usd: number\n /** When `status='failed'`: the error class + message, never the full stack. */\n error?: { class: string; message: string }\n}\n\nexport interface AnalystRunResult {\n run_id: string\n correlation_id: string\n started_at: string\n ended_at: string\n findings: AnalystFinding[]\n per_analyst: AnalystRunSummary[]\n /** Total LLM cost in USD across all analysts in this registry.run(). */\n total_cost_usd: number\n}\n\n// ── Streaming event envelope ────────────────────────────────────────\n\n/**\n * Events emitted by `AnalystRegistry.runStream(...)` in real time as\n * the registry executes. UIs subscribe via `for await (const ev of\n * registry.runStream(...))`; `registry.run(...)` is a thin collector\n * over the same stream, so the two surfaces share their invariants.\n *\n * Per-finding events are intentionally omitted — analyzers are batch\n * operations (an Ax actor returns the full `findings:json[]` at the\n * end of the responder), so streaming inside one analyst would only\n * emit partial JSON consumers can't render. The kind-completion event\n * is the right granularity; subscribers wanting per-finding rendering\n * iterate `event.findings` themselves.\n */\nexport type AnalystRunEvent =\n | {\n type: 'run-started'\n run_id: string\n correlation_id: string\n started_at: string\n /** The ordered list of analyst ids the registry will run. 
*/\n analyst_ids: ReadonlyArray<string>\n }\n | {\n type: 'analyst-skipped'\n summary: AnalystRunSummary\n }\n | {\n type: 'analyst-started'\n analyst_id: string\n started_at: string\n }\n | {\n type: 'analyst-completed'\n /** `summary.status` is `'ok'` for clean completion or `'failed'` for thrown analysts. */\n summary: AnalystRunSummary\n findings: ReadonlyArray<AnalystFinding>\n }\n | {\n type: 'run-completed'\n result: AnalystRunResult\n }\n","/**\n * Typed `FindingSubject` — the canonical grammar every analyst kind emits.\n *\n * Background: kind actor prompts have always documented a subject grammar\n * (e.g. `system-prompt:<section>`, `agent-knowledge:wiki:<slug>`) but the\n * LLM was unconstrained — it could emit `subject: \"fix the prompt\"`\n * (prose) and downstream adapters routed on `startsWith(...)` would\n * silently skip it. Every per-vertical `ImprovementAdapter` had a\n * routing table that mostly caught nothing.\n *\n * This module fixes that:\n * - `parseFindingSubject(raw)` — returns the typed `FindingSubject`\n * when `raw` matches the grammar, else `null`. Used at the\n * `RawAnalystFindingSchema` boundary so malformed subjects are\n * rejected loudly instead of silently lifted into the registry.\n * - `FindingSubjectKind` — the union of valid locus categories. Each\n * variant carries the typed components downstream adapters resolve\n * against the agent's surface manifest (no string parsing in the\n * adapter).\n * - `FINDING_SUBJECT_GRAMMAR_PROMPT` — single source of truth for the\n * grammar string embedded in kind actor prompts. Drift between\n * prompt and parser is impossible if every kind imports this.\n *\n * The grammar is intentionally NARROW — only loci the substrate's\n * default `ImprovementAdapter` / `KnowledgeAdapter` can act on. 
A\n * finding with a subject outside this set fails the parser; the kind\n * author either extends the grammar here (and adds adapter routing)\n * or rephrases the prompt to map onto an existing variant.\n *\n * `failure-mode` is the one exception — its subjects are free-form\n * cluster labels, not loci. The schema preserves them as\n * `{ kind: 'cluster', label }` and the adapters skip them (cluster\n * findings are evidence, not actionable mutations).\n */\n\nimport { z } from 'zod'\n\n// ── canonical grammar ─────────────────────────────────────────────────\n\n/**\n * Discriminated union of every locus the substrate can route findings to.\n *\n * Adapters narrow on `kind` and use the typed components (no string\n * parsing). Adding a variant here REQUIRES updating the parser, the\n * grammar prompt, and at least one adapter — by design.\n */\nexport type FindingSubject =\n // ── agent-knowledge:* — routed to the KnowledgeAdapter ──\n | { kind: 'knowledge.wiki'; slug: string; heading?: string }\n | { kind: 'knowledge.claim'; topic: string }\n | { kind: 'knowledge.raw'; sourceId: string }\n | { kind: 'knowledge.stale'; slug: string }\n // ── system-prompt / tool / new-tool / rag / memory / scaffolding / output-schema ──\n // routed to the ImprovementAdapter\n | { kind: 'system-prompt'; section: string }\n | { kind: 'tool-doc'; tool: string; aspect?: string }\n | { kind: 'new-tool'; name: string }\n | { kind: 'rag'; corpus: string; docId: string }\n | { kind: 'memory'; key: string }\n | { kind: 'scaffolding'; concern: string }\n | { kind: 'output-schema'; field: string }\n // ── websearch / prior-run-summary — routed to the KnowledgeAdapter as stale signals\n | { kind: 'websearch.outdated'; topic: string }\n | { kind: 'prior-run-summary'; topic: string }\n // ── failure-mode cluster label — preserved verbatim, not routed\n | { kind: 'cluster'; label: string }\n\nexport type FindingSubjectKind = FindingSubject['kind']\n\nexport const FINDING_SUBJECT_KINDS: 
ReadonlyArray<FindingSubjectKind> = [\n 'knowledge.wiki',\n 'knowledge.claim',\n 'knowledge.raw',\n 'knowledge.stale',\n 'system-prompt',\n 'tool-doc',\n 'new-tool',\n 'rag',\n 'memory',\n 'scaffolding',\n 'output-schema',\n 'websearch.outdated',\n 'prior-run-summary',\n 'cluster',\n]\n\n// ── parser ────────────────────────────────────────────────────────────\n\n/**\n * Parse a raw subject string emitted by an analyst kind's actor.\n *\n * Returns the typed `FindingSubject` when `raw` matches the grammar,\n * else `null`. Callers use the `null` return as a signal to either\n * (a) reject the finding at parse time (kinds that emit typed loci —\n * knowledge-gap, improvement, knowledge-poisoning) or (b) lift it as\n * a cluster label (failure-mode).\n *\n * Slugs are constrained to `[a-z0-9-]+` (lowercase kebab) to keep file\n * paths sane downstream. Topics / keys / sections allow any non-empty\n * string (free-form for the LLM's voice) but get trimmed.\n *\n * Empty / whitespace-only inputs return `null`. `undefined` returns\n * `null`. Both are surfaced by the caller as a rejected subject.\n */\nexport function parseFindingSubject(raw: string | null | undefined): FindingSubject | null {\n if (raw === null || raw === undefined) return null\n const trimmed = raw.trim()\n if (trimmed.length === 0) return null\n\n // agent-knowledge:wiki:<slug>[#<heading>]\n const wiki = trimmed.match(\n /^agent-knowledge:wiki:([a-z0-9][a-z0-9-]*)(?:#([a-z0-9][a-z0-9-]*))?$/,\n )\n if (wiki)\n return { kind: 'knowledge.wiki', slug: wiki[1]!, ...(wiki[2] ? 
{ heading: wiki[2] } : {}) }\n\n // agent-knowledge:claim:<topic>\n const claim = trimmed.match(/^agent-knowledge:claim:(.+)$/)\n if (claim && claim[1]!.trim().length > 0)\n return { kind: 'knowledge.claim', topic: claim[1]!.trim() }\n\n // agent-knowledge:raw:<source-id>\n const raw_ = trimmed.match(/^agent-knowledge:raw:(.+)$/)\n if (raw_ && raw_[1]!.trim().length > 0)\n return { kind: 'knowledge.raw', sourceId: raw_[1]!.trim() }\n\n // agent-knowledge:stale:<slug>\n const stale = trimmed.match(/^agent-knowledge:stale:([a-z0-9][a-z0-9-]*)$/)\n if (stale) return { kind: 'knowledge.stale', slug: stale[1]! }\n\n // system-prompt:<section>\n const sp = trimmed.match(/^system-prompt:(.+)$/)\n if (sp && sp[1]!.trim().length > 0) return { kind: 'system-prompt', section: sp[1]!.trim() }\n\n // tool-doc:<tool>[:<aspect>]\n const tdAspect = trimmed.match(/^tool-doc:([a-z0-9][a-z0-9_-]*):(.+)$/)\n if (tdAspect && tdAspect[2]!.trim().length > 0) {\n return { kind: 'tool-doc', tool: tdAspect[1]!, aspect: tdAspect[2]!.trim() }\n }\n const td = trimmed.match(/^tool-doc:([a-z0-9][a-z0-9_-]*)$/)\n if (td) return { kind: 'tool-doc', tool: td[1]! }\n\n // new-tool:<name>\n const nt = trimmed.match(/^new-tool:([a-z0-9][a-z0-9_-]*)$/)\n if (nt) return { kind: 'new-tool', name: nt[1]! 
}\n\n // rag:<corpus>:<doc-id>\n const rag = trimmed.match(/^rag:([a-z0-9][a-z0-9_-]*):(.+)$/)\n if (rag && rag[2]!.trim().length > 0) {\n return { kind: 'rag', corpus: rag[1]!, docId: rag[2]!.trim() }\n }\n\n // memory:<key>\n const mem = trimmed.match(/^memory:(.+)$/)\n if (mem && mem[1]!.trim().length > 0) return { kind: 'memory', key: mem[1]!.trim() }\n\n // scaffolding:<concern>\n const sc = trimmed.match(/^scaffolding:(.+)$/)\n if (sc && sc[1]!.trim().length > 0) return { kind: 'scaffolding', concern: sc[1]!.trim() }\n\n // output-schema:<field>\n const os = trimmed.match(/^output-schema:(.+)$/)\n if (os && os[1]!.trim().length > 0) return { kind: 'output-schema', field: os[1]!.trim() }\n\n // websearch:outdated:<topic>\n const ws = trimmed.match(/^websearch:outdated:(.+)$/)\n if (ws && ws[1]!.trim().length > 0) return { kind: 'websearch.outdated', topic: ws[1]!.trim() }\n\n // prior-run-summary:<topic>\n const prs = trimmed.match(/^prior-run-summary:(.+)$/)\n if (prs && prs[1]!.trim().length > 0) return { kind: 'prior-run-summary', topic: prs[1]!.trim() }\n\n // cluster (no prefix — a free-form evidence label, never a routed locus, so\n // it admits dotted/underscored identifiers like `appworld.task.530b157_1`.\n // ':' stays excluded so it cannot collide with the prefixed grammars above.)\n if (/^[a-z0-9][a-z0-9._-]*$/.test(trimmed) && trimmed.length <= 80) {\n return { kind: 'cluster', label: trimmed }\n }\n\n return null\n}\n\n/**\n * Render the parsed subject back to its canonical string form. Inverse\n * of `parseFindingSubject`; useful when the substrate constructs new\n * findings programmatically (e.g. for tests, replays, or\n * `id_basis` carry-forward).\n */\nexport function renderFindingSubject(s: FindingSubject): string {\n switch (s.kind) {\n case 'knowledge.wiki':\n return s.heading\n ? 
`agent-knowledge:wiki:${s.slug}#${s.heading}`\n : `agent-knowledge:wiki:${s.slug}`\n case 'knowledge.claim':\n return `agent-knowledge:claim:${s.topic}`\n case 'knowledge.raw':\n return `agent-knowledge:raw:${s.sourceId}`\n case 'knowledge.stale':\n return `agent-knowledge:stale:${s.slug}`\n case 'system-prompt':\n return `system-prompt:${s.section}`\n case 'tool-doc':\n return s.aspect ? `tool-doc:${s.tool}:${s.aspect}` : `tool-doc:${s.tool}`\n case 'new-tool':\n return `new-tool:${s.name}`\n case 'rag':\n return `rag:${s.corpus}:${s.docId}`\n case 'memory':\n return `memory:${s.key}`\n case 'scaffolding':\n return `scaffolding:${s.concern}`\n case 'output-schema':\n return `output-schema:${s.field}`\n case 'websearch.outdated':\n return `websearch:outdated:${s.topic}`\n case 'prior-run-summary':\n return `prior-run-summary:${s.topic}`\n case 'cluster':\n return s.label\n }\n}\n\n// ── grammar prompt — single source of truth for actor instructions ──\n\n/**\n * The grammar text embedded into kind actor prompts. Kinds opt into\n * the subset of variants they emit (e.g. 
`improvement` excludes the\n * cluster variant; `failure-mode` includes ONLY the cluster variant).\n *\n * Drift between prompt and parser is impossible: every kind imports\n * this constant + the matching `expects` set, and the unit tests below\n * lock the table to the parser.\n */\nexport const FINDING_SUBJECT_GRAMMAR_PROMPT = [\n 'Subjects MUST match this grammar — anything else is rejected at parse time and your work is wasted:',\n '',\n ' Knowledge loci (write to the agent-knowledge base):',\n ' agent-knowledge:wiki:<slug>[#<heading>] create / update a wiki page',\n ' agent-knowledge:claim:<topic> draft a claim / relation triple',\n ' agent-knowledge:raw:<source-id> lift a raw source into a curated page',\n ' agent-knowledge:stale:<slug> mark a page superseded',\n '',\n ' Runtime mutable surfaces (write to prompts / tools / scaffolding):',\n ' system-prompt:<section> add / replace a system-prompt section',\n ' tool-doc:<tool>[:<aspect>] rewrite a tool description',\n ' new-tool:<name> propose a new tool surface',\n ' rag:<corpus>:<doc-id> ingest / correct a RAG document',\n ' memory:<key> invalidate / set a memory entry',\n ' scaffolding:<concern> change a precondition / retry / verifier',\n ' output-schema:<field> constrain the agent output shape',\n '',\n ' Stale signals (knowledge-poisoning only):',\n ' websearch:outdated:<topic> stale web result',\n ' prior-run-summary:<topic> stale prior-run summary',\n '',\n ' Cluster label (failure-mode only):',\n ' <kebab-case-label> short cluster id, e.g. \"tool-call-loop\"',\n '',\n 'Slugs / tool ids: [a-z0-9-]+ (lowercase kebab). Topics / keys / sections: free-form, trimmed.',\n].join('\\n')\n\n// ── kind expects sets ─────────────────────────────────────────────────\n\n/**\n * The variants each kind is allowed to emit. 
Used at the kind factory\n * boundary so a knowledge-gap finding can't sneak in a `system-prompt:*`\n * subject (the improvement-analyst's job) and vice versa.\n *\n * `failure-mode` is restricted to `cluster` — the only kind that emits\n * a non-locus subject.\n */\nexport const KIND_EXPECTED_SUBJECTS: Record<string, ReadonlyArray<FindingSubjectKind>> = {\n 'failure-mode': ['cluster'],\n 'knowledge-gap': [\n 'knowledge.wiki',\n 'knowledge.claim',\n 'knowledge.raw',\n 'knowledge.stale',\n 'tool-doc',\n 'system-prompt',\n 'memory',\n 'websearch.outdated',\n 'prior-run-summary',\n ],\n 'knowledge-poisoning': [\n 'knowledge.wiki',\n 'knowledge.claim',\n 'knowledge.raw',\n 'tool-doc',\n 'system-prompt',\n 'memory',\n 'websearch.outdated',\n 'prior-run-summary',\n ],\n improvement: [\n 'system-prompt',\n 'tool-doc',\n 'new-tool',\n 'rag',\n 'memory',\n 'scaffolding',\n 'output-schema',\n 'knowledge.wiki',\n 'knowledge.claim',\n ],\n}\n\n// ── Zod schema for boundary validation ───────────────────────────────\n\n/**\n * Zod schema that validates a raw subject string and returns the parsed\n * `FindingSubject`. Embedded in `RawAnalystFindingSchema` via\n * `transform`, so `subject` arrives at the kind factory either as a\n * typed locus or as a parse error attached to a single Zod issue.\n *\n * Optionality is preserved: subjects ARE optional on the wire (some\n * findings are descriptive, not actionable). When present, they MUST\n * parse — emitting a malformed subject is a contract violation, not a\n * soft signal.\n */\nexport const FindingSubjectStringSchema = z\n .string()\n .refine((s) => parseFindingSubject(s) !== null, {\n message: 'subject does not match the finding-subject grammar',\n })\n","/**\n * Forgiving pre-parse for analyst findings. Weak models routinely emit\n * schema-correct content in an unusable wrapper — fenced ```json blocks, a\n * single object where an array is expected, trailing commas. 
Measured: GPT-4o\n * drops to 0% usable output purely from markdown-fence wrapping\n * (arXiv:2605.02363). A five-line de-fence recovers most of it. This module is\n * the de-fence/coerce step that runs BEFORE Zod, so a recoverable finding is\n * repaired, not dropped.\n *\n * Pure + deterministic. No model, no network.\n */\n\n/** Strip a ```lang ... ``` (or bare ``` ... ```) code fence, if the string is one. */\nexport function stripCodeFences(text: string): string {\n const t = text.trim()\n const fence = /^```[a-zA-Z0-9]*\\s*\\n?([\\s\\S]*?)\\n?```$/\n const m = t.match(fence)\n return m ? m[1]!.trim() : t\n}\n\n/** Remove trailing commas before } or ] — the most common near-JSON defect. */\nfunction dropTrailingCommas(s: string): string {\n return s.replace(/,(\\s*[}\\]])/g, '$1')\n}\n\n/**\n * Best-effort parse of a string into JSON. De-fences, drops trailing commas,\n * then `JSON.parse`. Returns `undefined` (never throws) when unrecoverable.\n */\nexport function coerceJson(text: string): unknown {\n const candidate = dropTrailingCommas(stripCodeFences(text))\n try {\n return JSON.parse(candidate)\n } catch {\n return undefined\n }\n}\n\n/**\n * Coerce arbitrary actor/structurer output into an array of candidate finding\n * rows: a JSON string → parse; a single object → 1-element array; an array →\n * as-is; anything else → []. Callers still run each row through Zod\n * (`parseRawFinding`) — this only fixes the SHAPE, never invents fields.\n */\nexport function coerceToFindingRows(raw: unknown): unknown[] {\n let value = raw\n if (typeof value === 'string') {\n const parsed = coerceJson(value)\n if (parsed === undefined) return []\n value = parsed\n }\n if (Array.isArray(value)) return value\n if (value && typeof value === 'object') {\n // Some models wrap the array as { findings: [...] 
} — unwrap that one case.\n const inner = (value as Record<string, unknown>).findings\n if (Array.isArray(inner)) return inner\n return [value]\n }\n return []\n}\n","/**\n * Typed Ax output for analyst findings.\n *\n * Replaces the legacy `findings:string[]` pattern (where every bullet\n * became a flat-severity `AnalystFinding`) with a structured object\n * array. Ax binds the field as `findings:json[]` so the provider emits\n * native structured output; at the kind-factory boundary we Zod-validate\n * each emitted finding so malformed rows fail loud instead of being\n * silently lifted with default severity.\n *\n * Why not `f.object().array()` directly in the signature? The Ax\n * signature string `question:string -> findings:json[]` already lets\n * the provider emit JSON arrays. A Zod boundary is required either\n * way (the provider can return any JSON), and Zod gives us a single\n * validation surface independent of which Ax version is installed.\n */\n\nimport { z } from 'zod'\nimport { parseFindingSubject } from './finding-subject'\nimport { coerceJson } from './parse-tolerant'\n\nexport const ANALYST_SEVERITIES = ['critical', 'high', 'medium', 'low', 'info'] as const\n\nexport const RawAnalystFindingSchema = z\n .object({\n severity: z.enum(ANALYST_SEVERITIES),\n claim: z.string().min(1).max(2000),\n /**\n * Subject locus the finding is about. Validated at parse time\n * against the documented grammar (`finding-subject.ts`). 
Findings\n * with a malformed subject are rejected — they would have been\n * silently skipped by every downstream adapter, so failing loud at\n * parse time turns a hidden no-op into a kind-prompt audit signal.\n *\n * Optional because purely descriptive findings (no actionable\n * locus) are legitimate; they just don't route through the\n * KnowledgeAdapter / ImprovementAdapter.\n */\n subject: z\n .string()\n .max(400)\n .refine((s) => parseFindingSubject(s) !== null, {\n message: 'subject does not match the finding-subject grammar',\n })\n .optional(),\n evidence_uri: z.string().min(1).max(2000),\n evidence_excerpt: z.string().max(2000).optional(),\n confidence: z.number().min(0).max(1),\n rationale: z.string().max(4000).optional(),\n recommended_action: z.string().max(2000).optional(),\n })\n .strict()\n\nexport type RawAnalystFinding = z.infer<typeof RawAnalystFindingSchema>\n\n/**\n * Description embedded into the actor prompt so the LLM knows what\n * shape to emit. Kept here so kinds share one source of truth rather\n * than restating the schema in every prompt.\n */\nexport const RAW_FINDING_SCHEMA_PROMPT = `Each finding MUST be a JSON object with these fields:\n - severity: one of \"critical\" | \"high\" | \"medium\" | \"low\" | \"info\"\n - claim: one-sentence statement (max 2000 chars)\n - subject?: the routing locus this finding is about. It MUST be one of the exact subject forms listed in this kind's instructions above (e.g. \\`system-prompt:<section>\\`, \\`agent-knowledge:wiki:<slug>\\`, \\`tool-doc:<tool>\\`). A free phrase, a bare noun, or any form not in that list is REJECTED at parse time and the finding is discarded — omit subject entirely rather than guess a form.\n - evidence_uri: REQUIRED, never blank. Exactly one of \"span://<trace_id>/<span_id>\" (trace evidence), \"artifact://<relative-path>\" (files), \"metric://<name>\" (named scalars) — ALWAYS cite a real id surfaced by the tools. 
If you have no citable id, do not emit the finding.\n - evidence_excerpt?: short quote (<=2000 chars) from the cited span/artifact\n - confidence: number 0..1 — 0.9+ when backed by exact quotes, 0.6-0.8 for inferred patterns, <0.5 for speculative\n - rationale?: one or two sentences explaining the reasoning\n - recommended_action?: concrete change phrased as an imperative (\"Add ...\", \"Replace ...\", \"Stop ...\") — omit when the finding is purely descriptive\n\nEmit an empty array when the question has no findings to report. Do not fabricate evidence.`\n\n/**\n * Validate one row emitted by the LLM. Returns the typed finding on\n * success; returns `null` and logs the reason on failure so the kind\n * factory can skip-and-count rather than abort the whole analyst run.\n */\nexport function parseRawFinding(\n row: unknown,\n log?: (msg: string, fields?: Record<string, unknown>) => void,\n): RawAnalystFinding | null {\n const result = RawAnalystFindingSchema.safeParse(row)\n if (result.success) return result.data\n // A schema-correct finding in an unusable wrapper (a JSON string, a fenced\n // block) should be repaired, not dropped. Coerce the shape and retry ONCE.\n if (typeof row === 'string') {\n const coerced = coerceJson(row)\n if (coerced !== undefined) {\n const retry = RawAnalystFindingSchema.safeParse(coerced)\n if (retry.success) return retry.data\n }\n }\n log?.('finding rejected: schema failure', {\n issues: result.error.issues.map((i) => ({\n path: i.path.join('.'),\n code: i.code,\n message: i.message,\n })),\n })\n return null\n}\n","/**\n * `structureFindings` — the deferred structuring pass (DSPy TwoStepAdapter /\n * HALO `synthesize_traces` analog). The agentic actor reasons FREE-FORM and\n * emits a prose `report` (which any model does reliably); this separate, cheap\n * call's ONLY job is to turn that report into `AnalystFinding[]`. 
Decoupling\n * reasoning from structuring is what makes the SEMANTIC findings model-agnostic\n * — the reasoning model never has to satisfy a strict typed-array contract\n * while it diagnoses.\n *\n * Forgiving: the response runs through `coerceToFindingRows` (de-fence, lift\n * single→array) before Zod, and on a zero-finding extraction from a substantive\n * report it reasks ONCE with the schema restated. Returns a typed outcome so a\n * legitimate \"nothing to report\" is distinguishable from a failed extraction\n * (no silent empty).\n */\n\nimport { callLlm, type LlmClientOptions } from '../llm-client'\nimport { parseRawFinding, type RawAnalystFinding } from './finding-signature'\nimport { coerceToFindingRows } from './parse-tolerant'\nimport { type AnalystFinding, makeFinding } from './types'\n\nexport interface StructureFindingsOptions {\n /** The actor's free-form diagnosis prose. */\n report: string\n analystId: string\n /** Coarse classification stamped on every extracted finding. */\n area: string\n model: string\n baseUrl: string\n apiKey?: string\n /** Max reask attempts after a zero/invalid extraction. Default 1. */\n maxReasks?: number\n /** Test seam: inject a fetch (no network in unit tests). */\n fetchImpl?: LlmClientOptions['fetch']\n}\n\nexport interface StructureFindingsResult {\n findings: AnalystFinding[]\n outcome: 'ok' | 'extraction_failed'\n}\n\nconst SYSTEM = [\n 'You convert a free-form trace-analysis report into a STRICT JSON array of findings.',\n 'Output ONLY the JSON array — no prose, no code fences.',\n 'Each element: {\"severity\":\"critical|high|medium|low|info\",\"claim\":string,\"evidence_uri\":string,',\n '\"subject\"?:string,\"rationale\"?:string,\"recommended_action\"?:string,\"confidence\":number(0..1)}.',\n 'evidence_uri cites the trace element the report referenced (e.g. 
\"span://<trace>/<span>\") or \"report://summary\".',\n 'If the report asserts NO problems, output exactly [].',\n].join(' ')\n\nfunction buildRows(raw: unknown, analystId: string, area: string): AnalystFinding[] {\n const rows = coerceToFindingRows(raw)\n const out: AnalystFinding[] = []\n for (const row of rows) {\n // Recovery findings are extracted from PROSE — the report itself is the\n // evidence. A weak model often returns a sound claim + severity but omits\n // `evidence_uri`; default it to the report rather than dropping the row\n // (the strict evidence_uri requirement is a recovery yield-killer).\n const normalized =\n row &&\n typeof row === 'object' &&\n !Array.isArray(row) &&\n !(row as Record<string, unknown>).evidence_uri\n ? { ...(row as Record<string, unknown>), evidence_uri: 'report://summary' }\n : row\n const parsed: RawAnalystFinding | null = parseRawFinding(normalized)\n if (!parsed) continue\n out.push(\n makeFinding({\n analyst_id: analystId,\n area,\n subject: parsed.subject,\n claim: parsed.claim,\n rationale: parsed.rationale,\n severity: parsed.severity,\n confidence: parsed.confidence,\n evidence_refs: [\n {\n kind: parsed.evidence_uri.startsWith('span://') ? 'span' : 'artifact',\n uri: parsed.evidence_uri,\n excerpt: parsed.evidence_excerpt,\n },\n ],\n recommended_action: parsed.recommended_action,\n }),\n )\n }\n return out\n}\n\nexport async function structureFindings(\n opts: StructureFindingsOptions,\n): Promise<StructureFindingsResult> {\n const maxReasks = opts.maxReasks ?? 
1\n const llm = { baseUrl: opts.baseUrl, apiKey: opts.apiKey, fetch: opts.fetchImpl }\n let user = `TRACE-ANALYSIS REPORT:\\n${opts.report}\\n\\nReturn the findings JSON array.`\n\n for (let attempt = 0; attempt <= maxReasks; attempt++) {\n const res = await callLlm(\n {\n model: opts.model,\n messages: [\n { role: 'system', content: SYSTEM },\n { role: 'user', content: user },\n ],\n },\n llm,\n )\n const text = res.content.trim()\n const findings = buildRows(text, opts.analystId, opts.area)\n if (findings.length > 0) return { findings, outcome: 'ok' }\n // A report that asserts nothing is a legitimate empty — only reask when the\n // report is substantive (the extraction, not the diagnosis, likely failed).\n if (opts.report.trim().length < 200) return { findings: [], outcome: 'ok' }\n user = `${user}\\n\\nThat produced no valid findings. The report DOES describe issues — re-extract them as the strict JSON array described in the system prompt. Output ONLY the array.`\n }\n return { findings: [], outcome: 'extraction_failed' }\n}\n","/**\n * Analyst-kind factory — the typed, focused replacement for the\n * legacy `createTraceAnalystAdapter`.\n *\n * A \"kind\" is a specialized analyst whose actor prompt, tool subset,\n * and Ax recursion config target one failure-mode lens (failure-mode\n * classification, knowledge gap discovery, knowledge poisoning, recursive\n * self-improvement, ...). Kinds emit findings in the typed `RawAnalystFinding`\n * shape via a JSON-array Ax output; the factory validates each row with\n * Zod and lifts it into `AnalystFinding[]` with no shape guessing.\n *\n * Composition rules:\n * - Each kind owns its actor description. No generic \"answer this\n * question\" prompt — the prompt names the failure lens.\n * - Each kind picks a narrow tool subset from `ANALYST_TOOL_GROUPS`.\n * A kind that never needs full-trace dumps can drop `viewTrace` /\n * `viewSpans` and stay cheap.\n * - Each kind declares its recursion + parallelism budget. 
Discovery-\n * heavy kinds (failure-mode) get higher `maxDepth`; lens kinds\n * (poisoning) usually stay at 0 since they have a tighter brief.\n *\n * Optimizer hook: kinds may declare `goldens` — labeled examples used\n * by `AxMiPRO` / `AxBootstrapFewShot` / `AxGEPA` to fit the actor\n * description programmatically. Stored on the kind, not the registry,\n * because the right metric is kind-specific.\n */\n\nimport type { AxAIService, AxFunction } from '@ax-llm/ax'\nimport { AxJSRuntime, agent } from '@ax-llm/ax'\nimport type { TraceAnalysisStore } from '../trace-analyst/store'\nimport { TraceFileMissingError } from '../trace-analyst/store-otlp'\nimport {\n parseRawFinding,\n RAW_FINDING_SCHEMA_PROMPT,\n type RawAnalystFinding,\n} from './finding-signature'\nimport { KIND_EXPECTED_SUBJECTS, parseFindingSubject } from './finding-subject'\nimport { structureFindings } from './structure-findings'\nimport type { Analyst, AnalystContext, AnalystCost, AnalystFinding } from './types'\nimport { makeFinding } from './types'\n\n/**\n * Per-kind specification. The factory turns this into a regular\n * `Analyst<TraceAnalysisStore>` ready for `AnalystRegistry.register()`.\n */\nexport interface TraceAnalystKindSpec {\n /** Stable id. Appears in finding_id, telemetry, and registry exclusions. */\n id: string\n /** One-sentence description shown in `registry.list()`. */\n description: string\n /** Coarse classification stamped on every emitted finding (`failure-mode`, `knowledge-gap`, ...). */\n area: string\n /** Bump on any breaking change to the actor prompt or output schema. */\n version: string\n /** Actor system prompt. Must instruct the LLM to emit `findings` per the schema. */\n actorDescription: string\n /** Responder system prompt; falls back to a minimal \"format the findings\" instruction. */\n responderDescription?: string\n /** Tool functions the actor may call. Pick narrow subsets via `ANALYST_TOOL_GROUPS`. 
*/\n buildTools: (store: TraceAnalysisStore) => AxFunction[]\n /** Recursion budget. `maxDepth: 0` disables subagents. */\n recursion?: { maxDepth: number; maxParallelSubagents?: number }\n /** Actor turn cap. Default 12. */\n maxTurns?: number\n /** Runtime char cap. Default 6000. */\n maxRuntimeChars?: number\n /** Cost classification surfaced in `registry.list()` and budget enforcement. */\n cost: AnalystCost\n /** Per-finding-row hook — kinds may reject / rewrite before lifting. */\n postProcess?: (row: RawAnalystFinding, ctx: AnalystContext) => RawAnalystFinding | null\n /** Optional optimizer hook — populated when a kind wants to fit its prompt against labeled examples. */\n goldens?: TraceAnalystGolden[]\n}\n\n/**\n * One labeled example consumed by Ax optimizers (MIPRO / GEPA / Bootstrap).\n * Each input is the same `{question}` an analyst would receive; `expected`\n * is the ground-truth finding set a fitted prompt should produce on this\n * input. Metric: kind-specific (default: F1 on `finding_id` overlap).\n */\nexport interface TraceAnalystGolden {\n question: string\n expected: ReadonlyArray<Omit<RawAnalystFinding, 'confidence'>>\n}\n\nexport interface CreateTraceAnalystKindOpts {\n /** AxAIService bound at registration time. */\n ai: AxAIService\n /** Optional model override; falls back to the AI service's default. */\n model?: string\n /** Override the spec's `version` (e.g. when an optimizer has fitted a new prompt). */\n versionSuffix?: string\n /**\n * Optional two-phase recovery: when the agentic harvest is empty but the\n * actor produced a substantive free-form `report`, extract findings from that\n * prose via a tolerant chat-completions pass (`structureFindings`) — no\n * strict-emission contract, so it works on weak models. 
Omit to leave the\n * actor's harvest as-is (the report is still surfaced fail-loud either way).\n */\n recovery?: { baseUrl: string; apiKey?: string; model?: string; fetchImpl?: typeof fetch }\n}\n\n/**\n * Build an `Analyst<TraceAnalysisStore>` from a kind spec.\n *\n * Lifts the Ax pipeline once at registration time so the registry\n * gets a stateless analyst. The Ax agent is freshly constructed per\n * `analyze()` call (the agent carries chat-log + usage state we don't\n * want shared across analyst runs).\n */\nexport function createTraceAnalystKind(\n spec: TraceAnalystKindSpec,\n opts: CreateTraceAnalystKindOpts,\n): Analyst<TraceAnalysisStore> {\n const version = opts.versionSuffix ? `${spec.version}+${opts.versionSuffix}` : spec.version\n return {\n id: spec.id,\n description: spec.description,\n inputKind: 'trace-store',\n cost: spec.cost,\n version,\n async analyze(store, ctx) {\n const tools = spec.buildTools(store)\n const maxDepth = spec.recursion?.maxDepth ?? 0\n const maxParallel = spec.recursion?.maxParallelSubagents ?? 2\n const priorContext = renderPriorFindings(ctx.priorFindings)\n\n const actorDescription =\n spec.actorDescription.trim() +\n priorContext +\n '\\n\\n' +\n RAW_FINDING_SCHEMA_PROMPT +\n '\\n\\nFirst write `report`: a concise free-form prose diagnosis of what ' +\n 'the traces show — what succeeded, what was suboptimal or failed — with ' +\n 'concrete trace ids and numbers. THEN return the structured `findings` ' +\n 'array (it MAY be empty when there is nothing to report). 
Use `final(...)` ' +\n 'with the `{ report, findings }` payload when you are done.'\n\n const ax = agent<{ question: string }, { report: string; findings: unknown[] }>(\n 'question:string -> report:string, findings:json[]',\n {\n agentIdentity: {\n name: spec.id,\n description: spec.description,\n },\n contextFields: ['question'],\n runtime: new AxJSRuntime({\n permissions: [],\n blockDynamicImport: true,\n allowedModules: [],\n freezeIntrinsics: true,\n blockShadowRealm: true,\n preventGlobalThisExtensions: false,\n }),\n mode: maxDepth > 0 ? 'advanced' : 'simple',\n recursionOptions: maxDepth > 0 ? { maxDepth } : undefined,\n maxTurns: spec.maxTurns ?? 12,\n maxRuntimeChars: spec.maxRuntimeChars ?? 6000,\n maxBatchedLlmQueryConcurrency: maxParallel,\n promptLevel: 'detailed',\n // Trace analysis depends on exact prior tool results and runtime variables.\n contextPolicy: { preset: 'full', budget: 'balanced' },\n functions: { local: tools },\n actorOptions: {\n description: actorDescription,\n ...(opts.model ? { model: opts.model } : {}),\n showThoughts: false,\n thinkingTokenBudget: 'none',\n },\n responderOptions: {\n description:\n spec.responderDescription ??\n \"Pass through the actor's `report` prose verbatim, and format the `findings` array exactly as the actor produced it. Do not add, drop, or summarize entries.\",\n ...(opts.model ? { model: opts.model } : {}),\n showThoughts: false,\n },\n bubbleErrors: [TraceFileMissingError],\n },\n )\n\n ctx.log?.(`analyst.kind ${spec.id} forward`, {\n max_depth: maxDepth,\n tool_count: tools.length,\n tags: ctx.tags,\n })\n\n const result = await ax.forward(opts.ai, { question: deriveQuestion(ctx, spec) })\n\n const expectedSubjects = KIND_EXPECTED_SUBJECTS[spec.id]\n const out: AnalystFinding[] = []\n const rawRows = Array.isArray(result.findings) ? 
result.findings : []\n let rejectedWrongKind = 0\n for (const row of rawRows) {\n const parsed = parseRawFinding(row, ctx.log)\n if (!parsed) continue\n // Subject-grammar check: if the kind has a declared expects-set\n // (every shipped kind does), the finding's subject MUST parse to\n // one of the declared variants. A wrong-kind subject is a\n // contract violation — the actor's prompt drifted from the\n // grammar — and we count it for prompt-audit visibility.\n if (expectedSubjects && parsed.subject !== undefined) {\n const parsedSubject = parseFindingSubject(parsed.subject)\n if (parsedSubject === null) {\n ctx.log?.('finding rejected: subject failed to parse', {\n kind: spec.id,\n subject: parsed.subject,\n })\n rejectedWrongKind += 1\n continue\n }\n if (!expectedSubjects.includes(parsedSubject.kind)) {\n ctx.log?.('finding rejected: subject variant not allowed for this kind', {\n kind: spec.id,\n subject_kind: parsedSubject.kind,\n subject: parsed.subject,\n allowed: expectedSubjects,\n })\n rejectedWrongKind += 1\n continue\n }\n }\n const postProcessed = spec.postProcess?.(parsed, ctx) ?? parsed\n if (!postProcessed) continue\n out.push(toAnalystFinding(spec, postProcessed))\n }\n\n ctx.log?.(`analyst.kind ${spec.id} done`, {\n emitted: rawRows.length,\n accepted: out.length,\n rejected_wrong_subject: rejectedWrongKind,\n })\n\n // Two-phase recovery / fail-loud. The actor reasons free-form (the\n // `report`); a weak model often produces a sound diagnosis but fails the\n // strict findings emission (or the rows get rejected). If the harvest is\n // empty but the report is substantive, recover findings from the prose\n // via the tolerant structuring pass (opt-in), and — either way — surface\n // the report as a visible info finding so an empty harvest is never a\n // silent zero. A genuinely empty diagnosis (short/no report) stays empty.\n const report = typeof result.report === 'string' ? 
result.report : ''\n if (out.length === 0 && report.trim().length >= 200) {\n if (opts.recovery) {\n const recovered = await structureFindings({\n report,\n analystId: spec.id,\n area: spec.area,\n model: opts.recovery.model ?? opts.model ?? '',\n baseUrl: opts.recovery.baseUrl,\n apiKey: opts.recovery.apiKey,\n fetchImpl: opts.recovery.fetchImpl,\n })\n out.push(...recovered.findings)\n ctx.log?.(`analyst.kind ${spec.id} recovery`, {\n outcome: recovered.outcome,\n recovered: recovered.findings.length,\n })\n }\n if (out.length === 0) {\n out.push(\n makeFinding({\n analyst_id: spec.id,\n area: spec.area,\n claim: 'Analyst produced a diagnosis but no structured findings — see report.',\n rationale: report.slice(0, 1500),\n severity: 'info',\n confidence: 0.3,\n evidence_refs: [\n { kind: 'artifact', uri: 'report://summary', excerpt: report.slice(0, 2000) },\n ],\n metadata: { outcome: 'extraction_failed' },\n }),\n )\n }\n }\n return out\n },\n }\n}\n\nfunction deriveQuestion(ctx: AnalystContext, spec: TraceAnalystKindSpec): string {\n // The actor's user message must orient it at the task, not echo the kind id.\n // A bare id like \"failure-mode\" gives the actor nothing to act on, so it\n // spends turns inspecting the input instead of reading traces. Operators can\n // still steer with `tags.focus = \"leaf-X\"`, appended to the task directive.\n const focus = ctx.tags?.focus?.trim()\n const task = `Analyze this trace dataset with the available tools and report ${spec.area} findings. ${spec.description}`\n return focus ? 
`${task} Focus: ${focus}.` : task\n}\n\nfunction toAnalystFinding(spec: TraceAnalystKindSpec, raw: RawAnalystFinding): AnalystFinding {\n return makeFinding({\n analyst_id: spec.id,\n area: spec.area,\n subject: raw.subject,\n claim: raw.claim,\n rationale: raw.rationale,\n severity: raw.severity,\n confidence: raw.confidence,\n evidence_refs: [\n {\n kind: evidenceKindFromUri(raw.evidence_uri),\n uri: raw.evidence_uri,\n excerpt: raw.evidence_excerpt,\n },\n ],\n recommended_action: raw.recommended_action,\n metadata: { kind_version: spec.version },\n })\n}\n\nfunction evidenceKindFromUri(uri: string): 'span' | 'artifact' | 'metric' | 'event' | 'finding' {\n if (uri.startsWith('span://')) return 'span'\n if (uri.startsWith('artifact://')) return 'artifact'\n if (uri.startsWith('metric://')) return 'metric'\n if (uri.startsWith('event://')) return 'event'\n if (uri.startsWith('finding://')) return 'finding'\n return 'artifact'\n}\n\n/**\n * Render a compact prior-findings block the actor reads alongside its\n * brief. Each row is one line so the actor can scan dozens cheaply.\n * The kind's prompt instructs the actor to (a) check whether a new\n * cluster matches a prior `finding_id` (carry the id forward via\n * `id_basis` to keep diffs stable) and (b) raise severity / confidence\n * when a prior finding has reappeared without remediation.\n *\n * Returns the empty string when there are no prior findings — most\n * runs are \"first-of-its-kind\" and the prompt stays unchanged.\n *\n * Exported for tests + for consumers that build their own actor\n * prompts (e.g. specialized analysts living outside the default kinds).\n */\nexport function renderPriorFindings(prior: AnalystContext['priorFindings']): string {\n if (!prior || prior.length === 0) return ''\n const MAX_ROWS = 40 // keep the block under ~2KB; older history is summarized externally\n const rows = prior.slice(0, MAX_ROWS).map((f) => {\n const subject = f.subject ? 
` [${f.subject}]` : ''\n return ` - id=${f.finding_id} ${f.severity}${subject} ${truncateForContext(f.claim, 160)}`\n })\n const overflow =\n prior.length > MAX_ROWS\n ? `\\n ... +${prior.length - MAX_ROWS} more prior findings (older history truncated)`\n : ''\n return [\n '',\n '',\n 'PRIOR FINDINGS (from a previous run on related data):',\n 'When the work you do now matches a row below, REUSE the `finding_id` (pass it as `id_basis`) so the cross-run diff stays stable.',\n 'A finding that reappears with no remediation evidence SHOULD raise its `confidence` and may justify a higher `severity`.',\n ...rows,\n overflow,\n ]\n .filter(Boolean)\n .join('\\n')\n}\n\nfunction truncateForContext(s: string, max: number): string {\n if (s.length <= max) return s\n return `${s.slice(0, max - 1).trimEnd()}…`\n}\n","/**\n * Pre-curated tool subsets for analyst kinds.\n *\n * The full trace-analyst tool set is seven functions. Most kinds only\n * need three or four. Picking from named groups instead of importing\n * the whole bundle keeps every kind's actor-context budget tight and\n * makes \"what can this analyst see?\" obvious at registration time.\n *\n * Each function in the group keeps its full `name`/`description` from\n * `buildTraceAnalystTools` — we filter, we don't re-implement.\n */\n\nimport type { AxFunction } from '@ax-llm/ax'\nimport type { TraceAnalysisStore } from '../trace-analyst/store'\nimport { buildTraceAnalystTools } from '../trace-analyst/tools'\n\n/** Named tool sets. Kinds pass `tools: TRACE_TOOL_GROUPS.failureForensics` etc. */\nexport type TraceToolGroupName =\n /** All seven tools. Use for open-ended discovery kinds. */\n | 'all'\n /** Overview + paginated query + count. No deep reads. Cheap. */\n | 'discovery'\n /** Discovery + viewTrace + viewSpans. Deep-read but no regex search. */\n | 'discoveryAndRead'\n /** Discovery + search tools. For pattern-matching across many traces. */\n | 'discoveryAndSearch'\n /** Discovery + viewSpans + searchSpan. 
Targeted-span work after another kind narrows down. */\n | 'targeted'\n\nconst TOOL_NAMES_BY_GROUP: Record<TraceToolGroupName, ReadonlySet<string>> = {\n all: new Set(),\n discovery: new Set(['getDatasetOverview', 'queryTraces', 'countTraces']),\n discoveryAndRead: new Set([\n 'getDatasetOverview',\n 'queryTraces',\n 'countTraces',\n 'viewTrace',\n 'viewSpans',\n ]),\n discoveryAndSearch: new Set([\n 'getDatasetOverview',\n 'queryTraces',\n 'countTraces',\n 'searchTrace',\n 'searchSpan',\n ]),\n targeted: new Set(['getDatasetOverview', 'queryTraces', 'viewSpans', 'searchSpan']),\n}\n\n/**\n * Build the tool set for a named group bound to a specific trace store.\n *\n * `all` returns every tool. Other groups filter `buildTraceAnalystTools`\n * by name to the documented subset. An unrecognised group name throws —\n * silently returning all tools would defeat the cost-control point.\n */\nexport function buildTraceToolsForGroup(\n group: TraceToolGroupName,\n store: TraceAnalysisStore,\n): AxFunction[] {\n const all = buildTraceAnalystTools({ store })\n if (group === 'all') return all\n const allow = TOOL_NAMES_BY_GROUP[group]\n if (!allow) throw new Error(`unknown trace tool group: ${group}`)\n return all.filter((tool) => allow.has((tool as { name: string }).name))\n}\n","/**\n * Failure-mode analyst — classifies what went wrong and why.\n *\n * Brief: read the trace dataset, identify the top failure modes across\n * runs, classify each with severity + evidence, and surface them as\n * findings. 
The actor's job is *taxonomy + evidence*, not fix-design —\n * that's the improvement-analyst's job.\n *\n * Recursion is deep (`maxDepth: 3`) because real failure-mode\n * discovery is genuinely tree-shaped: the actor splits the dataset\n * into candidate clusters, each cluster spawns a focused investigator\n * that drills into representative traces, and a deeply-recursed\n * investigator may itself split a confounded mode into two sub-modes.\n * Each level fans out 4-way, so the analyst can investigate up to\n * ~16 leaf clusters before hitting the depth ceiling.\n */\n\nimport type { TraceAnalystKindSpec } from '../kind-factory'\nimport { buildTraceToolsForGroup } from '../tool-groups'\n\nconst ACTOR_PROMPT = `You are a failure-mode classifier for an OTLP trace dataset. Your job is to identify the **distinct ways agents failed** in this dataset, not to grade individual runs.\n\nDISCOVERY → CLUSTER → CITE protocol:\n\n1. Call \\`traces.getDatasetOverview({})\\` first. Use \\`has_errors\\`, \\`models\\`, \\`agent_names\\`, \\`tools\\`, and \\`sample_trace_ids\\` to size the failure surface.\n2. Use \\`traces.queryTraces({ filters: { has_errors: true }, limit })\\` to pull error-bearing traces. Combine with \\`traces.countTraces\\` to see what fraction of the dataset failed.\n3. For each candidate failure cluster, use \\`traces.searchTrace\\` with regex like \\`STATUS_CODE_ERROR\\`, \\`MaxTurnsExceeded\\`, \\`assertion\\`, \\`unauthorized\\`, \\`timeout\\`, \\`429\\`, \\`5\\\\d\\\\d\\`, the agent's specific error strings, or the names of its tools. Pull one or two representative traces per cluster, **not all** of them.\n4. **Cluster, do not enumerate.** Two failures with the same root cause should be ONE finding citing both traces, not two findings. The point of this analyst is to compress N runs into K modes.\n5. 
For each cluster you can defend with evidence, emit ONE finding with:\n - \\`area\\` = \"failure-mode\"\n - \\`subject\\` = a short label for the cluster (\"tool-call-loop\", \"auth-revoked-mid-run\", \"agent-asked-clarification-too-late\", ...)\n - \\`claim\\` = one sentence stating the mode\n - \\`severity\\` = \"critical\" when it blocks the run, \"high\" when the run finished degraded, \"medium\" when it slowed convergence\n - \\`evidence_uri\\` = \\`span://<trace_id>/<span_id>\\` of the most representative span\n - \\`evidence_excerpt\\` = the exact quote (e.g. error message, stuck tool call payload, contradictory turn output)\n - \\`confidence\\` = 0.85+ when multiple traces show the same shape; 0.6-0.8 for a single-trace inference; <0.5 for speculative.\n - \\`recommended_action\\` = imperative-phrased fix idea (kept short — the improvement-analyst will expand on these)\n\nIf the dataset has no failures, return an empty findings array — do NOT pad with low-confidence speculation.\n\n**Delegate aggressively.** The recursion budget is there to be used:\n- After your first \\`getDatasetOverview\\` + \\`queryTraces\\` calls, you should have 3-6 candidate failure clusters in mind. Spawn one \\`llmQuery\\` per cluster in a single batch — they investigate in parallel.\n- A sub-investigator that finds its cluster is actually two distinct modes should split again at its own level. Recursion is meant to discover sub-modes, not to do trivial drilling that the parent could do in-line.\n- Pass narrow context to each subagent: { question: 'investigate the auth-revoked-mid-run cluster', context: { trace_ids: ['abc', 'def'], suspected_root_cause: 'token refresh skipped on idle sessions' } }. Subagents need enough context to skip re-discovery but not the whole conversation.\n- Each subagent returns its findings as JSON; the parent merges them. 
Do NOT have subagents call \\`final()\\` — they return their findings list to you, and you call \\`final()\\` once at the top.\n\nOBSERVABILITY rules:\n- Each non-final turn must emit at least one \\`console.log\\` for evidence.\n- Reuse runtime variables across turns; don't recompute.\n- Call \\`final({ findings: [...] })\\` exactly once, after you've gathered evidence for every cluster you intend to report.`\n\nexport const FAILURE_MODE_KIND_SPEC: TraceAnalystKindSpec = {\n id: 'failure-mode',\n description:\n 'Clusters trace-dataset failures into distinct failure modes with cited evidence and a short recommended action.',\n area: 'failure-mode',\n version: '1.0.0',\n actorDescription: ACTOR_PROMPT,\n buildTools: (store) => buildTraceToolsForGroup('all', store),\n recursion: { maxDepth: 3, maxParallelSubagents: 4 },\n maxTurns: 24,\n cost: { kind: 'llm' },\n}\n","/**\n * Improvement analyst — actionable, recursive self-improvement findings.\n *\n * Brief: read findings from upstream analysts (failure-mode,\n * knowledge-gap, knowledge-poisoning) AND the trace dataset itself,\n * then propose **concrete edits** to the agent's runtime: prompt\n * additions, RAG documents to ingest, tool descriptions to rewrite,\n * scaffolding changes to make, memory entries to invalidate. 
Each\n * finding is one proposed edit with the locus, the diff, and the\n * expected effect.\n *\n * This is the recursive-self-improvement loop's last mile: the prior\n * kinds describe *what's wrong*; this kind describes *what to change*.\n *\n * Recursion is deep (`maxDepth: 3`) because real improvement proposals\n * are competitive: for each failure-mode there are usually 2-3 viable\n * fix directions (tighten prompt vs add tool vs adjust scaffolding),\n * and the actor should explore each with a focused subagent before\n * picking the highest-leverage one to recommend.\n */\n\nimport type { TraceAnalystKindSpec } from '../kind-factory'\nimport { buildTraceToolsForGroup } from '../tool-groups'\n\nconst ACTOR_PROMPT = `You are a recursive-self-improvement analyst. Your job is to propose **concrete, locus-named edits** the agent's runtime should adopt to fix the failure modes, knowledge gaps, and poisonings present in this dataset.\n\nUpstream analysts have already classified the problems. Your job is to convert each problem into a *change to make* and grade its expected leverage. Each finding is one proposed edit.\n\nDISCOVERY → CANDIDATE-FIXES → COMPETE → CITE protocol:\n\n1. \\`traces.getDatasetOverview({})\\` first. Note the agents, tools, and any system-prompt fingerprints (look for the prompt text echoed in early spans).\n2. For each high-severity failure pattern, generate 2-3 candidate fixes. 
Real candidate axes:\n - **System-prompt edit** — add an instruction, remove a misleading one, restructure precedence\n - **Tool description edit** — rewrite a tool's description so the agent picks it correctly / passes valid args\n - **New tool** — add a tool the agent kept emulating in code\n - **RAG ingestion** — add a document or correct a stale one\n - **Memory invalidation** — clear cached prior-run decisions that no longer apply\n - **Scaffolding** — add a precondition check, a retry policy, a turn budget, a verification step\n - **Output schema** — narrow the agent's output to forbid the failure shape\n3. **Compete candidate fixes via subagents.** For each failure cluster, spawn one \\`llmQuery\\` per candidate-fix axis you want to evaluate. Each subagent's job: simulate the fix on the cited traces and report (i) likely effect, (ii) side effects, (iii) implementation cost as small/medium/large. Pass the cluster's failing trace_ids and the candidate axis as context.\n4. After subagents return, **pick the winning candidate per cluster** based on (effect / cost) and emit ONE finding. Discard the losing candidates — the output is the recommendation, not the candidate set.\n5. **Cross-reference upstream findings.** If a finding cites a prior failure-mode or knowledge-gap finding, use \\`evidence_uri = \"finding://<prior-finding-id>\"\\` (the registry supports this kind). 
This builds the dependency graph that lets the dashboard show \"fix #X resolves failure modes A, B, C.\"\n\nFor each winning recommendation, emit ONE finding with:\n- \\`area\\` = \"improvement\"\n- \\`subject\\` = the locus to edit: \\`system-prompt:<section>\\`, \\`tool-doc:<tool-name>\\`, \\`new-tool:<proposed-name>\\`, \\`rag:<corpus>:<doc-id>\\`, \\`memory:<key>\\`, \\`scaffolding:<concern>\\`, \\`output-schema:<field>\\`\n- \\`claim\\` = one sentence stating the edit (\"Add a precondition check to refuse tool X calls without arg Y\")\n- \\`severity\\` = leverage rating: \"critical\" when fix resolves a critical failure mode; \"high\" when it resolves a high; \"medium\" when it's a quality-of-life win; \"info\" when it's a cleanup with no behavioral effect\n- \\`evidence_uri\\` = the failure-mode finding id this fix targets (\\`finding://<id>\\`) when it exists; else the most representative span\n- \\`evidence_excerpt\\` = a fragment showing the problem the fix targets\n- \\`confidence\\` = 0.85+ when the fix is mechanical and the failure mode is well-evidenced; 0.6-0.8 when the fix requires judgment; <0.5 for speculative\n- \\`rationale\\` = why this candidate beat its alternatives (2 sentences max)\n- \\`recommended_action\\` = the **literal edit**, phrased as a diff or a quoted replacement: \"Replace section X with: '...'\" or \"Add tool with description: '...'\" or \"Set retry policy to max_attempts=3 with exponential backoff\"\n\nIf no upstream failure findings exist in this run, derive your own from the trace dataset using the failure-mode protocol inline (\\`searchTrace\\` for STATUS_CODE_ERROR / MaxTurnsExceeded / etc.). But prefer to consume upstream findings when present — the kinds are designed to chain.\n\nDo NOT propose a fix you cannot defend with evidence. 
\"Tighten the prompt\" is not a finding; \"Add 'When the user asks for X, always Y' to the system prompt section \"request-classification\"\" is.\n\nOBSERVABILITY rules:\n- Each non-final turn must emit at least one \\`console.log\\` for evidence.\n- Call \\`final({ findings: [...] })\\` exactly once at the top level.`\n\nexport const IMPROVEMENT_KIND_SPEC: TraceAnalystKindSpec = {\n id: 'improvement',\n description:\n 'Converts upstream failure / gap / poisoning findings into concrete locus-named edits (prompt, tool-doc, RAG, scaffolding) with leverage grades.',\n area: 'improvement',\n version: '1.0.0',\n actorDescription: ACTOR_PROMPT,\n buildTools: (store) => buildTraceToolsForGroup('all', store),\n recursion: { maxDepth: 3, maxParallelSubagents: 4 },\n maxTurns: 30,\n maxRuntimeChars: 12000,\n cost: { kind: 'llm' },\n}\n","/**\n * Knowledge-gap analyst — what did the agent NOT know that it needed?\n *\n * Brief: find moments in the trace where the agent had to guess, ask\n * the user to fill in context, recover from a wrong assumption, or\n * loop on a retrieval. Each finding names a *missing or outdated piece\n * of knowledge* the agent's curated knowledge base should have held —\n * or a downstream lookup (web, docs, tool description) that surfaced\n * stale or outdated information.\n *\n * The primary expected store is `@tangle-network/agent-knowledge`: a\n * Karpathy-style wiki the agent maintains with raw ↔ curated pages,\n * source anchors, and claim/relation triples. A gap is anything the\n * agent had to discover at run-time that should already have lived\n * there. Secondary loci: web-search results that returned outdated\n * pages, tool descriptions that omitted critical behavior, system-\n * prompt sections that didn't cover the case.\n *\n * Distinct from failure-mode: failure-mode classifies *how* it broke;\n * knowledge-gap names the *information* whose absence (or staleness)\n * caused the break. 
One failure-mode often maps to several gaps.\n *\n * Recursion (`maxDepth: 2`) is enough to fan out one subagent per\n * candidate gap-source layer; each subagent runs a focused detection.\n */\n\nimport type { TraceAnalystKindSpec } from '../kind-factory'\nimport { buildTraceToolsForGroup } from '../tool-groups'\n\nconst ACTOR_PROMPT = `You are a knowledge-gap analyst for an OTLP trace dataset. Your job is to identify the **specific pieces of information the agent lacked, or that were stale**, that caused poor decisions.\n\nThe agent under analysis maintains a curated knowledge base via \\`@tangle-network/agent-knowledge\\` — a wiki of \\`KnowledgePage\\`s with raw source anchors, claims, and relations. The primary expected store of agent-knowable facts IS that wiki. A \"knowledge gap\" is anything the agent had to discover or guess at run-time that the wiki should have held — or an outdated/contradictory fact the agent picked up from a non-wiki source.\n\nDISCOVERY → ATTRIBUTE-TO-LAYER → CITE protocol:\n\n1. \\`traces.getDatasetOverview({})\\` first. Note which agents, tools, and models appear.\n2. Pull traces where the agent shows gap signals. 
The strongest signals are:\n - Self-correction turns (\"I assumed X but…\", \"let me re-check\", \"actually,\")\n - Clarifying-question turns where the agent asked the user something the runtime should have surfaced\n - Repeated retrieval / lookup calls for the same artifact with slightly varied queries\n - Tool errors that name a missing argument or unknown resource\n - Web-search calls returning pages dated before a known cutoff for content that changes (versioned APIs, schemas, policies)\n - Agent quoting a tool's docs / system prompt incorrectly because the actual text was insufficient\n - Fabricated identifiers that don't appear in dataset \\`sample_trace_ids\\`\n Use \\`traces.searchTrace\\` with patterns like \\`I (don.?t|do not) know\\`, \\`assumed\\`, \\`unclear\\`, \\`could you (clarify|tell me|provide)\\`, \\`not found\\`, \\`undefined\\`, \\`unknown\\`, \\`null\\`, dates older than the analysis window, or the agent's specific clarification phrases.\n3. For each gap, identify the **layer of the runtime that should have prevented it**. The locus is the value of \\`subject\\` on the finding. 
Use one of:\n - \\`agent-knowledge:wiki:<page-slug>\\` — the wiki page that should exist but doesn't, or exists but lacks the claim\n - \\`agent-knowledge:wiki:<page-slug>#<heading>\\` — wiki page exists but a specific section is missing\n - \\`agent-knowledge:claim:<topic>\\` — a specific claim/relation triple that should be in the wiki\n - \\`agent-knowledge:raw:<source-id>\\` — raw source captured but never lifted into a curated page\n - \\`agent-knowledge:stale:<page-slug>\\` — wiki page exists but contradicts ground-truth evidence in this trace (the wiki itself drifted)\n - \\`websearch:outdated:<topic>\\` — agent relied on a web result that was stale; wiki should have superseded it\n - \\`tool-doc:<tool-name>:<aspect>\\` — tool description missed a behavior aspect (return shape, failure modes, side effects)\n - \\`system-prompt:<section>\\` — system prompt should have stated the rule directly\n - \\`memory:<key>\\` — prior-run memory should have surfaced an earlier decision\n4. For each gap you can defend with evidence, emit ONE finding with:\n - \\`area\\` = \"knowledge-gap\"\n - \\`subject\\` = the locus string from the list above\n - \\`claim\\` = a sentence naming the missing or stale knowledge (\"wiki has no page on invoice line-item shape, agent had to re-derive it from raw spans\")\n - \\`severity\\` = \"high\" when the gap caused a failure or a clarifying question; \"medium\" when it caused unnecessary turns; \"low\" when it caused minor inefficiency\n - \\`evidence_uri\\` = \\`span://<trace_id>/<span_id>\\` of the moment the gap surfaced (the question, the self-correction, the retrieval miss, the stale web result)\n - \\`evidence_excerpt\\` = exact quote where the agent showed the gap\n - \\`confidence\\` = 0.85+ when the agent itself articulated the gap; 0.6-0.8 when inferred from behavior\n - \\`recommended_action\\` = phrased as a wiki edit when the locus is \\`agent-knowledge:*\\` (\"Create wiki page \\`invoice-line-items\\` with claims: ...\"), 
or as a prompt/tool-doc edit otherwise\n\n**Delegate per layer.** After your first scan, you should have candidates spread across \\`agent-knowledge:*\\`, \\`websearch:outdated\\`, \\`tool-doc:*\\`, \\`system-prompt:*\\`, and \\`memory:*\\`. Spawn one \\`llmQuery\\` per layer in parallel — each subagent runs a focused detection (e.g. the \\`agent-knowledge\\` subagent looks for both missing-pages AND stale-pages; the \\`websearch\\` subagent looks specifically for date staleness signals; the \\`tool-doc\\` subagent looks for tool-call argument errors a fuller description would have prevented). Subagents return findings; you merge and emit one \\`final({ findings })\\` at the top.\n\nDo NOT report a gap that the agent later recovered from cleanly within the same turn — that's resilience, not a gap. Cite the *non-recovery* version when both exist.\n\nOBSERVABILITY rules:\n- Each non-final turn must emit at least one \\`console.log\\` for evidence.\n- Call \\`final({ findings: [...] })\\` exactly once at the top level.`\n\nexport const KNOWLEDGE_GAP_KIND_SPEC: TraceAnalystKindSpec = {\n id: 'knowledge-gap',\n description:\n 'Identifies missing or stale pieces of knowledge — primarily against the agent-knowledge wiki — and attributes each to the runtime layer (wiki page, claim, raw source, websearch, tool-doc, system-prompt, memory) that should have held it.',\n area: 'knowledge-gap',\n version: '1.0.0',\n actorDescription: ACTOR_PROMPT,\n buildTools: (store) => buildTraceToolsForGroup('discoveryAndSearch', store),\n recursion: { maxDepth: 2, maxParallelSubagents: 4 },\n maxTurns: 18,\n cost: { kind: 'llm' },\n}\n","/**\n * Knowledge-poisoning analyst — what FALSE information misled the agent?\n *\n * Brief: find moments where the agent acted on information that was\n * *wrong* — stale memory, RAG documents that contradicted ground truth,\n * tool descriptions that lied about return shapes, system-prompt\n * instructions that no longer matched reality, prior-run 
summaries that\n * cached a wrong decision.\n *\n * Distinct from knowledge-gap: a gap is \"the agent didn't know X\"; a\n * poisoning is \"the agent confidently used X, but X was wrong.\" Gaps\n * surface as questions / self-correction; poisonings surface as\n * confident-but-wrong actions that downstream evidence contradicts.\n *\n * Recursion is moderate (`maxDepth: 2`) because each candidate\n * poisoning typically needs two sub-investigations: one to confirm\n * the agent acted on the false belief, one to confirm the belief\n * itself is actually false in ground truth.\n */\n\nimport type { TraceAnalystKindSpec } from '../kind-factory'\nimport { buildTraceToolsForGroup } from '../tool-groups'\n\nconst ACTOR_PROMPT = `You are a knowledge-poisoning analyst for an OTLP trace dataset. Your job is to identify cases where the agent **confidently used wrong information** — not where it lacked information (that's the knowledge-gap analyst).\n\nDISCOVERY → DUAL-VERIFY → CITE protocol:\n\n1. \\`traces.getDatasetOverview({})\\` first. Identify the agents, models, and tools.\n2. Pull traces where the agent's confident action was later contradicted. Strongest signals:\n - Agent stated a fact in one span; a later span surfaced contradictory evidence; the agent then proceeded anyway or fabricated reconciliation.\n - Tool call with stale arguments (an id that no longer exists, an API shape that changed).\n - Agent cited an \\`agent-knowledge\\` wiki page or claim whose content contradicts the trace's own evidence — the wiki itself drifted.\n - Web-search result the agent cited that returned an outdated page; agent treated it as canonical.\n - System-prompt instruction the agent followed that ground-truth evidence in the trace contradicts (e.g. prompt says \"use endpoint A\"; tool reply says \"endpoint A deprecated, use B\").\n - Repeated wrong-shape parsing despite the tool's actual output proving the shape.\n3. 
Use \\`traces.searchTrace\\` with regex on phrases like \\`actually\\`, \\`turns out\\`, \\`previously assumed\\`, \\`old version\\`, \\`deprecated\\`, \\`updated to\\`, \\`now uses\\`, or specific entity names you suspect have changed.\n4. For each candidate poisoning, **DUAL-VERIFY**:\n - Confirm the agent actually acted on the false belief (cite the span where it did)\n - Confirm the belief is actually false in this trace's own evidence (cite the span that contradicts it)\n Only emit a finding when both halves are nailed down. If you can only nail one, drop it — single-evidence poisoning findings are too speculative to be useful.\n\n**Delegate the dual-verify.** Use the recursion budget so each candidate poisoning gets one subagent investigating \"did the agent act?\" and one investigating \"is the belief false?\". After your first scan, fire off N parallel \\`llmQuery\\` pairs (one cluster per pair). Subagents return their findings; you accept only the ones where BOTH halves of the pair were confirmed.\n\nFor each confirmed poisoning, emit ONE finding with:\n- \\`area\\` = \"knowledge-poisoning\"\n- \\`subject\\` = the source of the false belief, one of: \\`agent-knowledge:wiki:<page-slug>\\` (wiki page contradicts current ground truth), \\`agent-knowledge:claim:<topic>\\` (a specific claim/relation went stale), \\`agent-knowledge:raw:<source-id>\\` (the raw source is outdated and the wiki inherited the drift), \\`websearch:outdated:<url-or-topic>\\`, \\`tool-doc:<tool>\\`, \\`system-prompt:<section>\\`, \\`memory:<key>\\`, \\`prior-run-summary:<topic>\\`\n- \\`claim\\` = one sentence: \"agent believed X (from source S); evidence in trace shows X is false\"\n- \\`severity\\` = \"critical\" when poisoning caused a wrong user-visible action; \"high\" when caught internally but wasted significant work; \"medium\" for inefficiency only\n- \\`evidence_uri\\` = \\`span://<trace_id>/<span_id>\\` of the action span (the moment the agent acted on the false belief)\n- 
\\`evidence_excerpt\\` = exact quote of the confident-but-wrong claim or action\n- \\`confidence\\` = 0.85+ when both halves are exact-quote backed; 0.6-0.8 when one half is inferred\n- \\`recommended_action\\` = where the source should be updated and how (\"Update wiki page \\`X\\` claim \\`Y\\` to '...'\", \"Invalidate raw source \\`Z\\` and re-curate\", \"Replace system-prompt section X with 'tool foo now returns Y'\")\n\nDo NOT report a finding if the agent caught and corrected the false belief in the same turn — that's the system working. Reserve poisoning for cases where the false belief shaped downstream action.\n\nOBSERVABILITY rules:\n- Each non-final turn must emit at least one \\`console.log\\` for evidence.\n- Call \\`final({ findings: [...] })\\` exactly once at the top level.`\n\nexport const KNOWLEDGE_POISONING_KIND_SPEC: TraceAnalystKindSpec = {\n id: 'knowledge-poisoning',\n description:\n 'Identifies confident-but-wrong actions caused by stale memory, contradicting RAG, deprecated tool docs, or outdated system-prompt instructions.',\n area: 'knowledge-poisoning',\n version: '1.0.0',\n actorDescription: ACTOR_PROMPT,\n buildTools: (store) => buildTraceToolsForGroup('all', store),\n recursion: { maxDepth: 2, maxParallelSubagents: 4 },\n maxTurns: 20,\n cost: { kind: 'llm' },\n}\n","/**\n * Default analyst kinds focused on agent failure + recursive\n * self-improvement.\n *\n * The four kinds chain: failure-mode classifies; knowledge-gap and\n * knowledge-poisoning explain *why* in two orthogonal ways; improvement\n * proposes concrete edits. 
Register all four against the same trace\n * store and the registry runs them in dependency order if the operator\n * pipes findings between them.\n */\n\nexport { FAILURE_MODE_KIND_SPEC } from './failure-mode'\nexport { IMPROVEMENT_KIND_SPEC } from './improvement'\nexport { KNOWLEDGE_GAP_KIND_SPEC } from './knowledge-gap'\nexport { KNOWLEDGE_POISONING_KIND_SPEC } from './knowledge-poisoning'\n\nimport type { TraceAnalystKindSpec } from '../kind-factory'\nimport { FAILURE_MODE_KIND_SPEC } from './failure-mode'\nimport { IMPROVEMENT_KIND_SPEC } from './improvement'\nimport { KNOWLEDGE_GAP_KIND_SPEC } from './knowledge-gap'\nimport { KNOWLEDGE_POISONING_KIND_SPEC } from './knowledge-poisoning'\n\n/**\n * The default kind suite. Order is the run order operators should\n * use: failure-mode first (no upstream deps), gap + poisoning next\n * (both depend on failures), improvement last (chains all three).\n */\nexport const DEFAULT_TRACE_ANALYST_KINDS: readonly TraceAnalystKindSpec[] = [\n FAILURE_MODE_KIND_SPEC,\n KNOWLEDGE_GAP_KIND_SPEC,\n KNOWLEDGE_POISONING_KIND_SPEC,\n IMPROVEMENT_KIND_SPEC,\n] as const\n","/**\n * AnalystRegistry — orchestrate N analysts against one run.\n *\n * Owns three responsibilities and only three:\n * 1. Registration — ids must be unique; bad registrations fail loudly\n * at register-time, not run-time.\n * 2. Routing — each analyst declares its `inputKind`; the registry\n * picks the matching field from AnalystRunInputs and skips the\n * analyst with a logged reason if it's missing.\n * 3. Isolation — one analyst's exception MUST NOT stop other analysts.\n * Failed analysts produce zero findings + a 'failed' summary row.\n *\n * Cross-cutting concerns (telemetry, error → finding conversion, cost\n * ingestion, storage rotation) live in `AnalystHooks`. Budget shaping\n * (equal split vs weighted vs custom) lives in `BudgetPolicy`. 
Both\n * have sensible defaults; consumers override only what they need.\n */\n\nimport { randomUUID } from 'node:crypto'\nimport type { ChatClient } from './chat-client'\nimport type {\n Analyst,\n AnalystContext,\n AnalystFinding,\n AnalystRunEvent,\n AnalystRunInputs,\n AnalystRunResult,\n AnalystRunSummary,\n} from './types'\n\n// ── Hook + policy surfaces ─────────────────────────────────────────\n\nexport interface AnalystHooks {\n /** Before analyze() — last chance to mutate ctx (e.g. inject tags, override budget). */\n onBeforeAnalyze?(args: {\n analyst: Analyst\n ctx: AnalystContext\n runId: string\n }): void | Promise<void>\n /** After every analyst (ok | failed | skipped). Use for telemetry, ingestion, rotation. */\n onAfterAnalyze?(args: {\n analyst: Analyst\n summary: AnalystRunSummary\n findings: AnalystFinding[]\n runId: string\n }): void | Promise<void>\n /**\n * On analyst exception. Hook MAY return findings to convert the\n * error into structured findings; the summary still reports 'failed'.\n * Return void to keep the default empty-findings behavior.\n */\n onError?(args: {\n analyst: Analyst\n error: Error\n runId: string\n }): AnalystFinding[] | undefined | Promise<AnalystFinding[] | undefined>\n /** Once after registry.run() completes. Use for final aggregation, persistence. */\n onComplete?(args: { result: AnalystRunResult }): void | Promise<void>\n}\n\nexport interface BudgetPolicy {\n /** Overall USD cap across the registry.run(). */\n totalUsd?: number\n /** Per-analyst weight for the default allocator. Missing ids get weight 1. */\n weights?: Record<string, number>\n /**\n * Custom allocator — receives the analyst, remaining/total budget, and\n * the count of analysts that will run. Returns the per-analyst budget\n * (or undefined to leave it uncapped). 
Overrides weights when set.\n */\n allocate?: (args: {\n analyst: Analyst\n totalUsd: number | undefined\n remainingUsd: number | undefined\n runningCount: number\n }) => number | undefined\n}\n\nexport interface AnalystRegistryOptions {\n /** Shared chat client passed to every LLM analyst via AnalystContext. */\n chat?: ChatClient\n /** Logger callback. Defaults to a no-op. */\n log?: (msg: string, fields?: Record<string, unknown>) => void\n /** Hooks invoked around analyze() — observability + customization seam. */\n hooks?: AnalystHooks\n /** Default budget when run() doesn't override. */\n defaultBudget?: BudgetPolicy\n}\n\nexport interface RegistryRunOpts {\n /** Restrict to a subset of registered analysts by id. */\n only?: string[]\n /** Skip these analysts even if registered. Useful for cheap iteration. */\n skip?: string[]\n /** Budget policy — totalUsd + optional weights/allocator. Falls back to options.defaultBudget. */\n budget?: BudgetPolicy\n /** Wall-clock cap. Analysts SHOULD honor `ctx.deadlineMs`. */\n timeoutMs?: number\n /** Abort signal — forwarded into every analyst's context. */\n signal?: AbortSignal\n /** Tags echoed into AnalystContext.tags — useful for tracking environment/version in findings. */\n tags?: Record<string, string>\n /**\n * Prior-run findings made available as retrieval context to every\n * analyst via `ctx.priorFindings`. The registry forwards the slice\n * whose `analyst_id` matches each registered analyst so a kind sees\n * only its own history. 
Pass `{ '*': findings }` to broadcast to\n * every analyst (useful for cross-kind chaining where the improvement\n * analyst consumes upstream failure findings).\n */\n priorFindings?: ReadonlyArray<AnalystFinding> | Record<string, ReadonlyArray<AnalystFinding>>\n}\n\nexport class AnalystRegistry {\n private readonly analysts = new Map<string, Analyst>()\n private readonly options: AnalystRegistryOptions\n\n constructor(options: AnalystRegistryOptions = {}) {\n this.options = options\n }\n\n register(analyst: Analyst): void {\n if (!analyst.id) throw new Error('AnalystRegistry.register: analyst.id is required')\n if (this.analysts.has(analyst.id)) {\n throw new Error(`AnalystRegistry.register: duplicate analyst id \"${analyst.id}\"`)\n }\n if (!analyst.version) {\n throw new Error(`AnalystRegistry.register: analyst \"${analyst.id}\" must declare a version`)\n }\n this.analysts.set(analyst.id, analyst)\n }\n\n list(): ReadonlyArray<{\n id: string\n description: string\n version: string\n cost: Analyst['cost']\n }> {\n return Array.from(this.analysts.values()).map((a) => ({\n id: a.id,\n description: a.description,\n version: a.version,\n cost: a.cost,\n }))\n }\n\n async run(\n runId: string,\n inputs: AnalystRunInputs,\n runOpts: RegistryRunOpts = {},\n ): Promise<AnalystRunResult> {\n // Thin collector over `runStream`. Both surfaces share the same\n // loop body so they cannot drift on isolation / hook order / cost.\n for await (const ev of this.runStream(runId, inputs, runOpts)) {\n if (ev.type === 'run-completed') return ev.result\n }\n throw new Error('AnalystRegistry.run: stream completed without run-completed event')\n }\n\n /**\n * Streaming counterpart to `run()`. Emits `AnalystRunEvent` values\n * in real time — `run-started`, then per-analyst `skipped` /\n * `started` / `completed`, then a terminal `run-completed` whose\n * payload is the full `AnalystRunResult`. 
UIs use this to render\n * progress; persistence consumers use `run()` and read the result.\n *\n * Hooks (`onBeforeAnalyze` / `onAfterAnalyze` / `onError` /\n * `onComplete`) fire as before — streaming is additive, not a hook\n * replacement.\n */\n async *runStream(\n runId: string,\n inputs: AnalystRunInputs,\n runOpts: RegistryRunOpts = {},\n ): AsyncGenerator<AnalystRunEvent, void, void> {\n const correlationId = `ar_${randomUUID().slice(0, 12)}`\n const log = this.options.log ?? (() => {})\n const hooks = this.options.hooks ?? {}\n const startedAt = new Date().toISOString()\n const started = Date.now()\n const deadlineMs = runOpts.timeoutMs ? started + runOpts.timeoutMs : undefined\n\n const selected = this.selectAnalysts(runOpts)\n const budget = runOpts.budget ?? this.options.defaultBudget\n\n yield {\n type: 'run-started',\n run_id: runId,\n correlation_id: correlationId,\n started_at: startedAt,\n analyst_ids: selected.map((a) => a.id),\n }\n\n const summaries: AnalystRunSummary[] = []\n const allFindings: AnalystFinding[] = []\n let totalCost = 0\n let remainingUsd = budget?.totalUsd\n\n for (const analyst of selected) {\n const t0 = Date.now()\n const input = this.routeInput(analyst, inputs)\n if (input.kind === 'missing') {\n const summary: AnalystRunSummary = {\n analyst_id: analyst.id,\n status: 'skipped',\n reason: `missing input of kind '${analyst.inputKind}'`,\n findings_count: 0,\n latency_ms: 0,\n cost_usd: 0,\n }\n summaries.push(summary)\n log(`[analyst] skip ${analyst.id} — missing input`, { runId, kind: analyst.inputKind })\n await hooks.onAfterAnalyze?.({ analyst, summary, findings: [], runId })\n yield { type: 'analyst-skipped', summary }\n continue\n }\n\n const perBudget = allocateBudget(budget, {\n analyst,\n remainingUsd,\n runningCount: selected.length,\n })\n\n const ctx: AnalystContext = {\n runId,\n correlationId,\n deadlineMs,\n budgetUsd: perBudget,\n chat: this.options.chat,\n tags: runOpts.tags,\n log: (msg, fields) => 
log(`[${analyst.id}] ${msg}`, { runId, correlationId, ...fields }),\n signal: runOpts.signal,\n priorFindings: selectPriorFindings(runOpts.priorFindings, analyst.id),\n }\n\n await hooks.onBeforeAnalyze?.({ analyst, ctx, runId })\n yield {\n type: 'analyst-started',\n analyst_id: analyst.id,\n started_at: new Date(t0).toISOString(),\n }\n\n try {\n const findings = await (analyst as Analyst<unknown>).analyze(input.value, ctx)\n const latency = Date.now() - t0\n const cost = sumFindingCost(findings)\n totalCost += cost\n if (typeof remainingUsd === 'number') remainingUsd = Math.max(0, remainingUsd - cost)\n allFindings.push(...findings)\n const summary: AnalystRunSummary = {\n analyst_id: analyst.id,\n status: 'ok',\n findings_count: findings.length,\n latency_ms: latency,\n cost_usd: cost,\n }\n summaries.push(summary)\n log(`[analyst] ok ${analyst.id}`, {\n runId,\n findings: findings.length,\n latency_ms: latency,\n cost_usd: cost,\n })\n await hooks.onAfterAnalyze?.({ analyst, summary, findings, runId })\n yield { type: 'analyst-completed', summary, findings }\n } catch (err) {\n const latency = Date.now() - t0\n const e = err instanceof Error ? err : new Error(String(err))\n // Hook gets first chance to convert the error into findings.\n const hookFindings = (await hooks.onError?.({ analyst, error: e, runId })) ?? 
[]\n if (hookFindings.length) allFindings.push(...hookFindings)\n const summary: AnalystRunSummary = {\n analyst_id: analyst.id,\n status: 'failed',\n findings_count: hookFindings.length,\n latency_ms: latency,\n cost_usd: 0,\n error: { class: e.constructor.name, message: e.message },\n }\n summaries.push(summary)\n log(`[analyst] FAIL ${analyst.id}`, {\n runId,\n error_class: e.constructor.name,\n error: e.message,\n })\n await hooks.onAfterAnalyze?.({ analyst, summary, findings: hookFindings, runId })\n yield { type: 'analyst-completed', summary, findings: hookFindings }\n // Continue — isolation invariant.\n }\n }\n\n const result: AnalystRunResult = {\n run_id: runId,\n correlation_id: correlationId,\n started_at: startedAt,\n ended_at: new Date().toISOString(),\n findings: allFindings,\n per_analyst: summaries,\n total_cost_usd: totalCost,\n }\n await hooks.onComplete?.({ result })\n yield { type: 'run-completed', result }\n }\n\n private selectAnalysts(opts: RegistryRunOpts): Analyst[] {\n let candidates = Array.from(this.analysts.values())\n if (opts.only?.length) {\n const only = new Set(opts.only)\n candidates = candidates.filter((a) => only.has(a.id))\n }\n if (opts.skip?.length) {\n const skip = new Set(opts.skip)\n candidates = candidates.filter((a) => !skip.has(a.id))\n }\n return candidates\n }\n\n private routeInput(\n analyst: Analyst,\n inputs: AnalystRunInputs,\n ): { kind: 'present'; value: unknown } | { kind: 'missing' } {\n switch (analyst.inputKind) {\n case 'trace-store':\n return inputs.traceStore\n ? { kind: 'present', value: inputs.traceStore }\n : { kind: 'missing' }\n case 'artifact-dir':\n return inputs.artifactDir\n ? { kind: 'present', value: inputs.artifactDir }\n : { kind: 'missing' }\n case 'run-record':\n return inputs.runRecord ? { kind: 'present', value: inputs.runRecord } : { kind: 'missing' }\n case 'judge-input':\n return inputs.judgeInput\n ? 
{ kind: 'present', value: inputs.judgeInput }\n : { kind: 'missing' }\n case 'custom': {\n const v = inputs.custom?.[analyst.id]\n return v !== undefined ? { kind: 'present', value: v } : { kind: 'missing' }\n }\n }\n }\n}\n\n/**\n * Default budget allocator: prefer the custom `allocate` callback if\n * provided; else weighted split when weights are set; else equal split\n * across `runningCount`. Returns undefined when no totalUsd is known.\n */\nfunction allocateBudget(\n policy: BudgetPolicy | undefined,\n args: { analyst: Analyst; remainingUsd: number | undefined; runningCount: number },\n): number | undefined {\n if (!policy) return undefined\n if (policy.allocate) {\n return policy.allocate({\n analyst: args.analyst,\n totalUsd: policy.totalUsd,\n remainingUsd: args.remainingUsd,\n runningCount: args.runningCount,\n })\n }\n if (policy.totalUsd == null) return undefined\n if (policy.weights) {\n // Weighted split: caller-supplied weights, default 1 for missing ids.\n // We can only normalize against the analysts in this run, but the\n // registry doesn't know all ids at allocator-time without passing\n // them. We approximate by treating `runningCount` as the count of\n // weight=1 analysts when the weight map omits ids. The exact split\n // is left to consumers that need precision via `allocate`.\n const w = policy.weights[args.analyst.id] ?? 1\n const totalWeight = Math.max(1, args.runningCount) // see note above\n return (policy.totalUsd * w) / totalWeight\n }\n return policy.totalUsd / Math.max(1, args.runningCount)\n}\n\n/**\n * Findings may carry their cost in `metadata.cost_usd` when the analyst\n * tracks it (the LLM-driven adapters do this — they sum chat-client\n * responses). 
Deterministic findings have no cost field.\n */\nfunction sumFindingCost(findings: AnalystFinding[]): number {\n let sum = 0\n for (const f of findings) {\n const c = f.metadata?.cost_usd\n if (typeof c === 'number' && Number.isFinite(c)) sum += c\n }\n return sum\n}\n\n/**\n * Resolve the `priorFindings` slice an analyst sees.\n *\n * - Array form → the analyst sees only findings whose `analyst_id`\n * matches its own id, so a kind never reads\n * another kind's history by accident.\n * - Record form → the analyst gets the entry keyed by its id, with\n * the `'*'` wildcard appended (in that order). Use\n * the wildcard for cross-kind chaining, e.g. when\n * `improvement` should see all upstream failure /\n * gap / poisoning findings.\n */\nfunction selectPriorFindings(\n source: RegistryRunOpts['priorFindings'],\n analystId: string,\n): ReadonlyArray<AnalystFinding> | undefined {\n if (!source) return undefined\n if (Array.isArray(source)) {\n const own = source.filter((f) => f.analyst_id === analystId)\n return own.length > 0 ? own : undefined\n }\n const record = source as Record<string, ReadonlyArray<AnalystFinding>>\n const own = record[analystId] ?? []\n const wildcard = record['*'] ?? []\n const merged = [...own, ...wildcard]\n return merged.length > 0 ? 
merged : undefined\n}\n"],"mappings":";;;;;;;;;AAiBA,SAAS,kBAAkB;AAsKpB,SAAS,iBAAiB,OAOtB;AACT,QAAM,QAAQ,KAAK,UAAU;AAAA,IAC3B,GAAG,MAAM;AAAA,IACT,GAAG,MAAM;AAAA,IACT,GAAG,MAAM,WAAW;AAAA,IACpB,GAAG,eAAe,MAAM,YAAY,MAAM,KAAK;AAAA,EACjD,CAAC;AACD,SAAO,KAAK,WAAW,QAAQ,EAAE,OAAO,KAAK,EAAE,OAAO,KAAK,EAAE,MAAM,GAAG,EAAE,CAAC;AAC3E;AAEA,SAAS,eAAe,GAAmB;AAGzC,SAAO,EACJ,YAAY,EACZ,QAAQ,QAAQ,GAAG,EACnB,QAAQ,eAAe,EAAE,EACzB,KAAK;AACV;AAMO,SAAS,YACd,MAIgB;AAChB,QAAM,EAAE,UAAU,aAAa,GAAG,KAAK,IAAI;AAC3C,SAAO;AAAA,IACL,gBAAgB;AAAA,IAChB,YAAY,iBAAiB;AAAA,MAC3B,YAAY,KAAK;AAAA,MACjB,MAAM,KAAK;AAAA,MACX,SAAS,KAAK;AAAA,MACd,OAAO,KAAK;AAAA,MACZ;AAAA,IACF,CAAC;AAAA,IACD,aAAa,gBAAe,oBAAI,KAAK,GAAE,YAAY;AAAA,IACnD,GAAG;AAAA,EACL;AACF;;;ACtMA,SAAS,SAAS;AAkCX,IAAM,wBAA2D;AAAA,EACtE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAoBO,SAAS,oBAAoB,KAAuD;AACzF,MAAI,QAAQ,QAAQ,QAAQ,OAAW,QAAO;AAC9C,QAAM,UAAU,IAAI,KAAK;AACzB,MAAI,QAAQ,WAAW,EAAG,QAAO;AAGjC,QAAM,OAAO,QAAQ;AAAA,IACnB;AAAA,EACF;AACA,MAAI;AACF,WAAO,EAAE,MAAM,kBAAkB,MAAM,KAAK,CAAC,GAAI,GAAI,KAAK,CAAC,IAAI,EAAE,SAAS,KAAK,CAAC,EAAE,IAAI,CAAC,EAAG;AAG5F,QAAM,QAAQ,QAAQ,MAAM,8BAA8B;AAC1D,MAAI,SAAS,MAAM,CAAC,EAAG,KAAK,EAAE,SAAS;AACrC,WAAO,EAAE,MAAM,mBAAmB,OAAO,MAAM,CAAC,EAAG,KAAK,EAAE;AAG5D,QAAM,OAAO,QAAQ,MAAM,4BAA4B;AACvD,MAAI,QAAQ,KAAK,CAAC,EAAG,KAAK,EAAE,SAAS;AACnC,WAAO,EAAE,MAAM,iBAAiB,UAAU,KAAK,CAAC,EAAG,KAAK,EAAE;AAG5D,QAAM,QAAQ,QAAQ,MAAM,8CAA8C;AAC1E,MAAI,MAAO,QAAO,EAAE,MAAM,mBAAmB,MAAM,MAAM,CAAC,EAAG;AAG7D,QAAM,KAAK,QAAQ,MAAM,sBAAsB;AAC/C,MAAI,MAAM,GAAG,CAAC,EAAG,KAAK,EAAE,SAAS,EAAG,QAAO,EAAE,MAAM,iBAAiB,SAAS,GAAG,CAAC,EAAG,KAAK,EAAE;AAG3F,QAAM,WAAW,QAAQ,MAAM,uCAAuC;AACtE,MAAI,YAAY,SAAS,CAAC,EAAG,KAAK,EAAE,SAAS,GAAG;AAC9C,WAAO,EAAE,MAAM,YAAY,MAAM,SAAS,CAAC,GAAI,QAAQ,SAAS,CAAC,EAAG,KAAK,EAAE;AAAA,EAC7E;AACA,QAAM,KAAK,QAAQ,MAAM,kCAAkC;AAC3D,MAAI,GAAI,QAAO,EAAE,MAAM,YAAY,MAAM,GAAG,CAAC,EAAG;AAGhD,QAAM,KAAK,QAAQ,MAAM,kCAAkC;AAC3D,MAAI,GAAI,QAAO,EAAE,MAAM,YA
AY,MAAM,GAAG,CAAC,EAAG;AAGhD,QAAM,MAAM,QAAQ,MAAM,kCAAkC;AAC5D,MAAI,OAAO,IAAI,CAAC,EAAG,KAAK,EAAE,SAAS,GAAG;AACpC,WAAO,EAAE,MAAM,OAAO,QAAQ,IAAI,CAAC,GAAI,OAAO,IAAI,CAAC,EAAG,KAAK,EAAE;AAAA,EAC/D;AAGA,QAAM,MAAM,QAAQ,MAAM,eAAe;AACzC,MAAI,OAAO,IAAI,CAAC,EAAG,KAAK,EAAE,SAAS,EAAG,QAAO,EAAE,MAAM,UAAU,KAAK,IAAI,CAAC,EAAG,KAAK,EAAE;AAGnF,QAAM,KAAK,QAAQ,MAAM,oBAAoB;AAC7C,MAAI,MAAM,GAAG,CAAC,EAAG,KAAK,EAAE,SAAS,EAAG,QAAO,EAAE,MAAM,eAAe,SAAS,GAAG,CAAC,EAAG,KAAK,EAAE;AAGzF,QAAM,KAAK,QAAQ,MAAM,sBAAsB;AAC/C,MAAI,MAAM,GAAG,CAAC,EAAG,KAAK,EAAE,SAAS,EAAG,QAAO,EAAE,MAAM,iBAAiB,OAAO,GAAG,CAAC,EAAG,KAAK,EAAE;AAGzF,QAAM,KAAK,QAAQ,MAAM,2BAA2B;AACpD,MAAI,MAAM,GAAG,CAAC,EAAG,KAAK,EAAE,SAAS,EAAG,QAAO,EAAE,MAAM,sBAAsB,OAAO,GAAG,CAAC,EAAG,KAAK,EAAE;AAG9F,QAAM,MAAM,QAAQ,MAAM,0BAA0B;AACpD,MAAI,OAAO,IAAI,CAAC,EAAG,KAAK,EAAE,SAAS,EAAG,QAAO,EAAE,MAAM,qBAAqB,OAAO,IAAI,CAAC,EAAG,KAAK,EAAE;AAKhG,MAAI,yBAAyB,KAAK,OAAO,KAAK,QAAQ,UAAU,IAAI;AAClE,WAAO,EAAE,MAAM,WAAW,OAAO,QAAQ;AAAA,EAC3C;AAEA,SAAO;AACT;AAQO,SAAS,qBAAqB,GAA2B;AAC9D,UAAQ,EAAE,MAAM;AAAA,IACd,KAAK;AACH,aAAO,EAAE,UACL,wBAAwB,EAAE,IAAI,IAAI,EAAE,OAAO,KAC3C,wBAAwB,EAAE,IAAI;AAAA,IACpC,KAAK;AACH,aAAO,yBAAyB,EAAE,KAAK;AAAA,IACzC,KAAK;AACH,aAAO,uBAAuB,EAAE,QAAQ;AAAA,IAC1C,KAAK;AACH,aAAO,yBAAyB,EAAE,IAAI;AAAA,IACxC,KAAK;AACH,aAAO,iBAAiB,EAAE,OAAO;AAAA,IACnC,KAAK;AACH,aAAO,EAAE,SAAS,YAAY,EAAE,IAAI,IAAI,EAAE,MAAM,KAAK,YAAY,EAAE,IAAI;AAAA,IACzE,KAAK;AACH,aAAO,YAAY,EAAE,IAAI;AAAA,IAC3B,KAAK;AACH,aAAO,OAAO,EAAE,MAAM,IAAI,EAAE,KAAK;AAAA,IACnC,KAAK;AACH,aAAO,UAAU,EAAE,GAAG;AAAA,IACxB,KAAK;AACH,aAAO,eAAe,EAAE,OAAO;AAAA,IACjC,KAAK;AACH,aAAO,iBAAiB,EAAE,KAAK;AAAA,IACjC,KAAK;AACH,aAAO,sBAAsB,EAAE,KAAK;AAAA,IACtC,KAAK;AACH,aAAO,qBAAqB,EAAE,KAAK;AAAA,IACrC,KAAK;AACH,aAAO,EAAE;AAAA,EACb;AACF;AAaO,IAAM,iCAAiC;AAAA,EAC5C;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF,EAAE
,KAAK,IAAI;AAYJ,IAAM,yBAA4E;AAAA,EACvF,gBAAgB,CAAC,SAAS;AAAA,EAC1B,iBAAiB;AAAA,IACf;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AAAA,EACA,uBAAuB;AAAA,IACrB;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AAAA,EACA,aAAa;AAAA,IACX;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACF;AAeO,IAAM,6BAA6B,EACvC,OAAO,EACP,OAAO,CAAC,MAAM,oBAAoB,CAAC,MAAM,MAAM;AAAA,EAC9C,SAAS;AACX,CAAC;;;ACxTI,SAAS,gBAAgB,MAAsB;AACpD,QAAM,IAAI,KAAK,KAAK;AACpB,QAAM,QAAQ;AACd,QAAM,IAAI,EAAE,MAAM,KAAK;AACvB,SAAO,IAAI,EAAE,CAAC,EAAG,KAAK,IAAI;AAC5B;AAGA,SAAS,mBAAmB,GAAmB;AAC7C,SAAO,EAAE,QAAQ,gBAAgB,IAAI;AACvC;AAMO,SAAS,WAAW,MAAuB;AAChD,QAAM,YAAY,mBAAmB,gBAAgB,IAAI,CAAC;AAC1D,MAAI;AACF,WAAO,KAAK,MAAM,SAAS;AAAA,EAC7B,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAQO,SAAS,oBAAoB,KAAyB;AAC3D,MAAI,QAAQ;AACZ,MAAI,OAAO,UAAU,UAAU;AAC7B,UAAM,SAAS,WAAW,KAAK;AAC/B,QAAI,WAAW,OAAW,QAAO,CAAC;AAClC,YAAQ;AAAA,EACV;AACA,MAAI,MAAM,QAAQ,KAAK,EAAG,QAAO;AACjC,MAAI,SAAS,OAAO,UAAU,UAAU;AAEtC,UAAM,QAAS,MAAkC;AACjD,QAAI,MAAM,QAAQ,KAAK,EAAG,QAAO;AACjC,WAAO,CAAC,KAAK;AAAA,EACf;AACA,SAAO,CAAC;AACV;;;AC1CA,SAAS,KAAAA,UAAS;AAIX,IAAM,qBAAqB,CAAC,YAAY,QAAQ,UAAU,OAAO,MAAM;AAEvE,IAAM,0BAA0BC,GACpC,OAAO;AAAA,EACN,UAAUA,GAAE,KAAK,kBAAkB;AAAA,EACnC,OAAOA,GAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,GAAI;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAYjC,SAASA,GACN,OAAO,EACP,IAAI,GAAG,EACP,OAAO,CAAC,MAAM,oBAAoB,CAAC,MAAM,MAAM;AAAA,IAC9C,SAAS;AAAA,EACX,CAAC,EACA,SAAS;AAAA,EACZ,cAAcA,GAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,GAAI;AAAA,EACxC,kBAAkBA,GAAE,OAAO,EAAE,IAAI,GAAI,EAAE,SAAS;AAAA,EAChD,YAAYA,GAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAAA,EACnC,WAAWA,GAAE,OAAO,EAAE,IAAI,GAAI,EAAE,SAAS;AAAA,EACzC,oBAAoBA,GAAE,OAAO,EAAE,IAAI,GAAI,EAAE,SAAS;AACpD,CAAC,EACA,OAAO;AASH,IAAM,4BAA4B;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAiBlC,SAAS,gBACd,KACA,KAC0B;AAC1B,QAAM,SAAS,wBAAwB,UAAU,GAAG;AACpD,MAAI,OAAO,QAAS,QAAO,OAAO;AAGlC,MAAI,OAAO
,QAAQ,UAAU;AAC3B,UAAM,UAAU,WAAW,GAAG;AAC9B,QAAI,YAAY,QAAW;AACzB,YAAM,QAAQ,wBAAwB,UAAU,OAAO;AACvD,UAAI,MAAM,QAAS,QAAO,MAAM;AAAA,IAClC;AAAA,EACF;AACA,QAAM,oCAAoC;AAAA,IACxC,QAAQ,OAAO,MAAM,OAAO,IAAI,CAAC,OAAO;AAAA,MACtC,MAAM,EAAE,KAAK,KAAK,GAAG;AAAA,MACrB,MAAM,EAAE;AAAA,MACR,SAAS,EAAE;AAAA,IACb,EAAE;AAAA,EACJ,CAAC;AACD,SAAO;AACT;;;AC3DA,IAAM,SAAS;AAAA,EACb;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF,EAAE,KAAK,GAAG;AAEV,SAAS,UAAU,KAAc,WAAmB,MAAgC;AAClF,QAAM,OAAO,oBAAoB,GAAG;AACpC,QAAM,MAAwB,CAAC;AAC/B,aAAW,OAAO,MAAM;AAKtB,UAAM,aACJ,OACA,OAAO,QAAQ,YACf,CAAC,MAAM,QAAQ,GAAG,KAClB,CAAE,IAAgC,eAC9B,EAAE,GAAI,KAAiC,cAAc,mBAAmB,IACxE;AACN,UAAM,SAAmC,gBAAgB,UAAU;AACnE,QAAI,CAAC,OAAQ;AACb,QAAI;AAAA,MACF,YAAY;AAAA,QACV,YAAY;AAAA,QACZ;AAAA,QACA,SAAS,OAAO;AAAA,QAChB,OAAO,OAAO;AAAA,QACd,WAAW,OAAO;AAAA,QAClB,UAAU,OAAO;AAAA,QACjB,YAAY,OAAO;AAAA,QACnB,eAAe;AAAA,UACb;AAAA,YACE,MAAM,OAAO,aAAa,WAAW,SAAS,IAAI,SAAS;AAAA,YAC3D,KAAK,OAAO;AAAA,YACZ,SAAS,OAAO;AAAA,UAClB;AAAA,QACF;AAAA,QACA,oBAAoB,OAAO;AAAA,MAC7B,CAAC;AAAA,IACH;AAAA,EACF;AACA,SAAO;AACT;AAEA,eAAsB,kBACpB,MACkC;AAClC,QAAM,YAAY,KAAK,aAAa;AACpC,QAAM,MAAM,EAAE,SAAS,KAAK,SAAS,QAAQ,KAAK,QAAQ,OAAO,KAAK,UAAU;AAChF,MAAI,OAAO;AAAA,EAA2B,KAAK,MAAM;AAAA;AAAA;AAEjD,WAAS,UAAU,GAAG,WAAW,WAAW,WAAW;AACrD,UAAM,MAAM,MAAM;AAAA,MAChB;AAAA,QACE,OAAO,KAAK;AAAA,QACZ,UAAU;AAAA,UACR,EAAE,MAAM,UAAU,SAAS,OAAO;AAAA,UAClC,EAAE,MAAM,QAAQ,SAAS,KAAK;AAAA,QAChC;AAAA,MACF;AAAA,MACA;AAAA,IACF;AACA,UAAM,OAAO,IAAI,QAAQ,KAAK;AAC9B,UAAM,WAAW,UAAU,MAAM,KAAK,WAAW,KAAK,IAAI;AAC1D,QAAI,SAAS,SAAS,EAAG,QAAO,EAAE,UAAU,SAAS,KAAK;AAG1D,QAAI,KAAK,OAAO,KAAK,EAAE,SAAS,IAAK,QAAO,EAAE,UAAU,CAAC,GAAG,SAAS,KAAK;AAC1E,WAAO,GAAG,IAAI;AAAA;AAAA;AAAA,EAChB;AACA,SAAO,EAAE,UAAU,CAAC,GAAG,SAAS,oBAAoB;AACtD;;;ACzFA,SAAS,aAAa,aAAa;AAkF5B,SAAS,uBACd,MACA,MAC6B;AAC7B,QAAM,UAAU,KAAK,gBAAgB,GAAG,KAAK,OAAO,IAAI,KAAK,aAAa,KAAK,KAAK;AACpF,SAAO;AAAA,IACL,IAAI,KAAK;AAAA,IACT,aAAa,KAAK;AAAA,IAClB,WAAW;AAAA,IACX,MAAM,KAAK;AAAA,IACX;AAAA,IACA,MAAM,QAAQ,OAAO,KAAK;AACxB,YAAM,QAAQ,KAAK,WAAW,KAAK;A
ACnC,YAAM,WAAW,KAAK,WAAW,YAAY;AAC7C,YAAM,cAAc,KAAK,WAAW,wBAAwB;AAC5D,YAAM,eAAe,oBAAoB,IAAI,aAAa;AAE1D,YAAM,mBACJ,KAAK,iBAAiB,KAAK,IAC3B,eACA,SACA,4BACA;AAMF,YAAM,KAAK;AAAA,QACT;AAAA,QACA;AAAA,UACE,eAAe;AAAA,YACb,MAAM,KAAK;AAAA,YACX,aAAa,KAAK;AAAA,UACpB;AAAA,UACA,eAAe,CAAC,UAAU;AAAA,UAC1B,SAAS,IAAI,YAAY;AAAA,YACvB,aAAa,CAAC;AAAA,YACd,oBAAoB;AAAA,YACpB,gBAAgB,CAAC;AAAA,YACjB,kBAAkB;AAAA,YAClB,kBAAkB;AAAA,YAClB,6BAA6B;AAAA,UAC/B,CAAC;AAAA,UACD,MAAM,WAAW,IAAI,aAAa;AAAA,UAClC,kBAAkB,WAAW,IAAI,EAAE,SAAS,IAAI;AAAA,UAChD,UAAU,KAAK,YAAY;AAAA,UAC3B,iBAAiB,KAAK,mBAAmB;AAAA,UACzC,+BAA+B;AAAA,UAC/B,aAAa;AAAA;AAAA,UAEb,eAAe,EAAE,QAAQ,QAAQ,QAAQ,WAAW;AAAA,UACpD,WAAW,EAAE,OAAO,MAAM;AAAA,UAC1B,cAAc;AAAA,YACZ,aAAa;AAAA,YACb,GAAI,KAAK,QAAQ,EAAE,OAAO,KAAK,MAAM,IAAI,CAAC;AAAA,YAC1C,cAAc;AAAA,YACd,qBAAqB;AAAA,UACvB;AAAA,UACA,kBAAkB;AAAA,YAChB,aACE,KAAK,wBACL;AAAA,YACF,GAAI,KAAK,QAAQ,EAAE,OAAO,KAAK,MAAM,IAAI,CAAC;AAAA,YAC1C,cAAc;AAAA,UAChB;AAAA,UACA,cAAc,CAAC,qBAAqB;AAAA,QACtC;AAAA,MACF;AAEA,UAAI,MAAM,gBAAgB,KAAK,EAAE,YAAY;AAAA,QAC3C,WAAW;AAAA,QACX,YAAY,MAAM;AAAA,QAClB,MAAM,IAAI;AAAA,MACZ,CAAC;AAED,YAAM,SAAS,MAAM,GAAG,QAAQ,KAAK,IAAI,EAAE,UAAU,eAAe,KAAK,IAAI,EAAE,CAAC;AAEhF,YAAM,mBAAmB,uBAAuB,KAAK,EAAE;AACvD,YAAM,MAAwB,CAAC;AAC/B,YAAM,UAAU,MAAM,QAAQ,OAAO,QAAQ,IAAI,OAAO,WAAW,CAAC;AACpE,UAAI,oBAAoB;AACxB,iBAAW,OAAO,SAAS;AACzB,cAAM,SAAS,gBAAgB,KAAK,IAAI,GAAG;AAC3C,YAAI,CAAC,OAAQ;AAMb,YAAI,oBAAoB,OAAO,YAAY,QAAW;AACpD,gBAAM,gBAAgB,oBAAoB,OAAO,OAAO;AACxD,cAAI,kBAAkB,MAAM;AAC1B,gBAAI,MAAM,6CAA6C;AAAA,cACrD,MAAM,KAAK;AAAA,cACX,SAAS,OAAO;AAAA,YAClB,CAAC;AACD,iCAAqB;AACrB;AAAA,UACF;AACA,cAAI,CAAC,iBAAiB,SAAS,cAAc,IAAI,GAAG;AAClD,gBAAI,MAAM,+DAA+D;AAAA,cACvE,MAAM,KAAK;AAAA,cACX,cAAc,cAAc;AAAA,cAC5B,SAAS,OAAO;AAAA,cAChB,SAAS;AAAA,YACX,CAAC;AACD,iCAAqB;AACrB;AAAA,UACF;AAAA,QACF;AACA,cAAM,gBAAgB,KAAK,cAAc,QAAQ,GAAG,KAAK;AACzD,YAAI,CAAC,cAAe;AACpB,YAAI,KAAK,iBAAiB,MAAM,aAAa,CAAC;AAAA,MAChD;AAEA,UAAI,MAAM,gBAAgB,KAAK,EAAE,SAAS;AAAA,QACxC,SAAS,QAAQ;AAAA,QACjB,UAAU,IAAI;AAAA,QACd,wBAAwB;AAAA,MAC1B,CAAC;AASD,
YAAM,SAAS,OAAO,OAAO,WAAW,WAAW,OAAO,SAAS;AACnE,UAAI,IAAI,WAAW,KAAK,OAAO,KAAK,EAAE,UAAU,KAAK;AACnD,YAAI,KAAK,UAAU;AACjB,gBAAM,YAAY,MAAM,kBAAkB;AAAA,YACxC;AAAA,YACA,WAAW,KAAK;AAAA,YAChB,MAAM,KAAK;AAAA,YACX,OAAO,KAAK,SAAS,SAAS,KAAK,SAAS;AAAA,YAC5C,SAAS,KAAK,SAAS;AAAA,YACvB,QAAQ,KAAK,SAAS;AAAA,YACtB,WAAW,KAAK,SAAS;AAAA,UAC3B,CAAC;AACD,cAAI,KAAK,GAAG,UAAU,QAAQ;AAC9B,cAAI,MAAM,gBAAgB,KAAK,EAAE,aAAa;AAAA,YAC5C,SAAS,UAAU;AAAA,YACnB,WAAW,UAAU,SAAS;AAAA,UAChC,CAAC;AAAA,QACH;AACA,YAAI,IAAI,WAAW,GAAG;AACpB,cAAI;AAAA,YACF,YAAY;AAAA,cACV,YAAY,KAAK;AAAA,cACjB,MAAM,KAAK;AAAA,cACX,OAAO;AAAA,cACP,WAAW,OAAO,MAAM,GAAG,IAAI;AAAA,cAC/B,UAAU;AAAA,cACV,YAAY;AAAA,cACZ,eAAe;AAAA,gBACb,EAAE,MAAM,YAAY,KAAK,oBAAoB,SAAS,OAAO,MAAM,GAAG,GAAI,EAAE;AAAA,cAC9E;AAAA,cACA,UAAU,EAAE,SAAS,oBAAoB;AAAA,YAC3C,CAAC;AAAA,UACH;AAAA,QACF;AAAA,MACF;AACA,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAEA,SAAS,eAAe,KAAqB,MAAoC;AAK/E,QAAM,QAAQ,IAAI,MAAM,OAAO,KAAK;AACpC,QAAM,OAAO,kEAAkE,KAAK,IAAI,cAAc,KAAK,WAAW;AACtH,SAAO,QAAQ,GAAG,IAAI,WAAW,KAAK,MAAM;AAC9C;AAEA,SAAS,iBAAiB,MAA4B,KAAwC;AAC5F,SAAO,YAAY;AAAA,IACjB,YAAY,KAAK;AAAA,IACjB,MAAM,KAAK;AAAA,IACX,SAAS,IAAI;AAAA,IACb,OAAO,IAAI;AAAA,IACX,WAAW,IAAI;AAAA,IACf,UAAU,IAAI;AAAA,IACd,YAAY,IAAI;AAAA,IAChB,eAAe;AAAA,MACb;AAAA,QACE,MAAM,oBAAoB,IAAI,YAAY;AAAA,QAC1C,KAAK,IAAI;AAAA,QACT,SAAS,IAAI;AAAA,MACf;AAAA,IACF;AAAA,IACA,oBAAoB,IAAI;AAAA,IACxB,UAAU,EAAE,cAAc,KAAK,QAAQ;AAAA,EACzC,CAAC;AACH;AAEA,SAAS,oBAAoB,KAAmE;AAC9F,MAAI,IAAI,WAAW,SAAS,EAAG,QAAO;AACtC,MAAI,IAAI,WAAW,aAAa,EAAG,QAAO;AAC1C,MAAI,IAAI,WAAW,WAAW,EAAG,QAAO;AACxC,MAAI,IAAI,WAAW,UAAU,EAAG,QAAO;AACvC,MAAI,IAAI,WAAW,YAAY,EAAG,QAAO;AACzC,SAAO;AACT;AAgBO,SAAS,oBAAoB,OAAgD;AAClF,MAAI,CAAC,SAAS,MAAM,WAAW,EAAG,QAAO;AACzC,QAAM,WAAW;AACjB,QAAM,OAAO,MAAM,MAAM,GAAG,QAAQ,EAAE,IAAI,CAAC,MAAM;AAC/C,UAAM,UAAU,EAAE,UAAU,KAAK,EAAE,OAAO,MAAM;AAChD,WAAO,UAAU,EAAE,UAAU,IAAI,EAAE,QAAQ,GAAG,OAAO,IAAI,mBAAmB,EAAE,OAAO,GAAG,CAAC;AAAA,EAC3F,CAAC;AACD,QAAM,WACJ,MAAM,SAAS,WACX;AAAA,SAAY,MAAM,SAAS,QAAQ,mDACnC;AACN,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,I
ACA;AAAA,IACA;AAAA,IACA,GAAG;AAAA,IACH;AAAA,EACF,EACG,OAAO,OAAO,EACd,KAAK,IAAI;AACd;AAEA,SAAS,mBAAmB,GAAW,KAAqB;AAC1D,MAAI,EAAE,UAAU,IAAK,QAAO;AAC5B,SAAO,GAAG,EAAE,MAAM,GAAG,MAAM,CAAC,EAAE,QAAQ,CAAC;AACzC;;;AC3UA,IAAM,sBAAuE;AAAA,EAC3E,KAAK,oBAAI,IAAI;AAAA,EACb,WAAW,oBAAI,IAAI,CAAC,sBAAsB,eAAe,aAAa,CAAC;AAAA,EACvE,kBAAkB,oBAAI,IAAI;AAAA,IACxB;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF,CAAC;AAAA,EACD,oBAAoB,oBAAI,IAAI;AAAA,IAC1B;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF,CAAC;AAAA,EACD,UAAU,oBAAI,IAAI,CAAC,sBAAsB,eAAe,aAAa,YAAY,CAAC;AACpF;AASO,SAAS,wBACd,OACA,OACc;AACd,QAAM,MAAM,uBAAuB,EAAE,MAAM,CAAC;AAC5C,MAAI,UAAU,MAAO,QAAO;AAC5B,QAAM,QAAQ,oBAAoB,KAAK;AACvC,MAAI,CAAC,MAAO,OAAM,IAAI,MAAM,6BAA6B,KAAK,EAAE;AAChE,SAAO,IAAI,OAAO,CAAC,SAAS,MAAM,IAAK,KAA0B,IAAI,CAAC;AACxE;;;AC7CA,IAAM,eAAe;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AA+Bd,IAAM,yBAA+C;AAAA,EAC1D,IAAI;AAAA,EACJ,aACE;AAAA,EACF,MAAM;AAAA,EACN,SAAS;AAAA,EACT,kBAAkB;AAAA,EAClB,YAAY,CAAC,UAAU,wBAAwB,OAAO,KAAK;AAAA,EAC3D,WAAW,EAAE,UAAU,GAAG,sBAAsB,EAAE;AAAA,EAClD,UAAU;AAAA,EACV,MAAM,EAAE,MAAM,MAAM;AACtB;;;ACtCA,IAAMC,gBAAe;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAsCd,IAAM,wBAA8C;AAAA,EACzD,IAAI;AAAA,EACJ,aACE;AAAA,EACF,MAAM;AAAA,EACN,SAAS;AAAA,EACT,kBAAkBA;AAAA,EAClB,YAAY,CAAC,UAAU,wBAAwB,OAAO,KAAK;AAAA,EAC3D,WAAW,EAAE,UAAU,GAAG,sBAAsB,EAAE;AAAA,EAClD,UAAU;AAAA,EACV,iBAAiB;AAAA,EACjB,MAAM,EAAE,MAAM,MAAM;AACtB;;;AC7CA,IAAMC,gBAAe;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AA4Cd,IAAM,0BAAgD;AAAA,EAC3D,IAAI;AAAA,EACJ,aACE;AAAA,EACF,MAAM;AAAA,EACN,SAAS;AAAA,EACT,kBAAkBA;AAAA,EAClB,YAAY,CAAC,UAAU,wBAAwB
,sBAAsB,KAAK;AAAA,EAC1E,WAAW,EAAE,UAAU,GAAG,sBAAsB,EAAE;AAAA,EAClD,UAAU;AAAA,EACV,MAAM,EAAE,MAAM,MAAM;AACtB;;;AC7DA,IAAMC,gBAAe;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAoCd,IAAM,gCAAsD;AAAA,EACjE,IAAI;AAAA,EACJ,aACE;AAAA,EACF,MAAM;AAAA,EACN,SAAS;AAAA,EACT,kBAAkBA;AAAA,EAClB,YAAY,CAAC,UAAU,wBAAwB,OAAO,KAAK;AAAA,EAC3D,WAAW,EAAE,UAAU,GAAG,sBAAsB,EAAE;AAAA,EAClD,UAAU;AAAA,EACV,MAAM,EAAE,MAAM,MAAM;AACtB;;;AC3CO,IAAM,8BAA+D;AAAA,EAC1E;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;;;ACdA,SAAS,kBAAkB;AA+FpB,IAAM,kBAAN,MAAsB;AAAA,EACV,WAAW,oBAAI,IAAqB;AAAA,EACpC;AAAA,EAEjB,YAAY,UAAkC,CAAC,GAAG;AAChD,SAAK,UAAU;AAAA,EACjB;AAAA,EAEA,SAAS,SAAwB;AAC/B,QAAI,CAAC,QAAQ,GAAI,OAAM,IAAI,MAAM,kDAAkD;AACnF,QAAI,KAAK,SAAS,IAAI,QAAQ,EAAE,GAAG;AACjC,YAAM,IAAI,MAAM,mDAAmD,QAAQ,EAAE,GAAG;AAAA,IAClF;AACA,QAAI,CAAC,QAAQ,SAAS;AACpB,YAAM,IAAI,MAAM,sCAAsC,QAAQ,EAAE,0BAA0B;AAAA,IAC5F;AACA,SAAK,SAAS,IAAI,QAAQ,IAAI,OAAO;AAAA,EACvC;AAAA,EAEA,OAKG;AACD,WAAO,MAAM,KAAK,KAAK,SAAS,OAAO,CAAC,EAAE,IAAI,CAAC,OAAO;AAAA,MACpD,IAAI,EAAE;AAAA,MACN,aAAa,EAAE;AAAA,MACf,SAAS,EAAE;AAAA,MACX,MAAM,EAAE;AAAA,IACV,EAAE;AAAA,EACJ;AAAA,EAEA,MAAM,IACJ,OACA,QACA,UAA2B,CAAC,GACD;AAG3B,qBAAiB,MAAM,KAAK,UAAU,OAAO,QAAQ,OAAO,GAAG;AAC7D,UAAI,GAAG,SAAS,gBAAiB,QAAO,GAAG;AAAA,IAC7C;AACA,UAAM,IAAI,MAAM,mEAAmE;AAAA,EACrF;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAaA,OAAO,UACL,OACA,QACA,UAA2B,CAAC,GACiB;AAC7C,UAAM,gBAAgB,MAAM,WAAW,EAAE,MAAM,GAAG,EAAE,CAAC;AACrD,UAAM,MAAM,KAAK,QAAQ,QAAQ,MAAM;AAAA,IAAC;AACxC,UAAM,QAAQ,KAAK,QAAQ,SAAS,CAAC;AACrC,UAAM,aAAY,oBAAI,KAAK,GAAE,YAAY;AACzC,UAAM,UAAU,KAAK,IAAI;AACzB,UAAM,aAAa,QAAQ,YAAY,UAAU,QAAQ,YAAY;AAErE,UAAM,WAAW,KAAK,eAAe,OAAO;AAC5C,UAAM,SAAS,QAAQ,UAAU,KAAK,QAAQ;AAE9C,UAAM;AAAA,MACJ,MAAM;AAAA,MACN,QAAQ;AAAA,MACR,gBAAgB;AAAA,MAChB,YAAY;AAAA,MACZ,aAAa,SAAS,IAAI,CAAC,MAAM,EAAE,EAAE;AAAA,IACvC;AAEA,UAAM,YAAiC,CAAC;AACxC,UAAM,cAAgC,CAAC;AACvC,QAAI,YAAY;AAChB,QAAI,
eAAe,QAAQ;AAE3B,eAAW,WAAW,UAAU;AAC9B,YAAM,KAAK,KAAK,IAAI;AACpB,YAAM,QAAQ,KAAK,WAAW,SAAS,MAAM;AAC7C,UAAI,MAAM,SAAS,WAAW;AAC5B,cAAM,UAA6B;AAAA,UACjC,YAAY,QAAQ;AAAA,UACpB,QAAQ;AAAA,UACR,QAAQ,0BAA0B,QAAQ,SAAS;AAAA,UACnD,gBAAgB;AAAA,UAChB,YAAY;AAAA,UACZ,UAAU;AAAA,QACZ;AACA,kBAAU,KAAK,OAAO;AACtB,YAAI,kBAAkB,QAAQ,EAAE,yBAAoB,EAAE,OAAO,MAAM,QAAQ,UAAU,CAAC;AACtF,cAAM,MAAM,iBAAiB,EAAE,SAAS,SAAS,UAAU,CAAC,GAAG,MAAM,CAAC;AACtE,cAAM,EAAE,MAAM,mBAAmB,QAAQ;AACzC;AAAA,MACF;AAEA,YAAM,YAAY,eAAe,QAAQ;AAAA,QACvC;AAAA,QACA;AAAA,QACA,cAAc,SAAS;AAAA,MACzB,CAAC;AAED,YAAM,MAAsB;AAAA,QAC1B;AAAA,QACA;AAAA,QACA;AAAA,QACA,WAAW;AAAA,QACX,MAAM,KAAK,QAAQ;AAAA,QACnB,MAAM,QAAQ;AAAA,QACd,KAAK,CAAC,KAAK,WAAW,IAAI,IAAI,QAAQ,EAAE,KAAK,GAAG,IAAI,EAAE,OAAO,eAAe,GAAG,OAAO,CAAC;AAAA,QACvF,QAAQ,QAAQ;AAAA,QAChB,eAAe,oBAAoB,QAAQ,eAAe,QAAQ,EAAE;AAAA,MACtE;AAEA,YAAM,MAAM,kBAAkB,EAAE,SAAS,KAAK,MAAM,CAAC;AACrD,YAAM;AAAA,QACJ,MAAM;AAAA,QACN,YAAY,QAAQ;AAAA,QACpB,YAAY,IAAI,KAAK,EAAE,EAAE,YAAY;AAAA,MACvC;AAEA,UAAI;AACF,cAAM,WAAW,MAAO,QAA6B,QAAQ,MAAM,OAAO,GAAG;AAC7E,cAAM,UAAU,KAAK,IAAI,IAAI;AAC7B,cAAM,OAAO,eAAe,QAAQ;AACpC,qBAAa;AACb,YAAI,OAAO,iBAAiB,SAAU,gBAAe,KAAK,IAAI,GAAG,eAAe,IAAI;AACpF,oBAAY,KAAK,GAAG,QAAQ;AAC5B,cAAM,UAA6B;AAAA,UACjC,YAAY,QAAQ;AAAA,UACpB,QAAQ;AAAA,UACR,gBAAgB,SAAS;AAAA,UACzB,YAAY;AAAA,UACZ,UAAU;AAAA,QACZ;AACA,kBAAU,KAAK,OAAO;AACtB,YAAI,gBAAgB,QAAQ,EAAE,IAAI;AAAA,UAChC;AAAA,UACA,UAAU,SAAS;AAAA,UACnB,YAAY;AAAA,UACZ,UAAU;AAAA,QACZ,CAAC;AACD,cAAM,MAAM,iBAAiB,EAAE,SAAS,SAAS,UAAU,MAAM,CAAC;AAClE,cAAM,EAAE,MAAM,qBAAqB,SAAS,SAAS;AAAA,MACvD,SAAS,KAAK;AACZ,cAAM,UAAU,KAAK,IAAI,IAAI;AAC7B,cAAM,IAAI,eAAe,QAAQ,MAAM,IAAI,MAAM,OAAO,GAAG,CAAC;AAE5D,cAAM,eAAgB,MAAM,MAAM,UAAU,EAAE,SAAS,OAAO,GAAG,MAAM,CAAC,KAAM,CAAC;AAC/E,YAAI,aAAa,OAAQ,aAAY,KAAK,GAAG,YAAY;AACzD,cAAM,UAA6B;AAAA,UACjC,YAAY,QAAQ;AAAA,UACpB,QAAQ;AAAA,UACR,gBAAgB,aAAa;AAAA,UAC7B,YAAY;AAAA,UACZ,UAAU;AAAA,UACV,OAAO,EAAE,OAAO,EAAE,YAAY,MAAM,SAAS,EAAE,QAAQ;AAAA,QACzD;AACA,kBAAU,KAAK,OAAO;AACtB,YAAI,kBAAkB,QAAQ,EAAE,IAAI;AAAA,UAClC;AAAA,UACA,aAAa,EAAE,YAAY;AA
AA,UAC3B,OAAO,EAAE;AAAA,QACX,CAAC;AACD,cAAM,MAAM,iBAAiB,EAAE,SAAS,SAAS,UAAU,cAAc,MAAM,CAAC;AAChF,cAAM,EAAE,MAAM,qBAAqB,SAAS,UAAU,aAAa;AAAA,MAErE;AAAA,IACF;AAEA,UAAM,SAA2B;AAAA,MAC/B,QAAQ;AAAA,MACR,gBAAgB;AAAA,MAChB,YAAY;AAAA,MACZ,WAAU,oBAAI,KAAK,GAAE,YAAY;AAAA,MACjC,UAAU;AAAA,MACV,aAAa;AAAA,MACb,gBAAgB;AAAA,IAClB;AACA,UAAM,MAAM,aAAa,EAAE,OAAO,CAAC;AACnC,UAAM,EAAE,MAAM,iBAAiB,OAAO;AAAA,EACxC;AAAA,EAEQ,eAAe,MAAkC;AACvD,QAAI,aAAa,MAAM,KAAK,KAAK,SAAS,OAAO,CAAC;AAClD,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,OAAO,IAAI,IAAI,KAAK,IAAI;AAC9B,mBAAa,WAAW,OAAO,CAAC,MAAM,KAAK,IAAI,EAAE,EAAE,CAAC;AAAA,IACtD;AACA,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,OAAO,IAAI,IAAI,KAAK,IAAI;AAC9B,mBAAa,WAAW,OAAO,CAAC,MAAM,CAAC,KAAK,IAAI,EAAE,EAAE,CAAC;AAAA,IACvD;AACA,WAAO;AAAA,EACT;AAAA,EAEQ,WACN,SACA,QAC2D;AAC3D,YAAQ,QAAQ,WAAW;AAAA,MACzB,KAAK;AACH,eAAO,OAAO,aACV,EAAE,MAAM,WAAW,OAAO,OAAO,WAAW,IAC5C,EAAE,MAAM,UAAU;AAAA,MACxB,KAAK;AACH,eAAO,OAAO,cACV,EAAE,MAAM,WAAW,OAAO,OAAO,YAAY,IAC7C,EAAE,MAAM,UAAU;AAAA,MACxB,KAAK;AACH,eAAO,OAAO,YAAY,EAAE,MAAM,WAAW,OAAO,OAAO,UAAU,IAAI,EAAE,MAAM,UAAU;AAAA,MAC7F,KAAK;AACH,eAAO,OAAO,aACV,EAAE,MAAM,WAAW,OAAO,OAAO,WAAW,IAC5C,EAAE,MAAM,UAAU;AAAA,MACxB,KAAK,UAAU;AACb,cAAM,IAAI,OAAO,SAAS,QAAQ,EAAE;AACpC,eAAO,MAAM,SAAY,EAAE,MAAM,WAAW,OAAO,EAAE,IAAI,EAAE,MAAM,UAAU;AAAA,MAC7E;AAAA,IACF;AAAA,EACF;AACF;AAOA,SAAS,eACP,QACA,MACoB;AACpB,MAAI,CAAC,OAAQ,QAAO;AACpB,MAAI,OAAO,UAAU;AACnB,WAAO,OAAO,SAAS;AAAA,MACrB,SAAS,KAAK;AAAA,MACd,UAAU,OAAO;AAAA,MACjB,cAAc,KAAK;AAAA,MACnB,cAAc,KAAK;AAAA,IACrB,CAAC;AAAA,EACH;AACA,MAAI,OAAO,YAAY,KAAM,QAAO;AACpC,MAAI,OAAO,SAAS;AAOlB,UAAM,IAAI,OAAO,QAAQ,KAAK,QAAQ,EAAE,KAAK;AAC7C,UAAM,cAAc,KAAK,IAAI,GAAG,KAAK,YAAY;AACjD,WAAQ,OAAO,WAAW,IAAK;AAAA,EACjC;AACA,SAAO,OAAO,WAAW,KAAK,IAAI,GAAG,KAAK,YAAY;AACxD;AAOA,SAAS,eAAe,UAAoC;AAC1D,MAAI,MAAM;AACV,aAAW,KAAK,UAAU;AACxB,UAAM,IAAI,EAAE,UAAU;AACtB,QAAI,OAAO,MAAM,YAAY,OAAO,SAAS,CAAC,EAAG,QAAO;AAAA,EAC1D;AACA,SAAO;AACT;AAcA,SAAS,oBACP,QACA,WAC2C;AAC3C,MAAI,CAAC,OAAQ,QAAO;AACpB,MAAI,MAAM,QAAQ,MAAM,GAAG;AACzB,UAAMC,OAAM,OAAO,OAAO,CAA
C,MAAM,EAAE,eAAe,SAAS;AAC3D,WAAOA,KAAI,SAAS,IAAIA,OAAM;AAAA,EAChC;AACA,QAAM,SAAS;AACf,QAAM,MAAM,OAAO,SAAS,KAAK,CAAC;AAClC,QAAM,WAAW,OAAO,GAAG,KAAK,CAAC;AACjC,QAAM,SAAS,CAAC,GAAG,KAAK,GAAG,QAAQ;AACnC,SAAO,OAAO,SAAS,IAAI,SAAS;AACtC;","names":["z","z","ACTOR_PROMPT","ACTOR_PROMPT","ACTOR_PROMPT","own"]}

package/dist/{chunk-6QZUCFKM.js → chunk-XPILG2CA.js} RENAMED Viewed

@@ -1,10 +1,10 @@
 import {
   runCanaries,
   scoreRedTeamOutput
-} from "./chunk-VMAYE3LM.js";
+} from "./chunk-JYE3WOTE.js";
 import {
   runCampaign
-} from "./chunk-6XQIEUQ2.js";
+} from "./chunk-ZPSKPT3V.js";
 import {
   detectRewardHacking
 } from "./chunk-YV7J7X5N.js";
@@ -300,6 +300,120 @@ function extractText(artifact) {
   return void 0;
 }
+// src/campaign/gates/promotion-policy.ts
+function buildEvidenceVector(ctx, objectives, opts = {}) { // Builds one evidence axis per objective; returns { axes, minN, cost }.
+  if (objectives.length === 0) { // An empty objective list is rejected with an Error.
+    throw new Error("buildEvidenceVector: at least 1 objective required");
+  }
+  const minProductiveRuns = opts.minProductiveRuns ?? 3; // Below this many pairs an axis is labelled "few_runs".
+  const confidence = opts.confidence ?? 0.95;
+  const resamples = opts.resamples ?? 2e3;
+  const seed = opts.seed ?? 1337; // Passed to pairedBootstrap together with confidence and resamples.
+  const baseline = ctx.baselineJudgeScores ?? ctx.judgeScores; // Without baseline scores the candidate scores are paired against themselves.
+  const scenarioIds = new Set(ctx.scenarios.map((s) => s.id));
+  const axes = [];
+  for (const obj of objectives) {
+    let select; // Accessor that picks the value compared on this axis.
+    if (obj.source.kind === "composite") {
+      select = (s) => s.composite;
+    } else {
+      const dim = obj.source.dimension; // Any non-composite source is read as a named dimension.
+      select = (s) => s.dimensions[dim];
+    }
+    const paired = pairHoldout(ctx.judgeScores, baseline, scenarioIds, select); // pairHoldout is defined outside this hunk; presumably pairs scores per scenario id (confirm).
+    const before = obj.direction === "maximize" ? paired.before : paired.after; // Swapped for any non-"maximize" direction,
+    const after = obj.direction === "maximize" ? paired.after : paired.before; // presumably so a positive delta always means "better" (confirm against pairedBootstrap).
+    const bootstrap = pairedBootstrap(before, after, {
+      confidence,
+      resamples,
+      statistic: "median",
+      seed
+    });
+    const n = paired.before.length; // Number of paired observations on this axis.
+    const floorTolerance = obj.floorTolerance ?? 0.05 * detectScale([...paired.before, ...paired.after]); // Default: 5% of detectScale over all paired values.
+    const gainThreshold = obj.gainThreshold ?? 0;
+    const verdict = n < minProductiveRuns ? "few_runs" : bootstrap.low < -floorTolerance ? "regressed" : bootstrap.low > gainThreshold ? "improved" : "flat"; // Checked in this order; all tests use the CI lower bound.
+    axes.push({
+      name: obj.name,
+      source: obj.source,
+      direction: obj.direction,
+      bootstrap,
+      n,
+      gainThreshold,
+      floorTolerance,
+      verdict
+    });
+  }
+  const ns = axes.map((a) => a.n).filter((n) => n > 0); // NOTE(review): axes with zero pairs are excluded from minN; confirm this is intended.
+  const minN = ns.length > 0 ? Math.min(...ns) : 0; // 0 only when every axis has zero pairs.
+  return { axes, minN, cost: { candidate: ctx.cost.candidate, baseline: ctx.cost.baseline } };
+}
+var paretoPolicy = (ev) => { // Maps an evidence vector to { decision, reasons, contributingGates, delta }.
+  const contributingGates = ev.axes.map((ax) => ({ // One gate record per objective axis.
+    name: `objective:${ax.name}`,
+    passed: ax.verdict === "improved", // "flat", "few_runs" and "regressed" are all recorded as not passed.
+    detail: {
+      direction: ax.direction,
+      source: ax.source,
+      verdict: ax.verdict,
+      n: ax.n,
+      deltaMedian: ax.bootstrap.median,
+      ciLow: ax.bootstrap.low,
+      ciHigh: ax.bootstrap.high,
+      confidence: ax.bootstrap.confidence,
+      gainThreshold: ax.gainThreshold,
+      floorTolerance: ax.floorTolerance
+    }
+  }));
+  const regressed = ev.axes.filter((a) => a.verdict === "regressed");
+  const fewRuns = ev.axes.filter((a) => a.verdict === "few_runs");
+  const improved = ev.axes.filter((a) => a.verdict === "improved");
+  let decision;
+  const reasons = [];
+  if (regressed.length > 0) { // Precedence 1: any regressed axis forces "hold".
+    decision = "hold";
+    for (const a of regressed) {
+      reasons.push(
+        `objective '${a.name}' regressed: good-direction CI.low ${a.bootstrap.low.toFixed(3)} < -${a.floorTolerance} (n=${a.n})`
+      );
+    }
+  } else if (fewRuns.length > 0) { // Precedence 2: any under-sampled axis yields "need_more_work".
+    decision = "need_more_work";
+    for (const a of fewRuns) {
+      reasons.push(
+        `objective '${a.name}' has only n=${a.n} paired runs \u2014 insufficient evidence to claim significance`
+      );
+    }
+  } else if (improved.length > 0) { // Precedence 3: at least one improved axis and none regressed or under-sampled.
+    decision = "ship";
+    reasons.push(
+      `Pareto improvement at the confidence level: ${improved.map(
+        (a) => `'${a.name}' +${a.bootstrap.median.toFixed(3)} (CI.low ${a.bootstrap.low.toFixed(3)})`
+      ).join(", ")}; no objective regressed`
+    );
+  } else { // Every axis is "flat" (or there are no axes).
+    decision = "hold";
+    reasons.push(
+      "no Pareto improvement: candidate statistically equivalent to baseline on every objective"
+    );
+  }
+  const composite = ev.axes.find((a) => a.source.kind === "composite") ?? ev.axes[0]; // Prefer the composite axis, else the first axis.
+  return { decision, reasons, contributingGates, delta: composite?.bootstrap.median }; // delta is undefined when ev.axes is empty.
+};
+function paretoSignificanceGate(options) { // Returns a gate object { name, decide } built from options.
+  if (options.objectives.length === 0) { // Rejected when the gate is created, not on the first decide() call.
+    throw new Error("paretoSignificanceGate: at least 1 objective required");
+  }
+  const policy = options.policy ?? paretoPolicy; // A caller-supplied policy replaces paretoPolicy.
+  return {
+    name: options.name ?? "paretoSignificanceGate",
+    async decide(ctx) {
+      const ev = buildEvidenceVector(ctx, options.objectives, options); // options is also passed as opts (minProductiveRuns, confidence, resamples, seed).
+      return policy(ev); // The policy result is returned as-is.
+    }
+  };
+}
 // src/campaign/presets/run-eval.ts
 async function runEval(opts) {
   return runCampaign(opts);
@@ -313,6 +427,9 @@ export {
   detectScale,
   dimensionRegressions,
   defaultProductionGate,
+  buildEvidenceVector,
+  paretoPolicy,
+  paretoSignificanceGate,
   runEval
 };
-//# sourceMappingURL=chunk-6QZUCFKM.js.map
+//# sourceMappingURL=chunk-XPILG2CA.js.map

package/dist/chunk-XPILG2CA.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"sources":["../src/campaign/drivers/evolutionary.ts","../src/campaign/gates/compose.ts","../src/campaign/gates/statistical-heldout.ts","../src/campaign/gates/default-production-gate.ts","../src/campaign/gates/promotion-policy.ts","../src/campaign/presets/run-eval.ts"],"sourcesContent":["/**\n * @experimental\n *\n * `evolutionaryDriver` — adapts a stateless `Mutator` (population mutation:\n * GEPA / AxGEPA / reflective-mutation) into an `ImprovementDriver`. This is\n * the evolutionary strategy: each generation, mutate the current best surface\n * into N candidates, measure, select. No generation memory beyond the current\n * surface; the loop body handles ranking + promotion.\n *\n * The reflective alternative is agent-runtime's `improvementDriver` with a\n * `reflectiveGenerator` / `agenticGenerator`: it reasons over the report +\n * trace findings to propose targeted edits rather than blind mutations. Both\n * conform to `ImprovementDriver`; the improvement loop is identical regardless\n * of which drives it.\n */\n\nimport type { ImprovementDriver, Mutator } from '../types'\n\nexport interface EvolutionaryDriverOptions<TFindings = unknown> {\n mutator: Mutator<TFindings>\n /** External findings fed to the mutator each generation. Default: []. */\n findings?: TFindings[]\n}\n\nexport function evolutionaryDriver<TFindings = unknown>(\n opts: EvolutionaryDriverOptions<TFindings>,\n): ImprovementDriver<TFindings> {\n return {\n kind: `evolutionary:${opts.mutator.kind}`,\n async propose({ currentSurface, findings, populationSize, signal }) {\n return opts.mutator.mutate({\n findings: findings.length > 0 ? findings : (opts.findings ?? []),\n currentSurface,\n populationSize,\n signal,\n })\n },\n }\n}\n","/**\n * @experimental\n *\n * Compose multiple `Gate` implementations — every gate must pass for the\n * composite to ship. 
Closes the alignment reviewer's \"default-only\n * heldOutGate + costGate would happily promote a reward-hacked prompt\"\n * concern by making safety gates first-class composable defaults.\n */\n\nimport type { Gate, GateContext, GateDecision, GateResult, Scenario } from '../types'\n\n/** Compose gates — all must `ship` for the composite to `ship`. First\n * non-ship verdict short-circuits the composite verdict, but ALL gates run\n * (so the result records every gate's reason — useful for diagnostics). */\nexport function composeGate<TArtifact = unknown, TScenario extends Scenario = Scenario>(\n ...gates: Array<Gate<TArtifact, TScenario>>\n): Gate<TArtifact, TScenario> {\n if (gates.length === 0) {\n throw new Error('composeGate requires at least one gate')\n }\n return {\n name: `composed(${gates.map((g) => g.name).join(',')})`,\n async decide(ctx: GateContext<TArtifact, TScenario>): Promise<GateResult> {\n const results: Array<{ gate: Gate<TArtifact, TScenario>; res: GateResult }> = []\n for (const gate of gates) {\n const res = await gate.decide(ctx)\n results.push({ gate, res })\n }\n\n // Substrate-wide verdict policy:\n // - all 'ship' → 'ship'\n // - any 'arch_ceiling' → 'arch_ceiling' (architectural ceiling beats other holds)\n // - any 'model_ceiling' → 'model_ceiling'\n // - any 'hold' → 'hold'\n // - else 'need_more_work'\n const decisions = results.map((r) => r.res.decision)\n const overall: GateDecision = decisions.every((d) => d === 'ship')\n ? 'ship'\n : decisions.includes('arch_ceiling')\n ? 'arch_ceiling'\n : decisions.includes('model_ceiling')\n ? 'model_ceiling'\n : decisions.includes('hold')\n ? 'hold'\n : 'need_more_work'\n\n const contributing = results.flatMap((r) =>\n r.res.contributingGates.length > 0\n ? 
r.res.contributingGates\n : [{ name: r.gate.name, passed: r.res.decision === 'ship', detail: r.res }],\n )\n\n const reasons = results.flatMap((r) =>\n r.res.reasons.map((reason) => `[${r.gate.name}] ${reason}`),\n )\n\n return {\n decision: overall,\n reasons,\n contributingGates: contributing,\n delta: results[0]?.res.delta,\n }\n },\n }\n}\n","/**\n * @experimental\n *\n * Statistical held-out promotion machinery — the trustworthy core the\n * point-estimate `heldout-delta` gate lacked.\n *\n * The shipped false positive it prevents: a winner re-scored against the\n * baseline on the holdout read run-to-run model NOISE (e.g. 91 vs 95) as a\n * \"+4 lift\" and shipped, because the gate compared point estimates with no\n * confidence interval. Here we pair candidate vs baseline holdout observations\n * and bootstrap a CI on the paired delta — a candidate ships only when the CI\n * lower bound clears the effect-size threshold (the gain is real at the\n * confidence level, not noise), and is blocked when a critical dimension\n * (e.g. `hallucination_free` for a legal agent) significantly regresses even if\n * the net composite rose (anti-Goodhart).\n *\n * Two traps this module is built around (both produce a NEW false positive if\n * gotten wrong):\n * 1. PAIRING GRANULARITY — pairs by FULL `cellId` (`scenario:rep`), never by\n * `scenarioId` (which averages reps away and destroys the within-pair\n * variance reduction that makes a paired bootstrap tighter than unpaired).\n * One paired observation per cell ⇒ reps multiply n.\n * 2. SCALE — a judge may emit composites/dimensions on [0,1] or 0-100. 
The\n * threshold + tolerance are interpreted in the judge's NATIVE scale; the\n * per-dimension tolerance auto-scales off the observed baseline magnitudes\n * so `-0.10` on [0,1] doesn't silently become a no-op on a 0-100 dimension.\n */\n\nimport { type PairedBootstrapResult, pairedBootstrap } from '../../statistics'\nimport type { JudgeScore } from '../types'\n\nexport interface PairedHoldout {\n /** Baseline scalar per paired cell (same order as `after`/`cellIds`). */\n before: number[]\n /** Candidate scalar per paired cell. */\n after: number[]\n /** The full cellIds (`scenario:rep`) that paired, in order. */\n cellIds: string[]\n}\n\n/**\n * Pair candidate vs baseline holdout observations by FULL cellId. `select`\n * pulls the scalar from a cell's judge reports (composite, or a named\n * dimension); a cell contributes the mean of `select` across its judges. Cells\n * whose scenario is not in `scenarioIds`, or where `select` is undefined for\n * every judge on either side, are skipped on BOTH sides so the arrays stay\n * paired. 
Throws when the two maps disagree on which holdout cells exist — a\n * load-bearing invariant: the baseline + winner holdout campaigns run the same\n * scenarios with the same seed base, so their cellIds MUST align; a mismatch\n * means a silent pairing bug, not a soft fallback.\n */\nexport function pairHoldout(\n candidate: Map<string, Record<string, JudgeScore>>,\n baseline: Map<string, Record<string, JudgeScore>>,\n scenarioIds: Set<string>,\n select: (s: JudgeScore) => number | undefined,\n): PairedHoldout {\n const cellValue = (\n byCell: Map<string, Record<string, JudgeScore>>,\n cellId: string,\n ): number | undefined => {\n const scores = byCell.get(cellId)\n if (!scores) return undefined\n const vals: number[] = []\n for (const s of Object.values(scores)) {\n const v = select(s)\n if (typeof v === 'number' && Number.isFinite(v)) vals.push(v)\n }\n if (vals.length === 0) return undefined\n return vals.reduce((a, b) => a + b, 0) / vals.length\n }\n\n const inScope = (cellId: string) => scenarioIds.has(cellId.split(':')[0] ?? '')\n const candCells = [...candidate.keys()].filter(inScope).sort()\n const baseCells = [...baseline.keys()].filter(inScope).sort()\n // Alignment invariant — the holdout campaigns share scenarios + seed, so the\n // cell sets must be identical. Differ ⇒ a real pairing bug; fail loud.\n if (candCells.length !== baseCells.length || candCells.some((c, i) => c !== baseCells[i])) {\n throw new Error(\n `pairHoldout: candidate/baseline holdout cells do not align — ` +\n `candidate=[${candCells.join(',')}] baseline=[${baseCells.join(',')}]. 
` +\n `Both holdout campaigns must run the same scenarios with the same seed base.`,\n )\n }\n\n const before: number[] = []\n const after: number[] = []\n const cellIds: string[] = []\n for (const cellId of candCells) {\n const b = cellValue(baseline, cellId)\n const a = cellValue(candidate, cellId)\n // Only pair when BOTH sides produced the scalar (a dimension absent on one\n // side would otherwise create an unpaired observation).\n if (b === undefined || a === undefined) continue\n before.push(b)\n after.push(a)\n cellIds.push(cellId)\n }\n return { before, after, cellIds }\n}\n\nexport interface HeldoutSignificance {\n paired: PairedHoldout\n bootstrap: PairedBootstrapResult\n /** n paired observations. */\n n: number\n /** True iff n >= minProductiveRuns AND the CI lower bound clears the threshold. */\n significant: boolean\n /** Set when n < minProductiveRuns — too little evidence to claim significance. */\n fewRuns: boolean\n}\n\nexport interface HeldoutSignificanceOptions {\n deltaThreshold?: number\n minProductiveRuns?: number\n confidence?: number\n resamples?: number\n /** Fixed by default for a deterministic, reproducible gate verdict. */\n seed?: number\n statistic?: 'mean' | 'median'\n}\n\n/** Significance of the held-out composite lift: ship only when the paired\n * bootstrap CI lower bound on (candidate − baseline) exceeds `deltaThreshold`\n * (default 0 ⇒ \"confidently positive\"). Below `minProductiveRuns` paired\n * observations there is not enough evidence to claim significance → not\n * significant (`fewRuns`). Interpret `deltaThreshold` in the judge's native\n * composite scale. */\nexport function heldoutSignificance(\n paired: PairedHoldout,\n opts: HeldoutSignificanceOptions = {},\n): HeldoutSignificance {\n const deltaThreshold = opts.deltaThreshold ?? 0\n const minProductiveRuns = opts.minProductiveRuns ?? 3\n const bootstrap = pairedBootstrap(paired.before, paired.after, {\n confidence: opts.confidence ?? 
0.95,\n resamples: opts.resamples ?? 2000,\n statistic: opts.statistic ?? 'median',\n seed: opts.seed ?? 1337,\n })\n const n = paired.before.length\n const fewRuns = n < minProductiveRuns\n const significant = !fewRuns && bootstrap.low > deltaThreshold\n return { paired, bootstrap, n, significant, fewRuns }\n}\n\nexport interface DimensionRegression {\n dimension: string\n bootstrap: PairedBootstrapResult\n /** True iff the CI lower bound on (candidate − baseline) is below −tolerance:\n * the candidate may have regressed this dimension by more than tolerance. */\n regressed: boolean\n tolerance: number\n n: number\n}\n\n/** Detect the native scale of a set of scores: 0-100 when any magnitude clears\n * 1.5, else [0,1]. Used to auto-scale the regression tolerance so a default\n * expressed for [0,1] is not silently a no-op on a 0-100 dimension. */\nexport function detectScale(values: number[]): 1 | 100 {\n return values.some((v) => Math.abs(v) > 1.5) ? 100 : 1\n}\n\n/** Per-critical-dimension regression guard. For each dimension, pair the\n * candidate vs baseline values by full cellId and bootstrap the paired delta;\n * a dimension is \"regressed\" when the CI lower bound < −tolerance (conservative\n * — blocks if the credible worst case exceeds tolerance, which is the right\n * posture for safety dimensions like `hallucination_free`). When `tolerance`\n * is omitted it auto-scales: 0.05 on [0,1], 5 on 0-100. 
*/\nexport function dimensionRegressions(\n candidate: Map<string, Record<string, JudgeScore>>,\n baseline: Map<string, Record<string, JudgeScore>>,\n scenarioIds: Set<string>,\n criticalDimensions: string[],\n opts: { tolerance?: number; confidence?: number; resamples?: number; seed?: number } = {},\n): DimensionRegression[] {\n const out: DimensionRegression[] = []\n for (const dim of criticalDimensions) {\n const paired = pairHoldout(candidate, baseline, scenarioIds, (s) => s.dimensions[dim])\n if (paired.before.length === 0) continue // dimension not scored on this judge\n const tolerance = opts.tolerance ?? 0.05 * detectScale([...paired.before, ...paired.after])\n const bootstrap = pairedBootstrap(paired.before, paired.after, {\n confidence: opts.confidence ?? 0.95,\n resamples: opts.resamples ?? 2000,\n statistic: 'median',\n seed: opts.seed ?? 1337,\n })\n out.push({\n dimension: dim,\n bootstrap,\n regressed: bootstrap.low < -tolerance,\n tolerance,\n n: paired.before.length,\n })\n }\n return out\n}\n","/**\n * @experimental\n *\n * `defaultProductionGate` — composes the substrate's existing safety\n * primitives (red-team / reward-hacking / canary / heldout) into a single\n * Gate.decide shape. Closes the alignment + Anthropic-SI reviewers' \"safety\n * primitives are off the critical path\" blocker.\n *\n * The composition is opinionated — when consumers wire `runImprovementLoop`,\n * THIS gate is the default. 
Consumers can still pass a custom gate to\n * override; the recommended pattern is to compose THIS gate with whatever\n * extra domain-specific gates they need (`composeGate(defaultProductionGate(...), customGate)`).\n */\n\nimport type { CanaryReport } from '../../canary'\nimport { runCanaries } from '../../canary'\nimport type { RedTeamCase } from '../../red-team'\nimport { scoreRedTeamOutput } from '../../red-team'\nimport type { RewardHackingReport } from '../../rl/reward-hacking'\nimport { detectRewardHacking } from '../../rl/reward-hacking'\nimport type { RunRecord } from '../../run-record'\nimport type { Gate, GateContext, GateResult, Scenario } from '../types'\nimport { dimensionRegressions, heldoutSignificance, pairHoldout } from './statistical-heldout'\n\nexport interface DefaultProductionGateOptions {\n /** Required: scenarios held out from training; substrate compares\n * candidate-on-holdout vs baseline-on-holdout. */\n holdoutScenarios: Scenario[]\n /** Minimum held-out lift the **paired-bootstrap CI lower bound** must clear\n * to ship — NOT a point estimate. Default 0 ⇒ \"confidently positive at the\n * confidence level\". Interpreted in the judge's native composite scale (set\n * e.g. 2 for a 0-100 rubric to require a ≥2-point significant gain). */\n deltaThreshold?: number\n /** Confidence level for the held-out + dimension bootstraps. Default 0.95. */\n confidence?: number\n /** Bootstrap resamples. Default 2000. */\n bootstrapResamples?: number\n /** Fixed bootstrap seed for a deterministic verdict. Default 1337. */\n bootstrapSeed?: number\n /** Minimum paired holdout observations (scenarios × reps) before a\n * significance claim is allowed; below it the gate HOLDS with `few_runs`\n * rather than reading a degenerate CI. Default 3. */\n minProductiveRuns?: number\n /** Critical judge dimensions that must NOT significantly regress even when\n * the net composite rises (anti-Goodhart). 
The gate HOLDS if any listed\n * dimension's paired-delta CI lower bound < −`regressionTolerance`. E.g.\n * `['hallucination_free']` for a legal agent. */\n criticalDimensions?: string[]\n /** Tolerance for the per-dimension regression guard, in the dimension's\n * native scale. When omitted it auto-scales off observed magnitudes:\n * 0.05 on [0,1], 5 on 0-100. */\n regressionTolerance?: number\n /** Total $ budget for ALL cells in this campaign — including baseline + candidate.\n * Composite verdict refuses to ship when spend exceeded budget. */\n budgetUsd?: number\n /** Red-team cases to probe candidate outputs against. When omitted the\n * substrate uses `DEFAULT_RED_TEAM_CORPUS`. Provide a domain-specific\n * battery for tighter coverage. */\n redTeamBattery?: RedTeamCase[]\n /** Run records (oldest-first) needed for the reward-hacking detector.\n * Substrate populates from prior production-loop generations. */\n recentRuns?: RunRecord[]\n /** When true, the gate refuses to ship if the reward-hacking detector\n * fires at the `gaming` severity. Default true. */\n blockOnRewardHackingGaming?: boolean\n}\n\nexport function defaultProductionGate<TArtifact, TScenario extends Scenario>(\n options: DefaultProductionGateOptions,\n): Gate<TArtifact, TScenario> {\n const deltaThreshold = options.deltaThreshold ?? 0\n const confidence = options.confidence ?? 0.95\n const resamples = options.bootstrapResamples ?? 2000\n const seed = options.bootstrapSeed ?? 1337\n const minProductiveRuns = options.minProductiveRuns ?? 3\n const blockOnGaming = options.blockOnRewardHackingGaming ?? 
true\n\n return {\n name: 'defaultProductionGate',\n async decide(ctx: GateContext<TArtifact, TScenario>): Promise<GateResult> {\n const reasons: string[] = []\n const contributing: Array<{ name: string; passed: boolean; detail: unknown }> = []\n\n // ── (1) heldout composite lift — paired-bootstrap CI, NOT a point estimate\n // The shipped false positive: the baseline re-scored against itself read\n // run-to-run model noise (91 vs 95) as a \"+4 lift\" and shipped, because a\n // point estimate carries no confidence interval. Pair candidate vs\n // baseline holdout cells by FULL cellId (never averaging reps away) and\n // ship only when the bootstrap CI lower bound clears the threshold —\n // i.e. the gain is real at the confidence level, not noise.\n const scenarioIds = new Set(options.holdoutScenarios.map((s) => s.id))\n const sig = heldoutSignificance(\n pairHoldout(\n ctx.judgeScores,\n ctx.baselineJudgeScores ?? ctx.judgeScores,\n scenarioIds,\n (s) => s.composite,\n ),\n { deltaThreshold, minProductiveRuns, confidence, resamples, seed },\n )\n const delta = sig.bootstrap.median\n const heldoutPass = sig.significant\n contributing.push({\n name: 'heldout-significance',\n passed: heldoutPass,\n detail: {\n n: sig.n,\n deltaMedian: sig.bootstrap.median,\n ciLow: sig.bootstrap.low,\n ciHigh: sig.bootstrap.high,\n confidence: sig.bootstrap.confidence,\n deltaThreshold,\n fewRuns: sig.fewRuns,\n },\n })\n if (!heldoutPass) {\n reasons.push(\n sig.fewRuns\n ? `held-out: only ${sig.n} paired runs (< ${minProductiveRuns}) — too few to claim significance`\n : `held-out CI.low ${sig.bootstrap.low.toFixed(3)} ≤ threshold ${deltaThreshold} (median ${sig.bootstrap.median.toFixed(3)}, ${(sig.bootstrap.confidence * 100).toFixed(0)}% CI [${sig.bootstrap.low.toFixed(3)}, ${sig.bootstrap.high.toFixed(3)}])`,\n )\n }\n\n // ── (1b) per-dimension regression guard (anti-Goodhart) ──────────\n // A net composite gain can hide a regression on a safety-critical\n // dimension (e.g. 
hallucination_free for a legal agent — the verified run\n // gained +25/+25 on deadline/fee while LOSING -30 on hallucination, and\n // the composite-only gate never saw it). Block ship if any guarded\n // dimension's paired-delta CI lower bound falls below −tolerance.\n const dimRegs = options.criticalDimensions?.length\n ? dimensionRegressions(\n ctx.judgeScores,\n ctx.baselineJudgeScores ?? ctx.judgeScores,\n scenarioIds,\n options.criticalDimensions,\n { tolerance: options.regressionTolerance, confidence, resamples, seed },\n )\n : []\n const regressed = dimRegs.filter((d) => d.regressed)\n const dimPass = regressed.length === 0\n contributing.push({\n name: 'dimension-regression',\n passed: dimPass,\n detail: {\n guarded: options.criticalDimensions ?? [],\n regressions: dimRegs.map((d) => ({\n dimension: d.dimension,\n ciLow: d.bootstrap.low,\n median: d.bootstrap.median,\n tolerance: d.tolerance,\n n: d.n,\n regressed: d.regressed,\n })),\n },\n })\n if (!dimPass) {\n reasons.push(\n `critical dimension(s) regressed: ${regressed.map((d) => `${d.dimension} CI.low ${d.bootstrap.low.toFixed(3)} < -${d.tolerance}`).join('; ')}`,\n )\n }\n\n // ── (2) budget gate ─────────────────────────────────────────────\n const budgetPass =\n options.budgetUsd === undefined ||\n ctx.cost.candidate + ctx.cost.baseline <= options.budgetUsd\n contributing.push({\n name: 'budget',\n passed: budgetPass,\n detail: {\n candidateUsd: ctx.cost.candidate,\n baselineUsd: ctx.cost.baseline,\n budgetUsd: options.budgetUsd,\n },\n })\n if (!budgetPass) {\n reasons.push(\n `spend ${(ctx.cost.candidate + ctx.cost.baseline).toFixed(2)} > budget ${options.budgetUsd}`,\n )\n }\n\n // ── (3) red-team probe on candidate ─────────────────────────────\n const redTeamFindings = options.redTeamBattery\n ? 
probeRedTeam(ctx.candidateArtifacts, options.redTeamBattery)\n : { passed: true, findings: [] }\n contributing.push({\n name: 'red-team',\n passed: redTeamFindings.passed,\n detail: {\n failures: redTeamFindings.findings.length,\n sample: redTeamFindings.findings.slice(0, 3),\n },\n })\n if (!redTeamFindings.passed) {\n reasons.push(`red-team probe failed (${redTeamFindings.findings.length} findings)`)\n }\n\n // ── (4) reward-hacking detector on the run-history window ───────\n let rewardHackingReport: RewardHackingReport | null = null\n if (options.recentRuns && options.recentRuns.length >= 10) {\n rewardHackingReport = detectRewardHacking({ runs: options.recentRuns })\n }\n // reward-hacking severity is numeric (0..1). \"gaming\" threshold per\n // detectRewardHacking defaults = 0.6. Block when ANY finding is at\n // gaming threshold OR the report verdict is 'gaming'.\n const gamingThreshold = 0.6\n const gamingFindings = (rewardHackingReport?.findings ?? []).filter(\n (f) => f.severity >= gamingThreshold,\n )\n const rewardHackingPass =\n !rewardHackingReport ||\n !blockOnGaming ||\n (gamingFindings.length === 0 && rewardHackingReport.verdict !== 'gaming')\n contributing.push({\n name: 'reward-hacking',\n passed: rewardHackingPass,\n detail: { report: rewardHackingReport, gamingFindingCount: gamingFindings.length },\n })\n if (!rewardHackingPass) {\n reasons.push(\n `reward-hacking detector flagged ${gamingFindings.length} gaming-severity findings (verdict=${rewardHackingReport!.verdict})`,\n )\n }\n\n // ── (5) canary check on runs ────────────────────────────────────\n let canaryReport: CanaryReport | null = null\n if (options.recentRuns && options.recentRuns.length >= 10) {\n canaryReport = runCanaries(options.recentRuns, {})\n }\n // CanarySeverity is 'info' | 'warn' | 'error' — block on 'error'.\n const errorAlerts = (canaryReport?.alerts ?? 
[]).filter((a) => a.severity === 'error')\n const canaryPass = errorAlerts.length === 0\n contributing.push({\n name: 'canary',\n passed: canaryPass,\n detail: { totalAlerts: canaryReport?.alerts.length ?? 0, errorAlerts: errorAlerts.length },\n })\n if (!canaryPass) {\n reasons.push(`canary error alerts: ${errorAlerts.length}`)\n }\n\n // ── Verdict ─────────────────────────────────────────────────────\n const allPassed = contributing.every((c) => c.passed)\n const decision = allPassed ? 'ship' : 'hold'\n\n return {\n decision,\n reasons: reasons.length > 0 ? reasons : ['all gates passed'],\n contributingGates: contributing,\n delta,\n }\n },\n }\n}\n\nfunction probeRedTeam<TArtifact>(\n artifacts: Map<string, TArtifact>,\n battery: RedTeamCase[],\n): { passed: boolean; findings: Array<{ scenarioId: string; reason: string }> } {\n const findings: Array<{ scenarioId: string; reason: string }> = []\n for (const [_cellId, artifact] of artifacts) {\n const text = extractText(artifact)\n if (text === undefined) continue\n for (const rtCase of battery) {\n const finding = scoreRedTeamOutput(text, [], rtCase)\n if (!finding.passed) {\n findings.push({ scenarioId: rtCase.id, reason: finding.reason ?? 
'red-team probe failed' })\n }\n }\n }\n return { passed: findings.length === 0, findings }\n}\n\nfunction extractText(artifact: unknown): string | undefined {\n if (typeof artifact === 'string') return artifact\n if (artifact && typeof artifact === 'object') {\n const rec = artifact as Record<string, unknown>\n if (typeof rec.text === 'string') return rec.text\n if (typeof rec.output === 'string') return rec.output\n if (typeof rec.content === 'string') return rec.content\n }\n return undefined\n}\n","/**\n * @experimental\n *\n * Promotion policy over the evidence VECTOR — the substrate's answer to \"never\n * collapse the multi-objective promotion decision into one scalar.\" A\n * `defaultProductionGate` is one opinionated composition; this module factors\n * the decision into two reusable pieces so MANY policies can compete over the\n * SAME evidence (the quant-desk pattern: one evidence bus, plural strategies):\n *\n * buildEvidenceVector(ctx, objectives, opts) -> EvidenceVector // the bus\n * PromotionPolicy = (ev: EvidenceVector) => GateResult // a strategy\n * paretoPolicy(ev) // the default strategy\n * paretoSignificanceGate(options): Gate // bus + policy as a Gate\n *\n * The Pareto policy is SYMMETRIC multi-objective: every objective is BOTH a\n * potential gain source AND a safety floor (unlike `defaultProductionGate`,\n * where only `composite` can win and `criticalDimensions` are pure floors). A\n * candidate ships iff it weakly DOMINATES the baseline at the confidence level —\n * no objective credibly worse (CI floor breach) AND at least one objective\n * credibly better (CI gain). Insufficient evidence on ANY axis -> need_more_work\n * (NOT folded into hold: \"gather more reps\" and \"reject\" are different actions).\n *\n * Cost/latency are NOT CI axes here — `GateContext` carries only an aggregate\n * per-side cost, no per-cell observation vector to bootstrap. 
Treat them as hard\n * constraints (compose with a budget gate via `composeGate`), not faked CIs.\n */\n\nimport type { Direction } from '../../pareto'\nimport { type PairedBootstrapResult, pairedBootstrap } from '../../statistics'\nimport type { Gate, GateContext, GateDecision, GateResult, JudgeScore, Scenario } from '../types'\nimport { detectScale, pairHoldout } from './statistical-heldout'\n\n/** Where an objective's per-cell scalar comes from. `composite` reads the\n * judge's composite; `dimension` reads a named per-dimension score. */\nexport type ObjectiveSource = { kind: 'composite' } | { kind: 'dimension'; dimension: string }\n\nexport interface PromotionObjective {\n /** Stable label used in reports + `contributingGates`. */\n name: string\n source: ObjectiveSource\n /** 'maximize' (quality dims) or 'minimize' (error/risk/length dims). Orients\n * the paired delta so a positive bootstrap always means \"candidate better\". */\n direction: Direction\n /** The good-direction paired-delta CI lower bound must EXCEED this to count\n * as a significant gain on this axis. Interpreted in the judge's native\n * scale. Default 0 (⇒ \"confidently better\"). */\n gainThreshold?: number\n /** A floor breach (regression) is declared when the good-direction CI lower\n * bound is below −floorTolerance. When omitted it auto-scales off observed\n * magnitudes (0.05 on [0,1], 5 on 0-100), matching `dimensionRegressions`. */\n floorTolerance?: number\n}\n\n/** Per-axis verdict from the good-direction paired bootstrap. */\nexport type AxisVerdict = 'improved' | 'regressed' | 'flat' | 'few_runs'\n\nexport interface AxisEvidence {\n name: string\n source: ObjectiveSource\n direction: Direction\n /** Paired bootstrap on the GOOD-DIRECTION delta (oriented by `direction`):\n * a positive value means the candidate is better on this axis. */\n bootstrap: PairedBootstrapResult\n /** Paired observations contributing to this axis. 
*/\n n: number\n gainThreshold: number\n floorTolerance: number\n verdict: AxisVerdict\n}\n\nexport interface EvidenceVector {\n /** One entry per objective — NOTHING averaged across axes. */\n axes: AxisEvidence[]\n /** Smallest paired n across axes that produced observations — the binding\n * evidence-sufficiency constraint. 0 when no axis produced observations. */\n minN: number\n /** Aggregate per-side cost from the gate context (a constraint input, not a\n * CI axis — see the module header). */\n cost: { candidate: number; baseline: number }\n}\n\n/** A promotion strategy: a pure function from the evidence vector to a verdict.\n * Many policies can run over the same `EvidenceVector` and disagree — that's\n * the point (competing strategies, shared evidence). */\nexport type PromotionPolicy = (ev: EvidenceVector) => GateResult\n\nexport interface BuildEvidenceVectorOptions {\n /** Minimum paired observations before an axis can claim significance; below\n * it the axis is `few_runs`. Default 3. */\n minProductiveRuns?: number\n /** Confidence level for every axis bootstrap. Default 0.95. */\n confidence?: number\n /** Bootstrap resamples. Default 2000. */\n resamples?: number\n /** Fixed bootstrap seed for a deterministic, reproducible verdict. Default 1337. */\n seed?: number\n}\n\n/**\n * The Evidence Bus. For each objective, pair candidate vs baseline by full\n * cellId and bootstrap a CI on the good-direction paired delta. 
Reuses the\n * exact `pairHoldout` + `pairedBootstrap` machinery the held-out gate uses, so\n * a single source of truth governs pairing granularity + scale handling.\n */\nexport function buildEvidenceVector<TArtifact, TScenario extends Scenario>(\n ctx: GateContext<TArtifact, TScenario>,\n objectives: PromotionObjective[],\n opts: BuildEvidenceVectorOptions = {},\n): EvidenceVector {\n if (objectives.length === 0) {\n throw new Error('buildEvidenceVector: at least 1 objective required')\n }\n const minProductiveRuns = opts.minProductiveRuns ?? 3\n const confidence = opts.confidence ?? 0.95\n const resamples = opts.resamples ?? 2000\n const seed = opts.seed ?? 1337\n const baseline = ctx.baselineJudgeScores ?? ctx.judgeScores\n const scenarioIds = new Set(ctx.scenarios.map((s) => s.id))\n\n const axes: AxisEvidence[] = []\n for (const obj of objectives) {\n let select: (s: JudgeScore) => number | undefined\n if (obj.source.kind === 'composite') {\n select = (s) => s.composite\n } else {\n const dim = obj.source.dimension\n select = (s) => s.dimensions[dim]\n }\n const paired = pairHoldout(ctx.judgeScores, baseline, scenarioIds, select)\n // Orient to the good direction: maximize ⇒ bootstrap (candidate − baseline);\n // minimize ⇒ bootstrap (baseline − candidate) by swapping args, so a\n // positive bootstrap always reads as \"candidate better on this axis\".\n const before = obj.direction === 'maximize' ? paired.before : paired.after\n const after = obj.direction === 'maximize' ? paired.after : paired.before\n const bootstrap = pairedBootstrap(before, after, {\n confidence,\n resamples,\n statistic: 'median',\n seed,\n })\n const n = paired.before.length\n const floorTolerance =\n obj.floorTolerance ?? 0.05 * detectScale([...paired.before, ...paired.after])\n const gainThreshold = obj.gainThreshold ?? 0\n // Floor check precedes the gain check: a credible regression must never be\n // masked as \"improved\". 
With the defaults (gainThreshold 0, positive floor)\n // the regions are disjoint and order is moot, but a consumer who sets a\n // negative gainThreshold (\"accept small dips\") could otherwise have a real\n // floor breach classified as a gain — anti-Goodhart wins the tie.\n const verdict: AxisVerdict =\n n < minProductiveRuns\n ? 'few_runs'\n : bootstrap.low < -floorTolerance\n ? 'regressed'\n : bootstrap.low > gainThreshold\n ? 'improved'\n : 'flat'\n axes.push({\n name: obj.name,\n source: obj.source,\n direction: obj.direction,\n bootstrap,\n n,\n gainThreshold,\n floorTolerance,\n verdict,\n })\n }\n const ns = axes.map((a) => a.n).filter((n) => n > 0)\n const minN = ns.length > 0 ? Math.min(...ns) : 0\n return { axes, minN, cost: { candidate: ctx.cost.candidate, baseline: ctx.cost.baseline } }\n}\n\n/**\n * The default strategy: symmetric multi-objective Pareto significance. Ship iff\n * the candidate weakly dominates the baseline at the confidence level — no axis\n * credibly worse AND ≥1 axis credibly better. Floor breach on any axis → hold\n * (anti-Goodhart, dominates everything). Insufficient evidence on any axis →\n * need_more_work. 
Statistically equivalent → hold (never ship noise).\n */\nexport const paretoPolicy: PromotionPolicy = (ev) => {\n const contributingGates = ev.axes.map((ax) => ({\n name: `objective:${ax.name}`,\n passed: ax.verdict === 'improved',\n detail: {\n direction: ax.direction,\n source: ax.source,\n verdict: ax.verdict,\n n: ax.n,\n deltaMedian: ax.bootstrap.median,\n ciLow: ax.bootstrap.low,\n ciHigh: ax.bootstrap.high,\n confidence: ax.bootstrap.confidence,\n gainThreshold: ax.gainThreshold,\n floorTolerance: ax.floorTolerance,\n },\n }))\n\n const regressed = ev.axes.filter((a) => a.verdict === 'regressed')\n const fewRuns = ev.axes.filter((a) => a.verdict === 'few_runs')\n const improved = ev.axes.filter((a) => a.verdict === 'improved')\n\n let decision: GateDecision\n const reasons: string[] = []\n if (regressed.length > 0) {\n // Floor breach dominates: a credible regression on ANY axis blocks ship even\n // if another axis improved. This makes the +gain/−safety false positive\n // structurally impossible whenever the safety dim is an objective.\n decision = 'hold'\n for (const a of regressed) {\n reasons.push(\n `objective '${a.name}' regressed: good-direction CI.low ${a.bootstrap.low.toFixed(3)} < -${a.floorTolerance} (n=${a.n})`,\n )\n }\n } else if (fewRuns.length > 0) {\n // No credible regression on the scored axes, but ≥1 axis lacks the evidence\n // to claim a gain ⇒ gather more reps, do NOT reject.\n decision = 'need_more_work'\n for (const a of fewRuns) {\n reasons.push(\n `objective '${a.name}' has only n=${a.n} paired runs — insufficient evidence to claim significance`,\n )\n }\n } else if (improved.length > 0) {\n // Weakly dominates (no axis worse) AND strictly better on ≥1 axis ⇒ a Pareto\n // improvement at the confidence level.\n decision = 'ship'\n reasons.push(\n `Pareto improvement at the confidence level: ${improved\n .map(\n (a) =>\n `'${a.name}' +${a.bootstrap.median.toFixed(3)} (CI.low ${a.bootstrap.low.toFixed(3)})`,\n )\n .join(', ')}; no 
objective regressed`,\n )\n } else {\n // Enough evidence, nothing credibly better or worse ⇒ statistically\n // equivalent. Do NOT ship a no-op.\n decision = 'hold'\n reasons.push(\n 'no Pareto improvement: candidate statistically equivalent to baseline on every objective',\n )\n }\n\n // `delta` surfaces the composite axis if present, else the first axis — a\n // single convenience scalar; the vector lives in `contributingGates`.\n const composite = ev.axes.find((a) => a.source.kind === 'composite') ?? ev.axes[0]\n return { decision, reasons, contributingGates, delta: composite?.bootstrap.median }\n}\n\nexport interface ParetoSignificanceGateOptions extends BuildEvidenceVectorOptions {\n /** The objective vector. Every axis is both a gain source and a safety floor. */\n objectives: PromotionObjective[]\n /** Strategy applied to the evidence vector. Default `paretoPolicy`. Override\n * to run a stricter/looser strategy over the SAME bus (competing policies). */\n policy?: PromotionPolicy\n /** Override the gate name in reports. */\n name?: string\n}\n\n/**\n * Wrap the bus + a policy as a `Gate`. Plugs into the existing\n * `runImprovementLoop({ gate })` slot and composes via `composeGate`; default\n * loop behavior is unchanged because consumers opt in by passing this gate.\n */\nexport function paretoSignificanceGate<TArtifact = unknown, TScenario extends Scenario = Scenario>(\n options: ParetoSignificanceGateOptions,\n): Gate<TArtifact, TScenario> {\n if (options.objectives.length === 0) {\n throw new Error('paretoSignificanceGate: at least 1 objective required')\n }\n const policy = options.policy ?? paretoPolicy\n return {\n name: options.name ?? 'paretoSignificanceGate',\n async decide(ctx: GateContext<TArtifact, TScenario>): Promise<GateResult> {\n const ev = buildEvidenceVector(ctx, options.objectives, options)\n return policy(ev)\n },\n }\n}\n","/**\n * @experimental\n *\n * `runEval` — the simplest preset over `runCampaign`. 
No optimizer, no\n * gate, no auto-PR. Just: run scenarios through dispatch, score with\n * judges, return CampaignResult.\n *\n * The 80% case for consumers who want a scorecard, not an improvement loop.\n */\n\nimport { type RunCampaignOptions, runCampaign } from '../run-campaign'\nimport type { CampaignResult, Scenario } from '../types'\n\nexport interface RunEvalOptions<TScenario extends Scenario, TArtifact>\n extends Omit<RunCampaignOptions<TScenario, TArtifact>, 'runDir'> {\n runDir: string\n}\n\nexport async function runEval<TScenario extends Scenario, TArtifact>(\n opts: RunEvalOptions<TScenario, TArtifact>,\n): Promise<CampaignResult<TArtifact, TScenario>> {\n return runCampaign(opts)\n}\n"],"mappings":";;;;;;;;;;;;;;;AAwBO,SAAS,mBACd,MAC8B;AAC9B,SAAO;AAAA,IACL,MAAM,gBAAgB,KAAK,QAAQ,IAAI;AAAA,IACvC,MAAM,QAAQ,EAAE,gBAAgB,UAAU,gBAAgB,OAAO,GAAG;AAClE,aAAO,KAAK,QAAQ,OAAO;AAAA,QACzB,UAAU,SAAS,SAAS,IAAI,WAAY,KAAK,YAAY,CAAC;AAAA,QAC9D;AAAA,QACA;AAAA,QACA;AAAA,MACF,CAAC;AAAA,IACH;AAAA,EACF;AACF;;;ACxBO,SAAS,eACX,OACyB;AAC5B,MAAI,MAAM,WAAW,GAAG;AACtB,UAAM,IAAI,MAAM,wCAAwC;AAAA,EAC1D;AACA,SAAO;AAAA,IACL,MAAM,YAAY,MAAM,IAAI,CAAC,MAAM,EAAE,IAAI,EAAE,KAAK,GAAG,CAAC;AAAA,IACpD,MAAM,OAAO,KAA6D;AACxE,YAAM,UAAwE,CAAC;AAC/E,iBAAW,QAAQ,OAAO;AACxB,cAAM,MAAM,MAAM,KAAK,OAAO,GAAG;AACjC,gBAAQ,KAAK,EAAE,MAAM,IAAI,CAAC;AAAA,MAC5B;AAQA,YAAM,YAAY,QAAQ,IAAI,CAAC,MAAM,EAAE,IAAI,QAAQ;AACnD,YAAM,UAAwB,UAAU,MAAM,CAAC,MAAM,MAAM,MAAM,IAC7D,SACA,UAAU,SAAS,cAAc,IAC/B,iBACA,UAAU,SAAS,eAAe,IAChC,kBACA,UAAU,SAAS,MAAM,IACvB,SACA;AAEV,YAAM,eAAe,QAAQ;AAAA,QAAQ,CAAC,MACpC,EAAE,IAAI,kBAAkB,SAAS,IAC7B,EAAE,IAAI,oBACN,CAAC,EAAE,MAAM,EAAE,KAAK,MAAM,QAAQ,EAAE,IAAI,aAAa,QAAQ,QAAQ,EAAE,IAAI,CAAC;AAAA,MAC9E;AAEA,YAAM,UAAU,QAAQ;AAAA,QAAQ,CAAC,MAC/B,EAAE,IAAI,QAAQ,IAAI,CAAC,WAAW,IAAI,EAAE,KAAK,IAAI,KAAK,MAAM,EAAE;AAAA,MAC5D;AAEA,aAAO;AAAA,QACL,UAAU;AAAA,QACV;AAAA,QACA,mBAAmB;AAAA,QACnB,OAAO,QAAQ,CAAC,GAAG,IAAI;AAAA,MACzB;AAAA,IACF;AAAA,EACF;AACF;;;ACbO,SAAS,YACd,WACA,UACA,aACA,QACe;AACf,QAAM,YAAY,CAChB,QACA,WA
CuB;AACvB,UAAM,SAAS,OAAO,IAAI,MAAM;AAChC,QAAI,CAAC,OAAQ,QAAO;AACpB,UAAM,OAAiB,CAAC;AACxB,eAAW,KAAK,OAAO,OAAO,MAAM,GAAG;AACrC,YAAM,IAAI,OAAO,CAAC;AAClB,UAAI,OAAO,MAAM,YAAY,OAAO,SAAS,CAAC,EAAG,MAAK,KAAK,CAAC;AAAA,IAC9D;AACA,QAAI,KAAK,WAAW,EAAG,QAAO;AAC9B,WAAO,KAAK,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,KAAK;AAAA,EAChD;AAEA,QAAM,UAAU,CAAC,WAAmB,YAAY,IAAI,OAAO,MAAM,GAAG,EAAE,CAAC,KAAK,EAAE;AAC9E,QAAM,YAAY,CAAC,GAAG,UAAU,KAAK,CAAC,EAAE,OAAO,OAAO,EAAE,KAAK;AAC7D,QAAM,YAAY,CAAC,GAAG,SAAS,KAAK,CAAC,EAAE,OAAO,OAAO,EAAE,KAAK;AAG5D,MAAI,UAAU,WAAW,UAAU,UAAU,UAAU,KAAK,CAAC,GAAG,MAAM,MAAM,UAAU,CAAC,CAAC,GAAG;AACzF,UAAM,IAAI;AAAA,MACR,gFACgB,UAAU,KAAK,GAAG,CAAC,eAAe,UAAU,KAAK,GAAG,CAAC;AAAA,IAEvE;AAAA,EACF;AAEA,QAAM,SAAmB,CAAC;AAC1B,QAAM,QAAkB,CAAC;AACzB,QAAM,UAAoB,CAAC;AAC3B,aAAW,UAAU,WAAW;AAC9B,UAAM,IAAI,UAAU,UAAU,MAAM;AACpC,UAAM,IAAI,UAAU,WAAW,MAAM;AAGrC,QAAI,MAAM,UAAa,MAAM,OAAW;AACxC,WAAO,KAAK,CAAC;AACb,UAAM,KAAK,CAAC;AACZ,YAAQ,KAAK,MAAM;AAAA,EACrB;AACA,SAAO,EAAE,QAAQ,OAAO,QAAQ;AAClC;AA6BO,SAAS,oBACd,QACA,OAAmC,CAAC,GACf;AACrB,QAAM,iBAAiB,KAAK,kBAAkB;AAC9C,QAAM,oBAAoB,KAAK,qBAAqB;AACpD,QAAM,YAAY,gBAAgB,OAAO,QAAQ,OAAO,OAAO;AAAA,IAC7D,YAAY,KAAK,cAAc;AAAA,IAC/B,WAAW,KAAK,aAAa;AAAA,IAC7B,WAAW,KAAK,aAAa;AAAA,IAC7B,MAAM,KAAK,QAAQ;AAAA,EACrB,CAAC;AACD,QAAM,IAAI,OAAO,OAAO;AACxB,QAAM,UAAU,IAAI;AACpB,QAAM,cAAc,CAAC,WAAW,UAAU,MAAM;AAChD,SAAO,EAAE,QAAQ,WAAW,GAAG,aAAa,QAAQ;AACtD;AAeO,SAAS,YAAY,QAA2B;AACrD,SAAO,OAAO,KAAK,CAAC,MAAM,KAAK,IAAI,CAAC,IAAI,GAAG,IAAI,MAAM;AACvD;AAQO,SAAS,qBACd,WACA,UACA,aACA,oBACA,OAAuF,CAAC,GACjE;AACvB,QAAM,MAA6B,CAAC;AACpC,aAAW,OAAO,oBAAoB;AACpC,UAAM,SAAS,YAAY,WAAW,UAAU,aAAa,CAAC,MAAM,EAAE,WAAW,GAAG,CAAC;AACrF,QAAI,OAAO,OAAO,WAAW,EAAG;AAChC,UAAM,YAAY,KAAK,aAAa,OAAO,YAAY,CAAC,GAAG,OAAO,QAAQ,GAAG,OAAO,KAAK,CAAC;AAC1F,UAAM,YAAY,gBAAgB,OAAO,QAAQ,OAAO,OAAO;AAAA,MAC7D,YAAY,KAAK,cAAc;AAAA,MAC/B,WAAW,KAAK,aAAa;AAAA,MAC7B,WAAW;AAAA,MACX,MAAM,KAAK,QAAQ;AAAA,IACrB,CAAC;AACD,QAAI,KAAK;AAAA,MACP,WAAW;AAAA,MACX;AAAA,MACA,WAAW,UAAU,MAAM,CAAC;AAAA,MAC5B;AAAA,MACA,GAAG,OAAO,OAA
O;AAAA,IACnB,CAAC;AAAA,EACH;AACA,SAAO;AACT;;;ACjIO,SAAS,sBACd,SAC4B;AAC5B,QAAM,iBAAiB,QAAQ,kBAAkB;AACjD,QAAM,aAAa,QAAQ,cAAc;AACzC,QAAM,YAAY,QAAQ,sBAAsB;AAChD,QAAM,OAAO,QAAQ,iBAAiB;AACtC,QAAM,oBAAoB,QAAQ,qBAAqB;AACvD,QAAM,gBAAgB,QAAQ,8BAA8B;AAE5D,SAAO;AAAA,IACL,MAAM;AAAA,IACN,MAAM,OAAO,KAA6D;AACxE,YAAM,UAAoB,CAAC;AAC3B,YAAM,eAA0E,CAAC;AASjF,YAAM,cAAc,IAAI,IAAI,QAAQ,iBAAiB,IAAI,CAAC,MAAM,EAAE,EAAE,CAAC;AACrE,YAAM,MAAM;AAAA,QACV;AAAA,UACE,IAAI;AAAA,UACJ,IAAI,uBAAuB,IAAI;AAAA,UAC/B;AAAA,UACA,CAAC,MAAM,EAAE;AAAA,QACX;AAAA,QACA,EAAE,gBAAgB,mBAAmB,YAAY,WAAW,KAAK;AAAA,MACnE;AACA,YAAM,QAAQ,IAAI,UAAU;AAC5B,YAAM,cAAc,IAAI;AACxB,mBAAa,KAAK;AAAA,QAChB,MAAM;AAAA,QACN,QAAQ;AAAA,QACR,QAAQ;AAAA,UACN,GAAG,IAAI;AAAA,UACP,aAAa,IAAI,UAAU;AAAA,UAC3B,OAAO,IAAI,UAAU;AAAA,UACrB,QAAQ,IAAI,UAAU;AAAA,UACtB,YAAY,IAAI,UAAU;AAAA,UAC1B;AAAA,UACA,SAAS,IAAI;AAAA,QACf;AAAA,MACF,CAAC;AACD,UAAI,CAAC,aAAa;AAChB,gBAAQ;AAAA,UACN,IAAI,UACA,kBAAkB,IAAI,CAAC,mBAAmB,iBAAiB,2CAC3D,mBAAmB,IAAI,UAAU,IAAI,QAAQ,CAAC,CAAC,qBAAgB,cAAc,YAAY,IAAI,UAAU,OAAO,QAAQ,CAAC,CAAC,MAAM,IAAI,UAAU,aAAa,KAAK,QAAQ,CAAC,CAAC,SAAS,IAAI,UAAU,IAAI,QAAQ,CAAC,CAAC,KAAK,IAAI,UAAU,KAAK,QAAQ,CAAC,CAAC;AAAA,QACrP;AAAA,MACF;AAQA,YAAM,UAAU,QAAQ,oBAAoB,SACxC;AAAA,QACE,IAAI;AAAA,QACJ,IAAI,uBAAuB,IAAI;AAAA,QAC/B;AAAA,QACA,QAAQ;AAAA,QACR,EAAE,WAAW,QAAQ,qBAAqB,YAAY,WAAW,KAAK;AAAA,MACxE,IACA,CAAC;AACL,YAAM,YAAY,QAAQ,OAAO,CAAC,MAAM,EAAE,SAAS;AACnD,YAAM,UAAU,UAAU,WAAW;AACrC,mBAAa,KAAK;AAAA,QAChB,MAAM;AAAA,QACN,QAAQ;AAAA,QACR,QAAQ;AAAA,UACN,SAAS,QAAQ,sBAAsB,CAAC;AAAA,UACxC,aAAa,QAAQ,IAAI,CAAC,OAAO;AAAA,YAC/B,WAAW,EAAE;AAAA,YACb,OAAO,EAAE,UAAU;AAAA,YACnB,QAAQ,EAAE,UAAU;AAAA,YACpB,WAAW,EAAE;AAAA,YACb,GAAG,EAAE;AAAA,YACL,WAAW,EAAE;AAAA,UACf,EAAE;AAAA,QACJ;AAAA,MACF,CAAC;AACD,UAAI,CAAC,SAAS;AACZ,gBAAQ;AAAA,UACN,oCAAoC,UAAU,IAAI,CAAC,MAAM,GAAG,EAAE,SAAS,WAAW,EAAE,UAAU,IAAI,QAAQ,CAAC,CAAC,OAAO,EAAE,SAAS,EAAE,EAAE,KAAK,IAAI,CAAC;AAAA,QAC9I;AAAA,MACF;AAGA,YAAM,aACJ,QAAQ,cAAc,UACtB,IAAI,KAAK,YAAY,IAAI,KAAK,YAAY,QAAQ;AACpD,mBAAa,KAAK;AAAA,QAChB,MAAM;AAAA,QACN,QA
AQ;AAAA,QACR,QAAQ;AAAA,UACN,cAAc,IAAI,KAAK;AAAA,UACvB,aAAa,IAAI,KAAK;AAAA,UACtB,WAAW,QAAQ;AAAA,QACrB;AAAA,MACF,CAAC;AACD,UAAI,CAAC,YAAY;AACf,gBAAQ;AAAA,UACN,UAAU,IAAI,KAAK,YAAY,IAAI,KAAK,UAAU,QAAQ,CAAC,CAAC,aAAa,QAAQ,SAAS;AAAA,QAC5F;AAAA,MACF;AAGA,YAAM,kBAAkB,QAAQ,iBAC5B,aAAa,IAAI,oBAAoB,QAAQ,cAAc,IAC3D,EAAE,QAAQ,MAAM,UAAU,CAAC,EAAE;AACjC,mBAAa,KAAK;AAAA,QAChB,MAAM;AAAA,QACN,QAAQ,gBAAgB;AAAA,QACxB,QAAQ;AAAA,UACN,UAAU,gBAAgB,SAAS;AAAA,UACnC,QAAQ,gBAAgB,SAAS,MAAM,GAAG,CAAC;AAAA,QAC7C;AAAA,MACF,CAAC;AACD,UAAI,CAAC,gBAAgB,QAAQ;AAC3B,gBAAQ,KAAK,0BAA0B,gBAAgB,SAAS,MAAM,YAAY;AAAA,MACpF;AAGA,UAAI,sBAAkD;AACtD,UAAI,QAAQ,cAAc,QAAQ,WAAW,UAAU,IAAI;AACzD,8BAAsB,oBAAoB,EAAE,MAAM,QAAQ,WAAW,CAAC;AAAA,MACxE;AAIA,YAAM,kBAAkB;AACxB,YAAM,kBAAkB,qBAAqB,YAAY,CAAC,GAAG;AAAA,QAC3D,CAAC,MAAM,EAAE,YAAY;AAAA,MACvB;AACA,YAAM,oBACJ,CAAC,uBACD,CAAC,iBACA,eAAe,WAAW,KAAK,oBAAoB,YAAY;AAClE,mBAAa,KAAK;AAAA,QAChB,MAAM;AAAA,QACN,QAAQ;AAAA,QACR,QAAQ,EAAE,QAAQ,qBAAqB,oBAAoB,eAAe,OAAO;AAAA,MACnF,CAAC;AACD,UAAI,CAAC,mBAAmB;AACtB,gBAAQ;AAAA,UACN,mCAAmC,eAAe,MAAM,sCAAsC,oBAAqB,OAAO;AAAA,QAC5H;AAAA,MACF;AAGA,UAAI,eAAoC;AACxC,UAAI,QAAQ,cAAc,QAAQ,WAAW,UAAU,IAAI;AACzD,uBAAe,YAAY,QAAQ,YAAY,CAAC,CAAC;AAAA,MACnD;AAEA,YAAM,eAAe,cAAc,UAAU,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,aAAa,OAAO;AACrF,YAAM,aAAa,YAAY,WAAW;AAC1C,mBAAa,KAAK;AAAA,QAChB,MAAM;AAAA,QACN,QAAQ;AAAA,QACR,QAAQ,EAAE,aAAa,cAAc,OAAO,UAAU,GAAG,aAAa,YAAY,OAAO;AAAA,MAC3F,CAAC;AACD,UAAI,CAAC,YAAY;AACf,gBAAQ,KAAK,wBAAwB,YAAY,MAAM,EAAE;AAAA,MAC3D;AAGA,YAAM,YAAY,aAAa,MAAM,CAAC,MAAM,EAAE,MAAM;AACpD,YAAM,WAAW,YAAY,SAAS;AAEtC,aAAO;AAAA,QACL;AAAA,QACA,SAAS,QAAQ,SAAS,IAAI,UAAU,CAAC,kBAAkB;AAAA,QAC3D,mBAAmB;AAAA,QACnB;AAAA,MACF;AAAA,IACF;AAAA,EACF;AACF;AAEA,SAAS,aACP,WACA,SAC8E;AAC9E,QAAM,WAA0D,CAAC;AACjE,aAAW,CAAC,SAAS,QAAQ,KAAK,WAAW;AAC3C,UAAM,OAAO,YAAY,QAAQ;AACjC,QAAI,SAAS,OAAW;AACxB,eAAW,UAAU,SAAS;AAC5B,YAAM,UAAU,mBAAmB,MAAM,CAAC,GAAG,MAAM;AACnD,UAAI,CAAC,QAAQ,QAAQ;AACnB,iBAAS,KAAK,EAAE,YAAY,OAAO,IAAI,QAAQ,QAAQ,UAAU,wBAAwB,CAAC;AAAA,MAC5F;AAAA,IACF;AAAA,EACF;AACA,SAAO,EAAE
,QAAQ,SAAS,WAAW,GAAG,SAAS;AACnD;AAEA,SAAS,YAAY,UAAuC;AAC1D,MAAI,OAAO,aAAa,SAAU,QAAO;AACzC,MAAI,YAAY,OAAO,aAAa,UAAU;AAC5C,UAAM,MAAM;AACZ,QAAI,OAAO,IAAI,SAAS,SAAU,QAAO,IAAI;AAC7C,QAAI,OAAO,IAAI,WAAW,SAAU,QAAO,IAAI;AAC/C,QAAI,OAAO,IAAI,YAAY,SAAU,QAAO,IAAI;AAAA,EAClD;AACA,SAAO;AACT;;;ACjLO,SAAS,oBACd,KACA,YACA,OAAmC,CAAC,GACpB;AAChB,MAAI,WAAW,WAAW,GAAG;AAC3B,UAAM,IAAI,MAAM,oDAAoD;AAAA,EACtE;AACA,QAAM,oBAAoB,KAAK,qBAAqB;AACpD,QAAM,aAAa,KAAK,cAAc;AACtC,QAAM,YAAY,KAAK,aAAa;AACpC,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,WAAW,IAAI,uBAAuB,IAAI;AAChD,QAAM,cAAc,IAAI,IAAI,IAAI,UAAU,IAAI,CAAC,MAAM,EAAE,EAAE,CAAC;AAE1D,QAAM,OAAuB,CAAC;AAC9B,aAAW,OAAO,YAAY;AAC5B,QAAI;AACJ,QAAI,IAAI,OAAO,SAAS,aAAa;AACnC,eAAS,CAAC,MAAM,EAAE;AAAA,IACpB,OAAO;AACL,YAAM,MAAM,IAAI,OAAO;AACvB,eAAS,CAAC,MAAM,EAAE,WAAW,GAAG;AAAA,IAClC;AACA,UAAM,SAAS,YAAY,IAAI,aAAa,UAAU,aAAa,MAAM;AAIzE,UAAM,SAAS,IAAI,cAAc,aAAa,OAAO,SAAS,OAAO;AACrE,UAAM,QAAQ,IAAI,cAAc,aAAa,OAAO,QAAQ,OAAO;AACnE,UAAM,YAAY,gBAAgB,QAAQ,OAAO;AAAA,MAC/C;AAAA,MACA;AAAA,MACA,WAAW;AAAA,MACX;AAAA,IACF,CAAC;AACD,UAAM,IAAI,OAAO,OAAO;AACxB,UAAM,iBACJ,IAAI,kBAAkB,OAAO,YAAY,CAAC,GAAG,OAAO,QAAQ,GAAG,OAAO,KAAK,CAAC;AAC9E,UAAM,gBAAgB,IAAI,iBAAiB;AAM3C,UAAM,UACJ,IAAI,oBACA,aACA,UAAU,MAAM,CAAC,iBACf,cACA,UAAU,MAAM,gBACd,aACA;AACV,SAAK,KAAK;AAAA,MACR,MAAM,IAAI;AAAA,MACV,QAAQ,IAAI;AAAA,MACZ,WAAW,IAAI;AAAA,MACf;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,IACF,CAAC;AAAA,EACH;AACA,QAAM,KAAK,KAAK,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,OAAO,CAAC,MAAM,IAAI,CAAC;AACnD,QAAM,OAAO,GAAG,SAAS,IAAI,KAAK,IAAI,GAAG,EAAE,IAAI;AAC/C,SAAO,EAAE,MAAM,MAAM,MAAM,EAAE,WAAW,IAAI,KAAK,WAAW,UAAU,IAAI,KAAK,SAAS,EAAE;AAC5F;AASO,IAAM,eAAgC,CAAC,OAAO;AACnD,QAAM,oBAAoB,GAAG,KAAK,IAAI,CAAC,QAAQ;AAAA,IAC7C,MAAM,aAAa,GAAG,IAAI;AAAA,IAC1B,QAAQ,GAAG,YAAY;AAAA,IACvB,QAAQ;AAAA,MACN,WAAW,GAAG;AAAA,MACd,QAAQ,GAAG;AAAA,MACX,SAAS,GAAG;AAAA,MACZ,GAAG,GAAG;AAAA,MACN,aAAa,GAAG,UAAU;AAAA,MAC1B,OAAO,GAAG,UAAU;AAAA,MACpB,QAAQ,GAAG,UAAU;AAAA,MACrB,YAAY,GAAG,UAAU;AAAA,MACzB,eAAe,GAAG;AAAA,MAClB,gBAAgB,GAAG;AAAA,IACrB;AAAA,EACF,EAAE
;AAEF,QAAM,YAAY,GAAG,KAAK,OAAO,CAAC,MAAM,EAAE,YAAY,WAAW;AACjE,QAAM,UAAU,GAAG,KAAK,OAAO,CAAC,MAAM,EAAE,YAAY,UAAU;AAC9D,QAAM,WAAW,GAAG,KAAK,OAAO,CAAC,MAAM,EAAE,YAAY,UAAU;AAE/D,MAAI;AACJ,QAAM,UAAoB,CAAC;AAC3B,MAAI,UAAU,SAAS,GAAG;AAIxB,eAAW;AACX,eAAW,KAAK,WAAW;AACzB,cAAQ;AAAA,QACN,cAAc,EAAE,IAAI,sCAAsC,EAAE,UAAU,IAAI,QAAQ,CAAC,CAAC,OAAO,EAAE,cAAc,OAAO,EAAE,CAAC;AAAA,MACvH;AAAA,IACF;AAAA,EACF,WAAW,QAAQ,SAAS,GAAG;AAG7B,eAAW;AACX,eAAW,KAAK,SAAS;AACvB,cAAQ;AAAA,QACN,cAAc,EAAE,IAAI,gBAAgB,EAAE,CAAC;AAAA,MACzC;AAAA,IACF;AAAA,EACF,WAAW,SAAS,SAAS,GAAG;AAG9B,eAAW;AACX,YAAQ;AAAA,MACN,+CAA+C,SAC5C;AAAA,QACC,CAAC,MACC,IAAI,EAAE,IAAI,MAAM,EAAE,UAAU,OAAO,QAAQ,CAAC,CAAC,YAAY,EAAE,UAAU,IAAI,QAAQ,CAAC,CAAC;AAAA,MACvF,EACC,KAAK,IAAI,CAAC;AAAA,IACf;AAAA,EACF,OAAO;AAGL,eAAW;AACX,YAAQ;AAAA,MACN;AAAA,IACF;AAAA,EACF;AAIA,QAAM,YAAY,GAAG,KAAK,KAAK,CAAC,MAAM,EAAE,OAAO,SAAS,WAAW,KAAK,GAAG,KAAK,CAAC;AACjF,SAAO,EAAE,UAAU,SAAS,mBAAmB,OAAO,WAAW,UAAU,OAAO;AACpF;AAiBO,SAAS,uBACd,SAC4B;AAC5B,MAAI,QAAQ,WAAW,WAAW,GAAG;AACnC,UAAM,IAAI,MAAM,uDAAuD;AAAA,EACzE;AACA,QAAM,SAAS,QAAQ,UAAU;AACjC,SAAO;AAAA,IACL,MAAM,QAAQ,QAAQ;AAAA,IACtB,MAAM,OAAO,KAA6D;AACxE,YAAM,KAAK,oBAAoB,KAAK,QAAQ,YAAY,OAAO;AAC/D,aAAO,OAAO,EAAE;AAAA,IAClB;AAAA,EACF;AACF;;;ACrQA,eAAsB,QACpB,MAC+C;AAC/C,SAAO,YAAY,IAAI;AACzB;","names":[]}

package/dist/{chunk-6XQIEUQ2.js → chunk-ZPSKPT3V.js} RENAMED Viewed

@@ -86,8 +86,10 @@ function buildDiagnosis(r) {
     const pct = (r.uncostedRecords / r.totalRecords * 100).toFixed(0);
     return [
       `${r.totalRecords} records with real LLM activity (in=${r.totalInputTokens}, out=${r.totalOutputTokens} tokens).`,
-      `${r.uncostedRecords} (${pct}%) have output tokens but costUsd=0 \u2014 cost ledger is mis-wired (no input-token`,
-      "propagation from the runtime stream into RunRecord)."
+      `${r.uncostedRecords} (${pct}%) have output tokens but costUsd=0. Two distinct roots:`,
+      "(a) cost ledger mis-wired \u2014 no usage propagation from the runtime stream into RunRecord; or",
+      "(b) the model is unpriced at the source (sandbox/router returned $0 despite real tokens).",
+      "For (b), price the measured tokens against the substrate table (estimateCost) instead of leaving $0."
     ].join(" ");
   }
   return `${r.totalRecords} records with real LLM activity (in=${r.totalInputTokens}, out=${r.totalOutputTokens} tokens, $${r.totalCostUsd.toFixed(4)}).`;
@@ -533,4 +535,4 @@ export {
   inMemoryCampaignStorage,
   runCampaign
 };
-//# sourceMappingURL=chunk-6XQIEUQ2.js.map
+//# sourceMappingURL=chunk-ZPSKPT3V.js.map