npm - @tangle-network/agent-eval - Versions diffs - 0.20.12 → 0.21.0 - Mend

@tangle-network/agent-eval 0.20.12 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48) hide show

package/CHANGELOG.md +76 -0
package/README.md +39 -1
package/dist/{chunk-75MCTH7P.js → chunk-3GN6U53I.js} +198 -3
package/dist/chunk-3GN6U53I.js.map +1 -0
package/dist/chunk-3IX6QTB7.js +1349 -0
package/dist/chunk-3IX6QTB7.js.map +1 -0
package/dist/{chunk-PKCVBYTQ.js → chunk-5IIQKMD5.js} +38 -2
package/dist/chunk-5IIQKMD5.js.map +1 -0
package/dist/{chunk-MCMV7DUL.js → chunk-ARZ6BEV6.js} +2 -2
package/dist/{chunk-HKYRWNHV.js → chunk-HRZELXCR.js} +2 -2
package/dist/{chunk-ODFINDLQ.js → chunk-KRR4VMH7.js} +11 -1
package/dist/chunk-KRR4VMH7.js.map +1 -0
package/dist/chunk-SNUHRBDL.js +154 -0
package/dist/chunk-SNUHRBDL.js.map +1 -0
package/dist/{chunk-KWUAAIHR.js → chunk-WOK2RTWG.js} +157 -1
package/dist/chunk-WOK2RTWG.js.map +1 -0
package/dist/{chunk-HNJLMAJ2.js → chunk-WOPGKVN4.js} +2 -2
package/dist/cli.js +3 -2
package/dist/cli.js.map +1 -1
package/dist/{control-C8NKbF3w.d.ts → control-cxwMOAsy.d.ts} +3 -2
package/dist/control.d.ts +4 -3
package/dist/control.js +2 -2
package/dist/emitter-B2XqDKFU.d.ts +121 -0
package/dist/{feedback-trajectory-BGQ_ANCN.d.ts → feedback-trajectory-CB0A32o3.d.ts} +2 -1
package/dist/index.d.ts +71 -83
package/dist/index.js +48 -60
package/dist/index.js.map +1 -1
package/dist/openapi.json +1 -1
package/dist/optimization.d.ts +3 -2
package/dist/optimization.js +2 -2
package/dist/reporting-Da2ihlcM.d.ts +672 -0
package/dist/reporting.d.ts +5 -426
package/dist/reporting.js +6 -2
package/dist/{emitter-BYO2nSDA.d.ts → store-u47QaJ9G.d.ts} +1 -91
package/dist/traces.d.ts +259 -3
package/dist/traces.js +24 -4
package/dist/wire/index.js +3 -2
package/docs/research-report-methodology.md +155 -0
package/package.json +10 -12
package/dist/chunk-75MCTH7P.js.map +0 -1
package/dist/chunk-IKFVX537.js +0 -717
package/dist/chunk-IKFVX537.js.map +0 -1
package/dist/chunk-KWUAAIHR.js.map +0 -1
package/dist/chunk-ODFINDLQ.js.map +0 -1
package/dist/chunk-PKCVBYTQ.js.map +0 -1
/package/dist/{chunk-MCMV7DUL.js.map → chunk-ARZ6BEV6.js.map} +0 -0
/package/dist/{chunk-HKYRWNHV.js.map → chunk-HRZELXCR.js.map} +0 -0
/package/dist/{chunk-HNJLMAJ2.js.map → chunk-WOPGKVN4.js.map} +0 -0

package/dist/traces.d.ts CHANGED Viewed

@@ -1,5 +1,7 @@
-import { a as TraceStore, L as LlmSpan, J as JudgeSpan, R as Run, F as FailureClass, d as ToolSpan } from './emitter-BYO2nSDA.js';
-export { A as Artifact, B as BudgetLedgerEntry, c as BudgetSpec, E as EventFilter, f as EventKind, g as FAILURE_CLASSES, h as FileSystemTraceStore, i as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, M as Message, j as RetrievalSpan, e as RunFilter, k as RunLayer, C as RunOutcome, l as RunStatus, m as SandboxSpan, S as Span, n as SpanBase, o as SpanFilter, p as SpanHandle, q as SpanKind, r as SpanStatus, s as TRACE_SCHEMA_VERSION, T as TraceEmitter, t as TraceEmitterOptions, b as TraceEvent, u as isJudgeSpan, v as isLlmSpan, w as isRetrievalSpan, x as isSandboxSpan, y as isToolSpan, z as llmSpanFromProvider } from './emitter-BYO2nSDA.js';
+import { T as TraceStore, L as LlmSpan, J as JudgeSpan, a as Run, F as FailureClass, c as ToolSpan } from './store-u47QaJ9G.js';
+export { A as Artifact, B as BudgetLedgerEntry, g as BudgetSpec, i as EventFilter, E as EventKind, j as FAILURE_CLASSES, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, M as Message, d as RetrievalSpan, h as RunFilter, m as RunLayer, R as RunOutcome, n as RunStatus, e as SandboxSpan, S as Span, o as SpanBase, p as SpanFilter, b as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, f as TraceEvent, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-u47QaJ9G.js';
+import { a as RunCompleteHookContext, R as RunCompleteHook } from './emitter-B2XqDKFU.js';
+export { S as SpanHandle, T as TraceEmitter, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-B2XqDKFU.js';
 import { AxAIService, AxFunction } from '@ax-llm/ax';
 /**
@@ -130,6 +132,206 @@ interface OtlpExport {
 /** Export a single run's spans + events in OTLP/JSON. */
 declare function exportRunAsOtlp(store: TraceStore, runId: string, resourceAttrs?: Record<string, string | number | boolean>): Promise<OtlpExport>;
+/**
+ * RawProviderSink — first-class persistence for the actual HTTP-level
+ * request/response bodies of every LLM provider call.
+ *
+ * Why this is a separate sink from the structured `LlmSpan`:
+ *
+ *   - `LlmSpan` records the *intent* — model name, messages, output text,
+ *     usage. It's what dashboards read; it's NOT enough for forensics.
+ *   - When a downstream consumer reports "the verifier used the wrong route"
+ *     or "tokens look right but reasoning was missing," the only way to
+ *     answer is the raw HTTP body. Span fields can lie (a proxy can echo
+ *     a different `model` value than what actually answered); the raw
+ *     response is ground truth.
+ *
+ * Default behaviour: opt-in. Pass `rawSink` in `LlmClientOptions` (or let
+ * the matrix runner / BuilderSession set it up automatically) and every
+ * request, response, and error is recorded — including retries, with the
+ * attempt index attached so a flaky call's full event chain is recoverable.
+ *
+ * Redaction is enforced at sink time. The default redactor strips
+ * `Authorization`, `X-Api-Key`, `X-Auth-Token`, `Cookie` headers and any
+ * payload field whose key matches `apiKey | api_key | bearer | password |
+ * secret | token` (case-insensitive). Override via the sink constructor or
+ * the per-call `redactor`. The `redactedFields` array on the persisted
+ * event lets a reviewer see what was stripped without exposing the values.
+ */
+type RawProviderDirection = 'request' | 'response' | 'error';
+interface RawProviderEvent {
+    /** Stable id. Generated by the sink if omitted. */
+    eventId: string;
+    /** Trace context populated by `LlmClient` when the call is wrapped in a span. */
+    runId?: string;
+    spanId?: string;
+    /**
+     * Logical provider name. Free-form so callers can use whatever id matches
+     * their topology (`'openai'`, `'anthropic'`, `'tangle-router'`, …). When
+     * omitted, derived from `baseUrl` in `LlmClientOptions`.
+     */
+    provider: string;
+    model: string;
+    /** Endpoint path, e.g. `'/v1/chat/completions'`. */
+    endpoint: string;
+    /** Base URL used for the call (already-normalised — no trailing slash). */
+    baseUrl: string;
+    /** 0-indexed retry attempt. The first attempt is 0; a retried call gets 1, 2, … */
+    attemptIndex: number;
+    direction: RawProviderDirection;
+    /** Unix ms. */
+    timestamp: number;
+    /** Wall-clock duration of the call leg. Set on `response` and `error` events; omitted on `request`. */
+    durationMs?: number;
+    statusCode?: number;
+    requestHeaders?: Record<string, string>;
+    requestBody?: unknown;
+    responseHeaders?: Record<string, string>;
+    responseBody?: unknown;
+    /** Set on `direction: 'error'` events. */
+    errorMessage?: string;
+    /** Field paths the redactor stripped from this event ('header:Authorization', 'body.apiKey', …). */
+    redactedFields: string[];
+}
+interface RawProviderSinkFilter {
+    runId?: string;
+    spanId?: string;
+    direction?: RawProviderDirection;
+    attemptIndex?: number;
+}
+interface RawProviderSink {
+    record(event: RawProviderEvent): Promise<void>;
+    /** Optional listing — implementations that durably persist (file, db) should support this. */
+    list?(filter?: RawProviderSinkFilter): Promise<RawProviderEvent[]>;
+    /** Optional teardown for backed implementations. */
+    close?(): Promise<void>;
+}
+type ProviderRedactor = (event: RawProviderEvent) => RawProviderEvent;
+/**
+ * Default redactor — strips well-known auth headers and any body field whose
+ * key matches the credential pattern. Records every redacted path on
+ * `event.redactedFields` so a downstream reviewer can see what was removed.
+ */
+declare function defaultProviderRedactor(event: RawProviderEvent): RawProviderEvent;
+interface InMemoryRawProviderSinkOptions {
+    redactor?: ProviderRedactor;
+}
+declare class InMemoryRawProviderSink implements RawProviderSink {
+    private events;
+    private redactor;
+    constructor(opts?: InMemoryRawProviderSinkOptions);
+    record(event: RawProviderEvent): Promise<void>;
+    list(filter?: RawProviderSinkFilter): Promise<RawProviderEvent[]>;
+    size(): number;
+}
+declare class NoopRawProviderSink implements RawProviderSink {
+    record(): Promise<void>;
+}
+interface FileSystemRawProviderSinkOptions {
+    /** Directory the NDJSON file is written into. Created if missing. */
+    dir: string;
+    /** File name; default `'raw-provider-events.ndjson'`. */
+    fileName?: string;
+    /** Bytes after which the writer rolls over to a new file (default 32 MiB). */
+    rollAtBytes?: number;
+    redactor?: ProviderRedactor;
+}
+declare class FileSystemRawProviderSink implements RawProviderSink {
+    private dir;
+    private fileName;
+    private rollAtBytes;
+    private redactor;
+    private bytesWritten;
+    private rollIndex;
+    private initPromise;
+    constructor(opts: FileSystemRawProviderSinkOptions);
+    private ensureInit;
+    private currentPath;
+    record(event: RawProviderEvent): Promise<void>;
+    list(filter?: RawProviderSinkFilter): Promise<RawProviderEvent[]>;
+}
+/**
+ * Best-effort provider id from a base URL. Falls back to the URL host when
+ * none of the well-known patterns match.
+ */
+declare function providerFromBaseUrl(baseUrl: string): string;
+/**
+ * Run-completion integrity check — at end of run, verify the expected event
+ * types were actually captured. The point is the launch-review failure mode:
+ * a run *appears* successful but the raw provider events were never written,
+ * so a downstream reviewer can't reconstruct what happened.
+ *
+ * Pattern:
+ *
+ *   const report = await assertRunCaptured(store, runId, {
+ *     llmSpansMin: 1,
+ *     judgeSpansMin: 1,
+ *     rawSink: providerSink,                  // must have ≥ 1 event for this run
+ *     requireRawCoverageOfLlmSpans: true,     // every llm span has matching raw events
+ *   })
+ *   if (!report.ok) throwIfRunIncomplete(report)  // or mark run failed and continue
+ *
+ * The function is read-only on the store and returns a structured report;
+ * the caller chooses the failure mode (throw, mark run failed, log warning).
+ * `throwIfRunIncomplete` is the convenient strict mode.
+ */
+interface RunIntegrityExpectations {
+    /** Minimum LLM span count. Default 0 (no requirement). */
+    llmSpansMin?: number;
+    /** Minimum judge span count. Default 0. */
+    judgeSpansMin?: number;
+    /** Minimum tool span count. Default 0. */
+    toolSpansMin?: number;
+    /**
+     * Raw provider sink to consult for capture verification. When present,
+     * the check requires at least one raw event for the run.
+     */
+    rawSink?: RawProviderSink;
+    /** Minimum raw provider event count. Default 0; ignored when `rawSink` absent. */
+    rawProviderEventsMin?: number;
+    /**
+     * Every LLM span must have at least one matching raw `request` event
+     * (matched by spanId). Catches the common bug where the structured span
+     * was emitted but the raw HTTP capture was wired to a different sink.
+     */
+    requireRawCoverageOfLlmSpans?: boolean;
+    /** Run outcome must be set (not null/undefined). Default false. */
+    requireOutcome?: boolean;
+}
+type RunIntegrityIssueCode = 'no_run' | 'missing_llm_spans' | 'missing_judge_spans' | 'missing_tool_spans' | 'missing_raw_events' | 'no_raw_sink' | 'orphan_llm_span' | 'missing_outcome';
+interface RunIntegrityIssue {
+    code: RunIntegrityIssueCode;
+    message: string;
+    detail?: Record<string, unknown>;
+}
+interface RunIntegrityReport {
+    ok: boolean;
+    runId: string;
+    llmSpanCount: number;
+    judgeSpanCount: number;
+    toolSpanCount: number;
+    rawProviderEventCount: number;
+    /**
+     * Coverage of LLM spans by raw provider events keyed on spanId.
+     * `total` is the number of LLM spans; `covered` is the count with at
+     * least one matching `request` raw event.
+     */
+    rawSpanCoverage: {
+        covered: number;
+        total: number;
+    };
+    issues: RunIntegrityIssue[];
+}
+declare class RunIntegrityError extends Error {
+    readonly report: RunIntegrityReport;
+    constructor(report: RunIntegrityReport);
+}
+declare function assertRunCaptured(store: TraceStore, runId: string, expectations?: RunIntegrityExpectations): Promise<RunIntegrityReport>;
+/** Strict mode: throws `RunIntegrityError` when the report isn't ok. */
+declare function throwIfRunIncomplete(report: RunIntegrityReport): void;
 /**
  * Shared types for the trace-analyst module.
  *
@@ -578,6 +780,60 @@ declare function traceAnalystFunctionGroup(opts: BuildTraceAnalystToolsOpts): {
     functions: AxFunction[];
 };
+/**
+ * Trace-analyst auto-execution hook.
+ *
+ * Wires `analyzeTraces` into a `TraceEmitter`'s `onRunComplete` so a
+ * direct matrix run produces an analysis artifact without an out-of-band
+ * step. Designed for the case where the consumer reports "the analyst
+ * never ran" — the cause is almost always orchestration, not the analyst.
+ *
+ * Usage:
+ *
+ *   const emitter = new TraceEmitter(store, {
+ *     onRunComplete: [traceAnalystOnRunComplete({ analyze: opts, save })],
+ *   })
+ *
+ * Hooks are best-effort by default — they never crash the underlying run.
+ * The caller decides whether to gate the run on the analysis result via
+ * the `gateOn` callback.
+ */
+interface TraceAnalystHookOptions {
+    /**
+     * Options forwarded to `analyzeTraces`. The hook supplies the question
+     * if you don't pass one — defaulting to a launch-grade prompt that asks
+     * for failure modes, surprising findings, and a recommendation.
+     */
+    analyze: Omit<AnalyzeTracesOptions, 'source'> & {
+        source?: AnalyzeTracesOptions['source'];
+    };
+    /**
+     * Override the question. The default is intentionally generic:
+     * "Summarise what happened in this run, surface any failure modes,
+     *  surprising findings, or evidence the verdict is wrong."
+     */
+    question?: string;
+    /**
+     * Persist the result. The hook calls this with the analysis output and
+     * the run context. Common implementations write to a TraceAnalysisStore
+     * or append to a per-run JSONL.
+     */
+    save?: (result: AnalyzeTracesResult, ctx: RunCompleteHookContext) => Promise<void>;
+    /**
+     * Predicate gating execution per run. Default: every completed run.
+     * Use to skip aborted runs, debug runs, or runs without LLM activity.
+     */
+    shouldRun?: (ctx: RunCompleteHookContext) => boolean;
+    /**
+     * Optional gate: if set and returns false, the hook records the failure
+     * as a log event on the run instead of staying quiet. The caller can
+     * then trigger downstream alerts off `analyst_gate_failed` log events.
+     */
+    gateOn?: (result: AnalyzeTracesResult, ctx: RunCompleteHookContext) => boolean;
+}
+declare function traceAnalystOnRunComplete(opts: TraceAnalystHookOptions): RunCompleteHook;
 /** Ax RLM prompt for bounded trace discovery and evidence-backed analysis. */
 declare const TRACE_ANALYST_ACTOR_DESCRIPTION = "You answer questions about an OTLP-shaped JSONL trace dataset using the trace tools provided in the `traces` namespace.\n\nDISCOVERY \u2192 NARROW \u2192 DEEP-READ protocol \u2014 follow exactly:\n\n1. ALWAYS call `traces.getDatasetOverview({})` FIRST without a regex_pattern. The result tells you total_traces, raw_jsonl_bytes, services, agents, models, and sample_trace_ids (real ids \u2014 never fabricate one).\n\n2. Use raw_jsonl_bytes to gauge how expensive raw scans will be. `filters.regex_pattern` is the one scan-heavy filter on getDatasetOverview / queryTraces / countTraces \u2014 narrow with indexed fields (has_errors, model_names, service_names, agent_names, time bounds) BEFORE adding a regex on a large dataset.\n\n3. To list more traces than the sample, call `traces.queryTraces({ filters?, limit, offset? })`. Each summary carries raw_jsonl_bytes \u2014 use it to choose between viewTrace and searchTrace BEFORE calling either.\n\n4. Per-trace inspection:\n   - SMALL trace (raw_jsonl_bytes well under 150_000): call `traces.viewTrace({ trace_id })`. Returns all spans. Per-attribute payloads are head-capped at ~4KB; large `input.value` / `output.value` / `llm.input_messages` will show a `[trace-analyst truncated: N bytes]` marker.\n   - LARGE trace (raw_jsonl_bytes near or above 150_000, or you saw an `oversized` response): use `traces.searchTrace({ trace_id, regex_pattern })` to get bounded SpanMatchRecords (span metadata + matched text + surrounding context). Then call `traces.viewSpans({ trace_id, span_ids: [...] })` for surgical reads (~16KB cap, 4\u00D7 higher than discovery), or `traces.searchSpan({ trace_id, span_id, regex_pattern })` for one large span. Stays bounded regardless of trace size.\n   - Useful regex patterns: `STATUS_CODE_ERROR` (failures), tool names like `grep` or `view_trace`, error strings like `MaxTurnsExceeded`, model names, attribute keys.\n\n5. 
ONLY call viewTrace / viewSpans / searchTrace / searchSpan with trace/span ids you have already seen in sample_trace_ids, a queryTraces page, or a previous search result. Never invent ids.\n\n5a. **Result-shape contract** \u2014 searchTrace and searchSpan return `{ trace_id, hits, total_matches, has_more }`. Iterate `result.hits` (NOT result.matches). Each hit has `{ span_id, span_name, span_kind, attribute_path, matched_text, context_before, context_after, match_offset }`. viewTrace returns `{ trace_id, spans }` (or `oversized`). viewSpans returns `{ trace_id, spans, missing_span_ids, truncated_attribute_count }`. Never assume a field name \u2014 log the result shape first if unsure.\n\n6. If viewTrace returns an `oversized` summary instead of `spans`, DO NOT retry the same call. Read the summary's top_span_names, span_count, span_response_bytes_max, error_span_count to plan a follow-up: switch to searchTrace (or searchSpan for one large span), then viewSpans on a smaller, surgical span_ids set.\n\n7. If searchTrace or searchSpan returns has_more=true, REFINE the regex to be more specific rather than blindly raising max_matches.\n\n8. If a tool errors (invalid regex, range error), STOP and reconsider \u2014 don't retry with a guessed id or argument. Use the discovery tools above to recover.\n\n9. If a ~4KB-truncated payload from viewTrace / searchTrace matters for your answer, first try viewSpans on that span id (~16KB cap). If a 16KB-truncated payload from viewSpans still matters, narrow further with searchSpan against a more specific regex rather than asking for the full payload again.\n\n10. If maxDepth > 0 and the question splits into independent semantic branches, delegate well-defined subtasks to subagents using `await llmQuery(...)`. Pass narrow context and a focused query. 
Examples:\n\n    const reviews = await llmQuery([\n      { query: 'Drill into trace abc123 \u2014 what tool calls preceded the failure?', context: { trace_id: 'abc123' } },\n      { query: 'Drill into trace def456 \u2014 same failure mode?', context: { trace_id: 'def456' } },\n    ]);\n\nOBSERVABILITY rules:\n- Each non-final actor turn must emit at least one `console.log(...)` for evidence. Up to 3 logs per turn is fine when correlating multiple data sources (e.g. one log for findings list, one for source-file content, one for derived analysis).\n- Do NOT combine `console.log` with `final(...)` or `askClarification(...)` in the same turn \u2014 finish gathering data first, then call final on its own turn.\n- Reuse runtime variables across turns; don't recompute.\n- When done, call `await final(answer)` with the fully-formed report. The responder rewrites the answer into output fields; if you only pass a vague summary string the responder has nothing concrete to format.\n\nCRITICAL \u2014 `final()` payload contract for evidence-grounded analysis tasks:\n- Pass a STRUCTURED object as the second arg with the actual data the responder needs to format the answer. Do NOT pass abstract instructions; pass evidence.\n- Example for per-item verdict tasks:\n  ```js\n  await final(\"Format the per-item verdict report from the evidence below.\", {\n    findings: [\n      { id: 'sub-1-finding-1', claim: '...', verdict: 'TRUE-POSITIVE', evidence: 'lines 42-45 of contracts/X.sol show ...' },\n      ...all items\n    ],\n    systemic_summary: '3 sentences I wrote based on the evidence above'\n  });\n  ```\n- Calling `final(\"answer\", {})` with no evidence is a failure mode \u2014 the responder will hallucinate or echo back the field names. Always include the gathered data.\n- Premature final after a single viewSpans call is INSUFFICIENT for per-finding analysis tasks. Read the requested attributes (e.g. 
`spans[i].attributes['redteam.finding.title']`), and for each one perform the requested cross-reference (e.g. read the source SPAN's `attributes['source.content']`).\n\nOUTPUT contract \u2014 your final answer must include:\n- A clear prose conclusion answering the user's question.\n- Trace ids and span ids cited as evidence for each claim.\n- Failure modes named in the user's domain language, with frequency and concrete examples.\n\nDo NOT invent trace ids, span ids, error messages, or model names. Every fact must be traceable to a tool result.";
 declare const TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION = "trace-analyst-actor-v5-2026-05-06";
@@ -655,4 +911,4 @@ declare function scoreTraceInsightReadiness(context: TraceInsightContext): Trace
 declare function defaultTraceInsightPanel(): TraceInsightPanelRole[];
 declare function buildTraceInsightPrompt(input: TraceInsightPromptInput): string;
-export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, type DatasetOverview, FailureClass, JudgeSpan, LlmSpan, OTEL_AGENT_EVAL_SCOPE, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, type QueryTracesPage, REDACTION_VERSION, type RedactionReport, type RedactionRule, Run, type SearchSpanResult, type SearchTraceResult, type SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, ToolSpan, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, TraceStore, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, aggregateLlm, analyzeTraces, argHash, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, groupBy, inferDomainKeywords, judgeSpans, llmSpans, planTraceInsightQuestions, redactString, redactValue, runFailureClass, runsForScenario, scoreTraceInsightReadiness, tokenizeDomainWords, toolSpans, traceAnalystFunctionGroup };
+export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, type DatasetOverview, FailureClass, FileSystemRawProviderSink, type FileSystemRawProviderSinkOptions, InMemoryRawProviderSink, type InMemoryRawProviderSinkOptions, JudgeSpan, LlmSpan, NoopRawProviderSink, OTEL_AGENT_EVAL_SCOPE, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, type ProviderRedactor, type QueryTracesPage, REDACTION_VERSION, type RawProviderDirection, type RawProviderEvent, type RawProviderSink, type RawProviderSinkFilter, type RedactionReport, type RedactionRule, Run, RunCompleteHook, RunCompleteHookContext, RunIntegrityError, type RunIntegrityExpectations, type RunIntegrityIssue, type RunIntegrityIssueCode, type RunIntegrityReport, type SearchSpanResult, type SearchTraceResult, type SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, ToolSpan, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystHookOptions, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, TraceStore, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, aggregateLlm, analyzeTraces, argHash, assertRunCaptured, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, defaultProviderRedactor, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, groupBy, 
inferDomainKeywords, judgeSpans, llmSpans, planTraceInsightQuestions, providerFromBaseUrl, redactString, redactValue, runFailureClass, runsForScenario, scoreTraceInsightReadiness, throwIfRunIncomplete, tokenizeDomainWords, toolSpans, traceAnalystFunctionGroup, traceAnalystOnRunComplete };

package/dist/traces.js CHANGED Viewed

@@ -7,6 +7,7 @@ import {
   OTEL_AGENT_EVAL_SCOPE,
   OtlpFileTraceStore,
   REDACTION_VERSION,
+  RunIntegrityError,
   SpanNotFoundError,
   TRACE_ANALYST_ACTOR_DESCRIPTION,
   TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
@@ -18,6 +19,7 @@ import {
   aggregateLlm,
   analyzeTraces,
   argHash,
+  assertRunCaptured,
   buildTraceAnalystTools,
   buildTraceInsightContext,
   buildTraceInsightPrompt,
@@ -40,24 +42,37 @@ import {
   runFailureClass,
   runsForScenario,
   scoreTraceInsightReadiness,
+  throwIfRunIncomplete,
   tokenizeDomainWords,
   toolSpans,
-  traceAnalystFunctionGroup
-} from "./chunk-KWUAAIHR.js";
+  traceAnalystFunctionGroup,
+  traceAnalystOnRunComplete
+} from "./chunk-WOK2RTWG.js";
 import {
   TraceEmitter,
   llmSpanFromProvider
-} from "./chunk-PKCVBYTQ.js";
+} from "./chunk-5IIQKMD5.js";
+import {
+  FileSystemRawProviderSink,
+  InMemoryRawProviderSink,
+  NoopRawProviderSink,
+  defaultProviderRedactor,
+  providerFromBaseUrl
+} from "./chunk-SNUHRBDL.js";
 import "./chunk-PZ5AY32C.js";
 export {
   DEFAULT_REDACTION_RULES,
   DEFAULT_TRACE_ANALYST_BUDGETS,
   FAILURE_CLASSES,
+  FileSystemRawProviderSink,
   FileSystemTraceStore,
+  InMemoryRawProviderSink,
   InMemoryTraceStore,
+  NoopRawProviderSink,
   OTEL_AGENT_EVAL_SCOPE,
   OtlpFileTraceStore,
   REDACTION_VERSION,
+  RunIntegrityError,
   SpanNotFoundError,
   TRACE_ANALYST_ACTOR_DESCRIPTION,
   TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
@@ -70,9 +85,11 @@ export {
   aggregateLlm,
   analyzeTraces,
   argHash,
+  assertRunCaptured,
   buildTraceAnalystTools,
   buildTraceInsightContext,
   buildTraceInsightPrompt,
+  defaultProviderRedactor,
   defaultTraceInsightPanel,
   describeTraceInsightScope,
   domainEvidencePattern,
@@ -88,13 +105,16 @@ export {
   llmSpanFromProvider,
   llmSpans,
   planTraceInsightQuestions,
+  providerFromBaseUrl,
   redactString,
   redactValue,
   runFailureClass,
   runsForScenario,
   scoreTraceInsightReadiness,
+  throwIfRunIncomplete,
   tokenizeDomainWords,
   toolSpans,
-  traceAnalystFunctionGroup
+  traceAnalystFunctionGroup,
+  traceAnalystOnRunComplete
 };
 //# sourceMappingURL=traces.js.map

package/dist/wire/index.js CHANGED Viewed

@@ -24,8 +24,9 @@ import {
   runRpcBatch,
   runRpcOnce,
   startServer
-} from "../chunk-HNJLMAJ2.js";
-import "../chunk-75MCTH7P.js";
+} from "../chunk-WOPGKVN4.js";
+import "../chunk-3GN6U53I.js";
+import "../chunk-SNUHRBDL.js";
 import "../chunk-PZ5AY32C.js";
 export {
   BUILTIN_RUBRICS,

package/docs/research-report-methodology.md ADDED Viewed

@@ -0,0 +1,155 @@
+# researchReport — methodology
+This document is the methodological brief for `researchReport` (exported from
+`@tangle-network/agent-eval` and `@tangle-network/agent-eval/reporting`). It
+exists so a launch reviewer, peer reviewer, or auditor can quickly verify that
+the verdict embedded in any rendered report is defensible, reproducible, and
+appropriate to the data.
+The companion code is `src/summary-report.ts`. Each item below names the
+corresponding function or option so the doc and the code don't drift.
+## Inputs
+- `runs: RunRecord[]` — every record carries `runId`, `candidateId`, `seed`,
+  `experimentId`, `splitTag`, and an `outcome` with the configured score.
+- `comparator: string` — the candidate id treated as the null reference. Must
+  be selected before data inspection; `preregistrationHash` should pin this.
+- `split: 'search' | 'holdout'` — defaults to `holdout`. Decisions on `search`
+  are descriptive only; promotion calls require the holdout.
+- `rope: { low, high }` — Region of Practical Equivalence on the paired delta,
+  in score units. Must come from the domain owner — there is no
+  statistically-defensible default.
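+- `minPairs` (soft floor, default 20) and `RESEARCH_REPORT_HARD_PAIR_FLOOR`
+  (hard floor, 6). Below the soft floor, the verdict is `needs_more_data`
+  unless an earlier step of the decision rule (e.g. `equivalent` or `reject`)
+  has already matched; a `needs_more_data` report carries the MDE at the
+  current N.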
+- `fdr` (default 0.05), `confidence` (default 0.95), `mdePower` (default 0.8),
+  `mdeAlpha` (default = `fdr`).
+## Pairing
+Pairs are joined by `(experimentId, seed)` so the comparator and candidate
+share scenario *and* seed. This is the same join `gainHistogram` uses; see
+`pairScoresByKey` in `src/summary-report.ts`. Records on the wrong split or
+with non-finite scores are dropped before pairing.
+## Decision rule
+In order — first match wins:
+1. `comparator` itself → `hold` (baseline).
+2. No comparator → `hold` if on the cost/quality Pareto frontier, else
+   `needs_more_data`. The verdict is descriptive, not causal.
+3. Held-out gate verdict ≠ `promote` → `reject`. The gate is *necessary but
+   not sufficient*; even a `promote` gate must clear the paired test below.
+4. Paired N < `RESEARCH_REPORT_HARD_PAIR_FLOOR` → `needs_more_data` with a
+   "below hard floor" reason. Bootstrap CIs degenerate at this size.
+5. ROPE configured AND paired-delta CI ⊂ ROPE → `equivalent`.
+6. Paired-delta CI upper bound < 0 → `reject` (CI excludes a non-negative
+   effect). Note: this uses **paired delta only** — not the marginal mean.
+7. Paired N < `minPairs` (soft floor) → `needs_more_data` with the MDE at
+   current N attached so the verdict is actionable.
+8. BH-adjusted q ≤ `fdr` AND CI lower bound > 0 → `promote`. The BH q-value
+   controls FDR across all candidates in the same sweep; the bootstrap CI
+   provides an effect-size guarantee independent of the test.
+9. Otherwise → `hold`.
+## Statistical primitives used
+| Quantity | Function | Source file |
+|---|---|---|
+| Marginal CI on score mean | `confidenceInterval` | `statistics.ts` |
+| Cohen's d vs comparator | `cohensD` | `statistics.ts` |
+| Wilcoxon signed-rank (paired) | `wilcoxonSignedRank` | `statistics.ts` |
+| BH-FDR q-values | `benjaminiHochberg` | `power-analysis.ts` |
+| Paired bootstrap CI on median delta | `pairedBootstrap` | `paired-stats.ts` |
+| Bayesian-bootstrap-style Pr(Δ>0), Pr(Δ∈ROPE) | `bootstrapMeanSamples` | `summary-report.ts` (private) |
+| Minimum detectable paired effect | `pairedMde` | `power-analysis.ts` |
+| Run fingerprint | `hashJson(canonicalize(...))` | `pre-registration.ts` |
+The Pr(Δ>0) and Pr(Δ∈ROPE) summaries use the bootstrap-prior duality of
+[Rubin 1981]: under a non-informative Dirichlet prior, the bootstrap
+distribution of a sample statistic approximates its posterior. The two
+probabilities are posterior summaries of the **mean** delta, whereas the
+bootstrap CI is computed on the **median** delta: the median is more robust
+to the heavy-tailed score distributions seen in agent benchmarks, while the
+mean lets us read off the Bayesian-style probability of superiority in a
+single number.
+## MDE
+The minimum detectable paired effect at N pairs, two-sided α, and power β
+(here β denotes the power itself, i.e. `mdePower`, not the type-II error rate):
+$$d_\text{min} = \frac{z_{1-\alpha/2} + z_\beta}{\sqrt{N}}$$
+This is reported on the standardised scale, then multiplied by the observed
+paired-delta SD to get the MDE in score units. Consumers reading a
+`needs_more_data` verdict can use the MDE to budget the next round of runs:
+- Observed paired SD = 0.10 score units, paired N = 20, α = 0.05, β = 0.8 →
+  d_min ≈ 0.63 standardised → MDE ≈ 0.063 score units. If the smallest
+  effect that would change a launch decision is below this, run more pairs.
+## Provenance
+Every report carries:
+- `runFingerprint`: SHA-256 over the canonicalised list of
+  `(runId, candidateId, splitTag)` triples (sorted by runId), plus the
+  comparator id and split. Same `(runs, comparator, split)` produces the same
+  fingerprint regardless of input order.
+- `preregistrationHash`: the caller passes the hash of a signed
+  `HypothesisManifest` (see `pre-registration.ts`). The fingerprint and the
+  preregistration hash together let a reader verify both *what data the
+  report saw* and *what protocol it was supposed to run.*
+Reports without a `preregistrationHash` carry a "post-hoc" warning in the
+risks list and the executive summary. Treat them as descriptive only.
+## Alternatives considered
+- **Paired t-test instead of Wilcoxon + bootstrap.** Rejected: agent score
+  distributions are heavy-tailed (judges saturate near 0 and 1) and the t
+  approximation breaks down with the small N typical of holdouts.
+- **Unpaired Mann–Whitney.** Rejected: matched scenarios make pairing free,
+  and unpaired tests throw away the variance reduction. Use the paired test
+  by default.
+- **Sequential / always-valid inference (e-values, mSPRT, alpha-spending).**
+  Out of scope for a single-look report. If users iterate, wrap this report
+  in an alpha-spending schedule, or commit to one preregistered look.
+- **Hierarchical Bayesian shrinkage across many candidates.** Future work.
+  The current ranking is on raw paired statistics and over-credits the top
+  candidate when many are tested.
+- **Calibration / coverage simulation on the bootstrap CI.** Future work; we
+  rely on the asymptotic guarantee plus the hard pair floor to keep coverage
+  reasonable.
+## When NOT to apply
+- Paired N below the hard floor (6) on any candidate.
+- Comparator chosen by inspecting the data (post-hoc selection inflates
+  false-discovery rates beyond the BH guarantee).
+- Mid-run distribution shift: judge model swap, rubric change, infrastructure
+  outage. Pair exchangeability is violated and the bootstrap is not valid.
+- Scenarios drawn non-randomly from a stream the candidate can influence
+  (data-leak across runs). The pairing is no longer ignorable.
+- Highly skewed cost distributions: the Pareto frontier still works but the
+  marginal CI on cost may be misleading.
+## Citations
+- Benjamini, Y. & Hochberg, Y. (1995). Controlling the false discovery rate:
+  a practical and powerful approach to multiple testing. *JRSS B*,
+  57(1), 289–300.
+- Wilcoxon, F. (1945). Individual comparisons by ranking methods.
+  *Biometrics Bulletin*, 1(6), 80–83.
+- Efron, B. (1979). Bootstrap methods: another look at the jackknife.
+  *Annals of Statistics*, 7(1), 1–26.
+- Rubin, D. B. (1981). The Bayesian bootstrap.
+  *Annals of Statistics*, 9(1), 130–134.
+- Kruschke, J. K. (2018). Rejecting or accepting parameter values in
+  Bayesian estimation. *Advances in Methods and Practices in
+  Psychological Science*, 1(2), 270–280. (ROPE.)
+- Howard, S. R., Ramdas, A., McAuliffe, J. & Sekhon, J. (2021).
+  Time-uniform, nonparametric, nonasymptotic confidence sequences.
+  *Annals of Statistics*, 49(2), 1055–1080. (Background reading on
+  always-valid inference for sequential extensions.)

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-eval",
-  "version": "0.20.12",
+  "version": "0.21.0",
   "description": "Trace-first evaluation infrastructure for agent systems: traces, harnesses, verifier pipelines, judges, datasets, gates, optimization, and reporting.",
   "homepage": "https://github.com/tangle-network/agent-eval#readme",
   "repository": {
@@ -74,15 +74,6 @@
   "publishConfig": {
     "access": "public"
   },
-  "scripts": {
-    "build": "tsup && pnpm openapi",
-    "dev": "tsup --watch",
-    "prepare": "pnpm build",
-    "test": "vitest run",
-    "test:watch": "vitest",
-    "typecheck": "tsc --noEmit",
-    "openapi": "node dist/cli.js openapi --out dist/openapi.json"
-  },
   "dependencies": {
     "@asteasolutions/zod-to-openapi": "^8.5.0",
     "@ax-llm/ax": "^19.0.25",
@@ -102,5 +93,12 @@
     "node": ">=20"
   },
   "license": "MIT",
-  "packageManager": "pnpm@10.22.0"
-}
+  "scripts": {
+    "build": "tsup && pnpm openapi",
+    "dev": "tsup --watch",
+    "test": "vitest run",
+    "test:watch": "vitest",
+    "typecheck": "tsc --noEmit",
+    "openapi": "node dist/cli.js openapi --out dist/openapi.json"
+  }
+}

package/dist/chunk-75MCTH7P.js.map DELETED Viewed

@@ -1 +0,0 @@

- {"version":3,"sources":["../src/llm-client.ts"],"sourcesContent":["/**\n * LLM client with graceful degrade.\n *\n * OpenAI-compatible `/v1/chat/completions` client with:\n * - Exponential-backoff retry on 429 + 5xx gateway errors (502/503/504).\n * - Retry on transient network errors (fetch failed, AbortError, ECONNRESET).\n * - Graceful json_schema → json_object degrade on 400 with schema-reject body.\n * - Fenced-JSON stripping (```json ... ```) for models that wrap structured output.\n * - Configurable base URL + api key / bearer, works with LiteLLM proxies, OpenAI\n * directly, cli-bridge subscriptions, and any router that speaks the spec.\n *\n * Usage:\n * const { value, result } = await callLlmJson<MyType>(\n * { model: 'gpt-4o', messages: [...], jsonSchema: { name: 'x', schema: {...} } },\n * { baseUrl: 'https://router.tangle.tools/v1', apiKey: process.env.KEY },\n * )\n *\n * This is THE llm-calling seam for agent-eval primitives that need structured\n * output (semantic concept judge, reviewer directives, critic scores). Primitives\n * that need free-form text use `callLlm` and parse output themselves.\n */\n\n// ─── Types ──────────────────────────────────────────────────────────────\n\nexport interface LlmMessage {\n role: 'system' | 'user' | 'assistant'\n /**\n * Either a plain text content string OR a multimodal content array\n * (text + image_url parts) for vision-capable models.\n */\n content:\n | string\n | Array<\n | { type: 'text'; text: string }\n | { type: 'image_url'; image_url: { url: string; detail?: 'auto' | 'low' | 'high' } }\n >\n}\n\nexport interface LlmCallRequest {\n model: string\n messages: LlmMessage[]\n /** Optional JSON-mode response format (response_format: json_object). */\n jsonMode?: boolean\n /** Optional structured output via JSON Schema. Falls back to json_object on 400. */\n jsonSchema?: { name: string; schema: Record<string, unknown> }\n temperature?: number\n maxTokens?: number\n /** Per-call timeout, default 60s. 
*/\n timeoutMs?: number\n}\n\nexport interface LlmUsage {\n promptTokens: number\n completionTokens: number\n totalTokens: number\n /** Proxies populate this when prompt caching is on. */\n cachedPromptTokens?: number\n}\n\nexport interface LlmCallResult {\n /** The text content of the first choice. Empty string if none. */\n content: string\n usage: LlmUsage\n /**\n * Cost in USD. Pulled from proxy's `_response_cost` field when present;\n * `null` when neither the proxy nor the caller can derive it.\n */\n costUsd: number | null\n /** Model name actually used (echoed from response). */\n model: string\n /** Wall-clock duration of the HTTP call (last attempt, if retried). */\n durationMs: number\n /** Raw response body. */\n raw: Record<string, unknown>\n}\n\nexport class LlmCallError extends Error {\n constructor(\n message: string,\n public readonly status: number,\n public readonly body: string,\n public readonly model: string,\n ) {\n super(message)\n this.name = 'LlmCallError'\n }\n}\n\nexport interface LlmClientOptions {\n /** Base URL (without trailing slash). Must end at the `/v1` prefix. */\n baseUrl?: string\n /** Bearer token — either `apiKey` or `bearer` populates `Authorization: Bearer ...`. */\n apiKey?: string\n bearer?: string\n /** Override for the `Authorization` header (e.g. `X-Auth: ...`). Takes precedence over apiKey/bearer. */\n authHeader?: { name: string; value: string }\n /** Default timeout in ms. Per-call can override. */\n defaultTimeoutMs?: number\n /** Max retry attempts on retriable errors. Default 3 (1 initial + 2 retries). */\n maxRetries?: number\n /** Fetch implementation — defaults to global `fetch`. Override for custom transport (e.g. tests). 
*/\n fetch?: typeof fetch\n}\n\n// ─── Internals ──────────────────────────────────────────────────────────\n\nconst DEFAULT_BASE_URL = 'https://router.tangle.tools/v1'\nconst DEFAULT_TIMEOUT_MS = 60_000\nconst DEFAULT_MAX_RETRIES = 3\n\nconst RETRYABLE_STATUS = new Set([429, 502, 503, 504])\n\nfunction isRetryableError(err: unknown): boolean {\n if (err instanceof LlmCallError) return RETRYABLE_STATUS.has(err.status)\n if (err instanceof Error) {\n return (\n err.name === 'AbortError' ||\n err.name === 'TimeoutError' ||\n /fetch failed|ECONNRESET|ETIMEDOUT|EAI_AGAIN/i.test(err.message)\n )\n }\n return false\n}\n\nfunction parseRetryAfter(headers: Headers): number | null {\n const h = headers.get('retry-after')\n if (!h) return null\n const asNumber = Number(h)\n if (Number.isFinite(asNumber) && asNumber > 0) return asNumber * 1000\n const asDate = Date.parse(h)\n if (Number.isFinite(asDate)) return Math.max(0, asDate - Date.now())\n return null\n}\n\nfunction backoffMs(attempt: number): number {\n // 500ms, 1s, 2s, 4s, ...\n return Math.min(500 * Math.pow(2, attempt), 16_000)\n}\n\nfunction buildHeaders(opts: LlmClientOptions): Record<string, string> {\n const headers: Record<string, string> = {\n 'Content-Type': 'application/json',\n Accept: 'application/json',\n }\n if (opts.authHeader) {\n headers[opts.authHeader.name] = opts.authHeader.value\n } else if (opts.bearer || opts.apiKey) {\n headers.Authorization = `Bearer ${opts.bearer ?? 
opts.apiKey}`\n }\n return headers\n}\n\nfunction isSchemaRejection(status: number, body: string): boolean {\n if (status !== 400) return false\n const lower = body.toLowerCase()\n return (\n lower.includes('response_format') ||\n lower.includes('json_schema') ||\n lower.includes('is unavailable') ||\n lower.includes('not supported')\n )\n}\n\nfunction buildBody(req: LlmCallRequest, forceJsonObject: boolean): Record<string, unknown> {\n const body: Record<string, unknown> = {\n model: req.model,\n messages: req.messages,\n temperature: req.temperature ?? 0,\n }\n if (req.maxTokens != null) {\n if (usesMaxCompletionTokens(req.model)) body.max_completion_tokens = req.maxTokens\n else body.max_tokens = req.maxTokens\n }\n\n if (req.jsonSchema && !forceJsonObject) {\n body.response_format = {\n type: 'json_schema',\n json_schema: { name: req.jsonSchema.name, schema: req.jsonSchema.schema, strict: true },\n }\n } else if (req.jsonMode || req.jsonSchema) {\n body.response_format = { type: 'json_object' }\n }\n\n return body\n}\n\nfunction usesMaxCompletionTokens(model: string): boolean {\n return /^gpt-5(?:[.\\-]|$)/i.test(model)\n}\n\nasync function sleep(ms: number): Promise<void> {\n return new Promise((resolve) => setTimeout(resolve, ms))\n}\n\n// ─── Public API ─────────────────────────────────────────────────────────\n\n/**\n * Strip a ```json / ``` code fence if the model emitted one.\n * Idempotent for naked JSON. Some models (claude-code via router, certain\n * deepseek models) wrap output even under json_object.\n */\nexport function stripFencedJson(raw: string): string {\n const trimmed = raw.trim()\n const m = trimmed.match(/^```(?:json)?\\s*\\n?([\\s\\S]*?)\\n?```\\s*$/)\n return m ? 
m[1]!.trim() : trimmed\n}\n\nexport function extractJsonPayload(raw: string): string {\n const stripped = stripFencedJson(raw)\n try {\n JSON.parse(stripped)\n return stripped\n } catch {\n // Continue with balanced extraction below.\n }\n\n const starts = [...stripped.matchAll(/[\\[{]/g)].map((match) => match.index).filter((index) => index != null)\n for (const start of starts) {\n const candidate = extractBalancedJson(stripped, start)\n if (!candidate) continue\n try {\n JSON.parse(candidate)\n return candidate\n } catch {\n // Keep scanning; earlier braces may belong to prose.\n }\n }\n\n return stripped\n}\n\nfunction extractBalancedJson(input: string, start: number): string | null {\n const opener = input[start]\n const closer = opener === '{' ? '}' : opener === '[' ? ']' : null\n if (!closer) return null\n\n const stack: string[] = [closer]\n let isInString = false\n let isEscaped = false\n\n for (let i = start + 1; i < input.length; i++) {\n const char = input[i]!\n if (isEscaped) {\n isEscaped = false\n continue\n }\n if (char === '\\\\') {\n isEscaped = isInString\n continue\n }\n if (char === '\"') {\n isInString = !isInString\n continue\n }\n if (isInString) continue\n\n if (char === '{') stack.push('}')\n else if (char === '[') stack.push(']')\n else if (char === stack[stack.length - 1]) {\n stack.pop()\n if (stack.length === 0) return input.slice(start, i + 1)\n }\n }\n\n return null\n}\n\n/**\n * Low-level call. Returns raw content + usage + cost. Retries on transient\n * failures; does NOT degrade schema here — callers that want graceful\n * degrade use `callLlmJson`.\n */\nexport async function callLlm(\n req: LlmCallRequest,\n opts: LlmClientOptions = {},\n): Promise<LlmCallResult> {\n const baseUrl = (opts.baseUrl ?? DEFAULT_BASE_URL).replace(/\\/+$/, '')\n const url = `${baseUrl}/chat/completions`\n const timeoutMs = req.timeoutMs ?? opts.defaultTimeoutMs ?? DEFAULT_TIMEOUT_MS\n const maxRetries = opts.maxRetries ?? 
DEFAULT_MAX_RETRIES\n const fetchFn = opts.fetch ?? globalThis.fetch\n const headers = buildHeaders(opts)\n\n let lastErr: unknown\n for (let attempt = 0; attempt < maxRetries; attempt++) {\n const controller = new AbortController()\n const timeoutHandle = setTimeout(() => controller.abort(), timeoutMs)\n const started = Date.now()\n\n try {\n const res = await fetchFn(url, {\n method: 'POST',\n headers,\n body: JSON.stringify(buildBody(req, false)),\n signal: controller.signal,\n })\n clearTimeout(timeoutHandle)\n\n if (!res.ok) {\n const body = await res.text()\n const err = new LlmCallError(\n `LLM call ${res.status}: ${body.slice(0, 300)}`,\n res.status,\n body,\n req.model,\n )\n if (RETRYABLE_STATUS.has(res.status) && attempt < maxRetries - 1) {\n lastErr = err\n const retryAfter = parseRetryAfter(res.headers)\n await sleep(retryAfter ?? backoffMs(attempt))\n continue\n }\n throw err\n }\n\n const json = (await res.json()) as Record<string, unknown>\n const choice = (json.choices as Array<{ message?: { content?: string } }> | undefined)?.[0]\n const usageRaw = (json.usage as Record<string, unknown> | undefined) ?? {}\n const costFromProxy = (json._response_cost ?? json.cost_usd) as number | undefined\n\n return {\n content: choice?.message?.content ?? '',\n usage: {\n promptTokens: Number(usageRaw.prompt_tokens ?? 0),\n completionTokens: Number(usageRaw.completion_tokens ?? 0),\n totalTokens: Number(usageRaw.total_tokens ?? 0),\n cachedPromptTokens:\n usageRaw.prompt_tokens_details &&\n typeof usageRaw.prompt_tokens_details === 'object'\n ? Number(\n (usageRaw.prompt_tokens_details as Record<string, unknown>).cached_tokens ?? 0,\n )\n : undefined,\n },\n costUsd: typeof costFromProxy === 'number' ? costFromProxy : null,\n model: (json.model as string) ?? 
req.model,\n durationMs: Date.now() - started,\n raw: json,\n }\n } catch (err) {\n clearTimeout(timeoutHandle)\n lastErr = err\n if (attempt < maxRetries - 1 && isRetryableError(err)) {\n await sleep(backoffMs(attempt))\n continue\n }\n throw err\n }\n }\n throw lastErr instanceof Error ? lastErr : new Error(String(lastErr))\n}\n\n/**\n * Structured-output call. Returns parsed JSON plus the raw result envelope.\n * Degrades `jsonSchema` → `jsonMode` on a 400 that names the schema param —\n * critical for deepseek-v3/v4, kimi-k2.6, and other models that don't accept\n * the `response_format.json_schema` shape but DO accept `json_object`.\n */\nexport async function callLlmJson<T = unknown>(\n req: LlmCallRequest,\n opts: LlmClientOptions = {},\n): Promise<{ value: T; result: LlmCallResult }> {\n try {\n const result = await callLlm({ ...req, jsonMode: req.jsonMode ?? !req.jsonSchema }, opts)\n const value = parseJsonSafely<T>(result.content, result.model)\n return { value, result }\n } catch (err) {\n if (err instanceof LlmCallError && isSchemaRejection(err.status, err.body) && req.jsonSchema) {\n // Degrade to json_object + retry.\n const degradedReq: LlmCallRequest = { ...req, jsonMode: true, jsonSchema: undefined }\n const result = await callLlm(degradedReq, opts)\n const value = parseJsonSafely<T>(result.content, result.model)\n return { value, result }\n }\n throw err\n }\n}\n\nfunction parseJsonSafely<T>(content: string, model: string): T {\n const stripped = extractJsonPayload(content)\n try {\n return JSON.parse(stripped) as T\n } catch (err) {\n throw new Error(\n `LLM returned non-JSON content (model=${model}): ${\n err instanceof Error ? err.message : String(err)\n }\\n--- raw content ---\\n${content.slice(0, 800)}`,\n )\n }\n}\n\n/**\n * Probe whether a model is reachable. Returns latency + null error on\n * success; `ok=false` + error message on any failure (HTTP, timeout,\n * network, parse). 
Designed for sweep preflights — fail loud at the\n * boundary before burning a 30-leaf run on a misconfigured router.\n *\n * Sends a tiny `ping` message with `maxTokens=64`. Reasoning models\n * (glm-5.1, deepseek-v4) can burn the entire budget on internal reasoning\n * for short prompts, so don't tighten this further. We don't validate\n * content; HTTP 200 means reachable.\n */\nexport async function probeLlm(\n model: string,\n opts: LlmClientOptions & { timeoutMs?: number } = {},\n): Promise<{ ok: boolean; latencyMs: number; error: string | null }> {\n const start = Date.now()\n try {\n await callLlm(\n {\n model,\n messages: [{ role: 'user', content: 'ping' }],\n maxTokens: 64,\n timeoutMs: opts.timeoutMs ?? 30_000,\n },\n opts,\n )\n return { ok: true, latencyMs: Date.now() - start, error: null }\n } catch (err) {\n return {\n ok: false,\n latencyMs: Date.now() - start,\n error: err instanceof Error ? err.message : String(err),\n }\n }\n}\n\n/**\n * Stateful client — construct once with defaults, call many times.\n * Thin wrapper around the free functions; exists for callers that want\n * to inject a single configured instance into multiple primitives.\n */\nexport class LlmClient {\n constructor(private readonly opts: LlmClientOptions = {}) {}\n\n call(req: LlmCallRequest, per?: LlmClientOptions): Promise<LlmCallResult> {\n return callLlm(req, { ...this.opts, ...per })\n }\n\n callJson<T = unknown>(\n req: LlmCallRequest,\n per?: LlmClientOptions,\n ): Promise<{ value: T; result: LlmCallResult }> {\n return callLlmJson<T>(req, { ...this.opts, ...per })\n 
}\n}\n"],"mappings":";AA4EO,IAAM,eAAN,cAA2B,MAAM;AAAA,EACtC,YACE,SACgB,QACA,MACA,OAChB;AACA,UAAM,OAAO;AAJG;AACA;AACA;AAGhB,SAAK,OAAO;AAAA,EACd;AAAA,EANkB;AAAA,EACA;AAAA,EACA;AAKpB;AAoBA,IAAM,mBAAmB;AACzB,IAAM,qBAAqB;AAC3B,IAAM,sBAAsB;AAE5B,IAAM,mBAAmB,oBAAI,IAAI,CAAC,KAAK,KAAK,KAAK,GAAG,CAAC;AAErD,SAAS,iBAAiB,KAAuB;AAC/C,MAAI,eAAe,aAAc,QAAO,iBAAiB,IAAI,IAAI,MAAM;AACvE,MAAI,eAAe,OAAO;AACxB,WACE,IAAI,SAAS,gBACb,IAAI,SAAS,kBACb,+CAA+C,KAAK,IAAI,OAAO;AAAA,EAEnE;AACA,SAAO;AACT;AAEA,SAAS,gBAAgB,SAAiC;AACxD,QAAM,IAAI,QAAQ,IAAI,aAAa;AACnC,MAAI,CAAC,EAAG,QAAO;AACf,QAAM,WAAW,OAAO,CAAC;AACzB,MAAI,OAAO,SAAS,QAAQ,KAAK,WAAW,EAAG,QAAO,WAAW;AACjE,QAAM,SAAS,KAAK,MAAM,CAAC;AAC3B,MAAI,OAAO,SAAS,MAAM,EAAG,QAAO,KAAK,IAAI,GAAG,SAAS,KAAK,IAAI,CAAC;AACnE,SAAO;AACT;AAEA,SAAS,UAAU,SAAyB;AAE1C,SAAO,KAAK,IAAI,MAAM,KAAK,IAAI,GAAG,OAAO,GAAG,IAAM;AACpD;AAEA,SAAS,aAAa,MAAgD;AACpE,QAAM,UAAkC;AAAA,IACtC,gBAAgB;AAAA,IAChB,QAAQ;AAAA,EACV;AACA,MAAI,KAAK,YAAY;AACnB,YAAQ,KAAK,WAAW,IAAI,IAAI,KAAK,WAAW;AAAA,EAClD,WAAW,KAAK,UAAU,KAAK,QAAQ;AACrC,YAAQ,gBAAgB,UAAU,KAAK,UAAU,KAAK,MAAM;AAAA,EAC9D;AACA,SAAO;AACT;AAEA,SAAS,kBAAkB,QAAgB,MAAuB;AAChE,MAAI,WAAW,IAAK,QAAO;AAC3B,QAAM,QAAQ,KAAK,YAAY;AAC/B,SACE,MAAM,SAAS,iBAAiB,KAChC,MAAM,SAAS,aAAa,KAC5B,MAAM,SAAS,gBAAgB,KAC/B,MAAM,SAAS,eAAe;AAElC;AAEA,SAAS,UAAU,KAAqB,iBAAmD;AACzF,QAAM,OAAgC;AAAA,IACpC,OAAO,IAAI;AAAA,IACX,UAAU,IAAI;AAAA,IACd,aAAa,IAAI,eAAe;AAAA,EAClC;AACA,MAAI,IAAI,aAAa,MAAM;AACzB,QAAI,wBAAwB,IAAI,KAAK,EAAG,MAAK,wBAAwB,IAAI;AAAA,QACpE,MAAK,aAAa,IAAI;AAAA,EAC7B;AAEA,MAAI,IAAI,cAAc,CAAC,iBAAiB;AACtC,SAAK,kBAAkB;AAAA,MACrB,MAAM;AAAA,MACN,aAAa,EAAE,MAAM,IAAI,WAAW,MAAM,QAAQ,IAAI,WAAW,QAAQ,QAAQ,KAAK;AAAA,IACxF;AAAA,EACF,WAAW,IAAI,YAAY,IAAI,YAAY;AACzC,SAAK,kBAAkB,EAAE,MAAM,cAAc;AAAA,EAC/C;AAEA,SAAO;AACT;AAEA,SAAS,wBAAwB,OAAwB;AACvD,SAAO,qBAAqB,KAAK,KAAK;AACxC;AAEA,eAAe,MAAM,IAA2B;AAC9C,SAAO,IAAI,QAAQ,CAAC,YAAY,WAAW,SAAS,EAAE,CAAC;AACzD;AASO,SAAS,gBAAgB,KAAqB;AACnD,QAAM,UAAU,IAAI,KAAK;AACzB,QAAM,IAAI,QAAQ,MAAM,yCAAyC;AACjE,SAAO,IAAI,EAAE,CAAC,EAAG,KAAK,IAAI;AAC5B
;AAEO,SAAS,mBAAmB,KAAqB;AACtD,QAAM,WAAW,gBAAgB,GAAG;AACpC,MAAI;AACF,SAAK,MAAM,QAAQ;AACnB,WAAO;AAAA,EACT,QAAQ;AAAA,EAER;AAEA,QAAM,SAAS,CAAC,GAAG,SAAS,SAAS,QAAQ,CAAC,EAAE,IAAI,CAAC,UAAU,MAAM,KAAK,EAAE,OAAO,CAAC,UAAU,SAAS,IAAI;AAC3G,aAAW,SAAS,QAAQ;AAC1B,UAAM,YAAY,oBAAoB,UAAU,KAAK;AACrD,QAAI,CAAC,UAAW;AAChB,QAAI;AACF,WAAK,MAAM,SAAS;AACpB,aAAO;AAAA,IACT,QAAQ;AAAA,IAER;AAAA,EACF;AAEA,SAAO;AACT;AAEA,SAAS,oBAAoB,OAAe,OAA8B;AACxE,QAAM,SAAS,MAAM,KAAK;AAC1B,QAAM,SAAS,WAAW,MAAM,MAAM,WAAW,MAAM,MAAM;AAC7D,MAAI,CAAC,OAAQ,QAAO;AAEpB,QAAM,QAAkB,CAAC,MAAM;AAC/B,MAAI,aAAa;AACjB,MAAI,YAAY;AAEhB,WAAS,IAAI,QAAQ,GAAG,IAAI,MAAM,QAAQ,KAAK;AAC7C,UAAM,OAAO,MAAM,CAAC;AACpB,QAAI,WAAW;AACb,kBAAY;AACZ;AAAA,IACF;AACA,QAAI,SAAS,MAAM;AACjB,kBAAY;AACZ;AAAA,IACF;AACA,QAAI,SAAS,KAAK;AAChB,mBAAa,CAAC;AACd;AAAA,IACF;AACA,QAAI,WAAY;AAEhB,QAAI,SAAS,IAAK,OAAM,KAAK,GAAG;AAAA,aACvB,SAAS,IAAK,OAAM,KAAK,GAAG;AAAA,aAC5B,SAAS,MAAM,MAAM,SAAS,CAAC,GAAG;AACzC,YAAM,IAAI;AACV,UAAI,MAAM,WAAW,EAAG,QAAO,MAAM,MAAM,OAAO,IAAI,CAAC;AAAA,IACzD;AAAA,EACF;AAEA,SAAO;AACT;AAOA,eAAsB,QACpB,KACA,OAAyB,CAAC,GACF;AACxB,QAAM,WAAW,KAAK,WAAW,kBAAkB,QAAQ,QAAQ,EAAE;AACrE,QAAM,MAAM,GAAG,OAAO;AACtB,QAAM,YAAY,IAAI,aAAa,KAAK,oBAAoB;AAC5D,QAAM,aAAa,KAAK,cAAc;AACtC,QAAM,UAAU,KAAK,SAAS,WAAW;AACzC,QAAM,UAAU,aAAa,IAAI;AAEjC,MAAI;AACJ,WAAS,UAAU,GAAG,UAAU,YAAY,WAAW;AACrD,UAAM,aAAa,IAAI,gBAAgB;AACvC,UAAM,gBAAgB,WAAW,MAAM,WAAW,MAAM,GAAG,SAAS;AACpE,UAAM,UAAU,KAAK,IAAI;AAEzB,QAAI;AACF,YAAM,MAAM,MAAM,QAAQ,KAAK;AAAA,QAC7B,QAAQ;AAAA,QACR;AAAA,QACA,MAAM,KAAK,UAAU,UAAU,KAAK,KAAK,CAAC;AAAA,QAC1C,QAAQ,WAAW;AAAA,MACrB,CAAC;AACD,mBAAa,aAAa;AAE1B,UAAI,CAAC,IAAI,IAAI;AACX,cAAM,OAAO,MAAM,IAAI,KAAK;AAC5B,cAAM,MAAM,IAAI;AAAA,UACd,YAAY,IAAI,MAAM,KAAK,KAAK,MAAM,GAAG,GAAG,CAAC;AAAA,UAC7C,IAAI;AAAA,UACJ;AAAA,UACA,IAAI;AAAA,QACN;AACA,YAAI,iBAAiB,IAAI,IAAI,MAAM,KAAK,UAAU,aAAa,GAAG;AAChE,oBAAU;AACV,gBAAM,aAAa,gBAAgB,IAAI,OAAO;AAC9C,gBAAM,MAAM,cAAc,UAAU,OAAO,CAAC;AAC5C;AAAA,QACF;AACA,cAAM;AAAA,MACR;AAEA,YAAM,OAAQ,MAAM,IAAI,KAAK;AAC7B,YAAM,SAAU,KAAK,UAAoE,CAAC;AAC1F,YAAM,WAAY
,KAAK,SAAiD,CAAC;AACzE,YAAM,gBAAiB,KAAK,kBAAkB,KAAK;AAEnD,aAAO;AAAA,QACL,SAAS,QAAQ,SAAS,WAAW;AAAA,QACrC,OAAO;AAAA,UACL,cAAc,OAAO,SAAS,iBAAiB,CAAC;AAAA,UAChD,kBAAkB,OAAO,SAAS,qBAAqB,CAAC;AAAA,UACxD,aAAa,OAAO,SAAS,gBAAgB,CAAC;AAAA,UAC9C,oBACE,SAAS,yBACT,OAAO,SAAS,0BAA0B,WACtC;AAAA,YACG,SAAS,sBAAkD,iBAAiB;AAAA,UAC/E,IACA;AAAA,QACR;AAAA,QACA,SAAS,OAAO,kBAAkB,WAAW,gBAAgB;AAAA,QAC7D,OAAQ,KAAK,SAAoB,IAAI;AAAA,QACrC,YAAY,KAAK,IAAI,IAAI;AAAA,QACzB,KAAK;AAAA,MACP;AAAA,IACF,SAAS,KAAK;AACZ,mBAAa,aAAa;AAC1B,gBAAU;AACV,UAAI,UAAU,aAAa,KAAK,iBAAiB,GAAG,GAAG;AACrD,cAAM,MAAM,UAAU,OAAO,CAAC;AAC9B;AAAA,MACF;AACA,YAAM;AAAA,IACR;AAAA,EACF;AACA,QAAM,mBAAmB,QAAQ,UAAU,IAAI,MAAM,OAAO,OAAO,CAAC;AACtE;AAQA,eAAsB,YACpB,KACA,OAAyB,CAAC,GACoB;AAC9C,MAAI;AACF,UAAM,SAAS,MAAM,QAAQ,EAAE,GAAG,KAAK,UAAU,IAAI,YAAY,CAAC,IAAI,WAAW,GAAG,IAAI;AACxF,UAAM,QAAQ,gBAAmB,OAAO,SAAS,OAAO,KAAK;AAC7D,WAAO,EAAE,OAAO,OAAO;AAAA,EACzB,SAAS,KAAK;AACZ,QAAI,eAAe,gBAAgB,kBAAkB,IAAI,QAAQ,IAAI,IAAI,KAAK,IAAI,YAAY;AAE5F,YAAM,cAA8B,EAAE,GAAG,KAAK,UAAU,MAAM,YAAY,OAAU;AACpF,YAAM,SAAS,MAAM,QAAQ,aAAa,IAAI;AAC9C,YAAM,QAAQ,gBAAmB,OAAO,SAAS,OAAO,KAAK;AAC7D,aAAO,EAAE,OAAO,OAAO;AAAA,IACzB;AACA,UAAM;AAAA,EACR;AACF;AAEA,SAAS,gBAAmB,SAAiB,OAAkB;AAC7D,QAAM,WAAW,mBAAmB,OAAO;AAC3C,MAAI;AACF,WAAO,KAAK,MAAM,QAAQ;AAAA,EAC5B,SAAS,KAAK;AACZ,UAAM,IAAI;AAAA,MACR,wCAAwC,KAAK,MAC3C,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CACjD;AAAA;AAAA,EAA0B,QAAQ,MAAM,GAAG,GAAG,CAAC;AAAA,IACjD;AAAA,EACF;AACF;AAaA,eAAsB,SACpB,OACA,OAAkD,CAAC,GACgB;AACnE,QAAM,QAAQ,KAAK,IAAI;AACvB,MAAI;AACF,UAAM;AAAA,MACJ;AAAA,QACE;AAAA,QACA,UAAU,CAAC,EAAE,MAAM,QAAQ,SAAS,OAAO,CAAC;AAAA,QAC5C,WAAW;AAAA,QACX,WAAW,KAAK,aAAa;AAAA,MAC/B;AAAA,MACA;AAAA,IACF;AACA,WAAO,EAAE,IAAI,MAAM,WAAW,KAAK,IAAI,IAAI,OAAO,OAAO,KAAK;AAAA,EAChE,SAAS,KAAK;AACZ,WAAO;AAAA,MACL,IAAI;AAAA,MACJ,WAAW,KAAK,IAAI,IAAI;AAAA,MACxB,OAAO,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAAA,IACxD;AAAA,EACF;AACF;AAOO,IAAM,YAAN,MAAgB;AAAA,EACrB,YAA6B,OAAyB,CAAC,GAAG;AAA7B;AAAA,EAA8B;AAAA,EAA9B;AAAA,EAE7B,KAAK,KAAqB,KAAgD;AACxE,WAAO,QAAQ,KAAK,EAAE,GAAG,KAA
K,MAAM,GAAG,IAAI,CAAC;AAAA,EAC9C;AAAA,EAEA,SACE,KACA,KAC8C;AAC9C,WAAO,YAAe,KAAK,EAAE,GAAG,KAAK,MAAM,GAAG,IAAI,CAAC;AAAA,EACrD;AACF;","names":[]}