npm - @tangle-network/agent-eval - Versions diffs - 0.21.0 → 0.22.0 - Mend

@tangle-network/agent-eval 0.21.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45)

package/CHANGELOG.md +102 -1
package/README.md +4 -0
package/dist/{chunk-WOK2RTWG.js → chunk-4W4NCYM2.js} +134 -109
package/dist/chunk-4W4NCYM2.js.map +1 -0
package/dist/{chunk-WOPGKVN4.js → chunk-6KQG5HAH.js} +2 -2
package/dist/chunk-6M774GY6.js +53 -0
package/dist/chunk-6M774GY6.js.map +1 -0
package/dist/{chunk-3IX6QTB7.js → chunk-IOXMGMHQ.js} +418 -541
package/dist/chunk-IOXMGMHQ.js.map +1 -0
package/dist/{chunk-3GN6U53I.js → chunk-KAO3Q65R.js} +2 -2
package/dist/chunk-QUKKGHTZ.js +121 -0
package/dist/chunk-QUKKGHTZ.js.map +1 -0
package/dist/{chunk-SNUHRBDL.js → chunk-SQQLHODJ.js} +10 -1
package/dist/{chunk-SNUHRBDL.js.map → chunk-SQQLHODJ.js.map} +1 -1
package/dist/chunk-UAND2LOT.js +738 -0
package/dist/chunk-UAND2LOT.js.map +1 -0
package/dist/{chunk-HRZELXCR.js → chunk-USHQBPMH.js} +283 -7
package/dist/chunk-USHQBPMH.js.map +1 -0
package/dist/cli.js +3 -3
package/dist/index.d.ts +10 -284
package/dist/index.js +39 -19
package/dist/index.js.map +1 -1
package/dist/integrity-K2oVlF57.d.ts +210 -0
package/dist/openapi.json +1 -1
package/dist/optimization-UVDNKaO6.d.ts +574 -0
package/dist/optimization.d.ts +6 -144
package/dist/optimization.js +9 -2
package/dist/reporting-B82RSv9C.d.ts +593 -0
package/dist/reporting.d.ts +2 -2
package/dist/reporting.js +15 -8
package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-D4p7RlDu.d.ts} +381 -1
package/dist/traces.d.ts +101 -181
package/dist/traces.js +16 -5
package/dist/wire/index.js +3 -3
package/docs/research-report-methodology.md +19 -4
package/docs/wire-protocol.md +1 -1
package/package.json +2 -2
package/dist/chunk-3IX6QTB7.js.map +0 -1
package/dist/chunk-HRZELXCR.js.map +0 -1
package/dist/chunk-KRR4VMH7.js +0 -423
package/dist/chunk-KRR4VMH7.js.map +0 -1
package/dist/chunk-WOK2RTWG.js.map +0 -1
package/dist/reporting-Da2ihlcM.d.ts +0 -672
package/dist/{chunk-WOPGKVN4.js.map → chunk-6KQG5HAH.js.map} +0 -0
package/dist/{chunk-3GN6U53I.js.map → chunk-KAO3Q65R.js.map} +0 -0

package/dist/traces.d.ts CHANGED Viewed

@@ -2,6 +2,8 @@ import { T as TraceStore, L as LlmSpan, J as JudgeSpan, a as Run, F as FailureCl
 export { A as Artifact, B as BudgetLedgerEntry, g as BudgetSpec, i as EventFilter, E as EventKind, j as FAILURE_CLASSES, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, M as Message, d as RetrievalSpan, h as RunFilter, m as RunLayer, R as RunOutcome, n as RunStatus, e as SandboxSpan, S as Span, o as SpanBase, p as SpanFilter, b as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, f as TraceEvent, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-u47QaJ9G.js';
 import { a as RunCompleteHookContext, R as RunCompleteHook } from './emitter-B2XqDKFU.js';
 export { S as SpanHandle, T as TraceEmitter, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-B2XqDKFU.js';
+import { d as RawProviderSink, c as RawProviderEvent } from './integrity-K2oVlF57.js';
+export { F as FileSystemRawProviderSink, a as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, b as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, R as RawProviderDirection, e as RawProviderSinkFilter, f as RunIntegrityError, g as RunIntegrityExpectations, h as RunIntegrityIssue, i as RunIntegrityIssueCode, j as RunIntegrityReport, k as assertRunCaptured, l as defaultProviderRedactor, p as providerFromBaseUrl, t as throwIfRunIncomplete } from './integrity-K2oVlF57.js';
 import { AxAIService, AxFunction } from '@ax-llm/ax';
 /**
@@ -133,204 +135,122 @@ interface OtlpExport {
 declare function exportRunAsOtlp(store: TraceStore, runId: string, resourceAttrs?: Record<string, string | number | boolean>): Promise<OtlpExport>;
 /**
- * RawProviderSink — first-class persistence for the actual HTTP-level
- * request/response bodies of every LLM provider call.
+ * Replay-from-raw-events — turn every captured campaign run into a
+ * re-runnable artifact.
  *
- * Why this is a separate sink from the structured `LlmSpan`:
+ * The premise: 0.21 made `RawProviderSink` capture every provider HTTP
+ * envelope. 0.22's `runEvalCampaign` makes capture the default. Together
+ * they mean every past run is a complete fingerprint of what happened on
+ * the wire — and that fingerprint is enough to replay the run without
+ * burning new LLM cost.
  *
- *   - `LlmSpan` records the *intent* — model name, messages, output text,
- *     usage. It's what dashboards read; it's NOT enough for forensics.
- *   - When a downstream consumer reports "the verifier used the wrong route"
- *     or "tokens look right but reasoning was missing," the only way to
- *     answer is the raw HTTP body. Span fields can lie (a proxy can echo
- *     a different `model` value than what actually answered); the raw
- *     response is ground truth.
+ * Three use cases this primitive enables:
  *
- * Default behaviour: opt-in. Pass `rawSink` to `LlmClientOptions` (or the
- * matrix runner / BuilderSession sets it up automatically) and every
- * request, response, and error is recorded — including retries, with the
- * attempt index attached so a flaky call's full event chain is recoverable.
+ *   1. **Post-hoc judging** — apply a new judge / rubric / scoring callback
+ *      to last week's runs without re-calling any LLM. The cost of trying
+ *      a new rubric drops from "another full sweep" to a CPU-bound replay.
+ *   2. **Determinism audits** — replay the same campaign and verify the
+ *      raw responses match byte-for-byte. Any drift is a non-determinism
+ *      bug (in the harness, the prompt builder, the sandbox, …).
+ *   3. **Free judge calibration** — run two judges on identical responses
+ *      and measure inter-judge agreement without doubling LLM spend.
  *
- * Redaction is enforced at sink time. The default redactor strips
- * `Authorization`, `X-Api-Key`, `X-Auth-Token`, `Cookie` headers and any
- * payload field whose key matches `apiKey | api_key | bearer | password |
- * secret | token` (case-insensitive). Override via the sink constructor or
- * the per-call `redactor`. The `redactedFields` array on the persisted
- * event lets a reviewer see what was stripped without exposing the values.
+ * The interface is deliberately fetch-shaped. Inject `createReplayFetch`
+ * into `LlmClientOptions.fetch` and every `callLlm` transparently reads
+ * from the cache instead of calling the network. No new code path through
+ * the LLM client is needed; the cache hit is invisible to the runner.
  */
-type RawProviderDirection = 'request' | 'response' | 'error';
-interface RawProviderEvent {
-    /** Stable id. Generated by the sink if omitted. */
-    eventId: string;
-    /** Trace context populated by `LlmClient` when the call is wrapped in a span. */
-    runId?: string;
-    spanId?: string;
-    /**
-     * Logical provider name. Free-form so callers can use whatever id matches
-     * their topology (`'openai'`, `'anthropic'`, `'tangle-router'`, …). When
-     * omitted, derived from `baseUrl` in `LlmClientOptions`.
-     */
-    provider: string;
-    model: string;
-    /** Endpoint path, e.g. `'/v1/chat/completions'`. */
-    endpoint: string;
-    /** Base URL used for the call (already-normalised — no trailing slash). */
-    baseUrl: string;
-    /** 0-indexed retry attempt. The first attempt is 0; a retried call gets 1, 2, … */
-    attemptIndex: number;
-    direction: RawProviderDirection;
-    /** Unix ms. */
-    timestamp: number;
-    /** Wall-clock duration of the call leg. Set on `response` and `error` events; null on `request`. */
-    durationMs?: number;
-    statusCode?: number;
-    requestHeaders?: Record<string, string>;
-    requestBody?: unknown;
-    responseHeaders?: Record<string, string>;
-    responseBody?: unknown;
-    /** Set on `direction: 'error'` events. */
-    errorMessage?: string;
-    /** Field paths the redactor stripped from this event ('header:Authorization', 'body.apiKey', …). */
-    redactedFields: string[];
-}
-interface RawProviderSinkFilter {
-    runId?: string;
-    spanId?: string;
-    direction?: RawProviderDirection;
-    attemptIndex?: number;
+declare class ReplayCacheMissError extends Error {
+    readonly url: string;
+    readonly requestKey: string;
+    constructor(url: string, requestKey: string, message?: string);
 }
-interface RawProviderSink {
-    record(event: RawProviderEvent): Promise<void>;
-    /** Optional listing — implementations that durably persist (file, db) should support this. */
-    list?(filter?: RawProviderSinkFilter): Promise<RawProviderEvent[]>;
-    /** Optional teardown for backed implementations. */
-    close?(): Promise<void>;
+interface ReplayCacheEntry {
+    request: RawProviderEvent;
+    response: RawProviderEvent;
 }
-type ProviderRedactor = (event: RawProviderEvent) => RawProviderEvent;
-/**
- * Default redactor — strips well-known auth headers and any body field whose
- * key matches the credential pattern. Records every redacted path on
- * `event.redactedFields` so a downstream reviewer can see what was removed.
- */
-declare function defaultProviderRedactor(event: RawProviderEvent): RawProviderEvent;
-interface InMemoryRawProviderSinkOptions {
-    redactor?: ProviderRedactor;
-}
-declare class InMemoryRawProviderSink implements RawProviderSink {
-    private events;
-    private redactor;
-    constructor(opts?: InMemoryRawProviderSinkOptions);
-    record(event: RawProviderEvent): Promise<void>;
-    list(filter?: RawProviderSinkFilter): Promise<RawProviderEvent[]>;
-    size(): number;
-}
-declare class NoopRawProviderSink implements RawProviderSink {
-    record(): Promise<void>;
-}
-interface FileSystemRawProviderSinkOptions {
-    /** Directory the NDJSON file is written into. Created if missing. */
-    dir: string;
-    /** File name; default `'raw-provider-events.ndjson'`. */
-    fileName?: string;
-    /** Bytes after which the writer rolls over to a new file (default 32 MiB). */
-    rollAtBytes?: number;
-    redactor?: ProviderRedactor;
-}
-declare class FileSystemRawProviderSink implements RawProviderSink {
-    private dir;
-    private fileName;
-    private rollAtBytes;
-    private redactor;
-    private bytesWritten;
-    private rollIndex;
-    private initPromise;
-    constructor(opts: FileSystemRawProviderSinkOptions);
-    private ensureInit;
-    private currentPath;
-    record(event: RawProviderEvent): Promise<void>;
-    list(filter?: RawProviderSinkFilter): Promise<RawProviderEvent[]>;
+interface ReplayCacheStats {
+    total: number;
+    byProvider: Record<string, number>;
+    byModel: Record<string, number>;
+    /** Spans for which we have a request but no response (run aborted mid-call). */
+    orphanRequests: number;
 }
 /**
- * Best-effort provider id from a base URL. Falls back to the URL host when
- * none of the well-known patterns match.
- */
-declare function providerFromBaseUrl(baseUrl: string): string;
-/**
- * Run-completion integrity check — at end of run, verify the expected event
- * types were actually captured. The point is the launch-review failure mode:
- * a run *appears* successful but the raw provider events were never written,
- * so a downstream reviewer can't reconstruct what happened.
+ * In-memory deterministic cache of (request → response) keyed on a stable
+ * hash of the request body. Built from a `RawProviderSink` containing
+ * paired `request` and `response` events from a previous run.
  *
- * Pattern:
- *
- *   const report = await assertRunCaptured(store, runId, {
- *     llmSpansMin: 1,
- *     judgeSpansMin: 1,
- *     rawSink: providerSink,                  // must have ≥ 1 event for this run
- *     requireRawCoverageOfLlmSpans: true,     // every llm span has matching raw events
- *   })
- *   if (!report.ok) throwIfRunIncomplete(report)  // or mark run failed and continue
- *
- * The function is read-only on the store and returns a structured report;
- * the caller chooses the failure mode (throw, mark run failed, log warning).
- * `throwIfRunIncomplete` is the convenient strict mode.
+ * The cache is the source of truth for replay; `createReplayFetch` is a
+ * thin wrapper that reads from it.
  */
-interface RunIntegrityExpectations {
-    /** Minimum LLM span count. Default 0 (no requirement). */
-    llmSpansMin?: number;
-    /** Minimum judge span count. Default 0. */
-    judgeSpansMin?: number;
-    /** Minimum tool span count. Default 0. */
-    toolSpansMin?: number;
+declare class ReplayCache {
+    private byKey;
+    private orphans;
+    private byProvider;
+    private byModel;
     /**
-     * Raw provider sink to consult for capture verification. When present,
-     * the check requires at least one raw event for the run.
+     * Build a cache from a sink's events. The sink must implement `list()`.
+     * Filter by `runId` / `spanId` to scope to a specific replay.
      */
-    rawSink?: RawProviderSink;
-    /** Minimum raw provider event count. Default 0; ignored when `rawSink` absent. */
-    rawProviderEventsMin?: number;
+    static fromSink(sink: RawProviderSink, filter?: {
+        runId?: string;
+        spanId?: string;
+    }): Promise<ReplayCache>;
+    /** Build a cache from an in-memory event list. */
+    static fromEvents(events: RawProviderEvent[]): Promise<ReplayCache>;
+    /** Number of cacheable (request, response) pairs in the cache. */
+    size(): number;
+    stats(): ReplayCacheStats;
     /**
-     * Every LLM span must have at least one matching raw `request` event
-     * (matched by spanId). Catches the common bug where the structured span
-     * was emitted but the raw HTTP capture was wired to a different sink.
+     * Look up a cached response by hashing the (model, messages, temperature,
+     * maxTokens, response_format) shape. Returns `undefined` on miss; the
+     * caller decides whether to throw, fall back to the network, or skip.
      */
-    requireRawCoverageOfLlmSpans?: boolean;
-    /** Run outcome must be set (not null/undefined). Default false. */
-    requireOutcome?: boolean;
-}
-type RunIntegrityIssueCode = 'no_run' | 'missing_llm_spans' | 'missing_judge_spans' | 'missing_tool_spans' | 'missing_raw_events' | 'no_raw_sink' | 'orphan_llm_span' | 'missing_outcome';
-interface RunIntegrityIssue {
-    code: RunIntegrityIssueCode;
-    message: string;
-    detail?: Record<string, unknown>;
-}
-interface RunIntegrityReport {
-    ok: boolean;
-    runId: string;
-    llmSpanCount: number;
-    judgeSpanCount: number;
-    toolSpanCount: number;
-    rawProviderEventCount: number;
+    lookup(requestBody: unknown): Promise<ReplayCacheEntry | undefined>;
+}
+interface ReplayFetchOptions {
     /**
-     * Coverage of LLM spans by raw provider events keyed on spanId.
-     * `total` is the number of LLM spans; `covered` is the count with at
-     * least one matching `request` raw event.
+     * Behaviour on cache miss. Default `'throw'`. `'fallback'` calls the
+     * `fallbackFetch` (typically `globalThis.fetch`) so a partial replay can
+     * still complete; `'fail-closed'` returns a synthetic 599 response so the
+     * call site sees a non-retriable failure.
      */
-    rawSpanCoverage: {
-        covered: number;
-        total: number;
-    };
-    issues: RunIntegrityIssue[];
-}
-declare class RunIntegrityError extends Error {
-    readonly report: RunIntegrityReport;
-    constructor(report: RunIntegrityReport);
+    onMiss?: 'throw' | 'fallback' | 'fail-closed';
+    fallbackFetch?: typeof fetch;
+    /** Optional callback fired once per replayed call (for telemetry / counters). */
+    onHit?: (info: {
+        url: string;
+        provider: string;
+        model: string;
+    }) => void;
+    /** Optional callback fired on cache miss before the `onMiss` policy applies. */
+    onMissNotify?: (info: {
+        url: string;
+        requestBody: unknown;
+    }) => void;
 }
-declare function assertRunCaptured(store: TraceStore, runId: string, expectations?: RunIntegrityExpectations): Promise<RunIntegrityReport>;
-/** Strict mode: throws `RunIntegrityError` when the report isn't ok. */
-declare function throwIfRunIncomplete(report: RunIntegrityReport): void;
+/**
+ * Build a `fetch`-shaped function that serves cached responses out of a
+ * `ReplayCache` for any URL ending in `/chat/completions`. Pass through
+ * `LlmClientOptions.fetch` and `callLlm` becomes free.
+ *
+ * Non-`/chat/completions` URLs are passed straight to the fallback fetch
+ * (default: `globalThis.fetch`). This matters because non-LLM HTTP work
+ * (judge HTTP servers, sandbox callbacks) sometimes flows through the same
+ * `fetch` and shouldn't be intercepted.
+ */
+declare function createReplayFetch(cache: ReplayCache, opts?: ReplayFetchOptions): typeof fetch;
+/**
+ * Convenience iterator over `(request, response)` pairs in a sink — for
+ * post-hoc scoring that doesn't need a `fetch` shim. The judge or scorer
+ * runs purely in-process over cached LLM outputs.
+ */
+declare function iterateRawCalls(sink: RawProviderSink, filter?: {
+    runId?: string;
+    spanId?: string;
+}): AsyncGenerator<ReplayCacheEntry>;
 /**
  * Shared types for the trace-analyst module.
@@ -911,4 +831,4 @@ declare function scoreTraceInsightReadiness(context: TraceInsightContext): Trace
 declare function defaultTraceInsightPanel(): TraceInsightPanelRole[];
 declare function buildTraceInsightPrompt(input: TraceInsightPromptInput): string;
-export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, type DatasetOverview, FailureClass, FileSystemRawProviderSink, type FileSystemRawProviderSinkOptions, InMemoryRawProviderSink, type InMemoryRawProviderSinkOptions, JudgeSpan, LlmSpan, NoopRawProviderSink, OTEL_AGENT_EVAL_SCOPE, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, type ProviderRedactor, type QueryTracesPage, REDACTION_VERSION, type RawProviderDirection, type RawProviderEvent, type RawProviderSink, type RawProviderSinkFilter, type RedactionReport, type RedactionRule, Run, RunCompleteHook, RunCompleteHookContext, RunIntegrityError, type RunIntegrityExpectations, type RunIntegrityIssue, type RunIntegrityIssueCode, type RunIntegrityReport, type SearchSpanResult, type SearchTraceResult, type SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, ToolSpan, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystHookOptions, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, TraceStore, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, aggregateLlm, analyzeTraces, argHash, assertRunCaptured, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, defaultProviderRedactor, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, groupBy, inferDomainKeywords, judgeSpans, llmSpans, planTraceInsightQuestions, providerFromBaseUrl, redactString, redactValue, runFailureClass, runsForScenario, scoreTraceInsightReadiness, throwIfRunIncomplete, tokenizeDomainWords, toolSpans, traceAnalystFunctionGroup, traceAnalystOnRunComplete };
+export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, type DatasetOverview, FailureClass, JudgeSpan, LlmSpan, OTEL_AGENT_EVAL_SCOPE, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, type QueryTracesPage, REDACTION_VERSION, RawProviderEvent, RawProviderSink, type RedactionReport, type RedactionRule, ReplayCache, type ReplayCacheEntry, ReplayCacheMissError, type ReplayCacheStats, type ReplayFetchOptions, Run, RunCompleteHook, RunCompleteHookContext, type SearchSpanResult, type SearchTraceResult, type SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, ToolSpan, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystHookOptions, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, TraceStore, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, aggregateLlm, analyzeTraces, argHash, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, groupBy, inferDomainKeywords, iterateRawCalls, judgeSpans, llmSpans, planTraceInsightQuestions, redactString, redactValue, runFailureClass, runsForScenario, scoreTraceInsightReadiness, tokenizeDomainWords, toolSpans, traceAnalystFunctionGroup, traceAnalystOnRunComplete };

package/dist/traces.js CHANGED Viewed

@@ -7,7 +7,8 @@ import {
   OTEL_AGENT_EVAL_SCOPE,
   OtlpFileTraceStore,
   REDACTION_VERSION,
-  RunIntegrityError,
+  ReplayCache,
+  ReplayCacheMissError,
   SpanNotFoundError,
   TRACE_ANALYST_ACTOR_DESCRIPTION,
   TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
@@ -19,10 +20,10 @@ import {
   aggregateLlm,
   analyzeTraces,
   argHash,
-  assertRunCaptured,
   buildTraceAnalystTools,
   buildTraceInsightContext,
   buildTraceInsightPrompt,
+  createReplayFetch,
   defaultTraceInsightPanel,
   describeTraceInsightScope,
   domainEvidencePattern,
@@ -34,6 +35,7 @@ import {
   isRetrievalSpan,
   isSandboxSpan,
   isToolSpan,
+  iterateRawCalls,
   judgeSpans,
   llmSpans,
   planTraceInsightQuestions,
@@ -42,23 +44,28 @@ import {
   runFailureClass,
   runsForScenario,
   scoreTraceInsightReadiness,
-  throwIfRunIncomplete,
   tokenizeDomainWords,
   toolSpans,
   traceAnalystFunctionGroup,
   traceAnalystOnRunComplete
-} from "./chunk-WOK2RTWG.js";
+} from "./chunk-4W4NCYM2.js";
+import {
+  RunIntegrityError,
+  assertRunCaptured,
+  throwIfRunIncomplete
+} from "./chunk-QUKKGHTZ.js";
 import {
   TraceEmitter,
   llmSpanFromProvider
 } from "./chunk-5IIQKMD5.js";
+import "./chunk-6M774GY6.js";
 import {
   FileSystemRawProviderSink,
   InMemoryRawProviderSink,
   NoopRawProviderSink,
   defaultProviderRedactor,
   providerFromBaseUrl
-} from "./chunk-SNUHRBDL.js";
+} from "./chunk-SQQLHODJ.js";
 import "./chunk-PZ5AY32C.js";
 export {
   DEFAULT_REDACTION_RULES,
@@ -72,6 +79,8 @@ export {
   OTEL_AGENT_EVAL_SCOPE,
   OtlpFileTraceStore,
   REDACTION_VERSION,
+  ReplayCache,
+  ReplayCacheMissError,
   RunIntegrityError,
   SpanNotFoundError,
   TRACE_ANALYST_ACTOR_DESCRIPTION,
@@ -89,6 +98,7 @@ export {
   buildTraceAnalystTools,
   buildTraceInsightContext,
   buildTraceInsightPrompt,
+  createReplayFetch,
   defaultProviderRedactor,
   defaultTraceInsightPanel,
   describeTraceInsightScope,
@@ -101,6 +111,7 @@ export {
   isRetrievalSpan,
   isSandboxSpan,
   isToolSpan,
+  iterateRawCalls,
   judgeSpans,
   llmSpanFromProvider,
   llmSpans,

package/dist/wire/index.js CHANGED Viewed

@@ -24,9 +24,9 @@ import {
   runRpcBatch,
   runRpcOnce,
   startServer
-} from "../chunk-WOPGKVN4.js";
-import "../chunk-3GN6U53I.js";
-import "../chunk-SNUHRBDL.js";
+} from "../chunk-6KQG5HAH.js";
+import "../chunk-KAO3Q65R.js";
+import "../chunk-SQQLHODJ.js";
 import "../chunk-PZ5AY32C.js";
 export {
   BUILTIN_RUBRICS,

package/docs/research-report-methodology.md CHANGED Viewed

@@ -113,15 +113,30 @@ risks list and the executive summary. Treat them as descriptive only.
 - **Unpaired Mann–Whitney.** Rejected: matched scenarios make pairing free,
   and unpaired tests throw away the variance reduction. Use the paired test
   by default.
-- **Sequential / always-valid inference (e-values, mSPRT, alpha-spending).**
-  Out of scope for a single-look report. If users iterate, wrap this report
-  in an alpha-spending schedule, or commit to one preregistered look.
+- **Sequential / always-valid inference (e-values, alpha-spending).**
+  **Shipped in 0.22.** `pairedEvalueSequence` and
+  `evaluateInterimReleaseConfidence` provide time-uniform inference using
+  the predictable plug-in betting martingale (Waudby-Smith & Ramdas 2024)
+  paired with the empirical Bernstein confidence sequence (Howard et al.
+  2021). For *rolling* analyses (interim looks at a campaign that's still
+  accumulating data) call those primitives directly; `researchReport`
+  remains the single-look summary. Paper-grade pre-registration covers the
+  static analysis; the sequential primitives cover the iterative one.
 - **Hierarchical Bayesian shrinkage across many candidates.** Future work.
   The current ranking is on raw paired statistics and over-credits the top
-  candidate when many are tested.
+  candidate when many are tested. A Bayesian hierarchical model with a
+  weakly informative prior would shrink each variant toward the grand mean,
+  reducing rank flips between near-tied candidates.
 - **Calibration / coverage simulation on the bootstrap CI.** Future work; we
   rely on the asymptotic guarantee plus the hard pair floor to keep coverage
   reasonable.
+- **Outcome-anchored calibration.** **Shipped in 0.22.**
+  `rubricPredictiveValidity` joins `RunRecord`s to a `DeploymentOutcomeStore`
+  and reports per-rubric Spearman against deployment outcomes (revenue,
+  retention, CSAT, …). Combined with the static methodology in this
+  document, the loop is: pre-register → measure with `researchReport` →
+  ship → observe outcomes → recalibrate rubric weights with
+  `rubricPredictiveValidity`.
 ## When NOT to apply

package/docs/wire-protocol.md CHANGED Viewed

@@ -188,7 +188,7 @@ Each invocation is one process — Node startup adds ~500 ms. For more than a fe
 4. **RPC case** — add `case 'x':` in `dispatchRpc` in `src/wire/rpc.ts`.
 5. **OpenAPI route** — register in `src/wire/openapi.ts` so it shows up in the spec.
 6. **Test** — add to `tests/wire/`. At minimum: schema validation, happy-path, error-path.
-7. **Python client** — add a method on `Client` in `clients/python/src/tangle_agent_eval/client.py`, plus pydantic models in `models.py` mirroring the new schemas.
+7. **Python client** — add a method on `Client` in `clients/python/src/agent_eval_rpc/client.py`, plus pydantic models in `models.py` mirroring the new schemas.
 The pattern is mechanical. When the surface grows past ~10 methods, swap the hand-written Python models for `datamodel-code-generator -i openapi.json -o models.py`.

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-eval",
-  "version": "0.21.0",
+  "version": "0.22.0",
   "description": "Trace-first evaluation infrastructure for agent systems: traces, harnesses, verifier pipelines, judges, datasets, gates, optimization, and reporting.",
   "homepage": "https://github.com/tangle-network/agent-eval#readme",
   "repository": {
@@ -79,7 +79,7 @@
     "@ax-llm/ax": "^19.0.25",
     "@hono/node-server": "^2.0.0",
     "@tangle-network/tcloud": "^0.4.6",
-    "hono": "^4.12.15",
+    "hono": "^4.12.16",
     "zod": "^4.3.6"
   },
   "devDependencies": {