npm - @tangle-network/agent-eval - Versions diffs - 0.21.0 → 0.23.0 - Mend

@tangle-network/agent-eval 0.21.0 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68) hide show

package/CHANGELOG.md +236 -1
package/README.md +17 -3
package/dist/benchmarks/index.d.ts +2 -2
package/dist/{chunk-WOK2RTWG.js → chunk-4W4NCYM2.js} +134 -109
package/dist/chunk-4W4NCYM2.js.map +1 -0
package/dist/{chunk-WOPGKVN4.js → chunk-6KQG5HAH.js} +2 -2
package/dist/chunk-6M774GY6.js +53 -0
package/dist/chunk-6M774GY6.js.map +1 -0
package/dist/chunk-7EAUOUQS.js +495 -0
package/dist/chunk-7EAUOUQS.js.map +1 -0
package/dist/chunk-AXHNWLIX.js +246 -0
package/dist/chunk-AXHNWLIX.js.map +1 -0
package/dist/chunk-EXGR4XEM.js +283 -0
package/dist/chunk-EXGR4XEM.js.map +1 -0
package/dist/{chunk-3IX6QTB7.js → chunk-IOXMGMHQ.js} +418 -541
package/dist/chunk-IOXMGMHQ.js.map +1 -0
package/dist/{chunk-3GN6U53I.js → chunk-KAO3Q65R.js} +2 -2
package/dist/chunk-LZKIOBG2.js +2026 -0
package/dist/chunk-LZKIOBG2.js.map +1 -0
package/dist/{chunk-YUFXO3TU.js → chunk-QBW3YBTR.js} +1 -1
package/dist/chunk-QBW3YBTR.js.map +1 -0
package/dist/chunk-QUKKGHTZ.js +121 -0
package/dist/chunk-QUKKGHTZ.js.map +1 -0
package/dist/{chunk-SNUHRBDL.js → chunk-SQQLHODJ.js} +10 -1
package/dist/{chunk-SNUHRBDL.js.map → chunk-SQQLHODJ.js.map} +1 -1
package/dist/{chunk-ARZ6BEV6.js → chunk-V5QSWN7L.js} +2 -2
package/dist/{chunk-HRZELXCR.js → chunk-VQQSPGSM.js} +3 -3
package/dist/cli.js +3 -3
package/dist/{control-cxwMOAsy.d.ts → control-DvkH87qJ.d.ts} +2 -2
package/dist/control.d.ts +3 -3
package/dist/control.js +2 -2
package/dist/eval-campaign-Ds5QljIh.d.ts +573 -0
package/dist/{feedback-trajectory-CB0A32o3.d.ts → feedback-trajectory-c43WGtTX.d.ts} +1 -1
package/dist/{index-c5saLbKD.d.ts → index-DDTlbHEK.d.ts} +1 -1
package/dist/index-ekBXweiQ.d.ts +1894 -0
package/dist/index.d.ts +20 -430
package/dist/index.js +154 -34
package/dist/index.js.map +1 -1
package/dist/integrity-Cr5YodSY.d.ts +210 -0
package/dist/openapi.json +1 -1
package/dist/optimization.d.ts +7 -145
package/dist/optimization.js +12 -3
package/dist/reporting.d.ts +294 -4
package/dist/reporting.js +18 -9
package/dist/rl.d.ts +8 -0
package/dist/rl.js +113 -0
package/dist/rl.js.map +1 -0
package/dist/{run-record-CX_jcAyr.d.ts → run-record-DNiOMBrZ.d.ts} +10 -1
package/dist/sequential-DgU2mFsE.d.ts +304 -0
package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-Ce1r4EYo.d.ts} +382 -2
package/dist/traces.d.ts +101 -181
package/dist/traces.js +19 -8
package/dist/wire/index.js +3 -3
package/docs/auto-research-loop-end-to-end.md +186 -0
package/docs/research-report-methodology.md +19 -4
package/docs/three-package-architecture.md +180 -0
package/docs/wire-protocol.md +1 -1
package/package.json +7 -2
package/dist/chunk-3IX6QTB7.js.map +0 -1
package/dist/chunk-KRR4VMH7.js +0 -423
package/dist/chunk-KRR4VMH7.js.map +0 -1
package/dist/chunk-WOK2RTWG.js.map +0 -1
package/dist/chunk-YUFXO3TU.js.map +0 -1
package/dist/reporting-Da2ihlcM.d.ts +0 -672
/package/dist/{chunk-WOPGKVN4.js.map → chunk-6KQG5HAH.js.map} +0 -0
/package/dist/{chunk-3GN6U53I.js.map → chunk-KAO3Q65R.js.map} +0 -0
/package/dist/{chunk-ARZ6BEV6.js.map → chunk-V5QSWN7L.js.map} +0 -0
/package/dist/{chunk-HRZELXCR.js.map → chunk-VQQSPGSM.js.map} +0 -0

package/dist/traces.d.ts CHANGED Viewed

@@ -2,6 +2,8 @@ import { T as TraceStore, L as LlmSpan, J as JudgeSpan, a as Run, F as FailureCl
 export { A as Artifact, B as BudgetLedgerEntry, g as BudgetSpec, i as EventFilter, E as EventKind, j as FAILURE_CLASSES, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, M as Message, d as RetrievalSpan, h as RunFilter, m as RunLayer, R as RunOutcome, n as RunStatus, e as SandboxSpan, S as Span, o as SpanBase, p as SpanFilter, b as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, f as TraceEvent, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-u47QaJ9G.js';
 import { a as RunCompleteHookContext, R as RunCompleteHook } from './emitter-B2XqDKFU.js';
 export { S as SpanHandle, T as TraceEmitter, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-B2XqDKFU.js';
+import { R as RawProviderSink, f as RawProviderEvent } from './integrity-Cr5YodSY.js';
+export { F as FileSystemRawProviderSink, c as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, d as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, e as RawProviderDirection, g as RawProviderSinkFilter, h as RunIntegrityError, a as RunIntegrityExpectations, i as RunIntegrityIssue, j as RunIntegrityIssueCode, b as RunIntegrityReport, k as assertRunCaptured, l as defaultProviderRedactor, p as providerFromBaseUrl, t as throwIfRunIncomplete } from './integrity-Cr5YodSY.js';
 import { AxAIService, AxFunction } from '@ax-llm/ax';
 /**
@@ -133,204 +135,122 @@ interface OtlpExport {
 declare function exportRunAsOtlp(store: TraceStore, runId: string, resourceAttrs?: Record<string, string | number | boolean>): Promise<OtlpExport>;
 /**
- * RawProviderSink — first-class persistence for the actual HTTP-level
- * request/response bodies of every LLM provider call.
+ * Replay-from-raw-events — turn every captured campaign run into a
+ * re-runnable artifact.
  *
- * Why this is a separate sink from the structured `LlmSpan`:
+ * The premise: 0.21 made `RawProviderSink` capture every provider HTTP
+ * envelope. 0.22's `runEvalCampaign` makes capture the default. Together
+ * they mean every past run is a complete fingerprint of what happened on
+ * the wire — and that fingerprint is enough to replay the run without
+ * burning new LLM cost.
  *
- *   - `LlmSpan` records the *intent* — model name, messages, output text,
- *     usage. It's what dashboards read; it's NOT enough for forensics.
- *   - When a downstream consumer reports "the verifier used the wrong route"
- *     or "tokens look right but reasoning was missing," the only way to
- *     answer is the raw HTTP body. Span fields can lie (a proxy can echo
- *     a different `model` value than what actually answered); the raw
- *     response is ground truth.
+ * Three use cases this primitive enables:
  *
- * Default behaviour: opt-in. Pass `rawSink` to `LlmClientOptions` (or the
- * matrix runner / BuilderSession sets it up automatically) and every
- * request, response, and error is recorded — including retries, with the
- * attempt index attached so a flaky call's full event chain is recoverable.
+ *   1. **Post-hoc judging** — apply a new judge / rubric / scoring callback
+ *      to last week's runs without re-calling any LLM. The cost of trying
+ *      a new rubric drops from "another full sweep" to a CPU-bound replay.
+ *   2. **Determinism audits** — replay the same campaign and verify the
+ *      raw responses match byte-for-byte. Any drift is a non-determinism
+ *      bug (in the harness, the prompt builder, the sandbox, …).
+ *   3. **Free judge calibration** — run two judges on identical responses
+ *      and measure inter-judge agreement without doubling LLM spend.
  *
- * Redaction is enforced at sink time. The default redactor strips
- * `Authorization`, `X-Api-Key`, `X-Auth-Token`, `Cookie` headers and any
- * payload field whose key matches `apiKey | api_key | bearer | password |
- * secret | token` (case-insensitive). Override via the sink constructor or
- * the per-call `redactor`. The `redactedFields` array on the persisted
- * event lets a reviewer see what was stripped without exposing the values.
+ * The interface is deliberately fetch-shaped. Inject `createReplayFetch`
+ * into `LlmClientOptions.fetch` and every `callLlm` transparently reads
+ * from the cache instead of calling the network. No new code path through
+ * the LLM client is needed; the cache hit is invisible to the runner.
  */
-type RawProviderDirection = 'request' | 'response' | 'error';
-interface RawProviderEvent {
-    /** Stable id. Generated by the sink if omitted. */
-    eventId: string;
-    /** Trace context populated by `LlmClient` when the call is wrapped in a span. */
-    runId?: string;
-    spanId?: string;
-    /**
-     * Logical provider name. Free-form so callers can use whatever id matches
-     * their topology (`'openai'`, `'anthropic'`, `'tangle-router'`, …). When
-     * omitted, derived from `baseUrl` in `LlmClientOptions`.
-     */
-    provider: string;
-    model: string;
-    /** Endpoint path, e.g. `'/v1/chat/completions'`. */
-    endpoint: string;
-    /** Base URL used for the call (already-normalised — no trailing slash). */
-    baseUrl: string;
-    /** 0-indexed retry attempt. The first attempt is 0; a retried call gets 1, 2, … */
-    attemptIndex: number;
-    direction: RawProviderDirection;
-    /** Unix ms. */
-    timestamp: number;
-    /** Wall-clock duration of the call leg. Set on `response` and `error` events; null on `request`. */
-    durationMs?: number;
-    statusCode?: number;
-    requestHeaders?: Record<string, string>;
-    requestBody?: unknown;
-    responseHeaders?: Record<string, string>;
-    responseBody?: unknown;
-    /** Set on `direction: 'error'` events. */
-    errorMessage?: string;
-    /** Field paths the redactor stripped from this event ('header:Authorization', 'body.apiKey', …). */
-    redactedFields: string[];
-}
-interface RawProviderSinkFilter {
-    runId?: string;
-    spanId?: string;
-    direction?: RawProviderDirection;
-    attemptIndex?: number;
+declare class ReplayCacheMissError extends Error {
+    readonly url: string;
+    readonly requestKey: string;
+    constructor(url: string, requestKey: string, message?: string);
 }
-interface RawProviderSink {
-    record(event: RawProviderEvent): Promise<void>;
-    /** Optional listing — implementations that durably persist (file, db) should support this. */
-    list?(filter?: RawProviderSinkFilter): Promise<RawProviderEvent[]>;
-    /** Optional teardown for backed implementations. */
-    close?(): Promise<void>;
+interface ReplayCacheEntry {
+    request: RawProviderEvent;
+    response: RawProviderEvent;
 }
-type ProviderRedactor = (event: RawProviderEvent) => RawProviderEvent;
-/**
- * Default redactor — strips well-known auth headers and any body field whose
- * key matches the credential pattern. Records every redacted path on
- * `event.redactedFields` so a downstream reviewer can see what was removed.
- */
-declare function defaultProviderRedactor(event: RawProviderEvent): RawProviderEvent;
-interface InMemoryRawProviderSinkOptions {
-    redactor?: ProviderRedactor;
-}
-declare class InMemoryRawProviderSink implements RawProviderSink {
-    private events;
-    private redactor;
-    constructor(opts?: InMemoryRawProviderSinkOptions);
-    record(event: RawProviderEvent): Promise<void>;
-    list(filter?: RawProviderSinkFilter): Promise<RawProviderEvent[]>;
-    size(): number;
-}
-declare class NoopRawProviderSink implements RawProviderSink {
-    record(): Promise<void>;
-}
-interface FileSystemRawProviderSinkOptions {
-    /** Directory the NDJSON file is written into. Created if missing. */
-    dir: string;
-    /** File name; default `'raw-provider-events.ndjson'`. */
-    fileName?: string;
-    /** Bytes after which the writer rolls over to a new file (default 32 MiB). */
-    rollAtBytes?: number;
-    redactor?: ProviderRedactor;
-}
-declare class FileSystemRawProviderSink implements RawProviderSink {
-    private dir;
-    private fileName;
-    private rollAtBytes;
-    private redactor;
-    private bytesWritten;
-    private rollIndex;
-    private initPromise;
-    constructor(opts: FileSystemRawProviderSinkOptions);
-    private ensureInit;
-    private currentPath;
-    record(event: RawProviderEvent): Promise<void>;
-    list(filter?: RawProviderSinkFilter): Promise<RawProviderEvent[]>;
+interface ReplayCacheStats {
+    total: number;
+    byProvider: Record<string, number>;
+    byModel: Record<string, number>;
+    /** Spans for which we have a request but no response (run aborted mid-call). */
+    orphanRequests: number;
 }
 /**
- * Best-effort provider id from a base URL. Falls back to the URL host when
- * none of the well-known patterns match.
- */
-declare function providerFromBaseUrl(baseUrl: string): string;
-/**
- * Run-completion integrity check — at end of run, verify the expected event
- * types were actually captured. The point is the launch-review failure mode:
- * a run *appears* successful but the raw provider events were never written,
- * so a downstream reviewer can't reconstruct what happened.
+ * In-memory deterministic cache of (request → response) keyed on a stable
+ * hash of the request body. Built from a `RawProviderSink` containing
+ * paired `request` and `response` events from a previous run.
  *
- * Pattern:
- *
- *   const report = await assertRunCaptured(store, runId, {
- *     llmSpansMin: 1,
- *     judgeSpansMin: 1,
- *     rawSink: providerSink,                  // must have ≥ 1 event for this run
- *     requireRawCoverageOfLlmSpans: true,     // every llm span has matching raw events
- *   })
- *   if (!report.ok) throwIfRunIncomplete(report)  // or mark run failed and continue
- *
- * The function is read-only on the store and returns a structured report;
- * the caller chooses the failure mode (throw, mark run failed, log warning).
- * `throwIfRunIncomplete` is the convenient strict mode.
+ * The cache is the source of truth for replay; `createReplayFetch` is a
+ * thin wrapper that reads from it.
  */
-interface RunIntegrityExpectations {
-    /** Minimum LLM span count. Default 0 (no requirement). */
-    llmSpansMin?: number;
-    /** Minimum judge span count. Default 0. */
-    judgeSpansMin?: number;
-    /** Minimum tool span count. Default 0. */
-    toolSpansMin?: number;
+declare class ReplayCache {
+    private byKey;
+    private orphans;
+    private byProvider;
+    private byModel;
     /**
-     * Raw provider sink to consult for capture verification. When present,
-     * the check requires at least one raw event for the run.
+     * Build a cache from a sink's events. The sink must implement `list()`.
+     * Filter by `runId` / `spanId` to scope to a specific replay.
      */
-    rawSink?: RawProviderSink;
-    /** Minimum raw provider event count. Default 0; ignored when `rawSink` absent. */
-    rawProviderEventsMin?: number;
+    static fromSink(sink: RawProviderSink, filter?: {
+        runId?: string;
+        spanId?: string;
+    }): Promise<ReplayCache>;
+    /** Build a cache from an in-memory event list. */
+    static fromEvents(events: RawProviderEvent[]): Promise<ReplayCache>;
+    /** Number of cacheable (request, response) pairs in the cache. */
+    size(): number;
+    stats(): ReplayCacheStats;
     /**
-     * Every LLM span must have at least one matching raw `request` event
-     * (matched by spanId). Catches the common bug where the structured span
-     * was emitted but the raw HTTP capture was wired to a different sink.
+     * Look up a cached response by hashing the (model, messages, temperature,
+     * maxTokens, response_format) shape. Returns `undefined` on miss; the
+     * caller decides whether to throw, fall back to the network, or skip.
      */
-    requireRawCoverageOfLlmSpans?: boolean;
-    /** Run outcome must be set (not null/undefined). Default false. */
-    requireOutcome?: boolean;
-}
-type RunIntegrityIssueCode = 'no_run' | 'missing_llm_spans' | 'missing_judge_spans' | 'missing_tool_spans' | 'missing_raw_events' | 'no_raw_sink' | 'orphan_llm_span' | 'missing_outcome';
-interface RunIntegrityIssue {
-    code: RunIntegrityIssueCode;
-    message: string;
-    detail?: Record<string, unknown>;
-}
-interface RunIntegrityReport {
-    ok: boolean;
-    runId: string;
-    llmSpanCount: number;
-    judgeSpanCount: number;
-    toolSpanCount: number;
-    rawProviderEventCount: number;
+    lookup(requestBody: unknown): Promise<ReplayCacheEntry | undefined>;
+}
+interface ReplayFetchOptions {
     /**
-     * Coverage of LLM spans by raw provider events keyed on spanId.
-     * `total` is the number of LLM spans; `covered` is the count with at
-     * least one matching `request` raw event.
+     * Behaviour on cache miss. Default `'throw'`. `'fallback'` calls the
+     * `fallbackFetch` (typically `globalThis.fetch`) so a partial replay can
+     * still complete; `'fail-closed'` returns a synthetic 599 response so the
+     * call site sees a non-retriable failure.
      */
-    rawSpanCoverage: {
-        covered: number;
-        total: number;
-    };
-    issues: RunIntegrityIssue[];
-}
-declare class RunIntegrityError extends Error {
-    readonly report: RunIntegrityReport;
-    constructor(report: RunIntegrityReport);
+    onMiss?: 'throw' | 'fallback' | 'fail-closed';
+    fallbackFetch?: typeof fetch;
+    /** Optional callback fired once per replayed call (for telemetry / counters). */
+    onHit?: (info: {
+        url: string;
+        provider: string;
+        model: string;
+    }) => void;
+    /** Optional callback fired on cache miss before the `onMiss` policy applies. */
+    onMissNotify?: (info: {
+        url: string;
+        requestBody: unknown;
+    }) => void;
 }
-declare function assertRunCaptured(store: TraceStore, runId: string, expectations?: RunIntegrityExpectations): Promise<RunIntegrityReport>;
-/** Strict mode: throws `RunIntegrityError` when the report isn't ok. */
-declare function throwIfRunIncomplete(report: RunIntegrityReport): void;
+/**
+ * Build a `fetch`-shaped function that serves cached responses out of a
+ * `ReplayCache` for any URL ending in `/chat/completions`. Pass the returned
+ * function as `LlmClientOptions.fetch` and cache hits make `callLlm` free.
+ *
+ * Non-`/chat/completions` URLs are passed straight to the fallback fetch
+ * (default: `globalThis.fetch`). This matters because non-LLM HTTP work
+ * (judge HTTP servers, sandbox callbacks) sometimes flows through the same
+ * `fetch` and shouldn't be intercepted.
+ */
+declare function createReplayFetch(cache: ReplayCache, opts?: ReplayFetchOptions): typeof fetch;
+/**
+ * Convenience iterator over `(request, response)` pairs in a sink — for
+ * post-hoc scoring that doesn't need a `fetch` shim. The judge or scorer
+ * runs purely in-process over cached LLM outputs.
+ */
+declare function iterateRawCalls(sink: RawProviderSink, filter?: {
+    runId?: string;
+    spanId?: string;
+}): AsyncGenerator<ReplayCacheEntry>;
 /**
  * Shared types for the trace-analyst module.
@@ -911,4 +831,4 @@ declare function scoreTraceInsightReadiness(context: TraceInsightContext): Trace
 declare function defaultTraceInsightPanel(): TraceInsightPanelRole[];
 declare function buildTraceInsightPrompt(input: TraceInsightPromptInput): string;
-export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, type DatasetOverview, FailureClass, FileSystemRawProviderSink, type FileSystemRawProviderSinkOptions, InMemoryRawProviderSink, type InMemoryRawProviderSinkOptions, JudgeSpan, LlmSpan, NoopRawProviderSink, OTEL_AGENT_EVAL_SCOPE, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, type ProviderRedactor, type QueryTracesPage, REDACTION_VERSION, type RawProviderDirection, type RawProviderEvent, type RawProviderSink, type RawProviderSinkFilter, type RedactionReport, type RedactionRule, Run, RunCompleteHook, RunCompleteHookContext, RunIntegrityError, type RunIntegrityExpectations, type RunIntegrityIssue, type RunIntegrityIssueCode, type RunIntegrityReport, type SearchSpanResult, type SearchTraceResult, type SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, ToolSpan, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystHookOptions, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, TraceStore, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, aggregateLlm, analyzeTraces, argHash, assertRunCaptured, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, defaultProviderRedactor, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, groupBy, 
inferDomainKeywords, judgeSpans, llmSpans, planTraceInsightQuestions, providerFromBaseUrl, redactString, redactValue, runFailureClass, runsForScenario, scoreTraceInsightReadiness, throwIfRunIncomplete, tokenizeDomainWords, toolSpans, traceAnalystFunctionGroup, traceAnalystOnRunComplete };
+export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, type DatasetOverview, FailureClass, JudgeSpan, LlmSpan, OTEL_AGENT_EVAL_SCOPE, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, type QueryTracesPage, REDACTION_VERSION, RawProviderEvent, RawProviderSink, type RedactionReport, type RedactionRule, ReplayCache, type ReplayCacheEntry, ReplayCacheMissError, type ReplayCacheStats, type ReplayFetchOptions, Run, RunCompleteHook, RunCompleteHookContext, type SearchSpanResult, type SearchTraceResult, type SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, ToolSpan, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystHookOptions, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, TraceStore, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, aggregateLlm, analyzeTraces, argHash, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, groupBy, inferDomainKeywords, iterateRawCalls, judgeSpans, llmSpans, planTraceInsightQuestions, redactString, redactValue, runFailureClass, runsForScenario, scoreTraceInsightReadiness, tokenizeDomainWords, toolSpans, traceAnalystFunctionGroup, traceAnalystOnRunComplete };

package/dist/traces.js CHANGED Viewed

@@ -7,7 +7,8 @@ import {
   OTEL_AGENT_EVAL_SCOPE,
   OtlpFileTraceStore,
   REDACTION_VERSION,
-  RunIntegrityError,
+  ReplayCache,
+  ReplayCacheMissError,
   SpanNotFoundError,
   TRACE_ANALYST_ACTOR_DESCRIPTION,
   TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
@@ -19,10 +20,10 @@ import {
   aggregateLlm,
   analyzeTraces,
   argHash,
-  assertRunCaptured,
   buildTraceAnalystTools,
   buildTraceInsightContext,
   buildTraceInsightPrompt,
+  createReplayFetch,
   defaultTraceInsightPanel,
   describeTraceInsightScope,
   domainEvidencePattern,
@@ -34,6 +35,7 @@ import {
   isRetrievalSpan,
   isSandboxSpan,
   isToolSpan,
+  iterateRawCalls,
   judgeSpans,
   llmSpans,
   planTraceInsightQuestions,
@@ -42,23 +44,28 @@ import {
   runFailureClass,
   runsForScenario,
   scoreTraceInsightReadiness,
-  throwIfRunIncomplete,
   tokenizeDomainWords,
   toolSpans,
   traceAnalystFunctionGroup,
   traceAnalystOnRunComplete
-} from "./chunk-WOK2RTWG.js";
+} from "./chunk-4W4NCYM2.js";
 import {
-  TraceEmitter,
-  llmSpanFromProvider
-} from "./chunk-5IIQKMD5.js";
+  RunIntegrityError,
+  assertRunCaptured,
+  throwIfRunIncomplete
+} from "./chunk-QUKKGHTZ.js";
 import {
   FileSystemRawProviderSink,
   InMemoryRawProviderSink,
   NoopRawProviderSink,
   defaultProviderRedactor,
   providerFromBaseUrl
-} from "./chunk-SNUHRBDL.js";
+} from "./chunk-SQQLHODJ.js";
+import {
+  TraceEmitter,
+  llmSpanFromProvider
+} from "./chunk-5IIQKMD5.js";
+import "./chunk-6M774GY6.js";
 import "./chunk-PZ5AY32C.js";
 export {
   DEFAULT_REDACTION_RULES,
@@ -72,6 +79,8 @@ export {
   OTEL_AGENT_EVAL_SCOPE,
   OtlpFileTraceStore,
   REDACTION_VERSION,
+  ReplayCache,
+  ReplayCacheMissError,
   RunIntegrityError,
   SpanNotFoundError,
   TRACE_ANALYST_ACTOR_DESCRIPTION,
@@ -89,6 +98,7 @@ export {
   buildTraceAnalystTools,
   buildTraceInsightContext,
   buildTraceInsightPrompt,
+  createReplayFetch,
   defaultProviderRedactor,
   defaultTraceInsightPanel,
   describeTraceInsightScope,
@@ -101,6 +111,7 @@ export {
   isRetrievalSpan,
   isSandboxSpan,
   isToolSpan,
+  iterateRawCalls,
   judgeSpans,
   llmSpanFromProvider,
   llmSpans,

package/dist/wire/index.js CHANGED Viewed

@@ -24,9 +24,9 @@ import {
   runRpcBatch,
   runRpcOnce,
   startServer
-} from "../chunk-WOPGKVN4.js";
-import "../chunk-3GN6U53I.js";
-import "../chunk-SNUHRBDL.js";
+} from "../chunk-6KQG5HAH.js";
+import "../chunk-KAO3Q65R.js";
+import "../chunk-SQQLHODJ.js";
 import "../chunk-PZ5AY32C.js";
 export {
   BUILTIN_RUBRICS,

package/docs/auto-research-loop-end-to-end.md ADDED Viewed

@@ -0,0 +1,186 @@
+# Auto-research loop end-to-end
+This is the runnable composition pattern that closes the loop the package
+was originally designed for: capture-integrity → eval → preferences →
+mutation → improved candidate → repeat.
+There's no new orchestrator primitive that runs this for you (and we
+deliberately resisted shipping one — every consumer's loop has different
+invariants). What this doc gives you is **the integration recipe**: the
+imports, the wiring, and the explicit invariants every iteration must
+preserve.
+A working version of this recipe lives at
+[`examples/auto-research-with-agent-builder/`](../examples/auto-research-with-agent-builder/) —
+runnable, ~250 lines, demonstrates the score climbing across iterations.
+## The pattern
+```ts
+import {
+  runEvalCampaign,
+  analyzeOptimizationResult,
+  trialsToRunRecords,
+  PredictiveValidityResearcher,
+} from '@tangle-network/agent-eval'
+import { traceAnalystOnRunComplete } from '@tangle-network/agent-eval/traces'
+async function runAutoResearchLoop(opts: {
+  task: string
+  initialVariants: Variant[]
+  scenarios: Scenario[]
+  iterations: number
+  // The thing that turns a Variant into a scoreable artifact.
+  // For agent-builder this is `runForgeBuilderSim`; for tax-agent it's
+  // their domain runner; for the multi-shot prompt evolution case it's
+  // already wired inside `runPromptEvolution`.
+  candidateRunner: CandidateRunner<Variant>
+  // The thing that proposes the next variants given the analysis output.
+  // For prompt-only optimization, this is `reflective-mutation` against
+  // the top/bottom trials. For code+prompt, this is `createCompositeMutator`.
+  // For agent-builder, this can be a hand-rolled "edit the system prompt"
+  // function — the example shows one.
+  mutator: (champion: Variant, analysis: AnalysisReport) => Promise<Variant[]>
+  // Optional: outcome store for predictive validity. When present, the
+  // loop learns which scoring rubrics actually predict deployment outcomes
+  // and reweights the composite score accordingly.
+  outcomes?: { store: OutcomeStore; metrics: string[] }
+}): Promise<IterationReport[]> {
+  const reports: IterationReport[] = []
+  let variants = opts.initialVariants
+  // (Optional) standing researcher that drives rubric reweighting.
+  const researcher = opts.outcomes
+    ? new PredictiveValidityResearcher({
+        outcomes: opts.outcomes.store,
+        outcomeMetrics: opts.outcomes.metrics,
+      })
+    : null
+  for (let iter = 0; iter < opts.iterations; iter++) {
+    // 1. Capture-integrity-by-construction matrix run.
+    const campaign = await runEvalCampaign({
+      campaignId: `auto-research-iter-${iter}`,
+      commitSha: opts.task,
+      variants: variants.map((v) => ({ id: v.id, payload: v })),
+      scenarios: opts.scenarios,
+      seeds: [0, 1, 2],
+      llmOpts: { ... },
+      storeFactory: () => new InMemoryTraceStore(),
+      rawSinkFactory: () => new InMemoryRawProviderSink(),
+      runner: makeCampaignRunner(opts.candidateRunner),
+      onRunComplete: opts.outcomes
+        ? [traceAnalystOnRunComplete({ analyze: ..., save: ... })]
+        : [],
+      report: { comparator: variants[0]!.id },
+    })
+    // 2. RL-bridge analysis: preferences, verifiable rewards, sequential
+    //    interim verdict, reward-hacking diagnosis.
+    const analysis = await analyzeOptimizationResult({
+      result: pretendItsAPromptEvolution(campaign),
+      ctx: { experimentId: 'task', model: '...', commitSha: '...', promptHash: '...', configHash: '...' },
+      comparator: variants[0]!.id,
+      outcomes: opts.outcomes,
+    })
+    // 3. Periodic rubric recalibration via predictive validity.
+    if (researcher && iter > 0 && iter % 5 === 0) {
+      await researcher.runValidityCheck(campaign.runs)
+      // The researcher's `proposeChange` output can be folded into the
+      // mutator as a steering signal in the next iteration.
+    }
+    // 4. Pick champion + record this iteration.
+    const champion = pickChampion(campaign.runs)
+    reports.push({ iter, champion, score: champion.score, analysis })
+    // 5. Sequential stop: the anytime-valid e-value can decisively call
+    //    'promote_now' or 'reject_now' before the iterations are exhausted.
+    //    This sketch stops early only on 'promote_now'.
+    if (analysis.interimConfidence?.recommendation.decision === 'promote_now') {
+      break
+    }
+    // 6. Propose next variants via the mutator.
+    if (iter < opts.iterations - 1) {
+      variants = await opts.mutator(champion.variant, analysis)
+    }
+  }
+  return reports
+}
+```
+## Invariants every iteration must preserve
+1. **The campaign produces RunRecord[] with `scenarioId` populated.** Every
+   downstream primitive (preferences, sequential, predictive validity,
+   tournament) keys on this. `runEvalCampaign` populates it canonically;
+   if you adapt from `runPromptEvolution` use `trialsToRunRecords`.
+2. **Capture is wired by construction.** Don't have `rawSinkFactory`
+   return a `NoopRawProviderSink` unless the iteration is exploratory.
+   Every captured run is replayable, and every replayable run is free
+   judge-iteration data for the next loop.
+3. **`commitSha` is real.** It's how downstream tooling (predictive
+   validity, contamination probe, tournament) ties iterations together.
+   The sketch above passes `opts.task` as a stand-in; supply the actual
+   commit SHA in real use.
+4. **The comparator is stable across iterations.** Either the original
+   `baseline` or whichever champion you froze. Shifting the comparator
+   between iterations corrupts the paired-delta semantics.
+5. **The mutator is deterministic given the analysis output.** Otherwise
+   the iteration isn't reproducible and the auto-research artifacts
+   become unfalsifiable. If you need stochastic mutation, seed the
+   mutator and emit the seed onto the run record.
+## When to run each primitive
+| Frequency | Primitive | Why |
+|---|---|---|
+| Every iteration | `runEvalCampaign` | core measurement |
+| Every iteration | `analyzeOptimizationResult` | preferences + verifiable rewards + reward-hacking |
+| Every iteration | `evaluateInterimReleaseConfidence` (via `analyzeOptimizationResult`) | anytime-valid stop signal |
+| Every 5–10 iterations | `rubricPredictiveValidity` | rubric weights drift; recalibrate |
+| Every release | `runContaminationProbe` | scenario set freshness |
+| Once per task | `runComputeCurve` | cost-quality frontier |
+| As-needed | `adversarialScenarioSearch` | discover failure modes the curated set missed |
+## When to drop into the smaller primitives
+Two cases:
+1. **Trajectory-shaped optimization with steering.** Use
+   `runMultiShotOptimization` directly — it already runs the inner
+   search-vs-holdout loop. Wrap with `analyzeOptimizationResult` after
+   for the RL bridge.
+2. **Prompt + code evolution with sandboxed code mutation.** Use
+   `runPromptEvolution` + `createCompositeMutator` directly. Same wrap
+   pattern.
+The auto-research loop above wraps these primitives in a higher-level
+loop that runs them across multiple campaigns. They're each one tick of
+the bigger loop.
+## What this does NOT do
+- It doesn't fine-tune model weights. That's the
+  [`fine-tune-with-prime-rl`](../examples/fine-tune-with-prime-rl/) example
+  — separate concern, separate trainer.
+- It doesn't drive a production deployment decision on its own. The
+  artifacts feed a launch-review process (humans, the `researchReport`
+  output, the `assertReleaseConfidence` gate). Loop ≠ promotion gate.
+- It doesn't substitute for a real preregistration trail. The
+  `preregistrationHash` field on the report exists so iterations can be
+  audited, but the auto-research loop *is* iterative and post-hoc by
+  definition. Use the standing `assertReleaseConfidence` gate at the
+  release boundary; use the auto-research loop everywhere upstream of it.
+## Reading order for the example
+1. [`examples/auto-research-with-agent-builder/README.md`](../examples/auto-research-with-agent-builder/README.md) — architectural picture.
+2. [`examples/auto-research-with-agent-builder/auto-research-with-agent-builder.ts`](../examples/auto-research-with-agent-builder/auto-research-with-agent-builder.ts) — runnable demo.
+3. Run it: `npx tsx examples/auto-research-with-agent-builder/auto-research-with-agent-builder.ts`.
+   It prints the iteration progression and the score climbing.

package/docs/research-report-methodology.md CHANGED Viewed

@@ -113,15 +113,30 @@ risks list and the executive summary. Treat them as descriptive only.
 - **Unpaired Mann–Whitney.** Rejected: matched scenarios make pairing free,
   and unpaired tests throw away the variance reduction. Use the paired test
   by default.
-- **Sequential / always-valid inference (e-values, mSPRT, alpha-spending).**
-  Out of scope for a single-look report. If users iterate, wrap this report
-  in an alpha-spending schedule, or commit to one preregistered look.
+- **Sequential / always-valid inference (e-values, alpha-spending).**
+  **Shipped in 0.22.** `pairedEvalueSequence` and
+  `evaluateInterimReleaseConfidence` provide time-uniform inference using
+  the predictable plug-in betting martingale (Waudby-Smith & Ramdas 2024)
+  paired with the empirical Bernstein confidence sequence (Howard et al.
+  2021). For *rolling* analyses (interim looks at a campaign that's still
+  accumulating data) call those primitives directly; `researchReport`
+  remains the single-look summary. Paper-grade pre-registration covers the
+  static analysis; the sequential primitives cover the iterative one.
 - **Hierarchical Bayesian shrinkage across many candidates.** Future work.
   The current ranking is on raw paired statistics and over-credits the top
-  candidate when many are tested.
+  candidate when many are tested. A Bayesian hierarchical model with a
+  weakly informative prior would shrink each variant toward the grand mean,
+  reducing rank flips between near-tied candidates.
 - **Calibration / coverage simulation on the bootstrap CI.** Future work; we
   rely on the asymptotic guarantee plus the hard pair floor to keep coverage
   reasonable.
+- **Outcome-anchored calibration.** **Shipped in 0.22.**
+  `rubricPredictiveValidity` joins `RunRecord`s to a `DeploymentOutcomeStore`
+  and reports the per-rubric Spearman rank correlation against deployment
+  outcomes (revenue, retention, CSAT, …). Combined with the static methodology in this
+  document, the loop is: pre-register → measure with `researchReport` →
+  ship → observe outcomes → recalibrate rubric weights with
+  `rubricPredictiveValidity`.
 ## When NOT to apply