@tangle-network/agent-eval 0.21.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/CHANGELOG.md +102 -1
  2. package/README.md +4 -0
  3. package/dist/{chunk-WOK2RTWG.js → chunk-4W4NCYM2.js} +134 -109
  4. package/dist/chunk-4W4NCYM2.js.map +1 -0
  5. package/dist/{chunk-WOPGKVN4.js → chunk-6KQG5HAH.js} +2 -2
  6. package/dist/chunk-6M774GY6.js +53 -0
  7. package/dist/chunk-6M774GY6.js.map +1 -0
  8. package/dist/{chunk-3IX6QTB7.js → chunk-IOXMGMHQ.js} +418 -541
  9. package/dist/chunk-IOXMGMHQ.js.map +1 -0
  10. package/dist/{chunk-3GN6U53I.js → chunk-KAO3Q65R.js} +2 -2
  11. package/dist/chunk-QUKKGHTZ.js +121 -0
  12. package/dist/chunk-QUKKGHTZ.js.map +1 -0
  13. package/dist/{chunk-SNUHRBDL.js → chunk-SQQLHODJ.js} +10 -1
  14. package/dist/{chunk-SNUHRBDL.js.map → chunk-SQQLHODJ.js.map} +1 -1
  15. package/dist/chunk-UAND2LOT.js +738 -0
  16. package/dist/chunk-UAND2LOT.js.map +1 -0
  17. package/dist/{chunk-HRZELXCR.js → chunk-USHQBPMH.js} +283 -7
  18. package/dist/chunk-USHQBPMH.js.map +1 -0
  19. package/dist/cli.js +3 -3
  20. package/dist/index.d.ts +10 -284
  21. package/dist/index.js +39 -19
  22. package/dist/index.js.map +1 -1
  23. package/dist/integrity-K2oVlF57.d.ts +210 -0
  24. package/dist/openapi.json +1 -1
  25. package/dist/optimization-UVDNKaO6.d.ts +574 -0
  26. package/dist/optimization.d.ts +6 -144
  27. package/dist/optimization.js +9 -2
  28. package/dist/reporting-B82RSv9C.d.ts +593 -0
  29. package/dist/reporting.d.ts +2 -2
  30. package/dist/reporting.js +15 -8
  31. package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-D4p7RlDu.d.ts} +381 -1
  32. package/dist/traces.d.ts +101 -181
  33. package/dist/traces.js +16 -5
  34. package/dist/wire/index.js +3 -3
  35. package/docs/research-report-methodology.md +19 -4
  36. package/docs/wire-protocol.md +1 -1
  37. package/package.json +2 -2
  38. package/dist/chunk-3IX6QTB7.js.map +0 -1
  39. package/dist/chunk-HRZELXCR.js.map +0 -1
  40. package/dist/chunk-KRR4VMH7.js +0 -423
  41. package/dist/chunk-KRR4VMH7.js.map +0 -1
  42. package/dist/chunk-WOK2RTWG.js.map +0 -1
  43. package/dist/reporting-Da2ihlcM.d.ts +0 -672
  44. /package/dist/{chunk-WOPGKVN4.js.map → chunk-6KQG5HAH.js.map} +0 -0
  45. /package/dist/{chunk-3GN6U53I.js.map → chunk-KAO3Q65R.js.map} +0 -0
package/dist/traces.d.ts CHANGED
@@ -2,6 +2,8 @@ import { T as TraceStore, L as LlmSpan, J as JudgeSpan, a as Run, F as FailureCl
2
2
  export { A as Artifact, B as BudgetLedgerEntry, g as BudgetSpec, i as EventFilter, E as EventKind, j as FAILURE_CLASSES, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, M as Message, d as RetrievalSpan, h as RunFilter, m as RunLayer, R as RunOutcome, n as RunStatus, e as SandboxSpan, S as Span, o as SpanBase, p as SpanFilter, b as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, f as TraceEvent, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-u47QaJ9G.js';
3
3
  import { a as RunCompleteHookContext, R as RunCompleteHook } from './emitter-B2XqDKFU.js';
4
4
  export { S as SpanHandle, T as TraceEmitter, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-B2XqDKFU.js';
5
+ import { d as RawProviderSink, c as RawProviderEvent } from './integrity-K2oVlF57.js';
6
+ export { F as FileSystemRawProviderSink, a as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, b as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, R as RawProviderDirection, e as RawProviderSinkFilter, f as RunIntegrityError, g as RunIntegrityExpectations, h as RunIntegrityIssue, i as RunIntegrityIssueCode, j as RunIntegrityReport, k as assertRunCaptured, l as defaultProviderRedactor, p as providerFromBaseUrl, t as throwIfRunIncomplete } from './integrity-K2oVlF57.js';
5
7
  import { AxAIService, AxFunction } from '@ax-llm/ax';
6
8
 
7
9
  /**
@@ -133,204 +135,122 @@ interface OtlpExport {
133
135
  declare function exportRunAsOtlp(store: TraceStore, runId: string, resourceAttrs?: Record<string, string | number | boolean>): Promise<OtlpExport>;
134
136
 
135
137
  /**
136
- * RawProviderSink — first-class persistence for the actual HTTP-level
137
- * request/response bodies of every LLM provider call.
138
+ * Replay-from-raw-events — turn every captured campaign run into a
139
+ * re-runnable artifact.
138
140
  *
139
- * Why this is a separate sink from the structured `LlmSpan`:
141
+ * The premise: 0.21 made `RawProviderSink` capture every provider HTTP
142
+ * envelope. 0.22's `runEvalCampaign` makes capture the default. Together
143
+ * they mean every past run is a complete fingerprint of what happened on
144
+ * the wire — and that fingerprint is enough to replay the run without
145
+ * burning new LLM cost.
140
146
  *
141
- * - `LlmSpan` records the *intent* — model name, messages, output text,
142
- * usage. It's what dashboards read; it's NOT enough for forensics.
143
- * - When a downstream consumer reports "the verifier used the wrong route"
144
- * or "tokens look right but reasoning was missing," the only way to
145
- * answer is the raw HTTP body. Span fields can lie (a proxy can echo
146
- * a different `model` value than what actually answered); the raw
147
- * response is ground truth.
147
+ * Three use cases this primitive enables:
148
148
  *
149
- * Default behaviour: opt-in. Pass `rawSink` to `LlmClientOptions` (or the
150
- * matrix runner / BuilderSession sets it up automatically) and every
151
- * request, response, and error is recorded — including retries, with the
152
- * attempt index attached so a flaky call's full event chain is recoverable.
149
+ * 1. **Post-hoc judging** — apply a new judge / rubric / scoring callback
150
+ * to last week's runs without re-calling any LLM. The cost of trying
151
+ * a new rubric drops from "another full sweep" to a CPU-bound replay.
152
+ * 2. **Determinism audits** — replay the same campaign and verify the
153
+ * raw responses match byte-for-byte. Any drift is a non-determinism
154
+ * bug (in the harness, the prompt builder, the sandbox, …).
155
+ * 3. **Free judge calibration** — run two judges on identical responses
156
+ * and measure inter-judge agreement without doubling LLM spend.
153
157
  *
154
- * Redaction is enforced at sink time. The default redactor strips
155
- * `Authorization`, `X-Api-Key`, `X-Auth-Token`, `Cookie` headers and any
156
- * payload field whose key matches `apiKey | api_key | bearer | password |
157
- * secret | token` (case-insensitive). Override via the sink constructor or
158
- * the per-call `redactor`. The `redactedFields` array on the persisted
159
- * event lets a reviewer see what was stripped without exposing the values.
158
+ * The interface is deliberately fetch-shaped. Inject `createReplayFetch`
159
+ * into `LlmClientOptions.fetch` and every `callLlm` transparently reads
160
+ * from the cache instead of calling the network. No new code path through
161
+ * the LLM client is needed; the cache hit is invisible to the runner.
160
162
  */
161
- type RawProviderDirection = 'request' | 'response' | 'error';
162
- interface RawProviderEvent {
163
- /** Stable id. Generated by the sink if omitted. */
164
- eventId: string;
165
- /** Trace context populated by `LlmClient` when the call is wrapped in a span. */
166
- runId?: string;
167
- spanId?: string;
168
- /**
169
- * Logical provider name. Free-form so callers can use whatever id matches
170
- * their topology (`'openai'`, `'anthropic'`, `'tangle-router'`, …). When
171
- * omitted, derived from `baseUrl` in `LlmClientOptions`.
172
- */
173
- provider: string;
174
- model: string;
175
- /** Endpoint path, e.g. `'/v1/chat/completions'`. */
176
- endpoint: string;
177
- /** Base URL used for the call (already-normalised — no trailing slash). */
178
- baseUrl: string;
179
- /** 0-indexed retry attempt. The first attempt is 0; a retried call gets 1, 2, … */
180
- attemptIndex: number;
181
- direction: RawProviderDirection;
182
- /** Unix ms. */
183
- timestamp: number;
184
- /** Wall-clock duration of the call leg. Set on `response` and `error` events; null on `request`. */
185
- durationMs?: number;
186
- statusCode?: number;
187
- requestHeaders?: Record<string, string>;
188
- requestBody?: unknown;
189
- responseHeaders?: Record<string, string>;
190
- responseBody?: unknown;
191
- /** Set on `direction: 'error'` events. */
192
- errorMessage?: string;
193
- /** Field paths the redactor stripped from this event ('header:Authorization', 'body.apiKey', …). */
194
- redactedFields: string[];
195
- }
196
- interface RawProviderSinkFilter {
197
- runId?: string;
198
- spanId?: string;
199
- direction?: RawProviderDirection;
200
- attemptIndex?: number;
163
+
164
+ declare class ReplayCacheMissError extends Error {
165
+ readonly url: string;
166
+ readonly requestKey: string;
167
+ constructor(url: string, requestKey: string, message?: string);
201
168
  }
202
- interface RawProviderSink {
203
- record(event: RawProviderEvent): Promise<void>;
204
- /** Optional listing — implementations that durably persist (file, db) should support this. */
205
- list?(filter?: RawProviderSinkFilter): Promise<RawProviderEvent[]>;
206
- /** Optional teardown for backed implementations. */
207
- close?(): Promise<void>;
169
+ interface ReplayCacheEntry {
170
+ request: RawProviderEvent;
171
+ response: RawProviderEvent;
208
172
  }
209
- type ProviderRedactor = (event: RawProviderEvent) => RawProviderEvent;
210
- /**
211
- * Default redactor — strips well-known auth headers and any body field whose
212
- * key matches the credential pattern. Records every redacted path on
213
- * `event.redactedFields` so a downstream reviewer can see what was removed.
214
- */
215
- declare function defaultProviderRedactor(event: RawProviderEvent): RawProviderEvent;
216
- interface InMemoryRawProviderSinkOptions {
217
- redactor?: ProviderRedactor;
218
- }
219
- declare class InMemoryRawProviderSink implements RawProviderSink {
220
- private events;
221
- private redactor;
222
- constructor(opts?: InMemoryRawProviderSinkOptions);
223
- record(event: RawProviderEvent): Promise<void>;
224
- list(filter?: RawProviderSinkFilter): Promise<RawProviderEvent[]>;
225
- size(): number;
226
- }
227
- declare class NoopRawProviderSink implements RawProviderSink {
228
- record(): Promise<void>;
229
- }
230
- interface FileSystemRawProviderSinkOptions {
231
- /** Directory the NDJSON file is written into. Created if missing. */
232
- dir: string;
233
- /** File name; default `'raw-provider-events.ndjson'`. */
234
- fileName?: string;
235
- /** Bytes after which the writer rolls over to a new file (default 32 MiB). */
236
- rollAtBytes?: number;
237
- redactor?: ProviderRedactor;
238
- }
239
- declare class FileSystemRawProviderSink implements RawProviderSink {
240
- private dir;
241
- private fileName;
242
- private rollAtBytes;
243
- private redactor;
244
- private bytesWritten;
245
- private rollIndex;
246
- private initPromise;
247
- constructor(opts: FileSystemRawProviderSinkOptions);
248
- private ensureInit;
249
- private currentPath;
250
- record(event: RawProviderEvent): Promise<void>;
251
- list(filter?: RawProviderSinkFilter): Promise<RawProviderEvent[]>;
173
+ interface ReplayCacheStats {
174
+ total: number;
175
+ byProvider: Record<string, number>;
176
+ byModel: Record<string, number>;
177
+ /** Spans for which we have a request but no response (run aborted mid-call). */
178
+ orphanRequests: number;
252
179
  }
253
180
  /**
254
- * Best-effort provider id from a base URL. Falls back to the URL host when
255
- * none of the well-known patterns match.
256
- */
257
- declare function providerFromBaseUrl(baseUrl: string): string;
258
-
259
- /**
260
- * Run-completion integrity check — at end of run, verify the expected event
261
- * types were actually captured. The point is the launch-review failure mode:
262
- * a run *appears* successful but the raw provider events were never written,
263
- * so a downstream reviewer can't reconstruct what happened.
181
+ * In-memory deterministic cache of (request → response) keyed on a stable
182
+ * hash of the request body. Built from a `RawProviderSink` containing
183
+ * paired `request` and `response` events from a previous run.
264
184
  *
265
- * Pattern:
266
- *
267
- * const report = await assertRunCaptured(store, runId, {
268
- * llmSpansMin: 1,
269
- * judgeSpansMin: 1,
270
- * rawSink: providerSink, // must have ≥ 1 event for this run
271
- * requireRawCoverageOfLlmSpans: true, // every llm span has matching raw events
272
- * })
273
- * if (!report.ok) throwIfRunIncomplete(report) // or mark run failed and continue
274
- *
275
- * The function is read-only on the store and returns a structured report;
276
- * the caller chooses the failure mode (throw, mark run failed, log warning).
277
- * `throwIfRunIncomplete` is the convenient strict mode.
185
+ * The cache is the source of truth for replay; `createReplayFetch` is a
186
+ * thin wrapper that reads from it.
278
187
  */
279
-
280
- interface RunIntegrityExpectations {
281
- /** Minimum LLM span count. Default 0 (no requirement). */
282
- llmSpansMin?: number;
283
- /** Minimum judge span count. Default 0. */
284
- judgeSpansMin?: number;
285
- /** Minimum tool span count. Default 0. */
286
- toolSpansMin?: number;
188
+ declare class ReplayCache {
189
+ private byKey;
190
+ private orphans;
191
+ private byProvider;
192
+ private byModel;
287
193
  /**
288
- * Raw provider sink to consult for capture verification. When present,
289
- * the check requires at least one raw event for the run.
194
+ * Build a cache from a sink's events. The sink must implement `list()`.
195
+ * Filter by `runId` / `spanId` to scope to a specific replay.
290
196
  */
291
- rawSink?: RawProviderSink;
292
- /** Minimum raw provider event count. Default 0; ignored when `rawSink` absent. */
293
- rawProviderEventsMin?: number;
197
+ static fromSink(sink: RawProviderSink, filter?: {
198
+ runId?: string;
199
+ spanId?: string;
200
+ }): Promise<ReplayCache>;
201
+ /** Build a cache from an in-memory event list. */
202
+ static fromEvents(events: RawProviderEvent[]): Promise<ReplayCache>;
203
+ /** Number of cacheable (request, response) pairs in the cache. */
204
+ size(): number;
205
+ stats(): ReplayCacheStats;
294
206
  /**
295
- * Every LLM span must have at least one matching raw `request` event
296
- * (matched by spanId). Catches the common bug where the structured span
297
- * was emitted but the raw HTTP capture was wired to a different sink.
207
+ * Look up a cached response by hashing the (model, messages, temperature,
208
+ * maxTokens, response_format) shape. Returns `undefined` on miss; the
209
+ * caller decides whether to throw, fall back to the network, or skip.
298
210
  */
299
- requireRawCoverageOfLlmSpans?: boolean;
300
- /** Run outcome must be set (not null/undefined). Default false. */
301
- requireOutcome?: boolean;
302
- }
303
- type RunIntegrityIssueCode = 'no_run' | 'missing_llm_spans' | 'missing_judge_spans' | 'missing_tool_spans' | 'missing_raw_events' | 'no_raw_sink' | 'orphan_llm_span' | 'missing_outcome';
304
- interface RunIntegrityIssue {
305
- code: RunIntegrityIssueCode;
306
- message: string;
307
- detail?: Record<string, unknown>;
308
- }
309
- interface RunIntegrityReport {
310
- ok: boolean;
311
- runId: string;
312
- llmSpanCount: number;
313
- judgeSpanCount: number;
314
- toolSpanCount: number;
315
- rawProviderEventCount: number;
211
+ lookup(requestBody: unknown): Promise<ReplayCacheEntry | undefined>;
212
+ }
213
+ interface ReplayFetchOptions {
316
214
  /**
317
- * Coverage of LLM spans by raw provider events keyed on spanId.
318
- * `total` is the number of LLM spans; `covered` is the count with at
319
- * least one matching `request` raw event.
215
+ * Behaviour on cache miss. Default `'throw'`. `'fallback'` calls the
216
+ * `fallbackFetch` (typically `globalThis.fetch`) so a partial replay can
217
+ * still complete; `'fail-closed'` returns a synthetic 599 response so the
218
+ * call site sees a non-retriable failure.
320
219
  */
321
- rawSpanCoverage: {
322
- covered: number;
323
- total: number;
324
- };
325
- issues: RunIntegrityIssue[];
326
- }
327
- declare class RunIntegrityError extends Error {
328
- readonly report: RunIntegrityReport;
329
- constructor(report: RunIntegrityReport);
220
+ onMiss?: 'throw' | 'fallback' | 'fail-closed';
221
+ fallbackFetch?: typeof fetch;
222
+ /** Optional callback fired once per replayed call (for telemetry / counters). */
223
+ onHit?: (info: {
224
+ url: string;
225
+ provider: string;
226
+ model: string;
227
+ }) => void;
228
+ /** Optional callback fired on cache miss before the `onMiss` policy applies. */
229
+ onMissNotify?: (info: {
230
+ url: string;
231
+ requestBody: unknown;
232
+ }) => void;
330
233
  }
331
- declare function assertRunCaptured(store: TraceStore, runId: string, expectations?: RunIntegrityExpectations): Promise<RunIntegrityReport>;
332
- /** Strict mode: throws `RunIntegrityError` when the report isn't ok. */
333
- declare function throwIfRunIncomplete(report: RunIntegrityReport): void;
234
+ /**
235
+ * Build a `fetch`-shaped function that serves cached responses out of a
236
+ * `ReplayCache` for any URL ending in `/chat/completions`. Pass through
237
+ * `LlmClientOptions.fetch` and `callLlm` becomes free.
238
+ *
239
+ * Non-`/chat/completions` URLs are passed straight to the fallback fetch
240
+ * (default: `globalThis.fetch`). This matters because non-LLM HTTP work
241
+ * (judge HTTP servers, sandbox callbacks) sometimes flows through the same
242
+ * `fetch` and shouldn't be intercepted.
243
+ */
244
+ declare function createReplayFetch(cache: ReplayCache, opts?: ReplayFetchOptions): typeof fetch;
245
+ /**
246
+ * Convenience iterator over `(request, response)` pairs in a sink — for
247
+ * post-hoc scoring that doesn't need a `fetch` shim. The judge or scorer
248
+ * runs purely in-process over cached LLM outputs.
249
+ */
250
+ declare function iterateRawCalls(sink: RawProviderSink, filter?: {
251
+ runId?: string;
252
+ spanId?: string;
253
+ }): AsyncGenerator<ReplayCacheEntry>;
334
254
 
335
255
  /**
336
256
  * Shared types for the trace-analyst module.
@@ -911,4 +831,4 @@ declare function scoreTraceInsightReadiness(context: TraceInsightContext): Trace
911
831
  declare function defaultTraceInsightPanel(): TraceInsightPanelRole[];
912
832
  declare function buildTraceInsightPrompt(input: TraceInsightPromptInput): string;
913
833
 
914
- export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, type DatasetOverview, FailureClass, FileSystemRawProviderSink, type FileSystemRawProviderSinkOptions, InMemoryRawProviderSink, type InMemoryRawProviderSinkOptions, JudgeSpan, LlmSpan, NoopRawProviderSink, OTEL_AGENT_EVAL_SCOPE, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, type ProviderRedactor, type QueryTracesPage, REDACTION_VERSION, type RawProviderDirection, type RawProviderEvent, type RawProviderSink, type RawProviderSinkFilter, type RedactionReport, type RedactionRule, Run, RunCompleteHook, RunCompleteHookContext, RunIntegrityError, type RunIntegrityExpectations, type RunIntegrityIssue, type RunIntegrityIssueCode, type RunIntegrityReport, type SearchSpanResult, type SearchTraceResult, type SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, ToolSpan, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystHookOptions, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, TraceStore, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, aggregateLlm, analyzeTraces, argHash, assertRunCaptured, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, defaultProviderRedactor, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, groupBy, 
inferDomainKeywords, judgeSpans, llmSpans, planTraceInsightQuestions, providerFromBaseUrl, redactString, redactValue, runFailureClass, runsForScenario, scoreTraceInsightReadiness, throwIfRunIncomplete, tokenizeDomainWords, toolSpans, traceAnalystFunctionGroup, traceAnalystOnRunComplete };
834
+ export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, type DatasetOverview, FailureClass, JudgeSpan, LlmSpan, OTEL_AGENT_EVAL_SCOPE, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, type QueryTracesPage, REDACTION_VERSION, RawProviderEvent, RawProviderSink, type RedactionReport, type RedactionRule, ReplayCache, type ReplayCacheEntry, ReplayCacheMissError, type ReplayCacheStats, type ReplayFetchOptions, Run, RunCompleteHook, RunCompleteHookContext, type SearchSpanResult, type SearchTraceResult, type SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, ToolSpan, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystHookOptions, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, TraceStore, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, aggregateLlm, analyzeTraces, argHash, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, groupBy, inferDomainKeywords, iterateRawCalls, judgeSpans, llmSpans, planTraceInsightQuestions, redactString, redactValue, runFailureClass, runsForScenario, scoreTraceInsightReadiness, tokenizeDomainWords, toolSpans, traceAnalystFunctionGroup, traceAnalystOnRunComplete };
package/dist/traces.js CHANGED
@@ -7,7 +7,8 @@ import {
7
7
  OTEL_AGENT_EVAL_SCOPE,
8
8
  OtlpFileTraceStore,
9
9
  REDACTION_VERSION,
10
- RunIntegrityError,
10
+ ReplayCache,
11
+ ReplayCacheMissError,
11
12
  SpanNotFoundError,
12
13
  TRACE_ANALYST_ACTOR_DESCRIPTION,
13
14
  TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
@@ -19,10 +20,10 @@ import {
19
20
  aggregateLlm,
20
21
  analyzeTraces,
21
22
  argHash,
22
- assertRunCaptured,
23
23
  buildTraceAnalystTools,
24
24
  buildTraceInsightContext,
25
25
  buildTraceInsightPrompt,
26
+ createReplayFetch,
26
27
  defaultTraceInsightPanel,
27
28
  describeTraceInsightScope,
28
29
  domainEvidencePattern,
@@ -34,6 +35,7 @@ import {
34
35
  isRetrievalSpan,
35
36
  isSandboxSpan,
36
37
  isToolSpan,
38
+ iterateRawCalls,
37
39
  judgeSpans,
38
40
  llmSpans,
39
41
  planTraceInsightQuestions,
@@ -42,23 +44,28 @@ import {
42
44
  runFailureClass,
43
45
  runsForScenario,
44
46
  scoreTraceInsightReadiness,
45
- throwIfRunIncomplete,
46
47
  tokenizeDomainWords,
47
48
  toolSpans,
48
49
  traceAnalystFunctionGroup,
49
50
  traceAnalystOnRunComplete
50
- } from "./chunk-WOK2RTWG.js";
51
+ } from "./chunk-4W4NCYM2.js";
52
+ import {
53
+ RunIntegrityError,
54
+ assertRunCaptured,
55
+ throwIfRunIncomplete
56
+ } from "./chunk-QUKKGHTZ.js";
51
57
  import {
52
58
  TraceEmitter,
53
59
  llmSpanFromProvider
54
60
  } from "./chunk-5IIQKMD5.js";
61
+ import "./chunk-6M774GY6.js";
55
62
  import {
56
63
  FileSystemRawProviderSink,
57
64
  InMemoryRawProviderSink,
58
65
  NoopRawProviderSink,
59
66
  defaultProviderRedactor,
60
67
  providerFromBaseUrl
61
- } from "./chunk-SNUHRBDL.js";
68
+ } from "./chunk-SQQLHODJ.js";
62
69
  import "./chunk-PZ5AY32C.js";
63
70
  export {
64
71
  DEFAULT_REDACTION_RULES,
@@ -72,6 +79,8 @@ export {
72
79
  OTEL_AGENT_EVAL_SCOPE,
73
80
  OtlpFileTraceStore,
74
81
  REDACTION_VERSION,
82
+ ReplayCache,
83
+ ReplayCacheMissError,
75
84
  RunIntegrityError,
76
85
  SpanNotFoundError,
77
86
  TRACE_ANALYST_ACTOR_DESCRIPTION,
@@ -89,6 +98,7 @@ export {
89
98
  buildTraceAnalystTools,
90
99
  buildTraceInsightContext,
91
100
  buildTraceInsightPrompt,
101
+ createReplayFetch,
92
102
  defaultProviderRedactor,
93
103
  defaultTraceInsightPanel,
94
104
  describeTraceInsightScope,
@@ -101,6 +111,7 @@ export {
101
111
  isRetrievalSpan,
102
112
  isSandboxSpan,
103
113
  isToolSpan,
114
+ iterateRawCalls,
104
115
  judgeSpans,
105
116
  llmSpanFromProvider,
106
117
  llmSpans,
@@ -24,9 +24,9 @@ import {
24
24
  runRpcBatch,
25
25
  runRpcOnce,
26
26
  startServer
27
- } from "../chunk-WOPGKVN4.js";
28
- import "../chunk-3GN6U53I.js";
29
- import "../chunk-SNUHRBDL.js";
27
+ } from "../chunk-6KQG5HAH.js";
28
+ import "../chunk-KAO3Q65R.js";
29
+ import "../chunk-SQQLHODJ.js";
30
30
  import "../chunk-PZ5AY32C.js";
31
31
  export {
32
32
  BUILTIN_RUBRICS,
@@ -113,15 +113,30 @@ risks list and the executive summary. Treat them as descriptive only.
113
113
  - **Unpaired Mann–Whitney.** Rejected: matched scenarios make pairing free,
114
114
  and unpaired tests throw away the variance reduction. Use the paired test
115
115
  by default.
116
- - **Sequential / always-valid inference (e-values, mSPRT, alpha-spending).**
117
- Out of scope for a single-look report. If users iterate, wrap this report
118
- in an alpha-spending schedule, or commit to one preregistered look.
116
+ - **Sequential / always-valid inference (e-values, alpha-spending).**
117
+ **Shipped in 0.22.** `pairedEvalueSequence` and
118
+ `evaluateInterimReleaseConfidence` provide time-uniform inference using
119
+ the predictable plug-in betting martingale (Waudby-Smith & Ramdas 2024)
120
+ paired with the empirical Bernstein confidence sequence (Howard et al.
121
+ 2021). For *rolling* analyses (interim looks at a campaign that's still
122
+ accumulating data) call those primitives directly; `researchReport`
123
+ remains the single-look summary. Paper-grade pre-registration covers the
124
+ static analysis; the sequential primitives cover the iterative one.
119
125
  - **Hierarchical Bayesian shrinkage across many candidates.** Future work.
120
126
  The current ranking is on raw paired statistics and over-credits the top
121
- candidate when many are tested.
127
+ candidate when many are tested. A Bayesian hierarchical model with a
128
+ weakly informative prior would shrink each variant toward the grand mean,
129
+ reducing rank flips between near-tied candidates.
122
130
  - **Calibration / coverage simulation on the bootstrap CI.** Future work; we
123
131
  rely on the asymptotic guarantee plus the hard pair floor to keep coverage
124
132
  reasonable.
133
+ - **Outcome-anchored calibration.** **Shipped in 0.22.**
134
+ `rubricPredictiveValidity` joins `RunRecord`s to a `DeploymentOutcomeStore`
135
+ and reports per-rubric Spearman against deployment outcomes (revenue,
136
+ retention, CSAT, …). Combined with the static methodology in this
137
+ document, the loop is: pre-register → measure with `researchReport` →
138
+ ship → observe outcomes → recalibrate rubric weights with
139
+ `rubricPredictiveValidity`.
125
140
 
126
141
  ## When NOT to apply
127
142
 
@@ -188,7 +188,7 @@ Each invocation is one process — Node startup adds ~500 ms. For more than a fe
188
188
  4. **RPC case** — add `case 'x':` in `dispatchRpc` in `src/wire/rpc.ts`.
189
189
  5. **OpenAPI route** — register in `src/wire/openapi.ts` so it shows up in the spec.
190
190
  6. **Test** — add to `tests/wire/`. At minimum: schema validation, happy-path, error-path.
191
- 7. **Python client** — add a method on `Client` in `clients/python/src/tangle_agent_eval/client.py`, plus pydantic models in `models.py` mirroring the new schemas.
191
+ 7. **Python client** — add a method on `Client` in `clients/python/src/agent_eval_rpc/client.py`, plus pydantic models in `models.py` mirroring the new schemas.
192
192
 
193
193
  The pattern is mechanical. When the surface grows past ~10 methods, swap the hand-written Python models for `datamodel-code-generator -i openapi.json -o models.py`.
194
194
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tangle-network/agent-eval",
3
- "version": "0.21.0",
3
+ "version": "0.22.0",
4
4
  "description": "Trace-first evaluation infrastructure for agent systems: traces, harnesses, verifier pipelines, judges, datasets, gates, optimization, and reporting.",
5
5
  "homepage": "https://github.com/tangle-network/agent-eval#readme",
6
6
  "repository": {
@@ -79,7 +79,7 @@
79
79
  "@ax-llm/ax": "^19.0.25",
80
80
  "@hono/node-server": "^2.0.0",
81
81
  "@tangle-network/tcloud": "^0.4.6",
82
- "hono": "^4.12.15",
82
+ "hono": "^4.12.16",
83
83
  "zod": "^4.3.6"
84
84
  },
85
85
  "devDependencies": {