@tangle-network/agent-eval 0.21.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/CHANGELOG.md +102 -1
  2. package/README.md +4 -0
  3. package/dist/{chunk-WOK2RTWG.js → chunk-4W4NCYM2.js} +134 -109
  4. package/dist/chunk-4W4NCYM2.js.map +1 -0
  5. package/dist/{chunk-WOPGKVN4.js → chunk-6KQG5HAH.js} +2 -2
  6. package/dist/chunk-6M774GY6.js +53 -0
  7. package/dist/chunk-6M774GY6.js.map +1 -0
  8. package/dist/{chunk-3IX6QTB7.js → chunk-IOXMGMHQ.js} +418 -541
  9. package/dist/chunk-IOXMGMHQ.js.map +1 -0
  10. package/dist/{chunk-3GN6U53I.js → chunk-KAO3Q65R.js} +2 -2
  11. package/dist/chunk-QUKKGHTZ.js +121 -0
  12. package/dist/chunk-QUKKGHTZ.js.map +1 -0
  13. package/dist/{chunk-SNUHRBDL.js → chunk-SQQLHODJ.js} +10 -1
  14. package/dist/{chunk-SNUHRBDL.js.map → chunk-SQQLHODJ.js.map} +1 -1
  15. package/dist/chunk-UAND2LOT.js +738 -0
  16. package/dist/chunk-UAND2LOT.js.map +1 -0
  17. package/dist/{chunk-HRZELXCR.js → chunk-USHQBPMH.js} +283 -7
  18. package/dist/chunk-USHQBPMH.js.map +1 -0
  19. package/dist/cli.js +3 -3
  20. package/dist/index.d.ts +10 -284
  21. package/dist/index.js +39 -19
  22. package/dist/index.js.map +1 -1
  23. package/dist/integrity-K2oVlF57.d.ts +210 -0
  24. package/dist/openapi.json +1 -1
  25. package/dist/optimization-UVDNKaO6.d.ts +574 -0
  26. package/dist/optimization.d.ts +6 -144
  27. package/dist/optimization.js +9 -2
  28. package/dist/reporting-B82RSv9C.d.ts +593 -0
  29. package/dist/reporting.d.ts +2 -2
  30. package/dist/reporting.js +15 -8
  31. package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-D4p7RlDu.d.ts} +381 -1
  32. package/dist/traces.d.ts +101 -181
  33. package/dist/traces.js +16 -5
  34. package/dist/wire/index.js +3 -3
  35. package/docs/research-report-methodology.md +19 -4
  36. package/docs/wire-protocol.md +1 -1
  37. package/package.json +2 -2
  38. package/dist/chunk-3IX6QTB7.js.map +0 -1
  39. package/dist/chunk-HRZELXCR.js.map +0 -1
  40. package/dist/chunk-KRR4VMH7.js +0 -423
  41. package/dist/chunk-KRR4VMH7.js.map +0 -1
  42. package/dist/chunk-WOK2RTWG.js.map +0 -1
  43. package/dist/reporting-Da2ihlcM.d.ts +0 -672
  44. /package/dist/{chunk-WOPGKVN4.js.map → chunk-6KQG5HAH.js.map} +0 -0
  45. /package/dist/{chunk-3GN6U53I.js.map → chunk-KAO3Q65R.js.map} +0 -0
package/CHANGELOG.md CHANGED
@@ -1,5 +1,104 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.22.0 — EvalCampaign + replay + always-valid + outcome calibration
4
+
5
+ 0.21 shipped the four capture-integrity primitives as opt-in. Every consumer still had to wire them by hand, and the bug class blueprint-agent reported (forgotten wiring → silent partial-capture) reappears the moment a new consumer adopts agent-eval cold. **0.22 makes the right thing the default path** — and adds three primitives that compound on top of standardized capture: replay-from-raw-events, anytime-valid sequential evaluation, and rubric predictive validity. The four new primitives — `runEvalCampaign` plus these three — together turn agent-eval from a TS framework into research-grade evaluation infrastructure.
6
+
7
+ ### Added
8
+
9
+ #### `runEvalCampaign` — capture integrity by construction
10
+
11
+ Opinionated matrix runner that wires the four directives by construction. Inputs: variants, scenarios, seeds, an `LlmClientOptions`, factories for `TraceStore` and `RawProviderSink`, and a `runner(ctx)` callback. Outputs: per-cell `RunRecord[]`, `RunIntegrityReport[]`, optional `researchReport`, and a campaign fingerprint.
12
+
13
+ - **Preflight:** `assertLlmRoute` is called once before any work, with `{ requireExplicitBaseUrl: true, requireAuth: true }` defaults. Misconfigured routes never burn a run.
14
+ - **Per run:** the campaign constructs the `TraceStore`, `RawProviderSink`, and `TraceEmitter` (with `onRunComplete` hooks attached), then hands the runner an `LlmClientOptions` already pre-wired with `rawSink` + `traceContext`. The runner cannot accidentally call an LLM without capture.
15
+ - **Run-completion:** `assertRunCaptured` runs after every `endRun` with `{ llmSpansMin: 1, requireRawCoverageOfLlmSpans: true, requireOutcome: true }` defaults. Failures are routed via `onIntegrityFailure: 'throw' | 'mark_failed' | 'log'` (default `'mark_failed'`).
16
+ - **End of campaign:** if `report.comparator` is set, computes `researchReport` over the collected `RunRecord`s and embeds the campaign fingerprint + `preregistrationHash`.
17
+ - **Concurrency:** local async worker pool, default 1, configurable via `concurrency`.
18
+ - **Determinism:** the default `runId` generator is a stable hash of `(campaignId, variantId, scenarioId, seed)`, so re-running the same campaign produces the same ids; override `runId` for non-deterministic generation.
19
+
20
+ Exported from the root barrel and the `@tangle-network/agent-eval/optimization` subpath: `runEvalCampaign`, `CampaignRunner`, `CampaignRunContext`, `CampaignRunOutcome`, `CampaignVariant`, `CampaignScenario`, `EvalCampaignOptions`, `EvalCampaignResult`, `FailedRun`, `CampaignIntegrityPolicy`, `CampaignFactoryParams`.
21
+
22
+ #### Replay-from-raw-events
23
+
24
+ Every campaign run is now a re-runnable artifact. `ReplayCache.fromSink(sink)` turns a populated `RawProviderSink` into a deterministic `(canonicalised request → cached response)` map; `createReplayFetch(cache)` returns a `fetch`-shaped function that satisfies `/chat/completions` calls out of the cache and passes other URLs through.
25
+
26
+ ```ts
27
+ const cache = await ReplayCache.fromSink(yesterdayRawSink)
28
+ const replayFetch = createReplayFetch(cache, { onMiss: 'fail-closed' })
29
+ await callLlm(req, { ...llmOpts, fetch: replayFetch }) // zero LLM cost
30
+ ```
31
+
32
+ Use cases:
33
+
34
+ - Post-hoc judging — apply a new judge or scorer to last week's runs without burning a single token.
35
+ - Determinism audits — replay a campaign and verify the responses match byte-for-byte.
36
+ - Free judge calibration — run two judges on identical responses and measure agreement.
37
+
38
+ `onMiss` is `'throw' | 'fallback' | 'fail-closed'` (default `'throw'`). The cache hashes a canonical projection (`model + messages + temperature + max_tokens|max_completion_tokens + response_format`) so insertion-order quirks don't cause spurious misses.
39
+
40
+ Exported from root and `@tangle-network/agent-eval/traces`: `ReplayCache`, `createReplayFetch`, `iterateRawCalls`, `ReplayCacheEntry`, `ReplayCacheStats`, `ReplayFetchOptions`, `ReplayCacheMissError`.
41
+
42
+ #### Always-valid sequential evaluation
43
+
44
+ `pairedEvalueSequence(deltas, opts)` and `evaluateInterimReleaseConfidence({ deltaSeries })` ship the predictable plug-in betting martingale of Waudby-Smith & Ramdas (2024) for paired bounded outcomes, plus the empirical Bernstein confidence sequence of Howard et al. (2021) for the running mean. Both are *anytime-valid* — type-I error is bounded by α at every stopping time, no peeking penalty.
45
+
46
+ ```ts
47
+ const verdict = evaluateInterimReleaseConfidence({
48
+ deltaSeries: [{ candidateId: 'cand', deltas }],
49
+ alpha: 0.05,
50
+ rope: { low: -0.02, high: 0.02 },
51
+ })
52
+ // → { recommendation: { decision: 'promote_now' | 'continue' | 'reject_now' | 'equivalent', candidateId } }
53
+ ```
54
+
55
+ This closes the methodological hole flagged in the 0.21 methodology doc as out of scope. Consumers running rolling campaigns can now ship the moment evidence is decisive, stop early on dead-on-arrival variants, and accumulate evidence across partial runs without spending the FDR budget. Tested under the null at α=0.05 on 100 synthetic series; false-rejection rate stays below the bound.
56
+
57
+ Exported from root and `@tangle-network/agent-eval/reporting`: `pairedEvalueSequence`, `evaluateInterimReleaseConfidence`, `PairedEvalueOptions`, `PairedEvalueSequence`, `PairedEvalueStep`, `InterimReleaseConfidence`, `InterimReleaseConfidenceInput`, `SequentialDecision`.
58
+
59
+ #### Rubric predictive validity
60
+
61
+ `rubricPredictiveValidity({ runs, outcomes, outcomeMetrics })` joins canonical campaign `RunRecord`s to a `DeploymentOutcomeStore` and reports per-rubric Pearson + Spearman + bootstrap CI against each outcome metric. Verdict bucketing: `'load_bearing' | 'informative' | 'decorative'` based on `|spearman|`. **Without this loop every rubric is faith-based;** with it, you know which rubrics earn their promotion power and which are decoration.
62
+
63
+ ```ts
64
+ const validity = await rubricPredictiveValidity({
65
+ runs: lastQuarterRuns,
66
+ outcomes: shipFlagOutcomeStore,
67
+ outcomeMetrics: ['revenue_lift', 'retention_30d', 'csat'],
68
+ })
69
+ for (const r of validity.ranked) {
70
+ console.log(`${r.rubric} → ${r.bestOutcome}: ρ=${r.spearman.toFixed(2)} (${r.verdict})`)
71
+ }
72
+ ```
73
+
74
+ Builds on the existing `correlationStudy` primitive but works directly off `RunRecord` (the canonical campaign artifact) rather than `Run` from a `TraceStore`, so it composes cleanly with `runEvalCampaign`'s output. Returns a per-rubric ranking + every (rubric, outcome) pair tested + a list of rubrics that produced no usable data.
75
+
76
+ Exported from root and `@tangle-network/agent-eval/reporting`: `rubricPredictiveValidity`, `RubricOutcomePair`, `RubricRanking`, `RubricPredictiveValidityInput`, `RubricPredictiveValidityReport`. The existing `correlationStudy`, `OutcomeStore`, `InMemoryOutcomeStore`, `FileSystemOutcomeStore` continue to work unchanged.
77
+
78
+ #### `NoopRawProviderSink.list()` returns `[]`
79
+
80
+ Explicit opt-out from capture is no longer flagged by `assertRunCaptured` as `no_raw_sink`. Opt-out remains a deliberate choice; the campaign still requires the matching integrity overrides.
81
+
82
+ ### Why
83
+
84
+ Every consumer that adopted agent-eval before 0.22 wrote their own matrix runner, and every one of them re-introduced the same forgettable wiring (raw sink, route guard, integrity assertion, analyst hook). 0.21 documented the pattern; 0.22 owns it. The four new primitives compound:
85
+
86
+ - `runEvalCampaign` standardises the artifact (`RunRecord` + raw events + fingerprint).
87
+ - Replay turns every past run into free training/validation data for new judges.
88
+ - Sequential evaluation makes "ship-when-evidence-says-so" mathematically defensible.
89
+ - Predictive validity converts evals from belief-based to outcome-anchored.
90
+
91
+ `runMultiShotOptimization` remains the right primitive for trajectory-shaped GEPA optimization sweeps; `runPromptEvolution` for prompt + code evolution loops with sandbox pools; `runEvalCampaign` for the "compare N variants on M scenarios with K seeds and tell me which to ship" case that makes up the bulk of consumer evals.
92
+
93
+ ### References
94
+
95
+ - Howard, S. R., Ramdas, A., McAuliffe, J., Sekhon, J. (2021). Time-uniform, nonparametric, nonasymptotic confidence sequences. *Annals of Statistics*, 49(2), 1055–1080.
96
+ - Waudby-Smith, I., Ramdas, A. (2024). Estimating means of bounded random variables by betting. *JRSS B*, 86(1), 1–27.
97
+
98
+ ### Migration
99
+
100
+ Existing consumers do not need to change. All four primitives are additive. Recommended path: on the next eval-runner refactor, replace hand-rolled matrix loops with `runEvalCampaign`. Use `evaluateInterimReleaseConfidence` for any campaign you run on a recurring cadence. Wire `rubricPredictiveValidity` once you have ≥ 30 deployment outcomes joinable by `runId`. Replay is a free win — once campaigns are running, every eval R&D loop drops to CPU-bound.
101
+
3
102
  ## 0.21.0 — capture integrity + launch-grade reporting
4
103
 
5
104
  This release closes the layer-1 gap a downstream consumer surfaced: better
@@ -74,7 +173,9 @@ surface.
74
173
 
75
174
  ### Python client
76
175
 
77
- Locked at `tangle-agent-eval==0.21.0` to match the npm package.
176
+ The PyPI distribution was renamed from `tangle-agent-eval` to **`agent-eval-rpc`**, and the import path from `tangle_agent_eval` to `agent_eval_rpc`. The new name accurately describes the package — it is a thin RPC client over the Node runtime, not a Python re-implementation of the eval logic — and the npm scope (`@tangle-network/agent-eval`) already provides the namespacing the `tangle-` prefix was substituting for. No prior PyPI version ever shipped under the old name (Trusted Publisher misconfiguration; see issue #40), so this rename is a clean first publish rather than a migration.
177
+
178
+ Locked at `agent-eval-rpc==0.21.0` to match the npm package.
78
179
 
79
180
  ## 0.20.10 — hardening audit follow-up
80
181
 
package/README.md CHANGED
@@ -116,6 +116,10 @@ import { renderReleaseReport } from '@tangle-network/agent-eval/reporting'
116
116
  | Fail loud if an eval would silently use the wrong route | `assertLlmRoute` |
117
117
  | Assert at run-end that the artifact is complete | `assertRunCaptured`, `throwIfRunIncomplete` |
118
118
  | Auto-execute the trace analyst on every run | `traceAnalystOnRunComplete` + `TraceEmitterOptions.onRunComplete` |
119
+ | Run a matrix of variants × scenarios × seeds with capture integrity by construction | `runEvalCampaign` |
120
+ | Re-judge / determinism-audit a past campaign for free | `ReplayCache`, `createReplayFetch` |
121
+ | Ship-when-decisive with anytime-valid α across rolling looks | `pairedEvalueSequence`, `evaluateInterimReleaseConfidence` |
122
+ | Tell load-bearing rubrics from decorative ones using deployment outcomes | `rubricPredictiveValidity` |
119
123
  | Model missing context separately from bad reasoning | `KnowledgeRequirement`, `KnowledgeBundle` |
120
124
 
121
125
  ### Capture integrity (0.21+)
@@ -1,3 +1,8 @@
1
+ import {
2
+ canonicalize,
3
+ hashJson
4
+ } from "./chunk-6M774GY6.js";
5
+
1
6
  // src/trace/store.ts
2
7
  var InMemoryTraceStore = class {
3
8
  runs = /* @__PURE__ */ new Map();
@@ -497,119 +502,138 @@ function runToTraceId(run) {
497
502
  return cleaned.slice(0, 32).padEnd(32, "0");
498
503
  }
499
504
 
500
- // src/trace/integrity.ts
501
- var RunIntegrityError = class extends Error {
502
- constructor(report) {
503
- super(
504
- `Run ${report.runId} failed integrity check: ${report.issues.map((i) => i.code).join(", ")}`
505
- );
506
- this.report = report;
507
- this.name = "RunIntegrityError";
508
- }
509
- report;
505
+ // src/replay.ts
506
+ var ReplayCacheMissError = class extends Error {
507
+ constructor(url, requestKey2, message) {
508
+ super(message ?? `replay cache miss for ${url} (key=${requestKey2})`);
509
+ this.url = url;
510
+ this.requestKey = requestKey2;
511
+ this.name = "ReplayCacheMissError";
512
+ }
513
+ url;
514
+ requestKey;
510
515
  };
511
- async function assertRunCaptured(store, runId, expectations = {}) {
512
- const issues = [];
513
- const run = await store.getRun(runId);
514
- if (!run) {
515
- return {
516
- ok: false,
517
- runId,
518
- llmSpanCount: 0,
519
- judgeSpanCount: 0,
520
- toolSpanCount: 0,
521
- rawProviderEventCount: 0,
522
- rawSpanCoverage: { covered: 0, total: 0 },
523
- issues: [{ code: "no_run", message: `Run ${runId} not found in store.` }]
524
- };
516
+ var ReplayCache = class _ReplayCache {
517
+ byKey = /* @__PURE__ */ new Map();
518
+ orphans = 0;
519
+ byProvider = {};
520
+ byModel = {};
521
+ /**
522
+ * Build a cache from a sink's events. The sink must implement `list()`.
523
+ * Filter by `runId` / `spanId` to scope to a specific replay.
524
+ */
525
+ static async fromSink(sink, filter = {}) {
526
+ if (!sink.list) {
527
+ throw new Error("ReplayCache.fromSink: sink must implement list() to be replayable.");
528
+ }
529
+ const events = await sink.list(filter);
530
+ return _ReplayCache.fromEvents(events);
531
+ }
532
+ /** Build a cache from an in-memory event list. */
533
+ static async fromEvents(events) {
534
+ const cache = new _ReplayCache();
535
+ const groups = /* @__PURE__ */ new Map();
536
+ for (const e of events) {
537
+ const k = `${e.runId ?? ""}::${e.spanId ?? ""}::${e.attemptIndex}`;
538
+ const g = groups.get(k) ?? {};
539
+ if (e.direction === "request") g.req = e;
540
+ else g.res = e;
541
+ groups.set(k, g);
542
+ }
543
+ for (const g of groups.values()) {
544
+ if (!g.req) continue;
545
+ if (!g.res) {
546
+ cache.orphans += 1;
547
+ continue;
548
+ }
549
+ const key = await requestKey(g.req);
550
+ cache.byKey.set(key, { request: g.req, response: g.res });
551
+ cache.byProvider[g.req.provider] = (cache.byProvider[g.req.provider] ?? 0) + 1;
552
+ cache.byModel[g.req.model] = (cache.byModel[g.req.model] ?? 0) + 1;
553
+ }
554
+ return cache;
525
555
  }
526
- const spans = await store.spans({ runId });
527
- const llmSpans2 = spans.filter((s) => s.kind === "llm");
528
- const judgeSpans2 = spans.filter((s) => s.kind === "judge");
529
- const toolSpans2 = spans.filter((s) => s.kind === "tool");
530
- const llmMin = expectations.llmSpansMin ?? 0;
531
- const judgeMin = expectations.judgeSpansMin ?? 0;
532
- const toolMin = expectations.toolSpansMin ?? 0;
533
- if (llmSpans2.length < llmMin) {
534
- issues.push({
535
- code: "missing_llm_spans",
536
- message: `Expected \u2265 ${llmMin} LLM spans, found ${llmSpans2.length}.`,
537
- detail: { expected: llmMin, found: llmSpans2.length }
538
- });
556
+ /** Number of cacheable (request, response) pairs in the cache. */
557
+ size() {
558
+ return this.byKey.size;
539
559
  }
540
- if (judgeSpans2.length < judgeMin) {
541
- issues.push({
542
- code: "missing_judge_spans",
543
- message: `Expected \u2265 ${judgeMin} judge spans, found ${judgeSpans2.length}.`,
544
- detail: { expected: judgeMin, found: judgeSpans2.length }
545
- });
560
+ stats() {
561
+ return {
562
+ total: this.byKey.size,
563
+ byProvider: { ...this.byProvider },
564
+ byModel: { ...this.byModel },
565
+ orphanRequests: this.orphans
566
+ };
546
567
  }
547
- if (toolSpans2.length < toolMin) {
548
- issues.push({
549
- code: "missing_tool_spans",
550
- message: `Expected \u2265 ${toolMin} tool spans, found ${toolSpans2.length}.`,
551
- detail: { expected: toolMin, found: toolSpans2.length }
552
- });
568
+ /**
569
+ * Look up a cached response by hashing the (model, messages, temperature,
570
+ * maxTokens, response_format) shape. Returns `undefined` on miss; the
571
+ * caller decides whether to throw, fall back to the network, or skip.
572
+ */
573
+ async lookup(requestBody) {
574
+ const key = await keyFromBody(requestBody);
575
+ return this.byKey.get(key);
553
576
  }
554
- let rawEventCount = 0;
555
- let coverage = { covered: 0, total: llmSpans2.length };
556
- if (expectations.rawSink) {
557
- if (!expectations.rawSink.list) {
558
- issues.push({
559
- code: "no_raw_sink",
560
- message: "Provided rawSink does not implement list(); cannot verify capture."
561
- });
562
- } else {
563
- const events = await expectations.rawSink.list({ runId });
564
- rawEventCount = events.length;
565
- const rawMin = expectations.rawProviderEventsMin ?? 1;
566
- if (rawEventCount < rawMin) {
567
- issues.push({
568
- code: "missing_raw_events",
569
- message: `Expected \u2265 ${rawMin} raw provider events, found ${rawEventCount}.`,
570
- detail: { expected: rawMin, found: rawEventCount }
571
- });
572
- }
573
- if (expectations.requireRawCoverageOfLlmSpans) {
574
- const requestEventsBySpan = new Set(
575
- events.filter((e) => e.direction === "request" && e.spanId).map((e) => e.spanId)
576
- );
577
- const orphaned = llmSpans2.filter((s) => !requestEventsBySpan.has(s.spanId));
578
- coverage = { covered: llmSpans2.length - orphaned.length, total: llmSpans2.length };
579
- if (orphaned.length > 0) {
580
- issues.push({
581
- code: "orphan_llm_span",
582
- message: `${orphaned.length} LLM span(s) have no matching raw provider request event.`,
583
- detail: { orphanedSpanIds: orphaned.map((s) => s.spanId) }
584
- });
585
- }
577
+ };
578
+ function createReplayFetch(cache, opts = {}) {
579
+ const onMiss = opts.onMiss ?? "throw";
580
+ const fallback = opts.fallbackFetch ?? globalThis.fetch?.bind(globalThis);
581
+ return (async (input, init) => {
582
+ const url = typeof input === "string" ? input : input instanceof URL ? input.toString() : input.url;
583
+ if (!/\/chat\/completions(?:[?#].*)?$/.test(url)) {
584
+ if (!fallback) throw new Error(`replay fetch: non-completions URL ${url} but no fallbackFetch configured`);
585
+ return fallback(input, init);
586
+ }
587
+ let bodyParsed;
588
+ if (init?.body && typeof init.body === "string") {
589
+ try {
590
+ bodyParsed = JSON.parse(init.body);
591
+ } catch {
586
592
  }
587
593
  }
588
- } else if (expectations.requireRawCoverageOfLlmSpans || expectations.rawProviderEventsMin) {
589
- issues.push({
590
- code: "no_raw_sink",
591
- message: "Raw coverage required but no rawSink supplied to the integrity check."
592
- });
593
- }
594
- if (expectations.requireOutcome && (run.outcome === void 0 || run.outcome === null)) {
595
- issues.push({
596
- code: "missing_outcome",
597
- message: `Run ${runId} has no outcome recorded.`
598
- });
599
- }
600
- return {
601
- ok: issues.length === 0,
602
- runId,
603
- llmSpanCount: llmSpans2.length,
604
- judgeSpanCount: judgeSpans2.length,
605
- toolSpanCount: toolSpans2.length,
606
- rawProviderEventCount: rawEventCount,
607
- rawSpanCoverage: coverage,
608
- issues
609
- };
594
+ const hit = bodyParsed === void 0 ? void 0 : await cache.lookup(bodyParsed);
595
+ if (hit) {
596
+ opts.onHit?.({ url, provider: hit.request.provider, model: hit.request.model });
597
+ const status = hit.response.statusCode ?? 200;
598
+ const headers = new Headers(Object.entries(hit.response.responseHeaders ?? { "Content-Type": "application/json" }));
599
+ const bodyText = typeof hit.response.responseBody === "string" ? hit.response.responseBody : JSON.stringify(hit.response.responseBody ?? {});
600
+ return new Response(bodyText, { status, headers });
601
+ }
602
+ opts.onMissNotify?.({ url, requestBody: bodyParsed });
603
+ if (onMiss === "throw") {
604
+ const key = bodyParsed === void 0 ? "<unparseable>" : await keyFromBody(bodyParsed);
605
+ throw new ReplayCacheMissError(url, key);
606
+ }
607
+ if (onMiss === "fail-closed") {
608
+ return new Response(JSON.stringify({ error: "replay_cache_miss" }), { status: 599 });
609
+ }
610
+ if (!fallback) throw new Error("replay fetch: onMiss=fallback but no fallbackFetch configured");
611
+ return fallback(input, init);
612
+ });
610
613
  }
611
- function throwIfRunIncomplete(report) {
612
- if (!report.ok) throw new RunIntegrityError(report);
614
+ async function* iterateRawCalls(sink, filter = {}) {
615
+ if (!sink.list) {
616
+ throw new Error("iterateRawCalls: sink must implement list().");
617
+ }
618
+ const events = await sink.list(filter);
619
+ const cache = await ReplayCache.fromEvents(events);
620
+ for (const entry of cache["byKey"].values()) yield entry;
621
+ }
622
+ async function requestKey(event) {
623
+ return keyFromBody(event.requestBody);
624
+ }
625
+ async function keyFromBody(body) {
626
+ if (body == null || typeof body !== "object") return hashJson({ raw: String(body) });
627
+ const b = body;
628
+ const reduced = canonicalize({
629
+ model: b.model ?? null,
630
+ messages: b.messages ?? null,
631
+ temperature: b.temperature ?? null,
632
+ max_tokens: b.max_tokens ?? null,
633
+ max_completion_tokens: b.max_completion_tokens ?? null,
634
+ response_format: b.response_format ?? null
635
+ });
636
+ return hashJson(reduced);
613
637
  }
614
638
 
615
639
  // src/trace-analyst/types.ts
@@ -1891,9 +1915,10 @@ export {
1891
1915
  redactValue,
1892
1916
  OTEL_AGENT_EVAL_SCOPE,
1893
1917
  exportRunAsOtlp,
1894
- RunIntegrityError,
1895
- assertRunCaptured,
1896
- throwIfRunIncomplete,
1918
+ ReplayCacheMissError,
1919
+ ReplayCache,
1920
+ createReplayFetch,
1921
+ iterateRawCalls,
1897
1922
  DEFAULT_TRACE_ANALYST_BUDGETS,
1898
1923
  TRACE_ANALYST_TRUNCATION_MARKER_PREFIX,
1899
1924
  OtlpFileTraceStore,
@@ -1917,4 +1942,4 @@ export {
1917
1942
  defaultTraceInsightPanel,
1918
1943
  buildTraceInsightPrompt
1919
1944
  };
1920
- //# sourceMappingURL=chunk-WOK2RTWG.js.map
1945
+ //# sourceMappingURL=chunk-4W4NCYM2.js.map