@tangle-network/agent-eval 0.20.12 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. package/CHANGELOG.md +177 -0
  2. package/README.md +43 -1
  3. package/dist/{chunk-KWUAAIHR.js → chunk-4W4NCYM2.js} +182 -1
  4. package/dist/chunk-4W4NCYM2.js.map +1 -0
  5. package/dist/{chunk-PKCVBYTQ.js → chunk-5IIQKMD5.js} +38 -2
  6. package/dist/chunk-5IIQKMD5.js.map +1 -0
  7. package/dist/{chunk-HNJLMAJ2.js → chunk-6KQG5HAH.js} +2 -2
  8. package/dist/chunk-6M774GY6.js +53 -0
  9. package/dist/chunk-6M774GY6.js.map +1 -0
  10. package/dist/{chunk-MCMV7DUL.js → chunk-ARZ6BEV6.js} +2 -2
  11. package/dist/chunk-IOXMGMHQ.js +1226 -0
  12. package/dist/chunk-IOXMGMHQ.js.map +1 -0
  13. package/dist/{chunk-75MCTH7P.js → chunk-KAO3Q65R.js} +198 -3
  14. package/dist/chunk-KAO3Q65R.js.map +1 -0
  15. package/dist/chunk-QUKKGHTZ.js +121 -0
  16. package/dist/chunk-QUKKGHTZ.js.map +1 -0
  17. package/dist/chunk-SQQLHODJ.js +163 -0
  18. package/dist/chunk-SQQLHODJ.js.map +1 -0
  19. package/dist/{chunk-IKFVX537.js → chunk-UAND2LOT.js} +232 -211
  20. package/dist/chunk-UAND2LOT.js.map +1 -0
  21. package/dist/{chunk-HKYRWNHV.js → chunk-USHQBPMH.js} +283 -7
  22. package/dist/chunk-USHQBPMH.js.map +1 -0
  23. package/dist/cli.js +3 -2
  24. package/dist/cli.js.map +1 -1
  25. package/dist/{control-C8NKbF3w.d.ts → control-cxwMOAsy.d.ts} +3 -2
  26. package/dist/control.d.ts +4 -3
  27. package/dist/control.js +2 -2
  28. package/dist/emitter-B2XqDKFU.d.ts +121 -0
  29. package/dist/{feedback-trajectory-BGQ_ANCN.d.ts → feedback-trajectory-CB0A32o3.d.ts} +2 -1
  30. package/dist/index.d.ts +16 -302
  31. package/dist/index.js +70 -62
  32. package/dist/index.js.map +1 -1
  33. package/dist/integrity-K2oVlF57.d.ts +210 -0
  34. package/dist/openapi.json +1 -1
  35. package/dist/optimization-UVDNKaO6.d.ts +574 -0
  36. package/dist/optimization.d.ts +7 -144
  37. package/dist/optimization.js +9 -2
  38. package/dist/reporting-B82RSv9C.d.ts +593 -0
  39. package/dist/reporting.d.ts +5 -426
  40. package/dist/reporting.js +17 -6
  41. package/dist/{emitter-BYO2nSDA.d.ts → store-u47QaJ9G.d.ts} +1 -91
  42. package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-D4p7RlDu.d.ts} +381 -1
  43. package/dist/traces.d.ts +179 -3
  44. package/dist/traces.js +35 -4
  45. package/dist/wire/index.js +3 -2
  46. package/docs/research-report-methodology.md +170 -0
  47. package/docs/wire-protocol.md +1 -1
  48. package/package.json +11 -13
  49. package/dist/chunk-75MCTH7P.js.map +0 -1
  50. package/dist/chunk-HKYRWNHV.js.map +0 -1
  51. package/dist/chunk-IKFVX537.js.map +0 -1
  52. package/dist/chunk-KWUAAIHR.js.map +0 -1
  53. package/dist/chunk-ODFINDLQ.js +0 -413
  54. package/dist/chunk-ODFINDLQ.js.map +0 -1
  55. package/dist/chunk-PKCVBYTQ.js.map +0 -1
  56. /package/dist/{chunk-HNJLMAJ2.js.map → chunk-6KQG5HAH.js.map} +0 -0
  57. /package/dist/{chunk-MCMV7DUL.js.map → chunk-ARZ6BEV6.js.map} +0 -0
package/CHANGELOG.md CHANGED
@@ -1,5 +1,182 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.22.0 — EvalCampaign + replay + always-valid + outcome calibration
4
+
5
+ 0.21 shipped the four capture-integrity primitives as opt-in. Every consumer still had to wire them by hand, and the bug class blueprint-agent reported (forgotten wiring → silent partial-capture) reappears the moment a new consumer adopts agent-eval cold. **0.22 makes the right thing the default path** with `runEvalCampaign` — and adds three primitives that compound on top of standardized capture: replay-from-raw-events, anytime-valid sequential evaluation, and rubric predictive validity. Together, these four new primitives (`runEvalCampaign` plus the three above) turn agent-eval from a TS framework into research-grade evaluation infrastructure.
6
+
7
+ ### Added
8
+
9
+ #### `runEvalCampaign` — capture integrity by construction
10
+
11
+ Opinionated matrix runner that wires the four directives by construction. Inputs: variants, scenarios, seeds, an `LlmClientOptions`, factories for `TraceStore` and `RawProviderSink`, and a `runner(ctx)` callback. Outputs: per-cell `RunRecord[]`, `RunIntegrityReport[]`, optional `researchReport`, and a campaign fingerprint.
12
+
13
+ - **Preflight:** `assertLlmRoute` is called once before any work, with `{ requireExplicitBaseUrl: true, requireAuth: true }` defaults. Misconfigured routes never burn a run.
14
+ - **Per run:** the campaign constructs the `TraceStore`, `RawProviderSink`, and `TraceEmitter` (with `onRunComplete` hooks attached), then hands the runner an `LlmClientOptions` already pre-wired with `rawSink` + `traceContext`. The runner cannot accidentally call an LLM without capture.
15
+ - **Run-completion:** `assertRunCaptured` runs after every `endRun` with `{ llmSpansMin: 1, requireRawCoverageOfLlmSpans: true, requireOutcome: true }` defaults. Failures are routed via `onIntegrityFailure: 'throw' | 'mark_failed' | 'log'` (default `'mark_failed'`).
16
+ - **End of campaign:** if `report.comparator` is set, computes `researchReport` over the collected `RunRecord`s and embeds the campaign fingerprint + `preregistrationHash`.
17
+ - **Concurrency:** local async worker pool, default 1, configurable via `concurrency`.
18
+ - **Determinism:** the default `runId` generator is a stable hash of `(campaignId, variantId, scenarioId, seed)`, so re-running the same campaign produces the same ids; override `runId` for non-deterministic generation.
19
+
20
+ Exported from the root barrel and the `@tangle-network/agent-eval/optimization` subpath: `runEvalCampaign`, `CampaignRunner`, `CampaignRunContext`, `CampaignRunOutcome`, `CampaignVariant`, `CampaignScenario`, `EvalCampaignOptions`, `EvalCampaignResult`, `FailedRun`, `CampaignIntegrityPolicy`, `CampaignFactoryParams`.
21
+
22
+ #### Replay-from-raw-events
23
+
24
+ Every campaign run is now a re-runnable artifact. `ReplayCache.fromSink(sink)` turns a populated `RawProviderSink` into a deterministic `(canonicalised request → cached response)` map; `createReplayFetch(cache)` returns a `fetch`-shaped function that satisfies `/chat/completions` calls out of the cache and passes other URLs through.
25
+
26
+ ```ts
27
+ const cache = await ReplayCache.fromSink(yesterdayRawSink)
28
+ const replayFetch = createReplayFetch(cache, { onMiss: 'fail-closed' })
29
+ await callLlm(req, { ...llmOpts, fetch: replayFetch }) // zero LLM cost
30
+ ```
31
+
32
+ Use cases:
33
+
34
+ - Post-hoc judging — apply a new judge or scorer to last week's runs without burning a single token.
35
+ - Determinism audits — replay a campaign and verify the responses match byte-for-byte.
36
+ - Free judge calibration — run two judges on identical responses and measure agreement.
37
+
38
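+ `onMiss` is `'throw' | 'fallback' | 'fail-closed'` (default `'throw'`): `'throw'` raises `ReplayCacheMissError`, `'fallback'` forwards the request to `fallbackFetch` (the global `fetch` by default), and `'fail-closed'` returns a synthetic HTTP 599 response. The cache hashes a canonical projection (`model + messages + temperature + max_tokens|max_completion_tokens + response_format`) so insertion-order quirks don't cause spurious misses.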
39
+
40
+ Exported from root and `@tangle-network/agent-eval/traces`: `ReplayCache`, `createReplayFetch`, `iterateRawCalls`, `ReplayCacheEntry`, `ReplayCacheStats`, `ReplayFetchOptions`, `ReplayCacheMissError`.
41
+
42
+ #### Always-valid sequential evaluation
43
+
44
+ `pairedEvalueSequence(deltas, opts)` and `evaluateInterimReleaseConfidence({ deltaSeries })` ship the predictable plug-in betting martingale of Waudby-Smith & Ramdas (2024) for paired bounded outcomes, plus the empirical Bernstein confidence sequence of Howard et al. (2021) for the running mean. Both are *anytime-valid* — type-I error is bounded by α at every stopping time, no peeking penalty.
45
+
46
+ ```ts
47
+ const verdict = evaluateInterimReleaseConfidence({
48
+ deltaSeries: [{ candidateId: 'cand', deltas }],
49
+ alpha: 0.05,
50
+ rope: { low: -0.02, high: 0.02 },
51
+ })
52
+ // → { recommendation: { decision: 'promote_now' | 'continue' | 'reject_now' | 'equivalent', candidateId } }
53
+ ```
54
+
55
+ This closes the methodological hole flagged in the 0.21 methodology doc as out of scope. Consumers running rolling campaigns can now ship the moment evidence is decisive, stop early on dead-on-arrival variants, and accumulate evidence across partial runs without spending the FDR budget. Tested under the null at α=0.05 on 100 synthetic series; false-rejection rate stays below the bound.
56
+
57
+ Exported from root and `@tangle-network/agent-eval/reporting`: `pairedEvalueSequence`, `evaluateInterimReleaseConfidence`, `PairedEvalueOptions`, `PairedEvalueSequence`, `PairedEvalueStep`, `InterimReleaseConfidence`, `InterimReleaseConfidenceInput`, `SequentialDecision`.
58
+
59
+ #### Rubric predictive validity
60
+
61
+ `rubricPredictiveValidity({ runs, outcomes, outcomeMetrics })` joins canonical campaign `RunRecord`s to a `DeploymentOutcomeStore` and reports per-rubric Pearson + Spearman + bootstrap CI against each outcome metric. Verdict bucketing: `'load_bearing' | 'informative' | 'decorative'` based on `|spearman|`. **Without this loop every rubric is faith-based;** with it, you know which rubrics earn their promotion power and which are decoration.
62
+
63
+ ```ts
64
+ const validity = await rubricPredictiveValidity({
65
+ runs: lastQuarterRuns,
66
+ outcomes: shipFlagOutcomeStore,
67
+ outcomeMetrics: ['revenue_lift', 'retention_30d', 'csat'],
68
+ })
69
+ for (const r of validity.ranked) {
70
+ console.log(`${r.rubric} → ${r.bestOutcome}: ρ=${r.spearman.toFixed(2)} (${r.verdict})`)
71
+ }
72
+ ```
73
+
74
+ Builds on the existing `correlationStudy` primitive but works directly off `RunRecord` (the canonical campaign artifact) rather than `Run` from a `TraceStore`, so it composes cleanly with `runEvalCampaign`'s output. Returns a per-rubric ranking + every (rubric, outcome) pair tested + a list of rubrics that produced no usable data.
75
+
76
+ Exported from root and `@tangle-network/agent-eval/reporting`: `rubricPredictiveValidity`, `RubricOutcomePair`, `RubricRanking`, `RubricPredictiveValidityInput`, `RubricPredictiveValidityReport`. The existing `correlationStudy`, `OutcomeStore`, `InMemoryOutcomeStore`, `FileSystemOutcomeStore` continue to work unchanged.
77
+
78
+ #### `NoopRawProviderSink.list()` returns `[]`
79
+
80
+ Explicit opt-out from capture is no longer flagged by `assertRunCaptured` as `no_raw_sink`. Opt-out remains a deliberate choice; the campaign still requires the matching integrity overrides.
81
+
82
+ ### Why
83
+
84
+ Every consumer that adopted agent-eval before 0.22 wrote their own matrix runner, and every one of them re-introduced the same forgettable wiring (raw sink, route guard, integrity assertion, analyst hook). 0.21 documented the pattern; 0.22 owns it. The four new primitives compound:
85
+
86
+ - `runEvalCampaign` standardises the artifact (`RunRecord` + raw events + fingerprint).
87
+ - Replay turns every past run into free training/validation data for new judges.
88
+ - Sequential evaluation makes "ship-when-evidence-says-so" mathematically defensible.
89
+ - Predictive validity converts evals from belief-based to outcome-anchored.
90
+
91
+ `runMultiShotOptimization` remains the right primitive for trajectory-shaped GEPA optimization sweeps; `runPromptEvolution` for prompt + code evolution loops with sandbox pools; `runEvalCampaign` for the "compare N variants on M scenarios with K seeds and tell me which to ship" case that makes up the bulk of consumer evals.
92
+
93
+ ### References
94
+
95
+ - Howard, S. R., Ramdas, A., McAuliffe, J., Sekhon, J. (2021). Time-uniform, nonparametric, nonasymptotic confidence sequences. *Annals of Statistics*, 49(2), 1055–1080.
96
+ - Waudby-Smith, I., Ramdas, A. (2024). Estimating means of bounded random variables by betting. *JRSS B*, 86(1), 1–27.
97
+
98
+ ### Migration
99
+
100
+ Existing consumers do not need to change. All four primitives are additive. Recommended path: on the next eval-runner refactor, replace hand-rolled matrix loops with `runEvalCampaign`. Use `evaluateInterimReleaseConfidence` for any campaign you run on a recurring cadence. Wire `rubricPredictiveValidity` once you have ≥ 30 deployment outcomes joinable by `runId`. Replay is a free win — once campaigns are running, every eval R&D loop drops to CPU-bound.
101
+
102
+ ## 0.21.0 — capture integrity + launch-grade reporting
103
+
104
+ This release closes the layer-1 gap a downstream consumer surfaced: better
105
+ post-run statistics don't help if the underlying data wasn't captured. 0.21
106
+ adds first-class raw provider-event capture, a fail-loud route guard, a
107
+ run-completion integrity check, and run-complete hooks (with a trace-analyst
108
+ auto-execution helper) so a direct matrix run produces complete forensics
109
+ without out-of-band glue.
110
+
111
+ ### Added
112
+
113
+ - **`RawProviderSink` (capture).** First-class persistence for HTTP-level
114
+ provider request / response / error payloads alongside the structured
115
+ `LlmSpan`. `InMemoryRawProviderSink`, `FileSystemRawProviderSink` (NDJSON,
116
+ rolls at 32 MiB), and `NoopRawProviderSink` ship in core. Default redactor
117
+ strips `Authorization` / `X-Api-Key` / `Cookie` headers and credential-shaped
118
+ body fields (`apiKey`, `bearer`, `password`, `secret`, `token`); redacted
119
+ paths are recorded on `event.redactedFields` so a reviewer can see what was
120
+ stripped without exposing values. Wired into `callLlm` via
121
+ `LlmClientOptions.rawSink` — every retry attempt produces a `request` and
122
+ either a `response` or `error` event with the attempt index attached.
123
+ - **`assertLlmRoute` (route guard).** Pure function that throws
124
+ `LlmRouteAssertionError` when the configured client doesn't match the
125
+ caller's route requirements: `requireExplicitBaseUrl`, `allowedBaseUrls`,
126
+ `blockedBaseUrls`, `requireAuth`, `expectedProvider`. Designed for the
127
+ matrix-runner preflight — fail loud at the boundary instead of silently
128
+ falling back to the public/free-tier router.
129
+ - **`assertRunCaptured` (integrity check).** Read-only check on
130
+ `(store, runId, expectations)` that returns a structured
131
+ `RunIntegrityReport` with issue codes (`missing_llm_spans`,
132
+ `missing_raw_events`, `orphan_llm_span`, `no_raw_sink`, `missing_outcome`,
133
+ …). Pair with the new `requireRawCoverageOfLlmSpans` to assert every
134
+ `LlmSpan` has a matching raw `request` event. Use directly or via
135
+ `throwIfRunIncomplete` for strict mode.
136
+ - **`onRunComplete` hooks on `TraceEmitter`.** New
137
+ `TraceEmitterOptions.onRunComplete` array fires after `endRun` / `abortRun`
138
+ with full run context (run id, outcome, status, store, emitter). Errors are
139
+ swallowed and recorded as `log` events by default; opt into propagation via
140
+ `hookErrors: 'throw'`. `addRunCompleteHook` attaches hooks after construction.
141
+ - **`traceAnalystOnRunComplete` factory.** Drop-in run-complete hook that
142
+ runs `analyzeTraces` after each run and persists the result. Resolves the
143
+ "trace analyst never ran on this matrix sweep" complaint by making
144
+ auto-execution declarative.
145
+ - **`researchReport`** — executive research-report layer for coding-vertical
146
+ benchmark runs (originally landed in #34, elevated in #35). Composes
147
+ `summaryTable`, `paretoChart`, `gainHistogram`, held-out gate decisions,
148
+ and optional `failureClusterView` output into one structured artifact:
149
+ promote / hold / equivalent / reject / needs-more-data guidance with
150
+ rationale, risks, next actions, markdown, HTML, and JSON chart specs.
151
+ - Decisions are made on paired evidence — never on marginal means alone.
152
+ - ROPE (Region of Practical Equivalence) supported via the `rope` option.
153
+ - Bayesian-bootstrap-style `Pr(Δ>0)` and `Pr(Δ∈ROPE)` summaries (Rubin 1981).
154
+ - Per-candidate minimum detectable paired effect via `pairedMde`.
155
+ - SHA-256 `runFingerprint` and optional `preregistrationHash` linking a
156
+ signed `HypothesisManifest`.
157
+ - Embedded methodology + `docs/research-report-methodology.md` companion.
158
+ - **`pairedMde`** in `power-analysis`: closed-form minimum detectable paired
159
+ effect (inverse to the paired-t / sign-rank power formula).
160
+
161
+ ### Changed
162
+
163
+ - `researchReport` is async (uses Web Crypto via `hashJson` for the run
164
+ fingerprint).
165
+ - Default `researchReport.minPairs` is 20 (soft floor); hard floor of 6 is
166
+ enforced regardless via `RESEARCH_REPORT_HARD_PAIR_FLOOR`.
167
+
168
+ ### Wire-protocol consumers
169
+
170
+ No wire-protocol changes. The new capture / integrity / hook primitives are
171
+ TypeScript-only; cross-language consumers continue to use the existing RPC
172
+ surface.
173
+
174
+ ### Python client
175
+
176
+ The PyPI distribution was renamed from `tangle-agent-eval` to **`agent-eval-rpc`**, and the import path changed from `tangle_agent_eval` to `agent_eval_rpc`. The new name accurately describes the package — it is a thin RPC client over the Node runtime, not a Python re-implementation of the eval logic — and the npm scope (`@tangle-network/agent-eval`) already provides the namespacing the `tangle-` prefix was substituting for. No prior PyPI version ever shipped under the old name (Trusted Publisher misconfiguration; see issue #40), so this rename is a clean first publish rather than a migration.
177
+
178
+ Locked at `agent-eval-rpc==0.21.0` to match the npm package.
179
+
3
180
  ## 0.20.10 — hardening audit follow-up
4
181
 
5
182
  ### Fixed
package/README.md CHANGED
@@ -111,9 +111,51 @@ import { renderReleaseReport } from '@tangle-network/agent-eval/reporting'
111
111
  | Compare prompt/tool/retrieval policies over full trajectories | `runMultiShotOptimization` |
112
112
  | Gate releases with paired evidence and holdouts | `evaluateReleaseConfidence`, `HeldOutGate` |
113
113
  | Explain regressions across trace corpora | `TraceAnalyst` / `analyzeTraces` |
114
- | Report a launch decision | `renderReleaseReport`, `summaryTable`, `paretoChart`, `gainHistogram` |
114
+ | Report a launch decision | `renderReleaseReport`, `researchReport`, `summaryTable`, `paretoChart`, `gainHistogram` |
115
+ | Capture every provider HTTP request / response for forensics | `RawProviderSink`, `LlmClientOptions.rawSink` |
116
+ | Fail loud if an eval would silently use the wrong route | `assertLlmRoute` |
117
+ | Assert at run-end that the artifact is complete | `assertRunCaptured`, `throwIfRunIncomplete` |
118
+ | Auto-execute the trace analyst on every run | `traceAnalystOnRunComplete` + `TraceEmitterOptions.onRunComplete` |
119
+ | Run a matrix of variants × scenarios × seeds with capture integrity by construction | `runEvalCampaign` |
120
+ | Re-judge / determinism-audit a past campaign for free | `ReplayCache`, `createReplayFetch` |
121
+ | Ship-when-decisive with anytime-valid α across rolling looks | `pairedEvalueSequence`, `evaluateInterimReleaseConfidence` |
122
+ | Tell load-bearing rubrics from decorative ones using deployment outcomes | `rubricPredictiveValidity` |
115
123
  | Model missing context separately from bad reasoning | `KnowledgeRequirement`, `KnowledgeBundle` |
116
124
 
125
+ ### Capture integrity (0.21+)
126
+
127
+ Launch-grade benchmark runs need four things that are easy to forget in glue
128
+ code: (1) raw HTTP capture alongside the structured spans so a reviewer can
129
+ verify which route answered, (2) a preflight assertion that the configured
130
+ client points at the intended provider, (3) a run-end assertion that the
131
+ expected events were actually written, and (4) auto-execution of the trace
132
+ analyst as part of the run lifecycle. The wiring fits in a few lines:
133
+
134
+ ```ts
135
+ import {
136
+ TraceEmitter, FileSystemRawProviderSink, callLlm, assertLlmRoute,
137
+ assertRunCaptured, throwIfRunIncomplete,
138
+ } from '@tangle-network/agent-eval'
139
+ import { traceAnalystOnRunComplete } from '@tangle-network/agent-eval/traces'
140
+
141
+ const sink = new FileSystemRawProviderSink({ dir: `${workDir}/raw-events` })
142
+ assertLlmRoute(llmOpts, { requireExplicitBaseUrl: true, allowedBaseUrls, requireAuth: true })
143
+
144
+ const emitter = new TraceEmitter(store, {
145
+ onRunComplete: [traceAnalystOnRunComplete({ analyze: analystOpts, save })],
146
+ })
147
+ await emitter.startRun(/* ... */)
148
+ // LLM calls flow through callLlm with `{ rawSink: sink, traceContext: { runId, spanId } }`.
149
+ await emitter.endRun({ pass, score })
150
+
151
+ throwIfRunIncomplete(await assertRunCaptured(store, emitter.runId, {
152
+ llmSpansMin: 1, rawSink: sink, requireRawCoverageOfLlmSpans: true, requireOutcome: true,
153
+ }))
154
+ ```
155
+
156
+ Directives, rationale, and shipped-bug context are in
157
+ [`SKILL.md` § Capture integrity](./.claude/skills/agent-eval/SKILL.md#capture-integrity-required-for-launch-grade-adoption).
158
+
117
159
  ## Examples
118
160
 
119
161
  Runnable examples live in
@@ -1,3 +1,8 @@
1
+ import {
2
+ canonicalize,
3
+ hashJson
4
+ } from "./chunk-6M774GY6.js";
5
+
1
6
  // src/trace/store.ts
2
7
  var InMemoryTraceStore = class {
3
8
  runs = /* @__PURE__ */ new Map();
@@ -497,6 +502,140 @@ function runToTraceId(run) {
497
502
  return cleaned.slice(0, 32).padEnd(32, "0");
498
503
  }
499
504
 
505
+ // src/replay.ts
506
+ var ReplayCacheMissError = class extends Error {
507
+ constructor(url, requestKey2, message) {
508
+ super(message ?? `replay cache miss for ${url} (key=${requestKey2})`);
509
+ this.url = url;
510
+ this.requestKey = requestKey2;
511
+ this.name = "ReplayCacheMissError";
512
+ }
513
+ url;
514
+ requestKey;
515
+ };
516
+ var ReplayCache = class _ReplayCache {
517
+ byKey = /* @__PURE__ */ new Map();
518
+ orphans = 0;
519
+ byProvider = {};
520
+ byModel = {};
521
+ /**
522
+ * Build a cache from a sink's events. The sink must implement `list()`.
523
+ * Filter by `runId` / `spanId` to scope to a specific replay.
524
+ */
525
+ static async fromSink(sink, filter = {}) {
526
+ if (!sink.list) {
527
+ throw new Error("ReplayCache.fromSink: sink must implement list() to be replayable.");
528
+ }
529
+ const events = await sink.list(filter);
530
+ return _ReplayCache.fromEvents(events);
531
+ }
532
+ /** Build a cache from an in-memory event list. */
533
+ static async fromEvents(events) {
534
+ const cache = new _ReplayCache();
535
+ const groups = /* @__PURE__ */ new Map();
536
+ for (const e of events) {
537
+ const k = `${e.runId ?? ""}::${e.spanId ?? ""}::${e.attemptIndex}`;
538
+ const g = groups.get(k) ?? {};
539
+ if (e.direction === "request") g.req = e;
540
+ else g.res = e;
541
+ groups.set(k, g);
542
+ }
543
+ for (const g of groups.values()) {
544
+ if (!g.req) continue;
545
+ if (!g.res) {
546
+ cache.orphans += 1;
547
+ continue;
548
+ }
549
+ const key = await requestKey(g.req);
550
+ cache.byKey.set(key, { request: g.req, response: g.res });
551
+ cache.byProvider[g.req.provider] = (cache.byProvider[g.req.provider] ?? 0) + 1;
552
+ cache.byModel[g.req.model] = (cache.byModel[g.req.model] ?? 0) + 1;
553
+ }
554
+ return cache;
555
+ }
556
+ /** Number of cacheable (request, response) pairs in the cache. */
557
+ size() {
558
+ return this.byKey.size;
559
+ }
560
+ stats() {
561
+ return {
562
+ total: this.byKey.size,
563
+ byProvider: { ...this.byProvider },
564
+ byModel: { ...this.byModel },
565
+ orphanRequests: this.orphans
566
+ };
567
+ }
568
+ /**
569
+ * Look up a cached response by hashing the (model, messages, temperature,
570
+ * max_tokens, max_completion_tokens, response_format) shape. Returns `undefined` on miss; the
571
+ * caller decides whether to throw, fall back to the network, or skip.
572
+ */
573
+ async lookup(requestBody) {
574
+ const key = await keyFromBody(requestBody);
575
+ return this.byKey.get(key);
576
+ }
577
+ };
578
+ function createReplayFetch(cache, opts = {}) {
579
+ const onMiss = opts.onMiss ?? "throw";
580
+ const fallback = opts.fallbackFetch ?? globalThis.fetch?.bind(globalThis);
581
+ return (async (input, init) => {
582
+ const url = typeof input === "string" ? input : input instanceof URL ? input.toString() : input.url;
583
+ if (!/\/chat\/completions(?:[?#].*)?$/.test(url)) {
584
+ if (!fallback) throw new Error(`replay fetch: non-completions URL ${url} but no fallbackFetch configured`);
585
+ return fallback(input, init);
586
+ }
587
+ let bodyParsed;
588
+ if (init?.body && typeof init.body === "string") {
589
+ try {
590
+ bodyParsed = JSON.parse(init.body);
591
+ } catch {
592
+ }
593
+ }
594
+ const hit = bodyParsed === void 0 ? void 0 : await cache.lookup(bodyParsed);
595
+ if (hit) {
596
+ opts.onHit?.({ url, provider: hit.request.provider, model: hit.request.model });
597
+ const status = hit.response.statusCode ?? 200;
598
+ const headers = new Headers(Object.entries(hit.response.responseHeaders ?? { "Content-Type": "application/json" }));
599
+ const bodyText = typeof hit.response.responseBody === "string" ? hit.response.responseBody : JSON.stringify(hit.response.responseBody ?? {});
600
+ return new Response(bodyText, { status, headers });
601
+ }
602
+ opts.onMissNotify?.({ url, requestBody: bodyParsed });
603
+ if (onMiss === "throw") {
604
+ const key = bodyParsed === void 0 ? "<unparseable>" : await keyFromBody(bodyParsed);
605
+ throw new ReplayCacheMissError(url, key);
606
+ }
607
+ if (onMiss === "fail-closed") {
608
+ return new Response(JSON.stringify({ error: "replay_cache_miss" }), { status: 599 });
609
+ }
610
+ if (!fallback) throw new Error("replay fetch: onMiss=fallback but no fallbackFetch configured");
611
+ return fallback(input, init);
612
+ });
613
+ }
614
+ async function* iterateRawCalls(sink, filter = {}) {
615
+ if (!sink.list) {
616
+ throw new Error("iterateRawCalls: sink must implement list().");
617
+ }
618
+ const events = await sink.list(filter);
619
+ const cache = await ReplayCache.fromEvents(events);
620
+ for (const entry of cache["byKey"].values()) yield entry;
621
+ }
622
+ async function requestKey(event) {
623
+ return keyFromBody(event.requestBody);
624
+ }
625
+ async function keyFromBody(body) {
626
+ if (body == null || typeof body !== "object") return hashJson({ raw: String(body) });
627
+ const b = body;
628
+ const reduced = canonicalize({
629
+ model: b.model ?? null,
630
+ messages: b.messages ?? null,
631
+ temperature: b.temperature ?? null,
632
+ max_tokens: b.max_tokens ?? null,
633
+ max_completion_tokens: b.max_completion_tokens ?? null,
634
+ response_format: b.response_format ?? null
635
+ });
636
+ return hashJson(reduced);
637
+ }
638
+
500
639
  // src/trace-analyst/types.ts
501
640
  var DEFAULT_TRACE_ANALYST_BUDGETS = {
502
641
  perCallByteCeiling: 15e4,
@@ -1468,6 +1607,43 @@ function normalizeRecordArray(value) {
1468
1607
  return value.map((item) => item && typeof item === "object" ? { ...item } : { value: item });
1469
1608
  }
1470
1609
 
1610
+ // src/trace-analyst/hook.ts
1611
+ var DEFAULT_QUESTION = "Summarise what happened in this run. Surface any failure modes, surprising findings, or evidence that the run's verdict is wrong.";
1612
+ function traceAnalystOnRunComplete(opts) {
1613
+ return async (ctx) => {
1614
+ if (opts.shouldRun && !opts.shouldRun(ctx)) return;
1615
+ const source = opts.analyze.source;
1616
+ if (source === void 0) {
1617
+ await ctx.store.appendEvent({
1618
+ eventId: `analyst-skip-${ctx.runId}`,
1619
+ runId: ctx.runId,
1620
+ kind: "log",
1621
+ timestamp: Date.now(),
1622
+ payload: { source: "trace_analyst_hook", reason: "no source configured" }
1623
+ });
1624
+ return;
1625
+ }
1626
+ const result = await analyzeTraces(
1627
+ { question: opts.question ?? DEFAULT_QUESTION },
1628
+ { ...opts.analyze, source }
1629
+ );
1630
+ if (opts.save) await opts.save(result, ctx);
1631
+ if (opts.gateOn && !opts.gateOn(result, ctx)) {
1632
+ await ctx.store.appendEvent({
1633
+ eventId: `analyst-gate-${ctx.runId}`,
1634
+ runId: ctx.runId,
1635
+ kind: "log",
1636
+ timestamp: Date.now(),
1637
+ payload: {
1638
+ source: "trace_analyst_hook",
1639
+ reason: "analyst_gate_failed",
1640
+ findings: result.findings
1641
+ }
1642
+ });
1643
+ }
1644
+ };
1645
+ }
1646
+
1471
1647
  // src/trace-analyst/insights.ts
1472
1648
  var DOMAIN_STOP_WORDS = /* @__PURE__ */ new Set([
1473
1649
  "and",
@@ -1739,6 +1915,10 @@ export {
1739
1915
  redactValue,
1740
1916
  OTEL_AGENT_EVAL_SCOPE,
1741
1917
  exportRunAsOtlp,
1918
+ ReplayCacheMissError,
1919
+ ReplayCache,
1920
+ createReplayFetch,
1921
+ iterateRawCalls,
1742
1922
  DEFAULT_TRACE_ANALYST_BUDGETS,
1743
1923
  TRACE_ANALYST_TRUNCATION_MARKER_PREFIX,
1744
1924
  OtlpFileTraceStore,
@@ -1751,6 +1931,7 @@ export {
1751
1931
  buildTraceAnalystTools,
1752
1932
  traceAnalystFunctionGroup,
1753
1933
  analyzeTraces,
1934
+ traceAnalystOnRunComplete,
1754
1935
  tokenizeDomainWords,
1755
1936
  inferDomainKeywords,
1756
1937
  domainEvidencePattern,
@@ -1761,4 +1942,4 @@ export {
1761
1942
  defaultTraceInsightPanel,
1762
1943
  buildTraceInsightPrompt
1763
1944
  };
1764
- //# sourceMappingURL=chunk-KWUAAIHR.js.map
1945
+ //# sourceMappingURL=chunk-4W4NCYM2.js.map