@tangle-network/agent-eval 0.21.0 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +102 -1
- package/README.md +4 -0
- package/dist/{chunk-WOK2RTWG.js → chunk-4W4NCYM2.js} +134 -109
- package/dist/chunk-4W4NCYM2.js.map +1 -0
- package/dist/{chunk-WOPGKVN4.js → chunk-6KQG5HAH.js} +2 -2
- package/dist/chunk-6M774GY6.js +53 -0
- package/dist/chunk-6M774GY6.js.map +1 -0
- package/dist/{chunk-3IX6QTB7.js → chunk-IOXMGMHQ.js} +418 -541
- package/dist/chunk-IOXMGMHQ.js.map +1 -0
- package/dist/{chunk-3GN6U53I.js → chunk-KAO3Q65R.js} +2 -2
- package/dist/chunk-QUKKGHTZ.js +121 -0
- package/dist/chunk-QUKKGHTZ.js.map +1 -0
- package/dist/{chunk-SNUHRBDL.js → chunk-SQQLHODJ.js} +10 -1
- package/dist/{chunk-SNUHRBDL.js.map → chunk-SQQLHODJ.js.map} +1 -1
- package/dist/chunk-UAND2LOT.js +738 -0
- package/dist/chunk-UAND2LOT.js.map +1 -0
- package/dist/{chunk-HRZELXCR.js → chunk-USHQBPMH.js} +283 -7
- package/dist/chunk-USHQBPMH.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/index.d.ts +10 -284
- package/dist/index.js +39 -19
- package/dist/index.js.map +1 -1
- package/dist/integrity-K2oVlF57.d.ts +210 -0
- package/dist/openapi.json +1 -1
- package/dist/optimization-UVDNKaO6.d.ts +574 -0
- package/dist/optimization.d.ts +6 -144
- package/dist/optimization.js +9 -2
- package/dist/reporting-B82RSv9C.d.ts +593 -0
- package/dist/reporting.d.ts +2 -2
- package/dist/reporting.js +15 -8
- package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-D4p7RlDu.d.ts} +381 -1
- package/dist/traces.d.ts +101 -181
- package/dist/traces.js +16 -5
- package/dist/wire/index.js +3 -3
- package/docs/research-report-methodology.md +19 -4
- package/docs/wire-protocol.md +1 -1
- package/package.json +2 -2
- package/dist/chunk-3IX6QTB7.js.map +0 -1
- package/dist/chunk-HRZELXCR.js.map +0 -1
- package/dist/chunk-KRR4VMH7.js +0 -423
- package/dist/chunk-KRR4VMH7.js.map +0 -1
- package/dist/chunk-WOK2RTWG.js.map +0 -1
- package/dist/reporting-Da2ihlcM.d.ts +0 -672
- /package/dist/{chunk-WOPGKVN4.js.map → chunk-6KQG5HAH.js.map} +0 -0
- /package/dist/{chunk-3GN6U53I.js.map → chunk-KAO3Q65R.js.map} +0 -0
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,104 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.22.0 — EvalCampaign + replay + always-valid + outcome calibration
|
|
4
|
+
|
|
5
|
+
0.21 shipped the four capture-integrity primitives as opt-in. Every consumer still had to wire them by hand, and the bug class that blueprint-agent reported (forgotten wiring → silent partial capture) reappears the moment a new consumer adopts agent-eval cold. **0.22 makes the right thing the default path** with `runEvalCampaign` — and adds three primitives that compound on top of standardized capture: replay-from-raw-events, anytime-valid sequential evaluation, and rubric predictive validity. Together, these four new primitives turn agent-eval from a TS framework into research-grade evaluation infrastructure.
|
|
6
|
+
|
|
7
|
+
### Added
|
|
8
|
+
|
|
9
|
+
#### `runEvalCampaign` — capture integrity by construction
|
|
10
|
+
|
|
11
|
+
Opinionated matrix runner that wires the four directives by construction. Inputs: variants, scenarios, seeds, an `LlmClientOptions`, factories for `TraceStore` and `RawProviderSink`, and a `runner(ctx)` callback. Outputs: per-cell `RunRecord[]`, `RunIntegrityReport[]`, optional `researchReport`, and a campaign fingerprint.
|
|
12
|
+
|
|
13
|
+
- **Preflight:** `assertLlmRoute` is called once before any work, with `{ requireExplicitBaseUrl: true, requireAuth: true }` defaults. Misconfigured routes never burn a run.
|
|
14
|
+
- **Per run:** the campaign constructs the `TraceStore`, `RawProviderSink`, and `TraceEmitter` (with `onRunComplete` hooks attached), then hands the runner an `LlmClientOptions` already pre-wired with `rawSink` + `traceContext`. The runner cannot accidentally call an LLM without capture.
|
|
15
|
+
- **Run-completion:** `assertRunCaptured` runs after every `endRun` with `{ llmSpansMin: 1, requireRawCoverageOfLlmSpans: true, requireOutcome: true }` defaults. Failures are routed via `onIntegrityFailure: 'throw' | 'mark_failed' | 'log'` (default `'mark_failed'`).
|
|
16
|
+
- **End of campaign:** if `report.comparator` is set, computes `researchReport` over the collected `RunRecord`s and embeds the campaign fingerprint + `preregistrationHash`.
|
|
17
|
+
- **Concurrency:** local async worker pool, default 1, configurable via `concurrency`.
|
|
18
|
+
- **Determinism:** the default `runId` generator is a stable hash of `(campaignId, variantId, scenarioId, seed)`, so re-running the same campaign produces the same ids; override `runId` for non-deterministic generation.
|
|
19
|
+
|
|
20
|
+
Exported from the root barrel and the `@tangle-network/agent-eval/optimization` subpath: `runEvalCampaign`, `CampaignRunner`, `CampaignRunContext`, `CampaignRunOutcome`, `CampaignVariant`, `CampaignScenario`, `EvalCampaignOptions`, `EvalCampaignResult`, `FailedRun`, `CampaignIntegrityPolicy`, `CampaignFactoryParams`.
|
|
21
|
+
|
|
22
|
+
#### Replay-from-raw-events
|
|
23
|
+
|
|
24
|
+
Every campaign run is now a re-runnable artifact. `ReplayCache.fromSink(sink)` turns a populated `RawProviderSink` into a deterministic `(canonicalised request → cached response)` map; `createReplayFetch(cache)` returns a `fetch`-shaped function that satisfies `/chat/completions` calls out of the cache and passes other URLs through.
|
|
25
|
+
|
|
26
|
+
```ts
|
|
27
|
+
const cache = await ReplayCache.fromSink(yesterdayRawSink)
|
|
28
|
+
const replayFetch = createReplayFetch(cache, { onMiss: 'fail-closed' })
|
|
29
|
+
await callLlm(req, { ...llmOpts, fetch: replayFetch }) // zero LLM cost
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
Use cases:
|
|
33
|
+
|
|
34
|
+
- Post-hoc judging — apply a new judge or scorer to last week's runs without burning a single token.
|
|
35
|
+
- Determinism audits — replay a campaign and verify the responses match byte-for-byte.
|
|
36
|
+
- Free judge calibration — run two judges on identical responses and measure agreement.
|
|
37
|
+
|
|
38
|
+
`onMiss` is `'throw' | 'fallback' | 'fail-closed'` (default `'throw'`). The cache hashes a canonical projection (`model + messages + temperature + max_tokens|max_completion_tokens + response_format`) so insertion-order quirks don't cause spurious misses.
|
|
39
|
+
|
|
40
|
+
Exported from root and `@tangle-network/agent-eval/traces`: `ReplayCache`, `createReplayFetch`, `iterateRawCalls`, `ReplayCacheEntry`, `ReplayCacheStats`, `ReplayFetchOptions`, `ReplayCacheMissError`.
|
|
41
|
+
|
|
42
|
+
#### Always-valid sequential evaluation
|
|
43
|
+
|
|
44
|
+
`pairedEvalueSequence(deltas, opts)` and `evaluateInterimReleaseConfidence({ deltaSeries })` ship the predictable plug-in betting martingale of Waudby-Smith & Ramdas (2024) for paired bounded outcomes, plus the empirical Bernstein confidence sequence of Howard et al. (2021) for the running mean. Both are *anytime-valid* — type-I error is bounded by α at every stopping time, no peeking penalty.
|
|
45
|
+
|
|
46
|
+
```ts
|
|
47
|
+
const verdict = evaluateInterimReleaseConfidence({
|
|
48
|
+
deltaSeries: [{ candidateId: 'cand', deltas }],
|
|
49
|
+
alpha: 0.05,
|
|
50
|
+
rope: { low: -0.02, high: 0.02 },
|
|
51
|
+
})
|
|
52
|
+
// → { recommendation: { decision: 'promote_now' | 'continue' | 'reject_now' | 'equivalent', candidateId } }
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
This closes the methodological hole flagged in the 0.21 methodology doc as out-of-scope. Consumers running rolling campaigns can now ship the moment evidence is decisive, stop early on dead-on-arrival variants, and accumulate evidence across partial runs without spending the FDR budget. Tested under the null at α=0.05 on 100 synthetic series; false-rejection rate stays below the bound.
|
|
56
|
+
|
|
57
|
+
Exported from root and `@tangle-network/agent-eval/reporting`: `pairedEvalueSequence`, `evaluateInterimReleaseConfidence`, `PairedEvalueOptions`, `PairedEvalueSequence`, `PairedEvalueStep`, `InterimReleaseConfidence`, `InterimReleaseConfidenceInput`, `SequentialDecision`.
|
|
58
|
+
|
|
59
|
+
#### Rubric predictive validity
|
|
60
|
+
|
|
61
|
+
`rubricPredictiveValidity({ runs, outcomes, outcomeMetrics })` joins canonical campaign `RunRecord`s to a `DeploymentOutcomeStore` and reports per-rubric Pearson + Spearman + bootstrap CI against each outcome metric. Verdict bucketing: `'load_bearing' | 'informative' | 'decorative'` based on `|spearman|`. **Without this loop every rubric is faith-based;** with it, you know which rubrics earn their promotion power and which are decoration.
|
|
62
|
+
|
|
63
|
+
```ts
|
|
64
|
+
const validity = await rubricPredictiveValidity({
|
|
65
|
+
runs: lastQuarterRuns,
|
|
66
|
+
outcomes: shipFlagOutcomeStore,
|
|
67
|
+
outcomeMetrics: ['revenue_lift', 'retention_30d', 'csat'],
|
|
68
|
+
})
|
|
69
|
+
for (const r of validity.ranked) {
|
|
70
|
+
console.log(`${r.rubric} → ${r.bestOutcome}: ρ=${r.spearman.toFixed(2)} (${r.verdict})`)
|
|
71
|
+
}
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Builds on the existing `correlationStudy` primitive but works directly off `RunRecord` (the canonical campaign artifact) rather than `Run` from a `TraceStore`, so it composes cleanly with `runEvalCampaign`'s output. Returns a per-rubric ranking + every (rubric, outcome) pair tested + a list of rubrics that produced no usable data.
|
|
75
|
+
|
|
76
|
+
Exported from root and `@tangle-network/agent-eval/reporting`: `rubricPredictiveValidity`, `RubricOutcomePair`, `RubricRanking`, `RubricPredictiveValidityInput`, `RubricPredictiveValidityReport`. The existing `correlationStudy`, `OutcomeStore`, `InMemoryOutcomeStore`, `FileSystemOutcomeStore` continue to work unchanged.
|
|
77
|
+
|
|
78
|
+
#### `NoopRawProviderSink.list()` returns `[]`
|
|
79
|
+
|
|
80
|
+
Explicit opt-out from capture is no longer flagged by `assertRunCaptured` as `no_raw_sink`. Opt-out remains a deliberate choice; the campaign still requires the matching integrity overrides.
|
|
81
|
+
|
|
82
|
+
### Why
|
|
83
|
+
|
|
84
|
+
Every consumer that adopted agent-eval before 0.22 wrote their own matrix runner, and every one of them re-introduced the same forgettable wiring (raw sink, route guard, integrity assertion, analyst hook). 0.21 documented the pattern; 0.22 owns it. The four new primitives compound:
|
|
85
|
+
|
|
86
|
+
- `runEvalCampaign` standardises the artifact (`RunRecord` + raw events + fingerprint).
|
|
87
|
+
- Replay turns every past run into free training/validation data for new judges.
|
|
88
|
+
- Sequential evaluation makes "ship-when-evidence-says-so" mathematically defensible.
|
|
89
|
+
- Predictive validity converts evals from belief-based to outcome-anchored.
|
|
90
|
+
|
|
91
|
+
`runMultiShotOptimization` remains the right primitive for trajectory-shaped GEPA optimization sweeps; `runPromptEvolution` for prompt + code evolution loops with sandbox pools; `runEvalCampaign` for the "compare N variants on M scenarios with K seeds and tell me which to ship" case that makes up the bulk of consumer evals.
|
|
92
|
+
|
|
93
|
+
### References
|
|
94
|
+
|
|
95
|
+
- Howard, S. R., Ramdas, A., McAuliffe, J., Sekhon, J. (2021). Time-uniform, nonparametric, nonasymptotic confidence sequences. *Annals of Statistics*, 49(2), 1055–1080.
|
|
96
|
+
- Waudby-Smith, I., Ramdas, A. (2024). Estimating means of bounded random variables by betting. *JRSS B*, 86(1), 1–27.
|
|
97
|
+
|
|
98
|
+
### Migration
|
|
99
|
+
|
|
100
|
+
Existing consumers do not need to change. All four primitives are additive. Recommended path: on the next eval-runner refactor, replace hand-rolled matrix loops with `runEvalCampaign`. Use `evaluateInterimReleaseConfidence` for any campaign you run on a recurring cadence. Wire `rubricPredictiveValidity` once you have ≥ 30 deployment outcomes joinable by `runId`. Replay is a free win — once campaigns are running, every eval R&D loop drops to CPU-bound.
|
|
101
|
+
|
|
3
102
|
## 0.21.0 — capture integrity + launch-grade reporting
|
|
4
103
|
|
|
5
104
|
This release closes the layer-1 gap a downstream consumer surfaced: better
|
|
@@ -74,7 +173,9 @@ surface.
|
|
|
74
173
|
|
|
75
174
|
### Python client
|
|
76
175
|
|
|
77
|
-
|
|
176
|
+
The PyPI distribution was renamed from `tangle-agent-eval` to **`agent-eval-rpc`**, and the import path from `tangle_agent_eval` to `agent_eval_rpc`. The new name accurately describes the package — it is a thin RPC client over the Node runtime, not a Python re-implementation of the eval logic — and the npm scope (`@tangle-network/agent-eval`) already provides the namespacing the `tangle-` prefix was substituting for. No prior PyPI version ever shipped under the old name (Trusted Publisher misconfiguration; see issue #40), so this rename is a clean first publish rather than a migration.
|
|
177
|
+
|
|
178
|
+
Locked at `agent-eval-rpc==0.21.0` to match the npm package.
|
|
78
179
|
|
|
79
180
|
## 0.20.10 — hardening audit follow-up
|
|
80
181
|
|
package/README.md
CHANGED
|
@@ -116,6 +116,10 @@ import { renderReleaseReport } from '@tangle-network/agent-eval/reporting'
|
|
|
116
116
|
| Fail loud if an eval would silently use the wrong route | `assertLlmRoute` |
|
|
117
117
|
| Assert at run-end that the artifact is complete | `assertRunCaptured`, `throwIfRunIncomplete` |
|
|
118
118
|
| Auto-execute the trace analyst on every run | `traceAnalystOnRunComplete` + `TraceEmitterOptions.onRunComplete` |
|
|
119
|
+
| Run a matrix of variants × scenarios × seeds with capture integrity by construction | `runEvalCampaign` |
|
|
120
|
+
| Re-judge / determinism-audit a past campaign for free | `ReplayCache`, `createReplayFetch` |
|
|
121
|
+
| Ship-when-decisive with anytime-valid α across rolling looks | `pairedEvalueSequence`, `evaluateInterimReleaseConfidence` |
|
|
122
|
+
| Tell load-bearing rubrics from decorative ones using deployment outcomes | `rubricPredictiveValidity` |
|
|
119
123
|
| Model missing context separately from bad reasoning | `KnowledgeRequirement`, `KnowledgeBundle` |
|
|
120
124
|
|
|
121
125
|
### Capture integrity (0.21+)
|
|
@@ -1,3 +1,8 @@
|
|
|
1
|
+
import {
|
|
2
|
+
canonicalize,
|
|
3
|
+
hashJson
|
|
4
|
+
} from "./chunk-6M774GY6.js";
|
|
5
|
+
|
|
1
6
|
// src/trace/store.ts
|
|
2
7
|
var InMemoryTraceStore = class {
|
|
3
8
|
runs = /* @__PURE__ */ new Map();
|
|
@@ -497,119 +502,138 @@ function runToTraceId(run) {
|
|
|
497
502
|
return cleaned.slice(0, 32).padEnd(32, "0");
|
|
498
503
|
}
|
|
499
504
|
|
|
500
|
-
// src/
|
|
501
|
-
var
|
|
502
|
-
constructor(
|
|
503
|
-
super(
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
this.
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
505
|
+
// src/replay.ts
|
|
506
|
+
var ReplayCacheMissError = class extends Error {
|
|
507
|
+
constructor(url, requestKey2, message) {
|
|
508
|
+
super(message ?? `replay cache miss for ${url} (key=${requestKey2})`);
|
|
509
|
+
this.url = url;
|
|
510
|
+
this.requestKey = requestKey2;
|
|
511
|
+
this.name = "ReplayCacheMissError";
|
|
512
|
+
}
|
|
513
|
+
url;
|
|
514
|
+
requestKey;
|
|
510
515
|
};
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
516
|
+
var ReplayCache = class _ReplayCache {
|
|
517
|
+
byKey = /* @__PURE__ */ new Map();
|
|
518
|
+
orphans = 0;
|
|
519
|
+
byProvider = {};
|
|
520
|
+
byModel = {};
|
|
521
|
+
/**
|
|
522
|
+
* Build a cache from a sink's events. The sink must implement `list()`.
|
|
523
|
+
* Filter by `runId` / `spanId` to scope to a specific replay.
|
|
524
|
+
*/
|
|
525
|
+
static async fromSink(sink, filter = {}) {
|
|
526
|
+
if (!sink.list) {
|
|
527
|
+
throw new Error("ReplayCache.fromSink: sink must implement list() to be replayable.");
|
|
528
|
+
}
|
|
529
|
+
const events = await sink.list(filter);
|
|
530
|
+
return _ReplayCache.fromEvents(events);
|
|
531
|
+
}
|
|
532
|
+
/** Build a cache from an in-memory event list. */
|
|
533
|
+
static async fromEvents(events) {
|
|
534
|
+
const cache = new _ReplayCache();
|
|
535
|
+
const groups = /* @__PURE__ */ new Map();
|
|
536
|
+
for (const e of events) {
|
|
537
|
+
const k = `${e.runId ?? ""}::${e.spanId ?? ""}::${e.attemptIndex}`;
|
|
538
|
+
const g = groups.get(k) ?? {};
|
|
539
|
+
if (e.direction === "request") g.req = e;
|
|
540
|
+
else g.res = e;
|
|
541
|
+
groups.set(k, g);
|
|
542
|
+
}
|
|
543
|
+
for (const g of groups.values()) {
|
|
544
|
+
if (!g.req) continue;
|
|
545
|
+
if (!g.res) {
|
|
546
|
+
cache.orphans += 1;
|
|
547
|
+
continue;
|
|
548
|
+
}
|
|
549
|
+
const key = await requestKey(g.req);
|
|
550
|
+
cache.byKey.set(key, { request: g.req, response: g.res });
|
|
551
|
+
cache.byProvider[g.req.provider] = (cache.byProvider[g.req.provider] ?? 0) + 1;
|
|
552
|
+
cache.byModel[g.req.model] = (cache.byModel[g.req.model] ?? 0) + 1;
|
|
553
|
+
}
|
|
554
|
+
return cache;
|
|
525
555
|
}
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
const toolSpans2 = spans.filter((s) => s.kind === "tool");
|
|
530
|
-
const llmMin = expectations.llmSpansMin ?? 0;
|
|
531
|
-
const judgeMin = expectations.judgeSpansMin ?? 0;
|
|
532
|
-
const toolMin = expectations.toolSpansMin ?? 0;
|
|
533
|
-
if (llmSpans2.length < llmMin) {
|
|
534
|
-
issues.push({
|
|
535
|
-
code: "missing_llm_spans",
|
|
536
|
-
message: `Expected \u2265 ${llmMin} LLM spans, found ${llmSpans2.length}.`,
|
|
537
|
-
detail: { expected: llmMin, found: llmSpans2.length }
|
|
538
|
-
});
|
|
556
|
+
/** Number of cacheable (request, response) pairs in the cache. */
|
|
557
|
+
size() {
|
|
558
|
+
return this.byKey.size;
|
|
539
559
|
}
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
560
|
+
stats() {
|
|
561
|
+
return {
|
|
562
|
+
total: this.byKey.size,
|
|
563
|
+
byProvider: { ...this.byProvider },
|
|
564
|
+
byModel: { ...this.byModel },
|
|
565
|
+
orphanRequests: this.orphans
|
|
566
|
+
};
|
|
546
567
|
}
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
568
|
+
/**
|
|
569
|
+
* Look up a cached response by hashing the (model, messages, temperature,
|
|
570
|
+
* max_tokens, max_completion_tokens, response_format) shape. Returns `undefined` on miss; the
|
|
571
|
+
* caller decides whether to throw, fall back to the network, or skip.
|
|
572
|
+
*/
|
|
573
|
+
async lookup(requestBody) {
|
|
574
|
+
const key = await keyFromBody(requestBody);
|
|
575
|
+
return this.byKey.get(key);
|
|
553
576
|
}
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
});
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
message: `Expected \u2265 ${rawMin} raw provider events, found ${rawEventCount}.`,
|
|
570
|
-
detail: { expected: rawMin, found: rawEventCount }
|
|
571
|
-
});
|
|
572
|
-
}
|
|
573
|
-
if (expectations.requireRawCoverageOfLlmSpans) {
|
|
574
|
-
const requestEventsBySpan = new Set(
|
|
575
|
-
events.filter((e) => e.direction === "request" && e.spanId).map((e) => e.spanId)
|
|
576
|
-
);
|
|
577
|
-
const orphaned = llmSpans2.filter((s) => !requestEventsBySpan.has(s.spanId));
|
|
578
|
-
coverage = { covered: llmSpans2.length - orphaned.length, total: llmSpans2.length };
|
|
579
|
-
if (orphaned.length > 0) {
|
|
580
|
-
issues.push({
|
|
581
|
-
code: "orphan_llm_span",
|
|
582
|
-
message: `${orphaned.length} LLM span(s) have no matching raw provider request event.`,
|
|
583
|
-
detail: { orphanedSpanIds: orphaned.map((s) => s.spanId) }
|
|
584
|
-
});
|
|
585
|
-
}
|
|
577
|
+
};
|
|
578
|
+
function createReplayFetch(cache, opts = {}) {
|
|
579
|
+
const onMiss = opts.onMiss ?? "throw";
|
|
580
|
+
const fallback = opts.fallbackFetch ?? globalThis.fetch?.bind(globalThis);
|
|
581
|
+
return (async (input, init) => {
|
|
582
|
+
const url = typeof input === "string" ? input : input instanceof URL ? input.toString() : input.url;
|
|
583
|
+
if (!/\/chat\/completions(?:[?#].*)?$/.test(url)) {
|
|
584
|
+
if (!fallback) throw new Error(`replay fetch: non-completions URL ${url} but no fallbackFetch configured`);
|
|
585
|
+
return fallback(input, init);
|
|
586
|
+
}
|
|
587
|
+
let bodyParsed;
|
|
588
|
+
if (init?.body && typeof init.body === "string") {
|
|
589
|
+
try {
|
|
590
|
+
bodyParsed = JSON.parse(init.body);
|
|
591
|
+
} catch {
|
|
586
592
|
}
|
|
587
593
|
}
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
rawSpanCoverage: coverage,
|
|
608
|
-
issues
|
|
609
|
-
};
|
|
594
|
+
const hit = bodyParsed === void 0 ? void 0 : await cache.lookup(bodyParsed);
|
|
595
|
+
if (hit) {
|
|
596
|
+
opts.onHit?.({ url, provider: hit.request.provider, model: hit.request.model });
|
|
597
|
+
const status = hit.response.statusCode ?? 200;
|
|
598
|
+
const headers = new Headers(Object.entries(hit.response.responseHeaders ?? { "Content-Type": "application/json" }));
|
|
599
|
+
const bodyText = typeof hit.response.responseBody === "string" ? hit.response.responseBody : JSON.stringify(hit.response.responseBody ?? {});
|
|
600
|
+
return new Response(bodyText, { status, headers });
|
|
601
|
+
}
|
|
602
|
+
opts.onMissNotify?.({ url, requestBody: bodyParsed });
|
|
603
|
+
if (onMiss === "throw") {
|
|
604
|
+
const key = bodyParsed === void 0 ? "<unparseable>" : await keyFromBody(bodyParsed);
|
|
605
|
+
throw new ReplayCacheMissError(url, key);
|
|
606
|
+
}
|
|
607
|
+
if (onMiss === "fail-closed") {
|
|
608
|
+
return new Response(JSON.stringify({ error: "replay_cache_miss" }), { status: 599 });
|
|
609
|
+
}
|
|
610
|
+
if (!fallback) throw new Error("replay fetch: onMiss=fallback but no fallbackFetch configured");
|
|
611
|
+
return fallback(input, init);
|
|
612
|
+
});
|
|
610
613
|
}
|
|
611
|
-
function
|
|
612
|
-
if (!
|
|
614
|
+
async function* iterateRawCalls(sink, filter = {}) {
|
|
615
|
+
if (!sink.list) {
|
|
616
|
+
throw new Error("iterateRawCalls: sink must implement list().");
|
|
617
|
+
}
|
|
618
|
+
const events = await sink.list(filter);
|
|
619
|
+
const cache = await ReplayCache.fromEvents(events);
|
|
620
|
+
for (const entry of cache["byKey"].values()) yield entry;
|
|
621
|
+
}
|
|
622
|
+
async function requestKey(event) {
|
|
623
|
+
return keyFromBody(event.requestBody);
|
|
624
|
+
}
|
|
625
|
+
async function keyFromBody(body) {
|
|
626
|
+
if (body == null || typeof body !== "object") return hashJson({ raw: String(body) });
|
|
627
|
+
const b = body;
|
|
628
|
+
const reduced = canonicalize({
|
|
629
|
+
model: b.model ?? null,
|
|
630
|
+
messages: b.messages ?? null,
|
|
631
|
+
temperature: b.temperature ?? null,
|
|
632
|
+
max_tokens: b.max_tokens ?? null,
|
|
633
|
+
max_completion_tokens: b.max_completion_tokens ?? null,
|
|
634
|
+
response_format: b.response_format ?? null
|
|
635
|
+
});
|
|
636
|
+
return hashJson(reduced);
|
|
613
637
|
}
|
|
614
638
|
|
|
615
639
|
// src/trace-analyst/types.ts
|
|
@@ -1891,9 +1915,10 @@ export {
|
|
|
1891
1915
|
redactValue,
|
|
1892
1916
|
OTEL_AGENT_EVAL_SCOPE,
|
|
1893
1917
|
exportRunAsOtlp,
|
|
1894
|
-
|
|
1895
|
-
|
|
1896
|
-
|
|
1918
|
+
ReplayCacheMissError,
|
|
1919
|
+
ReplayCache,
|
|
1920
|
+
createReplayFetch,
|
|
1921
|
+
iterateRawCalls,
|
|
1897
1922
|
DEFAULT_TRACE_ANALYST_BUDGETS,
|
|
1898
1923
|
TRACE_ANALYST_TRUNCATION_MARKER_PREFIX,
|
|
1899
1924
|
OtlpFileTraceStore,
|
|
@@ -1917,4 +1942,4 @@ export {
|
|
|
1917
1942
|
defaultTraceInsightPanel,
|
|
1918
1943
|
buildTraceInsightPrompt
|
|
1919
1944
|
};
|
|
1920
|
-
//# sourceMappingURL=chunk-
|
|
1945
|
+
//# sourceMappingURL=chunk-4W4NCYM2.js.map
|