@tangle-network/agent-eval 0.20.12 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +177 -0
- package/README.md +43 -1
- package/dist/{chunk-KWUAAIHR.js → chunk-4W4NCYM2.js} +182 -1
- package/dist/chunk-4W4NCYM2.js.map +1 -0
- package/dist/{chunk-PKCVBYTQ.js → chunk-5IIQKMD5.js} +38 -2
- package/dist/chunk-5IIQKMD5.js.map +1 -0
- package/dist/{chunk-HNJLMAJ2.js → chunk-6KQG5HAH.js} +2 -2
- package/dist/chunk-6M774GY6.js +53 -0
- package/dist/chunk-6M774GY6.js.map +1 -0
- package/dist/{chunk-MCMV7DUL.js → chunk-ARZ6BEV6.js} +2 -2
- package/dist/chunk-IOXMGMHQ.js +1226 -0
- package/dist/chunk-IOXMGMHQ.js.map +1 -0
- package/dist/{chunk-75MCTH7P.js → chunk-KAO3Q65R.js} +198 -3
- package/dist/chunk-KAO3Q65R.js.map +1 -0
- package/dist/chunk-QUKKGHTZ.js +121 -0
- package/dist/chunk-QUKKGHTZ.js.map +1 -0
- package/dist/chunk-SQQLHODJ.js +163 -0
- package/dist/chunk-SQQLHODJ.js.map +1 -0
- package/dist/{chunk-IKFVX537.js → chunk-UAND2LOT.js} +232 -211
- package/dist/chunk-UAND2LOT.js.map +1 -0
- package/dist/{chunk-HKYRWNHV.js → chunk-USHQBPMH.js} +283 -7
- package/dist/chunk-USHQBPMH.js.map +1 -0
- package/dist/cli.js +3 -2
- package/dist/cli.js.map +1 -1
- package/dist/{control-C8NKbF3w.d.ts → control-cxwMOAsy.d.ts} +3 -2
- package/dist/control.d.ts +4 -3
- package/dist/control.js +2 -2
- package/dist/emitter-B2XqDKFU.d.ts +121 -0
- package/dist/{feedback-trajectory-BGQ_ANCN.d.ts → feedback-trajectory-CB0A32o3.d.ts} +2 -1
- package/dist/index.d.ts +16 -302
- package/dist/index.js +70 -62
- package/dist/index.js.map +1 -1
- package/dist/integrity-K2oVlF57.d.ts +210 -0
- package/dist/openapi.json +1 -1
- package/dist/optimization-UVDNKaO6.d.ts +574 -0
- package/dist/optimization.d.ts +7 -144
- package/dist/optimization.js +9 -2
- package/dist/reporting-B82RSv9C.d.ts +593 -0
- package/dist/reporting.d.ts +5 -426
- package/dist/reporting.js +17 -6
- package/dist/{emitter-BYO2nSDA.d.ts → store-u47QaJ9G.d.ts} +1 -91
- package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-D4p7RlDu.d.ts} +381 -1
- package/dist/traces.d.ts +179 -3
- package/dist/traces.js +35 -4
- package/dist/wire/index.js +3 -2
- package/docs/research-report-methodology.md +170 -0
- package/docs/wire-protocol.md +1 -1
- package/package.json +11 -13
- package/dist/chunk-75MCTH7P.js.map +0 -1
- package/dist/chunk-HKYRWNHV.js.map +0 -1
- package/dist/chunk-IKFVX537.js.map +0 -1
- package/dist/chunk-KWUAAIHR.js.map +0 -1
- package/dist/chunk-ODFINDLQ.js +0 -413
- package/dist/chunk-ODFINDLQ.js.map +0 -1
- package/dist/chunk-PKCVBYTQ.js.map +0 -1
- /package/dist/{chunk-HNJLMAJ2.js.map → chunk-6KQG5HAH.js.map} +0 -0
- /package/dist/{chunk-MCMV7DUL.js.map → chunk-ARZ6BEV6.js.map} +0 -0
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,182 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.22.0 — EvalCampaign + replay + always-valid + outcome calibration
|
|
4
|
+
|
|
5
|
+
0.21 shipped the four capture-integrity primitives as opt-in. Every consumer still had to wire them by hand, and the bug class blueprint-agent reported (forgotten wiring → silent partial-capture) reappears the moment a new consumer adopts agent-eval cold. **0.22 makes the right thing the default path** — and adds three primitives that compound on top of standardized capture: replay-from-raw-events, anytime-valid sequential evaluation, and rubric predictive validity. The four primitives together turn agent-eval from a TS framework into research-grade evaluation infrastructure.
|
|
6
|
+
|
|
7
|
+
### Added
|
|
8
|
+
|
|
9
|
+
#### `runEvalCampaign` — capture integrity by construction
|
|
10
|
+
|
|
11
|
+
Opinionated matrix runner that wires the four directives by construction. Inputs: variants, scenarios, seeds, an `LlmClientOptions`, factories for `TraceStore` and `RawProviderSink`, and a `runner(ctx)` callback. Outputs: per-cell `RunRecord[]`, `RunIntegrityReport[]`, optional `researchReport`, and a campaign fingerprint.
|
|
12
|
+
|
|
13
|
+
- **Preflight:** `assertLlmRoute` is called once before any work, with `{ requireExplicitBaseUrl: true, requireAuth: true }` defaults. Misconfigured routes never burn a run.
|
|
14
|
+
- **Per run:** the campaign constructs the `TraceStore`, `RawProviderSink`, and `TraceEmitter` (with `onRunComplete` hooks attached), then hands the runner an `LlmClientOptions` already pre-wired with `rawSink` + `traceContext`. The runner cannot accidentally call an LLM without capture.
|
|
15
|
+
- **Run-completion:** `assertRunCaptured` runs after every `endRun` with `{ llmSpansMin: 1, requireRawCoverageOfLlmSpans: true, requireOutcome: true }` defaults. Failures are routed via `onIntegrityFailure: 'throw' | 'mark_failed' | 'log'` (default `'mark_failed'`).
|
|
16
|
+
- **End of campaign:** if `report.comparator` is set, computes `researchReport` over the collected `RunRecord`s and embeds the campaign fingerprint + `preregistrationHash`.
|
|
17
|
+
- **Concurrency:** local async worker pool, default 1, configurable via `concurrency`.
|
|
18
|
+
- **Determinism:** the default `runId` generator is a stable hash of `(campaignId, variantId, scenarioId, seed)`, so re-running the same campaign produces the same ids; override `runId` for non-deterministic generation.
|
|
19
|
+
|
|
20
|
+
Exported from the root barrel and the `@tangle-network/agent-eval/optimization` subpath: `runEvalCampaign`, `CampaignRunner`, `CampaignRunContext`, `CampaignRunOutcome`, `CampaignVariant`, `CampaignScenario`, `EvalCampaignOptions`, `EvalCampaignResult`, `FailedRun`, `CampaignIntegrityPolicy`, `CampaignFactoryParams`.
|
|
21
|
+
|
|
22
|
+
#### Replay-from-raw-events
|
|
23
|
+
|
|
24
|
+
Every campaign run is now a re-runnable artifact. `ReplayCache.fromSink(sink)` turns a populated `RawProviderSink` into a deterministic `(canonicalised request → cached response)` map; `createReplayFetch(cache)` returns a `fetch`-shaped function that satisfies `/chat/completions` calls out of the cache and passes other URLs through.
|
|
25
|
+
|
|
26
|
+
```ts
|
|
27
|
+
const cache = await ReplayCache.fromSink(yesterdayRawSink)
|
|
28
|
+
const replayFetch = createReplayFetch(cache, { onMiss: 'fail-closed' })
|
|
29
|
+
await callLlm(req, { ...llmOpts, fetch: replayFetch }) // zero LLM cost
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
Use cases:
|
|
33
|
+
|
|
34
|
+
- Post-hoc judging — apply a new judge or scorer to last week's runs without burning a single token.
|
|
35
|
+
- Determinism audits — replay a campaign and verify the responses match byte-for-byte.
|
|
36
|
+
- Free judge calibration — run two judges on identical responses and measure agreement.
|
|
37
|
+
|
|
38
|
+
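`onMiss` is `'throw' | 'fallback' | 'fail-closed'` (default `'throw'`): `'throw'` raises `ReplayCacheMissError`, `'fallback'` forwards the call to the fallback `fetch`, and `'fail-closed'` returns a synthetic HTTP 599 response. The cache hashes a canonical projection (`model + messages + temperature + max_tokens|max_completion_tokens + response_format`) so insertion-order quirks don't cause spurious misses.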
|
|
39
|
+
|
|
40
|
+
Exported from root and `@tangle-network/agent-eval/traces`: `ReplayCache`, `createReplayFetch`, `iterateRawCalls`, `ReplayCacheEntry`, `ReplayCacheStats`, `ReplayFetchOptions`, `ReplayCacheMissError`.
|
|
41
|
+
|
|
42
|
+
#### Always-valid sequential evaluation
|
|
43
|
+
|
|
44
|
+
`pairedEvalueSequence(deltas, opts)` and `evaluateInterimReleaseConfidence({ deltaSeries })` ship the predictable plug-in betting martingale of Waudby-Smith & Ramdas (2024) for paired bounded outcomes, plus the empirical Bernstein confidence sequence of Howard et al. (2021) for the running mean. Both are *anytime-valid* — type-I error is bounded by α at every stopping time, no peeking penalty.
|
|
45
|
+
|
|
46
|
+
```ts
|
|
47
|
+
const verdict = evaluateInterimReleaseConfidence({
|
|
48
|
+
deltaSeries: [{ candidateId: 'cand', deltas }],
|
|
49
|
+
alpha: 0.05,
|
|
50
|
+
rope: { low: -0.02, high: 0.02 },
|
|
51
|
+
})
|
|
52
|
+
// → { recommendation: { decision: 'promote_now' | 'continue' | 'reject_now' | 'equivalent', candidateId } }
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
This closes the methodological hole flagged in the 0.21 methodology doc as out-of-scope. Consumers running rolling campaigns can now ship the moment evidence is decisive, stop-early on dead-on-arrival variants, and accumulate evidence across partial runs without spending the FDR budget. Tested under-the-null at α=0.05 on 100 synthetic series; false-rejection rate stays below the bound.
|
|
56
|
+
|
|
57
|
+
Exported from root and `@tangle-network/agent-eval/reporting`: `pairedEvalueSequence`, `evaluateInterimReleaseConfidence`, `PairedEvalueOptions`, `PairedEvalueSequence`, `PairedEvalueStep`, `InterimReleaseConfidence`, `InterimReleaseConfidenceInput`, `SequentialDecision`.
|
|
58
|
+
|
|
59
|
+
#### Rubric predictive validity
|
|
60
|
+
|
|
61
|
+
`rubricPredictiveValidity({ runs, outcomes, outcomeMetrics })` joins canonical campaign `RunRecord`s to a `DeploymentOutcomeStore` and reports per-rubric Pearson + Spearman + bootstrap CI against each outcome metric. Verdict bucketing: `'load_bearing' | 'informative' | 'decorative'` based on `|spearman|`. **Without this loop every rubric is faith-based;** with it, you know which rubrics earn their promotion power and which are decoration.
|
|
62
|
+
|
|
63
|
+
```ts
|
|
64
|
+
const validity = await rubricPredictiveValidity({
|
|
65
|
+
runs: lastQuarterRuns,
|
|
66
|
+
outcomes: shipFlagOutcomeStore,
|
|
67
|
+
outcomeMetrics: ['revenue_lift', 'retention_30d', 'csat'],
|
|
68
|
+
})
|
|
69
|
+
for (const r of validity.ranked) {
|
|
70
|
+
console.log(`${r.rubric} → ${r.bestOutcome}: ρ=${r.spearman.toFixed(2)} (${r.verdict})`)
|
|
71
|
+
}
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Builds on the existing `correlationStudy` primitive but works directly off `RunRecord` (the canonical campaign artifact) rather than `Run` from a `TraceStore`, so it composes cleanly with `runEvalCampaign`'s output. Returns a per-rubric ranking + every (rubric, outcome) pair tested + a list of rubrics that produced no usable data.
|
|
75
|
+
|
|
76
|
+
Exported from root and `@tangle-network/agent-eval/reporting`: `rubricPredictiveValidity`, `RubricOutcomePair`, `RubricRanking`, `RubricPredictiveValidityInput`, `RubricPredictiveValidityReport`. The existing `correlationStudy`, `OutcomeStore`, `InMemoryOutcomeStore`, `FileSystemOutcomeStore` continue to work unchanged.
|
|
77
|
+
|
|
78
|
+
#### `NoopRawProviderSink.list()` returns `[]`
|
|
79
|
+
|
|
80
|
+
Explicit opt-out from capture is no longer flagged by `assertRunCaptured` as `no_raw_sink`. Opt-out remains a deliberate choice; the campaign still requires the matching integrity overrides.
|
|
81
|
+
|
|
82
|
+
### Why
|
|
83
|
+
|
|
84
|
+
Every consumer that adopted agent-eval before 0.22 wrote their own matrix runner, and every one of them re-introduced the same forgettable wiring (raw sink, route guard, integrity assertion, analyst hook). 0.21 documented the pattern; 0.22 owns it. The four new primitives compound:
|
|
85
|
+
|
|
86
|
+
- `runEvalCampaign` standardises the artifact (`RunRecord` + raw events + fingerprint).
|
|
87
|
+
- Replay turns every past run into free training/validation data for new judges.
|
|
88
|
+
- Sequential evaluation makes "ship-when-evidence-says-so" mathematically defensible.
|
|
89
|
+
- Predictive validity converts evals from belief-based to outcome-anchored.
|
|
90
|
+
|
|
91
|
+
`runMultiShotOptimization` remains the right primitive for trajectory-shaped GEPA optimization sweeps; `runPromptEvolution` for prompt + code evolution loops with sandbox pools; `runEvalCampaign` for the "compare N variants on M scenarios with K seeds and tell me which to ship" case that makes up the bulk of consumer evals.
|
|
92
|
+
|
|
93
|
+
### References
|
|
94
|
+
|
|
95
|
+
- Howard, S. R., Ramdas, A., McAuliffe, J., Sekhon, J. (2021). Time-uniform, nonparametric, nonasymptotic confidence sequences. *Annals of Statistics*, 49(2), 1055–1080.
|
|
96
|
+
- Waudby-Smith, I., Ramdas, A. (2024). Estimating means of bounded random variables by betting. *JRSS B*, 86(1), 1–27.
|
|
97
|
+
|
|
98
|
+
### Migration
|
|
99
|
+
|
|
100
|
+
Existing consumers do not need to change. All four primitives are additive. Recommended path: on the next eval-runner refactor, replace hand-rolled matrix loops with `runEvalCampaign`. Use `evaluateInterimReleaseConfidence` for any campaign you run on a recurring cadence. Wire `rubricPredictiveValidity` once you have ≥ 30 deployment outcomes joinable by `runId`. Replay is a free win — once campaigns are running, every eval R&D loop drops to CPU-bound.
|
|
101
|
+
|
|
102
|
+
## 0.21.0 — capture integrity + launch-grade reporting
|
|
103
|
+
|
|
104
|
+
This release closes the layer-1 gap a downstream consumer surfaced: better
|
|
105
|
+
post-run statistics don't help if the underlying data wasn't captured. 0.21
|
|
106
|
+
adds first-class raw provider-event capture, a fail-loud route guard, a
|
|
107
|
+
run-completion integrity check, and run-complete hooks (with a trace-analyst
|
|
108
|
+
auto-execution helper) so a direct matrix run produces complete forensics
|
|
109
|
+
without out-of-band glue.
|
|
110
|
+
|
|
111
|
+
### Added
|
|
112
|
+
|
|
113
|
+
- **`RawProviderSink` (capture).** First-class persistence for HTTP-level
|
|
114
|
+
provider request / response / error payloads alongside the structured
|
|
115
|
+
`LlmSpan`. `InMemoryRawProviderSink`, `FileSystemRawProviderSink` (NDJSON,
|
|
116
|
+
rolls at 32 MiB), and `NoopRawProviderSink` ship in core. Default redactor
|
|
117
|
+
strips `Authorization` / `X-Api-Key` / `Cookie` headers and credential-shaped
|
|
118
|
+
body fields (`apiKey`, `bearer`, `password`, `secret`, `token`); redacted
|
|
119
|
+
paths are recorded on `event.redactedFields` so a reviewer can see what was
|
|
120
|
+
stripped without exposing values. Wired into `callLlm` via
|
|
121
|
+
`LlmClientOptions.rawSink` — every retry attempt produces a `request` and
|
|
122
|
+
either a `response` or `error` event with the attempt index attached.
|
|
123
|
+
- **`assertLlmRoute` (route guard).** Pure function that throws
|
|
124
|
+
`LlmRouteAssertionError` when the configured client doesn't match the
|
|
125
|
+
caller's route requirements: `requireExplicitBaseUrl`, `allowedBaseUrls`,
|
|
126
|
+
`blockedBaseUrls`, `requireAuth`, `expectedProvider`. Designed for the
|
|
127
|
+
matrix-runner preflight — fail loud at the boundary instead of silently
|
|
128
|
+
falling back to the public/free-tier router.
|
|
129
|
+
- **`assertRunCaptured` (integrity check).** Read-only check on
|
|
130
|
+
`(store, runId, expectations)` that returns a structured
|
|
131
|
+
`RunIntegrityReport` with issue codes (`missing_llm_spans`,
|
|
132
|
+
`missing_raw_events`, `orphan_llm_span`, `no_raw_sink`, `missing_outcome`,
|
|
133
|
+
…). Pair with the new `requireRawCoverageOfLlmSpans` to assert every
|
|
134
|
+
`LlmSpan` has a matching raw `request` event. Use directly or via
|
|
135
|
+
`throwIfRunIncomplete` for strict mode.
|
|
136
|
+
- **`onRunComplete` hooks on `TraceEmitter`.** New
|
|
137
|
+
`TraceEmitterOptions.onRunComplete` array fires after `endRun` / `abortRun`
|
|
138
|
+
with full run context (run id, outcome, status, store, emitter). Errors are
|
|
139
|
+
swallowed and recorded as `log` events by default; opt into propagation via
|
|
140
|
+
`hookErrors: 'throw'`. `addRunCompleteHook` attaches hooks after construction.
|
|
141
|
+
- **`traceAnalystOnRunComplete` factory.** Drop-in run-complete hook that
|
|
142
|
+
runs `analyzeTraces` after each run and persists the result. Resolves the
|
|
143
|
+
"trace analyst never ran on this matrix sweep" complaint by making
|
|
144
|
+
auto-execution declarative.
|
|
145
|
+
- **`researchReport`** — executive research-report layer for coding-vertical
|
|
146
|
+
benchmark runs (originally landed in #34, elevated in #35). Composes
|
|
147
|
+
`summaryTable`, `paretoChart`, `gainHistogram`, held-out gate decisions,
|
|
148
|
+
and optional `failureClusterView` output into one structured artifact:
|
|
149
|
+
promote / hold / equivalent / reject / needs-more-data guidance with
|
|
150
|
+
rationale, risks, next actions, markdown, HTML, and JSON chart specs.
|
|
151
|
+
- Decisions are made on paired evidence — never on marginal means alone.
|
|
152
|
+
- ROPE (Region of Practical Equivalence) supported via the `rope` option.
|
|
153
|
+
- Bayesian-bootstrap-style `Pr(Δ>0)` and `Pr(Δ∈ROPE)` summaries (Rubin 1981).
|
|
154
|
+
- Per-candidate minimum detectable paired effect via `pairedMde`.
|
|
155
|
+
- SHA-256 `runFingerprint` and optional `preregistrationHash` linking a
|
|
156
|
+
signed `HypothesisManifest`.
|
|
157
|
+
- Embedded methodology + `docs/research-report-methodology.md` companion.
|
|
158
|
+
- **`pairedMde`** in `power-analysis`: closed-form minimum detectable paired
|
|
159
|
+
effect (inverse to the paired-t / sign-rank power formula).
|
|
160
|
+
|
|
161
|
+
### Changed
|
|
162
|
+
|
|
163
|
+
- `researchReport` is async (uses Web Crypto via `hashJson` for the run
|
|
164
|
+
fingerprint).
|
|
165
|
+
- Default `researchReport.minPairs` is 20 (soft floor); hard floor of 6 is
|
|
166
|
+
enforced regardless via `RESEARCH_REPORT_HARD_PAIR_FLOOR`.
|
|
167
|
+
|
|
168
|
+
### Wire-protocol consumers
|
|
169
|
+
|
|
170
|
+
No wire-protocol changes. The new capture / integrity / hook primitives are
|
|
171
|
+
TypeScript-only; cross-language consumers continue to use the existing RPC
|
|
172
|
+
surface.
|
|
173
|
+
|
|
174
|
+
### Python client
|
|
175
|
+
|
|
176
|
+
The PyPI distribution was renamed from `tangle-agent-eval` to **`agent-eval-rpc`**, and the import path changed from `tangle_agent_eval` to `agent_eval_rpc`. The new name accurately describes the package — it is a thin RPC client over the Node runtime, not a Python re-implementation of the eval logic — and the npm scope (`@tangle-network/agent-eval`) already provides the namespacing the `tangle-` prefix was substituting for. No prior PyPI version ever shipped under the old name (Trusted Publisher misconfiguration; see issue #40), so this rename is a clean first publish rather than a migration.
|
|
177
|
+
|
|
178
|
+
Locked at `agent-eval-rpc==0.21.0` to match the npm package.
|
|
179
|
+
|
|
3
180
|
## 0.20.10 — hardening audit follow-up
|
|
4
181
|
|
|
5
182
|
### Fixed
|
package/README.md
CHANGED
|
@@ -111,9 +111,51 @@ import { renderReleaseReport } from '@tangle-network/agent-eval/reporting'
|
|
|
111
111
|
| Compare prompt/tool/retrieval policies over full trajectories | `runMultiShotOptimization` |
|
|
112
112
|
| Gate releases with paired evidence and holdouts | `evaluateReleaseConfidence`, `HeldOutGate` |
|
|
113
113
|
| Explain regressions across trace corpora | `TraceAnalyst` / `analyzeTraces` |
|
|
114
|
-
| Report a launch decision | `renderReleaseReport`, `summaryTable`, `paretoChart`, `gainHistogram` |
|
|
114
|
+
| Report a launch decision | `renderReleaseReport`, `researchReport`, `summaryTable`, `paretoChart`, `gainHistogram` |
|
|
115
|
+
| Capture every provider HTTP request / response for forensics | `RawProviderSink`, `LlmClientOptions.rawSink` |
|
|
116
|
+
| Fail loud if an eval would silently use the wrong route | `assertLlmRoute` |
|
|
117
|
+
| Assert at run-end that the artifact is complete | `assertRunCaptured`, `throwIfRunIncomplete` |
|
|
118
|
+
| Auto-execute the trace analyst on every run | `traceAnalystOnRunComplete` + `TraceEmitterOptions.onRunComplete` |
|
|
119
|
+
| Run a matrix of variants × scenarios × seeds with capture integrity by construction | `runEvalCampaign` |
|
|
120
|
+
| Re-judge / determinism-audit a past campaign for free | `ReplayCache`, `createReplayFetch` |
|
|
121
|
+
| Ship-when-decisive with anytime-valid α across rolling looks | `pairedEvalueSequence`, `evaluateInterimReleaseConfidence` |
|
|
122
|
+
| Tell load-bearing rubrics from decorative ones using deployment outcomes | `rubricPredictiveValidity` |
|
|
115
123
|
| Model missing context separately from bad reasoning | `KnowledgeRequirement`, `KnowledgeBundle` |
|
|
116
124
|
|
|
125
|
+
### Capture integrity (0.21+)
|
|
126
|
+
|
|
127
|
+
Launch-grade benchmark runs need four things that are easy to forget in glue
|
|
128
|
+
code: (1) raw HTTP capture alongside the structured spans so a reviewer can
|
|
129
|
+
verify which route answered, (2) a preflight assertion that the configured
|
|
130
|
+
client points at the intended provider, (3) a run-end assertion that the
|
|
131
|
+
expected events were actually written, and (4) auto-execution of the trace
|
|
132
|
+
analyst as part of the run lifecycle. The wiring fits in a few lines:
|
|
133
|
+
|
|
134
|
+
```ts
|
|
135
|
+
import {
|
|
136
|
+
TraceEmitter, FileSystemRawProviderSink, callLlm, assertLlmRoute,
|
|
137
|
+
assertRunCaptured, throwIfRunIncomplete,
|
|
138
|
+
} from '@tangle-network/agent-eval'
|
|
139
|
+
import { traceAnalystOnRunComplete } from '@tangle-network/agent-eval/traces'
|
|
140
|
+
|
|
141
|
+
const sink = new FileSystemRawProviderSink({ dir: `${workDir}/raw-events` })
|
|
142
|
+
assertLlmRoute(llmOpts, { requireExplicitBaseUrl: true, allowedBaseUrls, requireAuth: true })
|
|
143
|
+
|
|
144
|
+
const emitter = new TraceEmitter(store, {
|
|
145
|
+
onRunComplete: [traceAnalystOnRunComplete({ analyze: analystOpts, save })],
|
|
146
|
+
})
|
|
147
|
+
await emitter.startRun(/* ... */)
|
|
148
|
+
// LLM calls flow through callLlm with `{ rawSink: sink, traceContext: { runId, spanId } }`.
|
|
149
|
+
await emitter.endRun({ pass, score })
|
|
150
|
+
|
|
151
|
+
throwIfRunIncomplete(await assertRunCaptured(store, emitter.runId, {
|
|
152
|
+
llmSpansMin: 1, rawSink: sink, requireRawCoverageOfLlmSpans: true, requireOutcome: true,
|
|
153
|
+
}))
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
Directives, rationale, and shipped-bug context are in
|
|
157
|
+
[`SKILL.md` § Capture integrity](./.claude/skills/agent-eval/SKILL.md#capture-integrity-required-for-launch-grade-adoption).
|
|
158
|
+
|
|
117
159
|
## Examples
|
|
118
160
|
|
|
119
161
|
Runnable examples live in
|
|
@@ -1,3 +1,8 @@
|
|
|
1
|
+
import {
|
|
2
|
+
canonicalize,
|
|
3
|
+
hashJson
|
|
4
|
+
} from "./chunk-6M774GY6.js";
|
|
5
|
+
|
|
1
6
|
// src/trace/store.ts
|
|
2
7
|
var InMemoryTraceStore = class {
|
|
3
8
|
runs = /* @__PURE__ */ new Map();
|
|
@@ -497,6 +502,140 @@ function runToTraceId(run) {
|
|
|
497
502
|
return cleaned.slice(0, 32).padEnd(32, "0");
|
|
498
503
|
}
|
|
499
504
|
|
|
505
|
+
// src/replay.ts
|
|
506
|
+
/**
 * Error describing a replay-cache miss for a single request.
 *
 * Carries the request URL and the cache key supplied by the caller so the
 * miss can be diagnosed without re-deriving either value.
 */
var ReplayCacheMissError = class extends Error {
  /** URL of the request that had no cached response. */
  url;
  /** Cache key the caller looked up (passed through verbatim). */
  requestKey;
  /**
   * @param {string} url - URL of the request that missed.
   * @param {string} requestKey2 - Cache key that was looked up.
   * @param {string} [message] - Optional override for the default message.
   */
  constructor(url, requestKey2, message) {
    const text = message ?? `replay cache miss for ${url} (key=${requestKey2})`;
    super(text);
    this.name = "ReplayCacheMissError";
    this.url = url;
    this.requestKey = requestKey2;
  }
};
|
|
516
|
+
/**
 * Map from a hashed request body to the captured (request, response) event
 * pair, built from raw provider events.
 */
var ReplayCache = class _ReplayCache {
  /** Request key -> `{ request, response }` entry. */
  byKey = /* @__PURE__ */ new Map();
  /** Number of request groups that had no response-side event. */
  orphans = 0;
  /** Cached-entry count per `request.provider`. */
  byProvider = {};
  /** Cached-entry count per `request.model`. */
  byModel = {};
  /**
   * Build a cache from a sink's events. The sink must implement `list()`.
   * Filter by `runId` / `spanId` to scope to a specific replay.
   *
   * @param sink - Raw event sink; `sink.list(filter)` supplies the events.
   * @param filter - Passed through to `sink.list`.
   * @throws {Error} When the sink has no `list` method.
   */
  static async fromSink(sink, filter = {}) {
    if (!sink.list) {
      throw new Error("ReplayCache.fromSink: sink must implement list() to be replayable.");
    }
    const events = await sink.list(filter);
    return _ReplayCache.fromEvents(events);
  }
  /**
   * Build a cache from an in-memory event list.
   *
   * Events are grouped by `(runId, spanId, attemptIndex)`. Within a group the
   * last `direction === "request"` event is the request and the last event of
   * any other direction is the response. Groups without a request are
   * ignored; groups without a response are counted as orphans.
   *
   * NOTE(review): events that carry neither `runId` nor `spanId` share one
   * group per `attemptIndex`, so only the last such pair survives.
   */
  static async fromEvents(events) {
    const cache = new _ReplayCache();
    const groups = /* @__PURE__ */ new Map();
    for (const e of events) {
      const k = `${e.runId ?? ""}::${e.spanId ?? ""}::${e.attemptIndex}`;
      const g = groups.get(k) ?? {};
      if (e.direction === "request") g.req = e;
      else g.res = e;
      groups.set(k, g);
    }
    for (const g of groups.values()) {
      if (!g.req) continue;
      if (!g.res) {
        cache.orphans += 1;
        continue;
      }
      const key = await requestKey(g.req);
      // Identical request bodies hash to the same key; the later pair wins.
      cache.byKey.set(key, { request: g.req, response: g.res });
    }
    // Tally from the entries that actually survived de-duplication so the
    // per-provider / per-model counts always sum to `byKey.size`. Counting
    // per group (as before) over-reported whenever two groups shared a key.
    for (const { request } of cache.byKey.values()) {
      cache.byProvider[request.provider] = (cache.byProvider[request.provider] ?? 0) + 1;
      cache.byModel[request.model] = (cache.byModel[request.model] ?? 0) + 1;
    }
    return cache;
  }
  /** Number of cacheable (request, response) pairs in the cache. */
  size() {
    return this.byKey.size;
  }
  /**
   * Snapshot of cache statistics: entry total, copies of the per-provider and
   * per-model tallies, and the orphan-request count.
   */
  stats() {
    return {
      total: this.byKey.size,
      byProvider: { ...this.byProvider },
      byModel: { ...this.byModel },
      orphanRequests: this.orphans
    };
  }
  /**
   * Look up a cached entry for a request body. The body is reduced to a key
   * by `keyFromBody`. Returns `undefined` on miss; the caller decides whether
   * to throw, fall back to the network, or skip.
   */
  async lookup(requestBody) {
    const key = await keyFromBody(requestBody);
    return this.byKey.get(key);
  }
};
|
|
578
|
+
/**
 * Create a `fetch`-shaped function that answers `/chat/completions` requests
 * from `cache` and forwards every other URL to the fallback fetch.
 *
 * @param cache - Object exposing `lookup(parsedBody)` that resolves to a
 *   `{ request, response }` entry or `undefined`.
 * @param opts - Optional settings:
 *   - `onMiss`: `"throw"` (default) raises `ReplayCacheMissError`,
 *     `"fail-closed"` returns a synthetic 599 response, anything else
 *     forwards the call to the fallback fetch.
 *   - `fallbackFetch`: fetch used for pass-through; defaults to the global
 *     `fetch` when one exists.
 *   - `onHit` / `onMissNotify`: observers invoked on hit / miss.
 * @returns An async `(input, init) => Response` function.
 */
function createReplayFetch(cache, opts = {}) {
  const onMiss = opts.onMiss ?? "throw";
  const fallback = opts.fallbackFetch ?? globalThis.fetch?.bind(globalThis);
  // Statuses for which the Response constructor rejects a non-null body.
  const nullBodyStatuses = /* @__PURE__ */ new Set([204, 205, 304]);
  return (async (input, init) => {
    const url = typeof input === "string" ? input : input instanceof URL ? input.toString() : input.url;
    if (!/\/chat\/completions(?:[?#].*)?$/.test(url)) {
      if (!fallback) throw new Error(`replay fetch: non-completions URL ${url} but no fallbackFetch configured`);
      return fallback(input, init);
    }
    let bodyText;
    if (typeof init?.body === "string") {
      bodyText = init.body;
    } else if (init?.body == null && typeof Request !== "undefined" && input instanceof Request && !input.bodyUsed) {
      // A Request object carries its own body. Read it from a clone so the
      // original stays unconsumed for a possible fallback call.
      bodyText = await input.clone().text();
    }
    let bodyParsed;
    if (bodyText) {
      try {
        bodyParsed = JSON.parse(bodyText);
      } catch {
        // A non-JSON body cannot match any cached request; treat it as a miss.
      }
    }
    const hit = bodyParsed === void 0 ? void 0 : await cache.lookup(bodyParsed);
    if (hit) {
      opts.onHit?.({ url, provider: hit.request.provider, model: hit.request.model });
      const status = hit.response.statusCode ?? 200;
      const headers = new Headers(Object.entries(hit.response.responseHeaders ?? { "Content-Type": "application/json" }));
      const replayBody = typeof hit.response.responseBody === "string" ? hit.response.responseBody : JSON.stringify(hit.response.responseBody ?? {});
      return new Response(nullBodyStatuses.has(status) ? null : replayBody, { status, headers });
    }
    opts.onMissNotify?.({ url, requestBody: bodyParsed });
    if (onMiss === "throw") {
      const key = bodyParsed === void 0 ? "<unparseable>" : await keyFromBody(bodyParsed);
      throw new ReplayCacheMissError(url, key);
    }
    if (onMiss === "fail-closed") {
      return new Response(JSON.stringify({ error: "replay_cache_miss" }), {
        status: 599,
        headers: { "Content-Type": "application/json" }
      });
    }
    if (!fallback) throw new Error("replay fetch: onMiss=fallback but no fallbackFetch configured");
    return fallback(input, init);
  });
}
|
|
614
|
+
/**
 * Async generator over the (request, response) entries that a `ReplayCache`
 * built from the sink's events would hold.
 *
 * @param sink - Raw event sink; must implement `list(filter)`.
 * @param filter - Passed through to `sink.list`.
 * @throws {Error} When the sink has no `list` method (raised on first pull).
 */
async function* iterateRawCalls(sink, filter = {}) {
  const listable = Boolean(sink.list);
  if (!listable) {
    throw new Error("iterateRawCalls: sink must implement list().");
  }
  const rawEvents = await sink.list(filter);
  const replay = await ReplayCache.fromEvents(rawEvents);
  const pairs = replay.byKey.values();
  for (const pair of pairs) {
    yield pair;
  }
}
|
|
622
|
+
/**
 * Cache key for a captured request event: the key of its `requestBody`.
 *
 * @param event - Raw request event carrying a `requestBody`.
 * @returns Whatever `keyFromBody` resolves to for that body.
 */
async function requestKey(event) {
  const capturedBody = event.requestBody;
  return keyFromBody(capturedBody);
}
|
|
625
|
+
/**
 * Reduce a request body to a hash key.
 *
 * Non-object bodies (including `null` / `undefined`) are hashed by their
 * string form. Object bodies are projected onto a fixed set of fields, with
 * absent fields recorded as `null`, then canonicalised and hashed; every
 * other field of the body is ignored.
 *
 * @param body - Parsed request body.
 * @returns The result of `hashJson` for the reduced body.
 */
async function keyFromBody(body) {
  const isRecord = body != null && typeof body === "object";
  if (!isRecord) {
    return hashJson({ raw: String(body) });
  }
  const keyFields = [
    "model",
    "messages",
    "temperature",
    "max_tokens",
    "max_completion_tokens",
    "response_format"
  ];
  const projection = {};
  for (const field of keyFields) {
    projection[field] = body[field] ?? null;
  }
  return hashJson(canonicalize(projection));
}
|
|
638
|
+
|
|
500
639
|
// src/trace-analyst/types.ts
|
|
501
640
|
var DEFAULT_TRACE_ANALYST_BUDGETS = {
|
|
502
641
|
perCallByteCeiling: 15e4,
|
|
@@ -1468,6 +1607,43 @@ function normalizeRecordArray(value) {
|
|
|
1468
1607
|
return value.map((item) => item && typeof item === "object" ? { ...item } : { value: item });
|
|
1469
1608
|
}
|
|
1470
1609
|
|
|
1610
|
+
// src/trace-analyst/hook.ts
|
|
1611
|
+
var DEFAULT_QUESTION = "Summarise what happened in this run. Surface any failure modes, surprising findings, or evidence that the run's verdict is wrong.";
|
|
1612
|
+
/**
 * Build a run-complete hook that runs `analyzeTraces` for the finished run.
 *
 * The returned hook:
 *   1. returns immediately when `opts.shouldRun` is set and yields a falsy value;
 *   2. appends an `analyst-skip-*` log event and returns when
 *      `opts.analyze.source` is undefined;
 *   3. otherwise calls `analyzeTraces` with `opts.question` (or the default
 *      question), passes the result to `opts.save` when provided, and appends
 *      an `analyst-gate-*` log event with the findings when `opts.gateOn` is
 *      set and yields a falsy value.
 *
 * @param opts - `{ analyze, question?, shouldRun?, save?, gateOn? }`.
 * @returns `async (ctx) => void`, where `ctx` supplies `runId` and `store`.
 */
function traceAnalystOnRunComplete(opts) {
  // Both log events share one envelope; only the id and payload details differ.
  const appendHookLog = (ctx, eventId, details) => ctx.store.appendEvent({
    eventId,
    runId: ctx.runId,
    kind: "log",
    timestamp: Date.now(),
    payload: { source: "trace_analyst_hook", ...details }
  });
  return async (ctx) => {
    const skippedByCaller = opts.shouldRun && !opts.shouldRun(ctx);
    if (skippedByCaller) {
      return;
    }
    const { source } = opts.analyze;
    if (source === void 0) {
      await appendHookLog(ctx, `analyst-skip-${ctx.runId}`, { reason: "no source configured" });
      return;
    }
    const question = opts.question ?? DEFAULT_QUESTION;
    const result = await analyzeTraces({ question }, { ...opts.analyze, source });
    if (opts.save) {
      await opts.save(result, ctx);
    }
    const gatePassed = !opts.gateOn || opts.gateOn(result, ctx);
    if (gatePassed) {
      return;
    }
    await appendHookLog(ctx, `analyst-gate-${ctx.runId}`, {
      reason: "analyst_gate_failed",
      findings: result.findings
    });
  };
}
|
|
1646
|
+
|
|
1471
1647
|
// src/trace-analyst/insights.ts
|
|
1472
1648
|
var DOMAIN_STOP_WORDS = /* @__PURE__ */ new Set([
|
|
1473
1649
|
"and",
|
|
@@ -1739,6 +1915,10 @@ export {
|
|
|
1739
1915
|
redactValue,
|
|
1740
1916
|
OTEL_AGENT_EVAL_SCOPE,
|
|
1741
1917
|
exportRunAsOtlp,
|
|
1918
|
+
ReplayCacheMissError,
|
|
1919
|
+
ReplayCache,
|
|
1920
|
+
createReplayFetch,
|
|
1921
|
+
iterateRawCalls,
|
|
1742
1922
|
DEFAULT_TRACE_ANALYST_BUDGETS,
|
|
1743
1923
|
TRACE_ANALYST_TRUNCATION_MARKER_PREFIX,
|
|
1744
1924
|
OtlpFileTraceStore,
|
|
@@ -1751,6 +1931,7 @@ export {
|
|
|
1751
1931
|
buildTraceAnalystTools,
|
|
1752
1932
|
traceAnalystFunctionGroup,
|
|
1753
1933
|
analyzeTraces,
|
|
1934
|
+
traceAnalystOnRunComplete,
|
|
1754
1935
|
tokenizeDomainWords,
|
|
1755
1936
|
inferDomainKeywords,
|
|
1756
1937
|
domainEvidencePattern,
|
|
@@ -1761,4 +1942,4 @@ export {
|
|
|
1761
1942
|
defaultTraceInsightPanel,
|
|
1762
1943
|
buildTraceInsightPrompt
|
|
1763
1944
|
};
|
|
1764
|
-
//# sourceMappingURL=chunk-
|
|
1945
|
+
//# sourceMappingURL=chunk-4W4NCYM2.js.map
|