npm - @tangle-network/agent-eval - Versions diffs - 0.21.0 → 0.23.0 - Mend

@tangle-network/agent-eval 0.21.0 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68) hide show

package/CHANGELOG.md +236 -1
package/README.md +17 -3
package/dist/benchmarks/index.d.ts +2 -2
package/dist/{chunk-WOK2RTWG.js → chunk-4W4NCYM2.js} +134 -109
package/dist/chunk-4W4NCYM2.js.map +1 -0
package/dist/{chunk-WOPGKVN4.js → chunk-6KQG5HAH.js} +2 -2
package/dist/chunk-6M774GY6.js +53 -0
package/dist/chunk-6M774GY6.js.map +1 -0
package/dist/chunk-7EAUOUQS.js +495 -0
package/dist/chunk-7EAUOUQS.js.map +1 -0
package/dist/chunk-AXHNWLIX.js +246 -0
package/dist/chunk-AXHNWLIX.js.map +1 -0
package/dist/chunk-EXGR4XEM.js +283 -0
package/dist/chunk-EXGR4XEM.js.map +1 -0
package/dist/{chunk-3IX6QTB7.js → chunk-IOXMGMHQ.js} +418 -541
package/dist/chunk-IOXMGMHQ.js.map +1 -0
package/dist/{chunk-3GN6U53I.js → chunk-KAO3Q65R.js} +2 -2
package/dist/chunk-LZKIOBG2.js +2026 -0
package/dist/chunk-LZKIOBG2.js.map +1 -0
package/dist/{chunk-YUFXO3TU.js → chunk-QBW3YBTR.js} +1 -1
package/dist/chunk-QBW3YBTR.js.map +1 -0
package/dist/chunk-QUKKGHTZ.js +121 -0
package/dist/chunk-QUKKGHTZ.js.map +1 -0
package/dist/{chunk-SNUHRBDL.js → chunk-SQQLHODJ.js} +10 -1
package/dist/{chunk-SNUHRBDL.js.map → chunk-SQQLHODJ.js.map} +1 -1
package/dist/{chunk-ARZ6BEV6.js → chunk-V5QSWN7L.js} +2 -2
package/dist/{chunk-HRZELXCR.js → chunk-VQQSPGSM.js} +3 -3
package/dist/cli.js +3 -3
package/dist/{control-cxwMOAsy.d.ts → control-DvkH87qJ.d.ts} +2 -2
package/dist/control.d.ts +3 -3
package/dist/control.js +2 -2
package/dist/eval-campaign-Ds5QljIh.d.ts +573 -0
package/dist/{feedback-trajectory-CB0A32o3.d.ts → feedback-trajectory-c43WGtTX.d.ts} +1 -1
package/dist/{index-c5saLbKD.d.ts → index-DDTlbHEK.d.ts} +1 -1
package/dist/index-ekBXweiQ.d.ts +1894 -0
package/dist/index.d.ts +20 -430
package/dist/index.js +154 -34
package/dist/index.js.map +1 -1
package/dist/integrity-Cr5YodSY.d.ts +210 -0
package/dist/openapi.json +1 -1
package/dist/optimization.d.ts +7 -145
package/dist/optimization.js +12 -3
package/dist/reporting.d.ts +294 -4
package/dist/reporting.js +18 -9
package/dist/rl.d.ts +8 -0
package/dist/rl.js +113 -0
package/dist/rl.js.map +1 -0
package/dist/{run-record-CX_jcAyr.d.ts → run-record-DNiOMBrZ.d.ts} +10 -1
package/dist/sequential-DgU2mFsE.d.ts +304 -0
package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-Ce1r4EYo.d.ts} +382 -2
package/dist/traces.d.ts +101 -181
package/dist/traces.js +19 -8
package/dist/wire/index.js +3 -3
package/docs/auto-research-loop-end-to-end.md +186 -0
package/docs/research-report-methodology.md +19 -4
package/docs/three-package-architecture.md +180 -0
package/docs/wire-protocol.md +1 -1
package/package.json +7 -2
package/dist/chunk-3IX6QTB7.js.map +0 -1
package/dist/chunk-KRR4VMH7.js +0 -423
package/dist/chunk-KRR4VMH7.js.map +0 -1
package/dist/chunk-WOK2RTWG.js.map +0 -1
package/dist/chunk-YUFXO3TU.js.map +0 -1
package/dist/reporting-Da2ihlcM.d.ts +0 -672
/package/dist/{chunk-WOPGKVN4.js.map → chunk-6KQG5HAH.js.map} +0 -0
/package/dist/{chunk-3GN6U53I.js.map → chunk-KAO3Q65R.js.map} +0 -0
/package/dist/{chunk-ARZ6BEV6.js.map → chunk-V5QSWN7L.js.map} +0 -0
/package/dist/{chunk-HRZELXCR.js.map → chunk-VQQSPGSM.js.map} +0 -0

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,238 @@
 # Changelog
+## 0.23.0 — RL primitives + auto-research worked example
+In addition to the RL bridge primitives below, this release ships the
+canonical worked example of the auto-research loop end-to-end against
+agent-builder, plus a concrete prime-rl SFT integration. The auto-research
+thesis — capture → score → preferences → mutate → improved candidate —
+is now demonstrably real, not aspirational.
+### Added (worked examples)
+- **`examples/auto-research-with-agent-builder/`** — runnable demo of the
+  closed loop: a synthetic agent-builder driver iterates 4 generations
+  of prompt variants, with each generation's runs feeding
+  `analyzeOptimizationResult` for preferences + reward-hacking + sequential
+  verdict, and the next generation proposed via a deterministic mutator.
+  The demo shows score climbing from 0.739 → 0.973 over 4 iterations on
+  the synthetic environment. Real-driver mode (replace the synthetic
+  runner with `runForgeBuilderSim` from `agent-builder`) is documented
+  inline.
+- **`examples/fine-tune-with-prime-rl/`** — concrete integration with
+  Prime Intellect's prime-rl SFT trainer. Reads `RunRecord[]` (NDJSON),
+  filters to high-quality runs, projects via `toSftRows` to messages-list
+  JSONL, writes a 15-line prime-rl SFT TOML config, prints the runnable
+  command. ~150 LoC of glue. SFT was chosen as the first integration
+  because it's the cleanest fit between agent-eval's exporters and
+  prime-rl's entrypoints (DPO/PRM go to TRL; offline GRPO requires a
+  custom verifiers env — both called out in the README).
+- **`docs/three-package-architecture.md`** — the contracts between
+  agent-eval, agent-knowledge, agent-runtime. Dependency direction (both
+  consume agent-eval; agent-eval imports neither), shared data
+  interchange (RunRecord, Scenario, KnowledgeBundle), and known
+  contract gaps tracked as follow-ups.
+- **`docs/auto-research-loop-end-to-end.md`** — the runnable composition
+  pattern with the explicit invariants every iteration must preserve
+  (canonical RunRecord with scenarioId, capture wired by construction,
+  stable comparator, deterministic mutator).
+### Added (RL primitives)
+0.22 made eval rigorous and integrated; 0.23 closes the loop back to RL training. The package now ships the canonical primitives a working RL-on-LLM-agents team needs — verifiable rewards, preference extraction, off-policy evaluation, process reward scaffolding, contamination probing, Bradley-Terry / Elo tournaments, adversarial scenario search, and test-time compute scaling — all designed to consume the standardised `RunRecord` artifact 0.22 produced. The auto-research loop is now coherent end-to-end.
+#### RL barrel — `@tangle-network/agent-eval/rl` (new subpath)
+A single subpath for every RL-shaped primitive, importable as a unit. The 9 modules:
+1. **`run-record-adapters.ts`** — convert `TrialResult[]` (from `runPromptEvolution` / `runMultiShotOptimization`), `VerificationReport` (from `MultiLayerVerifier`), and `VariantAggregate` into canonical `RunRecord[]`. Closes the integration gap between the pre-0.22 optimization stack and the post-0.22 campaign artifact. Existing optimization runs become `replayCache`-able and `rubricPredictiveValidity`-scorable for free.
+2. **`verifiable-reward.ts`** — extract a clean `VerifiableReward` from `VerificationReport` or `RunRecord`. Distinguishes `'deterministic'` (compile, test, schema, sandbox) from `'probabilistic'` (judge) reward sources. The seam every credible 2025-2026 frontier RL result on coding agents leans on (DeepSeek-R1 GRPO on test pass-rate, AlphaProof on Lean kernel checking).
+3. **`preferences.ts`** — `extractPreferences(runRecords)` produces DPO/PPO/KTO-shape `(chosen, rejected)` triples with three documented strategies (`paired-by-scenario-and-seed`, `paired-by-scenario`, `top-vs-bottom`). Bridge from campaign artifact to RL training. Includes `toTRLFormat` and `toAnthropicFormat` adapters.
+4. **`off-policy.ts`** — IPS, SNIPS, doubly-robust off-policy estimators (Dudík–Langford–Li 2011 for DR, Owen 2013 for SNIPS SE). Caller supplies behavior + target propensity scores (typically from token log-probs). All three return matched-shape `OffPolicyEstimate` with effective-sample-size and max-importance-weight diagnostics. `offPolicyEstimateAll` runs all three side-by-side — agreement across estimators is a much stronger signal than any one alone.
+5. **`process-reward.ts`** — step-level credit assignment from trace spans. `extractStepRewards(store, runId, scorers)` produces `StepReward[]`; `prmTrainingPairs(stepRewardsByRun)` produces `(prefix, chosen_step, rejected_step)` triples in the canonical Lightman et al. / DeepSeek-R1 process supervision shape. We ship the data extraction, not the trainer — gradient descent over a transformer is out of scope for a TS package.
+6. **`contamination.ts`** — held-out perturbation contamination probe. `runContaminationProbe({ originals, perturbation, scoreFn })` runs the policy against original + perturbed scenarios, computes paired Wilcoxon on the deltas, and flags suspected contamination when median drop ≥ 5pp at p < 0.05. Stock perturbations: `renameVariables`, `shuffleOrder`, `injectIrrelevantClause`. Catches the SWE-Bench → SWE-Bench-Verified failure mode upstream.
+7. **`tournament.ts`** — `fitBradleyTerry(outcomes)` uses Hunter's MM algorithm to recover candidate strengths from pairwise outcomes; `applyEloUpdate(ratings, outcome)` for online updates with FIDE-style K-factor. `buildPairwiseFromCampaign` extracts pairwise outcomes from per-scenario campaign runs. Sample-efficient ranking for many-candidate sweeps; the methodology Chatbot Arena and AlpacaEval converged on.
+8. **`adversarial.ts`** — `adversarialScenarioSearch({ seeds, mutations, scoreFn })` actively searches for inputs the policy fails on. Hill-climb-against-failure-indicator loop (the simplest version of AdA / POET / auto-jailbreak rigs). Caller supplies mutation strategies; the harness deduplicates, budgets, and reports per-generation statistics.
+9. **`compute-curves.ts`** — characterize a candidate as a *curve* across compute budgets, not a point. `runComputeCurve` produces `(cost, score)` points + log-slope. `bestOfN`, `selfConsistency` are the canonical test-time-scaling primitives (Snell et al. 2024). `paretoFrontier` removes dominated (candidate, compute) combinations. Required for honest cost-quality reporting in the o1-era.
+#### RL barrel — additional experimental modules
+The 9 modules above are stable and tested. The following modules are also shipped under `@tangle-network/agent-eval/rl` as **experimental** — interfaces are reasonable but may evolve based on real production consumer feedback. Marked clearly in the barrel docstring; flagged here so consumers know the contract may shift.
+10. **`active-curriculum.ts`** — adaptive scenario allocation. `varianceBasedCurriculum` (Neyman 1934 optimal allocation: weight ∝ √variance + 1/√n for under-sampled-cell tie-break) and `thompsonCurriculum` (Beta-Bernoulli posterior + decision-threshold-weighted sampling) reallocate next-round budget toward cells whose outcome is uncertain.
+11. **`reward-hacking.ts`** — `detectRewardHacking({ runs, truthOf })` watches four signature signals (proxy-vs-truth divergence, distributional shift, reward disagreement between independent rewards, judge drift relative to deterministic reward) and returns a structured `'clean' | 'suspect' | 'gaming'` verdict with per-signal severity. Krakovna et al. + Skalse et al. 2022 + Kim et al. 2023 lineage.
+12. **`adaptation-eval.ts`** — `runAdaptationCurve` and `compareAdaptationCurves` for sample-efficient adaptation evaluation. The metric a foundation-model-based agent should be measured on isn't end-state performance but the curve of score vs k (k=0, 1, 2, 4, 8, 16 demonstrations). Returns area-under-curve summary + per-k bootstrap CIs.
+13. **`exporters.ts`** — trainer-format export functions. `toDpoRows` (HuggingFace TRL DPO/IPO/KTO format), `toGrpoRows` (offline GRPO `{prompt, completions[], rewards[]}`), `toSftRows` (TRL/prime-rl SFT messages list), `toPrmRows` (Lightman-style PRM training shape), `stepRewardsToJsonl` (step-level rewards for value-function regression). **Honest scope:** `toSftRows` is the only one that maps directly onto a prime-rl entrypoint; the others target TRL or custom trainers — see `examples/fine-tune-with-prime-rl/README.md` for the explicit fit table.
+14. **`rl-campaign.ts`** — `runRLCampaign(opts)` wraps `runEvalCampaign` and runs the full RL bridge (verifiable rewards + preferences + sequential interim verdict + reward-hacking + optional predictive validity + optional trainer export) in one call. The single top-level orchestrator the pre-0.23 audit panel called out as missing.
+15. **`auto-research.ts`** — `analyzeOptimizationResult({ result, ctx, comparator })` takes a `PromptEvolutionResult` or `MultiShotOptimizationResult` (the existing GEPA/AxRLM stack outputs) and runs the same RL bridge on top, producing a unified artifact. Closes the architectural fragmentation between the optimization primitives and the RL bridge.
+16. **`predictive-validity-researcher.ts`** — `PredictiveValidityResearcher` is a concrete `Researcher` interface implementation (the interface had been a placeholder + `NoopResearcher` until now). Drives steering changes from outcome-anchored predictive validity: rubrics that don't predict deployment outcomes get down-weighted; load-bearing rubrics get up-weighted.
+17. **`run-record.ts`** — `RunRecord.scenarioId` is now an optional canonical field (was previously inferred from `outcome.raw.scenario_id`). Populated automatically by `runEvalCampaign` and the optimization adapters; legacy `RunRecord[]` arrays without it fall back to the `outcome.raw.scenario_id` convention. Closes the fragility called out by the 0.23 audit.
+#### Build / surface
+- New build entry: `dist/rl.{js,d.ts}` exposed via the `@tangle-network/agent-eval/rl` package subpath.
+- All RL primitives also re-exported from the root barrel for ergonomic single-import use.
+- Default `BradleyTerry` smoothing raised from 0 to 0.1 — Hunter's MM degenerates when a candidate has zero wins; 0.1 keeps the iteration well-conditioned without meaningfully biasing real win counts.
+### Why
+The previous release shipped EvalCampaign + replay + sequential + outcome calibration as parallel infrastructure to the existing optimization primitives. That left a real gap: `runMultiShotOptimization` and `runPromptEvolution` produced their own trial shapes that didn't compose with the new artifacts. 0.23 closes that gap with the adapter layer, and ships the eight downstream primitives that turn the unified artifact into RL training data, OPE estimates, contamination probes, tournament rankings, adversarial scenarios, and compute curves.
+After 0.23, the auto-research loop is coherent end-to-end:
+```
+mutate (existing primitives)
+  → trial outcomes (TrialResult)
+  → adapter (run-record-adapters)
+  → RunRecord[] (canonical artifact)
+  → preferences / verifiable rewards / OPE / step rewards
+  → policy update (consumer's choice of TRL / GRPO / PPO / DPO)
+  → next sweep
+```
+### References
+- Dudík, M., Langford, J., Li, L. (2011). Doubly Robust Policy Evaluation and Learning. *ICML*.
+- Owen, A. B. (2013). *Monte Carlo Theory, Methods and Examples*. Ch. 9 — Importance Sampling.
+- Hunter, D. R. (2004). MM algorithms for generalized Bradley-Terry models. *Annals of Statistics*, 32(1), 384–406.
+- Bradley, R. A., Terry, M. E. (1952). Rank analysis of incomplete block designs. *Biometrika*, 39(3/4).
+- Lightman, H. et al. (2023). Let's Verify Step by Step. *arXiv:2305.20050*.
+- Snell, C. et al. (2024). Scaling LLM Test-Time Compute Optimally. *arXiv:2408.03314*.
+- Plus the foundational citations from 0.21 / 0.22.
+### Migration
+All 0.23 primitives are additive. Existing consumers don't need to change. Recommended adoption sequence:
+1. Add `trialsToRunRecords(trials, ctx)` after every existing optimization sweep — every old run becomes replay-able and predictive-validity-scorable for free.
+2. Wire `extractVerifiableReward` into your scoring pipeline; route deterministic and probabilistic rewards into separate training batches.
+3. Use `extractPreferences` to produce DPO/PPO triples for any RL training the consumer runs.
+4. Run `rubricPredictiveValidity` quarterly + `runContaminationProbe` per release to keep the rubric weights honest.
+5. Replace fixed-comparator HeldOutGate with `fitBradleyTerry` once you have ≥ 5 candidates running on shared scenarios.
+6. Replace single-budget evaluation with `runComputeCurve` for any candidate where compute scaling is a question.
+### Caveats and out-of-scope
+- The DR estimator's Q-function is caller-supplied. We don't ship a learned Q-function trainer — that's a regression problem with too many domain-specific choices to ship a default.
+- PRM training itself (gradient descent over a transformer) is out of scope; we ship the data extraction shape.
+- The contamination probe's per-scenario q-values use a heuristic pseudo-p (the load-bearing test is the global Wilcoxon).
+- `prmTrainingPairs` matches trajectories by step name + kind; production use should replace this with a token-level prefix hash.
+- Adversarial scenario search is a simple hill-climb; novel scenario synthesis (compositional, language-model-driven) is future work.
+## 0.22.0 — EvalCampaign + replay + always-valid + outcome calibration
+0.21 shipped the four capture-integrity primitives as opt-in. Every consumer still had to wire them by hand, and the bug class blueprint-agent reported (forgotten wiring → silent partial-capture) reappears the moment a new consumer adopts agent-eval cold. **0.22 makes the right thing the default path** — and adds three primitives that compound on top of standardized capture: replay-from-raw-events, anytime-valid sequential evaluation, and rubric predictive validity. Together, the four new primitives — `runEvalCampaign` plus these three — turn agent-eval from a TS framework into research-grade evaluation infrastructure.
+### Added
+#### `runEvalCampaign` — capture integrity by construction
+Opinionated matrix runner that wires the four directives by construction. Inputs: variants, scenarios, seeds, an `LlmClientOptions`, factories for `TraceStore` and `RawProviderSink`, and a `runner(ctx)` callback. Outputs: per-cell `RunRecord[]`, `RunIntegrityReport[]`, optional `researchReport`, and a campaign fingerprint.
+- **Preflight:** `assertLlmRoute` is called once before any work, with `{ requireExplicitBaseUrl: true, requireAuth: true }` defaults. Misconfigured routes never burn a run.
+- **Per run:** the campaign constructs the `TraceStore`, `RawProviderSink`, and `TraceEmitter` (with `onRunComplete` hooks attached), then hands the runner an `LlmClientOptions` already pre-wired with `rawSink` + `traceContext`. The runner cannot accidentally call an LLM without capture.
+- **Run-completion:** `assertRunCaptured` runs after every `endRun` with `{ llmSpansMin: 1, requireRawCoverageOfLlmSpans: true, requireOutcome: true }` defaults. Failures are routed via `onIntegrityFailure: 'throw' | 'mark_failed' | 'log'` (default `'mark_failed'`).
+- **End of campaign:** if `report.comparator` is set, computes `researchReport` over the collected `RunRecord`s and embeds the campaign fingerprint + `preregistrationHash`.
+- **Concurrency:** local async worker pool, default 1, configurable via `concurrency`.
+- **Determinism:** the default `runId` generator is a stable hash of `(campaignId, variantId, scenarioId, seed)`, so re-running the same campaign produces the same ids; override `runId` for non-deterministic generation.
+Exported from the root barrel and the `@tangle-network/agent-eval/optimization` subpath: `runEvalCampaign`, `CampaignRunner`, `CampaignRunContext`, `CampaignRunOutcome`, `CampaignVariant`, `CampaignScenario`, `EvalCampaignOptions`, `EvalCampaignResult`, `FailedRun`, `CampaignIntegrityPolicy`, `CampaignFactoryParams`.
+#### Replay-from-raw-events
+Every campaign run is now a re-runnable artifact. `ReplayCache.fromSink(sink)` turns a populated `RawProviderSink` into a deterministic `(canonicalised request → cached response)` map; `createReplayFetch(cache)` returns a `fetch`-shaped function that satisfies `/chat/completions` calls out of the cache and passes other URLs through.
+```ts
+const cache = await ReplayCache.fromSink(yesterdayRawSink)
+const replayFetch = createReplayFetch(cache, { onMiss: 'fail-closed' })
+await callLlm(req, { ...llmOpts, fetch: replayFetch }) // zero LLM cost
+```
+Use cases:
+- Post-hoc judging — apply a new judge or scorer to last week's runs without burning a single token.
+- Determinism audits — replay a campaign and verify the responses match byte-for-byte.
+- Free judge calibration — run two judges on identical responses and measure agreement.
+`onMiss` is `'throw' | 'fallback' | 'fail-closed'`. The cache hashes a canonical projection (`model + messages + temperature + max_tokens|max_completion_tokens + response_format`) so insertion-order quirks don't cause spurious misses.
+Exported from root and `@tangle-network/agent-eval/traces`: `ReplayCache`, `createReplayFetch`, `iterateRawCalls`, `ReplayCacheEntry`, `ReplayCacheStats`, `ReplayFetchOptions`, `ReplayCacheMissError`.
+#### Always-valid sequential evaluation
+`pairedEvalueSequence(deltas, opts)` and `evaluateInterimReleaseConfidence({ deltaSeries })` ship the predictable plug-in betting martingale of Waudby-Smith & Ramdas (2024) for paired bounded outcomes, plus the empirical Bernstein confidence sequence of Howard et al. (2021) for the running mean. Both are *anytime-valid* — type-I error is bounded by α at every stopping time, no peeking penalty.
+```ts
+const verdict = evaluateInterimReleaseConfidence({
+  deltaSeries: [{ candidateId: 'cand', deltas }],
+  alpha: 0.05,
+  rope: { low: -0.02, high: 0.02 },
+})
+// → { recommendation: { decision: 'promote_now' | 'continue' | 'reject_now' | 'equivalent', candidateId } }
+```
+This closes the methodological hole flagged in the 0.21 methodology doc as out of scope. Consumers running rolling campaigns can now ship the moment evidence is decisive, stop early on dead-on-arrival variants, and accumulate evidence across partial runs without spending the FDR budget. Tested under the null at α=0.05 on 100 synthetic series; the false-rejection rate stays below the bound.
+Exported from root and `@tangle-network/agent-eval/reporting`: `pairedEvalueSequence`, `evaluateInterimReleaseConfidence`, `PairedEvalueOptions`, `PairedEvalueSequence`, `PairedEvalueStep`, `InterimReleaseConfidence`, `InterimReleaseConfidenceInput`, `SequentialDecision`.
+#### Rubric predictive validity
+`rubricPredictiveValidity({ runs, outcomes, outcomeMetrics })` joins canonical campaign `RunRecord`s to a `DeploymentOutcomeStore` and reports per-rubric Pearson + Spearman + bootstrap CI against each outcome metric. Verdict bucketing: `'load_bearing' | 'informative' | 'decorative'` based on `|spearman|`. **Without this loop every rubric is faith-based;** with it, you know which rubrics earn their promotion power and which are decoration.
+```ts
+const validity = await rubricPredictiveValidity({
+  runs: lastQuarterRuns,
+  outcomes: shipFlagOutcomeStore,
+  outcomeMetrics: ['revenue_lift', 'retention_30d', 'csat'],
+})
+for (const r of validity.ranked) {
+  console.log(`${r.rubric} → ${r.bestOutcome}: ρ=${r.spearman.toFixed(2)} (${r.verdict})`)
+}
+```
+Builds on the existing `correlationStudy` primitive but works directly off `RunRecord` (the canonical campaign artifact) rather than `Run` from a `TraceStore`, so it composes cleanly with `runEvalCampaign`'s output. Returns a per-rubric ranking + every (rubric, outcome) pair tested + a list of rubrics that produced no usable data.
+Exported from root and `@tangle-network/agent-eval/reporting`: `rubricPredictiveValidity`, `RubricOutcomePair`, `RubricRanking`, `RubricPredictiveValidityInput`, `RubricPredictiveValidityReport`. The existing `correlationStudy`, `OutcomeStore`, `InMemoryOutcomeStore`, `FileSystemOutcomeStore` continue to work unchanged.
+#### `NoopRawProviderSink.list()` returns `[]`
+Explicit opt-out from capture is no longer flagged by `assertRunCaptured` as `no_raw_sink`. Opt-out remains a deliberate choice; the campaign still requires the matching integrity overrides.
+### Why
+Every consumer that adopted agent-eval before 0.22 wrote their own matrix runner, and every one of them re-introduced the same forgettable wiring (raw sink, route guard, integrity assertion, analyst hook). 0.21 documented the pattern; 0.22 owns it. The four new primitives compound:
+- `runEvalCampaign` standardises the artifact (`RunRecord` + raw events + fingerprint).
+- Replay turns every past run into free training/validation data for new judges.
+- Sequential evaluation makes "ship-when-evidence-says-so" mathematically defensible.
+- Predictive validity converts evals from belief-based to outcome-anchored.
+`runMultiShotOptimization` remains the right primitive for trajectory-shaped GEPA optimization sweeps; `runPromptEvolution` for prompt + code evolution loops with sandbox pools; `runEvalCampaign` for the "compare N variants on M scenarios with K seeds and tell me which to ship" case that makes up the bulk of consumer evals.
+### References
+- Howard, S. R., Ramdas, A., McAuliffe, J., Sekhon, J. (2021). Time-uniform, nonparametric, nonasymptotic confidence sequences. *Annals of Statistics*, 49(2), 1055–1080.
+- Waudby-Smith, I., Ramdas, A. (2024). Estimating means of bounded random variables by betting. *JRSS B*, 86(1), 1–27.
+### Migration
+Existing consumers do not need to change. All four primitives are additive. Recommended path: on the next eval-runner refactor, replace hand-rolled matrix loops with `runEvalCampaign`. Use `evaluateInterimReleaseConfidence` for any campaign you run on a recurring cadence. Wire `rubricPredictiveValidity` once you have ≥ 30 deployment outcomes joinable by `runId`. Replay is a free win — once campaigns are running, every eval R&D loop drops to CPU-bound.
 ## 0.21.0 — capture integrity + launch-grade reporting
 This release closes the layer-1 gap a downstream consumer surfaced: better
@@ -74,7 +307,9 @@ surface.
 ### Python client
-Locked at `tangle-agent-eval==0.21.0` to match the npm package.
+The PyPI distribution was renamed from `tangle-agent-eval` to **`agent-eval-rpc`**, and the import path changed from `tangle_agent_eval` to `agent_eval_rpc`. The new name accurately describes the package — it is a thin RPC client over the Node runtime, not a Python re-implementation of the eval logic — and the npm scope (`@tangle-network/agent-eval`) already provides the namespacing the `tangle-` prefix was substituting for. No prior PyPI version ever shipped under the old name (Trusted Publisher misconfiguration; see issue #40), so this rename is a clean first publish rather than a migration.
+Locked at `agent-eval-rpc==0.21.0` to match the npm package.
 ## 0.20.10 — hardening audit follow-up

package/README.md CHANGED Viewed

@@ -96,9 +96,10 @@ import { renderReleaseReport } from '@tangle-network/agent-eval/reporting'
 | Subpath | Use for |
 | --- | --- |
 | `@tangle-network/agent-eval/control` | `observe -> validate -> decide -> act`, action policy, propose/review loops |
-| `@tangle-network/agent-eval/traces` | trace stores, emitters, TraceAnalyst |
-| `@tangle-network/agent-eval/optimization` | feedback trajectories, multi-shot optimization, prompt evolution |
-| `@tangle-network/agent-eval/reporting` | release confidence, paired stats, report/table/chart specs |
+| `@tangle-network/agent-eval/traces` | trace stores, emitters, TraceAnalyst, replay |
+| `@tangle-network/agent-eval/optimization` | feedback trajectories, multi-shot optimization, prompt evolution, EvalCampaign |
+| `@tangle-network/agent-eval/reporting` | release confidence, paired stats, sequential e-values, report/table/chart specs, predictive validity |
+| `@tangle-network/agent-eval/rl` | RL bridge: adapters, verifiable rewards, preferences, OPE, PRM, contamination, tournaments, adversarial, compute curves |
 | `@tangle-network/agent-eval/wire` | HTTP/RPC judge server and schemas |
 | `@tangle-network/agent-eval/benchmarks` | benchmark adapter contracts and reference wrappers |
@@ -116,6 +117,19 @@ import { renderReleaseReport } from '@tangle-network/agent-eval/reporting'
 | Fail loud if an eval would silently use the wrong route | `assertLlmRoute` |
 | Assert at run-end that the artifact is complete | `assertRunCaptured`, `throwIfRunIncomplete` |
 | Auto-execute the trace analyst on every run | `traceAnalystOnRunComplete` + `TraceEmitterOptions.onRunComplete` |
+| Run a matrix of variants × scenarios × seeds with capture integrity by construction | `runEvalCampaign` |
+| Re-judge / determinism-audit a past campaign for free | `ReplayCache`, `createReplayFetch` |
+| Ship-when-decisive with anytime-valid α across rolling looks | `pairedEvalueSequence`, `evaluateInterimReleaseConfidence` |
+| Tell load-bearing rubrics from decorative ones using deployment outcomes | `rubricPredictiveValidity` |
+| Bridge legacy optimization output to canonical `RunRecord[]` | `trialsToRunRecords`, `verificationReportToRunRecord` |
+| Extract a clean reward signal for RL training (compile/test/schema vs judge) | `extractVerifiableReward`, `filterDeterministicallyRewarded` |
+| Produce DPO / PPO / KTO `(chosen, rejected)` triples | `extractPreferences` |
+| Estimate a new policy's value on old trajectories without re-running | `offPolicyEstimateAll` (IPS + SNIPS + DR) |
+| Step-level credit assignment / PRM training data | `extractStepRewards`, `prmTrainingPairs` |
+| Detect benchmark contamination via held-out perturbations | `runContaminationProbe` |
+| Pairwise tournament ratings for many-candidate sweeps | `fitBradleyTerry`, `applyEloUpdate` |
+| Active search for inputs the policy fails on | `adversarialScenarioSearch` |
+| Characterise a candidate across compute budgets | `runComputeCurve`, `bestOfN`, `selfConsistency`, `paretoFrontier` |
 | Model missing context separately from bad reasoning | `KnowledgeRequirement`, `KnowledgeBundle` |
 ### Capture integrity (0.21+)

package/dist/benchmarks/index.d.ts CHANGED Viewed

@@ -1,2 +1,2 @@
-export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as deterministicSplit, e as routing } from '../index-c5saLbKD.js';
-import '../run-record-CX_jcAyr.js';
+export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as deterministicSplit, e as routing } from '../index-DDTlbHEK.js';
+import '../run-record-DNiOMBrZ.js';

package/dist/{chunk-WOK2RTWG.js → chunk-4W4NCYM2.js} RENAMED Viewed

@@ -1,3 +1,8 @@
+import {
+  canonicalize,
+  hashJson
+} from "./chunk-6M774GY6.js";
 // src/trace/store.ts
 var InMemoryTraceStore = class {
   runs = /* @__PURE__ */ new Map();
@@ -497,119 +502,138 @@ function runToTraceId(run) {
   return cleaned.slice(0, 32).padEnd(32, "0");
 }
-// src/trace/integrity.ts
-var RunIntegrityError = class extends Error {
-  constructor(report) {
-    super(
-      `Run ${report.runId} failed integrity check: ${report.issues.map((i) => i.code).join(", ")}`
-    );
-    this.report = report;
-    this.name = "RunIntegrityError";
-  }
-  report;
+// src/replay.ts
+var ReplayCacheMissError = class extends Error {
+  constructor(url, requestKey2, message) {
+    super(message ?? `replay cache miss for ${url} (key=${requestKey2})`);
+    this.url = url;
+    this.requestKey = requestKey2;
+    this.name = "ReplayCacheMissError";
+  }
+  url;
+  requestKey;
 };
-async function assertRunCaptured(store, runId, expectations = {}) {
-  const issues = [];
-  const run = await store.getRun(runId);
-  if (!run) {
-    return {
-      ok: false,
-      runId,
-      llmSpanCount: 0,
-      judgeSpanCount: 0,
-      toolSpanCount: 0,
-      rawProviderEventCount: 0,
-      rawSpanCoverage: { covered: 0, total: 0 },
-      issues: [{ code: "no_run", message: `Run ${runId} not found in store.` }]
-    };
+var ReplayCache = class _ReplayCache {
+  byKey = /* @__PURE__ */ new Map();
+  orphans = 0;
+  byProvider = {};
+  byModel = {};
+  /**
+   * Build a cache from a sink's events. The sink must implement `list()`.
+   * Filter by `runId` / `spanId` to scope to a specific replay.
+   */
+  static async fromSink(sink, filter = {}) {
+    if (!sink.list) {
+      throw new Error("ReplayCache.fromSink: sink must implement list() to be replayable.");
+    }
+    const events = await sink.list(filter);
+    return _ReplayCache.fromEvents(events);
+  }
+  /** Build a cache from an in-memory event list. */
+  static async fromEvents(events) {
+    const cache = new _ReplayCache();
+    const groups = /* @__PURE__ */ new Map();
+    for (const e of events) {
+      const k = `${e.runId ?? ""}::${e.spanId ?? ""}::${e.attemptIndex}`;
+      const g = groups.get(k) ?? {};
+      if (e.direction === "request") g.req = e;
+      else g.res = e;
+      groups.set(k, g);
+    }
+    for (const g of groups.values()) {
+      if (!g.req) continue;
+      if (!g.res) {
+        cache.orphans += 1;
+        continue;
+      }
+      const key = await requestKey(g.req);
+      cache.byKey.set(key, { request: g.req, response: g.res });
+      cache.byProvider[g.req.provider] = (cache.byProvider[g.req.provider] ?? 0) + 1;
+      cache.byModel[g.req.model] = (cache.byModel[g.req.model] ?? 0) + 1;
+    }
+    return cache;
   }
-  const spans = await store.spans({ runId });
-  const llmSpans2 = spans.filter((s) => s.kind === "llm");
-  const judgeSpans2 = spans.filter((s) => s.kind === "judge");
-  const toolSpans2 = spans.filter((s) => s.kind === "tool");
-  const llmMin = expectations.llmSpansMin ?? 0;
-  const judgeMin = expectations.judgeSpansMin ?? 0;
-  const toolMin = expectations.toolSpansMin ?? 0;
-  if (llmSpans2.length < llmMin) {
-    issues.push({
-      code: "missing_llm_spans",
-      message: `Expected \u2265 ${llmMin} LLM spans, found ${llmSpans2.length}.`,
-      detail: { expected: llmMin, found: llmSpans2.length }
-    });
+  /** Number of cacheable (request, response) pairs in the cache. */
+  size() {
+    return this.byKey.size;
   }
-  if (judgeSpans2.length < judgeMin) {
-    issues.push({
-      code: "missing_judge_spans",
-      message: `Expected \u2265 ${judgeMin} judge spans, found ${judgeSpans2.length}.`,
-      detail: { expected: judgeMin, found: judgeSpans2.length }
-    });
+  stats() {
+    return {
+      total: this.byKey.size,
+      byProvider: { ...this.byProvider },
+      byModel: { ...this.byModel },
+      orphanRequests: this.orphans
+    };
   }
-  if (toolSpans2.length < toolMin) {
-    issues.push({
-      code: "missing_tool_spans",
-      message: `Expected \u2265 ${toolMin} tool spans, found ${toolSpans2.length}.`,
-      detail: { expected: toolMin, found: toolSpans2.length }
-    });
+  /**
+   * Look up a cached response by hashing the (model, messages, temperature,
+   * maxTokens, response_format) shape. Returns `undefined` on miss; the
+   * caller decides whether to throw, fall back to the network, or skip.
+   */
+  async lookup(requestBody) {
+    const key = await keyFromBody(requestBody);
+    return this.byKey.get(key);
   }
-  let rawEventCount = 0;
-  let coverage = { covered: 0, total: llmSpans2.length };
-  if (expectations.rawSink) {
-    if (!expectations.rawSink.list) {
-      issues.push({
-        code: "no_raw_sink",
-        message: "Provided rawSink does not implement list(); cannot verify capture."
-      });
-    } else {
-      const events = await expectations.rawSink.list({ runId });
-      rawEventCount = events.length;
-      const rawMin = expectations.rawProviderEventsMin ?? 1;
-      if (rawEventCount < rawMin) {
-        issues.push({
-          code: "missing_raw_events",
-          message: `Expected \u2265 ${rawMin} raw provider events, found ${rawEventCount}.`,
-          detail: { expected: rawMin, found: rawEventCount }
-        });
-      }
-      if (expectations.requireRawCoverageOfLlmSpans) {
-        const requestEventsBySpan = new Set(
-          events.filter((e) => e.direction === "request" && e.spanId).map((e) => e.spanId)
-        );
-        const orphaned = llmSpans2.filter((s) => !requestEventsBySpan.has(s.spanId));
-        coverage = { covered: llmSpans2.length - orphaned.length, total: llmSpans2.length };
-        if (orphaned.length > 0) {
-          issues.push({
-            code: "orphan_llm_span",
-            message: `${orphaned.length} LLM span(s) have no matching raw provider request event.`,
-            detail: { orphanedSpanIds: orphaned.map((s) => s.spanId) }
-          });
-        }
+};
+function createReplayFetch(cache, opts = {}) {
+  const onMiss = opts.onMiss ?? "throw";
+  const fallback = opts.fallbackFetch ?? globalThis.fetch?.bind(globalThis);
+  return (async (input, init) => {
+    const url = typeof input === "string" ? input : input instanceof URL ? input.toString() : input.url;
+    if (!/\/chat\/completions(?:[?#].*)?$/.test(url)) {
+      if (!fallback) throw new Error(`replay fetch: non-completions URL ${url} but no fallbackFetch configured`);
+      return fallback(input, init);
+    }
+    let bodyParsed;
+    if (init?.body && typeof init.body === "string") {
+      try {
+        bodyParsed = JSON.parse(init.body);
+      } catch {
       }
     }
-  } else if (expectations.requireRawCoverageOfLlmSpans || expectations.rawProviderEventsMin) {
-    issues.push({
-      code: "no_raw_sink",
-      message: "Raw coverage required but no rawSink supplied to the integrity check."
-    });
-  }
-  if (expectations.requireOutcome && (run.outcome === void 0 || run.outcome === null)) {
-    issues.push({
-      code: "missing_outcome",
-      message: `Run ${runId} has no outcome recorded.`
-    });
-  }
-  return {
-    ok: issues.length === 0,
-    runId,
-    llmSpanCount: llmSpans2.length,
-    judgeSpanCount: judgeSpans2.length,
-    toolSpanCount: toolSpans2.length,
-    rawProviderEventCount: rawEventCount,
-    rawSpanCoverage: coverage,
-    issues
-  };
+    const hit = bodyParsed === void 0 ? void 0 : await cache.lookup(bodyParsed);
+    if (hit) {
+      opts.onHit?.({ url, provider: hit.request.provider, model: hit.request.model });
+      const status = hit.response.statusCode ?? 200;
+      const headers = new Headers(Object.entries(hit.response.responseHeaders ?? { "Content-Type": "application/json" }));
+      const bodyText = typeof hit.response.responseBody === "string" ? hit.response.responseBody : JSON.stringify(hit.response.responseBody ?? {});
+      return new Response(bodyText, { status, headers });
+    }
+    opts.onMissNotify?.({ url, requestBody: bodyParsed });
+    if (onMiss === "throw") {
+      const key = bodyParsed === void 0 ? "<unparseable>" : await keyFromBody(bodyParsed);
+      throw new ReplayCacheMissError(url, key);
+    }
+    if (onMiss === "fail-closed") {
+      return new Response(JSON.stringify({ error: "replay_cache_miss" }), { status: 599 });
+    }
+    if (!fallback) throw new Error("replay fetch: onMiss=fallback but no fallbackFetch configured");
+    return fallback(input, init);
+  });
 }
-function throwIfRunIncomplete(report) {
-  if (!report.ok) throw new RunIntegrityError(report);
+async function* iterateRawCalls(sink, filter = {}) {
+  if (!sink.list) {
+    throw new Error("iterateRawCalls: sink must implement list().");
+  }
+  const events = await sink.list(filter);
+  const cache = await ReplayCache.fromEvents(events);
+  for (const entry of cache["byKey"].values()) yield entry;
+}
+async function requestKey(event) {
+  return keyFromBody(event.requestBody);
+}
+async function keyFromBody(body) {
+  if (body == null || typeof body !== "object") return hashJson({ raw: String(body) });
+  const b = body;
+  const reduced = canonicalize({
+    model: b.model ?? null,
+    messages: b.messages ?? null,
+    temperature: b.temperature ?? null,
+    max_tokens: b.max_tokens ?? null,
+    max_completion_tokens: b.max_completion_tokens ?? null,
+    response_format: b.response_format ?? null
+  });
+  return hashJson(reduced);
 }
 // src/trace-analyst/types.ts
@@ -1891,9 +1915,10 @@ export {
   redactValue,
   OTEL_AGENT_EVAL_SCOPE,
   exportRunAsOtlp,
-  RunIntegrityError,
-  assertRunCaptured,
-  throwIfRunIncomplete,
+  ReplayCacheMissError,
+  ReplayCache,
+  createReplayFetch,
+  iterateRawCalls,
   DEFAULT_TRACE_ANALYST_BUDGETS,
   TRACE_ANALYST_TRUNCATION_MARKER_PREFIX,
   OtlpFileTraceStore,
@@ -1917,4 +1942,4 @@ export {
   defaultTraceInsightPanel,
   buildTraceInsightPrompt
 };
-//# sourceMappingURL=chunk-WOK2RTWG.js.map
+//# sourceMappingURL=chunk-4W4NCYM2.js.map