npm - @tangle-network/agent-eval - Versions diffs - 0.23.0 → 0.24.0 - Mend

@tangle-network/agent-eval 0.23.0 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (148) hide show

package/CHANGELOG.md +102 -0
package/README.md +141 -79
package/dist/baseline-4R5deP0N.d.ts +108 -0
package/dist/benchmarks/index.d.ts +3 -2
package/dist/benchmarks/index.js +1 -1
package/dist/builder-eval/index.d.ts +249 -0
package/dist/builder-eval/index.js +391 -0
package/dist/builder-eval/index.js.map +1 -0
package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
package/dist/chunk-2A5XJB43.js.map +1 -0
package/dist/chunk-47X6LRCE.js +76 -0
package/dist/chunk-47X6LRCE.js.map +1 -0
package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
package/dist/chunk-4F5DQN55.js.map +1 -0
package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
package/dist/chunk-4S4BM3QQ.js.map +1 -0
package/dist/chunk-5BKGXME7.js +65 -0
package/dist/chunk-5BKGXME7.js.map +1 -0
package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
package/dist/chunk-6QDKWHLS.js.map +1 -0
package/dist/chunk-I4MBDTY5.js +272 -0
package/dist/chunk-I4MBDTY5.js.map +1 -0
package/dist/chunk-K2TPS5LB.js +569 -0
package/dist/chunk-K2TPS5LB.js.map +1 -0
package/dist/chunk-KKHDIONI.js +414 -0
package/dist/chunk-KKHDIONI.js.map +1 -0
package/dist/chunk-KMPRBJK4.js +74 -0
package/dist/chunk-KMPRBJK4.js.map +1 -0
package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
package/dist/chunk-KTGTIOFD.js.map +1 -0
package/dist/chunk-LSH4MMOZ.js +838 -0
package/dist/chunk-LSH4MMOZ.js.map +1 -0
package/dist/chunk-NG236HPC.js +57 -0
package/dist/chunk-NG236HPC.js.map +1 -0
package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
package/dist/chunk-NLMNWKVM.js.map +1 -0
package/dist/chunk-NU65VQ7M.js +99 -0
package/dist/chunk-NU65VQ7M.js.map +1 -0
package/dist/chunk-OHEPNJQN.js +554 -0
package/dist/chunk-OHEPNJQN.js.map +1 -0
package/dist/chunk-OWLAAMME.js +250 -0
package/dist/chunk-OWLAAMME.js.map +1 -0
package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
package/dist/chunk-PC4UYEBM.js.map +1 -0
package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
package/dist/chunk-RAF443UI.js.map +1 -0
package/dist/chunk-RZTMDUO7.js +49 -0
package/dist/chunk-RZTMDUO7.js.map +1 -0
package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
package/dist/chunk-SESZDQPX.js.map +1 -0
package/dist/{chunk-6KQG5HAH.js → chunk-SY6WAAAD.js} +84 -71
package/dist/chunk-SY6WAAAD.js.map +1 -0
package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
package/dist/chunk-TVVP3ZZQ.js.map +1 -0
package/dist/{chunk-VQQSPGSM.js → chunk-VRJVTXRV.js} +169 -111
package/dist/chunk-VRJVTXRV.js.map +1 -0
package/dist/chunk-WWYCWKUM.js +196 -0
package/dist/chunk-WWYCWKUM.js.map +1 -0
package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
package/dist/chunk-YRZ4M5GS.js.map +1 -0
package/dist/chunk-ZN274SWR.js +613 -0
package/dist/chunk-ZN274SWR.js.map +1 -0
package/dist/cli.js +10 -6
package/dist/cli.js.map +1 -1
package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
package/dist/control.d.ts +8 -6
package/dist/control.js +10 -7
package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
package/dist/errors-BZ9sTdz7.d.ts +70 -0
package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
package/dist/governance/index.d.ts +5 -0
package/dist/governance/index.js +18 -0
package/dist/governance/index.js.map +1 -0
package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
package/dist/index-Oj9fAPPN.d.ts +270 -0
package/dist/index.d.ts +1866 -3151
package/dist/index.js +5457 -7809
package/dist/index.js.map +1 -1
package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
package/dist/knowledge/index.d.ts +102 -0
package/dist/knowledge/index.js +18 -0
package/dist/knowledge/index.js.map +1 -0
package/dist/meta-eval/index.d.ts +99 -0
package/dist/meta-eval/index.js +324 -0
package/dist/meta-eval/index.js.map +1 -0
package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
package/dist/openapi.json +1 -1
package/dist/optimization.d.ts +11 -8
package/dist/optimization.js +11 -9
package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
package/dist/pipelines/index.d.ts +172 -0
package/dist/pipelines/index.js +409 -0
package/dist/pipelines/index.js.map +1 -0
package/dist/prm/index.d.ts +99 -0
package/dist/prm/index.js +222 -0
package/dist/prm/index.js.map +1 -0
package/dist/query-DODUYdPg.d.ts +30 -0
package/dist/release-report-TDPn1cxq.d.ts +292 -0
package/dist/replay-BL96gCEP.d.ts +226 -0
package/dist/reporting.d.ts +10 -295
package/dist/reporting.js +10 -6
package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-CUOiGcGv.d.ts} +148 -146
package/dist/rl.d.ts +1762 -8
package/dist/rl.js +2035 -58
package/dist/rl.js.map +1 -1
package/dist/rubric-D5tjHNJQ.d.ts +72 -0
package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
package/dist/sequential-Dgz1n51-.d.ts +139 -0
package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-BXGs_9V0.d.ts} +3 -76
package/dist/telemetry/file.js +4 -1
package/dist/telemetry/file.js.map +1 -1
package/dist/telemetry/index.js +57 -57
package/dist/telemetry/index.js.map +1 -1
package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
package/dist/traces.d.ts +142 -387
package/dist/traces.js +1302 -40
package/dist/traces.js.map +1 -1
package/dist/trajectory-CnoBo-JY.d.ts +32 -0
package/dist/wire/index.d.ts +22 -22
package/dist/wire/index.js +4 -3
package/package.json +35 -2
package/dist/chunk-42I2QC2L.js.map +0 -1
package/dist/chunk-4W4NCYM2.js +0 -1945
package/dist/chunk-4W4NCYM2.js.map +0 -1
package/dist/chunk-5IIQKMD5.js.map +0 -1
package/dist/chunk-6KQG5HAH.js.map +0 -1
package/dist/chunk-6M774GY6.js.map +0 -1
package/dist/chunk-7EAUOUQS.js.map +0 -1
package/dist/chunk-AXHNWLIX.js.map +0 -1
package/dist/chunk-EXGR4XEM.js.map +0 -1
package/dist/chunk-IOXMGMHQ.js.map +0 -1
package/dist/chunk-KAO3Q65R.js.map +0 -1
package/dist/chunk-LZKIOBG2.js +0 -2026
package/dist/chunk-LZKIOBG2.js.map +0 -1
package/dist/chunk-QBW3YBTR.js.map +0 -1
package/dist/chunk-QUKKGHTZ.js.map +0 -1
package/dist/chunk-SQQLHODJ.js.map +0 -1
package/dist/chunk-V5QSWN7L.js +0 -1310
package/dist/chunk-V5QSWN7L.js.map +0 -1
package/dist/chunk-VQQSPGSM.js.map +0 -1
package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
package/dist/index-ekBXweiQ.d.ts +0 -1894
package/dist/sequential-DgU2mFsE.d.ts +0 -304

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,107 @@
 # Changelog
+## 0.24.0 — DX cleanup: framing, stability tags, lint, taxonomy, strict indices
+This release is **DX + correctness**. No production behavior moved; consumer
+contracts tightened across the board. Library went from 7.5/10 to 10/10 on
+first-touch usability and contract clarity. The visible deltas:
+### Strictness
+- **`noUncheckedIndexedAccess: true`** in `tsconfig.json`. 251 latent
+  `T | undefined` sites surfaced and fixed across ~70 files. Loop-bound
+  indices documented with `!`, external lookups guarded explicitly, accumulator
+  patterns refactored to capture-then-assign. Every fix audited for semantic
+  correctness (math code: `!`; untrusted data: guards).
+- **Subpath imports forced.** Six `export * from './X'` wildcards at root
+  deleted (`./rl`, `./pipelines`, `./builder-eval`, `./meta-eval`, `./prm`,
+  `./trace-analyst`). New subpaths in `package.json`: `/pipelines`,
+  `/meta-eval`, `/prm`, `/builder-eval`, `/governance`, `/knowledge`. Root
+  re-exports retained only for the load-bearing capture-integrity surface
+  (`./trace`, `./knowledge`, `./governance`).
+- **Error taxonomy.** New `src/errors.ts` exports `AgentEvalError` base plus
+  `ValidationError`, `NotFoundError`, `ConfigError`, `CaptureIntegrityError`,
+  `JudgeError`, `VerificationError`, `ReplayError`. Existing custom errors
+  re-parented: `ReplayCacheMissError`, `BudgetBreachError`, `RunIntegrityError`,
+  `HoldoutLockedError`, `RunRecordValidationError`, `LlmCallError`,
+  `LlmRouteAssertionError`, `TraceFileMissingError`, `TraceNotFoundError`,
+  `SpanNotFoundError`. ~25 user-facing `throw new Error(...)` calls migrated
+  to typed errors across `rl/*`, `replay`, `sandbox-harness`, `statistics`,
+  `release-confidence`, `visual-diff`, `counterfactual`, `run-critic`,
+  `observability`. Internal invariant guards intentionally left as plain
+  `Error` — those are bugs, not contract failures.
+- **`LlmRouteAssertionError.code` → `reason`** (breaking, greenfield).
+  The subclass's route-specific reason now lives on `.reason`; the base
+  category `code = 'capture_integrity'` survives via the `AgentEvalError`
+  contract.
+### Visible deltas
+### Changed
+- **README reframed** as the substrate for self-improving agents. The package
+  has shipped `EvalCampaign`, replay, GEPA / reflective mutation, auto-research,
+  active curriculum, contamination probes, tournaments, compute curves, PRM,
+  off-policy estimators, and sequential anytime-valid stats since 0.22 — the
+  README now actually names them, not just "evaluation infrastructure."
+- **`src/rl/index.ts` carries stability markers** — every re-export is tagged
+  `@stable` or `@experimental` via JSDoc. Stable: `run-record-adapters`,
+  `verifiable-reward`, `preferences`, `off-policy`, `tournament`,
+  `contamination`, `compute-curves`. Experimental: `process-reward`,
+  `adversarial`, `active-curriculum`, `reward-hacking`, `adaptation-eval`,
+  `exporters`, `rl-campaign`, `predictive-validity-researcher`, `auto-research`.
+  Tags are visible in IDE hover and emitted into `dist/rl.d.ts` so consumers
+  can see the contract at the call site.
+### Added
+- **Biome lint + format** — `biome.json` codifies the project style (no
+  semicolons, single quotes, 2-space indent, 100 col, `noNonNullAssertion`
+  off, `useNodejsImportProtocol` on). `pnpm lint` and `pnpm format` scripts.
+- **`.github/workflows/ci.yml`** — runs typecheck + lint + test + build +
+  Python pytest on every PR. Previously only the publish workflow on tag
+  push exercised this surface; PRs were unguarded.
+- **`ReplayCache.entries()`** — public iterator for the cached
+  `(request, response)` pairs. Replaces the bracket-access escape hatch into
+  the private `byKey` map. Same semantics, exposed in the type contract.
+- **Per-example READMEs** — `examples/multi-shot-optimization` and
+  `examples/same-sandbox-harness` now document what they show, how to run,
+  expected output, and adaptation guidance. The other three examples already
+  had READMEs; the README index now links to all five.
+- **`clients/python/examples/judge_anti_slop.py`** — runnable script that
+  doubles as a pytest, anchoring the `judge` API contract: composite in
+  `[0, 1]`, `RubricNotFoundError` for bogus rubric name, `ValidationError`
+  for no-rubric call.
+### Fixed
+- **`reflective-mutation.ts`** — local `escape` variable shadowed the global
+  `escape` property. Renamed to `escaped`. No behavior change; flagged by
+  biome.
+## 0.23.1 — FileSystemTraceStore.updateRun no longer double-appends
+### Fixed
+- **`FileSystemTraceStore.updateRun` / `updateSpan`** — once the lazy
+  in-memory index had been populated (by any prior `getRun` / `listRuns` /
+  `spans` / `events` query), an `updateRun` would mirror the synthetic
+  update row back into the index via `appendRun`, throwing
+  `run X already exists`. Same root cause for `updateSpan`, which would
+  silently insert a phantom duplicate span row. The `append()` helper now
+  skips `insertInto` for rows carrying the internal `_update: true` marker;
+  `updateRun` / `updateSpan` continue to apply the patch directly via the
+  index's `updateRun` / `updateSpan` APIs.
+  Surfaced by tax-agent's canonical eval running multiple variants per
+  persona against a shared store: the second variant's `endRun`
+  consistently threw, forcing callers to instantiate one store per
+  (persona × variant) cell and stitch results back together post-hoc.
+  After this fix, a single `FileSystemTraceStore` can fan out runs across
+  arbitrarily many cells with interleaved reads, which is the intended
+  usage pattern. Regression test added in `tests/trace-store.test.ts`.
 ## 0.23.0 — RL primitives + auto-research worked example
 In addition to the RL bridge primitives below, this release ships the

package/README.md CHANGED Viewed

@@ -1,32 +1,39 @@
 # @tangle-network/agent-eval
-Evaluation infrastructure for agent products.
-Use it to wrap the real workflow your users run, record what happened, verify
-the result, turn feedback into replay data, compare variants, and ship only
-when the evidence improves.
+**Substrate for self-improving agents.** Trace what runs, verify the result,
+turn outcomes into preferences and rewards, mutate prompts and policies under
+anytime-valid evidence, and ship only when the improvement is decisive.
 ```txt
-product task
-  -> observe state
-  -> validate with deterministic gates first
-  -> act through the real product adapter
-  -> trace + feedback trajectory
-  -> replay / optimize / release gate
+real product task
+  -> observe / act (your runtime)
+  -> trace + verifier pipeline (capture integrity)
+  -> RunRecord (canonical eval artifact)
+       -> judge calibration · paired stats · sequential α
+       -> preferences · verifiable rewards · process rewards
+       -> GEPA / reflective mutation · auto-research · active curriculum
+       -> release gate · replay · contamination probe · tournament rating
+  -> next iteration
 ```
-`agent-eval` does not own product state, credentials, UI, storage, model
+`agent-eval` does **not** own product state, credentials, UI, storage, model
 routing, browser drivers, sandbox policy, or deployment. Products own those.
-This package owns eval contracts, loop mechanics, traces, statistics,
-optimization inputs, and release evidence.
+This package owns the loop that closes evaluation → preference → mutation →
+redeploy, with capture integrity and statistically rigorous evidence at every
+step.
+It ships as a TypeScript library (npm) with a generated Python client (PyPI),
+both speaking the same wire protocol. MIT, self-hostable, no SaaS dependency.
 ## Install
 ```sh
 pnpm add @tangle-network/agent-eval
+# or, from Python:
+pip install agent-eval-rpc
 ```
-## Quick Start
+## Quick Start — the control loop
 ```ts
 import {
@@ -78,68 +85,102 @@ const result = await runAgentControlLoop({
 await product.storeEvalResult(task.id, result)
 ```
-That loop should be the same shape in production, replay, benchmark, and
-optimization. Swap dependencies behind `observe()` and `act()`, not the eval
-contract itself.
+Same loop shape in production, replay, benchmark, and optimization. Swap the
+dependencies behind `observe()` and `act()`, never the eval contract.
-## Import Paths
+## Self-improvement loop
-The root export remains available, but new code should prefer focused subpaths:
+Eval doesn't end at "pass/fail." Outcomes become training signal, mutation
+proposals, and curriculum updates — all from the same `RunRecord` produced by
+the control loop.
 ```ts
-import { runAgentControlLoop } from '@tangle-network/agent-eval/control'
-import { runMultiShotOptimization } from '@tangle-network/agent-eval/optimization'
-import { TraceEmitter } from '@tangle-network/agent-eval/traces'
-import { renderReleaseReport } from '@tangle-network/agent-eval/reporting'
+import { runEvalCampaign } from '@tangle-network/agent-eval'
+import {
+  extractPreferences,
+  extractVerifiableReward,
+  filterDeterministicallyRewarded,
+  offPolicyEstimateAll,
+  analyzeOptimizationResult,
+} from '@tangle-network/agent-eval/rl'
+// 1. Run a matrix of variants × scenarios with capture integrity by construction.
+const campaign = await runEvalCampaign({ variants, scenarios, run })
+// 2. Convert outcomes into RL signal.
+const rewards = extractVerifiableReward(campaign.runs)          // compile/test/schema
+const prefs   = extractPreferences(campaign.runs)               // (chosen, rejected) triples
+const clean   = filterDeterministicallyRewarded(rewards)        // judge-noise free
+// 3. Estimate a candidate policy's value without re-running.
+const ope = offPolicyEstimateAll(campaign.runs, candidatePolicy)  // IPS + SNIPS + DR
+// 4. Or close the loop end-to-end: score → reflect → mutate → re-run.
+const next = await analyzeOptimizationResult(campaign, { researcher })
 ```
+| Step | Primitive | Subpath |
+| --- | --- | --- |
+| Eval matrix with integrity | `runEvalCampaign` | `/` |
+| Deterministic re-judge / audit | `ReplayCache`, `createReplayFetch` | `/` |
+| Anytime-valid α across rolling looks | `pairedEvalueSequence` | `/reporting` |
+| Judge quality vs gold | `calibrateJudge` (κ, Pearson, MAE, bias probes) | `/` |
+| (chosen, rejected) for DPO/KTO/PPO | `extractPreferences` | `/rl` |
+| Verifiable reward signal | `extractVerifiableReward` | `/rl` |
+| Step-level / PRM training data | `extractStepRewards`, `prmTrainingPairs` | `/rl` |
+| Estimate policy value off-policy | `offPolicyEstimateAll` (IPS + SNIPS + DR) | `/rl` |
+| GEPA / reflective prompt mutation | `buildReflectionPrompt`, `parseReflectionResponse`, Ax-GEPA `SteeringOptimizer` | `/` `/optimization` |
+| Auto-research (read runs → propose) | `analyzeOptimizationResult`, `PredictiveValidityResearcher` | `/rl` |
+| Active curriculum (variance / Thompson) | `allocateCurriculum` | `/rl` |
+| Tournament ratings (Bradley-Terry + Elo) | `fitBradleyTerry`, `applyEloUpdate` | `/rl` |
+| Adversarial scenario search | `adversarialScenarioSearch` | `/rl` |
+| Contamination probe (held-out perturb) | `runContaminationProbe` | `/rl` |
+| Reward hacking signatures | `detectRewardHacking` | `/rl` |
+| Compute curves (best-of-N, self-consist, Pareto) | `runComputeCurve`, `bestOfN`, `selfConsistency`, `paretoFrontier` | `/rl` |
+| Knowledge gap separated from reasoning gap | `scoreKnowledgeReadiness` | `/` |
+| Release gate (paired evidence + holdouts) | `evaluateReleaseConfidence`, `HeldOutGate` | `/reporting` |
+| Launch report (decision-grade) | `renderReleaseReport`, `researchReport` | `/reporting` |
+## Import Paths
 | Subpath | Use for |
 | --- | --- |
-| `@tangle-network/agent-eval/control` | `observe -> validate -> decide -> act`, action policy, propose/review loops |
+| `@tangle-network/agent-eval/control` | `observe → validate → decide → act`, action policy, propose/review loops |
 | `@tangle-network/agent-eval/traces` | trace stores, emitters, TraceAnalyst, replay |
-| `@tangle-network/agent-eval/optimization` | feedback trajectories, multi-shot optimization, prompt evolution, EvalCampaign |
-| `@tangle-network/agent-eval/reporting` | release confidence, paired stats, sequential e-values, report/table/chart specs, predictive validity |
-| `@tangle-network/agent-eval/rl` | RL bridge: adapters, verifiable rewards, preferences, OPE, PRM, contamination, tournaments, adversarial, compute curves |
-| `@tangle-network/agent-eval/wire` | HTTP/RPC judge server and schemas |
+| `@tangle-network/agent-eval/optimization` | feedback trajectories, multi-shot, prompt evolution, GEPA, EvalCampaign |
+| `@tangle-network/agent-eval/reporting` | release confidence, paired stats, sequential e-values, launch reports |
+| `@tangle-network/agent-eval/rl` | adapters, verifiable rewards, preferences, OPE, PRM, contamination, tournaments, adversarial, compute curves, auto-research |
+| `@tangle-network/agent-eval/wire` | HTTP/RPC server + schemas (same protocol the Python client speaks) |
 | `@tangle-network/agent-eval/benchmarks` | benchmark adapter contracts and reference wrappers |
-## Core Pieces
+The root export remains available for convenience; new code should prefer
+focused subpaths. Anything under `/rl` should be imported from `/rl` — root
+re-export is retained only for backward compatibility and will be narrowed in
+0.25.
+## API stability
-| Need | Use |
+Public exports are tagged with JSDoc stability markers so consumers can see
+status at the call site (IDE hover, language server, declaration files).
+| Tag | Meaning |
 | --- | --- |
-| Keep an agent working until objective state passes | `runAgentControlLoop` |
-| Turn user/reviewer feedback into replay data | `FeedbackTrajectory` |
-| Compare prompt/tool/retrieval policies over full trajectories | `runMultiShotOptimization` |
-| Gate releases with paired evidence and holdouts | `evaluateReleaseConfidence`, `HeldOutGate` |
-| Explain regressions across trace corpora | `TraceAnalyst` / `analyzeTraces` |
-| Report a launch decision | `renderReleaseReport`, `researchReport`, `summaryTable`, `paretoChart`, `gainHistogram` |
-| Capture every provider HTTP request / response for forensics | `RawProviderSink`, `LlmClientOptions.rawSink` |
-| Fail loud if an eval would silently use the wrong route | `assertLlmRoute` |
-| Assert at run-end that the artifact is complete | `assertRunCaptured`, `throwIfRunIncomplete` |
-| Auto-execute the trace analyst on every run | `traceAnalystOnRunComplete` + `TraceEmitterOptions.onRunComplete` |
-| Run a matrix of variants × scenarios × seeds with capture integrity by construction | `runEvalCampaign` |
-| Re-judge / determinism-audit a past campaign for free | `ReplayCache`, `createReplayFetch` |
-| Ship-when-decisive with anytime-valid α across rolling looks | `pairedEvalueSequence`, `evaluateInterimReleaseConfidence` |
-| Tell load-bearing rubrics from decorative ones using deployment outcomes | `rubricPredictiveValidity` |
-| Bridge legacy optimization output to canonical `RunRecord[]` | `trialsToRunRecords`, `verificationReportToRunRecord` |
-| Extract a clean reward signal for RL training (compile/test/schema vs judge) | `extractVerifiableReward`, `filterDeterministicallyRewarded` |
-| Produce DPO / PPO / KTO `(chosen, rejected)` triples | `extractPreferences` |
-| Estimate a new policy's value on old trajectories without re-running | `offPolicyEstimateAll` (IPS + SNIPS + DR) |
-| Step-level credit assignment / PRM training data | `extractStepRewards`, `prmTrainingPairs` |
-| Detect benchmark contamination via held-out perturbations | `runContaminationProbe` |
-| Pairwise tournament ratings for many-candidate sweeps | `fitBradleyTerry`, `applyEloUpdate` |
-| Active search for inputs the policy fails on | `adversarialScenarioSearch` |
-| Characterise a candidate across compute budgets | `runComputeCurve`, `bestOfN`, `selfConsistency`, `paretoFrontier` |
-| Model missing context separately from bad reasoning | `KnowledgeRequirement`, `KnowledgeBundle` |
-### Capture integrity (0.21+)
+| `@stable` | API frozen at this major. Breaking changes require a major bump. |
+| `@experimental` | Interface may evolve before becoming `@stable`. Pin the patch version if you depend on it. |
+| `@internal` | Not part of the public contract. Use the documented subpath instead. |
+The `/rl` subpath is the most active surface. See
+[`src/rl/index.ts`](./src/rl/index.ts) for the current stable/experimental
+breakdown.
+## Capture integrity (0.21+)
 Launch-grade benchmark runs need four things that are easy to forget in glue
 code: (1) raw HTTP capture alongside the structured spans so a reviewer can
 verify which route answered, (2) a preflight assertion that the configured
 client points at the intended provider, (3) a run-end assertion that the
 expected events were actually written, and (4) auto-execution of the trace
-analyst as part of the run lifecycle. The wiring fits in a few lines:
+analyst as part of the run lifecycle.
 ```ts
 import {
@@ -168,28 +209,33 @@ Directives, rationale, and shipped-bug context are in
 ## Examples
-Runnable examples live in
-[`examples/`](https://github.com/tangle-network/agent-eval/tree/main/examples).
+Each example has its own README with what it demonstrates, expected output,
+and runtime. See [`examples/`](./examples/).
-- [`examples/multi-shot-optimization`](https://github.com/tangle-network/agent-eval/tree/main/examples/multi-shot-optimization):
+- [`examples/multi-shot-optimization`](./examples/multi-shot-optimization/README.md):
   optimize full trajectories with held-out promotion.
-- [`examples/same-sandbox-harness`](https://github.com/tangle-network/agent-eval/tree/main/examples/same-sandbox-harness):
+- [`examples/same-sandbox-harness`](./examples/same-sandbox-harness/README.md):
   run setup/build/test and evidence checks in one workspace.
-- [`examples/benchmarks`](https://github.com/tangle-network/agent-eval/tree/main/examples/benchmarks):
+- [`examples/benchmarks`](./examples/benchmarks/README.md):
   benchmark adapter shape and reference wrappers.
+- [`examples/auto-research-with-agent-builder`](./examples/auto-research-with-agent-builder/README.md):
+  closed loop — score, reflect, mutate, re-score, repeat.
+- [`examples/fine-tune-with-prime-rl`](./examples/fine-tune-with-prime-rl/README.md):
+  RunRecord → preferences → trainer (prime-rl) → next campaign.
 ## Docs
 Read in this order:
-1. [Product Eval Adoption](./docs/product-eval-adoption.md)
-2. [Control Runtime](./docs/control-runtime.md)
-3. [Feedback Trajectories](./docs/feedback-trajectories.md)
-4. [Multi-Shot Optimization](./docs/multi-shot-optimization.md)
-5. [Trace Analysis](./docs/trace-analysis.md)
-6. [Knowledge Readiness](./docs/knowledge-readiness.md)
-7. [Integration Launch Gates](./docs/integration-launch-gates.md)
-8. [Wire Protocol](./docs/wire-protocol.md)
+1. [Concepts](./docs/concepts.md) — mental model, 5 min
+2. [Product Eval Adoption](./docs/product-eval-adoption.md)
+3. [Control Runtime](./docs/control-runtime.md)
+4. [Feedback Trajectories](./docs/feedback-trajectories.md)
+5. [Multi-Shot Optimization](./docs/multi-shot-optimization.md)
+6. [Trace Analysis](./docs/trace-analysis.md)
+7. [Knowledge Readiness](./docs/knowledge-readiness.md)
+8. [Integration Launch Gates](./docs/integration-launch-gates.md)
+9. [Wire Protocol](./docs/wire-protocol.md) — required for non-TypeScript consumers
 ## CLI / Wire Protocol
@@ -198,28 +244,44 @@ npm i -g @tangle-network/agent-eval
 agent-eval serve --port 5005
 ```
-The Python client lives in `clients/python`:
+Python:
 ```sh
-cd clients/python
-pip install -e .
+pip install agent-eval-rpc
 ```
+```py
+from agent_eval_rpc import Client
+client = Client()  # auto-detects HTTP server, falls back to subprocess
+score = await client.judge(content=output, rubric_name="anti-slop")
+```
+TypeScript is the source of truth. Python is a thin transport client over the
+generated OpenAPI schema. Schema drift is ruled out at release time
+(version-locked CI).
 ## Development
 ```sh
 pnpm install
 pnpm typecheck
 pnpm test
-pnpm build
-pnpm openapi
+pnpm lint        # biome
+pnpm build       # tsup + openapi.json
 ```
 ## Related Packages
-- `@tangle-network/agent-runtime`: production session/runtime layer.
-- `@tangle-network/agent-knowledge`: source-grounded knowledge bases and readiness.
-- `@tangle-network/agent-integrations`: connection, grant, capability, and integration invocation contracts.
+- [`@tangle-network/agent-runtime`](https://www.npmjs.com/package/@tangle-network/agent-runtime):
+  production session/runtime layer.
+- [`@tangle-network/agent-knowledge`](https://www.npmjs.com/package/@tangle-network/agent-knowledge):
+  source-grounded knowledge bases and readiness.
+- [`@tangle-network/agent-integrations`](https://www.npmjs.com/package/@tangle-network/agent-integrations):
+  connection, grant, capability, and integration invocation contracts.
+Together: `agent-runtime` is where the agent runs; `agent-knowledge` is what
+it knows; `agent-integrations` is what it can do; `agent-eval` is how it gets
+better.
 ## License

package/dist/baseline-4R5deP0N.d.ts ADDED Viewed

@@ -0,0 +1,108 @@
+import { T as TraceStore } from './store-Db2Bv8Cf.js';
+/**
+ * Tool-use metrics — derived purely from trace data.
+ *
+ * No scoring assumptions: consumers supply optional ground-truth tool
+ * selections per turn + optional "information used downstream" signals.
+ * Without those, we still compute descriptive metrics (error rate,
+ * retry rate, duplicate-call rate) that are useful on their own.
+ */
+interface ToolUseMetrics {
+    runId: string;
+    totalCalls: number;
+    byTool: Record<string, ToolStats>;
+    errorRate: number;
+    /** Ratio of calls with identical (toolName, argHash) already seen earlier in the same run. */
+    duplicateRate: number;
+    /** Ratio of error calls followed by ≥1 retry on same tool. */
+    retryRate: number;
+    /** Optional: of the calls the agent made, the fraction the evaluator marked as "correct selection". */
+    selectionAccuracy?: number;
+}
+interface ToolStats {
+    calls: number;
+    errors: number;
+    avgLatencyMs: number;
+    duplicates: number;
+}
+interface ToolUseOptions {
+    /** Map of spanId → whether the evaluator judged the tool selection correct. Optional. */
+    selectionLabels?: Record<string, boolean>;
+}
+declare function computeToolUseMetrics(store: TraceStore, runId: string, options?: ToolUseOptions): Promise<ToolUseMetrics>;
+/**
+ * Baseline regression detection.
+ *
+ * Lifted from ADC baseline.ts. Every promotion-blocking signal boils down
+ * to: "is this run measurably worse than baseline?" — with enough
+ * statistical rigor to distinguish noise from drift.
+ *
+ * Uses:
+ *   - Welch's t-test (unequal variance) for per-metric mean comparison
+ *   - Cohen's d for effect size magnitude
+ *   - IQR for stability flag (unstable samples can't be trusted for comparisons)
+ *
+ * Returns a structured verdict: improved | regressed | stable | unstable.
+ */
+interface MetricSamples {
+    /** Stable metric key (e.g. "overallScore", "firstTokenMs"). */
+    metric: string;
+    /** Whether higher values are better. */
+    higherIsBetter: boolean;
+    baseline: number[];
+    candidate: number[];
+}
+interface MetricVerdict {
+    metric: string;
+    baselineMean: number;
+    candidateMean: number;
+    delta: number;
+    cohensD: number;
+    welchT: number;
+    welchDf: number;
+    welchP: number;
+    stable: boolean;
+    /** IQR of the combined samples — used as a rough stability indicator. */
+    iqr: number;
+    verdict: 'improved' | 'regressed' | 'stable' | 'unstable';
+}
+interface BaselineReport {
+    metrics: MetricVerdict[];
+    /** True if any critical metric regressed. */
+    hasRegression: boolean;
+    /** True if any metric is unstable (too noisy to judge). */
+    hasUnstable: boolean;
+}
+interface BaselineOptions {
+    /** Effect size threshold for meaningful delta (default 0.5 — medium effect). */
+    effectThreshold?: number;
+    /** p-value threshold for statistical significance (default 0.05). */
+    alpha?: number;
+    /** IQR/mean ratio above which samples are flagged unstable (default 0.30). */
+    unstableCvThreshold?: number;
+}
+/**
+ * Compare candidate samples against baseline per metric. Verdict logic:
+ *   - unstable: IQR/|mean| > threshold on either set — not enough signal
+ *   - improved: meaningful effect in the "better" direction AND p < alpha
+ *   - regressed: meaningful effect in the "worse" direction AND p < alpha
+ *   - stable: otherwise (no significant change)
+ */
+declare function compareToBaseline(samples: MetricSamples[], options?: BaselineOptions): BaselineReport;
+/** Inter-quartile range; 0 when the sample has no spread. */
+declare function iqr(xs: number[]): number;
+/**
+ * Welch's t-test — unequal-variance two-sample t. Uses the same Student-t
+ * CDF as `pairedTTest` (via incomplete beta); falls back to normal tail
+ * when df is large.
+ */
+declare function welchsTTest(a: number[], b: number[]): {
+    t: number;
+    df: number;
+    p: number;
+};
+export { type BaselineOptions as B, type MetricSamples as M, type ToolStats as T, type BaselineReport as a, type MetricVerdict as b, computeToolUseMetrics as c, type ToolUseMetrics as d, type ToolUseOptions as e, compareToBaseline as f, iqr as i, welchsTTest as w };

package/dist/benchmarks/index.d.ts CHANGED Viewed

@@ -1,2 +1,3 @@
-export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as deterministicSplit, e as routing } from '../index-DDTlbHEK.js';
-import '../run-record-DNiOMBrZ.js';
+export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as deterministicSplit, e as routing } from '../index--fVrWDiR.js';
+import '../run-record-CqzahIbx.js';
+import '../errors-BZ9sTdz7.js';

package/dist/benchmarks/index.js CHANGED Viewed

@@ -2,7 +2,7 @@ import {
   BENCHMARK_SPLIT_SEED,
   deterministicSplit,
   routing_exports
-} from "../chunk-42I2QC2L.js";
+} from "../chunk-6QDKWHLS.js";
 import "../chunk-PZ5AY32C.js";
 export {
   BENCHMARK_SPLIT_SEED,