npm - @tangle-network/agent-eval - Versions diffs - 0.23.1 → 0.25.0 - Mend

@tangle-network/agent-eval 0.23.1 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (148) hide show

package/CHANGELOG.md +145 -0
package/README.md +212 -79
package/dist/baseline-4R5deP0N.d.ts +108 -0
package/dist/benchmarks/index.d.ts +3 -2
package/dist/benchmarks/index.js +1 -1
package/dist/builder-eval/index.d.ts +249 -0
package/dist/builder-eval/index.js +391 -0
package/dist/builder-eval/index.js.map +1 -0
package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
package/dist/chunk-2A5XJB43.js.map +1 -0
package/dist/chunk-47X6LRCE.js +76 -0
package/dist/chunk-47X6LRCE.js.map +1 -0
package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
package/dist/chunk-4F5DQN55.js.map +1 -0
package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
package/dist/chunk-4S4BM3QQ.js.map +1 -0
package/dist/chunk-5BKGXME7.js +65 -0
package/dist/chunk-5BKGXME7.js.map +1 -0
package/dist/{chunk-6KQG5HAH.js → chunk-5LBB5B3Z.js} +376 -72
package/dist/chunk-5LBB5B3Z.js.map +1 -0
package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
package/dist/chunk-6QDKWHLS.js.map +1 -0
package/dist/{chunk-VQQSPGSM.js → chunk-EDUKQ5AM.js} +247 -189
package/dist/chunk-EDUKQ5AM.js.map +1 -0
package/dist/chunk-I4MBDTY5.js +272 -0
package/dist/chunk-I4MBDTY5.js.map +1 -0
package/dist/chunk-JLZQWFV3.js +618 -0
package/dist/chunk-JLZQWFV3.js.map +1 -0
package/dist/chunk-K2TPS5LB.js +569 -0
package/dist/chunk-K2TPS5LB.js.map +1 -0
package/dist/chunk-KKHDIONI.js +414 -0
package/dist/chunk-KKHDIONI.js.map +1 -0
package/dist/chunk-KMPRBJK4.js +74 -0
package/dist/chunk-KMPRBJK4.js.map +1 -0
package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
package/dist/chunk-KTGTIOFD.js.map +1 -0
package/dist/chunk-LSH4MMOZ.js +838 -0
package/dist/chunk-LSH4MMOZ.js.map +1 -0
package/dist/chunk-NG236HPC.js +57 -0
package/dist/chunk-NG236HPC.js.map +1 -0
package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
package/dist/chunk-NLMNWKVM.js.map +1 -0
package/dist/chunk-NU65VQ7M.js +99 -0
package/dist/chunk-NU65VQ7M.js.map +1 -0
package/dist/chunk-OWLAAMME.js +250 -0
package/dist/chunk-OWLAAMME.js.map +1 -0
package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
package/dist/chunk-PC4UYEBM.js.map +1 -0
package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
package/dist/chunk-RAF443UI.js.map +1 -0
package/dist/chunk-RZTMDUO7.js +49 -0
package/dist/chunk-RZTMDUO7.js.map +1 -0
package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
package/dist/chunk-SESZDQPX.js.map +1 -0
package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
package/dist/chunk-TVVP3ZZQ.js.map +1 -0
package/dist/chunk-WWYCWKUM.js +196 -0
package/dist/chunk-WWYCWKUM.js.map +1 -0
package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
package/dist/chunk-YRZ4M5GS.js.map +1 -0
package/dist/chunk-ZN274SWR.js +613 -0
package/dist/chunk-ZN274SWR.js.map +1 -0
package/dist/cli.js +10 -6
package/dist/cli.js.map +1 -1
package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
package/dist/control.d.ts +8 -6
package/dist/control.js +10 -7
package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
package/dist/errors-BZ9sTdz7.d.ts +70 -0
package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
package/dist/governance/index.d.ts +5 -0
package/dist/governance/index.js +18 -0
package/dist/governance/index.js.map +1 -0
package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
package/dist/index-Oj9fAPPN.d.ts +270 -0
package/dist/index.d.ts +2018 -3003
package/dist/index.js +7443 -9102
package/dist/index.js.map +1 -1
package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
package/dist/knowledge/index.d.ts +102 -0
package/dist/knowledge/index.js +18 -0
package/dist/knowledge/index.js.map +1 -0
package/dist/meta-eval/index.d.ts +99 -0
package/dist/meta-eval/index.js +324 -0
package/dist/meta-eval/index.js.map +1 -0
package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
package/dist/openapi.json +491 -1
package/dist/optimization.d.ts +11 -8
package/dist/optimization.js +11 -9
package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
package/dist/pipelines/index.d.ts +172 -0
package/dist/pipelines/index.js +345 -0
package/dist/pipelines/index.js.map +1 -0
package/dist/prm/index.d.ts +99 -0
package/dist/prm/index.js +222 -0
package/dist/prm/index.js.map +1 -0
package/dist/query-DODUYdPg.d.ts +30 -0
package/dist/release-report-BNgMdqPF.d.ts +292 -0
package/dist/replay-BL96gCEP.d.ts +226 -0
package/dist/reporting.d.ts +10 -295
package/dist/reporting.js +10 -6
package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-BPT8x_NT.d.ts} +148 -146
package/dist/rl.d.ts +1762 -8
package/dist/rl.js +2035 -58
package/dist/rl.js.map +1 -1
package/dist/rubric-D5tjHNJQ.d.ts +72 -0
package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
package/dist/sequential-Dgz1n51-.d.ts +139 -0
package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-C7VPYEj2.d.ts} +3 -76
package/dist/telemetry/file.js +4 -1
package/dist/telemetry/file.js.map +1 -1
package/dist/telemetry/index.js +57 -57
package/dist/telemetry/index.js.map +1 -1
package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
package/dist/traces.d.ts +142 -387
package/dist/traces.js +1302 -40
package/dist/traces.js.map +1 -1
package/dist/trajectory-CnoBo-JY.d.ts +32 -0
package/dist/wire/index.d.ts +369 -25
package/dist/wire/index.js +22 -3
package/package.json +44 -18
package/dist/chunk-42I2QC2L.js.map +0 -1
package/dist/chunk-5IIQKMD5.js.map +0 -1
package/dist/chunk-6KQG5HAH.js.map +0 -1
package/dist/chunk-6M774GY6.js.map +0 -1
package/dist/chunk-7EAUOUQS.js.map +0 -1
package/dist/chunk-AXHNWLIX.js.map +0 -1
package/dist/chunk-EXGR4XEM.js.map +0 -1
package/dist/chunk-IOXMGMHQ.js.map +0 -1
package/dist/chunk-KAO3Q65R.js.map +0 -1
package/dist/chunk-LZKIOBG2.js +0 -2026
package/dist/chunk-LZKIOBG2.js.map +0 -1
package/dist/chunk-QBW3YBTR.js.map +0 -1
package/dist/chunk-QUKKGHTZ.js.map +0 -1
package/dist/chunk-SQQLHODJ.js.map +0 -1
package/dist/chunk-V5QSWN7L.js +0 -1310
package/dist/chunk-V5QSWN7L.js.map +0 -1
package/dist/chunk-VQQSPGSM.js.map +0 -1
package/dist/chunk-XPHOZPOM.js +0 -1947
package/dist/chunk-XPHOZPOM.js.map +0 -1
package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
package/dist/index-ekBXweiQ.d.ts +0 -1894
package/dist/sequential-DgU2mFsE.d.ts +0 -304

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,150 @@
 # Changelog
+## 0.25.0 — ProductionLoop primitive: close the eval → prod → eval cycle
+This release ships the **orchestration layer** that turns the existing
+eval substrate into a continuously-improving production system. Static
+prompts decay; today's regulation flips tomorrow. The pieces to close
+the loop were already in the package (`runMultiShotOptimization`,
+`failureClusterView`, `evaluateReleaseConfidence`, `extractPreferences`,
+`FeedbackTrajectoryStore`, `TraceStore`); this release adds the one
+clean primitive that wires them together end-to-end.
+### Added
+- **`runProductionLoop({ ... })`** (`src/production-loop.ts`,
+  `@experimental`) — one call = one cycle. Ingests production traces
+  and feedback, clusters failures, runs evolve against the worst
+  cluster, gates with `HeldOutGate` + `evaluateReleaseConfidence`
+  (fail-closed), and — when wired with an `AutoPrClient` — opens a PR
+  with the improved prompt. Idempotent + replayable: same `runId`
+  yields the same plan. Cron / GitHub Actions are the consumer's job;
+  the primitive doesn't own scheduling.
+- **`proposeAutomatedPullRequest(client, input)`** + two transports
+  (`src/auto-pr.ts`, `@experimental`):
+    - `httpGithubClient({ token, ... })` — direct REST against
+      `api.github.com`, no extra deps. Idempotent on branch name:
+      existing open PRs are returned, not duplicated.
+    - `ghCliClient({ ... })` — shells out to `gh` for environments
+      where developer auth state is already configured.
+  Both validate inputs (no `..` paths, no whitespace branches, no
+  duplicate file changes) and surface `ValidationError` / `ConfigError`
+  from the typed taxonomy.
+- **`POST /v1/feedback` + `POST /v1/traces/ingest`** wire endpoints
+  (`src/wire/`). Both Zod-validated, both append to the configured
+  store (`FeedbackTrajectoryStore` / `TraceStore`). 503 when no store
+  is wired (fail loud, not silent). Traces ingest accepts both
+  `application/json` (`{events:[...]}`) and `application/x-ndjson` for
+  streaming production runtimes. Schemas (`TraceEvent`,
+  `FeedbackTrajectory`, `TracesIngestRequest/Response`,
+  `FeedbackIngestResponse`) added to `openapi.json` for cross-language
+  clients.
+- **Optional bearer-token auth** on the wire server, configured via
+  `createApp({ auth: { bearer: '...' } })` or as a verifier function
+  for rotating tokens. `/healthz` and `/v1/version` remain unprotected
+  (regression: never lock monitoring out of the runtime).
+- **`examples/production-loop/`** — synthetic end-to-end demo wiring
+  the loop against in-memory trace + feedback stores and a fake
+  auto-PR client. Shows the failure-cluster trigger, the evolve round,
+  the gate verdict, and the PR-shaped output without requiring
+  credentials or a live model.
+### Changed
+- **Wire server** (`createApp(opts)`) now accepts optional
+  `IngestionStores` (`{ traceStore?, feedbackStore? }`) and `auth`.
+  Existing zero-arg callers continue to work — judge / rubrics /
+  version / healthz are unchanged.
+### Status tags
+- Every new export is `@experimental` initially. Pin the patch version
+  if you depend on it. All other 0.24.0 stability tags are preserved.
+## 0.24.0 — DX cleanup: framing, stability tags, lint, taxonomy, strict indices
+This release is **DX + correctness**. No production behavior changed; consumer
+contracts were tightened across the board. The library went from 7.5/10 to
+10/10 on first-touch usability and contract clarity. The visible deltas:
+### Strictness
+- **`noUncheckedIndexedAccess: true`** in `tsconfig.json`. 251 latent
+  `T | undefined` sites surfaced and fixed across ~70 files. Loop-bound
+  indices documented with `!`, external lookups guarded explicitly, accumulator
+  patterns refactored to capture-then-assign. Every fix audited for semantic
+  correctness (math code: `!`; untrusted data: guards).
+- **Subpath imports forced.** Six `export * from './X'` wildcards at root
+  deleted (`./rl`, `./pipelines`, `./builder-eval`, `./meta-eval`, `./prm`,
+  `./trace-analyst`). New subpaths in `package.json`: `/pipelines`,
+  `/meta-eval`, `/prm`, `/builder-eval`, `/governance`, `/knowledge`. Root
+  re-exports retained only for the load-bearing capture-integrity surface
+  (`./trace`, `./knowledge`, `./governance`).
+- **Error taxonomy.** New `src/errors.ts` exports `AgentEvalError` base plus
+  `ValidationError`, `NotFoundError`, `ConfigError`, `CaptureIntegrityError`,
+  `JudgeError`, `VerificationError`, `ReplayError`. Existing custom errors
+  re-parented: `ReplayCacheMissError`, `BudgetBreachError`, `RunIntegrityError`,
+  `HoldoutLockedError`, `RunRecordValidationError`, `LlmCallError`,
+  `LlmRouteAssertionError`, `TraceFileMissingError`, `TraceNotFoundError`,
+  `SpanNotFoundError`. ~25 user-facing `throw new Error(...)` calls migrated
+  to typed errors across `rl/*`, `replay`, `sandbox-harness`, `statistics`,
+  `release-confidence`, `visual-diff`, `counterfactual`, `run-critic`,
+  `observability`. Internal invariant guards intentionally left as plain
+  `Error` — those are bugs, not contract failures.
+- **`LlmRouteAssertionError.code` → `reason`** (breaking, greenfield).
+  The subclass's route-specific reason now lives on `.reason`; the base
+  category `code = 'capture_integrity'` survives via the `AgentEvalError`
+  contract.
+### Visible deltas
+### Changed
+- **README reframed** as the substrate for self-improving agents. The package
+  has shipped `EvalCampaign`, replay, GEPA / reflective mutation, auto-research,
+  active curriculum, contamination probes, tournaments, compute curves, PRM,
+  off-policy estimators, and sequential anytime-valid stats since 0.22 — the
+  README now actually names them, not just "evaluation infrastructure."
+- **`src/rl/index.ts` carries stability markers** — every re-export is tagged
+  `@stable` or `@experimental` via JSDoc. Stable: `run-record-adapters`,
+  `verifiable-reward`, `preferences`, `off-policy`, `tournament`,
+  `contamination`, `compute-curves`. Experimental: `process-reward`,
+  `adversarial`, `active-curriculum`, `reward-hacking`, `adaptation-eval`,
+  `exporters`, `rl-campaign`, `predictive-validity-researcher`, `auto-research`.
+  Tags are visible in IDE hover and emitted into `dist/rl.d.ts` so consumers
+  can see the contract at the call site.
+### Added
+- **Biome lint + format** — `biome.json` codifies the project style (no
+  semicolons, single quotes, 2-space indent, 100 col, `noNonNullAssertion`
+  off, `useNodejsImportProtocol` on). `pnpm lint` and `pnpm format` scripts.
+- **`.github/workflows/ci.yml`** — runs typecheck + lint + test + build +
+  Python pytest on every PR. Previously only the publish workflow on tag
+  push exercised this surface; PRs were unguarded.
+- **`ReplayCache.entries()`** — public iterator for the cached
+  `(request, response)` pairs. Replaces the bracket-access escape hatch into
+  the private `byKey` map. Same semantics, exposed in the type contract.
+- **Per-example READMEs** — `examples/multi-shot-optimization` and
+  `examples/same-sandbox-harness` now document what they show, how to run,
+  expected output, and adaptation guidance. The other three examples already
+  had READMEs; the README index now links to all five.
+- **`clients/python/examples/judge_anti_slop.py`** — runnable script that
+  doubles as a pytest, anchoring the `judge` API contract: composite in
+  `[0, 1]`, `RubricNotFoundError` for bogus rubric name, `ValidationError`
+  for no-rubric call.
+### Fixed
+- **`reflective-mutation.ts`** — local `escape` variable shadowed the global
+  `escape` property. Renamed to `escaped`. No behavior change; flagged by
+  biome.
 ## 0.23.1 — FileSystemTraceStore.updateRun no longer double-appends
 ### Fixed

package/README.md CHANGED Viewed

@@ -1,32 +1,39 @@
 # @tangle-network/agent-eval
-Evaluation infrastructure for agent products.
-Use it to wrap the real workflow your users run, record what happened, verify
-the result, turn feedback into replay data, compare variants, and ship only
-when the evidence improves.
+**Substrate for self-improving agents.** Trace what runs, verify the result,
+turn outcomes into preferences and rewards, mutate prompts and policies under
+anytime-valid evidence, and ship only when the improvement is decisive.
 ```txt
-product task
-  -> observe state
-  -> validate with deterministic gates first
-  -> act through the real product adapter
-  -> trace + feedback trajectory
-  -> replay / optimize / release gate
+real product task
+  -> observe / act (your runtime)
+  -> trace + verifier pipeline (capture integrity)
+  -> RunRecord (canonical eval artifact)
+       -> judge calibration · paired stats · sequential α
+       -> preferences · verifiable rewards · process rewards
+       -> GEPA / reflective mutation · auto-research · active curriculum
+       -> release gate · replay · contamination probe · tournament rating
+  -> next iteration
 ```
-`agent-eval` does not own product state, credentials, UI, storage, model
+`agent-eval` does **not** own product state, credentials, UI, storage, model
 routing, browser drivers, sandbox policy, or deployment. Products own those.
-This package owns eval contracts, loop mechanics, traces, statistics,
-optimization inputs, and release evidence.
+This package owns the loop that closes evaluation → preference → mutation →
+redeploy, with capture integrity and statistically rigorous evidence at every
+step.
+It ships as a TypeScript library (npm) with a generated Python client (PyPI),
+both speaking the same wire protocol. MIT, self-hostable, no SaaS dependency.
 ## Install
 ```sh
 pnpm add @tangle-network/agent-eval
+# or, from Python:
+pip install agent-eval-rpc
 ```
-## Quick Start
+## Quick Start — the control loop
 ```ts
 import {
@@ -78,68 +85,171 @@ const result = await runAgentControlLoop({
 await product.storeEvalResult(task.id, result)
 ```
-That loop should be the same shape in production, replay, benchmark, and
-optimization. Swap dependencies behind `observe()` and `act()`, not the eval
-contract itself.
+Same loop shape in production, replay, benchmark, and optimization. Swap the
+dependencies behind `observe()` and `act()`, never the eval contract.
-## Import Paths
+## Production loop — close the eval → prod → eval cycle (0.25.0)
+Static prompts decay. Yesterday's FTC rule flips today; yesterday's tool quirk
+becomes today's incident. The production agents that win are the ones that
+**continuously re-train against live failure modes**.
-The root export remains available, but new code should prefer focused subpaths:
+`runProductionLoop` is the orchestration layer that wires the existing eval
+substrate into a self-improvement cron:
 ```ts
-import { runAgentControlLoop } from '@tangle-network/agent-eval/control'
-import { runMultiShotOptimization } from '@tangle-network/agent-eval/optimization'
-import { TraceEmitter } from '@tangle-network/agent-eval/traces'
-import { renderReleaseReport } from '@tangle-network/agent-eval/reporting'
+import {
+  runProductionLoop,
+  httpGithubClient,
+  FileSystemFeedbackTrajectoryStore,
+} from '@tangle-network/agent-eval'
+import { FileSystemTraceStore } from '@tangle-network/agent-eval/traces'
+const result = await runProductionLoop({
+  runId: `weekly-${new Date().toISOString().slice(0, 10)}`,
+  target: 'tax-agent',
+  // 1. Where production traces + feedback land. Wire the HTTP ingestion
+  //    endpoints (POST /v1/traces/ingest, POST /v1/feedback) from your
+  //    runtime; the same store reads them here.
+  traceStore: new FileSystemTraceStore({ dir: 'data/prod-traces' }),
+  feedbackStore: new FileSystemFeedbackTrajectoryStore({ dir: 'data/prod-feedback' }),
+  // 2. Cluster threshold: act on failure groups ≥ 20 runs or ≥ 5% of corpus.
+  cluster: { minClusterSize: 20, minSeverityRatio: 0.05, maxClustersPerCycle: 1 },
+  // 3. Evolve: seed = current prompt, gate against holdout scenarios.
+  evolve: {
+    baselinePrompt: currentSystemPrompt,
+    holdoutScenarios: productionShapeScenarios,
+    runner,                            // your agent driver
+    scorer,                            // calibrated judge or rubric
+    mutator,                           // GEPA-style or addendum-style mutator
+    gate: {
+      baselineKey: 'baseline',
+      minProductiveRuns: 5,
+      pairedDeltaThreshold: 0.03,      // require Nσ improvement on holdout
+      overfitGapThreshold: 0.10,
+    },
+  },
+  // 4. Ship: when the gate passes, open a PR with the new prompt.
+  ship: {
+    client: httpGithubClient({ token: process.env.GITHUB_TOKEN! }),
+    repo: { owner: 'tangle-network', name: 'tax-agent' },
+    branchPrefix: 'eval/auto-improve',
+    promptFilePath: 'prompts/tax-agent-system.txt',
+    reviewers: ['drew'],
+  },
+  cron: { cadence: 'weekly' },         // surface-only; consumer schedules
+})
+console.log(result.decision)            // 'pr_opened' | 'gate_failed' | 'no_actionable_failures' | ...
+console.log(result.pullRequest?.prUrl)  // populated when a PR was opened
 ```
+The primitive runs **one cycle**. Schedule it in GitHub Actions with a
+`schedule` (cron) trigger, plus `workflow_dispatch` for manual runs. It is
+**idempotent + replayable**: same `runId` → same plan.
+Gate failures are fail-closed — a candidate that beats baseline on search but
+overfits on holdout never lands.
+Full runnable demo (synthetic traces, no credentials) in
+[`examples/production-loop`](./examples/production-loop/README.md).
+## Self-improvement loop
+Eval doesn't end at "pass/fail." Outcomes become training signal, mutation
+proposals, and curriculum updates — all from the same `RunRecord` produced by
+the control loop.
+```ts
+import { runEvalCampaign } from '@tangle-network/agent-eval'
+import {
+  extractPreferences,
+  extractVerifiableReward,
+  filterDeterministicallyRewarded,
+  offPolicyEstimateAll,
+  analyzeOptimizationResult,
+} from '@tangle-network/agent-eval/rl'
+// 1. Run a matrix of variants × scenarios with capture integrity by construction.
+const campaign = await runEvalCampaign({ variants, scenarios, run })
+// 2. Convert outcomes into RL signal.
+const rewards = extractVerifiableReward(campaign.runs)          // compile/test/schema
+const prefs   = extractPreferences(campaign.runs)               // (chosen, rejected) triples
+const clean   = filterDeterministicallyRewarded(rewards)        // judge-noise free
+// 3. Estimate a candidate policy's value without re-running.
+const ope = offPolicyEstimateAll(campaign.runs, candidatePolicy)  // IPS + SNIPS + DR
+// 4. Or close the loop end-to-end: score → reflect → mutate → re-run.
+const next = await analyzeOptimizationResult(campaign, { researcher })
+```
+| Step | Primitive | Subpath |
+| --- | --- | --- |
+| Eval matrix with integrity | `runEvalCampaign` | `/` |
+| Deterministic re-judge / audit | `ReplayCache`, `createReplayFetch` | `/` |
+| Anytime-valid α across rolling looks | `pairedEvalueSequence` | `/reporting` |
+| Judge quality vs gold | `calibrateJudge` (κ, Pearson, MAE, bias probes) | `/` |
+| (chosen, rejected) for DPO/KTO/PPO | `extractPreferences` | `/rl` |
+| Verifiable reward signal | `extractVerifiableReward` | `/rl` |
+| Step-level / PRM training data | `extractStepRewards`, `prmTrainingPairs` | `/rl` |
+| Estimate policy value off-policy | `offPolicyEstimateAll` (IPS + SNIPS + DR) | `/rl` |
+| GEPA / reflective prompt mutation | `buildReflectionPrompt`, `parseReflectionResponse`, Ax-GEPA `SteeringOptimizer` | `/` `/optimization` |
+| Auto-research (read runs → propose) | `analyzeOptimizationResult`, `PredictiveValidityResearcher` | `/rl` |
+| Active curriculum (variance / Thompson) | `allocateCurriculum` | `/rl` |
+| Tournament ratings (Bradley-Terry + Elo) | `fitBradleyTerry`, `applyEloUpdate` | `/rl` |
+| Adversarial scenario search | `adversarialScenarioSearch` | `/rl` |
+| Contamination probe (held-out perturb) | `runContaminationProbe` | `/rl` |
+| Reward hacking signatures | `detectRewardHacking` | `/rl` |
+| Compute curves (best-of-N, self-consist, Pareto) | `runComputeCurve`, `bestOfN`, `selfConsistency`, `paretoFrontier` | `/rl` |
+| Knowledge gap separated from reasoning gap | `scoreKnowledgeReadiness` | `/` |
+| Release gate (paired evidence + holdouts) | `evaluateReleaseConfidence`, `HeldOutGate` | `/reporting` |
+| Launch report (decision-grade) | `renderReleaseReport`, `researchReport` | `/reporting` |
+## Import Paths
 | Subpath | Use for |
 | --- | --- |
-| `@tangle-network/agent-eval/control` | `observe -> validate -> decide -> act`, action policy, propose/review loops |
+| `@tangle-network/agent-eval/control` | `observe → validate → decide → act`, action policy, propose/review loops |
 | `@tangle-network/agent-eval/traces` | trace stores, emitters, TraceAnalyst, replay |
-| `@tangle-network/agent-eval/optimization` | feedback trajectories, multi-shot optimization, prompt evolution, EvalCampaign |
-| `@tangle-network/agent-eval/reporting` | release confidence, paired stats, sequential e-values, report/table/chart specs, predictive validity |
-| `@tangle-network/agent-eval/rl` | RL bridge: adapters, verifiable rewards, preferences, OPE, PRM, contamination, tournaments, adversarial, compute curves |
-| `@tangle-network/agent-eval/wire` | HTTP/RPC judge server and schemas |
+| `@tangle-network/agent-eval/optimization` | feedback trajectories, multi-shot, prompt evolution, GEPA, EvalCampaign |
+| `@tangle-network/agent-eval/reporting` | release confidence, paired stats, sequential e-values, launch reports |
+| `@tangle-network/agent-eval/rl` | adapters, verifiable rewards, preferences, OPE, PRM, contamination, tournaments, adversarial, compute curves, auto-research |
+| `@tangle-network/agent-eval/wire` | HTTP/RPC server + schemas (same protocol the Python client speaks) |
 | `@tangle-network/agent-eval/benchmarks` | benchmark adapter contracts and reference wrappers |
-## Core Pieces
+The root export remains available for convenience; new code should prefer
+focused subpaths. Import anything under `/rl` from `/rl`: the root wildcard
+re-exports for `./rl`, `./pipelines`, `./builder-eval`, `./meta-eval`, `./prm`,
+and `./trace-analyst` were removed in 0.24.0 (see the CHANGELOG).
+## API stability
-| Need | Use |
+Public exports are tagged with JSDoc stability markers so consumers can see
+status at the call site (IDE hover, language server, declaration files).
+| Tag | Meaning |
 | --- | --- |
-| Keep an agent working until objective state passes | `runAgentControlLoop` |
-| Turn user/reviewer feedback into replay data | `FeedbackTrajectory` |
-| Compare prompt/tool/retrieval policies over full trajectories | `runMultiShotOptimization` |
-| Gate releases with paired evidence and holdouts | `evaluateReleaseConfidence`, `HeldOutGate` |
-| Explain regressions across trace corpora | `TraceAnalyst` / `analyzeTraces` |
-| Report a launch decision | `renderReleaseReport`, `researchReport`, `summaryTable`, `paretoChart`, `gainHistogram` |
-| Capture every provider HTTP request / response for forensics | `RawProviderSink`, `LlmClientOptions.rawSink` |
-| Fail loud if an eval would silently use the wrong route | `assertLlmRoute` |
-| Assert at run-end that the artifact is complete | `assertRunCaptured`, `throwIfRunIncomplete` |
-| Auto-execute the trace analyst on every run | `traceAnalystOnRunComplete` + `TraceEmitterOptions.onRunComplete` |
-| Run a matrix of variants × scenarios × seeds with capture integrity by construction | `runEvalCampaign` |
-| Re-judge / determinism-audit a past campaign for free | `ReplayCache`, `createReplayFetch` |
-| Ship-when-decisive with anytime-valid α across rolling looks | `pairedEvalueSequence`, `evaluateInterimReleaseConfidence` |
-| Tell load-bearing rubrics from decorative ones using deployment outcomes | `rubricPredictiveValidity` |
-| Bridge legacy optimization output to canonical `RunRecord[]` | `trialsToRunRecords`, `verificationReportToRunRecord` |
-| Extract a clean reward signal for RL training (compile/test/schema vs judge) | `extractVerifiableReward`, `filterDeterministicallyRewarded` |
-| Produce DPO / PPO / KTO `(chosen, rejected)` triples | `extractPreferences` |
-| Estimate a new policy's value on old trajectories without re-running | `offPolicyEstimateAll` (IPS + SNIPS + DR) |
-| Step-level credit assignment / PRM training data | `extractStepRewards`, `prmTrainingPairs` |
-| Detect benchmark contamination via held-out perturbations | `runContaminationProbe` |
-| Pairwise tournament ratings for many-candidate sweeps | `fitBradleyTerry`, `applyEloUpdate` |
-| Active search for inputs the policy fails on | `adversarialScenarioSearch` |
-| Characterise a candidate across compute budgets | `runComputeCurve`, `bestOfN`, `selfConsistency`, `paretoFrontier` |
-| Model missing context separately from bad reasoning | `KnowledgeRequirement`, `KnowledgeBundle` |
-### Capture integrity (0.21+)
+| `@stable` | API frozen at this major. Breaking changes require a major bump. |
+| `@experimental` | Interface may evolve before becoming `@stable`. Pin the patch version if you depend on it. |
+| `@internal` | Not part of the public contract. Use the documented subpath instead. |
+The `/rl` subpath is the most active surface. See
+[`src/rl/index.ts`](./src/rl/index.ts) for the current stable/experimental
+breakdown.
+## Capture integrity (0.21+)
 Launch-grade benchmark runs need four things that are easy to forget in glue
 code: (1) raw HTTP capture alongside the structured spans so a reviewer can
 verify which route answered, (2) a preflight assertion that the configured
 client points at the intended provider, (3) a run-end assertion that the
 expected events were actually written, and (4) auto-execution of the trace
-analyst as part of the run lifecycle. The wiring fits in a few lines:
+analyst as part of the run lifecycle.
 ```ts
 import {
@@ -168,28 +278,35 @@ Directives, rationale, and shipped-bug context are in
 ## Examples
-Runnable examples live in
-[`examples/`](https://github.com/tangle-network/agent-eval/tree/main/examples).
+Each example has its own README with what it demonstrates, expected output,
+and runtime. See [`examples/`](./examples/).
-- [`examples/multi-shot-optimization`](https://github.com/tangle-network/agent-eval/tree/main/examples/multi-shot-optimization):
+- [`examples/multi-shot-optimization`](./examples/multi-shot-optimization/README.md):
   optimize full trajectories with held-out promotion.
-- [`examples/same-sandbox-harness`](https://github.com/tangle-network/agent-eval/tree/main/examples/same-sandbox-harness):
+- [`examples/same-sandbox-harness`](./examples/same-sandbox-harness/README.md):
   run setup/build/test and evidence checks in one workspace.
-- [`examples/benchmarks`](https://github.com/tangle-network/agent-eval/tree/main/examples/benchmarks):
+- [`examples/benchmarks`](./examples/benchmarks/README.md):
   benchmark adapter shape and reference wrappers.
+- [`examples/auto-research-with-agent-builder`](./examples/auto-research-with-agent-builder/README.md):
+  closed loop — score, reflect, mutate, re-score, repeat.
+- [`examples/fine-tune-with-prime-rl`](./examples/fine-tune-with-prime-rl/README.md):
+  RunRecord → preferences → trainer (prime-rl) → next campaign.
+- [`examples/production-loop`](./examples/production-loop/README.md):
+  ingest prod traces + feedback, cluster failures, evolve, gate, open a PR.
 ## Docs
 Read in this order:
-1. [Product Eval Adoption](./docs/product-eval-adoption.md)
-2. [Control Runtime](./docs/control-runtime.md)
-3. [Feedback Trajectories](./docs/feedback-trajectories.md)
-4. [Multi-Shot Optimization](./docs/multi-shot-optimization.md)
-5. [Trace Analysis](./docs/trace-analysis.md)
-6. [Knowledge Readiness](./docs/knowledge-readiness.md)
-7. [Integration Launch Gates](./docs/integration-launch-gates.md)
-8. [Wire Protocol](./docs/wire-protocol.md)
+1. [Concepts](./docs/concepts.md) — mental model, 5 min
+2. [Product Eval Adoption](./docs/product-eval-adoption.md)
+3. [Control Runtime](./docs/control-runtime.md)
+4. [Feedback Trajectories](./docs/feedback-trajectories.md)
+5. [Multi-Shot Optimization](./docs/multi-shot-optimization.md)
+6. [Trace Analysis](./docs/trace-analysis.md)
+7. [Knowledge Readiness](./docs/knowledge-readiness.md)
+8. [Integration Launch Gates](./docs/integration-launch-gates.md)
+9. [Wire Protocol](./docs/wire-protocol.md) — required for non-TypeScript consumers
 ## CLI / Wire Protocol
@@ -198,28 +315,44 @@ npm i -g @tangle-network/agent-eval
 agent-eval serve --port 5005
 ```
-The Python client lives in `clients/python`:
+Python:
 ```sh
-cd clients/python
-pip install -e .
+pip install agent-eval-rpc
 ```
+```py
+from agent_eval_rpc import Client
+client = Client()  # auto-detects HTTP server, falls back to subprocess
+score = await client.judge(content=output, rubric_name="anti-slop")
+```
+TypeScript is the source of truth. Python is a thin transport client over the
+generated OpenAPI schema. Schema drift is prevented at release time
+(version-locked CI).
 ## Development
 ```sh
 pnpm install
 pnpm typecheck
 pnpm test
-pnpm build
-pnpm openapi
+pnpm lint        # biome
+pnpm build       # tsup + openapi.json
 ```
 ## Related Packages
-- `@tangle-network/agent-runtime`: production session/runtime layer.
-- `@tangle-network/agent-knowledge`: source-grounded knowledge bases and readiness.
-- `@tangle-network/agent-integrations`: connection, grant, capability, and integration invocation contracts.
+- [`@tangle-network/agent-runtime`](https://www.npmjs.com/package/@tangle-network/agent-runtime):
+  production session/runtime layer.
+- [`@tangle-network/agent-knowledge`](https://www.npmjs.com/package/@tangle-network/agent-knowledge):
+  source-grounded knowledge bases and readiness.
+- [`@tangle-network/agent-integrations`](https://www.npmjs.com/package/@tangle-network/agent-integrations):
+  connection, grant, capability, and integration invocation contracts.
+Together: `agent-runtime` is where the agent runs; `agent-knowledge` is what
+it knows; `agent-integrations` is what it can do; `agent-eval` is how it gets
+better.
 ## License

package/dist/baseline-4R5deP0N.d.ts ADDED Viewed

@@ -0,0 +1,108 @@
+import { T as TraceStore } from './store-Db2Bv8Cf.js';
+/**
+ * Tool-use metrics — derived purely from trace data.
+ *
+ * No scoring assumptions: consumers supply optional ground-truth tool
+ * selections per turn + optional "information used downstream" signals.
+ * Without those, we still compute descriptive metrics (error rate,
+ * retry rate, duplicate-call rate) that are useful on their own.
+ */
+interface ToolUseMetrics {
+    runId: string;
+    totalCalls: number;
+    byTool: Record<string, ToolStats>;
+    errorRate: number;
+    /** Ratio of calls with identical (toolName, argHash) already seen earlier in the same run. */
+    duplicateRate: number;
+    /** Ratio of error calls followed by ≥1 retry on the same tool. */
+    retryRate: number;
+    /** Optional: of the calls the agent made, the fraction the evaluator marked as "correct selection". */
+    selectionAccuracy?: number;
+}
+interface ToolStats {
+    calls: number;
+    errors: number;
+    avgLatencyMs: number;
+    duplicates: number;
+}
+interface ToolUseOptions {
+    /** Map of spanId → whether the evaluator judged the tool selection correct. Optional. */
+    selectionLabels?: Record<string, boolean>;
+}
+declare function computeToolUseMetrics(store: TraceStore, runId: string, options?: ToolUseOptions): Promise<ToolUseMetrics>;
+/**
+ * Baseline regression detection.
+ *
+ * Lifted from ADC baseline.ts. Every promotion-blocking signal boils down
+ * to: "is this run measurably worse than baseline?" — with enough
+ * statistical rigor to distinguish noise from drift.
+ *
+ * Uses:
+ *   - Welch's t-test (unequal variance) for per-metric mean comparison
+ *   - Cohen's d for effect size magnitude
+ *   - IQR for stability flag (unstable samples can't be trusted for comparisons)
+ *
+ * Returns a structured verdict: improved | regressed | stable | unstable.
+ */
+interface MetricSamples {
+    /** Stable metric key (e.g. "overallScore", "firstTokenMs"). */
+    metric: string;
+    /** Whether higher values are better. */
+    higherIsBetter: boolean;
+    baseline: number[];
+    candidate: number[];
+}
+interface MetricVerdict {
+    metric: string;
+    baselineMean: number;
+    candidateMean: number;
+    delta: number;
+    cohensD: number;
+    welchT: number;
+    welchDf: number;
+    welchP: number;
+    stable: boolean;
+    /** IQR of the combined samples — used as a rough stability indicator. */
+    iqr: number;
+    verdict: 'improved' | 'regressed' | 'stable' | 'unstable';
+}
+interface BaselineReport {
+    metrics: MetricVerdict[];
+    /** True if any critical metric regressed. */
+    hasRegression: boolean;
+    /** True if any metric is unstable (too noisy to judge). */
+    hasUnstable: boolean;
+}
+interface BaselineOptions {
+    /** Effect size threshold for meaningful delta (default 0.5 — medium effect). */
+    effectThreshold?: number;
+    /** p-value threshold for statistical significance (default 0.05). */
+    alpha?: number;
+    /** IQR/|mean| ratio above which samples are flagged unstable (default 0.30). */
+    unstableCvThreshold?: number;
+}
+/**
+ * Compare candidate samples against baseline per metric. Verdict logic:
+ *   - unstable: IQR/|mean| > threshold on either set — not enough signal
+ *   - improved: meaningful effect in the "better" direction AND p < alpha
+ *   - regressed: meaningful effect in the "worse" direction AND p < alpha
+ *   - stable: otherwise (no significant change)
+ */
+declare function compareToBaseline(samples: MetricSamples[], options?: BaselineOptions): BaselineReport;
+/** Inter-quartile range; 0 when the sample has no spread. */
+declare function iqr(xs: number[]): number;
+/**
+ * Welch's t-test — unequal-variance two-sample t. Uses the same Student-t
+ * CDF as `pairedTTest` (via incomplete beta); falls back to normal tail
+ * when df is large.
+ */
+declare function welchsTTest(a: number[], b: number[]): {
+    t: number;
+    df: number;
+    p: number;
+};
+export { type BaselineOptions as B, type MetricSamples as M, type ToolStats as T, type BaselineReport as a, type MetricVerdict as b, computeToolUseMetrics as c, type ToolUseMetrics as d, type ToolUseOptions as e, compareToBaseline as f, iqr as i, welchsTTest as w };

package/dist/benchmarks/index.d.ts CHANGED Viewed

@@ -1,2 +1,3 @@
-export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as deterministicSplit, e as routing } from '../index-DDTlbHEK.js';
-import '../run-record-DNiOMBrZ.js';
+export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as deterministicSplit, e as routing } from '../index--fVrWDiR.js';
+import '../run-record-CqzahIbx.js';
+import '../errors-BZ9sTdz7.js';

package/dist/benchmarks/index.js CHANGED Viewed

@@ -2,7 +2,7 @@ import {
   BENCHMARK_SPLIT_SEED,
   deterministicSplit,
   routing_exports
-} from "../chunk-42I2QC2L.js";
+} from "../chunk-6QDKWHLS.js";
 import "../chunk-PZ5AY32C.js";
 export {
   BENCHMARK_SPLIT_SEED,