@tangle-network/agent-eval 0.23.0 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148) hide show
  1. package/CHANGELOG.md +102 -0
  2. package/README.md +141 -79
  3. package/dist/baseline-4R5deP0N.d.ts +108 -0
  4. package/dist/benchmarks/index.d.ts +3 -2
  5. package/dist/benchmarks/index.js +1 -1
  6. package/dist/builder-eval/index.d.ts +249 -0
  7. package/dist/builder-eval/index.js +391 -0
  8. package/dist/builder-eval/index.js.map +1 -0
  9. package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
  10. package/dist/chunk-2A5XJB43.js.map +1 -0
  11. package/dist/chunk-47X6LRCE.js +76 -0
  12. package/dist/chunk-47X6LRCE.js.map +1 -0
  13. package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
  14. package/dist/chunk-4F5DQN55.js.map +1 -0
  15. package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
  16. package/dist/chunk-4S4BM3QQ.js.map +1 -0
  17. package/dist/chunk-5BKGXME7.js +65 -0
  18. package/dist/chunk-5BKGXME7.js.map +1 -0
  19. package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
  20. package/dist/chunk-6QDKWHLS.js.map +1 -0
  21. package/dist/chunk-I4MBDTY5.js +272 -0
  22. package/dist/chunk-I4MBDTY5.js.map +1 -0
  23. package/dist/chunk-K2TPS5LB.js +569 -0
  24. package/dist/chunk-K2TPS5LB.js.map +1 -0
  25. package/dist/chunk-KKHDIONI.js +414 -0
  26. package/dist/chunk-KKHDIONI.js.map +1 -0
  27. package/dist/chunk-KMPRBJK4.js +74 -0
  28. package/dist/chunk-KMPRBJK4.js.map +1 -0
  29. package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
  30. package/dist/chunk-KTGTIOFD.js.map +1 -0
  31. package/dist/chunk-LSH4MMOZ.js +838 -0
  32. package/dist/chunk-LSH4MMOZ.js.map +1 -0
  33. package/dist/chunk-NG236HPC.js +57 -0
  34. package/dist/chunk-NG236HPC.js.map +1 -0
  35. package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
  36. package/dist/chunk-NLMNWKVM.js.map +1 -0
  37. package/dist/chunk-NU65VQ7M.js +99 -0
  38. package/dist/chunk-NU65VQ7M.js.map +1 -0
  39. package/dist/chunk-OHEPNJQN.js +554 -0
  40. package/dist/chunk-OHEPNJQN.js.map +1 -0
  41. package/dist/chunk-OWLAAMME.js +250 -0
  42. package/dist/chunk-OWLAAMME.js.map +1 -0
  43. package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
  44. package/dist/chunk-PC4UYEBM.js.map +1 -0
  45. package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
  46. package/dist/chunk-RAF443UI.js.map +1 -0
  47. package/dist/chunk-RZTMDUO7.js +49 -0
  48. package/dist/chunk-RZTMDUO7.js.map +1 -0
  49. package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
  50. package/dist/chunk-SESZDQPX.js.map +1 -0
  51. package/dist/{chunk-6KQG5HAH.js → chunk-SY6WAAAD.js} +84 -71
  52. package/dist/chunk-SY6WAAAD.js.map +1 -0
  53. package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
  54. package/dist/chunk-TVVP3ZZQ.js.map +1 -0
  55. package/dist/{chunk-VQQSPGSM.js → chunk-VRJVTXRV.js} +169 -111
  56. package/dist/chunk-VRJVTXRV.js.map +1 -0
  57. package/dist/chunk-WWYCWKUM.js +196 -0
  58. package/dist/chunk-WWYCWKUM.js.map +1 -0
  59. package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
  60. package/dist/chunk-YRZ4M5GS.js.map +1 -0
  61. package/dist/chunk-ZN274SWR.js +613 -0
  62. package/dist/chunk-ZN274SWR.js.map +1 -0
  63. package/dist/cli.js +10 -6
  64. package/dist/cli.js.map +1 -1
  65. package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
  66. package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
  67. package/dist/control.d.ts +8 -6
  68. package/dist/control.js +10 -7
  69. package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
  70. package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
  71. package/dist/errors-BZ9sTdz7.d.ts +70 -0
  72. package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
  73. package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
  74. package/dist/governance/index.d.ts +5 -0
  75. package/dist/governance/index.js +18 -0
  76. package/dist/governance/index.js.map +1 -0
  77. package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
  78. package/dist/index-Oj9fAPPN.d.ts +270 -0
  79. package/dist/index.d.ts +1866 -3151
  80. package/dist/index.js +5457 -7809
  81. package/dist/index.js.map +1 -1
  82. package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
  83. package/dist/knowledge/index.d.ts +102 -0
  84. package/dist/knowledge/index.js +18 -0
  85. package/dist/knowledge/index.js.map +1 -0
  86. package/dist/meta-eval/index.d.ts +99 -0
  87. package/dist/meta-eval/index.js +324 -0
  88. package/dist/meta-eval/index.js.map +1 -0
  89. package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
  90. package/dist/openapi.json +1 -1
  91. package/dist/optimization.d.ts +11 -8
  92. package/dist/optimization.js +11 -9
  93. package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
  94. package/dist/pipelines/index.d.ts +172 -0
  95. package/dist/pipelines/index.js +409 -0
  96. package/dist/pipelines/index.js.map +1 -0
  97. package/dist/prm/index.d.ts +99 -0
  98. package/dist/prm/index.js +222 -0
  99. package/dist/prm/index.js.map +1 -0
  100. package/dist/query-DODUYdPg.d.ts +30 -0
  101. package/dist/release-report-TDPn1cxq.d.ts +292 -0
  102. package/dist/replay-BL96gCEP.d.ts +226 -0
  103. package/dist/reporting.d.ts +10 -295
  104. package/dist/reporting.js +10 -6
  105. package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-CUOiGcGv.d.ts} +148 -146
  106. package/dist/rl.d.ts +1762 -8
  107. package/dist/rl.js +2035 -58
  108. package/dist/rl.js.map +1 -1
  109. package/dist/rubric-D5tjHNJQ.d.ts +72 -0
  110. package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
  111. package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
  112. package/dist/sequential-Dgz1n51-.d.ts +139 -0
  113. package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
  114. package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-BXGs_9V0.d.ts} +3 -76
  115. package/dist/telemetry/file.js +4 -1
  116. package/dist/telemetry/file.js.map +1 -1
  117. package/dist/telemetry/index.js +57 -57
  118. package/dist/telemetry/index.js.map +1 -1
  119. package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
  120. package/dist/traces.d.ts +142 -387
  121. package/dist/traces.js +1302 -40
  122. package/dist/traces.js.map +1 -1
  123. package/dist/trajectory-CnoBo-JY.d.ts +32 -0
  124. package/dist/wire/index.d.ts +22 -22
  125. package/dist/wire/index.js +4 -3
  126. package/package.json +35 -2
  127. package/dist/chunk-42I2QC2L.js.map +0 -1
  128. package/dist/chunk-4W4NCYM2.js +0 -1945
  129. package/dist/chunk-4W4NCYM2.js.map +0 -1
  130. package/dist/chunk-5IIQKMD5.js.map +0 -1
  131. package/dist/chunk-6KQG5HAH.js.map +0 -1
  132. package/dist/chunk-6M774GY6.js.map +0 -1
  133. package/dist/chunk-7EAUOUQS.js.map +0 -1
  134. package/dist/chunk-AXHNWLIX.js.map +0 -1
  135. package/dist/chunk-EXGR4XEM.js.map +0 -1
  136. package/dist/chunk-IOXMGMHQ.js.map +0 -1
  137. package/dist/chunk-KAO3Q65R.js.map +0 -1
  138. package/dist/chunk-LZKIOBG2.js +0 -2026
  139. package/dist/chunk-LZKIOBG2.js.map +0 -1
  140. package/dist/chunk-QBW3YBTR.js.map +0 -1
  141. package/dist/chunk-QUKKGHTZ.js.map +0 -1
  142. package/dist/chunk-SQQLHODJ.js.map +0 -1
  143. package/dist/chunk-V5QSWN7L.js +0 -1310
  144. package/dist/chunk-V5QSWN7L.js.map +0 -1
  145. package/dist/chunk-VQQSPGSM.js.map +0 -1
  146. package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
  147. package/dist/index-ekBXweiQ.d.ts +0 -1894
  148. package/dist/sequential-DgU2mFsE.d.ts +0 -304
package/CHANGELOG.md CHANGED
@@ -1,5 +1,107 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.24.0 — DX cleanup: framing, stability tags, lint, taxonomy, strict indices
4
+
5
+ This release is **DX + correctness**. No production behavior moved; consumer
6
+ contracts tightened across the board. The library went from 7.5/10 to 10/10 on
7
+ first-touch usability and contract clarity. The visible deltas:
8
+
9
+ ### Strictness
10
+
11
+ - **`noUncheckedIndexedAccess: true`** in `tsconfig.json`. 251 latent
12
+ `T | undefined` sites surfaced and fixed across ~70 files. Loop-bound
13
+ indices documented with `!`, external lookups guarded explicitly, accumulator
14
+ patterns refactored to capture-then-assign. Every fix audited for semantic
15
+ correctness (math code: `!`; untrusted data: guards).
16
+ - **Subpath imports forced.** Six `export * from './X'` wildcards at root
17
+ deleted (`./rl`, `./pipelines`, `./builder-eval`, `./meta-eval`, `./prm`,
18
+ `./trace-analyst`). New subpaths in `package.json`: `/pipelines`,
19
+ `/meta-eval`, `/prm`, `/builder-eval`, `/governance`, `/knowledge`. Root
20
+ re-exports retained only for the load-bearing capture-integrity surface
21
+ (`./trace`, `./knowledge`, `./governance`).
22
+ - **Error taxonomy.** New `src/errors.ts` exports `AgentEvalError` base plus
23
+ `ValidationError`, `NotFoundError`, `ConfigError`, `CaptureIntegrityError`,
24
+ `JudgeError`, `VerificationError`, `ReplayError`. Existing custom errors
25
+ re-parented: `ReplayCacheMissError`, `BudgetBreachError`, `RunIntegrityError`,
26
+ `HoldoutLockedError`, `RunRecordValidationError`, `LlmCallError`,
27
+ `LlmRouteAssertionError`, `TraceFileMissingError`, `TraceNotFoundError`,
28
+ `SpanNotFoundError`. ~25 user-facing `throw new Error(...)` calls migrated
29
+ to typed errors across `rl/*`, `replay`, `sandbox-harness`, `statistics`,
30
+ `release-confidence`, `visual-diff`, `counterfactual`, `run-critic`,
31
+ `observability`. Internal invariant guards intentionally left as plain
32
+ `Error` — those are bugs, not contract failures.
33
+ - **`LlmRouteAssertionError.code` → `reason`** (breaking, greenfield).
34
+ The subclass's route-specific reason now lives on `.reason`; the base
35
+ category `code = 'capture_integrity'` survives via the `AgentEvalError`
36
+ contract.
37
+
38
+ ### Visible deltas
39
+
40
+ ### Changed
41
+
42
+ - **README reframed** as the substrate for self-improving agents. The package
43
+ has shipped `EvalCampaign`, replay, GEPA / reflective mutation, auto-research,
44
+ active curriculum, contamination probes, tournaments, compute curves, PRM,
45
+ off-policy estimators, and sequential anytime-valid stats since 0.22 — the
46
+ README now actually names them, not just "evaluation infrastructure."
47
+
48
+ - **`src/rl/index.ts` carries stability markers** — every re-export is tagged
49
+ `@stable` or `@experimental` via JSDoc. Stable: `run-record-adapters`,
50
+ `verifiable-reward`, `preferences`, `off-policy`, `tournament`,
51
+ `contamination`, `compute-curves`. Experimental: `process-reward`,
52
+ `adversarial`, `active-curriculum`, `reward-hacking`, `adaptation-eval`,
53
+ `exporters`, `rl-campaign`, `predictive-validity-researcher`, `auto-research`.
54
+ Tags are visible in IDE hover and emitted into `dist/rl.d.ts` so consumers
55
+ can see the contract at the call site.
56
+
57
+ ### Added
58
+
59
+ - **Biome lint + format** — `biome.json` codifies the project style (no
60
+ semicolons, single quotes, 2-space indent, 100 col, `noNonNullAssertion`
61
+ off, `useNodejsImportProtocol` on). `pnpm lint` and `pnpm format` scripts.
62
+ - **`.github/workflows/ci.yml`** — runs typecheck + lint + test + build +
63
+ Python pytest on every PR. Previously only the publish workflow on tag
64
+ push exercised this surface; PRs were unguarded.
65
+ - **`ReplayCache.entries()`** — public iterator for the cached
66
+ `(request, response)` pairs. Replaces the bracket-access escape hatch into
67
+ the private `byKey` map. Same semantics, exposed in the type contract.
68
+ - **Per-example READMEs** — `examples/multi-shot-optimization` and
69
+ `examples/same-sandbox-harness` now document what they show, how to run,
70
+ expected output, and adaptation guidance. The other three examples already
71
+ had READMEs; the README index now links to all five.
72
+ - **`clients/python/examples/judge_anti_slop.py`** — runnable script that
73
+ doubles as a pytest, anchoring the `judge` API contract: composite in
74
+ `[0, 1]`, `RubricNotFoundError` for bogus rubric name, `ValidationError`
75
+ for no-rubric call.
76
+
77
+ ### Fixed
78
+
79
+ - **`reflective-mutation.ts`** — local `escape` variable shadowed the global
80
+ `escape` property. Renamed to `escaped`. No behavior change; flagged by
81
+ biome.
82
+
83
+ ## 0.23.1 — FileSystemTraceStore.updateRun no longer double-appends
84
+
85
+ ### Fixed
86
+
87
+ - **`FileSystemTraceStore.updateRun` / `updateSpan`** — once the lazy
88
+ in-memory index had been populated (by any prior `getRun` / `listRuns` /
89
+ `spans` / `events` query), an `updateRun` would mirror the synthetic
90
+ update row back into the index via `appendRun`, throwing
91
+ `run X already exists`. Same root cause for `updateSpan`, which would
92
+ silently insert a phantom duplicate span row. The `append()` helper now
93
+ skips `insertInto` for rows carrying the internal `_update: true` marker;
94
+ `updateRun` / `updateSpan` continue to apply the patch directly via the
95
+ index's `updateRun` / `updateSpan` APIs.
96
+
97
+ Surfaced by tax-agent's canonical eval running multiple variants per
98
+ persona against a shared store: the second variant's `endRun`
99
+ consistently threw, forcing callers to instantiate one store per
100
+ (persona × variant) cell and stitch results back together post-hoc.
101
+ After this fix, a single `FileSystemTraceStore` can fan out runs across
102
+ arbitrarily many cells with interleaved reads, which is the intended
103
+ usage pattern. Regression test added in `tests/trace-store.test.ts`.
104
+
3
105
  ## 0.23.0 — RL primitives + auto-research worked example
4
106
 
5
107
  In addition to the RL bridge primitives below, this release ships the
package/README.md CHANGED
@@ -1,32 +1,39 @@
1
1
  # @tangle-network/agent-eval
2
2
 
3
- Evaluation infrastructure for agent products.
4
-
5
- Use it to wrap the real workflow your users run, record what happened, verify
6
- the result, turn feedback into replay data, compare variants, and ship only
7
- when the evidence improves.
3
+ **Substrate for self-improving agents.** Trace what runs, verify the result,
4
+ turn outcomes into preferences and rewards, mutate prompts and policies under
5
+ anytime-valid evidence, and ship only when the improvement is decisive.
8
6
 
9
7
  ```txt
10
- product task
11
- -> observe state
12
- -> validate with deterministic gates first
13
- -> act through the real product adapter
14
- -> trace + feedback trajectory
15
- -> replay / optimize / release gate
8
+ real product task
9
+ -> observe / act (your runtime)
10
+ -> trace + verifier pipeline (capture integrity)
11
+ -> RunRecord (canonical eval artifact)
12
+ -> judge calibration · paired stats · sequential α
13
+ -> preferences · verifiable rewards · process rewards
14
+ -> GEPA / reflective mutation · auto-research · active curriculum
15
+ -> release gate · replay · contamination probe · tournament rating
16
+ -> next iteration
16
17
  ```
17
18
 
18
- `agent-eval` does not own product state, credentials, UI, storage, model
19
+ `agent-eval` does **not** own product state, credentials, UI, storage, model
19
20
  routing, browser drivers, sandbox policy, or deployment. Products own those.
20
- This package owns eval contracts, loop mechanics, traces, statistics,
21
- optimization inputs, and release evidence.
21
+ This package owns the loop that closes evaluation → preference → mutation →
22
+ redeploy, with capture integrity and statistically rigorous evidence at every
23
+ step.
24
+
25
+ It ships as a TypeScript library (npm) with a generated Python client (PyPI),
26
+ both speaking the same wire protocol. MIT, self-hostable, no SaaS dependency.
22
27
 
23
28
  ## Install
24
29
 
25
30
  ```sh
26
31
  pnpm add @tangle-network/agent-eval
32
+ # or, from Python:
33
+ pip install agent-eval-rpc
27
34
  ```
28
35
 
29
- ## Quick Start
36
+ ## Quick Start — the control loop
30
37
 
31
38
  ```ts
32
39
  import {
@@ -78,68 +85,102 @@ const result = await runAgentControlLoop({
78
85
  await product.storeEvalResult(task.id, result)
79
86
  ```
80
87
 
81
- That loop should be the same shape in production, replay, benchmark, and
82
- optimization. Swap dependencies behind `observe()` and `act()`, not the eval
83
- contract itself.
88
+ Same loop shape in production, replay, benchmark, and optimization. Swap the
89
+ dependencies behind `observe()` and `act()`, never the eval contract.
84
90
 
85
- ## Import Paths
91
+ ## Self-improvement loop
86
92
 
87
- The root export remains available, but new code should prefer focused subpaths:
93
+ Eval doesn't end at "pass/fail." Outcomes become training signal, mutation
94
+ proposals, and curriculum updates — all from the same `RunRecord` produced by
95
+ the control loop.
88
96
 
89
97
  ```ts
90
- import { runAgentControlLoop } from '@tangle-network/agent-eval/control'
91
- import { runMultiShotOptimization } from '@tangle-network/agent-eval/optimization'
92
- import { TraceEmitter } from '@tangle-network/agent-eval/traces'
93
- import { renderReleaseReport } from '@tangle-network/agent-eval/reporting'
98
+ import { runEvalCampaign } from '@tangle-network/agent-eval'
99
+ import {
100
+ extractPreferences,
101
+ extractVerifiableReward,
102
+ filterDeterministicallyRewarded,
103
+ offPolicyEstimateAll,
104
+ analyzeOptimizationResult,
105
+ } from '@tangle-network/agent-eval/rl'
106
+
107
+ // 1. Run a matrix of variants × scenarios with capture integrity by construction.
108
+ const campaign = await runEvalCampaign({ variants, scenarios, run })
109
+
110
+ // 2. Convert outcomes into RL signal.
111
+ const rewards = extractVerifiableReward(campaign.runs) // compile/test/schema
112
+ const prefs = extractPreferences(campaign.runs) // (chosen, rejected) triples
113
+ const clean = filterDeterministicallyRewarded(rewards) // judge-noise free
114
+
115
+ // 3. Estimate a candidate policy's value without re-running.
116
+ const ope = offPolicyEstimateAll(campaign.runs, candidatePolicy) // IPS + SNIPS + DR
117
+
118
+ // 4. Or close the loop end-to-end: score → reflect → mutate → re-run.
119
+ const next = await analyzeOptimizationResult(campaign, { researcher })
94
120
  ```
95
121
 
122
+ | Step | Primitive | Subpath |
123
+ | --- | --- | --- |
124
+ | Eval matrix with integrity | `runEvalCampaign` | `/` |
125
+ | Deterministic re-judge / audit | `ReplayCache`, `createReplayFetch` | `/` |
126
+ | Anytime-valid α across rolling looks | `pairedEvalueSequence` | `/reporting` |
127
+ | Judge quality vs gold | `calibrateJudge` (κ, Pearson, MAE, bias probes) | `/` |
128
+ | (chosen, rejected) for DPO/KTO/PPO | `extractPreferences` | `/rl` |
129
+ | Verifiable reward signal | `extractVerifiableReward` | `/rl` |
130
+ | Step-level / PRM training data | `extractStepRewards`, `prmTrainingPairs` | `/rl` |
131
+ | Estimate policy value off-policy | `offPolicyEstimateAll` (IPS + SNIPS + DR) | `/rl` |
132
+ | GEPA / reflective prompt mutation | `buildReflectionPrompt`, `parseReflectionResponse`, Ax-GEPA `SteeringOptimizer` | `/` `/optimization` |
133
+ | Auto-research (read runs → propose) | `analyzeOptimizationResult`, `PredictiveValidityResearcher` | `/rl` |
134
+ | Active curriculum (variance / Thompson) | `allocateCurriculum` | `/rl` |
135
+ | Tournament ratings (Bradley-Terry + Elo) | `fitBradleyTerry`, `applyEloUpdate` | `/rl` |
136
+ | Adversarial scenario search | `adversarialScenarioSearch` | `/rl` |
137
+ | Contamination probe (held-out perturb) | `runContaminationProbe` | `/rl` |
138
+ | Reward hacking signatures | `detectRewardHacking` | `/rl` |
139
+ | Compute curves (best-of-N, self-consist, Pareto) | `runComputeCurve`, `bestOfN`, `selfConsistency`, `paretoFrontier` | `/rl` |
140
+ | Knowledge gap separated from reasoning gap | `scoreKnowledgeReadiness` | `/` |
141
+ | Release gate (paired evidence + holdouts) | `evaluateReleaseConfidence`, `HeldOutGate` | `/reporting` |
142
+ | Launch report (decision-grade) | `renderReleaseReport`, `researchReport` | `/reporting` |
143
+
144
+ ## Import Paths
145
+
96
146
  | Subpath | Use for |
97
147
  | --- | --- |
98
- | `@tangle-network/agent-eval/control` | `observe -> validate -> decide -> act`, action policy, propose/review loops |
148
+ | `@tangle-network/agent-eval/control` | `observe -> validate -> decide -> act`, action policy, propose/review loops |
99
149
  | `@tangle-network/agent-eval/traces` | trace stores, emitters, TraceAnalyst, replay |
100
- | `@tangle-network/agent-eval/optimization` | feedback trajectories, multi-shot optimization, prompt evolution, EvalCampaign |
101
- | `@tangle-network/agent-eval/reporting` | release confidence, paired stats, sequential e-values, report/table/chart specs, predictive validity |
102
- | `@tangle-network/agent-eval/rl` | RL bridge: adapters, verifiable rewards, preferences, OPE, PRM, contamination, tournaments, adversarial, compute curves |
103
- | `@tangle-network/agent-eval/wire` | HTTP/RPC judge server and schemas |
150
+ | `@tangle-network/agent-eval/optimization` | feedback trajectories, multi-shot, prompt evolution, GEPA, EvalCampaign |
151
+ | `@tangle-network/agent-eval/reporting` | release confidence, paired stats, sequential e-values, launch reports |
152
+ | `@tangle-network/agent-eval/rl` | adapters, verifiable rewards, preferences, OPE, PRM, contamination, tournaments, adversarial, compute curves, auto-research |
153
+ | `@tangle-network/agent-eval/wire` | HTTP/RPC server + schemas (same protocol the Python client speaks) |
104
154
  | `@tangle-network/agent-eval/benchmarks` | benchmark adapter contracts and reference wrappers |
105
155
 
106
- ## Core Pieces
156
+ The root export remains available for convenience; new code should prefer
157
+ focused subpaths. Anything under `/rl` should be imported from `/rl` — root
158
+ re-export is retained only for backward compatibility and will be narrowed in
159
+ 0.25.
160
+
161
+ ## API stability
107
162
 
108
- | Need | Use |
163
+ Public exports are tagged with JSDoc stability markers so consumers can see
164
+ status at the call site (IDE hover, language server, declaration files).
165
+
166
+ | Tag | Meaning |
109
167
  | --- | --- |
110
- | Keep an agent working until objective state passes | `runAgentControlLoop` |
111
- | Turn user/reviewer feedback into replay data | `FeedbackTrajectory` |
112
- | Compare prompt/tool/retrieval policies over full trajectories | `runMultiShotOptimization` |
113
- | Gate releases with paired evidence and holdouts | `evaluateReleaseConfidence`, `HeldOutGate` |
114
- | Explain regressions across trace corpora | `TraceAnalyst` / `analyzeTraces` |
115
- | Report a launch decision | `renderReleaseReport`, `researchReport`, `summaryTable`, `paretoChart`, `gainHistogram` |
116
- | Capture every provider HTTP request / response for forensics | `RawProviderSink`, `LlmClientOptions.rawSink` |
117
- | Fail loud if an eval would silently use the wrong route | `assertLlmRoute` |
118
- | Assert at run-end that the artifact is complete | `assertRunCaptured`, `throwIfRunIncomplete` |
119
- | Auto-execute the trace analyst on every run | `traceAnalystOnRunComplete` + `TraceEmitterOptions.onRunComplete` |
120
- | Run a matrix of variants × scenarios × seeds with capture integrity by construction | `runEvalCampaign` |
121
- | Re-judge / determinism-audit a past campaign for free | `ReplayCache`, `createReplayFetch` |
122
- | Ship-when-decisive with anytime-valid α across rolling looks | `pairedEvalueSequence`, `evaluateInterimReleaseConfidence` |
123
- | Tell load-bearing rubrics from decorative ones using deployment outcomes | `rubricPredictiveValidity` |
124
- | Bridge legacy optimization output to canonical `RunRecord[]` | `trialsToRunRecords`, `verificationReportToRunRecord` |
125
- | Extract a clean reward signal for RL training (compile/test/schema vs judge) | `extractVerifiableReward`, `filterDeterministicallyRewarded` |
126
- | Produce DPO / PPO / KTO `(chosen, rejected)` triples | `extractPreferences` |
127
- | Estimate a new policy's value on old trajectories without re-running | `offPolicyEstimateAll` (IPS + SNIPS + DR) |
128
- | Step-level credit assignment / PRM training data | `extractStepRewards`, `prmTrainingPairs` |
129
- | Detect benchmark contamination via held-out perturbations | `runContaminationProbe` |
130
- | Pairwise tournament ratings for many-candidate sweeps | `fitBradleyTerry`, `applyEloUpdate` |
131
- | Active search for inputs the policy fails on | `adversarialScenarioSearch` |
132
- | Characterise a candidate across compute budgets | `runComputeCurve`, `bestOfN`, `selfConsistency`, `paretoFrontier` |
133
- | Model missing context separately from bad reasoning | `KnowledgeRequirement`, `KnowledgeBundle` |
134
-
135
- ### Capture integrity (0.21+)
168
+ | `@stable` | API frozen at this major. Breaking changes require a major bump. |
169
+ | `@experimental` | Interface may evolve before becoming `@stable`. Pin the patch version if you depend on it. |
170
+ | `@internal` | Not part of the public contract. Use the documented subpath instead. |
171
+
172
+ The `/rl` subpath is the most active surface. See
173
+ [`src/rl/index.ts`](./src/rl/index.ts) for the current stable/experimental
174
+ breakdown.
175
+
176
+ ## Capture integrity (0.21+)
136
177
 
137
178
  Launch-grade benchmark runs need four things that are easy to forget in glue
138
179
  code: (1) raw HTTP capture alongside the structured spans so a reviewer can
139
180
  verify which route answered, (2) a preflight assertion that the configured
140
181
  client points at the intended provider, (3) a run-end assertion that the
141
182
  expected events were actually written, and (4) auto-execution of the trace
142
- analyst as part of the run lifecycle. The wiring fits in a few lines:
183
+ analyst as part of the run lifecycle.
143
184
 
144
185
  ```ts
145
186
  import {
@@ -168,28 +209,33 @@ Directives, rationale, and shipped-bug context are in
168
209
 
169
210
  ## Examples
170
211
 
171
- Runnable examples live in
172
- [`examples/`](https://github.com/tangle-network/agent-eval/tree/main/examples).
212
+ Each example has its own README with what it demonstrates, expected output,
213
+ and runtime. See [`examples/`](./examples/).
173
214
 
174
- - [`examples/multi-shot-optimization`](https://github.com/tangle-network/agent-eval/tree/main/examples/multi-shot-optimization):
215
+ - [`examples/multi-shot-optimization`](./examples/multi-shot-optimization/README.md):
175
216
  optimize full trajectories with held-out promotion.
176
- - [`examples/same-sandbox-harness`](https://github.com/tangle-network/agent-eval/tree/main/examples/same-sandbox-harness):
217
+ - [`examples/same-sandbox-harness`](./examples/same-sandbox-harness/README.md):
177
218
  run setup/build/test and evidence checks in one workspace.
178
- - [`examples/benchmarks`](https://github.com/tangle-network/agent-eval/tree/main/examples/benchmarks):
219
+ - [`examples/benchmarks`](./examples/benchmarks/README.md):
179
220
  benchmark adapter shape and reference wrappers.
221
+ - [`examples/auto-research-with-agent-builder`](./examples/auto-research-with-agent-builder/README.md):
222
+ closed loop — score, reflect, mutate, re-score, repeat.
223
+ - [`examples/fine-tune-with-prime-rl`](./examples/fine-tune-with-prime-rl/README.md):
224
+ RunRecord → preferences → trainer (prime-rl) → next campaign.
180
225
 
181
226
  ## Docs
182
227
 
183
228
  Read in this order:
184
229
 
185
- 1. [Product Eval Adoption](./docs/product-eval-adoption.md)
186
- 2. [Control Runtime](./docs/control-runtime.md)
187
- 3. [Feedback Trajectories](./docs/feedback-trajectories.md)
188
- 4. [Multi-Shot Optimization](./docs/multi-shot-optimization.md)
189
- 5. [Trace Analysis](./docs/trace-analysis.md)
190
- 6. [Knowledge Readiness](./docs/knowledge-readiness.md)
191
- 7. [Integration Launch Gates](./docs/integration-launch-gates.md)
192
- 8. [Wire Protocol](./docs/wire-protocol.md)
230
+ 1. [Concepts](./docs/concepts.md) — mental model, 5 min
231
+ 2. [Product Eval Adoption](./docs/product-eval-adoption.md)
232
+ 3. [Control Runtime](./docs/control-runtime.md)
233
+ 4. [Feedback Trajectories](./docs/feedback-trajectories.md)
234
+ 5. [Multi-Shot Optimization](./docs/multi-shot-optimization.md)
235
+ 6. [Trace Analysis](./docs/trace-analysis.md)
236
+ 7. [Knowledge Readiness](./docs/knowledge-readiness.md)
237
+ 8. [Integration Launch Gates](./docs/integration-launch-gates.md)
238
+ 9. [Wire Protocol](./docs/wire-protocol.md) — required for non-TypeScript consumers
193
239
 
194
240
  ## CLI / Wire Protocol
195
241
 
@@ -198,28 +244,44 @@ npm i -g @tangle-network/agent-eval
198
244
  agent-eval serve --port 5005
199
245
  ```
200
246
 
201
- The Python client lives in `clients/python`:
247
+ Python:
202
248
 
203
249
  ```sh
204
- cd clients/python
205
- pip install -e .
250
+ pip install agent-eval-rpc
206
251
  ```
207
252
 
253
+ ```py
254
+ from agent_eval_rpc import Client
255
+ client = Client() # auto-detects HTTP server, falls back to subprocess
256
+ score = await client.judge(content=output, rubric_name="anti-slop")
257
+ ```
258
+
259
+ TypeScript is the source of truth. Python is a thin transport client over the
260
+ generated OpenAPI schema. Schema drift is prevented at release time
261
+ (version-locked CI).
262
+
208
263
  ## Development
209
264
 
210
265
  ```sh
211
266
  pnpm install
212
267
  pnpm typecheck
213
268
  pnpm test
214
- pnpm build
215
- pnpm openapi
269
+ pnpm lint # biome
270
+ pnpm build # tsup + openapi.json
216
271
  ```
217
272
 
218
273
  ## Related Packages
219
274
 
220
- - `@tangle-network/agent-runtime`: production session/runtime layer.
221
- - `@tangle-network/agent-knowledge`: source-grounded knowledge bases and readiness.
222
- - `@tangle-network/agent-integrations`: connection, grant, capability, and integration invocation contracts.
275
+ - [`@tangle-network/agent-runtime`](https://www.npmjs.com/package/@tangle-network/agent-runtime):
276
+ production session/runtime layer.
277
+ - [`@tangle-network/agent-knowledge`](https://www.npmjs.com/package/@tangle-network/agent-knowledge):
278
+ source-grounded knowledge bases and readiness.
279
+ - [`@tangle-network/agent-integrations`](https://www.npmjs.com/package/@tangle-network/agent-integrations):
280
+ connection, grant, capability, and integration invocation contracts.
281
+
282
+ Together: `agent-runtime` is where the agent runs; `agent-knowledge` is what
283
+ it knows; `agent-integrations` is what it can do; `agent-eval` is how it gets
284
+ better.
223
285
 
224
286
  ## License
225
287
 
@@ -0,0 +1,108 @@
1
+ import { T as TraceStore } from './store-Db2Bv8Cf.js';
2
+
3
+ /**
4
+ * Tool-use metrics — derived purely from trace data.
5
+ *
6
+ * No scoring assumptions: consumers supply optional ground-truth tool
7
+ * selections per turn + optional "information used downstream" signals.
8
+ * Without those, we still compute descriptive metrics (error rate,
9
+ * retry rate, duplicate-call rate) that are useful on their own.
10
+ */
11
+
12
+ interface ToolUseMetrics {
13
+ runId: string;
14
+ totalCalls: number;
15
+ byTool: Record<string, ToolStats>;
16
+ errorRate: number;
17
+ /** Ratio of calls with identical (toolName, argHash) already seen earlier in the same run. */
18
+ duplicateRate: number;
19
+ /** Ratio of error calls followed by ≥1 retry on the same tool. */
20
+ retryRate: number;
21
+ /** Optional: of the calls the agent made, the fraction the evaluator marked as "correct selection". */
22
+ selectionAccuracy?: number;
23
+ }
24
+ interface ToolStats {
25
+ calls: number;
26
+ errors: number;
27
+ avgLatencyMs: number;
28
+ duplicates: number;
29
+ }
30
+ interface ToolUseOptions {
31
+ /** Map of spanId → whether the evaluator judged the tool selection correct. Optional. */
32
+ selectionLabels?: Record<string, boolean>;
33
+ }
34
+ declare function computeToolUseMetrics(store: TraceStore, runId: string, options?: ToolUseOptions): Promise<ToolUseMetrics>;
35
+
36
+ /**
37
+ * Baseline regression detection.
38
+ *
39
+ * Lifted from ADC baseline.ts. Every promotion-blocking signal boils down
40
+ * to: "is this run measurably worse than baseline?" — with enough
41
+ * statistical rigor to distinguish noise from drift.
42
+ *
43
+ * Uses:
44
+ * - Welch's t-test (unequal variance) for per-metric mean comparison
45
+ * - Cohen's d for effect size magnitude
46
+ * - IQR for stability flag (unstable samples can't be trusted for comparisons)
47
+ *
48
+ * Returns a structured verdict: improved | regressed | stable | unstable.
49
+ */
50
+ interface MetricSamples {
51
+ /** Stable metric key (e.g. "overallScore", "firstTokenMs"). */
52
+ metric: string;
53
+ /** Whether higher values are better. */
54
+ higherIsBetter: boolean;
55
+ baseline: number[];
56
+ candidate: number[];
57
+ }
58
+ interface MetricVerdict {
59
+ metric: string;
60
+ baselineMean: number;
61
+ candidateMean: number;
62
+ delta: number;
63
+ cohensD: number;
64
+ welchT: number;
65
+ welchDf: number;
66
+ welchP: number;
67
+ stable: boolean;
68
+ /** IQR of the combined samples — used as a rough stability indicator. */
69
+ iqr: number;
70
+ verdict: 'improved' | 'regressed' | 'stable' | 'unstable';
71
+ }
72
+ interface BaselineReport {
73
+ metrics: MetricVerdict[];
74
+ /** True if any critical metric regressed. */
75
+ hasRegression: boolean;
76
+ /** True if any metric is unstable (too noisy to judge). */
77
+ hasUnstable: boolean;
78
+ }
79
+ interface BaselineOptions {
80
+ /** Effect size threshold for meaningful delta (default 0.5 — medium effect). */
81
+ effectThreshold?: number;
82
+ /** p-value threshold for statistical significance (default 0.05). */
83
+ alpha?: number;
84
+ /** IQR/|mean| ratio above which samples are flagged unstable (default 0.30). */
85
+ unstableCvThreshold?: number;
86
+ }
87
+ /**
88
+ * Compare candidate samples against baseline per metric. Verdict logic:
89
+ * - unstable: IQR/|mean| > threshold on either set — not enough signal
90
+ * - improved: meaningful effect in the "better" direction AND p < alpha
91
+ * - regressed: meaningful effect in the "worse" direction AND p < alpha
92
+ * - stable: otherwise (no significant change)
93
+ */
94
+ declare function compareToBaseline(samples: MetricSamples[], options?: BaselineOptions): BaselineReport;
95
+ /** Inter-quartile range; 0 when the sample has no spread. */
96
+ declare function iqr(xs: number[]): number;
97
+ /**
98
+ * Welch's t-test — unequal-variance two-sample t. Uses the same Student-t
99
+ * CDF as `pairedTTest` (via incomplete beta); falls back to normal tail
100
+ * when df is large.
101
+ */
102
+ declare function welchsTTest(a: number[], b: number[]): {
103
+ t: number;
104
+ df: number;
105
+ p: number;
106
+ };
107
+
108
+ export { type BaselineOptions as B, type MetricSamples as M, type ToolStats as T, type BaselineReport as a, type MetricVerdict as b, computeToolUseMetrics as c, type ToolUseMetrics as d, type ToolUseOptions as e, compareToBaseline as f, iqr as i, welchsTTest as w };
@@ -1,2 +1,3 @@
1
- export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as deterministicSplit, e as routing } from '../index-DDTlbHEK.js';
2
- import '../run-record-DNiOMBrZ.js';
1
+ export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as deterministicSplit, e as routing } from '../index--fVrWDiR.js';
2
+ import '../run-record-CqzahIbx.js';
3
+ import '../errors-BZ9sTdz7.js';
@@ -2,7 +2,7 @@ import {
2
2
  BENCHMARK_SPLIT_SEED,
3
3
  deterministicSplit,
4
4
  routing_exports
5
- } from "../chunk-42I2QC2L.js";
5
+ } from "../chunk-6QDKWHLS.js";
6
6
  import "../chunk-PZ5AY32C.js";
7
7
  export {
8
8
  BENCHMARK_SPLIT_SEED,