@tangle-network/agent-eval 0.23.1 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148) hide show
  1. package/CHANGELOG.md +145 -0
  2. package/README.md +212 -79
  3. package/dist/baseline-4R5deP0N.d.ts +108 -0
  4. package/dist/benchmarks/index.d.ts +3 -2
  5. package/dist/benchmarks/index.js +1 -1
  6. package/dist/builder-eval/index.d.ts +249 -0
  7. package/dist/builder-eval/index.js +391 -0
  8. package/dist/builder-eval/index.js.map +1 -0
  9. package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
  10. package/dist/chunk-2A5XJB43.js.map +1 -0
  11. package/dist/chunk-47X6LRCE.js +76 -0
  12. package/dist/chunk-47X6LRCE.js.map +1 -0
  13. package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
  14. package/dist/chunk-4F5DQN55.js.map +1 -0
  15. package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
  16. package/dist/chunk-4S4BM3QQ.js.map +1 -0
  17. package/dist/chunk-5BKGXME7.js +65 -0
  18. package/dist/chunk-5BKGXME7.js.map +1 -0
  19. package/dist/{chunk-6KQG5HAH.js → chunk-5LBB5B3Z.js} +376 -72
  20. package/dist/chunk-5LBB5B3Z.js.map +1 -0
  21. package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
  22. package/dist/chunk-6QDKWHLS.js.map +1 -0
  23. package/dist/{chunk-VQQSPGSM.js → chunk-EDUKQ5AM.js} +247 -189
  24. package/dist/chunk-EDUKQ5AM.js.map +1 -0
  25. package/dist/chunk-I4MBDTY5.js +272 -0
  26. package/dist/chunk-I4MBDTY5.js.map +1 -0
  27. package/dist/chunk-JLZQWFV3.js +618 -0
  28. package/dist/chunk-JLZQWFV3.js.map +1 -0
  29. package/dist/chunk-K2TPS5LB.js +569 -0
  30. package/dist/chunk-K2TPS5LB.js.map +1 -0
  31. package/dist/chunk-KKHDIONI.js +414 -0
  32. package/dist/chunk-KKHDIONI.js.map +1 -0
  33. package/dist/chunk-KMPRBJK4.js +74 -0
  34. package/dist/chunk-KMPRBJK4.js.map +1 -0
  35. package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
  36. package/dist/chunk-KTGTIOFD.js.map +1 -0
  37. package/dist/chunk-LSH4MMOZ.js +838 -0
  38. package/dist/chunk-LSH4MMOZ.js.map +1 -0
  39. package/dist/chunk-NG236HPC.js +57 -0
  40. package/dist/chunk-NG236HPC.js.map +1 -0
  41. package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
  42. package/dist/chunk-NLMNWKVM.js.map +1 -0
  43. package/dist/chunk-NU65VQ7M.js +99 -0
  44. package/dist/chunk-NU65VQ7M.js.map +1 -0
  45. package/dist/chunk-OWLAAMME.js +250 -0
  46. package/dist/chunk-OWLAAMME.js.map +1 -0
  47. package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
  48. package/dist/chunk-PC4UYEBM.js.map +1 -0
  49. package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
  50. package/dist/chunk-RAF443UI.js.map +1 -0
  51. package/dist/chunk-RZTMDUO7.js +49 -0
  52. package/dist/chunk-RZTMDUO7.js.map +1 -0
  53. package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
  54. package/dist/chunk-SESZDQPX.js.map +1 -0
  55. package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
  56. package/dist/chunk-TVVP3ZZQ.js.map +1 -0
  57. package/dist/chunk-WWYCWKUM.js +196 -0
  58. package/dist/chunk-WWYCWKUM.js.map +1 -0
  59. package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
  60. package/dist/chunk-YRZ4M5GS.js.map +1 -0
  61. package/dist/chunk-ZN274SWR.js +613 -0
  62. package/dist/chunk-ZN274SWR.js.map +1 -0
  63. package/dist/cli.js +10 -6
  64. package/dist/cli.js.map +1 -1
  65. package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
  66. package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
  67. package/dist/control.d.ts +8 -6
  68. package/dist/control.js +10 -7
  69. package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
  70. package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
  71. package/dist/errors-BZ9sTdz7.d.ts +70 -0
  72. package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
  73. package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
  74. package/dist/governance/index.d.ts +5 -0
  75. package/dist/governance/index.js +18 -0
  76. package/dist/governance/index.js.map +1 -0
  77. package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
  78. package/dist/index-Oj9fAPPN.d.ts +270 -0
  79. package/dist/index.d.ts +2018 -3003
  80. package/dist/index.js +7443 -9102
  81. package/dist/index.js.map +1 -1
  82. package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
  83. package/dist/knowledge/index.d.ts +102 -0
  84. package/dist/knowledge/index.js +18 -0
  85. package/dist/knowledge/index.js.map +1 -0
  86. package/dist/meta-eval/index.d.ts +99 -0
  87. package/dist/meta-eval/index.js +324 -0
  88. package/dist/meta-eval/index.js.map +1 -0
  89. package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
  90. package/dist/openapi.json +491 -1
  91. package/dist/optimization.d.ts +11 -8
  92. package/dist/optimization.js +11 -9
  93. package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
  94. package/dist/pipelines/index.d.ts +172 -0
  95. package/dist/pipelines/index.js +345 -0
  96. package/dist/pipelines/index.js.map +1 -0
  97. package/dist/prm/index.d.ts +99 -0
  98. package/dist/prm/index.js +222 -0
  99. package/dist/prm/index.js.map +1 -0
  100. package/dist/query-DODUYdPg.d.ts +30 -0
  101. package/dist/release-report-BNgMdqPF.d.ts +292 -0
  102. package/dist/replay-BL96gCEP.d.ts +226 -0
  103. package/dist/reporting.d.ts +10 -295
  104. package/dist/reporting.js +10 -6
  105. package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-BPT8x_NT.d.ts} +148 -146
  106. package/dist/rl.d.ts +1762 -8
  107. package/dist/rl.js +2035 -58
  108. package/dist/rl.js.map +1 -1
  109. package/dist/rubric-D5tjHNJQ.d.ts +72 -0
  110. package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
  111. package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
  112. package/dist/sequential-Dgz1n51-.d.ts +139 -0
  113. package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
  114. package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-C7VPYEj2.d.ts} +3 -76
  115. package/dist/telemetry/file.js +4 -1
  116. package/dist/telemetry/file.js.map +1 -1
  117. package/dist/telemetry/index.js +57 -57
  118. package/dist/telemetry/index.js.map +1 -1
  119. package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
  120. package/dist/traces.d.ts +142 -387
  121. package/dist/traces.js +1302 -40
  122. package/dist/traces.js.map +1 -1
  123. package/dist/trajectory-CnoBo-JY.d.ts +32 -0
  124. package/dist/wire/index.d.ts +369 -25
  125. package/dist/wire/index.js +22 -3
  126. package/package.json +44 -18
  127. package/dist/chunk-42I2QC2L.js.map +0 -1
  128. package/dist/chunk-5IIQKMD5.js.map +0 -1
  129. package/dist/chunk-6KQG5HAH.js.map +0 -1
  130. package/dist/chunk-6M774GY6.js.map +0 -1
  131. package/dist/chunk-7EAUOUQS.js.map +0 -1
  132. package/dist/chunk-AXHNWLIX.js.map +0 -1
  133. package/dist/chunk-EXGR4XEM.js.map +0 -1
  134. package/dist/chunk-IOXMGMHQ.js.map +0 -1
  135. package/dist/chunk-KAO3Q65R.js.map +0 -1
  136. package/dist/chunk-LZKIOBG2.js +0 -2026
  137. package/dist/chunk-LZKIOBG2.js.map +0 -1
  138. package/dist/chunk-QBW3YBTR.js.map +0 -1
  139. package/dist/chunk-QUKKGHTZ.js.map +0 -1
  140. package/dist/chunk-SQQLHODJ.js.map +0 -1
  141. package/dist/chunk-V5QSWN7L.js +0 -1310
  142. package/dist/chunk-V5QSWN7L.js.map +0 -1
  143. package/dist/chunk-VQQSPGSM.js.map +0 -1
  144. package/dist/chunk-XPHOZPOM.js +0 -1947
  145. package/dist/chunk-XPHOZPOM.js.map +0 -1
  146. package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
  147. package/dist/index-ekBXweiQ.d.ts +0 -1894
  148. package/dist/sequential-DgU2mFsE.d.ts +0 -304
package/CHANGELOG.md CHANGED
@@ -1,5 +1,150 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.25.0 — ProductionLoop primitive: close the eval → prod → eval cycle
4
+
5
+ This release ships the **orchestration layer** that turns the existing
6
+ eval substrate into a continuously-improving production system. Static
7
+ prompts decay; today's regulation flips tomorrow. The pieces to close
8
+ the loop were already in the package (`runMultiShotOptimization`,
9
+ `failureClusterView`, `evaluateReleaseConfidence`, `extractPreferences`,
10
+ `FeedbackTrajectoryStore`, `TraceStore`); this release adds the one
11
+ clean primitive that wires them together end-to-end.
12
+
13
+ ### Added
14
+
15
+ - **`runProductionLoop({ ... })`** (`src/production-loop.ts`,
16
+ `@experimental`) — one call = one cycle. Ingests production traces
17
+ and feedback, clusters failures, runs evolve against the worst
18
+ cluster, gates with `HeldOutGate` + `evaluateReleaseConfidence`
19
+ (fail-closed), and — when wired with an `AutoPrClient` — opens a PR
20
+ with the improved prompt. Idempotent + replayable: same `runId`
21
+ yields the same plan. Cron / GitHub Actions are the consumer's job;
22
+ the primitive doesn't own scheduling.
23
+
24
+ - **`proposeAutomatedPullRequest(client, input)`** + two transports
25
+ (`src/auto-pr.ts`, `@experimental`):
26
+ - `httpGithubClient({ token, ... })` — direct REST against
27
+ `api.github.com`, no extra deps. Idempotent on branch name:
28
+ existing open PRs are returned, not duplicated.
29
+ - `ghCliClient({ ... })` — shells out to `gh` for environments
30
+ where developer auth state is already configured.
31
+ Both validate inputs (no `..` paths, no whitespace branches, no
32
+ duplicate file changes) and surface `ValidationError` / `ConfigError`
33
+ from the typed taxonomy.
34
+
35
+ - **`POST /v1/feedback` + `POST /v1/traces/ingest`** wire endpoints
36
+ (`src/wire/`). Both Zod-validated, both append to the configured
37
+ store (`FeedbackTrajectoryStore` / `TraceStore`). 503 when no store
38
+ is wired (fail loud, not silent). Traces ingest accepts both
39
+ `application/json` (`{events:[...]}`) and `application/x-ndjson` for
40
+ streaming production runtimes. Schemas (`TraceEvent`,
41
+ `FeedbackTrajectory`, `TracesIngestRequest/Response`,
42
+ `FeedbackIngestResponse`) added to `openapi.json` for cross-language
43
+ clients.
44
+
45
+ - **Optional bearer-token auth** on the wire server, configured via
46
+ `createApp({ auth: { bearer: '...' } })` or as a verifier function
47
+ for rotating tokens. `/healthz` and `/v1/version` remain unprotected
48
+ (regression guard: monitoring must never be locked out of the runtime).
49
+
50
+ - **`examples/production-loop/`** — synthetic end-to-end demo wiring
51
+ the loop against in-memory trace + feedback stores and a fake
52
+ auto-PR client. Shows the failure-cluster trigger, the evolve round,
53
+ the gate verdict, and the PR-shaped output without requiring
54
+ credentials or a live model.
55
+
56
+ ### Changed
57
+
58
+ - **Wire server** (`createApp(opts)`) now accepts optional
59
+ `IngestionStores` (`{ traceStore?, feedbackStore? }`) and `auth`.
60
+ Existing zero-arg callers continue to work — judge / rubrics /
61
+ version / healthz are unchanged.
62
+
63
+ ### Status tags
64
+
65
+ - Every new export is `@experimental` initially. Pin the patch version
66
+ if you depend on it. All other 0.24.0 stability tags are preserved.
67
+
68
+ ## 0.24.0 — DX cleanup: framing, stability tags, lint, taxonomy, strict indices
69
+
70
+ This release is **DX + correctness**. No production behavior moved; consumer
71
+ contracts tightened across the board. The library went from 7.5/10 to 10/10 on
72
+ first-touch usability and contract clarity. The visible deltas:
73
+
74
+ ### Strictness
75
+
76
+ - **`noUncheckedIndexedAccess: true`** in `tsconfig.json`. 251 latent
77
+ `T | undefined` sites surfaced and fixed across ~70 files. Loop-bound
78
+ indices documented with `!`, external lookups guarded explicitly, accumulator
79
+ patterns refactored to capture-then-assign. Every fix audited for semantic
80
+ correctness (math code: `!`; untrusted data: guards).
81
+ - **Subpath imports forced.** Six `export * from './X'` wildcards at root
82
+ deleted (`./rl`, `./pipelines`, `./builder-eval`, `./meta-eval`, `./prm`,
83
+ `./trace-analyst`). New subpaths in `package.json`: `/pipelines`,
84
+ `/meta-eval`, `/prm`, `/builder-eval`, `/governance`, `/knowledge`. Root
85
+ re-exports retained only for the load-bearing capture-integrity surface
86
+ (`./trace`, `./knowledge`, `./governance`).
87
+ - **Error taxonomy.** New `src/errors.ts` exports `AgentEvalError` base plus
88
+ `ValidationError`, `NotFoundError`, `ConfigError`, `CaptureIntegrityError`,
89
+ `JudgeError`, `VerificationError`, `ReplayError`. Existing custom errors
90
+ re-parented: `ReplayCacheMissError`, `BudgetBreachError`, `RunIntegrityError`,
91
+ `HoldoutLockedError`, `RunRecordValidationError`, `LlmCallError`,
92
+ `LlmRouteAssertionError`, `TraceFileMissingError`, `TraceNotFoundError`,
93
+ `SpanNotFoundError`. ~25 user-facing `throw new Error(...)` calls migrated
94
+ to typed errors across `rl/*`, `replay`, `sandbox-harness`, `statistics`,
95
+ `release-confidence`, `visual-diff`, `counterfactual`, `run-critic`,
96
+ `observability`. Internal invariant guards intentionally left as plain
97
+ `Error` — those are bugs, not contract failures.
98
+ - **`LlmRouteAssertionError.code` → `reason`** (breaking, greenfield).
99
+ The subclass's route-specific reason now lives on `.reason`; the base
100
+ category `code = 'capture_integrity'` survives via the `AgentEvalError`
101
+ contract.
102
+
103
+ ### Visible deltas
104
+
105
+ ### Changed
106
+
107
+ - **README reframed** as the substrate for self-improving agents. The package
108
+ has shipped `EvalCampaign`, replay, GEPA / reflective mutation, auto-research,
109
+ active curriculum, contamination probes, tournaments, compute curves, PRM,
110
+ off-policy estimators, and sequential anytime-valid stats since 0.22 — the
111
+ README now actually names them, not just "evaluation infrastructure."
112
+
113
+ - **`src/rl/index.ts` carries stability markers** — every re-export is tagged
114
+ `@stable` or `@experimental` via JSDoc. Stable: `run-record-adapters`,
115
+ `verifiable-reward`, `preferences`, `off-policy`, `tournament`,
116
+ `contamination`, `compute-curves`. Experimental: `process-reward`,
117
+ `adversarial`, `active-curriculum`, `reward-hacking`, `adaptation-eval`,
118
+ `exporters`, `rl-campaign`, `predictive-validity-researcher`, `auto-research`.
119
+ Tags are visible in IDE hover and emitted into `dist/rl.d.ts` so consumers
120
+ can see the contract at the call site.
121
+
122
+ ### Added
123
+
124
+ - **Biome lint + format** — `biome.json` codifies the project style (no
125
+ semicolons, single quotes, 2-space indent, 100 col, `noNonNullAssertion`
126
+ off, `useNodejsImportProtocol` on). `pnpm lint` and `pnpm format` scripts.
127
+ - **`.github/workflows/ci.yml`** — runs typecheck + lint + test + build +
128
+ Python pytest on every PR. Previously only the publish workflow on tag
129
+ push exercised this surface; PRs were unguarded.
130
+ - **`ReplayCache.entries()`** — public iterator for the cached
131
+ `(request, response)` pairs. Replaces the bracket-access escape hatch into
132
+ the private `byKey` map. Same semantics, exposed in the type contract.
133
+ - **Per-example READMEs** — `examples/multi-shot-optimization` and
134
+ `examples/same-sandbox-harness` now document what they show, how to run,
135
+ expected output, and adaptation guidance. The other three examples already
136
+ had READMEs; the README index now links to all five.
137
+ - **`clients/python/examples/judge_anti_slop.py`** — runnable script that
138
+ doubles as a pytest, anchoring the `judge` API contract: composite in
139
+ `[0, 1]`, `RubricNotFoundError` for bogus rubric name, `ValidationError`
140
+ for no-rubric call.
141
+
142
+ ### Fixed
143
+
144
+ - **`reflective-mutation.ts`** — local `escape` variable shadowed the global
145
+ `escape` property. Renamed to `escaped`. No behavior change; flagged by
146
+ biome.
147
+
3
148
  ## 0.23.1 — FileSystemTraceStore.updateRun no longer double-appends
4
149
 
5
150
  ### Fixed
package/README.md CHANGED
@@ -1,32 +1,39 @@
1
1
  # @tangle-network/agent-eval
2
2
 
3
- Evaluation infrastructure for agent products.
4
-
5
- Use it to wrap the real workflow your users run, record what happened, verify
6
- the result, turn feedback into replay data, compare variants, and ship only
7
- when the evidence improves.
3
+ **Substrate for self-improving agents.** Trace what runs, verify the result,
4
+ turn outcomes into preferences and rewards, mutate prompts and policies under
5
+ anytime-valid evidence, and ship only when the improvement is decisive.
8
6
 
9
7
  ```txt
10
- product task
11
- -> observe state
12
- -> validate with deterministic gates first
13
- -> act through the real product adapter
14
- -> trace + feedback trajectory
15
- -> replay / optimize / release gate
8
+ real product task
9
+ -> observe / act (your runtime)
10
+ -> trace + verifier pipeline (capture integrity)
11
+ -> RunRecord (canonical eval artifact)
12
+ -> judge calibration · paired stats · sequential α
13
+ -> preferences · verifiable rewards · process rewards
14
+ -> GEPA / reflective mutation · auto-research · active curriculum
15
+ -> release gate · replay · contamination probe · tournament rating
16
+ -> next iteration
16
17
  ```
17
18
 
18
- `agent-eval` does not own product state, credentials, UI, storage, model
19
+ `agent-eval` does **not** own product state, credentials, UI, storage, model
19
20
  routing, browser drivers, sandbox policy, or deployment. Products own those.
20
- This package owns eval contracts, loop mechanics, traces, statistics,
21
- optimization inputs, and release evidence.
21
+ This package owns the loop that closes evaluation → preference → mutation →
22
+ redeploy, with capture integrity and statistically rigorous evidence at every
23
+ step.
24
+
25
+ It ships as a TypeScript library (npm) with a generated Python client (PyPI),
26
+ both speaking the same wire protocol. MIT, self-hostable, no SaaS dependency.
22
27
 
23
28
  ## Install
24
29
 
25
30
  ```sh
26
31
  pnpm add @tangle-network/agent-eval
32
+ # or, from Python:
33
+ pip install agent-eval-rpc
27
34
  ```
28
35
 
29
- ## Quick Start
36
+ ## Quick Start — the control loop
30
37
 
31
38
  ```ts
32
39
  import {
@@ -78,68 +85,171 @@ const result = await runAgentControlLoop({
78
85
  await product.storeEvalResult(task.id, result)
79
86
  ```
80
87
 
81
- That loop should be the same shape in production, replay, benchmark, and
82
- optimization. Swap dependencies behind `observe()` and `act()`, not the eval
83
- contract itself.
88
+ Same loop shape in production, replay, benchmark, and optimization. Swap the
89
+ dependencies behind `observe()` and `act()`, never the eval contract.
84
90
 
85
- ## Import Paths
91
+ ## Production loop — close the eval → prod → eval cycle (0.25.0)
92
+
93
+ Static prompts decay. Yesterday's FTC rule flips today; yesterday's tool quirk
94
+ becomes today's incident. The production agents that win are the ones that
95
+ **continuously re-train against live failure modes**.
86
96
 
87
- The root export remains available, but new code should prefer focused subpaths:
97
+ `runProductionLoop` is the orchestration layer that wires the existing eval
98
+ substrate into a self-improvement cron:
88
99
 
89
100
  ```ts
90
- import { runAgentControlLoop } from '@tangle-network/agent-eval/control'
91
- import { runMultiShotOptimization } from '@tangle-network/agent-eval/optimization'
92
- import { TraceEmitter } from '@tangle-network/agent-eval/traces'
93
- import { renderReleaseReport } from '@tangle-network/agent-eval/reporting'
101
+ import {
102
+ runProductionLoop,
103
+ httpGithubClient,
104
+ FileSystemFeedbackTrajectoryStore,
105
+ } from '@tangle-network/agent-eval'
106
+ import { FileSystemTraceStore } from '@tangle-network/agent-eval/traces'
107
+
108
+ const result = await runProductionLoop({
109
+ runId: `weekly-${new Date().toISOString().slice(0, 10)}`,
110
+ target: 'tax-agent',
111
+
112
+ // 1. Where production traces + feedback land. Wire the HTTP ingestion
113
+ // endpoints (POST /v1/traces/ingest, POST /v1/feedback) from your
114
+ // runtime; the same store reads them here.
115
+ traceStore: new FileSystemTraceStore({ dir: 'data/prod-traces' }),
116
+ feedbackStore: new FileSystemFeedbackTrajectoryStore({ dir: 'data/prod-feedback' }),
117
+
118
+ // 2. Cluster threshold: act on failure groups ≥ 20 runs or ≥ 5% of corpus.
119
+ cluster: { minClusterSize: 20, minSeverityRatio: 0.05, maxClustersPerCycle: 1 },
120
+
121
+ // 3. Evolve: seed = current prompt, gate against holdout scenarios.
122
+ evolve: {
123
+ baselinePrompt: currentSystemPrompt,
124
+ holdoutScenarios: productionShapeScenarios,
125
+ runner, // your agent driver
126
+ scorer, // calibrated judge or rubric
127
+ mutator, // GEPA-style or addendum-style mutator
128
+ gate: {
129
+ baselineKey: 'baseline',
130
+ minProductiveRuns: 5,
131
+ pairedDeltaThreshold: 0.03, // minimum paired delta over baseline on holdout
132
+ overfitGapThreshold: 0.10,
133
+ },
134
+ },
135
+
136
+ // 4. Ship: when the gate passes, open a PR with the new prompt.
137
+ ship: {
138
+ client: httpGithubClient({ token: process.env.GITHUB_TOKEN! }),
139
+ repo: { owner: 'tangle-network', name: 'tax-agent' },
140
+ branchPrefix: 'eval/auto-improve',
141
+ promptFilePath: 'prompts/tax-agent-system.txt',
142
+ reviewers: ['drew'],
143
+ },
144
+
145
+ cron: { cadence: 'weekly' }, // surface-only; consumer schedules
146
+ })
147
+
148
+ console.log(result.decision) // 'pr_opened' | 'gate_failed' | 'no_actionable_failures' | ...
149
+ console.log(result.pullRequest?.prUrl) // populated when a PR was opened
94
150
  ```
95
151
 
152
+ The primitive runs **one cycle**. Schedule it with a cron `schedule` trigger in
153
+ GitHub Actions (add `workflow_dispatch` for manual runs). It is **idempotent + replayable**: same `runId` → same plan.
154
+ Gate failures are fail-closed — a candidate that beats baseline on search but
155
+ overfits on holdout never lands.
156
+
157
+ Full runnable demo (synthetic traces, no credentials) in
158
+ [`examples/production-loop`](./examples/production-loop/README.md).
159
+
160
+ ## Self-improvement loop
161
+
162
+ Eval doesn't end at "pass/fail." Outcomes become training signal, mutation
163
+ proposals, and curriculum updates — all from the same `RunRecord` produced by
164
+ the control loop.
165
+
166
+ ```ts
167
+ import { runEvalCampaign } from '@tangle-network/agent-eval'
168
+ import {
169
+ extractPreferences,
170
+ extractVerifiableReward,
171
+ filterDeterministicallyRewarded,
172
+ offPolicyEstimateAll,
173
+ analyzeOptimizationResult,
174
+ } from '@tangle-network/agent-eval/rl'
175
+
176
+ // 1. Run a matrix of variants × scenarios with capture integrity by construction.
177
+ const campaign = await runEvalCampaign({ variants, scenarios, run })
178
+
179
+ // 2. Convert outcomes into RL signal.
180
+ const rewards = extractVerifiableReward(campaign.runs) // compile/test/schema
181
+ const prefs = extractPreferences(campaign.runs) // (chosen, rejected) triples
182
+ const clean = filterDeterministicallyRewarded(rewards) // judge-noise free
183
+
184
+ // 3. Estimate a candidate policy's value without re-running.
185
+ const ope = offPolicyEstimateAll(campaign.runs, candidatePolicy) // IPS + SNIPS + DR
186
+
187
+ // 4. Or close the loop end-to-end: score → reflect → mutate → re-run.
188
+ const next = await analyzeOptimizationResult(campaign, { researcher })
189
+ ```
190
+
191
+ | Step | Primitive | Subpath |
192
+ | --- | --- | --- |
193
+ | Eval matrix with integrity | `runEvalCampaign` | `/` |
194
+ | Deterministic re-judge / audit | `ReplayCache`, `createReplayFetch` | `/` |
195
+ | Anytime-valid α across rolling looks | `pairedEvalueSequence` | `/reporting` |
196
+ | Judge quality vs gold | `calibrateJudge` (κ, Pearson, MAE, bias probes) | `/` |
197
+ | (chosen, rejected) for DPO/KTO/PPO | `extractPreferences` | `/rl` |
198
+ | Verifiable reward signal | `extractVerifiableReward` | `/rl` |
199
+ | Step-level / PRM training data | `extractStepRewards`, `prmTrainingPairs` | `/rl` |
200
+ | Estimate policy value off-policy | `offPolicyEstimateAll` (IPS + SNIPS + DR) | `/rl` |
201
+ | GEPA / reflective prompt mutation | `buildReflectionPrompt`, `parseReflectionResponse`, Ax-GEPA `SteeringOptimizer` | `/` `/optimization` |
202
+ | Auto-research (read runs → propose) | `analyzeOptimizationResult`, `PredictiveValidityResearcher` | `/rl` |
203
+ | Active curriculum (variance / Thompson) | `allocateCurriculum` | `/rl` |
204
+ | Tournament ratings (Bradley-Terry + Elo) | `fitBradleyTerry`, `applyEloUpdate` | `/rl` |
205
+ | Adversarial scenario search | `adversarialScenarioSearch` | `/rl` |
206
+ | Contamination probe (held-out perturb) | `runContaminationProbe` | `/rl` |
207
+ | Reward hacking signatures | `detectRewardHacking` | `/rl` |
208
+ | Compute curves (best-of-N, self-consist, Pareto) | `runComputeCurve`, `bestOfN`, `selfConsistency`, `paretoFrontier` | `/rl` |
209
+ | Knowledge gap separated from reasoning gap | `scoreKnowledgeReadiness` | `/` |
210
+ | Release gate (paired evidence + holdouts) | `evaluateReleaseConfidence`, `HeldOutGate` | `/reporting` |
211
+ | Launch report (decision-grade) | `renderReleaseReport`, `researchReport` | `/reporting` |
212
+
213
+ ## Import Paths
214
+
96
215
  | Subpath | Use for |
97
216
  | --- | --- |
98
- | `@tangle-network/agent-eval/control` | `observe -> validate -> decide -> act`, action policy, propose/review loops |
217
+ | `@tangle-network/agent-eval/control` | `observe -> validate -> decide -> act`, action policy, propose/review loops |
99
218
  | `@tangle-network/agent-eval/traces` | trace stores, emitters, TraceAnalyst, replay |
100
- | `@tangle-network/agent-eval/optimization` | feedback trajectories, multi-shot optimization, prompt evolution, EvalCampaign |
101
- | `@tangle-network/agent-eval/reporting` | release confidence, paired stats, sequential e-values, report/table/chart specs, predictive validity |
102
- | `@tangle-network/agent-eval/rl` | RL bridge: adapters, verifiable rewards, preferences, OPE, PRM, contamination, tournaments, adversarial, compute curves |
103
- | `@tangle-network/agent-eval/wire` | HTTP/RPC judge server and schemas |
219
+ | `@tangle-network/agent-eval/optimization` | feedback trajectories, multi-shot, prompt evolution, GEPA, EvalCampaign |
220
+ | `@tangle-network/agent-eval/reporting` | release confidence, paired stats, sequential e-values, launch reports |
221
+ | `@tangle-network/agent-eval/rl` | adapters, verifiable rewards, preferences, OPE, PRM, contamination, tournaments, adversarial, compute curves, auto-research |
222
+ | `@tangle-network/agent-eval/wire` | HTTP/RPC server + schemas (same protocol the Python client speaks) |
104
223
  | `@tangle-network/agent-eval/benchmarks` | benchmark adapter contracts and reference wrappers |
105
224
 
106
- ## Core Pieces
225
+ The root export remains available for convenience; new code should prefer
226
+ focused subpaths. Anything under `/rl` must be imported from `/rl` — the root
227
+ `export * from './rl'` wildcard was deleted in 0.24.0 (see the changelog's
228
+ "Subpath imports forced" entry).
229
+
230
+ ## API stability
107
231
 
108
- | Need | Use |
232
+ Public exports are tagged with JSDoc stability markers so consumers can see
233
+ status at the call site (IDE hover, language server, declaration files).
234
+
235
+ | Tag | Meaning |
109
236
  | --- | --- |
110
- | Keep an agent working until objective state passes | `runAgentControlLoop` |
111
- | Turn user/reviewer feedback into replay data | `FeedbackTrajectory` |
112
- | Compare prompt/tool/retrieval policies over full trajectories | `runMultiShotOptimization` |
113
- | Gate releases with paired evidence and holdouts | `evaluateReleaseConfidence`, `HeldOutGate` |
114
- | Explain regressions across trace corpora | `TraceAnalyst` / `analyzeTraces` |
115
- | Report a launch decision | `renderReleaseReport`, `researchReport`, `summaryTable`, `paretoChart`, `gainHistogram` |
116
- | Capture every provider HTTP request / response for forensics | `RawProviderSink`, `LlmClientOptions.rawSink` |
117
- | Fail loud if an eval would silently use the wrong route | `assertLlmRoute` |
118
- | Assert at run-end that the artifact is complete | `assertRunCaptured`, `throwIfRunIncomplete` |
119
- | Auto-execute the trace analyst on every run | `traceAnalystOnRunComplete` + `TraceEmitterOptions.onRunComplete` |
120
- | Run a matrix of variants × scenarios × seeds with capture integrity by construction | `runEvalCampaign` |
121
- | Re-judge / determinism-audit a past campaign for free | `ReplayCache`, `createReplayFetch` |
122
- | Ship-when-decisive with anytime-valid α across rolling looks | `pairedEvalueSequence`, `evaluateInterimReleaseConfidence` |
123
- | Tell load-bearing rubrics from decorative ones using deployment outcomes | `rubricPredictiveValidity` |
124
- | Bridge legacy optimization output to canonical `RunRecord[]` | `trialsToRunRecords`, `verificationReportToRunRecord` |
125
- | Extract a clean reward signal for RL training (compile/test/schema vs judge) | `extractVerifiableReward`, `filterDeterministicallyRewarded` |
126
- | Produce DPO / PPO / KTO `(chosen, rejected)` triples | `extractPreferences` |
127
- | Estimate a new policy's value on old trajectories without re-running | `offPolicyEstimateAll` (IPS + SNIPS + DR) |
128
- | Step-level credit assignment / PRM training data | `extractStepRewards`, `prmTrainingPairs` |
129
- | Detect benchmark contamination via held-out perturbations | `runContaminationProbe` |
130
- | Pairwise tournament ratings for many-candidate sweeps | `fitBradleyTerry`, `applyEloUpdate` |
131
- | Active search for inputs the policy fails on | `adversarialScenarioSearch` |
132
- | Characterise a candidate across compute budgets | `runComputeCurve`, `bestOfN`, `selfConsistency`, `paretoFrontier` |
133
- | Model missing context separately from bad reasoning | `KnowledgeRequirement`, `KnowledgeBundle` |
134
-
135
- ### Capture integrity (0.21+)
237
+ | `@stable` | API frozen at this major. Breaking changes require a major bump. |
238
+ | `@experimental` | Interface may evolve before becoming `@stable`. Pin the patch version if you depend on it. |
239
+ | `@internal` | Not part of the public contract. Use the documented subpath instead. |
240
+
241
+ The `/rl` subpath is the most active surface. See
242
+ [`src/rl/index.ts`](./src/rl/index.ts) for the current stable/experimental
243
+ breakdown.
244
+
245
+ ## Capture integrity (0.21+)
136
246
 
137
247
  Launch-grade benchmark runs need four things that are easy to forget in glue
138
248
  code: (1) raw HTTP capture alongside the structured spans so a reviewer can
139
249
  verify which route answered, (2) a preflight assertion that the configured
140
250
  client points at the intended provider, (3) a run-end assertion that the
141
251
  expected events were actually written, and (4) auto-execution of the trace
142
- analyst as part of the run lifecycle. The wiring fits in a few lines:
252
+ analyst as part of the run lifecycle.
143
253
 
144
254
  ```ts
145
255
  import {
@@ -168,28 +278,35 @@ Directives, rationale, and shipped-bug context are in
168
278
 
169
279
  ## Examples
170
280
 
171
- Runnable examples live in
172
- [`examples/`](https://github.com/tangle-network/agent-eval/tree/main/examples).
281
+ Each example has its own README with what it demonstrates, expected output,
282
+ and runtime. See [`examples/`](./examples/).
173
283
 
174
- - [`examples/multi-shot-optimization`](https://github.com/tangle-network/agent-eval/tree/main/examples/multi-shot-optimization):
284
+ - [`examples/multi-shot-optimization`](./examples/multi-shot-optimization/README.md):
175
285
  optimize full trajectories with held-out promotion.
176
- - [`examples/same-sandbox-harness`](https://github.com/tangle-network/agent-eval/tree/main/examples/same-sandbox-harness):
286
+ - [`examples/same-sandbox-harness`](./examples/same-sandbox-harness/README.md):
177
287
  run setup/build/test and evidence checks in one workspace.
178
- - [`examples/benchmarks`](https://github.com/tangle-network/agent-eval/tree/main/examples/benchmarks):
288
+ - [`examples/benchmarks`](./examples/benchmarks/README.md):
179
289
  benchmark adapter shape and reference wrappers.
290
+ - [`examples/auto-research-with-agent-builder`](./examples/auto-research-with-agent-builder/README.md):
291
+ closed loop — score, reflect, mutate, re-score, repeat.
292
+ - [`examples/fine-tune-with-prime-rl`](./examples/fine-tune-with-prime-rl/README.md):
293
+ RunRecord → preferences → trainer (prime-rl) → next campaign.
294
+ - [`examples/production-loop`](./examples/production-loop/README.md):
295
+ ingest prod traces + feedback, cluster failures, evolve, gate, open a PR.
180
296
 
181
297
  ## Docs
182
298
 
183
299
  Read in this order:
184
300
 
185
- 1. [Product Eval Adoption](./docs/product-eval-adoption.md)
186
- 2. [Control Runtime](./docs/control-runtime.md)
187
- 3. [Feedback Trajectories](./docs/feedback-trajectories.md)
188
- 4. [Multi-Shot Optimization](./docs/multi-shot-optimization.md)
189
- 5. [Trace Analysis](./docs/trace-analysis.md)
190
- 6. [Knowledge Readiness](./docs/knowledge-readiness.md)
191
- 7. [Integration Launch Gates](./docs/integration-launch-gates.md)
192
- 8. [Wire Protocol](./docs/wire-protocol.md)
301
+ 1. [Concepts](./docs/concepts.md) — mental model, 5 min
302
+ 2. [Product Eval Adoption](./docs/product-eval-adoption.md)
303
+ 3. [Control Runtime](./docs/control-runtime.md)
304
+ 4. [Feedback Trajectories](./docs/feedback-trajectories.md)
305
+ 5. [Multi-Shot Optimization](./docs/multi-shot-optimization.md)
306
+ 6. [Trace Analysis](./docs/trace-analysis.md)
307
+ 7. [Knowledge Readiness](./docs/knowledge-readiness.md)
308
+ 8. [Integration Launch Gates](./docs/integration-launch-gates.md)
309
+ 9. [Wire Protocol](./docs/wire-protocol.md) — required for non-TypeScript consumers
193
310
 
194
311
  ## CLI / Wire Protocol
195
312
 
@@ -198,28 +315,44 @@ npm i -g @tangle-network/agent-eval
198
315
  agent-eval serve --port 5005
199
316
  ```
200
317
 
201
- The Python client lives in `clients/python`:
318
+ Python:
202
319
 
203
320
  ```sh
204
- cd clients/python
205
- pip install -e .
321
+ pip install agent-eval-rpc
206
322
  ```
207
323
 
324
+ ```py
325
+ from agent_eval_rpc import Client
326
+ client = Client() # auto-detects HTTP server, falls back to subprocess
327
+ score = await client.judge(content=output, rubric_name="anti-slop")
328
+ ```
329
+
330
+ TypeScript is the source of truth. Python is a thin transport client over the
331
+ generated OpenAPI schema. Schema drift is made impossible at release time
332
+ (version-locked CI).
333
+
208
334
  ## Development
209
335
 
210
336
  ```sh
211
337
  pnpm install
212
338
  pnpm typecheck
213
339
  pnpm test
214
- pnpm build
215
- pnpm openapi
340
+ pnpm lint # biome
341
+ pnpm build # tsup + openapi.json
216
342
  ```
217
343
 
218
344
  ## Related Packages
219
345
 
220
- - `@tangle-network/agent-runtime`: production session/runtime layer.
221
- - `@tangle-network/agent-knowledge`: source-grounded knowledge bases and readiness.
222
- - `@tangle-network/agent-integrations`: connection, grant, capability, and integration invocation contracts.
346
+ - [`@tangle-network/agent-runtime`](https://www.npmjs.com/package/@tangle-network/agent-runtime):
347
+ production session/runtime layer.
348
+ - [`@tangle-network/agent-knowledge`](https://www.npmjs.com/package/@tangle-network/agent-knowledge):
349
+ source-grounded knowledge bases and readiness.
350
+ - [`@tangle-network/agent-integrations`](https://www.npmjs.com/package/@tangle-network/agent-integrations):
351
+ connection, grant, capability, and integration invocation contracts.
352
+
353
+ Together: `agent-runtime` is where the agent runs; `agent-knowledge` is what
354
+ it knows; `agent-integrations` is what it can do; `agent-eval` is how it gets
355
+ better.
223
356
 
224
357
  ## License
225
358
 
@@ -0,0 +1,108 @@
1
+ import { T as TraceStore } from './store-Db2Bv8Cf.js';
2
+
3
+ /**
4
+ * Tool-use metrics — derived purely from trace data.
5
+ *
6
+ * No scoring assumptions: consumers supply optional ground-truth tool
7
+ * selections per turn + optional "information used downstream" signals.
8
+ * Without those, we still compute descriptive metrics (error rate,
9
+ * retry rate, duplicate-call rate) that are useful on their own.
10
+ */
11
+
12
+ interface ToolUseMetrics {
13
+ runId: string;
14
+ totalCalls: number;
15
+ byTool: Record<string, ToolStats>;
16
+ errorRate: number;
17
+ /** Ratio of calls with identical (toolName, argHash) already seen earlier in the same run. */
18
+ duplicateRate: number;
19
+ /** Ratio of error calls followed by ≥1 retry on the same tool. */
20
+ retryRate: number;
21
+ /** Optional: of the calls the agent made, the fraction the evaluator marked as "correct selection". */
22
+ selectionAccuracy?: number;
23
+ }
24
+ interface ToolStats {
25
+ calls: number;
26
+ errors: number;
27
+ avgLatencyMs: number;
28
+ duplicates: number;
29
+ }
30
+ interface ToolUseOptions {
31
+ /** Map of spanId → whether the evaluator judged the tool selection correct. Optional. */
32
+ selectionLabels?: Record<string, boolean>;
33
+ }
34
+ declare function computeToolUseMetrics(store: TraceStore, runId: string, options?: ToolUseOptions): Promise<ToolUseMetrics>;
35
+
36
+ /**
37
+ * Baseline regression detection.
38
+ *
39
+ * Lifted from ADC baseline.ts. Every promotion-blocking signal boils down
40
+ * to: "is this run measurably worse than baseline?" — with enough
41
+ * statistical rigor to distinguish noise from drift.
42
+ *
43
+ * Uses:
44
+ * - Welch's t-test (unequal variance) for per-metric mean comparison
45
+ * - Cohen's d for effect size magnitude
46
+ * - IQR for stability flag (unstable samples can't be trusted for comparisons)
47
+ *
48
+ * Returns a structured verdict: improved | regressed | stable | unstable.
49
+ */
50
+ interface MetricSamples {
51
+ /** Stable metric key (e.g. "overallScore", "firstTokenMs"). */
52
+ metric: string;
53
+ /** Whether higher values are better. */
54
+ higherIsBetter: boolean;
55
+ baseline: number[];
56
+ candidate: number[];
57
+ }
58
+ interface MetricVerdict {
59
+ metric: string;
60
+ baselineMean: number;
61
+ candidateMean: number;
62
+ delta: number;
63
+ cohensD: number;
64
+ welchT: number;
65
+ welchDf: number;
66
+ welchP: number;
67
+ stable: boolean;
68
+ /** IQR of the combined samples — used as a rough stability indicator. */
69
+ iqr: number;
70
+ verdict: 'improved' | 'regressed' | 'stable' | 'unstable';
71
+ }
72
+ interface BaselineReport {
73
+ metrics: MetricVerdict[];
74
+ /** True if any critical metric regressed. */
75
+ hasRegression: boolean;
76
+ /** True if any metric is unstable (too noisy to judge). */
77
+ hasUnstable: boolean;
78
+ }
79
+ interface BaselineOptions {
80
+ /** Effect size threshold for meaningful delta (default 0.5 — medium effect). */
81
+ effectThreshold?: number;
82
+ /** p-value threshold for statistical significance (default 0.05). */
83
+ alpha?: number;
84
+ /** IQR/|mean| ratio above which samples are flagged unstable (default 0.30). */
85
+ unstableCvThreshold?: number;
86
+ }
87
+ /**
88
+ * Compare candidate samples against baseline per metric. Verdict logic:
89
+ * - unstable: IQR/|mean| > threshold on either set — not enough signal
90
+ * - improved: meaningful effect in the "better" direction AND p < alpha
91
+ * - regressed: meaningful effect in the "worse" direction AND p < alpha
92
+ * - stable: otherwise (no significant change)
93
+ */
94
+ declare function compareToBaseline(samples: MetricSamples[], options?: BaselineOptions): BaselineReport;
95
+ /** Inter-quartile range; 0 when the sample has no spread. */
96
+ declare function iqr(xs: number[]): number;
97
+ /**
98
+ * Welch's t-test — unequal-variance two-sample t. Uses the same Student-t
99
+ * CDF as `pairedTTest` (via incomplete beta); falls back to normal tail
100
+ * when df is large.
101
+ */
102
+ declare function welchsTTest(a: number[], b: number[]): {
103
+ t: number;
104
+ df: number;
105
+ p: number;
106
+ };
107
+
108
+ export { type BaselineOptions as B, type MetricSamples as M, type ToolStats as T, type BaselineReport as a, type MetricVerdict as b, computeToolUseMetrics as c, type ToolUseMetrics as d, type ToolUseOptions as e, compareToBaseline as f, iqr as i, welchsTTest as w };
@@ -1,2 +1,3 @@
1
- export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as deterministicSplit, e as routing } from '../index-DDTlbHEK.js';
2
- import '../run-record-DNiOMBrZ.js';
1
+ export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as deterministicSplit, e as routing } from '../index--fVrWDiR.js';
2
+ import '../run-record-CqzahIbx.js';
3
+ import '../errors-BZ9sTdz7.js';
@@ -2,7 +2,7 @@ import {
2
2
  BENCHMARK_SPLIT_SEED,
3
3
  deterministicSplit,
4
4
  routing_exports
5
- } from "../chunk-42I2QC2L.js";
5
+ } from "../chunk-6QDKWHLS.js";
6
6
  import "../chunk-PZ5AY32C.js";
7
7
  export {
8
8
  BENCHMARK_SPLIT_SEED,