@tangle-network/agent-eval 0.23.1 → 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +80 -0
- package/README.md +141 -79
- package/dist/baseline-4R5deP0N.d.ts +108 -0
- package/dist/benchmarks/index.d.ts +3 -2
- package/dist/benchmarks/index.js +1 -1
- package/dist/builder-eval/index.d.ts +249 -0
- package/dist/builder-eval/index.js +391 -0
- package/dist/builder-eval/index.js.map +1 -0
- package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
- package/dist/chunk-2A5XJB43.js.map +1 -0
- package/dist/chunk-47X6LRCE.js +76 -0
- package/dist/chunk-47X6LRCE.js.map +1 -0
- package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
- package/dist/chunk-4F5DQN55.js.map +1 -0
- package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
- package/dist/chunk-4S4BM3QQ.js.map +1 -0
- package/dist/chunk-5BKGXME7.js +65 -0
- package/dist/chunk-5BKGXME7.js.map +1 -0
- package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
- package/dist/chunk-6QDKWHLS.js.map +1 -0
- package/dist/chunk-I4MBDTY5.js +272 -0
- package/dist/chunk-I4MBDTY5.js.map +1 -0
- package/dist/chunk-K2TPS5LB.js +569 -0
- package/dist/chunk-K2TPS5LB.js.map +1 -0
- package/dist/chunk-KKHDIONI.js +414 -0
- package/dist/chunk-KKHDIONI.js.map +1 -0
- package/dist/chunk-KMPRBJK4.js +74 -0
- package/dist/chunk-KMPRBJK4.js.map +1 -0
- package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
- package/dist/chunk-KTGTIOFD.js.map +1 -0
- package/dist/chunk-LSH4MMOZ.js +838 -0
- package/dist/chunk-LSH4MMOZ.js.map +1 -0
- package/dist/chunk-NG236HPC.js +57 -0
- package/dist/chunk-NG236HPC.js.map +1 -0
- package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
- package/dist/chunk-NLMNWKVM.js.map +1 -0
- package/dist/chunk-NU65VQ7M.js +99 -0
- package/dist/chunk-NU65VQ7M.js.map +1 -0
- package/dist/chunk-OHEPNJQN.js +554 -0
- package/dist/chunk-OHEPNJQN.js.map +1 -0
- package/dist/chunk-OWLAAMME.js +250 -0
- package/dist/chunk-OWLAAMME.js.map +1 -0
- package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
- package/dist/chunk-PC4UYEBM.js.map +1 -0
- package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
- package/dist/chunk-RAF443UI.js.map +1 -0
- package/dist/chunk-RZTMDUO7.js +49 -0
- package/dist/chunk-RZTMDUO7.js.map +1 -0
- package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
- package/dist/chunk-SESZDQPX.js.map +1 -0
- package/dist/{chunk-6KQG5HAH.js → chunk-SY6WAAAD.js} +84 -71
- package/dist/chunk-SY6WAAAD.js.map +1 -0
- package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
- package/dist/chunk-TVVP3ZZQ.js.map +1 -0
- package/dist/{chunk-VQQSPGSM.js → chunk-VRJVTXRV.js} +169 -111
- package/dist/chunk-VRJVTXRV.js.map +1 -0
- package/dist/chunk-WWYCWKUM.js +196 -0
- package/dist/chunk-WWYCWKUM.js.map +1 -0
- package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
- package/dist/chunk-YRZ4M5GS.js.map +1 -0
- package/dist/chunk-ZN274SWR.js +613 -0
- package/dist/chunk-ZN274SWR.js.map +1 -0
- package/dist/cli.js +10 -6
- package/dist/cli.js.map +1 -1
- package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
- package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
- package/dist/control.d.ts +8 -6
- package/dist/control.js +10 -7
- package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
- package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
- package/dist/errors-BZ9sTdz7.d.ts +70 -0
- package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
- package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
- package/dist/governance/index.d.ts +5 -0
- package/dist/governance/index.js +18 -0
- package/dist/governance/index.js.map +1 -0
- package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
- package/dist/index-Oj9fAPPN.d.ts +270 -0
- package/dist/index.d.ts +1866 -3151
- package/dist/index.js +5457 -7809
- package/dist/index.js.map +1 -1
- package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
- package/dist/knowledge/index.d.ts +102 -0
- package/dist/knowledge/index.js +18 -0
- package/dist/knowledge/index.js.map +1 -0
- package/dist/meta-eval/index.d.ts +99 -0
- package/dist/meta-eval/index.js +324 -0
- package/dist/meta-eval/index.js.map +1 -0
- package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +11 -8
- package/dist/optimization.js +11 -9
- package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
- package/dist/pipelines/index.d.ts +172 -0
- package/dist/pipelines/index.js +409 -0
- package/dist/pipelines/index.js.map +1 -0
- package/dist/prm/index.d.ts +99 -0
- package/dist/prm/index.js +222 -0
- package/dist/prm/index.js.map +1 -0
- package/dist/query-DODUYdPg.d.ts +30 -0
- package/dist/release-report-TDPn1cxq.d.ts +292 -0
- package/dist/replay-BL96gCEP.d.ts +226 -0
- package/dist/reporting.d.ts +10 -295
- package/dist/reporting.js +10 -6
- package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-CUOiGcGv.d.ts} +148 -146
- package/dist/rl.d.ts +1762 -8
- package/dist/rl.js +2035 -58
- package/dist/rl.js.map +1 -1
- package/dist/rubric-D5tjHNJQ.d.ts +72 -0
- package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
- package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
- package/dist/sequential-Dgz1n51-.d.ts +139 -0
- package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
- package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-BXGs_9V0.d.ts} +3 -76
- package/dist/telemetry/file.js +4 -1
- package/dist/telemetry/file.js.map +1 -1
- package/dist/telemetry/index.js +57 -57
- package/dist/telemetry/index.js.map +1 -1
- package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
- package/dist/traces.d.ts +142 -387
- package/dist/traces.js +1302 -40
- package/dist/traces.js.map +1 -1
- package/dist/trajectory-CnoBo-JY.d.ts +32 -0
- package/dist/wire/index.d.ts +22 -22
- package/dist/wire/index.js +4 -3
- package/package.json +44 -18
- package/dist/chunk-42I2QC2L.js.map +0 -1
- package/dist/chunk-5IIQKMD5.js.map +0 -1
- package/dist/chunk-6KQG5HAH.js.map +0 -1
- package/dist/chunk-6M774GY6.js.map +0 -1
- package/dist/chunk-7EAUOUQS.js.map +0 -1
- package/dist/chunk-AXHNWLIX.js.map +0 -1
- package/dist/chunk-EXGR4XEM.js.map +0 -1
- package/dist/chunk-IOXMGMHQ.js.map +0 -1
- package/dist/chunk-KAO3Q65R.js.map +0 -1
- package/dist/chunk-LZKIOBG2.js +0 -2026
- package/dist/chunk-LZKIOBG2.js.map +0 -1
- package/dist/chunk-QBW3YBTR.js.map +0 -1
- package/dist/chunk-QUKKGHTZ.js.map +0 -1
- package/dist/chunk-SQQLHODJ.js.map +0 -1
- package/dist/chunk-V5QSWN7L.js +0 -1310
- package/dist/chunk-V5QSWN7L.js.map +0 -1
- package/dist/chunk-VQQSPGSM.js.map +0 -1
- package/dist/chunk-XPHOZPOM.js +0 -1947
- package/dist/chunk-XPHOZPOM.js.map +0 -1
- package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
- package/dist/index-ekBXweiQ.d.ts +0 -1894
- package/dist/sequential-DgU2mFsE.d.ts +0 -304
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,85 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.24.0 — DX cleanup: framing, stability tags, lint, taxonomy, strict indices
|
|
4
|
+
|
|
5
|
+
This release is **DX + correctness**. No production behavior moved; consumer
|
|
6
|
+
contracts tightened across the board. The library went from 7.5/10 to 10/10 on
|
|
7
|
+
first-touch usability and contract clarity. The visible deltas:
|
|
8
|
+
|
|
9
|
+
### Strictness
|
|
10
|
+
|
|
11
|
+
- **`noUncheckedIndexedAccess: true`** in `tsconfig.json`. 251 latent
|
|
12
|
+
`T | undefined` sites surfaced and fixed across ~70 files. Loop-bound
|
|
13
|
+
indices documented with `!`, external lookups guarded explicitly, accumulator
|
|
14
|
+
patterns refactored to capture-then-assign. Every fix audited for semantic
|
|
15
|
+
correctness (math code: `!`; untrusted data: guards).
|
|
16
|
+
- **Subpath imports forced.** Six `export * from './X'` wildcards at root
|
|
17
|
+
deleted (`./rl`, `./pipelines`, `./builder-eval`, `./meta-eval`, `./prm`,
|
|
18
|
+
`./trace-analyst`). New subpaths in `package.json`: `/pipelines`,
|
|
19
|
+
`/meta-eval`, `/prm`, `/builder-eval`, `/governance`, `/knowledge`. Root
|
|
20
|
+
re-exports retained only for the load-bearing capture-integrity surface
|
|
21
|
+
(`./trace`, `./knowledge`, `./governance`).
|
|
22
|
+
- **Error taxonomy.** New `src/errors.ts` exports `AgentEvalError` base plus
|
|
23
|
+
`ValidationError`, `NotFoundError`, `ConfigError`, `CaptureIntegrityError`,
|
|
24
|
+
`JudgeError`, `VerificationError`, `ReplayError`. Existing custom errors
|
|
25
|
+
re-parented: `ReplayCacheMissError`, `BudgetBreachError`, `RunIntegrityError`,
|
|
26
|
+
`HoldoutLockedError`, `RunRecordValidationError`, `LlmCallError`,
|
|
27
|
+
`LlmRouteAssertionError`, `TraceFileMissingError`, `TraceNotFoundError`,
|
|
28
|
+
`SpanNotFoundError`. ~25 user-facing `throw new Error(...)` calls migrated
|
|
29
|
+
to typed errors across `rl/*`, `replay`, `sandbox-harness`, `statistics`,
|
|
30
|
+
`release-confidence`, `visual-diff`, `counterfactual`, `run-critic`,
|
|
31
|
+
`observability`. Internal invariant guards intentionally left as plain
|
|
32
|
+
`Error` — those are bugs, not contract failures.
|
|
33
|
+
- **`LlmRouteAssertionError.code` → `reason`** (breaking, greenfield).
|
|
34
|
+
The subclass's route-specific reason now lives on `.reason`; the base
|
|
35
|
+
category `code = 'capture_integrity'` survives via the `AgentEvalError`
|
|
36
|
+
contract.
|
|
37
|
+
|
|
38
|
+
### Visible deltas
|
|
39
|
+
|
|
40
|
+
### Changed
|
|
41
|
+
|
|
42
|
+
- **README reframed** as the substrate for self-improving agents. The package
|
|
43
|
+
has shipped `EvalCampaign`, replay, GEPA / reflective mutation, auto-research,
|
|
44
|
+
active curriculum, contamination probes, tournaments, compute curves, PRM,
|
|
45
|
+
off-policy estimators, and sequential anytime-valid stats since 0.22 — the
|
|
46
|
+
README now actually names them, not just "evaluation infrastructure."
|
|
47
|
+
|
|
48
|
+
- **`src/rl/index.ts` carries stability markers** — every re-export is tagged
|
|
49
|
+
`@stable` or `@experimental` via JSDoc. Stable: `run-record-adapters`,
|
|
50
|
+
`verifiable-reward`, `preferences`, `off-policy`, `tournament`,
|
|
51
|
+
`contamination`, `compute-curves`. Experimental: `process-reward`,
|
|
52
|
+
`adversarial`, `active-curriculum`, `reward-hacking`, `adaptation-eval`,
|
|
53
|
+
`exporters`, `rl-campaign`, `predictive-validity-researcher`, `auto-research`.
|
|
54
|
+
Tags are visible in IDE hover and emitted into `dist/rl.d.ts` so consumers
|
|
55
|
+
can see the contract at the call site.
|
|
56
|
+
|
|
57
|
+
### Added
|
|
58
|
+
|
|
59
|
+
- **Biome lint + format** — `biome.json` codifies the project style (no
|
|
60
|
+
semicolons, single quotes, 2-space indent, 100 col, `noNonNullAssertion`
|
|
61
|
+
off, `useNodejsImportProtocol` on). `pnpm lint` and `pnpm format` scripts.
|
|
62
|
+
- **`.github/workflows/ci.yml`** — runs typecheck + lint + test + build +
|
|
63
|
+
Python pytest on every PR. Previously only the publish workflow on tag
|
|
64
|
+
push exercised this surface; PRs were unguarded.
|
|
65
|
+
- **`ReplayCache.entries()`** — public iterator for the cached
|
|
66
|
+
`(request, response)` pairs. Replaces the bracket-access escape hatch into
|
|
67
|
+
the private `byKey` map. Same semantics, exposed in the type contract.
|
|
68
|
+
- **Per-example READMEs** — `examples/multi-shot-optimization` and
|
|
69
|
+
`examples/same-sandbox-harness` now document what they show, how to run,
|
|
70
|
+
expected output, and adaptation guidance. The other three examples already
|
|
71
|
+
had READMEs; the README index now links to all five.
|
|
72
|
+
- **`clients/python/examples/judge_anti_slop.py`** — runnable script that
|
|
73
|
+
doubles as a pytest, anchoring the `judge` API contract: composite in
|
|
74
|
+
`[0, 1]`, `RubricNotFoundError` for bogus rubric name, `ValidationError`
|
|
75
|
+
for no-rubric call.
|
|
76
|
+
|
|
77
|
+
### Fixed
|
|
78
|
+
|
|
79
|
+
- **`reflective-mutation.ts`** — local `escape` variable shadowed the global
|
|
80
|
+
`escape` property. Renamed to `escaped`. No behavior change; flagged by
|
|
81
|
+
biome.
|
|
82
|
+
|
|
3
83
|
## 0.23.1 — FileSystemTraceStore.updateRun no longer double-appends
|
|
4
84
|
|
|
5
85
|
### Fixed
|
package/README.md
CHANGED
|
@@ -1,32 +1,39 @@
|
|
|
1
1
|
# @tangle-network/agent-eval
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
the result, turn feedback into replay data, compare variants, and ship only
|
|
7
|
-
when the evidence improves.
|
|
3
|
+
**Substrate for self-improving agents.** Trace what runs, verify the result,
|
|
4
|
+
turn outcomes into preferences and rewards, mutate prompts and policies under
|
|
5
|
+
anytime-valid evidence, and ship only when the improvement is decisive.
|
|
8
6
|
|
|
9
7
|
```txt
|
|
10
|
-
product task
|
|
11
|
-
-> observe
|
|
12
|
-
->
|
|
13
|
-
->
|
|
14
|
-
|
|
15
|
-
|
|
8
|
+
real product task
|
|
9
|
+
-> observe / act (your runtime)
|
|
10
|
+
-> trace + verifier pipeline (capture integrity)
|
|
11
|
+
-> RunRecord (canonical eval artifact)
|
|
12
|
+
-> judge calibration · paired stats · sequential α
|
|
13
|
+
-> preferences · verifiable rewards · process rewards
|
|
14
|
+
-> GEPA / reflective mutation · auto-research · active curriculum
|
|
15
|
+
-> release gate · replay · contamination probe · tournament rating
|
|
16
|
+
-> next iteration
|
|
16
17
|
```
|
|
17
18
|
|
|
18
|
-
`agent-eval` does not own product state, credentials, UI, storage, model
|
|
19
|
+
`agent-eval` does **not** own product state, credentials, UI, storage, model
|
|
19
20
|
routing, browser drivers, sandbox policy, or deployment. Products own those.
|
|
20
|
-
This package owns
|
|
21
|
-
|
|
21
|
+
This package owns the loop that closes evaluation → preference → mutation →
|
|
22
|
+
redeploy, with capture integrity and statistically rigorous evidence at every
|
|
23
|
+
step.
|
|
24
|
+
|
|
25
|
+
It ships as a TypeScript library (npm) with a generated Python client (PyPI),
|
|
26
|
+
both speaking the same wire protocol. MIT, self-hostable, no SaaS dependency.
|
|
22
27
|
|
|
23
28
|
## Install
|
|
24
29
|
|
|
25
30
|
```sh
|
|
26
31
|
pnpm add @tangle-network/agent-eval
|
|
32
|
+
# or, from Python:
|
|
33
|
+
pip install agent-eval-rpc
|
|
27
34
|
```
|
|
28
35
|
|
|
29
|
-
## Quick Start
|
|
36
|
+
## Quick Start — the control loop
|
|
30
37
|
|
|
31
38
|
```ts
|
|
32
39
|
import {
|
|
@@ -78,68 +85,102 @@ const result = await runAgentControlLoop({
|
|
|
78
85
|
await product.storeEvalResult(task.id, result)
|
|
79
86
|
```
|
|
80
87
|
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
contract itself.
|
|
88
|
+
Same loop shape in production, replay, benchmark, and optimization. Swap the
|
|
89
|
+
dependencies behind `observe()` and `act()`, never the eval contract.
|
|
84
90
|
|
|
85
|
-
##
|
|
91
|
+
## Self-improvement loop
|
|
86
92
|
|
|
87
|
-
|
|
93
|
+
Eval doesn't end at "pass/fail." Outcomes become training signal, mutation
|
|
94
|
+
proposals, and curriculum updates — all from the same `RunRecord` produced by
|
|
95
|
+
the control loop.
|
|
88
96
|
|
|
89
97
|
```ts
|
|
90
|
-
import {
|
|
91
|
-
import {
|
|
92
|
-
|
|
93
|
-
|
|
98
|
+
import { runEvalCampaign } from '@tangle-network/agent-eval'
|
|
99
|
+
import {
|
|
100
|
+
extractPreferences,
|
|
101
|
+
extractVerifiableReward,
|
|
102
|
+
filterDeterministicallyRewarded,
|
|
103
|
+
offPolicyEstimateAll,
|
|
104
|
+
analyzeOptimizationResult,
|
|
105
|
+
} from '@tangle-network/agent-eval/rl'
|
|
106
|
+
|
|
107
|
+
// 1. Run a matrix of variants × scenarios with capture integrity by construction.
|
|
108
|
+
const campaign = await runEvalCampaign({ variants, scenarios, run })
|
|
109
|
+
|
|
110
|
+
// 2. Convert outcomes into RL signal.
|
|
111
|
+
const rewards = extractVerifiableReward(campaign.runs) // compile/test/schema
|
|
112
|
+
const prefs = extractPreferences(campaign.runs) // (chosen, rejected) triples
|
|
113
|
+
const clean = filterDeterministicallyRewarded(rewards) // judge-noise free
|
|
114
|
+
|
|
115
|
+
// 3. Estimate a candidate policy's value without re-running.
|
|
116
|
+
const ope = offPolicyEstimateAll(campaign.runs, candidatePolicy) // IPS + SNIPS + DR
|
|
117
|
+
|
|
118
|
+
// 4. Or close the loop end-to-end: score → reflect → mutate → re-run.
|
|
119
|
+
const next = await analyzeOptimizationResult(campaign, { researcher })
|
|
94
120
|
```
|
|
95
121
|
|
|
122
|
+
| Step | Primitive | Subpath |
|
|
123
|
+
| --- | --- | --- |
|
|
124
|
+
| Eval matrix with integrity | `runEvalCampaign` | `/` |
|
|
125
|
+
| Deterministic re-judge / audit | `ReplayCache`, `createReplayFetch` | `/` |
|
|
126
|
+
| Anytime-valid α across rolling looks | `pairedEvalueSequence` | `/reporting` |
|
|
127
|
+
| Judge quality vs gold | `calibrateJudge` (κ, Pearson, MAE, bias probes) | `/` |
|
|
128
|
+
| (chosen, rejected) for DPO/KTO/PPO | `extractPreferences` | `/rl` |
|
|
129
|
+
| Verifiable reward signal | `extractVerifiableReward` | `/rl` |
|
|
130
|
+
| Step-level / PRM training data | `extractStepRewards`, `prmTrainingPairs` | `/rl` |
|
|
131
|
+
| Estimate policy value off-policy | `offPolicyEstimateAll` (IPS + SNIPS + DR) | `/rl` |
|
|
132
|
+
| GEPA / reflective prompt mutation | `buildReflectionPrompt`, `parseReflectionResponse`, Ax-GEPA `SteeringOptimizer` | `/` `/optimization` |
|
|
133
|
+
| Auto-research (read runs → propose) | `analyzeOptimizationResult`, `PredictiveValidityResearcher` | `/rl` |
|
|
134
|
+
| Active curriculum (variance / Thompson) | `allocateCurriculum` | `/rl` |
|
|
135
|
+
| Tournament ratings (Bradley-Terry + Elo) | `fitBradleyTerry`, `applyEloUpdate` | `/rl` |
|
|
136
|
+
| Adversarial scenario search | `adversarialScenarioSearch` | `/rl` |
|
|
137
|
+
| Contamination probe (held-out perturb) | `runContaminationProbe` | `/rl` |
|
|
138
|
+
| Reward hacking signatures | `detectRewardHacking` | `/rl` |
|
|
139
|
+
| Compute curves (best-of-N, self-consist, Pareto) | `runComputeCurve`, `bestOfN`, `selfConsistency`, `paretoFrontier` | `/rl` |
|
|
140
|
+
| Knowledge gap separated from reasoning gap | `scoreKnowledgeReadiness` | `/` |
|
|
141
|
+
| Release gate (paired evidence + holdouts) | `evaluateReleaseConfidence`, `HeldOutGate` | `/reporting` |
|
|
142
|
+
| Launch report (decision-grade) | `renderReleaseReport`, `researchReport` | `/reporting` |
|
|
143
|
+
|
|
144
|
+
## Import Paths
|
|
145
|
+
|
|
96
146
|
| Subpath | Use for |
|
|
97
147
|
| --- | --- |
|
|
98
|
-
| `@tangle-network/agent-eval/control` | `observe
|
|
148
|
+
| `@tangle-network/agent-eval/control` | `observe → validate → decide → act`, action policy, propose/review loops |
|
|
99
149
|
| `@tangle-network/agent-eval/traces` | trace stores, emitters, TraceAnalyst, replay |
|
|
100
|
-
| `@tangle-network/agent-eval/optimization` | feedback trajectories, multi-shot
|
|
101
|
-
| `@tangle-network/agent-eval/reporting` | release confidence, paired stats, sequential e-values,
|
|
102
|
-
| `@tangle-network/agent-eval/rl` |
|
|
103
|
-
| `@tangle-network/agent-eval/wire` | HTTP/RPC
|
|
150
|
+
| `@tangle-network/agent-eval/optimization` | feedback trajectories, multi-shot, prompt evolution, GEPA, EvalCampaign |
|
|
151
|
+
| `@tangle-network/agent-eval/reporting` | release confidence, paired stats, sequential e-values, launch reports |
|
|
152
|
+
| `@tangle-network/agent-eval/rl` | adapters, verifiable rewards, preferences, OPE, PRM, contamination, tournaments, adversarial, compute curves, auto-research |
|
|
153
|
+
| `@tangle-network/agent-eval/wire` | HTTP/RPC server + schemas (same protocol the Python client speaks) |
|
|
104
154
|
| `@tangle-network/agent-eval/benchmarks` | benchmark adapter contracts and reference wrappers |
|
|
105
155
|
|
|
106
|
-
|
|
156
|
+
The root export remains available for convenience; new code should prefer
|
|
157
|
+
focused subpaths. Anything under `/rl` should be imported from `/rl` — root
|
|
158
|
+
re-export is retained only for backward compatibility and will be narrowed in
|
|
159
|
+
0.25.
|
|
160
|
+
|
|
161
|
+
## API stability
|
|
107
162
|
|
|
108
|
-
|
|
163
|
+
Public exports are tagged with JSDoc stability markers so consumers can see
|
|
164
|
+
status at the call site (IDE hover, language server, declaration files).
|
|
165
|
+
|
|
166
|
+
| Tag | Meaning |
|
|
109
167
|
| --- | --- |
|
|
110
|
-
|
|
|
111
|
-
|
|
|
112
|
-
|
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
| Auto-execute the trace analyst on every run | `traceAnalystOnRunComplete` + `TraceEmitterOptions.onRunComplete` |
|
|
120
|
-
| Run a matrix of variants × scenarios × seeds with capture integrity by construction | `runEvalCampaign` |
|
|
121
|
-
| Re-judge / determinism-audit a past campaign for free | `ReplayCache`, `createReplayFetch` |
|
|
122
|
-
| Ship-when-decisive with anytime-valid α across rolling looks | `pairedEvalueSequence`, `evaluateInterimReleaseConfidence` |
|
|
123
|
-
| Tell load-bearing rubrics from decorative ones using deployment outcomes | `rubricPredictiveValidity` |
|
|
124
|
-
| Bridge legacy optimization output to canonical `RunRecord[]` | `trialsToRunRecords`, `verificationReportToRunRecord` |
|
|
125
|
-
| Extract a clean reward signal for RL training (compile/test/schema vs judge) | `extractVerifiableReward`, `filterDeterministicallyRewarded` |
|
|
126
|
-
| Produce DPO / PPO / KTO `(chosen, rejected)` triples | `extractPreferences` |
|
|
127
|
-
| Estimate a new policy's value on old trajectories without re-running | `offPolicyEstimateAll` (IPS + SNIPS + DR) |
|
|
128
|
-
| Step-level credit assignment / PRM training data | `extractStepRewards`, `prmTrainingPairs` |
|
|
129
|
-
| Detect benchmark contamination via held-out perturbations | `runContaminationProbe` |
|
|
130
|
-
| Pairwise tournament ratings for many-candidate sweeps | `fitBradleyTerry`, `applyEloUpdate` |
|
|
131
|
-
| Active search for inputs the policy fails on | `adversarialScenarioSearch` |
|
|
132
|
-
| Characterise a candidate across compute budgets | `runComputeCurve`, `bestOfN`, `selfConsistency`, `paretoFrontier` |
|
|
133
|
-
| Model missing context separately from bad reasoning | `KnowledgeRequirement`, `KnowledgeBundle` |
|
|
134
|
-
|
|
135
|
-
### Capture integrity (0.21+)
|
|
168
|
+
| `@stable` | API frozen at this major. Breaking changes require a major bump. |
|
|
169
|
+
| `@experimental` | Interface may evolve before becoming `@stable`. Pin the patch version if you depend on it. |
|
|
170
|
+
| `@internal` | Not part of the public contract. Use the documented subpath instead. |
|
|
171
|
+
|
|
172
|
+
The `/rl` subpath is the most active surface. See
|
|
173
|
+
[`src/rl/index.ts`](./src/rl/index.ts) for the current stable/experimental
|
|
174
|
+
breakdown.
|
|
175
|
+
|
|
176
|
+
## Capture integrity (0.21+)
|
|
136
177
|
|
|
137
178
|
Launch-grade benchmark runs need four things that are easy to forget in glue
|
|
138
179
|
code: (1) raw HTTP capture alongside the structured spans so a reviewer can
|
|
139
180
|
verify which route answered, (2) a preflight assertion that the configured
|
|
140
181
|
client points at the intended provider, (3) a run-end assertion that the
|
|
141
182
|
expected events were actually written, and (4) auto-execution of the trace
|
|
142
|
-
analyst as part of the run lifecycle.
|
|
183
|
+
analyst as part of the run lifecycle.
|
|
143
184
|
|
|
144
185
|
```ts
|
|
145
186
|
import {
|
|
@@ -168,28 +209,33 @@ Directives, rationale, and shipped-bug context are in
|
|
|
168
209
|
|
|
169
210
|
## Examples
|
|
170
211
|
|
|
171
|
-
|
|
172
|
-
[`examples/`](
|
|
212
|
+
Each example has its own README with what it demonstrates, expected output,
|
|
213
|
+
and runtime. See [`examples/`](./examples/).
|
|
173
214
|
|
|
174
|
-
- [`examples/multi-shot-optimization`](
|
|
215
|
+
- [`examples/multi-shot-optimization`](./examples/multi-shot-optimization/README.md):
|
|
175
216
|
optimize full trajectories with held-out promotion.
|
|
176
|
-
- [`examples/same-sandbox-harness`](
|
|
217
|
+
- [`examples/same-sandbox-harness`](./examples/same-sandbox-harness/README.md):
|
|
177
218
|
run setup/build/test and evidence checks in one workspace.
|
|
178
|
-
- [`examples/benchmarks`](
|
|
219
|
+
- [`examples/benchmarks`](./examples/benchmarks/README.md):
|
|
179
220
|
benchmark adapter shape and reference wrappers.
|
|
221
|
+
- [`examples/auto-research-with-agent-builder`](./examples/auto-research-with-agent-builder/README.md):
|
|
222
|
+
closed loop — score, reflect, mutate, re-score, repeat.
|
|
223
|
+
- [`examples/fine-tune-with-prime-rl`](./examples/fine-tune-with-prime-rl/README.md):
|
|
224
|
+
RunRecord → preferences → trainer (prime-rl) → next campaign.
|
|
180
225
|
|
|
181
226
|
## Docs
|
|
182
227
|
|
|
183
228
|
Read in this order:
|
|
184
229
|
|
|
185
|
-
1. [
|
|
186
|
-
2. [
|
|
187
|
-
3. [
|
|
188
|
-
4. [
|
|
189
|
-
5. [
|
|
190
|
-
6. [
|
|
191
|
-
7. [
|
|
192
|
-
8. [
|
|
230
|
+
1. [Concepts](./docs/concepts.md) — mental model, 5 min
|
|
231
|
+
2. [Product Eval Adoption](./docs/product-eval-adoption.md)
|
|
232
|
+
3. [Control Runtime](./docs/control-runtime.md)
|
|
233
|
+
4. [Feedback Trajectories](./docs/feedback-trajectories.md)
|
|
234
|
+
5. [Multi-Shot Optimization](./docs/multi-shot-optimization.md)
|
|
235
|
+
6. [Trace Analysis](./docs/trace-analysis.md)
|
|
236
|
+
7. [Knowledge Readiness](./docs/knowledge-readiness.md)
|
|
237
|
+
8. [Integration Launch Gates](./docs/integration-launch-gates.md)
|
|
238
|
+
9. [Wire Protocol](./docs/wire-protocol.md) — required for non-TypeScript consumers
|
|
193
239
|
|
|
194
240
|
## CLI / Wire Protocol
|
|
195
241
|
|
|
@@ -198,28 +244,44 @@ npm i -g @tangle-network/agent-eval
|
|
|
198
244
|
agent-eval serve --port 5005
|
|
199
245
|
```
|
|
200
246
|
|
|
201
|
-
|
|
247
|
+
Python:
|
|
202
248
|
|
|
203
249
|
```sh
|
|
204
|
-
|
|
205
|
-
pip install -e .
|
|
250
|
+
pip install agent-eval-rpc
|
|
206
251
|
```
|
|
207
252
|
|
|
253
|
+
```py
|
|
254
|
+
from agent_eval_rpc import Client
|
|
255
|
+
client = Client() # auto-detects HTTP server, falls back to subprocess
|
|
256
|
+
score = await client.judge(content=output, rubric_name="anti-slop")
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
TypeScript is the source of truth. Python is a thin transport client over the
|
|
260
|
+
generated OpenAPI schema. Schema drift is made impossible at release time
|
|
261
|
+
(version-locked CI).
|
|
262
|
+
|
|
208
263
|
## Development
|
|
209
264
|
|
|
210
265
|
```sh
|
|
211
266
|
pnpm install
|
|
212
267
|
pnpm typecheck
|
|
213
268
|
pnpm test
|
|
214
|
-
pnpm
|
|
215
|
-
pnpm openapi
|
|
269
|
+
pnpm lint # biome
|
|
270
|
+
pnpm build # tsup + openapi.json
|
|
216
271
|
```
|
|
217
272
|
|
|
218
273
|
## Related Packages
|
|
219
274
|
|
|
220
|
-
- `@tangle-network/agent-runtime
|
|
221
|
-
|
|
222
|
-
- `@tangle-network/agent-
|
|
275
|
+
- [`@tangle-network/agent-runtime`](https://www.npmjs.com/package/@tangle-network/agent-runtime):
|
|
276
|
+
production session/runtime layer.
|
|
277
|
+
- [`@tangle-network/agent-knowledge`](https://www.npmjs.com/package/@tangle-network/agent-knowledge):
|
|
278
|
+
source-grounded knowledge bases and readiness.
|
|
279
|
+
- [`@tangle-network/agent-integrations`](https://www.npmjs.com/package/@tangle-network/agent-integrations):
|
|
280
|
+
connection, grant, capability, and integration invocation contracts.
|
|
281
|
+
|
|
282
|
+
Together: `agent-runtime` is where the agent runs; `agent-knowledge` is what
|
|
283
|
+
it knows; `agent-integrations` is what it can do; `agent-eval` is how it gets
|
|
284
|
+
better.
|
|
223
285
|
|
|
224
286
|
## License
|
|
225
287
|
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
import { T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Tool-use metrics — derived purely from trace data.
|
|
5
|
+
*
|
|
6
|
+
* No scoring assumptions: consumers supply optional ground-truth tool
|
|
7
|
+
* selections per turn + optional "information used downstream" signals.
|
|
8
|
+
* Without those, we still compute descriptive metrics (error rate,
|
|
9
|
+
* retry rate, duplicate-call rate) that are useful on their own.
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
interface ToolUseMetrics {
|
|
13
|
+
runId: string;
|
|
14
|
+
totalCalls: number;
|
|
15
|
+
byTool: Record<string, ToolStats>;
|
|
16
|
+
errorRate: number;
|
|
17
|
+
/** Ratio of calls with identical (toolName, argHash) already seen earlier in the same run. */
|
|
18
|
+
duplicateRate: number;
|
|
19
|
+
/** Ratio of error calls followed by ≥1 retry on same tool. */
|
|
20
|
+
retryRate: number;
|
|
21
|
+
/** Optional: of the calls the agent made, the fraction the evaluator marked as "correct selection". */
|
|
22
|
+
selectionAccuracy?: number;
|
|
23
|
+
}
|
|
24
|
+
interface ToolStats {
|
|
25
|
+
calls: number;
|
|
26
|
+
errors: number;
|
|
27
|
+
avgLatencyMs: number;
|
|
28
|
+
duplicates: number;
|
|
29
|
+
}
|
|
30
|
+
interface ToolUseOptions {
|
|
31
|
+
/** Map of spanId → whether the evaluator judged the tool selection correct. Optional. */
|
|
32
|
+
selectionLabels?: Record<string, boolean>;
|
|
33
|
+
}
|
|
34
|
+
declare function computeToolUseMetrics(store: TraceStore, runId: string, options?: ToolUseOptions): Promise<ToolUseMetrics>;
|
|
35
|
+
|
|
36
|
+
/**
|
|
37
|
+
* Baseline regression detection.
|
|
38
|
+
*
|
|
39
|
+
* Lifted from ADC baseline.ts. Every promotion-blocking signal boils down
|
|
40
|
+
* to: "is this run measurably worse than baseline?" — with enough
|
|
41
|
+
* statistical rigor to distinguish noise from drift.
|
|
42
|
+
*
|
|
43
|
+
* Uses:
|
|
44
|
+
* - Welch's t-test (unequal variance) for per-metric mean comparison
|
|
45
|
+
* - Cohen's d for effect size magnitude
|
|
46
|
+
* - IQR for stability flag (unstable samples can't be trusted for comparisons)
|
|
47
|
+
*
|
|
48
|
+
* Returns a structured verdict: improved | regressed | stable | unstable.
|
|
49
|
+
*/
|
|
50
|
+
interface MetricSamples {
|
|
51
|
+
/** Stable metric key (e.g. "overallScore", "firstTokenMs"). */
|
|
52
|
+
metric: string;
|
|
53
|
+
/** Whether higher values are better. */
|
|
54
|
+
higherIsBetter: boolean;
|
|
55
|
+
baseline: number[];
|
|
56
|
+
candidate: number[];
|
|
57
|
+
}
|
|
58
|
+
interface MetricVerdict {
|
|
59
|
+
metric: string;
|
|
60
|
+
baselineMean: number;
|
|
61
|
+
candidateMean: number;
|
|
62
|
+
delta: number;
|
|
63
|
+
cohensD: number;
|
|
64
|
+
welchT: number;
|
|
65
|
+
welchDf: number;
|
|
66
|
+
welchP: number;
|
|
67
|
+
stable: boolean;
|
|
68
|
+
/** IQR of the combined samples — used as a rough stability indicator. */
|
|
69
|
+
iqr: number;
|
|
70
|
+
verdict: 'improved' | 'regressed' | 'stable' | 'unstable';
|
|
71
|
+
}
|
|
72
|
+
interface BaselineReport {
|
|
73
|
+
metrics: MetricVerdict[];
|
|
74
|
+
/** True if any critical metric regressed. */
|
|
75
|
+
hasRegression: boolean;
|
|
76
|
+
/** True if any metric is unstable (too noisy to judge). */
|
|
77
|
+
hasUnstable: boolean;
|
|
78
|
+
}
|
|
79
|
+
interface BaselineOptions {
|
|
80
|
+
/** Effect size threshold for meaningful delta (default 0.5 — medium effect). */
|
|
81
|
+
effectThreshold?: number;
|
|
82
|
+
/** p-value threshold for statistical significance (default 0.05). */
|
|
83
|
+
alpha?: number;
|
|
84
|
+
/** IQR/mean ratio above which samples are flagged unstable (default 0.30). */
|
|
85
|
+
unstableCvThreshold?: number;
|
|
86
|
+
}
|
|
87
|
+
/**
|
|
88
|
+
* Compare candidate samples against baseline per metric. Verdict logic:
|
|
89
|
+
* - unstable: IQR/|mean| > threshold on either set — not enough signal
|
|
90
|
+
* - improved: meaningful effect in the "better" direction AND p < alpha
|
|
91
|
+
* - regressed: meaningful effect in the "worse" direction AND p < alpha
|
|
92
|
+
* - stable: otherwise (no significant change)
|
|
93
|
+
*/
|
|
94
|
+
declare function compareToBaseline(samples: MetricSamples[], options?: BaselineOptions): BaselineReport;
|
|
95
|
+
/** Inter-quartile range; 0 when the sample has no spread. */
|
|
96
|
+
declare function iqr(xs: number[]): number;
|
|
97
|
+
/**
|
|
98
|
+
* Welch's t-test — unequal-variance two-sample t. Uses the same Student-t
|
|
99
|
+
* CDF as `pairedTTest` (via incomplete beta); falls back to normal tail
|
|
100
|
+
* when df is large.
|
|
101
|
+
*/
|
|
102
|
+
declare function welchsTTest(a: number[], b: number[]): {
|
|
103
|
+
t: number;
|
|
104
|
+
df: number;
|
|
105
|
+
p: number;
|
|
106
|
+
};
|
|
107
|
+
|
|
108
|
+
export { type BaselineOptions as B, type MetricSamples as M, type ToolStats as T, type BaselineReport as a, type MetricVerdict as b, computeToolUseMetrics as c, type ToolUseMetrics as d, type ToolUseOptions as e, compareToBaseline as f, iqr as i, welchsTTest as w };
|
|
@@ -1,2 +1,3 @@
|
|
|
1
|
-
export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as deterministicSplit, e as routing } from '../index
|
|
2
|
-
import '../run-record-
|
|
1
|
+
export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as deterministicSplit, e as routing } from '../index--fVrWDiR.js';
|
|
2
|
+
import '../run-record-CqzahIbx.js';
|
|
3
|
+
import '../errors-BZ9sTdz7.js';
|