@tangle-network/agent-eval 0.22.0 → 0.23.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +156 -0
- package/README.md +13 -3
- package/dist/benchmarks/index.d.ts +2 -2
- package/dist/{chunk-UAND2LOT.js → chunk-7EAUOUQS.js} +4 -247
- package/dist/chunk-7EAUOUQS.js.map +1 -0
- package/dist/chunk-AXHNWLIX.js +246 -0
- package/dist/chunk-AXHNWLIX.js.map +1 -0
- package/dist/chunk-EXGR4XEM.js +283 -0
- package/dist/chunk-EXGR4XEM.js.map +1 -0
- package/dist/chunk-LZKIOBG2.js +2026 -0
- package/dist/chunk-LZKIOBG2.js.map +1 -0
- package/dist/{chunk-YUFXO3TU.js → chunk-QBW3YBTR.js} +1 -1
- package/dist/chunk-QBW3YBTR.js.map +1 -0
- package/dist/{chunk-ARZ6BEV6.js → chunk-V5QSWN7L.js} +2 -2
- package/dist/{chunk-USHQBPMH.js → chunk-VQQSPGSM.js} +7 -283
- package/dist/chunk-VQQSPGSM.js.map +1 -0
- package/dist/{chunk-4W4NCYM2.js → chunk-XPHOZPOM.js} +4 -2
- package/dist/chunk-XPHOZPOM.js.map +1 -0
- package/dist/{control-cxwMOAsy.d.ts → control-DvkH87qJ.d.ts} +2 -2
- package/dist/control.d.ts +3 -3
- package/dist/control.js +2 -2
- package/dist/{optimization-UVDNKaO6.d.ts → eval-campaign-Ds5QljIh.d.ts} +4 -5
- package/dist/{feedback-trajectory-CB0A32o3.d.ts → feedback-trajectory-c43WGtTX.d.ts} +1 -1
- package/dist/{index-c5saLbKD.d.ts → index-DDTlbHEK.d.ts} +1 -1
- package/dist/index-ekBXweiQ.d.ts +1894 -0
- package/dist/index.d.ts +18 -154
- package/dist/index.js +126 -26
- package/dist/index.js.map +1 -1
- package/dist/{integrity-K2oVlF57.d.ts → integrity-Cr5YodSY.d.ts} +1 -1
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +5 -5
- package/dist/optimization.js +7 -5
- package/dist/reporting.d.ts +294 -4
- package/dist/reporting.js +6 -4
- package/dist/rl.d.ts +8 -0
- package/dist/rl.js +113 -0
- package/dist/rl.js.map +1 -0
- package/dist/{run-record-CX_jcAyr.d.ts → run-record-DNiOMBrZ.d.ts} +10 -1
- package/dist/sequential-DgU2mFsE.d.ts +304 -0
- package/dist/{summary-report-D4p7RlDu.d.ts → summary-report-Ce1r4EYo.d.ts} +2 -2
- package/dist/traces.d.ts +2 -2
- package/dist/traces.js +6 -6
- package/docs/auto-research-loop-end-to-end.md +186 -0
- package/docs/three-package-architecture.md +180 -0
- package/package.json +22 -10
- package/dist/chunk-4W4NCYM2.js.map +0 -1
- package/dist/chunk-UAND2LOT.js.map +0 -1
- package/dist/chunk-USHQBPMH.js.map +0 -1
- package/dist/chunk-YUFXO3TU.js.map +0 -1
- package/dist/reporting-B82RSv9C.d.ts +0 -593
- /package/dist/{chunk-ARZ6BEV6.js.map → chunk-V5QSWN7L.js.map} +0 -0
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
# Three-package architecture: agent-eval × agent-knowledge × agent-runtime
|
|
2
|
+
|
|
3
|
+
The Tangle agent stack splits responsibilities across three TypeScript
|
|
4
|
+
packages with explicit, narrow contracts. This doc is the reference for how
|
|
5
|
+
they fit together — what each owns, what each consumes from the others, and
|
|
6
|
+
the canonical data shapes that move between them.
|
|
7
|
+
|
|
8
|
+
## Why three packages
|
|
9
|
+
|
|
10
|
+
Each one has a single, defensible job. Combining them was a real temptation
|
|
11
|
+
(less version drift, fewer registries) and we said no on purpose:
|
|
12
|
+
|
|
13
|
+
- **`@tangle-network/agent-eval`** owns measurement, optimization, and the
|
|
14
|
+
RL bridge. It has no opinion about *what* the agent does or *how* it runs;
|
|
15
|
+
it has strong opinions about whether the answer is good and how to make it
|
|
16
|
+
better.
|
|
17
|
+
- **`@tangle-network/agent-knowledge`** owns the data side: source-grounded
|
|
18
|
+
knowledge graphs, source citations, eval-gated knowledge growth, knowledge
|
|
19
|
+
readiness scoring. It is domain-agnostic — legal, tax, coding, research
|
|
20
|
+
workflows define their own policies on top of it.
|
|
21
|
+
- **`@tangle-network/agent-runtime`** owns the *execution* side: the task
|
|
22
|
+
lifecycle, knowledge-readiness gating, control-loop orchestration,
|
|
23
|
+
streaming session kernels. It does not own domain policy, models, tools,
|
|
24
|
+
or UI; it standardizes the lifecycle and delegates domain behavior to
|
|
25
|
+
adapters.
|
|
26
|
+
|
|
27
|
+
Each package can be reasoned about independently. Each can be replaced
|
|
28
|
+
without rewriting the others.
|
|
29
|
+
|
|
30
|
+
## The data interchange — `RunRecord`, `Scenario`, `KnowledgeBundle`
|
|
31
|
+
|
|
32
|
+
These three types travel between the packages and tie the architecture
|
|
33
|
+
together.
|
|
34
|
+
|
|
35
|
+
### `RunRecord` (owned by agent-eval)
|
|
36
|
+
|
|
37
|
+
Every measurable thing — a campaign cell, an optimization trial, a
|
|
38
|
+
production rollout, a deployment outcome — projects to a `RunRecord`. It
|
|
39
|
+
carries identity (`runId`, `experimentId`, `candidateId`, `seed`,
|
|
40
|
+
`scenarioId`), provenance (`commitSha`, `model`, `promptHash`, `configHash`),
|
|
41
|
+
cost (`costUsd`, `tokenUsage`), and the outcome (per-split scores +
|
|
42
|
+
free-form `raw` metric bag).
|
|
43
|
+
|
|
44
|
+
agent-knowledge consumes `RunRecord[]` for release reporting and
|
|
45
|
+
optimization analysis. agent-runtime exposes hooks for projecting its own
|
|
46
|
+
task results into `RunRecord` shape. Every consumer of agent-eval's
|
|
47
|
+
campaign / RL primitives produces `RunRecord[]`.
|
|
48
|
+
|
|
49
|
+
### `Scenario` (currently each owner defines its own)
|
|
50
|
+
|
|
51
|
+
agent-eval's `runEvalCampaign` takes
|
|
52
|
+
`{ scenarioId: string; tags?: Record<string,string> }`. agent-knowledge
|
|
53
|
+
defines richer scenario types for knowledge-base optimization. agent-runtime
|
|
54
|
+
takes `TaskSpec`, which is one task at a time, not a scenario set.
|
|
55
|
+
|
|
56
|
+
This is a known minor friction; not load-bearing yet. When it becomes one,
|
|
57
|
+
`Scenario` will get promoted to a shared interface.
|
|
58
|
+
|
|
59
|
+
### `KnowledgeBundle` (owned by agent-knowledge)
|
|
60
|
+
|
|
61
|
+
agent-knowledge produces `KnowledgeBundle` (a versioned graph of source
|
|
62
|
+
citations + generated content) and `KnowledgeReadinessReport` (gap
|
|
63
|
+
analysis). agent-eval's `KnowledgeRequirement` / `KnowledgeBundle` types
|
|
64
|
+
are imported from agent-eval into agent-knowledge — agent-knowledge
|
|
65
|
+
**adapts** its richer types to agent-eval's wire types, not the other way
|
|
66
|
+
around. The wire types are the contract; the rich types are agent-knowledge's
|
|
67
|
+
internal model.
|
|
68
|
+
|
|
69
|
+
## Dependency direction
|
|
70
|
+
|
|
71
|
+
```
|
|
72
|
+
┌────────────────────┐
|
|
73
|
+
│ agent-runtime │
|
|
74
|
+
│ (executor) │
|
|
75
|
+
└─────────┬──────────┘
|
|
76
|
+
│
|
|
77
|
+
▼ imports
|
|
78
|
+
┌────────────────────┐
|
|
79
|
+
│ agent-eval │
|
|
80
|
+
│ (measurement) │
|
|
81
|
+
└────────────────────┘
|
|
82
|
+
▲
|
|
83
|
+
│ imports
|
|
84
|
+
┌─────────┴──────────┐
|
|
85
|
+
│ agent-knowledge │
|
|
86
|
+
│ (data side) │
|
|
87
|
+
└────────────────────┘
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
**Both** agent-runtime and agent-knowledge import agent-eval. agent-eval
|
|
91
|
+
imports neither. This is deliberate: agent-eval is the leaf — its API is
|
|
92
|
+
the bottleneck, so its surface stays narrow and stable.
|
|
93
|
+
|
|
94
|
+
## What each package contributes to the auto-research loop
|
|
95
|
+
|
|
96
|
+
```
|
|
97
|
+
┌────────────────────┐ ┌────────────────────┐
|
|
98
|
+
│ agent-knowledge │ ────► │ agent-eval │
|
|
99
|
+
│ │ │ │
|
|
100
|
+
│ - scenario sets │ │ - runEvalCampaign │
|
|
101
|
+
│ - knowledge bundle │ │ - capture integrity│
|
|
102
|
+
│ - readiness gates │ │ - researchReport │
|
|
103
|
+
│ - source citations │ │ - replayCampaign │
|
|
104
|
+
│ │ │ - sequential │
|
|
105
|
+
│ produces: │ │ - RL bridge │
|
|
106
|
+
│ KnowledgeBundle │ │ - preferences │
|
|
107
|
+
│ Scenario │ │ - off-policy │
|
|
108
|
+
└────────────────────┘ │ - tournament │
|
|
109
|
+
│ │
|
|
110
|
+
│ produces: │
|
|
111
|
+
│ RunRecord[] │
|
|
112
|
+
│ PreferenceTriple │
|
|
113
|
+
│ etc. │
|
|
114
|
+
└─────────▲──────────┘
|
|
115
|
+
│
|
|
116
|
+
┌────────────────────┐ │
|
|
117
|
+
│ agent-runtime │ ──────────────────┘
|
|
118
|
+
│ │
|
|
119
|
+
│ - runAgentTask │
|
|
120
|
+
│ - runAgentControl │
|
|
121
|
+
│ - readiness gating │
|
|
122
|
+
│ - SSE / sessions │
|
|
123
|
+
│ │
|
|
124
|
+
│ produces: │
|
|
125
|
+
│ ControlRunResult │
|
|
126
|
+
│ SSE events │
|
|
127
|
+
└────────────────────┘
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
agent-knowledge brings the *what* (scenarios, knowledge, source data).
|
|
131
|
+
agent-runtime brings the *how to run it once* (task lifecycle, control
|
|
132
|
+
loop). agent-eval brings the *measurement and improvement* (campaign,
|
|
133
|
+
report, RL bridge).
|
|
134
|
+
|
|
135
|
+
## Cross-package contracts (current state, 0.23+)
|
|
136
|
+
|
|
137
|
+
| From → To | Type | What it carries |
|
|
138
|
+
|---|---|---|
|
|
139
|
+
| agent-knowledge → agent-eval | `RunRecord` | (consumed via `runMultiShotOptimization` for knowledge-base optimization) |
|
|
140
|
+
| agent-knowledge → agent-eval | `KnowledgeReadinessReport`, `KnowledgeBundle`, `KnowledgeRequirement` | (re-exported from agent-eval; agent-knowledge populates) |
|
|
141
|
+
| agent-knowledge → agent-eval | `ControlRuntimeConfig<KnowledgeBaseCandidate>` | (knowledge research adapter) |
|
|
142
|
+
| agent-runtime → agent-eval | `runAgentControlLoop`, `scoreKnowledgeReadiness`, `blockingKnowledgeEval` | (consumed; agent-runtime calls these in its task lifecycle) |
|
|
143
|
+
| agent-runtime → agent-eval | `RunRecord`, `TraceStore`, `ControlRunResult`, `ControlStep` | (re-exported types; agent-runtime adapters project into these) |
|
|
144
|
+
| agent-eval ↘ neither package | (no upstream imports) | |
|
|
145
|
+
|
|
146
|
+
## What's missing for the contracts to be S-tier
|
|
147
|
+
|
|
148
|
+
These are honest gaps, surfaced after the 0.23 audit:
|
|
149
|
+
|
|
150
|
+
1. **Shared `Scenario` interface.** Each package has its own scenario
|
|
151
|
+
shape. agent-eval will promote a minimal `Scenario` to shared use when
|
|
152
|
+
the second consumer needs it.
|
|
153
|
+
2. **`agent-knowledge` is pinned at `agent-eval@^0.20.0`.** It misses
|
|
154
|
+
capture-integrity (0.21), the campaign artifact (0.22), and the RL
|
|
155
|
+
bridge (0.23). A `^0.20.0` caret range stops below 0.21, so `pnpm install` will
|
|
156
|
+
not pick up these minors — and `RunRecord`'s `scenarioId` field (added in 0.23) won't be
|
|
157
|
+
populated by agent-knowledge's existing run records. A planned bump +
|
|
158
|
+
adapter pass closes this.
|
|
159
|
+
3. **`agent-runtime` is pinned at `agent-eval@^0.20.0`.** Same picture —
|
|
160
|
+
misses capture-integrity, campaign, RL bridge. Specifically the
|
|
161
|
+
`RawProviderSink` integration would let every agent-runtime task
|
|
162
|
+
auto-capture its provider HTTP envelope without wiring it per-consumer.
|
|
163
|
+
4. **No first-class trace-analyst hook in agent-runtime.** agent-runtime's
|
|
164
|
+
`runAgentTask` can emit traces but doesn't auto-execute the trace
|
|
165
|
+
analyst on completion the way `runEvalCampaign` does. An `onRunComplete`
|
|
166
|
+
hook on agent-runtime would close this — and the implementation is
|
|
167
|
+
one method change.
|
|
168
|
+
|
|
169
|
+
These are tracked as follow-up bumps after agent-eval 0.23 ships.
|
|
170
|
+
|
|
171
|
+
## Versioning policy
|
|
172
|
+
|
|
173
|
+
Each package versions independently. The minor-version axis carries
|
|
174
|
+
breaking changes; agent-eval's minor versions are tied to the major
|
|
175
|
+
methodological shifts (0.21 = capture integrity; 0.22 = campaign + RL
|
|
176
|
+
bridge experimental; 0.23 = RL bridge primitives, examples).
|
|
177
|
+
|
|
178
|
+
When agent-eval ships a minor, agent-knowledge and agent-runtime get a
|
|
179
|
+
follow-up PR to consume the new surface. The follow-up is tracked as a
|
|
180
|
+
deliberate change, not a passive caret pickup.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tangle-network/agent-eval",
|
|
3
|
-
"version": "0.22.0",
|
|
3
|
+
"version": "0.23.1",
|
|
4
4
|
"description": "Trace-first evaluation infrastructure for agent systems: traces, harnesses, verifier pipelines, judges, datasets, gates, optimization, and reporting.",
|
|
5
5
|
"homepage": "https://github.com/tangle-network/agent-eval#readme",
|
|
6
6
|
"repository": {
|
|
@@ -34,6 +34,11 @@
|
|
|
34
34
|
"import": "./dist/reporting.js",
|
|
35
35
|
"default": "./dist/reporting.js"
|
|
36
36
|
},
|
|
37
|
+
"./rl": {
|
|
38
|
+
"types": "./dist/rl.d.ts",
|
|
39
|
+
"import": "./dist/rl.js",
|
|
40
|
+
"default": "./dist/rl.js"
|
|
41
|
+
},
|
|
37
42
|
"./traces": {
|
|
38
43
|
"types": "./dist/traces.d.ts",
|
|
39
44
|
"import": "./dist/traces.js",
|
|
@@ -74,6 +79,15 @@
|
|
|
74
79
|
"publishConfig": {
|
|
75
80
|
"access": "public"
|
|
76
81
|
},
|
|
82
|
+
"scripts": {
|
|
83
|
+
"build": "tsup && pnpm openapi",
|
|
84
|
+
"dev": "tsup --watch",
|
|
85
|
+
"prepare": "pnpm build",
|
|
86
|
+
"test": "vitest run",
|
|
87
|
+
"test:watch": "vitest",
|
|
88
|
+
"typecheck": "tsc --noEmit",
|
|
89
|
+
"openapi": "node dist/cli.js openapi --out dist/openapi.json"
|
|
90
|
+
},
|
|
77
91
|
"dependencies": {
|
|
78
92
|
"@asteasolutions/zod-to-openapi": "^8.5.0",
|
|
79
93
|
"@ax-llm/ax": "^19.0.25",
|
|
@@ -89,16 +103,14 @@
|
|
|
89
103
|
"typescript": "^5.7.0",
|
|
90
104
|
"vitest": "^3.0.0"
|
|
91
105
|
},
|
|
106
|
+
"pnpm": {
|
|
107
|
+
"overrides": {
|
|
108
|
+
"postcss@<8.5.10": "^8.5.10"
|
|
109
|
+
}
|
|
110
|
+
},
|
|
92
111
|
"engines": {
|
|
93
112
|
"node": ">=20"
|
|
94
113
|
},
|
|
95
114
|
"license": "MIT",
|
|
96
|
-
"scripts": {
|
|
97
|
-
|
|
98
|
-
"dev": "tsup --watch",
|
|
99
|
-
"test": "vitest run",
|
|
100
|
-
"test:watch": "vitest",
|
|
101
|
-
"typecheck": "tsc --noEmit",
|
|
102
|
-
"openapi": "node dist/cli.js openapi --out dist/openapi.json"
|
|
103
|
-
}
|
|
104
|
-
}
|
|
115
|
+
"packageManager": "pnpm@10.22.0"
|
|
116
|
+
}
|