@tangle-network/agent-eval 0.22.0 → 0.23.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. package/CHANGELOG.md +156 -0
  2. package/README.md +13 -3
  3. package/dist/benchmarks/index.d.ts +2 -2
  4. package/dist/{chunk-UAND2LOT.js → chunk-7EAUOUQS.js} +4 -247
  5. package/dist/chunk-7EAUOUQS.js.map +1 -0
  6. package/dist/chunk-AXHNWLIX.js +246 -0
  7. package/dist/chunk-AXHNWLIX.js.map +1 -0
  8. package/dist/chunk-EXGR4XEM.js +283 -0
  9. package/dist/chunk-EXGR4XEM.js.map +1 -0
  10. package/dist/chunk-LZKIOBG2.js +2026 -0
  11. package/dist/chunk-LZKIOBG2.js.map +1 -0
  12. package/dist/{chunk-YUFXO3TU.js → chunk-QBW3YBTR.js} +1 -1
  13. package/dist/chunk-QBW3YBTR.js.map +1 -0
  14. package/dist/{chunk-ARZ6BEV6.js → chunk-V5QSWN7L.js} +2 -2
  15. package/dist/{chunk-USHQBPMH.js → chunk-VQQSPGSM.js} +7 -283
  16. package/dist/chunk-VQQSPGSM.js.map +1 -0
  17. package/dist/{chunk-4W4NCYM2.js → chunk-XPHOZPOM.js} +4 -2
  18. package/dist/chunk-XPHOZPOM.js.map +1 -0
  19. package/dist/{control-cxwMOAsy.d.ts → control-DvkH87qJ.d.ts} +2 -2
  20. package/dist/control.d.ts +3 -3
  21. package/dist/control.js +2 -2
  22. package/dist/{optimization-UVDNKaO6.d.ts → eval-campaign-Ds5QljIh.d.ts} +4 -5
  23. package/dist/{feedback-trajectory-CB0A32o3.d.ts → feedback-trajectory-c43WGtTX.d.ts} +1 -1
  24. package/dist/{index-c5saLbKD.d.ts → index-DDTlbHEK.d.ts} +1 -1
  25. package/dist/index-ekBXweiQ.d.ts +1894 -0
  26. package/dist/index.d.ts +18 -154
  27. package/dist/index.js +126 -26
  28. package/dist/index.js.map +1 -1
  29. package/dist/{integrity-K2oVlF57.d.ts → integrity-Cr5YodSY.d.ts} +1 -1
  30. package/dist/openapi.json +1 -1
  31. package/dist/optimization.d.ts +5 -5
  32. package/dist/optimization.js +7 -5
  33. package/dist/reporting.d.ts +294 -4
  34. package/dist/reporting.js +6 -4
  35. package/dist/rl.d.ts +8 -0
  36. package/dist/rl.js +113 -0
  37. package/dist/rl.js.map +1 -0
  38. package/dist/{run-record-CX_jcAyr.d.ts → run-record-DNiOMBrZ.d.ts} +10 -1
  39. package/dist/sequential-DgU2mFsE.d.ts +304 -0
  40. package/dist/{summary-report-D4p7RlDu.d.ts → summary-report-Ce1r4EYo.d.ts} +2 -2
  41. package/dist/traces.d.ts +2 -2
  42. package/dist/traces.js +6 -6
  43. package/docs/auto-research-loop-end-to-end.md +186 -0
  44. package/docs/three-package-architecture.md +180 -0
  45. package/package.json +22 -10
  46. package/dist/chunk-4W4NCYM2.js.map +0 -1
  47. package/dist/chunk-UAND2LOT.js.map +0 -1
  48. package/dist/chunk-USHQBPMH.js.map +0 -1
  49. package/dist/chunk-YUFXO3TU.js.map +0 -1
  50. package/dist/reporting-B82RSv9C.d.ts +0 -593
  51. /package/dist/{chunk-ARZ6BEV6.js.map → chunk-V5QSWN7L.js.map} +0 -0
@@ -0,0 +1,180 @@
1
+ # Three-package architecture: agent-eval × agent-knowledge × agent-runtime
2
+
3
+ The Tangle agent stack splits responsibilities across three TypeScript
4
+ packages with explicit, narrow contracts. This doc is the reference for how
5
+ they fit together — what each owns, what each consumes from the others, and
6
+ the canonical data shapes that move between them.
7
+
8
+ ## Why three packages
9
+
10
+ Each one has a single, defensible job. Combining them was a real temptation
11
+ (less version drift, fewer registries) and we said no on purpose:
12
+
13
+ - **`@tangle-network/agent-eval`** owns measurement, optimization, and the
14
+ RL bridge. It has no opinion about *what* the agent does or *how* it runs;
15
+ it has strong opinions about whether the answer is good and how to make it
16
+ better.
17
+ - **`@tangle-network/agent-knowledge`** owns the data side: source-grounded
18
+ knowledge graphs, source citations, eval-gated knowledge growth, knowledge
19
+ readiness scoring. It is domain-agnostic — legal, tax, coding, research
20
+ workflows define their own policies on top of it.
21
+ - **`@tangle-network/agent-runtime`** owns the *execution* side: the task
22
+ lifecycle, knowledge-readiness gating, control-loop orchestration,
23
+ streaming session kernels. It does not own domain policy, models, tools,
24
+ or UI; it standardizes the lifecycle and delegates domain behavior to
25
+ adapters.
26
+
27
+ Each package can be reasoned about independently. Each can be replaced
28
+ without rewriting the others.
29
+
30
+ ## The data interchange — `RunRecord`, `Scenario`, `KnowledgeBundle`
31
+
32
+ These three types travel between the packages and tie the architecture
33
+ together.
34
+
35
+ ### `RunRecord` (owned by agent-eval)
36
+
37
+ Every measurable thing — a campaign cell, an optimization trial, a
38
+ production rollout, a deployment outcome — projects to a `RunRecord`. It
39
+ carries identity (`runId`, `experimentId`, `candidateId`, `seed`,
40
+ `scenarioId`), provenance (`commitSha`, `model`, `promptHash`, `configHash`),
41
+ cost (`costUsd`, `tokenUsage`), and the outcome (per-split scores +
42
+ free-form `raw` metric bag).
43
+
44
+ agent-knowledge consumes `RunRecord[]` for release reporting and
45
+ optimization analysis. agent-runtime exposes hooks for projecting its own
46
+ task results into `RunRecord` shape. Every consumer of agent-eval's
47
+ campaign / RL primitives produces `RunRecord[]`.
48
+
49
+ ### `Scenario` (currently each owner defines its own)
50
+
51
+ agent-eval's `runEvalCampaign` takes
52
+ `{ scenarioId: string; tags?: Record<string,string> }`. agent-knowledge
53
+ defines richer scenario types for knowledge-base optimization. agent-runtime
54
+ takes `TaskSpec`, which is one task at a time, not a scenario set.
55
+
56
+ This is a known minor friction; not load-bearing yet. When it becomes one,
57
+ `Scenario` will get promoted to a shared interface.
58
+
59
+ ### `KnowledgeBundle` (owned by agent-knowledge)
60
+
61
+ agent-knowledge produces `KnowledgeBundle` (a versioned graph of source
62
+ citations + generated content) and `KnowledgeReadinessReport` (gap
63
+ analysis). agent-eval's `KnowledgeRequirement` / `KnowledgeBundle` types
64
+ are imported from agent-eval into agent-knowledge — agent-knowledge
65
+ **adapts** its richer types to agent-eval's wire types, not the other way
66
+ around. The wire types are the contract; the rich types are agent-knowledge's
67
+ internal model.
68
+
69
+ ## Dependency direction
70
+
71
+ ```
72
+ ┌────────────────────┐
73
+ │ agent-runtime │
74
+ │ (executor) │
75
+ └─────────┬──────────┘
76
+
77
+ ▼ imports
78
+ ┌────────────────────┐
79
+ │ agent-eval │
80
+ │ (measurement) │
81
+ └────────────────────┘
82
+
83
+ │ imports
84
+ ┌─────────┴──────────┐
85
+ │ agent-knowledge │
86
+ │ (data side) │
87
+ └────────────────────┘
88
+ ```
89
+
90
+ **Both** agent-runtime and agent-knowledge import agent-eval. agent-eval
91
+ imports neither. This is deliberate: agent-eval is the leaf — its API is
92
+ the bottleneck, so its surface stays narrow and stable.
93
+
94
+ ## What each package contributes to the auto-research loop
95
+
96
+ ```
97
+ ┌────────────────────┐ ┌────────────────────┐
98
+ │ agent-knowledge │ ────► │ agent-eval │
99
+ │ │ │ │
100
+ │ - scenario sets │ │ - runEvalCampaign │
101
+ │ - knowledge bundle │ │ - capture integrity│
102
+ │ - readiness gates │ │ - researchReport │
103
+ │ - source citations │ │ - replayCampaign │
104
+ │ │ │ - sequential │
105
+ │ produces: │ │ - RL bridge │
106
+ │ KnowledgeBundle │ │ - preferences │
107
+ │ Scenario │ │ - off-policy │
108
+ └────────────────────┘ │ - tournament │
109
+ │ │
110
+ │ produces: │
111
+ │ RunRecord[] │
112
+ │ PreferenceTriple │
113
+ │ etc. │
114
+ └─────────▲──────────┘
115
+
116
+ ┌────────────────────┐ │
117
+ │ agent-runtime │ ──────────────────┘
118
+ │ │
119
+ │ - runAgentTask │
120
+ │ - runAgentControl │
121
+ │ - readiness gating │
122
+ │ - SSE / sessions │
123
+ │ │
124
+ │ produces: │
125
+ │ ControlRunResult │
126
+ │ SSE events │
127
+ └────────────────────┘
128
+ ```
129
+
130
+ agent-knowledge brings the *what* (scenarios, knowledge, source data).
131
+ agent-runtime brings the *how to run it once* (task lifecycle, control
132
+ loop). agent-eval brings the *measurement and improvement* (campaign,
133
+ report, RL bridge).
134
+
135
+ ## Cross-package contracts (current state, 0.23+)
136
+
137
+ | From → To | Type | What it carries |
138
+ |---|---|---|
139
+ | agent-knowledge → agent-eval | `RunRecord` | (consumed via `runMultiShotOptimization` for knowledge-base optimization) |
140
+ | agent-knowledge → agent-eval | `KnowledgeReadinessReport`, `KnowledgeBundle`, `KnowledgeRequirement` | (re-exported from agent-eval; agent-knowledge populates) |
141
+ | agent-knowledge → agent-eval | `ControlRuntimeConfig<KnowledgeBaseCandidate>` | (knowledge research adapter) |
142
+ | agent-runtime → agent-eval | `runAgentControlLoop`, `scoreKnowledgeReadiness`, `blockingKnowledgeEval` | (consumed; agent-runtime calls these in its task lifecycle) |
143
+ | agent-runtime → agent-eval | `RunRecord`, `TraceStore`, `ControlRunResult`, `ControlStep` | (re-exported types; agent-runtime adapters project into these) |
144
+ | agent-eval ↘ neither package | (no upstream imports) | |
145
+
146
+ ## What's missing for the contracts to be S-tier
147
+
148
+ These are honest gaps, surfaced after the 0.23 audit:
149
+
150
+ 1. **Shared `Scenario` interface.** Each package has its own scenario
151
+ shape. agent-eval will promote a minimal `Scenario` to shared use when
152
+ the second consumer needs it.
153
+ 2. **`agent-knowledge` is pinned at `agent-eval@^0.20.0`.** It misses
154
+ capture-integrity (0.21), the campaign artifact (0.22), and the RL
155
+ bridge (0.23). On its next `pnpm install` the caret will not pick up
156
+ minors (under 0.x semver, `^0.20.0` matches only 0.20.x), and `RunRecord`'s `scenarioId` field (added in 0.23) won't be
157
+ populated by agent-knowledge's existing run records. A planned bump +
158
+ adapter pass closes this.
159
+ 3. **`agent-runtime` is pinned at `agent-eval@^0.20.0`.** Same picture —
160
+ misses capture-integrity, campaign, RL bridge. Specifically the
161
+ `RawProviderSink` integration would let every agent-runtime task
161
+ auto-capture its provider HTTP envelope without wiring it per-consumer.
163
+ 4. **No first-class trace-analyst hook in agent-runtime.** agent-runtime's
164
+ `runAgentTask` can emit traces but doesn't auto-execute the trace
165
+ analyst on completion the way `runEvalCampaign` does. An `onRunComplete`
166
+ hook on agent-runtime would close this — and the implementation is
167
+ one method change.
168
+
169
+ These are tracked as follow-up bumps after agent-eval 0.23 ships.
170
+
171
+ ## Versioning policy
172
+
173
+ Each package versions independently. The minor-version axis carries
174
+ breaking changes; agent-eval's minor versions are tied to the major
175
+ methodological shifts (0.21 = capture integrity; 0.22 = campaign + RL
176
+ bridge experimental; 0.23 = RL bridge primitives, examples).
177
+
178
+ When agent-eval ships a minor, agent-knowledge and agent-runtime get a
179
+ follow-up PR to consume the new surface. The follow-up is tracked as a
180
+ deliberate change, not a passive caret pickup.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tangle-network/agent-eval",
3
- "version": "0.22.0",
3
+ "version": "0.23.1",
4
4
  "description": "Trace-first evaluation infrastructure for agent systems: traces, harnesses, verifier pipelines, judges, datasets, gates, optimization, and reporting.",
5
5
  "homepage": "https://github.com/tangle-network/agent-eval#readme",
6
6
  "repository": {
@@ -34,6 +34,11 @@
34
34
  "import": "./dist/reporting.js",
35
35
  "default": "./dist/reporting.js"
36
36
  },
37
+ "./rl": {
38
+ "types": "./dist/rl.d.ts",
39
+ "import": "./dist/rl.js",
40
+ "default": "./dist/rl.js"
41
+ },
37
42
  "./traces": {
38
43
  "types": "./dist/traces.d.ts",
39
44
  "import": "./dist/traces.js",
@@ -74,6 +79,15 @@
74
79
  "publishConfig": {
75
80
  "access": "public"
76
81
  },
82
+ "scripts": {
83
+ "build": "tsup && pnpm openapi",
84
+ "dev": "tsup --watch",
85
+ "prepare": "pnpm build",
86
+ "test": "vitest run",
87
+ "test:watch": "vitest",
88
+ "typecheck": "tsc --noEmit",
89
+ "openapi": "node dist/cli.js openapi --out dist/openapi.json"
90
+ },
77
91
  "dependencies": {
78
92
  "@asteasolutions/zod-to-openapi": "^8.5.0",
79
93
  "@ax-llm/ax": "^19.0.25",
@@ -89,16 +103,14 @@
89
103
  "typescript": "^5.7.0",
90
104
  "vitest": "^3.0.0"
91
105
  },
106
+ "pnpm": {
107
+ "overrides": {
108
+ "postcss@<8.5.10": "^8.5.10"
109
+ }
110
+ },
92
111
  "engines": {
93
112
  "node": ">=20"
94
113
  },
95
114
  "license": "MIT",
96
- "scripts": {
97
- "build": "tsup && pnpm openapi",
98
- "dev": "tsup --watch",
99
- "test": "vitest run",
100
- "test:watch": "vitest",
101
- "typecheck": "tsc --noEmit",
102
- "openapi": "node dist/cli.js openapi --out dist/openapi.json"
103
- }
104
- }
115
+ "packageManager": "pnpm@10.22.0"
116
+ }