@tangle-network/agent-eval 0.20.8 → 0.20.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,477 @@
1
+ {
2
+ "openapi": "3.1.0",
3
+ "info": {
4
+ "title": "@tangle-network/agent-eval — wire protocol",
5
+ "version": "0.20.9",
6
+ "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
7
+ "contact": {
8
+ "name": "Tangle Network",
9
+ "url": "https://github.com/tangle-network/agent-eval"
10
+ },
11
+ "license": {
12
+ "name": "MIT"
13
+ }
14
+ },
15
+ "servers": [
16
+ {
17
+ "url": "http://localhost:5005",
18
+ "description": "Local agent-eval serve"
19
+ }
20
+ ],
21
+ "components": {
22
+ "schemas": {
23
+ "JudgeRequest": {
24
+ "type": "object",
25
+ "properties": {
26
+ "rubricName": {
27
+ "type": "string",
28
+ "description": "Use a built-in rubric by name. Mutually exclusive with `rubric`."
29
+ },
30
+ "rubric": {
31
+ "$ref": "#/components/schemas/Rubric"
32
+ },
33
+ "content": {
34
+ "type": "string",
35
+ "minLength": 1,
36
+ "description": "The text being judged — a tweet, a blog post, a code snippet, anything stringly."
37
+ },
38
+ "context": {
39
+ "type": "object",
40
+ "additionalProperties": {},
41
+ "description": "Free-form metadata for the rubric to use — analytics, source URL, author, etc. Surfaced to the LLM."
42
+ },
43
+ "model": {
44
+ "type": "string",
45
+ "description": "Override the judge model (default routes via tcloud). e.g. \"claude-opus-4-7\"."
46
+ }
47
+ },
48
+ "required": [
49
+ "content"
50
+ ]
51
+ },
52
+ "Rubric": {
53
+ "type": "object",
54
+ "properties": {
55
+ "name": {
56
+ "type": "string",
57
+ "minLength": 1,
58
+ "description": "Stable name like \"anti-slop\" — used by clients to invoke this rubric."
59
+ },
60
+ "description": {
61
+ "type": "string",
62
+ "minLength": 1,
63
+ "description": "What this rubric measures. Shown in /v1/rubrics listing."
64
+ },
65
+ "systemPrompt": {
66
+ "type": "string",
67
+ "minLength": 1,
68
+ "description": "Instructs the judging LLM. Should explain the persona (e.g. \"senior engineer reviewing voice\"), what to score on, and what to return."
69
+ },
70
+ "dimensions": {
71
+ "type": "array",
72
+ "items": {
73
+ "$ref": "#/components/schemas/RubricDimension"
74
+ },
75
+ "minItems": 1,
76
+ "description": "Scoring axes. The composite score is a weighted sum of these."
77
+ },
78
+ "failureModes": {
79
+ "type": "array",
80
+ "items": {
81
+ "$ref": "#/components/schemas/FailureMode"
82
+ },
83
+ "default": [],
84
+ "description": "Patterns to detect; each detected mode appears in the result.failureModes list."
85
+ },
86
+ "wins": {
87
+ "type": "array",
88
+ "items": {
89
+ "$ref": "#/components/schemas/FailureMode"
90
+ },
91
+ "default": [],
92
+ "description": "Positive patterns; each detected one appears in the result.wins list."
93
+ }
94
+ },
95
+ "required": [
96
+ "name",
97
+ "description",
98
+ "systemPrompt",
99
+ "dimensions"
100
+ ],
101
+ "description": "Inline rubric definition. Mutually exclusive with `rubricName`."
102
+ },
103
+ "RubricDimension": {
104
+ "type": "object",
105
+ "properties": {
106
+ "id": {
107
+ "type": "string",
108
+ "minLength": 1,
109
+ "description": "Short stable id like \"buyer_quality\" — used as the key in scoring output."
110
+ },
111
+ "description": {
112
+ "type": "string",
113
+ "minLength": 1,
114
+ "description": "One-line plain-English meaning. Read by humans reviewing low scores."
115
+ },
116
+ "weight": {
117
+ "type": "number",
118
+ "minimum": 0,
119
+ "default": 1,
120
+ "description": "Relative weight in the composite score. Default 1; 0 disables."
121
+ },
122
+ "min": {
123
+ "type": "number",
124
+ "default": 0,
125
+ "description": "Lower bound of valid score for this dimension."
126
+ },
127
+ "max": {
128
+ "type": "number",
129
+ "default": 1,
130
+ "description": "Upper bound of valid score for this dimension."
131
+ }
132
+ },
133
+ "required": [
134
+ "id",
135
+ "description"
136
+ ]
137
+ },
138
+ "FailureMode": {
139
+ "type": "object",
140
+ "properties": {
141
+ "id": {
142
+ "type": "string",
143
+ "minLength": 1,
144
+ "description": "Short stable id like \"ai-cadence\" — used in detection lists."
145
+ },
146
+ "description": {
147
+ "type": "string",
148
+ "minLength": 1,
149
+ "description": "Plain-English description of the failure pattern."
150
+ }
151
+ },
152
+ "required": [
153
+ "id",
154
+ "description"
155
+ ]
156
+ },
157
+ "JudgeResult": {
158
+ "type": "object",
159
+ "properties": {
160
+ "composite": {
161
+ "type": "number",
162
+ "minimum": 0,
163
+ "maximum": 1,
164
+ "description": "Weighted combination of dimension scores in 0..1. The single number to gate on."
165
+ },
166
+ "dimensions": {
167
+ "type": "object",
168
+ "additionalProperties": {
169
+ "type": "number"
170
+ },
171
+ "description": "Per-dimension score, keyed by RubricDimension.id."
172
+ },
173
+ "failureModes": {
174
+ "type": "array",
175
+ "items": {
176
+ "type": "string"
177
+ },
178
+ "default": [],
179
+ "description": "Failure-mode ids detected in the content (subset of rubric.failureModes ids)."
180
+ },
181
+ "wins": {
182
+ "type": "array",
183
+ "items": {
184
+ "type": "string"
185
+ },
186
+ "default": [],
187
+ "description": "Win ids detected in the content (subset of rubric.wins ids)."
188
+ },
189
+ "rationale": {
190
+ "type": "string",
191
+ "description": "Plain-English explanation of the score. Surfaced to the human reviewer."
192
+ },
193
+ "rubricVersion": {
194
+ "type": "string",
195
+ "description": "Stable hash of the rubric used. Scores are only comparable across runs when this matches."
196
+ },
197
+ "model": {
198
+ "type": "string",
199
+ "description": "Model that produced the judgement, for reproducibility."
200
+ },
201
+ "durationMs": {
202
+ "type": "integer",
203
+ "minimum": 0,
204
+ "description": "End-to-end wall time for this call."
205
+ }
206
+ },
207
+ "required": [
208
+ "composite",
209
+ "dimensions",
210
+ "rationale",
211
+ "rubricVersion",
212
+ "model",
213
+ "durationMs"
214
+ ]
215
+ },
216
+ "ListRubricsResponse": {
217
+ "type": "object",
218
+ "properties": {
219
+ "rubrics": {
220
+ "type": "array",
221
+ "items": {
222
+ "$ref": "#/components/schemas/RubricInfo"
223
+ }
224
+ }
225
+ },
226
+ "required": [
227
+ "rubrics"
228
+ ]
229
+ },
230
+ "RubricInfo": {
231
+ "type": "object",
232
+ "properties": {
233
+ "name": {
234
+ "type": "string",
235
+ "description": "Pass this to /v1/judge as `rubricName`."
236
+ },
237
+ "description": {
238
+ "type": "string",
239
+ "description": "What this rubric measures."
240
+ },
241
+ "dimensions": {
242
+ "type": "array",
243
+ "items": {
244
+ "type": "object",
245
+ "properties": {
246
+ "id": {
247
+ "type": "string"
248
+ },
249
+ "description": {
250
+ "type": "string"
251
+ },
252
+ "weight": {
253
+ "type": "number"
254
+ }
255
+ },
256
+ "required": [
257
+ "id",
258
+ "description",
259
+ "weight"
260
+ ]
261
+ },
262
+ "description": "The scoring axes this rubric uses, with weights."
263
+ },
264
+ "failureModes": {
265
+ "type": "array",
266
+ "items": {
267
+ "type": "string"
268
+ },
269
+ "default": [],
270
+ "description": "Failure-mode ids this rubric detects."
271
+ },
272
+ "rubricVersion": {
273
+ "type": "string",
274
+ "description": "Stable hash — match this to compare scores across runs."
275
+ }
276
+ },
277
+ "required": [
278
+ "name",
279
+ "description",
280
+ "dimensions",
281
+ "rubricVersion"
282
+ ]
283
+ },
284
+ "VersionResponse": {
285
+ "type": "object",
286
+ "properties": {
287
+ "package": {
288
+ "type": "string",
289
+ "description": "Package name (always \"@tangle-network/agent-eval\")."
290
+ },
291
+ "version": {
292
+ "type": "string",
293
+ "description": "Semver of the running server. Match your client to this."
294
+ },
295
+ "wireVersion": {
296
+ "type": "string",
297
+ "description": "Wire-protocol semver. Bumps separately from package version when the schema changes."
298
+ },
299
+ "apiSurface": {
300
+ "type": "array",
301
+ "items": {
302
+ "type": "string"
303
+ },
304
+ "description": "List of supported method names."
305
+ }
306
+ },
307
+ "required": [
308
+ "package",
309
+ "version",
310
+ "wireVersion",
311
+ "apiSurface"
312
+ ]
313
+ },
314
+ "HealthResponse": {
315
+ "type": "object",
316
+ "properties": {
317
+ "status": {
318
+ "type": "string",
319
+ "enum": [
320
+ "ok"
321
+ ]
322
+ },
323
+ "uptimeSec": {
324
+ "type": "number"
325
+ }
326
+ },
327
+ "required": [
328
+ "status",
329
+ "uptimeSec"
330
+ ]
331
+ },
332
+ "ErrorResponse": {
333
+ "type": "object",
334
+ "properties": {
335
+ "error": {
336
+ "type": "object",
337
+ "properties": {
338
+ "code": {
339
+ "type": "string",
340
+ "description": "Machine-readable code: \"validation_error\", \"rubric_not_found\", \"judge_error\"."
341
+ },
342
+ "message": {
343
+ "type": "string",
344
+ "description": "Human-readable message."
345
+ },
346
+ "details": {
347
+ "description": "Optional structured detail."
348
+ }
349
+ },
350
+ "required": [
351
+ "code",
352
+ "message"
353
+ ],
354
+ "description": "Errors are always wrapped in this shape across all endpoints."
355
+ }
356
+ },
357
+ "required": [
358
+ "error"
359
+ ]
360
+ }
361
+ },
362
+ "parameters": {}
363
+ },
364
+ "paths": {
365
+ "/v1/judge": {
366
+ "post": {
367
+ "summary": "Score a piece of content against a rubric",
368
+ "description": "Runs the judging LLM with the named (or inline) rubric and returns dimension scores, detected failure modes, wins, and a composite score in 0..1.",
369
+ "requestBody": {
370
+ "content": {
371
+ "application/json": {
372
+ "schema": {
373
+ "$ref": "#/components/schemas/JudgeRequest"
374
+ }
375
+ }
376
+ }
377
+ },
378
+ "responses": {
379
+ "200": {
380
+ "description": "Successful judgement",
381
+ "content": {
382
+ "application/json": {
383
+ "schema": {
384
+ "$ref": "#/components/schemas/JudgeResult"
385
+ }
386
+ }
387
+ }
388
+ },
389
+ "400": {
390
+ "description": "Validation error",
391
+ "content": {
392
+ "application/json": {
393
+ "schema": {
394
+ "$ref": "#/components/schemas/ErrorResponse"
395
+ }
396
+ }
397
+ }
398
+ },
399
+ "404": {
400
+ "description": "Rubric not found",
401
+ "content": {
402
+ "application/json": {
403
+ "schema": {
404
+ "$ref": "#/components/schemas/ErrorResponse"
405
+ }
406
+ }
407
+ }
408
+ },
409
+ "500": {
410
+ "description": "Judge error",
411
+ "content": {
412
+ "application/json": {
413
+ "schema": {
414
+ "$ref": "#/components/schemas/ErrorResponse"
415
+ }
416
+ }
417
+ }
418
+ }
419
+ }
420
+ }
421
+ },
422
+ "/v1/rubrics": {
423
+ "get": {
424
+ "summary": "List built-in rubrics",
425
+ "description": "Returns every rubric registered server-side, with their dimensions and stable rubricVersion hash.",
426
+ "responses": {
427
+ "200": {
428
+ "description": "Listing",
429
+ "content": {
430
+ "application/json": {
431
+ "schema": {
432
+ "$ref": "#/components/schemas/ListRubricsResponse"
433
+ }
434
+ }
435
+ }
436
+ }
437
+ }
438
+ }
439
+ },
440
+ "/v1/version": {
441
+ "get": {
442
+ "summary": "Server and wire-protocol version",
443
+ "description": "Match your client version to `version`; check `wireVersion` for compatibility.",
444
+ "responses": {
445
+ "200": {
446
+ "description": "Version info",
447
+ "content": {
448
+ "application/json": {
449
+ "schema": {
450
+ "$ref": "#/components/schemas/VersionResponse"
451
+ }
452
+ }
453
+ }
454
+ }
455
+ }
456
+ }
457
+ },
458
+ "/healthz": {
459
+ "get": {
460
+ "summary": "Liveness check",
461
+ "responses": {
462
+ "200": {
463
+ "description": "OK",
464
+ "content": {
465
+ "application/json": {
466
+ "schema": {
467
+ "$ref": "#/components/schemas/HealthResponse"
468
+ }
469
+ }
470
+ }
471
+ }
472
+ }
473
+ }
474
+ }
475
+ },
476
+ "webhooks": {}
477
+ }
package/docs/concepts.md CHANGED
@@ -43,7 +43,7 @@ that can seed memory, replay scenarios, and optimization.
43
43
  | **Trace store** | The append-only log of every span/event during a run. Replay = read this back. |
44
44
  | **Composite score** | A 0..1 number combining all dimensions. The single number you gate on. |
45
45
  | **Rubric version** | A stable hash of the rubric. Scores from different rubric versions are not comparable. |
46
- | **Muffled gate** | A check that should fail loud but silently passes (e.g. `command || true`). The most expensive bug class in this codebase — see SKILL.md. |
46
+ | **Muffled gate** | A check that should fail loud but silently passes (e.g. `command || true`). The most expensive bug class in this codebase. |
47
47
 
48
48
  ## The feedback trajectory loop
49
49
 
@@ -119,7 +119,7 @@ report.blendedScore // 0..1 — weighted aggregate
119
119
  report.layers // per-layer status, findings, duration
120
120
  ```
121
121
 
122
- Two rules that will save you bugs (paid for in real incidents — see SKILL.md):
122
+ Two rules that will save you bugs:
123
123
 
124
124
  1. **Run both gates.** Build gates catch code that doesn't compile; structural assertions catch missing files. Run both unconditionally — they catch orthogonal failures.
125
125
 
@@ -150,6 +150,6 @@ You don't need to build the trace tree by hand. `BuilderSession` does it for you
150
150
  - **Just want to score a string against a rubric?** → [wire-protocol.md](./wire-protocol.md) — HTTP/RPC interface, pluggable from any language.
151
151
  - **Need a reusable driver/worker/evaluator loop?** → [control-runtime.md](./control-runtime.md) — generic runtime plus coding, browser, computer-use, and research integration patterns.
152
152
  - **Want review feedback to become eval/optimization data?** → [feedback-trajectories.md](./feedback-trajectories.md) — turn feedback into datasets, optimizer rows, and preference memory.
153
- - **Building a code-generator eval?** → SKILL.md §Minimal working path the `BuilderSession` recipe.
154
- - **Multi-layer verifier?** → SKILL.md §Verification pipeline.
153
+ - **Building a code-generator eval?** → Start with `BuilderSession`, `SandboxHarness`, and `MultiLayerVerifier`.
154
+ - **Multi-layer verifier?** → Use [control-runtime.md](./control-runtime.md) and `MultiLayerVerifier` for ordered gates with dependencies.
155
155
  - **Adding a new judge or rubric?** → `src/wire/rubrics.ts` for the cross-language path; `src/anti-slop.ts` and `src/judges.ts` for the in-process path.
@@ -2,8 +2,8 @@
2
2
 
3
3
  `agent-eval` owns the contract for deciding whether an agent had enough
4
4
  task-world context to run. It does not own web crawling, connector storage, wiki
5
- pages, credentials, or product policy. Those live in `agent-knowledge` and
6
- product repos.
5
+ pages, credentials, or product policy. Those live in
6
+ `@tangle-network/agent-knowledge` and product repos.
7
7
 
8
8
  The core loop is:
9
9
 
@@ -96,13 +96,13 @@ GET /v1/version
96
96
  ```json
97
97
  {
98
98
  "package": "@tangle-network/agent-eval",
99
- "version": "0.19.0",
99
+ "version": "0.20.9",
100
100
  "wireVersion": "1.0.0",
101
101
  "apiSurface": ["judge", "listRubrics", "version"]
102
102
  }
103
103
  ```
104
104
 
105
- `version` matches the npm/PyPI package version. `wireVersion` bumps independently — only on breaking request/response schema changes. Package versions can differ across releases as long as `wireVersion` matches.
105
+ `version` matches the package version. `wireVersion` bumps independently — only on breaking request/response schema changes. Package versions can differ across releases as long as `wireVersion` matches.
106
106
 
107
107
  ### `GET /healthz` — liveness
108
108
 
@@ -176,7 +176,7 @@ Each invocation is one process — Node startup adds ~500 ms. For more than a fe
176
176
 
177
177
  ## Clients
178
178
 
179
- - **Python**: [`tangle-agent-eval`](../clients/python/README.md) on PyPI. Auto-detects HTTP, falls back to subprocess. Version-locked to npm.
179
+ - **Python**: source lives in [`clients/python`](../clients/python/README.md). Auto-detects HTTP, falls back to subprocess. Version-locked to npm.
180
180
  - **TypeScript**: import directly from `@tangle-network/agent-eval` (no wire round-trip needed in-process).
181
181
  - **Rust / Go / Other**: generate from `dist/openapi.json`. PRs welcome to add an officially-maintained client.
182
182
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tangle-network/agent-eval",
3
- "version": "0.20.8",
3
+ "version": "0.20.9",
4
4
  "description": "Trace-first evaluation infrastructure for agent systems: traces, harnesses, verifier pipelines, judges, datasets, gates, optimization, and reporting.",
5
5
  "homepage": "https://github.com/tangle-network/agent-eval#readme",
6
6
  "repository": {
@@ -33,6 +33,14 @@
33
33
  "types": "./dist/wire/index.d.ts",
34
34
  "import": "./dist/wire/index.js",
35
35
  "default": "./dist/wire/index.js"
36
+ },
37
+ "./benchmarks": {
38
+ "types": "./dist/benchmarks/index.d.ts",
39
+ "import": "./dist/benchmarks/index.js",
40
+ "default": "./dist/benchmarks/index.js"
41
+ },
42
+ "./openapi.json": {
43
+ "default": "./dist/openapi.json"
36
44
  }
37
45
  },
38
46
  "bin": {
@@ -40,26 +48,25 @@
40
48
  },
41
49
  "files": [
42
50
  "dist",
43
- "docs",
44
- "examples"
51
+ "docs"
45
52
  ],
46
53
  "publishConfig": {
47
54
  "access": "public"
48
55
  },
49
56
  "scripts": {
50
- "build": "tsup",
57
+ "build": "tsup && node dist/cli.js openapi --out dist/openapi.json",
51
58
  "dev": "tsup --watch",
52
- "prepare": "tsup",
59
+ "prepare": "pnpm build",
53
60
  "test": "vitest run",
54
61
  "test:watch": "vitest",
55
62
  "typecheck": "tsc --noEmit",
56
- "openapi": "node dist/cli.js openapi --out dist/openapi.json"
63
+ "openapi": "pnpm build"
57
64
  },
58
65
  "dependencies": {
59
66
  "@asteasolutions/zod-to-openapi": "^8.5.0",
60
67
  "@ax-llm/ax": "^19.0.25",
61
68
  "@hono/node-server": "^2.0.0",
62
- "@tangle-network/tcloud": "^0.2.0",
69
+ "@tangle-network/tcloud": "^0.4.6",
63
70
  "hono": "^4.12.15",
64
71
  "zod": "^4.3.6"
65
72
  },
@@ -1,44 +0,0 @@
1
- # Example benchmark wrappers
2
-
3
- Reference implementations of `BenchmarkAdapter` for two public benchmarks. They are NOT bundled — they're intentionally shipped as source you read, copy, and adapt.
4
-
5
- | Wrapper | What it does | Why it's an example, not core |
6
- |---|---|---|
7
- | [`gsm8k/`](./gsm8k) | Exact-match grading on the final numeric answer of GSM8K (Cobbe et al.) | The dataset isn't ours and isn't bundled. The wrapper points to a local JSONL via `AGENT_EVAL_GSM8K_PATH`. |
8
- | [`swebench-lite/`](./swebench-lite) | Pass/fail grading via an external SWE-Bench grader command | The grader is a separate binary; the wrapper stubs the integration via `AGENT_EVAL_SWEBENCH_GRADER_CMD`. |
9
-
10
- The novel benchmark we ship and own — the synthetic routing task — lives in `src/benchmarks/routing/` and IS in the bundle.
11
-
12
- ## Using these wrappers
13
-
14
- Two paths.
15
-
16
- **Option A — read and inline.** Copy the wrapper file into your project. Replace the import paths from `../../../src/benchmarks/types` and `../../../src/run-record` with `@tangle-network/agent-eval`. Done.
17
-
18
- **Option B — import from agent-eval source.** If your project sits in this monorepo (or you've cloned the repo), import directly:
19
-
20
- ```ts
21
- import * as gsm8k from '@tangle-network/agent-eval/examples/benchmarks/gsm8k'
22
- ```
23
-
24
- This requires adding `examples/**/*.ts` to your TypeScript paths. Easier to just copy.
25
-
26
- ## What every BenchmarkAdapter exports
27
-
28
- ```ts
29
- loadDataset(split: 'search' | 'dev' | 'holdout'): Promise<DatasetItem[]>
30
- evaluate(item, response): Promise<{ score: number, raw: Record<string, unknown> }>
31
- assignSplit(itemId: string): 'search' | 'dev' | 'holdout'
32
- ```
33
-
34
- `assignSplit` uses `deterministicSplit(itemId, BENCHMARK_SPLIT_SEED)` — same item gets the same split everywhere. Don't change the seed; it's load-bearing for reproducibility.
35
-
36
- ## Adding a new benchmark
37
-
38
- 1. Create `examples/benchmarks/<your-benchmark>/index.ts`.
39
- 2. Export `loadDataset`, `evaluate`, `assignSplit`. Optionally a typed `Adapter` class.
40
- 3. Use `deterministicSplit` from `@tangle-network/agent-eval` for split assignment.
41
- 4. Fail loud on missing config (env vars, paths). Never default to silent-pass.
42
- 5. Document config requirements in a per-benchmark README.
43
-
44
- If your benchmark is novel and broadly useful, propose moving it into `src/benchmarks/` as core surface (PR welcome). The bar is: novel rubric, reusable across projects, low maintenance burden.