@tangle-network/agent-eval 0.20.7 → 0.20.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Tangle Network
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md CHANGED
@@ -55,9 +55,9 @@ Package responsibilities:
55
55
  optimization, reporting.
56
56
  - Product app: domain state, tools, credentials, UI, storage, deployment, model
57
57
  gateway.
58
- - `agent-runtime`: production agent-loop/session runtime.
59
- - `agent-knowledge`: evidence stores, claim/page synthesis, retrieval, knowledge
60
- readiness implementation.
58
+ - `@tangle-network/agent-runtime`: production agent-loop/session runtime.
59
+ - `@tangle-network/agent-knowledge`: evidence stores, claim/page synthesis,
60
+ retrieval, knowledge readiness implementation.
61
61
 
62
62
  ## Install
63
63
 
@@ -72,10 +72,12 @@ npm i -g @tangle-network/agent-eval
72
72
  agent-eval serve --port 5005
73
73
  ```
74
74
 
75
- Python client:
75
+ Python client source lives in `clients/python`. Until the PyPI package is
76
+ published, install it from the repo:
76
77
 
77
78
  ```sh
78
- pip install tangle-agent-eval
79
+ cd clients/python
80
+ pip install -e .
79
81
  ```
80
82
 
81
83
  ## Core Primitives
@@ -98,7 +100,8 @@ pip install tangle-agent-eval
98
100
 
99
101
  ## Examples
100
102
 
101
- Runnable examples live in [`examples/`](./examples):
103
+ Runnable examples live in the repository's [`examples/`](./examples)
104
+ directory. They are not part of the published npm package.
102
105
 
103
106
  - [`examples/same-sandbox-harness`](./examples/same-sandbox-harness) - run
104
107
  multiple eval passes against the same workspace.
@@ -0,0 +1 @@
1
+ export { B as BENCHMARK_SPLIT_SEED, b as BenchmarkAdapter, c as BenchmarkDatasetItem, d as BenchmarkEvaluation, i as deterministicSplit, l as routing } from '../index-CEWY1rmu.js';
@@ -0,0 +1,12 @@
1
+ import {
2
+ BENCHMARK_SPLIT_SEED,
3
+ deterministicSplit,
4
+ routing_exports
5
+ } from "../chunk-XDGJUIV2.js";
6
+ import "../chunk-PZ5AY32C.js";
7
+ export {
8
+ BENCHMARK_SPLIT_SEED,
9
+ deterministicSplit,
10
+ routing_exports as routing
11
+ };
12
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}
@@ -0,0 +1,219 @@
1
+ import {
2
+ __export
3
+ } from "./chunk-PZ5AY32C.js";
4
+
5
+ // src/benchmarks/index.ts
6
+ var benchmarks_exports = {};
7
+ __export(benchmarks_exports, {
8
+ BENCHMARK_SPLIT_SEED: () => BENCHMARK_SPLIT_SEED,
9
+ deterministicSplit: () => deterministicSplit,
10
+ routing: () => routing_exports
11
+ });
12
+
13
+ // src/benchmarks/types.ts
14
/**
 * 32-bit FNV-1a hash of `input`.
 *
 * Starts from the FNV offset basis (0x811c9dc5); for every UTF-16 code unit
 * it XORs one byte into the state and multiplies by the FNV prime
 * (0x01000193) modulo 2^32.
 *
 * Only the low 8 bits of each code unit are mixed in, so characters that
 * differ only in their high byte (for example "A" and "\u0141") hash to the
 * same value. The truncation is kept as-is: `deterministicSplit` derives
 * split membership from this hash, so changing it would move items with
 * non-ASCII ids between splits.
 *
 * @param input String to hash.
 * @returns Unsigned 32-bit integer in the range [0, 2^32).
 */
function fnv1a32(input) {
  let hash = 0x811c9dc5;
  for (let i = 0; i < input.length; i++) {
    hash ^= input.charCodeAt(i) & 0xff;
    // Math.imul performs the exact 32-bit wrapping multiply that the previous
    // shift-and-add expansion (1 + 2 + 16 + 128 + 256 + 2^24) spelled out by hand.
    hash = Math.imul(hash, 0x01000193);
  }
  // Math.imul yields a signed int32; reinterpret as unsigned for callers.
  return hash >>> 0;
}
22
+ var BENCHMARK_SPLIT_SEED = "agent-eval-v1";
23
/**
 * Assign an item id to the "search", "dev" or "holdout" split.
 *
 * The id is hashed together with `seed` (as `${seed}::${itemId}`) and the
 * 32-bit hash is scaled into [0, 1). Values below 0.6 land in "search",
 * values from 0.6 up to (but excluding) 0.8 land in "dev", and everything
 * else lands in "holdout".
 *
 * @param itemId Item identifier to place.
 * @param seed Seed mixed into the hash; defaults to `BENCHMARK_SPLIT_SEED`.
 * @returns One of "search", "dev" or "holdout".
 */
function deterministicSplit(itemId, seed = BENCHMARK_SPLIT_SEED) {
  // 0x100000000 === 2^32, so the quotient is always strictly below 1.
  const fraction = fnv1a32(`${seed}::${itemId}`) / 0x100000000;
  if (fraction >= 0.8) {
    return "holdout";
  }
  return fraction < 0.6 ? "search" : "dev";
}
30
+
31
+ // src/benchmarks/routing/index.ts
32
+ var routing_exports = {};
33
+ __export(routing_exports, {
34
+ ROUTING_DATASET: () => ROUTING_DATASET,
35
+ RoutingAdapter: () => RoutingAdapter,
36
+ assignSplit: () => assignSplit,
37
+ evaluate: () => evaluate,
38
+ extractRouteTokens: () => extractRouteTokens,
39
+ loadDataset: () => loadDataset
40
+ });
41
+
42
+ // src/benchmarks/routing/dataset.ts
43
/**
 * Synthetic routing dataset: 16 tasks, four in each of the categories
 * "file", "math", "search" and "chat".
 *
 * Each row below is `[id, category, prompt, route, synonyms, hardNegatives]`
 * and is expanded into an object with exactly those keys, in that order:
 *   - `route` is the canonical correct label,
 *   - `synonyms` are alternate labels that also count as correct,
 *   - `hardNegatives` are wrong-but-tempting labels.
 */
var ROUTING_DATASET = [
  ["file_001", "file", "Save the meeting notes to /tmp/notes-2025-04.md as markdown.", "fs.write", ["filesystem.write", "write_file"], ["fs.read", "chat.reply"]],
  ["file_002", "file", "Read the contents of /etc/hosts and summarize the entries.", "fs.read", ["filesystem.read", "read_file"], ["fs.write", "search.web"]],
  ["file_003", "file", "List every Python file under src/ recursively.", "fs.list", ["filesystem.list", "list_files"], ["fs.read", "search.code"]],
  ["file_004", "file", "Delete the cached build at .turbo/cache.", "fs.delete", ["filesystem.delete", "remove_file"], ["fs.write", "fs.list"]],
  ["math_001", "math", "What is the integral of 3x^2 + 2x from 0 to 5?", "math.integral", ["calculator.integral", "math.solve"], ["math.derivative", "chat.reply"]],
  ["math_002", "math", "Compute the derivative of sin(x) * cos(x).", "math.derivative", ["calculator.derivative", "math.solve"], ["math.integral", "math.algebra"]],
  ["math_003", "math", "Solve 2x + 7 = 19 for x.", "math.algebra", ["calculator.algebra", "math.solve"], ["math.derivative", "math.integral"]],
  ["math_004", "math", "What is the prime factorization of 360?", "math.numbertheory", ["calculator.factor", "math.solve"], ["math.algebra", "search.web"]],
  ["search_001", "search", "Find recent papers on agent prompt optimization with held-out promotion gates.", "search.web", ["web.search", "search.papers"], ["search.code", "chat.reply"]],
  ["search_002", "search", "Search the codebase for every call site of `runProposeReview`.", "search.code", ["code.search", "grep"], ["search.web", "fs.read"]],
  ["search_003", "search", "What is the latest release of the Tangle network on GitHub?", "search.web", ["web.search", "github.releases"], ["search.code", "chat.reply"]],
  ["search_004", "search", "Find all TODO comments in the agent-eval src tree.", "search.code", ["code.search", "grep"], ["search.web", "fs.list"]],
  ["chat_001", "chat", "Hi there, how are you doing today?", "chat.reply", ["conversation.reply"], ["search.web", "fs.read"]],
  ["chat_002", "chat", "Please explain the difference between an LLM and a foundation model.", "chat.reply", ["conversation.reply", "qa.answer"], ["search.web", "math.algebra"]],
  ["chat_003", "chat", "Tell me a short joke about distributed systems.", "chat.reply", ["conversation.reply"], ["search.web", "fs.read"]],
  ["chat_004", "chat", "Acknowledge my last message with a thumbs up.", "chat.reply", ["conversation.reply", "react"], ["fs.write", "search.web"]]
].map(([id, category, prompt, route, synonyms, hardNegatives]) => ({
  id,
  category,
  prompt,
  route,
  synonyms,
  hardNegatives
}));
173
+
174
+ // src/benchmarks/routing/index.ts
175
/**
 * Benchmark adapter for the bundled synthetic routing dataset.
 *
 * Exposes the three adapter operations: `loadDataset`, `evaluate` and
 * `assignSplit`.
 */
var RoutingAdapter = class {
  /**
   * Return the dataset items assigned to `split`.
   *
   * @param split Split tag to select ("search", "dev" or "holdout").
   * @returns Promise of `{ id, payload }` wrappers around the matching
   *   `ROUTING_DATASET` entries.
   */
  async loadDataset(split) {
    return ROUTING_DATASET.map((item) => ({ id: item.id, payload: item })).filter((it) => assignSplitImpl(it.id) === split);
  }
  /**
   * Score one response against one routing item.
   *
   * Matching is case-insensitive against the canonical route plus the
   * declared synonyms. Dotted `category.action` tokens are checked first.
   * When none of them matches, declared labels that contain no dot (for
   * example `grep` or `write_file`) are looked up as whole words, because
   * the dotted-token extractor can never produce them. Responses that
   * already matched via a dotted token are scored exactly as before.
   *
   * @param item Dataset item whose `payload` carries `route`, `synonyms`,
   *   `hardNegatives` and `category`.
   * @param response Raw model response text.
   * @returns Promise of `{ score, raw }`: `score` is 1 when a correct label
   *   was found and 0 otherwise; `raw` reports the first dotted token, the
   *   matched label, and whether (and which) hard negative appeared.
   */
  async evaluate(item, response) {
    const { route, synonyms, hardNegatives, category } = item.payload;
    const correct = new Set([route, ...synonyms].map((s) => s.toLowerCase()));
    const hardNeg = new Set(hardNegatives.map((s) => s.toLowerCase()));
    const tokens = extractRouteTokens(response);
    const bareWords = extractBareWords(response);
    // Dotted tokens take priority; bare words are only a fallback so that
    // previously-matching responses keep the same matched label.
    const firstIn = (labels) => tokens.find((t) => labels.has(t.toLowerCase())) ?? bareWords.find((w) => labels.has(w.toLowerCase())) ?? null;
    const firstMatch = firstIn(correct);
    const firstHardNeg = firstIn(hardNeg);
    return {
      score: firstMatch ? 1 : 0,
      raw: {
        firstToken: tokens[0] ?? null,
        matchedRoute: firstMatch,
        hitHardNegative: Boolean(firstHardNeg),
        hardNegativeRoute: firstHardNeg,
        category
      }
    };
  }
  /**
   * Split assignment for a routing item id.
   *
   * @param itemId Dataset-local item id.
   * @returns The split tag produced by `assignSplitImpl`.
   */
  assignSplit(itemId) {
    return assignSplitImpl(itemId);
  }
};
/**
 * Namespace the id with `routing::` before hashing it into a split.
 *
 * @param itemId Dataset-local item id.
 * @returns The split tag returned by `deterministicSplit`.
 */
function assignSplitImpl(itemId) {
  return deterministicSplit(`routing::${itemId}`);
}
/**
 * Pull route-shaped tokens (`category.action`, e.g. `fs.write`) out of a
 * response, in order of appearance.
 *
 * @param response Raw model response text.
 * @returns All dotted tokens found, or an empty array when there are none.
 */
function extractRouteTokens(response) {
  const matches = response.match(/[a-z][a-z0-9_]*\.[a-z][a-z0-9_]*/gi);
  return matches ?? [];
}
/**
 * Collect identifier-like words that are NOT part of a dotted route token.
 *
 * The pattern consumes a dotted token as one unit where present, so the
 * halves of `fs.write` are never reported as separate bare words.
 *
 * @param response Raw model response text.
 * @returns Bare words in order of appearance (possibly empty).
 */
function extractBareWords(response) {
  const words = response.match(/[a-z][a-z0-9_]*(?:\.[a-z][a-z0-9_]*)?/gi) ?? [];
  return words.filter((w) => !w.includes("."));
}
// Module-level singleton whose bound methods form the function-style API.
var adapter = new RoutingAdapter();
var loadDataset = adapter.loadDataset.bind(adapter);
var evaluate = adapter.evaluate.bind(adapter);
var assignSplit = adapter.assignSplit.bind(adapter);
212
+
213
+ export {
214
+ BENCHMARK_SPLIT_SEED,
215
+ deterministicSplit,
216
+ routing_exports,
217
+ benchmarks_exports
218
+ };
219
+ //# sourceMappingURL=chunk-XDGJUIV2.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/benchmarks/index.ts","../src/benchmarks/types.ts","../src/benchmarks/routing/index.ts","../src/benchmarks/routing/dataset.ts"],"sourcesContent":["/**\n * Reference benchmark wrappers — entry point.\n *\n * Core surface (exported here):\n * - The `BenchmarkAdapter` contract.\n * - `deterministicSplit` + `BENCHMARK_SPLIT_SEED` for split assignment.\n * - `routing` — synthetic 16-task router benchmark. The only novel\n * benchmark we built; ships in the package.\n *\n * Example wrappers (under `examples/benchmarks/`, NOT in the bundle):\n * - `gsm8k` — exact-match math reasoning (HF mirror, dataset\n * not bundled).\n * - `swebench-lite` — 30-instance SWE-Bench subset (stub; needs an\n * external grader).\n *\n * The example wrappers are reference implementations of `BenchmarkAdapter`.\n * Read them, copy them, adapt them. They're intentionally not in the main\n * entry — every team will configure them differently.\n */\n\nexport type {\n BenchmarkAdapter,\n BenchmarkDatasetItem,\n BenchmarkEvaluation,\n} from './types'\nexport { deterministicSplit, BENCHMARK_SPLIT_SEED } from './types'\n\nexport * as routing from './routing/index'\n","/**\n * Shared types for the reference benchmark wrappers under\n * `src/benchmarks/`. Each wrapper exports the three functions in\n * `BenchmarkAdapter` plus its own typed `DatasetItem` shape.\n */\n\nimport type { RunSplitTag } from '../run-record'\n\nexport interface BenchmarkDatasetItem<TPayload = unknown> {\n /** Stable dataset-local item id (used for split assignment + paper\n * references). Unique within a benchmark. */\n id: string\n /** Free-form payload. Each benchmark defines its own shape. */\n payload: TPayload\n}\n\nexport interface BenchmarkEvaluation {\n /** [0, 1] score for the response on this item. Exact-match\n * benchmarks use 0/1; partial-credit benchmarks may return\n * fractional values. */\n score: number\n /** Optional bag of raw scoring signals — e.g. 
parsed numeric\n * answer, regex match, judge sub-scores. */\n raw: Record<string, unknown>\n}\n\n/** Common signature implemented by every adapter under `src/benchmarks/*`. */\n// `TPayload` is the per-item payload type; `_TItem` is preserved for\n// downstream type-narrowing extensions (a richer `BenchmarkDatasetItem`\n// subclass that adds e.g. provenance metadata) but is intentionally\n// unused here. `noUnusedLocals` requires the leading underscore.\nexport interface BenchmarkAdapter<_TItem = unknown, TPayload = unknown> {\n /** Load the dataset for the given split. May hit the network on\n * first call but should be cache-friendly. Adapters that don't\n * ship the dataset itself MUST throw a clearly-marked error\n * pointing the caller at the loader script. */\n loadDataset(split: RunSplitTag): Promise<BenchmarkDatasetItem<TPayload>[]>\n /** Score a single response. Pure with respect to the inputs. */\n evaluate(item: BenchmarkDatasetItem<TPayload>, response: string): Promise<BenchmarkEvaluation>\n /** Deterministic split assignment via item id hashing. The\n * fraction of items in each split is implementation-defined but\n * MUST be stable across processes and platforms. */\n assignSplit(itemId: string): RunSplitTag\n}\n\n// ── Deterministic split assignment ───────────────────────────────────\n\n/**\n * 32-bit FNV-1a hash. Stable, allocation-free, deterministic across\n * runtimes. We use it to assign items to splits rather than depending\n * on a polyfilled crypto.subtle path.\n */\nfunction fnv1a32(input: string): number {\n let h = 0x811c9dc5\n for (let i = 0; i < input.length; i++) {\n h ^= input.charCodeAt(i) & 0xff\n h = (h + ((h << 1) + (h << 4) + (h << 7) + (h << 8) + (h << 24))) >>> 0\n }\n return h >>> 0\n}\n\n/** Split-assignment seed shared across all benchmarks. Bumping this\n * value reshuffles every split — do NOT do that lightly. 
*/\nexport const BENCHMARK_SPLIT_SEED = 'agent-eval-v1'\n\n/**\n * Assign an item id to one of `'search' | 'dev' | 'holdout'` using a\n * stable 32-bit hash of `${seed}::${id}`. Default proportions:\n *\n * search: 60% (optimization-readable)\n * dev: 20% (held-out for tuning, leak-on-purpose during dev)\n * holdout:20% (paper-grade held-out, gated reads)\n */\nexport function deterministicSplit(\n itemId: string,\n seed: string = BENCHMARK_SPLIT_SEED,\n): RunSplitTag {\n const h = fnv1a32(`${seed}::${itemId}`)\n const pos = h / 0x100000000\n if (pos < 0.6) return 'search'\n if (pos < 0.8) return 'dev'\n return 'holdout'\n}\n","/**\n * Routing benchmark — synthetic, dependency-free, ships in the\n * package. 16 cross-category items in `dataset.ts`. See\n * `routing/README.md` for the format.\n *\n * `evaluate` does case-insensitive exact match against the canonical\n * route plus declared synonyms. The first valid route token in the\n * response wins; everything else is ignored. Wrong answers also\n * report whether they hit a hard negative — useful when triaging\n * \"always picks the popular route\" failure modes.\n */\n\nimport type {\n BenchmarkAdapter,\n BenchmarkDatasetItem,\n BenchmarkEvaluation,\n} from '../types'\nimport { deterministicSplit } from '../types'\nimport type { RunSplitTag } from '../../run-record'\nimport { ROUTING_DATASET, type RoutingItem } from './dataset'\n\nexport type { RoutingItem }\nexport type RoutingPayload = RoutingItem\nexport type RoutingDatasetItem = BenchmarkDatasetItem<RoutingPayload>\n\nclass RoutingAdapter\n implements BenchmarkAdapter<RoutingDatasetItem, RoutingPayload>\n{\n async loadDataset(split: RunSplitTag): Promise<RoutingDatasetItem[]> {\n return ROUTING_DATASET\n .map((item) => ({ id: item.id, payload: item }))\n .filter((it) => assignSplitImpl(it.id) === split)\n }\n\n async evaluate(\n item: RoutingDatasetItem,\n response: string,\n ): Promise<BenchmarkEvaluation> {\n const tokens = extractRouteTokens(response)\n 
const correct = new Set<string>([item.payload.route, ...item.payload.synonyms].map((s) => s.toLowerCase()))\n const hardNeg = new Set<string>(item.payload.hardNegatives.map((s) => s.toLowerCase()))\n const firstMatch = tokens.find((t) => correct.has(t.toLowerCase())) ?? null\n const firstHardNeg = tokens.find((t) => hardNeg.has(t.toLowerCase())) ?? null\n const score = firstMatch ? 1 : 0\n return {\n score,\n raw: {\n firstToken: tokens[0] ?? null,\n matchedRoute: firstMatch,\n hitHardNegative: Boolean(firstHardNeg),\n hardNegativeRoute: firstHardNeg,\n category: item.payload.category,\n },\n }\n }\n\n assignSplit(itemId: string): RunSplitTag {\n return assignSplitImpl(itemId)\n }\n}\n\nfunction assignSplitImpl(itemId: string): RunSplitTag {\n return deterministicSplit(`routing::${itemId}`)\n}\n\n/**\n * Pull route-shaped tokens out of a model response. Routes look like\n * `category.action` (`fs.write`, `chat.reply`). Bare alphanumerics\n * are not routes, but `category.action` patterns are robust to most\n * model wrappers (JSON output, prose explanations, code fences).\n */\nexport function extractRouteTokens(response: string): string[] {\n const matches = response.match(/[a-z][a-z0-9_]*\\.[a-z][a-z0-9_]*/gi)\n return matches ?? []\n}\n\nconst adapter = new RoutingAdapter()\n\nexport const loadDataset = adapter.loadDataset.bind(adapter)\nexport const evaluate = adapter.evaluate.bind(adapter)\nexport const assignSplit = adapter.assignSplit.bind(adapter)\nexport { RoutingAdapter, ROUTING_DATASET }\n","/**\n * Synthetic routing dataset. 16 tasks across 4 categories. 
Used as a\n * deterministic, dependency-free benchmark for any router that maps a\n * natural-language request to one of a fixed set of route labels.\n *\n * Format (see `routing/README.md` for prose):\n *\n * {\n * id: stable per-task ID (matches across processes).\n * category: one of the four route labels.\n * prompt: the user-facing request the router must classify.\n * route: the ground-truth route the router should pick.\n * synonyms: other strings that count as a correct answer.\n * hardNegatives:close-but-wrong route labels — used to detect the\n * \"always picks the popular route\" failure mode.\n * }\n *\n * The four categories are intentionally cross-domain (file ops,\n * math, search, conversation) so a router that collapses to one\n * category is easy to spot.\n */\n\nexport interface RoutingItem {\n id: string\n category: 'file' | 'math' | 'search' | 'chat'\n prompt: string\n /** Canonical correct route label. */\n route: string\n /** Alternate route labels that also count as correct. */\n synonyms: string[]\n /** Wrong-but-tempting route labels (for analysis, not grading). 
*/\n hardNegatives: string[]\n}\n\nexport const ROUTING_DATASET: RoutingItem[] = [\n {\n id: 'file_001',\n category: 'file',\n prompt: 'Save the meeting notes to /tmp/notes-2025-04.md as markdown.',\n route: 'fs.write',\n synonyms: ['filesystem.write', 'write_file'],\n hardNegatives: ['fs.read', 'chat.reply'],\n },\n {\n id: 'file_002',\n category: 'file',\n prompt: 'Read the contents of /etc/hosts and summarize the entries.',\n route: 'fs.read',\n synonyms: ['filesystem.read', 'read_file'],\n hardNegatives: ['fs.write', 'search.web'],\n },\n {\n id: 'file_003',\n category: 'file',\n prompt: 'List every Python file under src/ recursively.',\n route: 'fs.list',\n synonyms: ['filesystem.list', 'list_files'],\n hardNegatives: ['fs.read', 'search.code'],\n },\n {\n id: 'file_004',\n category: 'file',\n prompt: 'Delete the cached build at .turbo/cache.',\n route: 'fs.delete',\n synonyms: ['filesystem.delete', 'remove_file'],\n hardNegatives: ['fs.write', 'fs.list'],\n },\n {\n id: 'math_001',\n category: 'math',\n prompt: 'What is the integral of 3x^2 + 2x from 0 to 5?',\n route: 'math.integral',\n synonyms: ['calculator.integral', 'math.solve'],\n hardNegatives: ['math.derivative', 'chat.reply'],\n },\n {\n id: 'math_002',\n category: 'math',\n prompt: 'Compute the derivative of sin(x) * cos(x).',\n route: 'math.derivative',\n synonyms: ['calculator.derivative', 'math.solve'],\n hardNegatives: ['math.integral', 'math.algebra'],\n },\n {\n id: 'math_003',\n category: 'math',\n prompt: 'Solve 2x + 7 = 19 for x.',\n route: 'math.algebra',\n synonyms: ['calculator.algebra', 'math.solve'],\n hardNegatives: ['math.derivative', 'math.integral'],\n },\n {\n id: 'math_004',\n category: 'math',\n prompt: 'What is the prime factorization of 360?',\n route: 'math.numbertheory',\n synonyms: ['calculator.factor', 'math.solve'],\n hardNegatives: ['math.algebra', 'search.web'],\n },\n {\n id: 'search_001',\n category: 'search',\n prompt: 'Find recent papers on agent prompt 
optimization with held-out promotion gates.',\n route: 'search.web',\n synonyms: ['web.search', 'search.papers'],\n hardNegatives: ['search.code', 'chat.reply'],\n },\n {\n id: 'search_002',\n category: 'search',\n prompt: 'Search the codebase for every call site of `runProposeReview`.',\n route: 'search.code',\n synonyms: ['code.search', 'grep'],\n hardNegatives: ['search.web', 'fs.read'],\n },\n {\n id: 'search_003',\n category: 'search',\n prompt: 'What is the latest release of the Tangle network on GitHub?',\n route: 'search.web',\n synonyms: ['web.search', 'github.releases'],\n hardNegatives: ['search.code', 'chat.reply'],\n },\n {\n id: 'search_004',\n category: 'search',\n prompt: 'Find all TODO comments in the agent-eval src tree.',\n route: 'search.code',\n synonyms: ['code.search', 'grep'],\n hardNegatives: ['search.web', 'fs.list'],\n },\n {\n id: 'chat_001',\n category: 'chat',\n prompt: 'Hi there, how are you doing today?',\n route: 'chat.reply',\n synonyms: ['conversation.reply'],\n hardNegatives: ['search.web', 'fs.read'],\n },\n {\n id: 'chat_002',\n category: 'chat',\n prompt: 'Please explain the difference between an LLM and a foundation model.',\n route: 'chat.reply',\n synonyms: ['conversation.reply', 'qa.answer'],\n hardNegatives: ['search.web', 'math.algebra'],\n },\n {\n id: 'chat_003',\n category: 'chat',\n prompt: 'Tell me a short joke about distributed systems.',\n route: 'chat.reply',\n synonyms: ['conversation.reply'],\n hardNegatives: ['search.web', 'fs.read'],\n },\n {\n id: 'chat_004',\n category: 'chat',\n prompt: 'Acknowledge my last message with a thumbs up.',\n route: 'chat.reply',\n synonyms: ['conversation.reply', 'react'],\n hardNegatives: ['fs.write', 'search.web'],\n 
},\n]\n"],"mappings":";;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACoDA,SAAS,QAAQ,OAAuB;AACtC,MAAI,IAAI;AACR,WAAS,IAAI,GAAG,IAAI,MAAM,QAAQ,KAAK;AACrC,SAAK,MAAM,WAAW,CAAC,IAAI;AAC3B,QAAK,MAAM,KAAK,MAAM,KAAK,MAAM,KAAK,MAAM,KAAK,MAAM,KAAK,SAAU;AAAA,EACxE;AACA,SAAO,MAAM;AACf;AAIO,IAAM,uBAAuB;AAU7B,SAAS,mBACd,QACA,OAAe,sBACF;AACb,QAAM,IAAI,QAAQ,GAAG,IAAI,KAAK,MAAM,EAAE;AACtC,QAAM,MAAM,IAAI;AAChB,MAAI,MAAM,IAAK,QAAO;AACtB,MAAI,MAAM,IAAK,QAAO;AACtB,SAAO;AACT;;;AClFA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACkCO,IAAM,kBAAiC;AAAA,EAC5C;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,oBAAoB,YAAY;AAAA,IAC3C,eAAe,CAAC,WAAW,YAAY;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,mBAAmB,WAAW;AAAA,IACzC,eAAe,CAAC,YAAY,YAAY;AAAA,EAC1C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,mBAAmB,YAAY;AAAA,IAC1C,eAAe,CAAC,WAAW,aAAa;AAAA,EAC1C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,qBAAqB,aAAa;AAAA,IAC7C,eAAe,CAAC,YAAY,SAAS;AAAA,EACvC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,uBAAuB,YAAY;AAAA,IAC9C,eAAe,CAAC,mBAAmB,YAAY;AAAA,EACjD;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,yBAAyB,YAAY;AAAA,IAChD,eAAe,CAAC,iBAAiB,cAAc;AAAA,EACjD;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,sBAAsB,YAAY;AAAA,IAC7C,eAAe,CAAC,mBAAmB,eAAe;AAAA,EACpD;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,qBAAqB,YAAY;AAAA,IAC5C,eAAe,CAAC,gBAAgB,YAAY;AAAA,EAC9C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,cAAc,eAAe;AAAA,IACxC,eAAe,CAAC,eAAe,YAAY;AAAA,EAC7C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,eAAe,MAAM;AAAA,IAChC,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAA
U;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,cAAc,iBAAiB;AAAA,IAC1C,eAAe,CAAC,eAAe,YAAY;AAAA,EAC7C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,eAAe,MAAM;AAAA,IAChC,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,oBAAoB;AAAA,IAC/B,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,sBAAsB,WAAW;AAAA,IAC5C,eAAe,CAAC,cAAc,cAAc;AAAA,EAC9C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,oBAAoB;AAAA,IAC/B,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,sBAAsB,OAAO;AAAA,IACxC,eAAe,CAAC,YAAY,YAAY;AAAA,EAC1C;AACF;;;AD1IA,IAAM,iBAAN,MAEA;AAAA,EACE,MAAM,YAAY,OAAmD;AACnE,WAAO,gBACJ,IAAI,CAAC,UAAU,EAAE,IAAI,KAAK,IAAI,SAAS,KAAK,EAAE,EAC9C,OAAO,CAAC,OAAO,gBAAgB,GAAG,EAAE,MAAM,KAAK;AAAA,EACpD;AAAA,EAEA,MAAM,SACJ,MACA,UAC8B;AAC9B,UAAM,SAAS,mBAAmB,QAAQ;AAC1C,UAAM,UAAU,IAAI,IAAY,CAAC,KAAK,QAAQ,OAAO,GAAG,KAAK,QAAQ,QAAQ,EAAE,IAAI,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;AAC1G,UAAM,UAAU,IAAI,IAAY,KAAK,QAAQ,cAAc,IAAI,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;AACtF,UAAM,aAAa,OAAO,KAAK,CAAC,MAAM,QAAQ,IAAI,EAAE,YAAY,CAAC,CAAC,KAAK;AACvE,UAAM,eAAe,OAAO,KAAK,CAAC,MAAM,QAAQ,IAAI,EAAE,YAAY,CAAC,CAAC,KAAK;AACzE,UAAM,QAAQ,aAAa,IAAI;AAC/B,WAAO;AAAA,MACL;AAAA,MACA,KAAK;AAAA,QACH,YAAY,OAAO,CAAC,KAAK;AAAA,QACzB,cAAc;AAAA,QACd,iBAAiB,QAAQ,YAAY;AAAA,QACrC,mBAAmB;AAAA,QACnB,UAAU,KAAK,QAAQ;AAAA,MACzB;AAAA,IACF;AAAA,EACF;AAAA,EAEA,YAAY,QAA6B;AACvC,WAAO,gBAAgB,MAAM;AAAA,EAC/B;AACF;AAEA,SAAS,gBAAgB,QAA6B;AACpD,SAAO,mBAAmB,YAAY,MAAM,EAAE;AAChD;AAQO,SAAS,mBAAmB,UAA4B;AAC7D,QAAM,UAAU,SAAS,MAAM,oCAAoC;AACnE,SAAO,WAAW,CAAC;AACrB;AAEA,IAAM,UAAU,IAAI,eAAe;AAE5B,IAAM,cAAc,QAAQ,YAAY,KAAK,OAAO;AACpD,IAAM,WAAW,QAAQ,SAAS,KAAK,OAAO;AAC9C,IAAM,cAAc,QAAQ,YAAY,KAAK,OAAO;","names":[]}
@@ -0,0 +1,290 @@
1
+ /**
2
+ * Paper-grade RunRecord schema + runtime validator.
3
+ *
4
+ * Every run that participates in a promotion gate, paper table, or
5
+ * researcher loop SHOULD be recorded as a `RunRecord`. The mandatory
6
+ * fields are exactly those the paper "Two Loops, Three Roles" requires
7
+ * for reproducibility: who/what/when/cost/seed/hash, plus the search vs
8
+ * holdout split tag and either a `searchScore` or a `holdoutScore`.
9
+ *
10
+ * This is intentionally NOT a replacement for the rich `Run` /
11
+ * `ProposeReviewReport` / `ScenarioResult` types already in the
12
+ * package. Those are runtime structures with full provenance. A
13
+ * `RunRecord` is the analysis-time projection — the JSON-friendly
14
+ * row you'd put in a parquet file or paste into a notebook.
15
+ *
16
+ * Validate at the boundary:
17
+ *
18
+ * const rec = validateRunRecord(rawJson) // throws on missing
19
+ * const ok = isRunRecord(rawJson) // boolean check
20
+ * const rec = parseRunRecordSafe(rawJson) // { ok, value | error }
21
+ *
22
+ * The validator runs in pure TS — zod is intentionally NOT a
23
+ * dependency. Round-trip tested in `tests/run-record.test.ts`.
24
+ */
25
+ /** Search/dev/holdout split tag. 'search' is the paper-grade alias for the
26
+ * combined train+test pool that the optimizer is allowed to read. */
27
+ type RunSplitTag = 'search' | 'dev' | 'holdout';
28
+ interface RunTokenUsage {
29
+ input: number;
30
+ output: number;
31
+ cached?: number;
32
+ }
33
+ interface RunJudgeMetadata {
34
+ model: string;
35
+ promptVersion: string;
36
+ /** [0,1] confidence the judge declared. Constant judge confidence
37
+ * across many runs is a fallback signal (see `canary.ts`). */
38
+ confidence: number;
39
+ /** True if the judge degraded to a fallback path (rules-only,
40
+ * prior-call cache, etc.). The canary uses this to alert. */
41
+ fallback: boolean;
42
+ }
43
+ interface RunOutcome {
44
+ /** Score on the search/optimization split. Optional because a
45
+ * holdout-only evaluation only fills `holdoutScore`. */
46
+ searchScore?: number;
47
+ /** Score on the held-out split. Optional because a search-only run
48
+ * only fills `searchScore`. At least one must be present. */
49
+ holdoutScore?: number;
50
+ /** Bag of any other metric the run produced — judge dimensions,
51
+ * pass/fail counters, latency stats, etc. Numeric only — keeps
52
+ * reporters honest. */
53
+ raw: Record<string, number>;
54
+ }
55
+ /**
56
+ * Mandatory paper-grade fields for a single evaluation run. Optional
57
+ * fields are extension points; mandatory fields throw if missing.
58
+ *
59
+ * Hash discipline:
60
+ * - `promptHash` is the sha256 of the EFFECTIVE prompt sent to the
61
+ * model (after any steering bundle merge).
62
+ * - `configHash` is the sha256 of the effective run config (model,
63
+ * temperature, tools, judges, splits). The pair (promptHash,
64
+ * configHash) uniquely identifies an experimental cell.
65
+ *
66
+ * Model snapshot discipline:
67
+ * - `model` MUST encode a snapshot version. Bare aliases like
68
+ * `claude-sonnet-4` or `gpt-4o` are banned — they remap silently.
69
+ * Use `claude-sonnet-4-6@2025-04-15` or `gpt-4o-2024-11-20`.
70
+ */
71
+ interface RunRecord {
72
+ /** UUID for the run. */
73
+ runId: string;
74
+ /** Logical experiment grouping (a treatment vs a baseline within
75
+ * the same sweep should share `experimentId`). */
76
+ experimentId: string;
77
+ /** Stable identifier for the candidate (variant) being run. The
78
+ * promotion gate compares two `candidateId`s on matched items. */
79
+ candidateId: string;
80
+ /** RNG seed for the run. Always recorded — silent re-seeding is
81
+ * the most common cause of non-reproducible numbers. */
82
+ seed: number;
83
+ /** Model identifier WITH snapshot version. */
84
+ model: string;
85
+ /** sha256 of the effective prompt (post-steering). */
86
+ promptHash: string;
87
+ /** sha256 of the effective config. */
88
+ configHash: string;
89
+ /** Git SHA the harness was run from. */
90
+ commitSha: string;
91
+ /** End-to-end wall-clock duration in milliseconds. */
92
+ wallMs: number;
93
+ /** Time spent queued before execution started, if known. */
94
+ queueMs?: number;
95
+ /** Total USD cost. Mandatory — runs without a cost number are
96
+ * unbounded by definition and must not be admitted into the gate. */
97
+ costUsd: number;
98
+ /** Token usage breakdown. */
99
+ tokenUsage: RunTokenUsage;
100
+ /** Judge-side metadata, if a judge was used. */
101
+ judgeMetadata?: RunJudgeMetadata;
102
+ /** Per-split scores + raw bag. */
103
+ outcome: RunOutcome;
104
+ /** Categorical failure tag, when the run failed and the harness
105
+ * classified it. Free-form string; standard tags live in
106
+ * `failure-taxonomy.ts`. */
107
+ failureMode?: string;
108
+ /** Which split this run was drawn from. */
109
+ splitTag: RunSplitTag;
110
+ }
111
+ declare class RunRecordValidationError extends Error { // Error type thrown by `validateRunRecord` and carried in the `error` arm of `parseRunRecordSafe`.
112
+ readonly path: string; // NOTE(review): presumably locates the offending field — confirm against the implementation.
113
+ constructor(message: string, path?: string); // `path` is optional for callers; the value used when it is omitted is not visible here.
114
+ }
115
+ /**
116
+ * Strict validator. Throws `RunRecordValidationError` on the first
117
+ * missing or wrongly-typed field. Returns the input cast to
118
+ * `RunRecord` on success — the validator does not coerce.
+ *
+ * @param input - Untyped value to check.
+ * @returns The same `input`, typed as `RunRecord`.
+ * @throws RunRecordValidationError on the first missing or wrongly-typed field.
119
+ */
120
+ declare function validateRunRecord(input: unknown): RunRecord;
121
+ /** Boolean validator — a type guard that narrows `input` to `RunRecord`; convenience for filtering arrays. */
122
+ declare function isRunRecord(input: unknown): input is RunRecord;
123
+ /** Non-throwing validator — returns a discriminated union tagged by `ok`: `{ ok: true, value }` on success, `{ ok: false, error }` on failure. */
124
+ declare function parseRunRecordSafe(input: unknown): {
125
+ ok: true;
126
+ value: RunRecord;
127
+ } | {
128
+ ok: false;
129
+ error: RunRecordValidationError;
130
+ };
131
+ /** Round-trip helper — runs `JSON.parse(JSON.stringify(record))`, then validates the result. NOTE(review): presumably fails the same way as `validateRunRecord` — confirm. */
132
+ declare function roundTripRunRecord(record: RunRecord): RunRecord;
133
+
134
+ /**
135
+ * Shared types for the reference benchmark wrappers under
136
+ * `src/benchmarks/`. Each wrapper exports the three functions in
137
+ * `BenchmarkAdapter` plus its own typed `DatasetItem` shape.
138
+ */
139
+
140
+ interface BenchmarkDatasetItem<TPayload = unknown> {
141
+ /** Stable dataset-local item id (used for split assignment + paper
142
+ * references). Unique within a benchmark. */
143
+ id: string;
144
+ /** Free-form payload, typed by `TPayload` (defaults to `unknown`). Each benchmark defines its own shape. */
145
+ payload: TPayload;
146
+ }
147
+ interface BenchmarkEvaluation {
148
+ /** [0, 1] score for the response on this item. Exact-match
149
+ * benchmarks use 0/1; partial-credit benchmarks may return
150
+ * fractional values. */
151
+ score: number;
152
+ /** Bag of raw scoring signals — e.g. parsed numeric
153
+ * answer, regex match, judge sub-scores. The property itself is required by this type; only its contents are free-form. */
154
+ raw: Record<string, unknown>;
155
+ }
156
+ /** Common signature implemented by every adapter under `src/benchmarks/*`. `_TItem` is not referenced by any member below; items are typed through `TPayload`. */
157
+ interface BenchmarkAdapter<_TItem = unknown, TPayload = unknown> {
158
+ /** Load the dataset for the given split. May hit the network on
159
+ * first call but should be cache-friendly. Adapters that don't
160
+ * ship the dataset itself MUST throw a clearly-marked error
161
+ * pointing the caller at the loader script. */
162
+ loadDataset(split: RunSplitTag): Promise<BenchmarkDatasetItem<TPayload>[]>;
163
+ /** Score a single response against `item`. Pure with respect to the inputs. */
164
+ evaluate(item: BenchmarkDatasetItem<TPayload>, response: string): Promise<BenchmarkEvaluation>;
165
+ /** Deterministic split assignment via item id hashing. The
166
+ * fraction of items in each split is implementation-defined but
167
+ * MUST be stable across processes and platforms. */
168
+ assignSplit(itemId: string): RunSplitTag;
169
+ }
170
+ /** Split-assignment seed shared across all benchmarks. Bumping this
171
+ * value reshuffles every split — do NOT do that lightly. Declared as the literal `"agent-eval-v1"`. NOTE(review): presumably the default `seed` of `deterministicSplit` — confirm. */
172
+ declare const BENCHMARK_SPLIT_SEED = "agent-eval-v1";
173
+ /**
174
+ * Assign an item id to one of `'search' | 'dev' | 'holdout'` using a
175
+ * stable 32-bit hash of `${seed}::${id}`. Default proportions:
176
+ *
177
+ * search: 60% (optimization-readable)
178
+ * dev: 20% (held-out for tuning, leak-on-purpose during dev)
179
+ * holdout: 20% (paper-grade held-out, gated reads)
+ *
+ * @param itemId - Dataset-local item id to place in a split.
+ * @param seed - Optional hash seed. The default is not visible in this declaration.
+ * @returns The split tag assigned to `itemId`.
180
+ */
181
+ declare function deterministicSplit(itemId: string, seed?: string): RunSplitTag;
182
+
183
+ /**
184
+ * Synthetic routing dataset. 16 tasks across 4 categories. Used as a
185
+ * deterministic, dependency-free benchmark for any router that maps a
186
+ * natural-language request to one of a fixed set of route labels.
187
+ *
188
+ * Format (see `routing/README.md` for prose):
189
+ *
190
+ * {
191
+ * id: stable per-task ID (matches across processes).
192
+ * category: one of the four categories (`file`, `math`, `search`, `chat`).
193
+ * prompt: the user-facing request the router must classify.
194
+ * route: the ground-truth route the router should pick.
195
+ * synonyms: other strings that count as a correct answer.
196
+ * hardNegatives: close-but-wrong route labels — used to detect the
197
+ * "always picks the popular route" failure mode.
198
+ * }
199
+ *
200
+ * The four categories are intentionally cross-domain (file ops,
201
+ * math, search, conversation) so a router that collapses to one
202
+ * category is easy to spot.
203
+ */
204
+ interface RoutingItem {
205
+ id: string;
206
+ category: 'file' | 'math' | 'search' | 'chat';
207
+ prompt: string;
208
+ /** Canonical correct route label. */
209
+ route: string;
210
+ /** Alternate route labels that also count as correct. */
211
+ synonyms: string[];
212
+ /** Wrong-but-tempting route labels (for analysis, not grading). */
213
+ hardNegatives: string[];
214
+ }
215
+ declare const ROUTING_DATASET: RoutingItem[]; // The routing items themselves; the header comment describes them as 16 tasks across 4 categories.
216
+
217
+ /**
218
+ * Routing benchmark — synthetic, dependency-free, ships in the
219
+ * package. 16 cross-category items in `dataset.ts`. See
220
+ * `routing/README.md` for the format.
221
+ *
222
+ * `evaluate` does case-insensitive exact match against the canonical
223
+ * route plus declared synonyms. The first valid route token in the
224
+ * response wins; everything else is ignored. Wrong answers also
225
+ * report whether they hit a hard negative — useful when triaging
226
+ * "always picks the popular route" failure modes.
227
+ */
228
+
229
+ type RoutingPayload = RoutingItem; // Payload carried by each routing dataset item; a plain alias of `RoutingItem`.
230
+ type RoutingDatasetItem = BenchmarkDatasetItem<RoutingPayload>; // Generic dataset item specialised to the routing payload.
231
+ declare class RoutingAdapter implements BenchmarkAdapter<RoutingDatasetItem, RoutingPayload> { // `BenchmarkAdapter` implementation for the routing benchmark.
232
+ loadDataset(split: RunSplitTag): Promise<RoutingDatasetItem[]>; // Resolves to the routing items for `split`.
233
+ evaluate(item: RoutingDatasetItem, response: string): Promise<BenchmarkEvaluation>; // Scores one `response` against `item`.
234
+ assignSplit(itemId: string): RunSplitTag; // Maps an item id to its split tag.
235
+ }
236
+ /**
237
+ * Pull route-shaped tokens out of a model response. Routes look like
238
+ * `category.action` (`fs.write`, `chat.reply`). Bare alphanumerics
239
+ * are not routes, but `category.action` patterns are robust to most
240
+ * model wrappers (JSON output, prose explanations, code fences).
+ *
+ * @param response - Raw model response text to scan.
+ * @returns The route-shaped tokens found in `response`.
241
+ */
242
+ declare function extractRouteTokens(response: string): string[];
243
+ declare const loadDataset: (split: RunSplitTag) => Promise<RoutingDatasetItem[]>; // Same signature as `RoutingAdapter.loadDataset`. NOTE(review): presumably delegates to a shared adapter instance — confirm.
244
+ declare const evaluate: (item: RoutingDatasetItem, response: string) => Promise<BenchmarkEvaluation>; // Same signature as `RoutingAdapter.evaluate`.
245
+ declare const assignSplit: (itemId: string) => RunSplitTag; // Same signature as `RoutingAdapter.assignSplit`.
246
+
247
+ declare const index$1_ROUTING_DATASET: typeof ROUTING_DATASET; // NOTE(review): the `index$1_*` aliases look bundler-generated; they only feed the namespace export below.
248
+ type index$1_RoutingAdapter = RoutingAdapter;
249
+ declare const index$1_RoutingAdapter: typeof RoutingAdapter;
250
+ type index$1_RoutingDatasetItem = RoutingDatasetItem;
251
+ type index$1_RoutingItem = RoutingItem;
252
+ type index$1_RoutingPayload = RoutingPayload;
253
+ declare const index$1_assignSplit: typeof assignSplit;
254
+ declare const index$1_evaluate: typeof evaluate;
255
+ declare const index$1_extractRouteTokens: typeof extractRouteTokens;
256
+ declare const index$1_loadDataset: typeof loadDataset;
257
+ declare namespace index$1 { // Groups the routing benchmark declarations under one namespace, re-exported under their original names.
258
+ export { index$1_ROUTING_DATASET as ROUTING_DATASET, index$1_RoutingAdapter as RoutingAdapter, type index$1_RoutingDatasetItem as RoutingDatasetItem, type index$1_RoutingItem as RoutingItem, type index$1_RoutingPayload as RoutingPayload, index$1_assignSplit as assignSplit, index$1_evaluate as evaluate, index$1_extractRouteTokens as extractRouteTokens, index$1_loadDataset as loadDataset };
259
+ }
260
+
261
+ /**
262
+ * Reference benchmark wrappers — entry point.
263
+ *
264
+ * Core surface (exported here):
265
+ * - The `BenchmarkAdapter` contract.
266
+ * - `deterministicSplit` + `BENCHMARK_SPLIT_SEED` for split assignment.
267
+ * - `routing` — synthetic 16-task router benchmark. The only novel
268
+ * benchmark we built; ships in the package.
269
+ *
270
+ * Example wrappers (under `examples/benchmarks/`, NOT in the bundle):
271
+ * - `gsm8k` — exact-match math reasoning (HF mirror, dataset
272
+ * not bundled).
273
+ * - `swebench-lite` — 30-instance SWE-Bench subset (stub; needs an
274
+ * external grader).
275
+ *
276
+ * The example wrappers are reference implementations of `BenchmarkAdapter`.
277
+ * Read them, copy them, adapt them. They're intentionally not in the main
278
+ * entry — every team will configure them differently.
279
+ */
280
+
281
+ declare const index_BENCHMARK_SPLIT_SEED: typeof BENCHMARK_SPLIT_SEED; // NOTE(review): the `index_*` aliases look bundler-generated; they only feed the namespace export below.
282
+ type index_BenchmarkAdapter<_TItem = unknown, TPayload = unknown> = BenchmarkAdapter<_TItem, TPayload>;
283
+ type index_BenchmarkDatasetItem<TPayload = unknown> = BenchmarkDatasetItem<TPayload>;
284
+ type index_BenchmarkEvaluation = BenchmarkEvaluation;
285
+ declare const index_deterministicSplit: typeof deterministicSplit;
286
+ declare namespace index { // Benchmarks entry namespace: the adapter contract types, the split helpers, and the routing namespace exposed as `routing`.
287
+ export { index_BENCHMARK_SPLIT_SEED as BENCHMARK_SPLIT_SEED, type index_BenchmarkAdapter as BenchmarkAdapter, type index_BenchmarkDatasetItem as BenchmarkDatasetItem, type index_BenchmarkEvaluation as BenchmarkEvaluation, index_deterministicSplit as deterministicSplit, index$1 as routing };
288
+ }
289
+
290
+ export { BENCHMARK_SPLIT_SEED as B, type RunRecord as R, type RunSplitTag as a, type BenchmarkAdapter as b, type BenchmarkDatasetItem as c, type BenchmarkEvaluation as d, type RunJudgeMetadata as e, type RunOutcome as f, RunRecordValidationError as g, type RunTokenUsage as h, deterministicSplit as i, index as j, isRunRecord as k, index$1 as l, parseRunRecordSafe as p, roundTripRunRecord as r, validateRunRecord as v }; // NOTE(review): the short export aliases look bundler-generated; presumably consumed by sibling entry files rather than imported by hand — confirm.