npm - @tangle-network/agent-eval - Versions diffs - 0.20.7 → 0.20.9 - Mend

@tangle-network/agent-eval 0.20.7 → 0.20.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21)

package/LICENSE +21 -0
package/README.md +9 -6
package/dist/benchmarks/index.d.ts +1 -0
package/dist/benchmarks/index.js +12 -0
package/dist/benchmarks/index.js.map +1 -0
package/dist/chunk-XDGJUIV2.js +219 -0
package/dist/chunk-XDGJUIV2.js.map +1 -0
package/dist/index-CEWY1rmu.d.ts +290 -0
package/dist/index.d.ts +61 -298
package/dist/index.js +139 -248
package/dist/index.js.map +1 -1
package/dist/openapi.json +477 -0
package/docs/concepts.md +4 -4
package/docs/knowledge-readiness.md +2 -2
package/docs/wire-protocol.md +3 -3
package/package.json +14 -7
package/examples/benchmarks/README.md +0 -44
package/examples/benchmarks/gsm8k/index.ts +0 -126
package/examples/benchmarks/swebench-lite/index.ts +0 -178
package/examples/multi-shot-optimization/index.ts +0 -114
package/examples/same-sandbox-harness/index.ts +0 -63

package/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Tangle Network
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

package/README.md CHANGED Viewed

@@ -55,9 +55,9 @@ Package responsibilities:
   optimization, reporting.
 - Product app: domain state, tools, credentials, UI, storage, deployment, model
   gateway.
-- `agent-runtime`: production agent-loop/session runtime.
-- `agent-knowledge`: evidence stores, claim/page synthesis, retrieval, knowledge
-  readiness implementation.
+- `@tangle-network/agent-runtime`: production agent-loop/session runtime.
+- `@tangle-network/agent-knowledge`: evidence stores, claim/page synthesis,
+  retrieval, knowledge readiness implementation.
 ## Install
@@ -72,10 +72,12 @@ npm i -g @tangle-network/agent-eval
 agent-eval serve --port 5005
 ```
-Python client:
+Python client source lives in `clients/python`. Until the PyPI package is
+published, install it from the repo:
 ```sh
-pip install tangle-agent-eval
+cd clients/python
+pip install -e .
 ```
 ## Core Primitives
@@ -98,7 +100,8 @@ pip install tangle-agent-eval
 ## Examples
-Runnable examples live in [`examples/`](./examples):
+Runnable examples live in the repository's [`examples/`](./examples)
+directory. They are not part of the published npm package.
 - [`examples/same-sandbox-harness`](./examples/same-sandbox-harness) - run
   multiple eval passes against the same workspace.

package/dist/benchmarks/index.d.ts ADDED Viewed

@@ -0,0 +1 @@
+export { B as BENCHMARK_SPLIT_SEED, b as BenchmarkAdapter, c as BenchmarkDatasetItem, d as BenchmarkEvaluation, i as deterministicSplit, l as routing } from '../index-CEWY1rmu.js';

package/dist/benchmarks/index.js ADDED Viewed

@@ -0,0 +1,12 @@
+import {
+  BENCHMARK_SPLIT_SEED,
+  deterministicSplit,
+  routing_exports
+} from "../chunk-XDGJUIV2.js";
+import "../chunk-PZ5AY32C.js";
+export {
+  BENCHMARK_SPLIT_SEED,
+  deterministicSplit,
+  routing_exports as routing
+};
+//# sourceMappingURL=index.js.map

package/dist/benchmarks/index.js.map ADDED Viewed

@@ -0,0 +1 @@
+{"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}

package/dist/chunk-XDGJUIV2.js ADDED Viewed

@@ -0,0 +1,219 @@
+import {
+  __export
+} from "./chunk-PZ5AY32C.js";
+// src/benchmarks/index.ts
+var benchmarks_exports = {};
+__export(benchmarks_exports, {
+  BENCHMARK_SPLIT_SEED: () => BENCHMARK_SPLIT_SEED,
+  deterministicSplit: () => deterministicSplit,
+  routing: () => routing_exports
+});
+// src/benchmarks/types.ts
+function fnv1a32(input) {
+  let h = 2166136261;
+  for (let i = 0; i < input.length; i++) {
+    h ^= input.charCodeAt(i) & 255;
+    h = h + ((h << 1) + (h << 4) + (h << 7) + (h << 8) + (h << 24)) >>> 0;
+  }
+  return h >>> 0;
+}
+var BENCHMARK_SPLIT_SEED = "agent-eval-v1";
+function deterministicSplit(itemId, seed = BENCHMARK_SPLIT_SEED) {
+  const h = fnv1a32(`${seed}::${itemId}`);
+  const pos = h / 4294967296;
+  if (pos < 0.6) return "search";
+  if (pos < 0.8) return "dev";
+  return "holdout";
+}
+// src/benchmarks/routing/index.ts
+var routing_exports = {};
+__export(routing_exports, {
+  ROUTING_DATASET: () => ROUTING_DATASET,
+  RoutingAdapter: () => RoutingAdapter,
+  assignSplit: () => assignSplit,
+  evaluate: () => evaluate,
+  extractRouteTokens: () => extractRouteTokens,
+  loadDataset: () => loadDataset
+});
+// src/benchmarks/routing/dataset.ts
+var ROUTING_DATASET = [
+  {
+    id: "file_001",
+    category: "file",
+    prompt: "Save the meeting notes to /tmp/notes-2025-04.md as markdown.",
+    route: "fs.write",
+    synonyms: ["filesystem.write", "write_file"],
+    hardNegatives: ["fs.read", "chat.reply"]
+  },
+  {
+    id: "file_002",
+    category: "file",
+    prompt: "Read the contents of /etc/hosts and summarize the entries.",
+    route: "fs.read",
+    synonyms: ["filesystem.read", "read_file"],
+    hardNegatives: ["fs.write", "search.web"]
+  },
+  {
+    id: "file_003",
+    category: "file",
+    prompt: "List every Python file under src/ recursively.",
+    route: "fs.list",
+    synonyms: ["filesystem.list", "list_files"],
+    hardNegatives: ["fs.read", "search.code"]
+  },
+  {
+    id: "file_004",
+    category: "file",
+    prompt: "Delete the cached build at .turbo/cache.",
+    route: "fs.delete",
+    synonyms: ["filesystem.delete", "remove_file"],
+    hardNegatives: ["fs.write", "fs.list"]
+  },
+  {
+    id: "math_001",
+    category: "math",
+    prompt: "What is the integral of 3x^2 + 2x from 0 to 5?",
+    route: "math.integral",
+    synonyms: ["calculator.integral", "math.solve"],
+    hardNegatives: ["math.derivative", "chat.reply"]
+  },
+  {
+    id: "math_002",
+    category: "math",
+    prompt: "Compute the derivative of sin(x) * cos(x).",
+    route: "math.derivative",
+    synonyms: ["calculator.derivative", "math.solve"],
+    hardNegatives: ["math.integral", "math.algebra"]
+  },
+  {
+    id: "math_003",
+    category: "math",
+    prompt: "Solve 2x + 7 = 19 for x.",
+    route: "math.algebra",
+    synonyms: ["calculator.algebra", "math.solve"],
+    hardNegatives: ["math.derivative", "math.integral"]
+  },
+  {
+    id: "math_004",
+    category: "math",
+    prompt: "What is the prime factorization of 360?",
+    route: "math.numbertheory",
+    synonyms: ["calculator.factor", "math.solve"],
+    hardNegatives: ["math.algebra", "search.web"]
+  },
+  {
+    id: "search_001",
+    category: "search",
+    prompt: "Find recent papers on agent prompt optimization with held-out promotion gates.",
+    route: "search.web",
+    synonyms: ["web.search", "search.papers"],
+    hardNegatives: ["search.code", "chat.reply"]
+  },
+  {
+    id: "search_002",
+    category: "search",
+    prompt: "Search the codebase for every call site of `runProposeReview`.",
+    route: "search.code",
+    synonyms: ["code.search", "grep"],
+    hardNegatives: ["search.web", "fs.read"]
+  },
+  {
+    id: "search_003",
+    category: "search",
+    prompt: "What is the latest release of the Tangle network on GitHub?",
+    route: "search.web",
+    synonyms: ["web.search", "github.releases"],
+    hardNegatives: ["search.code", "chat.reply"]
+  },
+  {
+    id: "search_004",
+    category: "search",
+    prompt: "Find all TODO comments in the agent-eval src tree.",
+    route: "search.code",
+    synonyms: ["code.search", "grep"],
+    hardNegatives: ["search.web", "fs.list"]
+  },
+  {
+    id: "chat_001",
+    category: "chat",
+    prompt: "Hi there, how are you doing today?",
+    route: "chat.reply",
+    synonyms: ["conversation.reply"],
+    hardNegatives: ["search.web", "fs.read"]
+  },
+  {
+    id: "chat_002",
+    category: "chat",
+    prompt: "Please explain the difference between an LLM and a foundation model.",
+    route: "chat.reply",
+    synonyms: ["conversation.reply", "qa.answer"],
+    hardNegatives: ["search.web", "math.algebra"]
+  },
+  {
+    id: "chat_003",
+    category: "chat",
+    prompt: "Tell me a short joke about distributed systems.",
+    route: "chat.reply",
+    synonyms: ["conversation.reply"],
+    hardNegatives: ["search.web", "fs.read"]
+  },
+  {
+    id: "chat_004",
+    category: "chat",
+    prompt: "Acknowledge my last message with a thumbs up.",
+    route: "chat.reply",
+    synonyms: ["conversation.reply", "react"],
+    hardNegatives: ["fs.write", "search.web"]
+  }
+];
+// src/benchmarks/routing/index.ts
+var RoutingAdapter = class {
+  async loadDataset(split) {
+    return ROUTING_DATASET.map((item) => ({ id: item.id, payload: item })).filter((it) => assignSplitImpl(it.id) === split);
+  }
+  async evaluate(item, response) {
+    const tokens = extractRouteTokens(response);
+    const correct = new Set([item.payload.route, ...item.payload.synonyms].map((s) => s.toLowerCase()));
+    const hardNeg = new Set(item.payload.hardNegatives.map((s) => s.toLowerCase()));
+    const firstMatch = tokens.find((t) => correct.has(t.toLowerCase())) ?? null;
+    const firstHardNeg = tokens.find((t) => hardNeg.has(t.toLowerCase())) ?? null;
+    const score = firstMatch ? 1 : 0;
+    return {
+      score,
+      raw: {
+        firstToken: tokens[0] ?? null,
+        matchedRoute: firstMatch,
+        hitHardNegative: Boolean(firstHardNeg),
+        hardNegativeRoute: firstHardNeg,
+        category: item.payload.category
+      }
+    };
+  }
+  assignSplit(itemId) {
+    return assignSplitImpl(itemId);
+  }
+};
+function assignSplitImpl(itemId) {
+  return deterministicSplit(`routing::${itemId}`);
+}
+function extractRouteTokens(response) {
+  const matches = response.match(/[a-z][a-z0-9_]*\.[a-z][a-z0-9_]*/gi);
+  return matches ?? [];
+}
+var adapter = new RoutingAdapter();
+var loadDataset = adapter.loadDataset.bind(adapter);
+var evaluate = adapter.evaluate.bind(adapter);
+var assignSplit = adapter.assignSplit.bind(adapter);
+export {
+  BENCHMARK_SPLIT_SEED,
+  deterministicSplit,
+  routing_exports,
+  benchmarks_exports
+};
+//# sourceMappingURL=chunk-XDGJUIV2.js.map

package/dist/chunk-XDGJUIV2.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"sources":["../src/benchmarks/index.ts","../src/benchmarks/types.ts","../src/benchmarks/routing/index.ts","../src/benchmarks/routing/dataset.ts"],"sourcesContent":["/**\n * Reference benchmark wrappers — entry point.\n *\n * Core surface (exported here):\n * - The `BenchmarkAdapter` contract.\n * - `deterministicSplit` + `BENCHMARK_SPLIT_SEED` for split assignment.\n * - `routing` — synthetic 16-task router benchmark. The only novel\n * benchmark we built; ships in the package.\n *\n * Example wrappers (under `examples/benchmarks/`, NOT in the bundle):\n * - `gsm8k` — exact-match math reasoning (HF mirror, dataset\n * not bundled).\n * - `swebench-lite` — 30-instance SWE-Bench subset (stub; needs an\n * external grader).\n *\n * The example wrappers are reference implementations of `BenchmarkAdapter`.\n * Read them, copy them, adapt them. They're intentionally not in the main\n * entry — every team will configure them differently.\n */\n\nexport type {\n BenchmarkAdapter,\n BenchmarkDatasetItem,\n BenchmarkEvaluation,\n} from './types'\nexport { deterministicSplit, BENCHMARK_SPLIT_SEED } from './types'\n\nexport * as routing from './routing/index'\n","/**\n * Shared types for the reference benchmark wrappers under\n * `src/benchmarks/`. Each wrapper exports the three functions in\n * `BenchmarkAdapter` plus its own typed `DatasetItem` shape.\n */\n\nimport type { RunSplitTag } from '../run-record'\n\nexport interface BenchmarkDatasetItem<TPayload = unknown> {\n /** Stable dataset-local item id (used for split assignment + paper\n * references). Unique within a benchmark. */\n id: string\n /** Free-form payload. Each benchmark defines its own shape. */\n payload: TPayload\n}\n\nexport interface BenchmarkEvaluation {\n /** [0, 1] score for the response on this item. Exact-match\n * benchmarks use 0/1; partial-credit benchmarks may return\n * fractional values. */\n score: number\n /** Optional bag of raw scoring signals — e.g. 
parsed numeric\n * answer, regex match, judge sub-scores. */\n raw: Record<string, unknown>\n}\n\n/** Common signature implemented by every adapter under `src/benchmarks/*`. */\n// `TPayload` is the per-item payload type; `_TItem` is preserved for\n// downstream type-narrowing extensions (a richer `BenchmarkDatasetItem`\n// subclass that adds e.g. provenance metadata) but is intentionally\n// unused here. `noUnusedLocals` requires the leading underscore.\nexport interface BenchmarkAdapter<_TItem = unknown, TPayload = unknown> {\n /** Load the dataset for the given split. May hit the network on\n * first call but should be cache-friendly. Adapters that don't\n * ship the dataset itself MUST throw a clearly-marked error\n * pointing the caller at the loader script. */\n loadDataset(split: RunSplitTag): Promise<BenchmarkDatasetItem<TPayload>[]>\n /** Score a single response. Pure with respect to the inputs. */\n evaluate(item: BenchmarkDatasetItem<TPayload>, response: string): Promise<BenchmarkEvaluation>\n /** Deterministic split assignment via item id hashing. The\n * fraction of items in each split is implementation-defined but\n * MUST be stable across processes and platforms. */\n assignSplit(itemId: string): RunSplitTag\n}\n\n// ── Deterministic split assignment ───────────────────────────────────\n\n/**\n * 32-bit FNV-1a hash. Stable, allocation-free, deterministic across\n * runtimes. We use it to assign items to splits rather than depending\n * on a polyfilled crypto.subtle path.\n */\nfunction fnv1a32(input: string): number {\n let h = 0x811c9dc5\n for (let i = 0; i < input.length; i++) {\n h ^= input.charCodeAt(i) & 0xff\n h = (h + ((h << 1) + (h << 4) + (h << 7) + (h << 8) + (h << 24))) >>> 0\n }\n return h >>> 0\n}\n\n/** Split-assignment seed shared across all benchmarks. Bumping this\n * value reshuffles every split — do NOT do that lightly. 
*/\nexport const BENCHMARK_SPLIT_SEED = 'agent-eval-v1'\n\n/**\n * Assign an item id to one of `'search' | 'dev' | 'holdout'` using a\n * stable 32-bit hash of `${seed}::${id}`. Default proportions:\n *\n * search: 60% (optimization-readable)\n * dev: 20% (held-out for tuning, leak-on-purpose during dev)\n * holdout:20% (paper-grade held-out, gated reads)\n */\nexport function deterministicSplit(\n itemId: string,\n seed: string = BENCHMARK_SPLIT_SEED,\n): RunSplitTag {\n const h = fnv1a32(`${seed}::${itemId}`)\n const pos = h / 0x100000000\n if (pos < 0.6) return 'search'\n if (pos < 0.8) return 'dev'\n return 'holdout'\n}\n","/**\n * Routing benchmark — synthetic, dependency-free, ships in the\n * package. 16 cross-category items in `dataset.ts`. See\n * `routing/README.md` for the format.\n *\n * `evaluate` does case-insensitive exact match against the canonical\n * route plus declared synonyms. The first valid route token in the\n * response wins; everything else is ignored. Wrong answers also\n * report whether they hit a hard negative — useful when triaging\n * \"always picks the popular route\" failure modes.\n */\n\nimport type {\n BenchmarkAdapter,\n BenchmarkDatasetItem,\n BenchmarkEvaluation,\n} from '../types'\nimport { deterministicSplit } from '../types'\nimport type { RunSplitTag } from '../../run-record'\nimport { ROUTING_DATASET, type RoutingItem } from './dataset'\n\nexport type { RoutingItem }\nexport type RoutingPayload = RoutingItem\nexport type RoutingDatasetItem = BenchmarkDatasetItem<RoutingPayload>\n\nclass RoutingAdapter\n implements BenchmarkAdapter<RoutingDatasetItem, RoutingPayload>\n{\n async loadDataset(split: RunSplitTag): Promise<RoutingDatasetItem[]> {\n return ROUTING_DATASET\n .map((item) => ({ id: item.id, payload: item }))\n .filter((it) => assignSplitImpl(it.id) === split)\n }\n\n async evaluate(\n item: RoutingDatasetItem,\n response: string,\n ): Promise<BenchmarkEvaluation> {\n const tokens = extractRouteTokens(response)\n 
const correct = new Set<string>([item.payload.route, ...item.payload.synonyms].map((s) => s.toLowerCase()))\n const hardNeg = new Set<string>(item.payload.hardNegatives.map((s) => s.toLowerCase()))\n const firstMatch = tokens.find((t) => correct.has(t.toLowerCase())) ?? null\n const firstHardNeg = tokens.find((t) => hardNeg.has(t.toLowerCase())) ?? null\n const score = firstMatch ? 1 : 0\n return {\n score,\n raw: {\n firstToken: tokens[0] ?? null,\n matchedRoute: firstMatch,\n hitHardNegative: Boolean(firstHardNeg),\n hardNegativeRoute: firstHardNeg,\n category: item.payload.category,\n },\n }\n }\n\n assignSplit(itemId: string): RunSplitTag {\n return assignSplitImpl(itemId)\n }\n}\n\nfunction assignSplitImpl(itemId: string): RunSplitTag {\n return deterministicSplit(`routing::${itemId}`)\n}\n\n/**\n * Pull route-shaped tokens out of a model response. Routes look like\n * `category.action` (`fs.write`, `chat.reply`). Bare alphanumerics\n * are not routes, but `category.action` patterns are robust to most\n * model wrappers (JSON output, prose explanations, code fences).\n */\nexport function extractRouteTokens(response: string): string[] {\n const matches = response.match(/[a-z][a-z0-9_]*\\.[a-z][a-z0-9_]*/gi)\n return matches ?? []\n}\n\nconst adapter = new RoutingAdapter()\n\nexport const loadDataset = adapter.loadDataset.bind(adapter)\nexport const evaluate = adapter.evaluate.bind(adapter)\nexport const assignSplit = adapter.assignSplit.bind(adapter)\nexport { RoutingAdapter, ROUTING_DATASET }\n","/**\n * Synthetic routing dataset. 16 tasks across 4 categories. 
Used as a\n * deterministic, dependency-free benchmark for any router that maps a\n * natural-language request to one of a fixed set of route labels.\n *\n * Format (see `routing/README.md` for prose):\n *\n * {\n * id: stable per-task ID (matches across processes).\n * category: one of the four route labels.\n * prompt: the user-facing request the router must classify.\n * route: the ground-truth route the router should pick.\n * synonyms: other strings that count as a correct answer.\n * hardNegatives:close-but-wrong route labels — used to detect the\n * \"always picks the popular route\" failure mode.\n * }\n *\n * The four categories are intentionally cross-domain (file ops,\n * math, search, conversation) so a router that collapses to one\n * category is easy to spot.\n */\n\nexport interface RoutingItem {\n id: string\n category: 'file' | 'math' | 'search' | 'chat'\n prompt: string\n /** Canonical correct route label. */\n route: string\n /** Alternate route labels that also count as correct. */\n synonyms: string[]\n /** Wrong-but-tempting route labels (for analysis, not grading). 
*/\n hardNegatives: string[]\n}\n\nexport const ROUTING_DATASET: RoutingItem[] = [\n {\n id: 'file_001',\n category: 'file',\n prompt: 'Save the meeting notes to /tmp/notes-2025-04.md as markdown.',\n route: 'fs.write',\n synonyms: ['filesystem.write', 'write_file'],\n hardNegatives: ['fs.read', 'chat.reply'],\n },\n {\n id: 'file_002',\n category: 'file',\n prompt: 'Read the contents of /etc/hosts and summarize the entries.',\n route: 'fs.read',\n synonyms: ['filesystem.read', 'read_file'],\n hardNegatives: ['fs.write', 'search.web'],\n },\n {\n id: 'file_003',\n category: 'file',\n prompt: 'List every Python file under src/ recursively.',\n route: 'fs.list',\n synonyms: ['filesystem.list', 'list_files'],\n hardNegatives: ['fs.read', 'search.code'],\n },\n {\n id: 'file_004',\n category: 'file',\n prompt: 'Delete the cached build at .turbo/cache.',\n route: 'fs.delete',\n synonyms: ['filesystem.delete', 'remove_file'],\n hardNegatives: ['fs.write', 'fs.list'],\n },\n {\n id: 'math_001',\n category: 'math',\n prompt: 'What is the integral of 3x^2 + 2x from 0 to 5?',\n route: 'math.integral',\n synonyms: ['calculator.integral', 'math.solve'],\n hardNegatives: ['math.derivative', 'chat.reply'],\n },\n {\n id: 'math_002',\n category: 'math',\n prompt: 'Compute the derivative of sin(x) * cos(x).',\n route: 'math.derivative',\n synonyms: ['calculator.derivative', 'math.solve'],\n hardNegatives: ['math.integral', 'math.algebra'],\n },\n {\n id: 'math_003',\n category: 'math',\n prompt: 'Solve 2x + 7 = 19 for x.',\n route: 'math.algebra',\n synonyms: ['calculator.algebra', 'math.solve'],\n hardNegatives: ['math.derivative', 'math.integral'],\n },\n {\n id: 'math_004',\n category: 'math',\n prompt: 'What is the prime factorization of 360?',\n route: 'math.numbertheory',\n synonyms: ['calculator.factor', 'math.solve'],\n hardNegatives: ['math.algebra', 'search.web'],\n },\n {\n id: 'search_001',\n category: 'search',\n prompt: 'Find recent papers on agent prompt 
optimization with held-out promotion gates.',\n route: 'search.web',\n synonyms: ['web.search', 'search.papers'],\n hardNegatives: ['search.code', 'chat.reply'],\n },\n {\n id: 'search_002',\n category: 'search',\n prompt: 'Search the codebase for every call site of `runProposeReview`.',\n route: 'search.code',\n synonyms: ['code.search', 'grep'],\n hardNegatives: ['search.web', 'fs.read'],\n },\n {\n id: 'search_003',\n category: 'search',\n prompt: 'What is the latest release of the Tangle network on GitHub?',\n route: 'search.web',\n synonyms: ['web.search', 'github.releases'],\n hardNegatives: ['search.code', 'chat.reply'],\n },\n {\n id: 'search_004',\n category: 'search',\n prompt: 'Find all TODO comments in the agent-eval src tree.',\n route: 'search.code',\n synonyms: ['code.search', 'grep'],\n hardNegatives: ['search.web', 'fs.list'],\n },\n {\n id: 'chat_001',\n category: 'chat',\n prompt: 'Hi there, how are you doing today?',\n route: 'chat.reply',\n synonyms: ['conversation.reply'],\n hardNegatives: ['search.web', 'fs.read'],\n },\n {\n id: 'chat_002',\n category: 'chat',\n prompt: 'Please explain the difference between an LLM and a foundation model.',\n route: 'chat.reply',\n synonyms: ['conversation.reply', 'qa.answer'],\n hardNegatives: ['search.web', 'math.algebra'],\n },\n {\n id: 'chat_003',\n category: 'chat',\n prompt: 'Tell me a short joke about distributed systems.',\n route: 'chat.reply',\n synonyms: ['conversation.reply'],\n hardNegatives: ['search.web', 'fs.read'],\n },\n {\n id: 'chat_004',\n category: 'chat',\n prompt: 'Acknowledge my last message with a thumbs up.',\n route: 'chat.reply',\n synonyms: ['conversation.reply', 'react'],\n hardNegatives: ['fs.write', 'search.web'],\n 
},\n]\n"],"mappings":";;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACoDA,SAAS,QAAQ,OAAuB;AACtC,MAAI,IAAI;AACR,WAAS,IAAI,GAAG,IAAI,MAAM,QAAQ,KAAK;AACrC,SAAK,MAAM,WAAW,CAAC,IAAI;AAC3B,QAAK,MAAM,KAAK,MAAM,KAAK,MAAM,KAAK,MAAM,KAAK,MAAM,KAAK,SAAU;AAAA,EACxE;AACA,SAAO,MAAM;AACf;AAIO,IAAM,uBAAuB;AAU7B,SAAS,mBACd,QACA,OAAe,sBACF;AACb,QAAM,IAAI,QAAQ,GAAG,IAAI,KAAK,MAAM,EAAE;AACtC,QAAM,MAAM,IAAI;AAChB,MAAI,MAAM,IAAK,QAAO;AACtB,MAAI,MAAM,IAAK,QAAO;AACtB,SAAO;AACT;;;AClFA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACkCO,IAAM,kBAAiC;AAAA,EAC5C;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,oBAAoB,YAAY;AAAA,IAC3C,eAAe,CAAC,WAAW,YAAY;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,mBAAmB,WAAW;AAAA,IACzC,eAAe,CAAC,YAAY,YAAY;AAAA,EAC1C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,mBAAmB,YAAY;AAAA,IAC1C,eAAe,CAAC,WAAW,aAAa;AAAA,EAC1C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,qBAAqB,aAAa;AAAA,IAC7C,eAAe,CAAC,YAAY,SAAS;AAAA,EACvC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,uBAAuB,YAAY;AAAA,IAC9C,eAAe,CAAC,mBAAmB,YAAY;AAAA,EACjD;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,yBAAyB,YAAY;AAAA,IAChD,eAAe,CAAC,iBAAiB,cAAc;AAAA,EACjD;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,sBAAsB,YAAY;AAAA,IAC7C,eAAe,CAAC,mBAAmB,eAAe;AAAA,EACpD;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,qBAAqB,YAAY;AAAA,IAC5C,eAAe,CAAC,gBAAgB,YAAY;AAAA,EAC9C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,cAAc,eAAe;AAAA,IACxC,eAAe,CAAC,eAAe,YAAY;AAAA,EAC7C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,eAAe,MAAM;AAAA,IAChC,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAA
U;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,cAAc,iBAAiB;AAAA,IAC1C,eAAe,CAAC,eAAe,YAAY;AAAA,EAC7C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,eAAe,MAAM;AAAA,IAChC,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,oBAAoB;AAAA,IAC/B,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,sBAAsB,WAAW;AAAA,IAC5C,eAAe,CAAC,cAAc,cAAc;AAAA,EAC9C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,oBAAoB;AAAA,IAC/B,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,sBAAsB,OAAO;AAAA,IACxC,eAAe,CAAC,YAAY,YAAY;AAAA,EAC1C;AACF;;;AD1IA,IAAM,iBAAN,MAEA;AAAA,EACE,MAAM,YAAY,OAAmD;AACnE,WAAO,gBACJ,IAAI,CAAC,UAAU,EAAE,IAAI,KAAK,IAAI,SAAS,KAAK,EAAE,EAC9C,OAAO,CAAC,OAAO,gBAAgB,GAAG,EAAE,MAAM,KAAK;AAAA,EACpD;AAAA,EAEA,MAAM,SACJ,MACA,UAC8B;AAC9B,UAAM,SAAS,mBAAmB,QAAQ;AAC1C,UAAM,UAAU,IAAI,IAAY,CAAC,KAAK,QAAQ,OAAO,GAAG,KAAK,QAAQ,QAAQ,EAAE,IAAI,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;AAC1G,UAAM,UAAU,IAAI,IAAY,KAAK,QAAQ,cAAc,IAAI,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;AACtF,UAAM,aAAa,OAAO,KAAK,CAAC,MAAM,QAAQ,IAAI,EAAE,YAAY,CAAC,CAAC,KAAK;AACvE,UAAM,eAAe,OAAO,KAAK,CAAC,MAAM,QAAQ,IAAI,EAAE,YAAY,CAAC,CAAC,KAAK;AACzE,UAAM,QAAQ,aAAa,IAAI;AAC/B,WAAO;AAAA,MACL;AAAA,MACA,KAAK;AAAA,QACH,YAAY,OAAO,CAAC,KAAK;AAAA,QACzB,cAAc;AAAA,QACd,iBAAiB,QAAQ,YAAY;AAAA,QACrC,mBAAmB;AAAA,QACnB,UAAU,KAAK,QAAQ;AAAA,MACzB;AAAA,IACF;AAAA,EACF;AAAA,EAEA,YAAY,QAA6B;AACvC,WAAO,gBAAgB,MAAM;AAAA,EAC/B;AACF;AAEA,SAAS,gBAAgB,QAA6B;AACpD,SAAO,mBAAmB,YAAY,MAAM,EAAE;AAChD;AAQO,SAAS,mBAAmB,UAA4B;AAC7D,QAAM,UAAU,SAAS,MAAM,oCAAoC;AACnE,SAAO,WAAW,CAAC;AACrB;AAEA,IAAM,UAAU,IAAI,eAAe;AAE5B,IAAM,cAAc,QAAQ,YAAY,KAAK,OAAO;AACpD,IAAM,WAAW,QAAQ,SAAS,KAAK,OAAO;AAC9C,IAAM,cAAc,QAAQ,YAAY,KAAK,OAAO;","names":[]}

package/dist/index-CEWY1rmu.d.ts ADDED Viewed

@@ -0,0 +1,290 @@
+/**
+ * Paper-grade RunRecord schema + runtime validator.
+ *
+ * Every run that participates in a promotion gate, paper table, or
+ * researcher loop SHOULD be recorded as a `RunRecord`. The mandatory
+ * fields are exactly those the paper "Two Loops, Three Roles" requires
+ * for reproducibility: who/what/when/cost/seed/hash, plus the search vs
+ * holdout split tag and either a `searchScore` or a `holdoutScore`.
+ *
+ * This is intentionally NOT a replacement for the rich `Run` /
+ * `ProposeReviewReport` / `ScenarioResult` types already in the
+ * package. Those are runtime structures with full provenance. A
+ * `RunRecord` is the analysis-time projection — the JSON-friendly
+ * row you'd put in a parquet file or paste into a notebook.
+ *
+ * Validate at the boundary:
+ *
+ *   const rec = validateRunRecord(rawJson)         // throws on missing
+ *   const ok  = isRunRecord(rawJson)               // boolean check
+ *   const rec = parseRunRecordSafe(rawJson)        // { ok, value | error }
+ *
+ * The validator runs in pure TS — zod is intentionally NOT a
+ * dependency. Round-trip tested in `tests/run-record.test.ts`.
+ */
+/** Search/dev/holdout split tag. 'search' is the paper-grade alias for the
+ *  combined train+test pool that the optimizer is allowed to read. */
+type RunSplitTag = 'search' | 'dev' | 'holdout';
+interface RunTokenUsage {
+    input: number;
+    output: number;
+    cached?: number;
+}
+interface RunJudgeMetadata {
+    model: string;
+    promptVersion: string;
+    /** [0,1] confidence the judge declared. Constant judge confidence
+     *  across many runs is a fallback signal (see `canary.ts`). */
+    confidence: number;
+    /** True if the judge degraded to a fallback path (rules-only,
+     *  prior-call cache, etc.). The canary uses this to alert. */
+    fallback: boolean;
+}
+interface RunOutcome {
+    /** Score on the search/optimization split. Optional because a
+     *  holdout-only evaluation only fills `holdoutScore`. */
+    searchScore?: number;
+    /** Score on the held-out split. Optional because a search-only run
+     *  only fills `searchScore`. At least one must be present. */
+    holdoutScore?: number;
+    /** Bag of any other metric the run produced — judge dimensions,
+     *  pass/fail counters, latency stats, etc. Numeric only — keeps
+     *  reporters honest. */
+    raw: Record<string, number>;
+}
+/**
+ * Mandatory paper-grade fields for a single evaluation run. Fields
+ * declared optional (`queueMs`, `judgeMetadata`, `failureMode`) are
+ * extension points; every other field is mandatory, and
+ * `validateRunRecord` throws `RunRecordValidationError` when one is
+ * missing or wrongly typed.
+ *
+ * Hash discipline:
+ *   - `promptHash` is the sha256 of the EFFECTIVE prompt sent to the
+ *     model (after any steering bundle merge).
+ *   - `configHash` is the sha256 of the effective run config (model,
+ *     temperature, tools, judges, splits). The pair (promptHash,
+ *     configHash) uniquely identifies an experimental cell.
+ *
+ * Model snapshot discipline:
+ *   - `model` MUST encode a snapshot version. Bare aliases like
+ *     `claude-sonnet-4` or `gpt-4o` are banned — they remap silently.
+ *     Use `claude-sonnet-4-6@2025-04-15` or `gpt-4o-2024-11-20`.
+ */
+interface RunRecord {
+    /** UUID for the run. */
+    runId: string;
+    /** Logical experiment grouping (a treatment vs a baseline within
+     *  the same sweep should share `experimentId`). */
+    experimentId: string;
+    /** Stable identifier for the candidate (variant) being run. The
+     *  promotion gate compares two `candidateId`s on matched items. */
+    candidateId: string;
+    /** RNG seed for the run. Always recorded — silent re-seeding is
+     *  the most common cause of non-reproducible numbers. */
+    seed: number;
+    /** Model identifier WITH snapshot version (see "Model snapshot
+     *  discipline" in the interface comment). */
+    model: string;
+    /** sha256 of the effective prompt (post-steering). */
+    promptHash: string;
+    /** sha256 of the effective config. */
+    configHash: string;
+    /** Git SHA the harness was run from. */
+    commitSha: string;
+    /** End-to-end wall-clock duration in milliseconds. */
+    wallMs: number;
+    /** Time spent queued before execution started, if known.
+     *  NOTE(review): presumably milliseconds, matching `wallMs` —
+     *  confirm. */
+    queueMs?: number;
+    /** Total USD cost. Mandatory — runs without a cost number are
+     *  unbounded by definition and must not be admitted into the gate. */
+    costUsd: number;
+    /** Token usage breakdown. */
+    tokenUsage: RunTokenUsage;
+    /** Judge-side metadata, if a judge was used. */
+    judgeMetadata?: RunJudgeMetadata;
+    /** Per-split scores + raw bag. */
+    outcome: RunOutcome;
+    /** Categorical failure tag, when the run failed and the harness
+     *  classified it. Free-form string; standard tags live in
+     *  `failure-taxonomy.ts`. */
+    failureMode?: string;
+    /** Which split this run was drawn from. `deterministicSplit`
+     *  documents the tags as `'search' | 'dev' | 'holdout'`. */
+    splitTag: RunSplitTag;
+}
+declare class RunRecordValidationError extends Error { // Error type thrown by `validateRunRecord` and returned by `parseRunRecordSafe`.
+    readonly path: string; // NOTE(review): presumably the location of the offending field — confirm against the implementation.
+    constructor(message: string, path?: string); // `path` may be omitted by the caller.
+}
+/**
+ * Strict validator. Throws `RunRecordValidationError` on the first
+ * missing or wrongly-typed field. Returns the input cast to
+ * `RunRecord` on success — the validator does not coerce.
+ *
+ * @param input - Value to check; typed `unknown`, so callers do not
+ *   need to narrow it first.
+ * @returns The same `input`, typed as `RunRecord`.
+ * @throws RunRecordValidationError on the first invalid field.
+ */
+declare function validateRunRecord(input: unknown): RunRecord;
+/** Boolean validator — convenience for filtering arrays. Declared as
+ *  a type predicate, so a `true` result narrows `input` to
+ *  `RunRecord` at the call site. */
+declare function isRunRecord(input: unknown): input is RunRecord;
+/** Non-throwing validator — returns a discriminated union tagged by
+ *  `ok`: `{ ok: true, value }` carries the record, while
+ *  `{ ok: false, error }` carries the `RunRecordValidationError`. */
+declare function parseRunRecordSafe(input: unknown): {
+    ok: true;
+    value: RunRecord;
+} | {
+    ok: false;
+    error: RunRecordValidationError;
+};
+/** Round-trip helper — `JSON.parse(JSON.stringify(record))` then validate.
+ *  NOTE(review): presumably throws `RunRecordValidationError` when the
+ *  serialized form no longer validates — confirm. */
+declare function roundTripRunRecord(record: RunRecord): RunRecord;
+/**
+ * Shared types for the reference benchmark wrappers under
+ * `src/benchmarks/`. Each wrapper exports the three functions in
+ * `BenchmarkAdapter` plus its own typed `DatasetItem` shape.
+ *
+ * `BenchmarkDatasetItem` is the common envelope: a stable `id` plus a
+ * benchmark-specific `payload` typed by `TPayload` (default `unknown`).
+ */
+interface BenchmarkDatasetItem<TPayload = unknown> {
+    /** Stable dataset-local item id (used for split assignment + paper
+     *  references). Unique within a benchmark. */
+    id: string;
+    /** Free-form payload. Each benchmark defines its own shape. */
+    payload: TPayload;
+}
+interface BenchmarkEvaluation {
+    /** [0, 1] score for the response on this item. Exact-match
+     *  benchmarks use 0/1; partial-credit benchmarks may return
+     *  fractional values. */
+    score: number;
+    /** Bag of raw scoring signals — e.g. parsed numeric answer, regex
+     *  match, judge sub-scores. The property itself is required by
+     *  this type (it is not declared with `?`).
+     *  NOTE(review): presumably `{}` is acceptable when there is
+     *  nothing to report — confirm. */
+    raw: Record<string, unknown>;
+}
+/** Common signature implemented by every adapter under `src/benchmarks/*`.
+ *  `TPayload` types the item payload used by `loadDataset` and
+ *  `evaluate`; `_TItem` is not referenced by any member declared
+ *  here. */
+interface BenchmarkAdapter<_TItem = unknown, TPayload = unknown> {
+    /** Load the dataset for the given split. May hit the network on
+     *  first call but should be cache-friendly. Adapters that don't
+     *  ship the dataset itself MUST throw a clearly-marked error
+     *  pointing the caller at the loader script. */
+    loadDataset(split: RunSplitTag): Promise<BenchmarkDatasetItem<TPayload>[]>;
+    /** Score a single response. Pure with respect to the inputs. */
+    evaluate(item: BenchmarkDatasetItem<TPayload>, response: string): Promise<BenchmarkEvaluation>;
+    /** Deterministic split assignment via item id hashing. The
+     *  fraction of items in each split is implementation-defined but
+     *  MUST be stable across processes and platforms. */
+    assignSplit(itemId: string): RunSplitTag;
+}
+/** Split-assignment seed shared across all benchmarks. Bumping this
+ *  value reshuffles every split — do NOT do that lightly.
+ *  NOTE(review): presumably the default for the optional `seed`
+ *  argument of `deterministicSplit` — confirm. */
+declare const BENCHMARK_SPLIT_SEED = "agent-eval-v1";
+/**
+ * Assign an item id to one of `'search' | 'dev' | 'holdout'` using a
+ * stable 32-bit hash of `${seed}::${id}`. Default proportions:
+ *
+ *   search:  60%   (optimization-readable)
+ *   dev:     20%   (held-out for tuning, leak-on-purpose during dev)
+ *   holdout: 20%   (paper-grade held-out, gated reads)
+ *
+ * @param itemId - Dataset-local item id to hash.
+ * @param seed - Optional hash seed. NOTE(review): presumably defaults
+ *   to `BENCHMARK_SPLIT_SEED` — confirm.
+ * @returns The split tag assigned to `itemId`.
+ */
+declare function deterministicSplit(itemId: string, seed?: string): RunSplitTag;
+/**
+ * Synthetic routing dataset. 16 tasks across 4 categories. Used as a
+ * deterministic, dependency-free benchmark for any router that maps a
+ * natural-language request to one of a fixed set of route labels.
+ *
+ * Format (see `routing/README.md` for prose):
+ *
+ *   {
+ *     id:           stable per-task ID (matches across processes).
+ *     category:     one of the four task categories (`file`, `math`,
+ *                   `search`, `chat`) — distinct from the route label.
+ *     prompt:       the user-facing request the router must classify.
+ *     route:        the ground-truth route the router should pick.
+ *     synonyms:     other strings that count as a correct answer.
+ *     hardNegatives:close-but-wrong route labels — used to detect the
+ *                   "always picks the popular route" failure mode.
+ *   }
+ *
+ * The four categories are intentionally cross-domain (file ops,
+ * math, search, conversation) so a router that collapses to one
+ * category is easy to spot.
+ */
+interface RoutingItem {
+    /** Stable per-task id. */
+    id: string;
+    /** Task category; one of the four literals below. */
+    category: 'file' | 'math' | 'search' | 'chat';
+    /** User-facing request the router must classify. */
+    prompt: string;
+    /** Canonical correct route label. */
+    route: string;
+    /** Alternate route labels that also count as correct. */
+    synonyms: string[];
+    /** Wrong-but-tempting route labels (for analysis, not grading). */
+    hardNegatives: string[];
+}
+declare const ROUTING_DATASET: RoutingItem[]; // The synthetic routing task list, one `RoutingItem` per task.
+/**
+ * Routing benchmark — synthetic, dependency-free, ships in the
+ * package. 16 cross-category items in `dataset.ts`. See
+ * `routing/README.md` for the format.
+ *
+ * `evaluate` does case-insensitive exact match against the canonical
+ * route plus declared synonyms. The first valid route token in the
+ * response wins; everything else is ignored. Wrong answers also
+ * report whether they hit a hard negative — useful when triaging
+ * "always picks the popular route" failure modes.
+ *
+ * Types: `RoutingPayload` is an alias of `RoutingItem`, and
+ * `RoutingDatasetItem` wraps that payload in `BenchmarkDatasetItem`.
+ * `RoutingAdapter` declares the three `BenchmarkAdapter` members
+ * specialised to those types.
+ */
+type RoutingPayload = RoutingItem;
+type RoutingDatasetItem = BenchmarkDatasetItem<RoutingPayload>;
+declare class RoutingAdapter implements BenchmarkAdapter<RoutingDatasetItem, RoutingPayload> {
+    loadDataset(split: RunSplitTag): Promise<RoutingDatasetItem[]>;
+    evaluate(item: RoutingDatasetItem, response: string): Promise<BenchmarkEvaluation>;
+    assignSplit(itemId: string): RunSplitTag;
+}
+/**
+ * Pull route-shaped tokens out of a model response. Routes look like
+ * `category.action` (`fs.write`, `chat.reply`). Bare alphanumerics
+ * are not routes, but `category.action` patterns are robust to most
+ * model wrappers (JSON output, prose explanations, code fences).
+ *
+ * @param response - Raw model response text to scan.
+ * @returns The route-shaped tokens found. NOTE(review): presumably in
+ *   order of appearance, since the first valid token is the one
+ *   graded — confirm.
+ */
+declare function extractRouteTokens(response: string): string[];
+declare const loadDataset: (split: RunSplitTag) => Promise<RoutingDatasetItem[]>; // Same signature as `RoutingAdapter.loadDataset`. NOTE(review): presumably bound to a shared adapter instance — confirm.
+declare const evaluate: (item: RoutingDatasetItem, response: string) => Promise<BenchmarkEvaluation>; // Same signature as `RoutingAdapter.evaluate`.
+declare const assignSplit: (itemId: string) => RunSplitTag; // Same signature as `RoutingAdapter.assignSplit`.
+declare const index$1_ROUTING_DATASET: typeof ROUTING_DATASET; // The `index$1_*` names below alias the routing declarations so the namespace can re-export them.
+type index$1_RoutingAdapter = RoutingAdapter;
+declare const index$1_RoutingAdapter: typeof RoutingAdapter;
+type index$1_RoutingDatasetItem = RoutingDatasetItem;
+type index$1_RoutingItem = RoutingItem;
+type index$1_RoutingPayload = RoutingPayload;
+declare const index$1_assignSplit: typeof assignSplit;
+declare const index$1_evaluate: typeof evaluate;
+declare const index$1_extractRouteTokens: typeof extractRouteTokens;
+declare const index$1_loadDataset: typeof loadDataset;
+declare namespace index$1 { // Routing benchmark surface; re-exported as `routing` by the `index` namespace.
+  export { index$1_ROUTING_DATASET as ROUTING_DATASET, index$1_RoutingAdapter as RoutingAdapter, type index$1_RoutingDatasetItem as RoutingDatasetItem, type index$1_RoutingItem as RoutingItem, type index$1_RoutingPayload as RoutingPayload, index$1_assignSplit as assignSplit, index$1_evaluate as evaluate, index$1_extractRouteTokens as extractRouteTokens, index$1_loadDataset as loadDataset };
+}
+/**
+ * Reference benchmark wrappers — entry point.
+ *
+ * Core surface (exported here):
+ *   - The `BenchmarkAdapter` contract, with its companion types
+ *     `BenchmarkDatasetItem` and `BenchmarkEvaluation`.
+ *   - `deterministicSplit` + `BENCHMARK_SPLIT_SEED` for split assignment.
+ *   - `routing` — synthetic 16-task router benchmark. The only novel
+ *     benchmark we built; ships in the package.
+ *
+ * Example wrappers (under `examples/benchmarks/`, NOT in the bundle):
+ *   - `gsm8k`         — exact-match math reasoning (HF mirror, dataset
+ *                       not bundled).
+ *   - `swebench-lite` — 30-instance SWE-Bench subset (stub; needs an
+ *                       external grader).
+ *
+ * The example wrappers are reference implementations of `BenchmarkAdapter`.
+ * Read them, copy them, adapt them. They're intentionally not in the main
+ * entry — every team will configure them differently.
+ *
+ * NOTE(review): the `examples/benchmarks/` files are listed as removed
+ * from the published package in this release — confirm they remain
+ * available in the source repository before pointing readers at them.
+ */
+declare const index_BENCHMARK_SPLIT_SEED: typeof BENCHMARK_SPLIT_SEED;
+type index_BenchmarkAdapter<_TItem = unknown, TPayload = unknown> = BenchmarkAdapter<_TItem, TPayload>;
+type index_BenchmarkDatasetItem<TPayload = unknown> = BenchmarkDatasetItem<TPayload>;
+type index_BenchmarkEvaluation = BenchmarkEvaluation;
+declare const index_deterministicSplit: typeof deterministicSplit;
+declare namespace index {
+  export { index_BENCHMARK_SPLIT_SEED as BENCHMARK_SPLIT_SEED, type index_BenchmarkAdapter as BenchmarkAdapter, type index_BenchmarkDatasetItem as BenchmarkDatasetItem, type index_BenchmarkEvaluation as BenchmarkEvaluation, index_deterministicSplit as deterministicSplit, index$1 as routing };
+}
+export { BENCHMARK_SPLIT_SEED as B, type RunRecord as R, type RunSplitTag as a, type BenchmarkAdapter as b, type BenchmarkDatasetItem as c, type BenchmarkEvaluation as d, type RunJudgeMetadata as e, type RunOutcome as f, RunRecordValidationError as g, type RunTokenUsage as h, deterministicSplit as i, index as j, isRunRecord as k, index$1 as l, parseRunRecordSafe as p, roundTripRunRecord as r, validateRunRecord as v }; // NOTE(review): the short export names look like bundler-generated aliases for a shared chunk — presumably consumers import the readable names from the package entry points; confirm.