@tangle-network/agent-eval 0.20.9 → 0.20.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md ADDED
@@ -0,0 +1,302 @@
1
+ # Changelog
2
+
3
+ ## 0.20.10 — hardening audit follow-up
4
+
5
+ ### Fixed
6
+
7
+ - `hashRubric` now recursively sorts nested rubric fields before hashing, so
8
+ dimension, failure-mode, and win changes alter `rubricVersion`.
9
+ - Wire judge handling now validates LLM output before returning it: finite
10
+ dimension scores, rationale, and known failure/win ids are enforced.
11
+ - Control-runtime budgets reject invalid numeric config, and invalid action
12
+ costs are omitted from step telemetry instead of leaking `NaN`/`Infinity`.
13
+ - Knowledge readiness now treats invalid `validUntil` timestamps as stale.
14
+ - Trace-analyst regex search supports leading `(?i)` and stops scanning once
15
+ bounded match output is reached.
16
+ - SWE-Bench Lite example wording now reflects the implemented external-grader
17
+ adapter, with quoted command parsing and timeout coverage.
18
+
19
+ ### Changed
20
+
21
+ - Published package contents now include `CHANGELOG.md`.
22
+ - Public docs now use GitHub URLs for repository-only examples and Python
23
+ client source.
24
+ - Publish CI now checks npm, Python package, runtime fallback version, and tag
25
+ version agree before publishing.
26
+
27
+ ## 0.20.9 — release hygiene and runtime failure fixes
28
+
29
+ ### Fixed
30
+
31
+ - Initial `runAgentControlLoop` observe/validate failures now report the
32
+ actual observe/validate error even when trace start/end emission also fails.
33
+ - Knowledge readiness recommended actions now honor non-blocking gap
34
+ acquisition modes such as `ask_user`, `search_web`, `query_connector`, and
35
+ `inspect_repo`.
36
+ - Npm builds now generate `dist/openapi.json`, and the package exports
37
+ `@tangle-network/agent-eval/openapi.json`.
38
+ - Npm and Python client versions are locked at `0.20.9`.
39
+
40
+ ### Added
41
+
42
+ - `CallbackResearcher`, a concrete callback-backed implementation of the
43
+ stable `Researcher` interface for scripts, tests, and small integrations.
44
+ - Public `@tangle-network/agent-eval/benchmarks` subpath for the supported
45
+ routing benchmark surface.
46
+ - Root MIT `LICENSE`.
47
+
48
+ ### Changed
49
+
50
+ - Raw TypeScript examples are no longer included in the npm package; they remain
51
+ repository examples to read, copy, and adapt.
52
+
53
+ ## 0.20.2 — freshness-aware knowledge readiness
54
+
55
+ ### Added
56
+
57
+ - `KnowledgeRequirement.validUntil` and `lastVerifiedAt` for explicit freshness
58
+ contracts.
59
+ - `scoreKnowledgeReadiness({ now })` support for deterministic freshness gates.
60
+
61
+ ### Changed
62
+
63
+ - Expired knowledge requirements now score as missing even when confidence and
64
+ evidence are otherwise high.
65
+
66
+ ## 0.20.0 — knowledge readiness contracts
67
+
68
+ ### Added
69
+
70
+ - First-class knowledge-readiness contracts: `KnowledgeRequirement`,
71
+ `KnowledgeBundle`, `KnowledgeReadinessReport`, `UserQuestion`, and
72
+ `DataAcquisitionPlan`.
73
+ - `scoreKnowledgeReadiness`, `blockingKnowledgeEval`,
74
+ `userQuestionsForKnowledgeGaps`, and `acquisitionPlansForKnowledgeGaps`.
75
+ - Knowledge/data failure classes including `knowledge_readiness_blocked`,
76
+ `missing_credentials`, `bad_retrieval`, `insufficient_evidence`, and
77
+ `contradictory_evidence`.
78
+ - `docs/knowledge-readiness.md`, plus documented knowledge-related ASI
79
+ responsible surfaces for multi-shot optimization.
80
+
81
+ ## 0.19.1 — release confidence gate
82
+
83
+ ### Added
84
+
85
+ - `evaluateReleaseConfidence`, a conservative release scorecard over corpus
86
+ coverage, search/holdout run evidence, ASI diagnostics, overfit checks, and
87
+ cost/latency budgets.
88
+ - `assertReleaseConfidence`, a throwing variant for CI/release scripts.
89
+ - `releaseTraceEvidenceFromMultiShotTrials`, a helper that projects
90
+ `MultiShotTrialResult` rows into release trace evidence so single-shot and
91
+ variable multi-shot apps use the same release gate.
92
+
93
+ ## 0.19.0 — legacy optimizer removal
94
+
95
+ ### Removed
96
+
97
+ - Removed the legacy pairwise prompt optimizer surface:
98
+ `PromptOptimizer`, `OptimizationLoop`, and their associated root-exported
99
+ types are gone. The blessed optimization path is now
100
+ `runMultiShotOptimization` for task trajectories and the steering-specific
101
+ optimizers for explicit steering tables.
102
+ - Removed the old `PromptVariant` root export. Public callers should use
103
+ `MultiShotVariant` for multi-shot trajectory optimization or
104
+ `EvolvableVariant` for the lower-level prompt/code evolution core.
105
+
106
+ ### Changed
107
+
108
+ - Documentation now points optimization users at `runMultiShotOptimization`
109
+ instead of the removed pairwise prompt optimizer.
110
+
111
+ ## 0.18.0 — multi-shot optimization
112
+
113
+ ### Added
114
+
115
+ - `runMultiShotOptimization`, the canonical GEPA-style adapter for
116
+ variable-length agent trajectories. It wraps `runPromptEvolution` while
117
+ preserving full multi-shot traces, actionable side information, stable paired
118
+ seeds, score/cost objectives, and optional held-out promotion gating.
119
+ - `trialTraceFromMultiShotTrial`, a bridge from multi-shot trial results into
120
+ reflective mutation prompts.
121
+ - `ActionableSideInfo`, `MultiShotVariant`, `MultiShotTrace`, `MultiShotRun`,
122
+ `MultiShotScore`, `MultiShotTrialResult`, `MultiShotMutateAdapter`, and
123
+ related public types.
124
+ - `docs/multi-shot-optimization.md` and
125
+ `examples/multi-shot-optimization/index.ts`.
126
+
127
+ ### Changed
128
+
129
+ - The multi-shot result shape explicitly separates `searchBestVariant` from
130
+ `promotedVariant`. If a holdout gate rejects the search winner, the promoted
131
+ variant is the baseline.
132
+ - `runMultiShotOptimization` validates release-critical configuration up front:
133
+ unique variant/scenario ids, positive integer run counts, population size,
134
+ disjoint search/holdout ids, and a gate baseline key matching the first seed
135
+ variant.
136
+
137
+ ## 0.17.2 — agent control runtime
138
+
139
+ ### Added
140
+
141
+ - `runAgentControlLoop`, a generic `observe -> validate -> decide -> act`
142
+ runtime for agentic tasks with step, wall-clock, and recorded-cost budgets;
143
+ no-progress and repeated-action stop policies; structured runtime failures;
144
+ objective/subjective eval helpers; and `TraceStore` emission.
145
+ - `runProposeReviewAsControlLoop`, a bridge preset that expresses
146
+ propose/verify/review as a specialization of the generic control runtime.
147
+ - Feedback trajectory helpers for turning control-loop runs and user/judge
148
+ labels into reusable dataset scenarios, optimizer rows, and preference
149
+ memory.
150
+ - `docs/control-runtime.md`, with integration patterns for tax, legal,
151
+ agent-builder, and film-agent products.
152
+
153
+ ### Changed
154
+
155
+ - Control-runtime trace sink and `onStep` callback failures are now recorded
156
+ as structured runtime errors without aborting an otherwise valid run.
157
+ - `runProposeReviewAsControlLoop` accepts a caller-provided verifier failure
158
+ mapper for domain-specific failure classes.
159
+
160
+ ## 0.17.0 — surface cleanup + usage-guidance pitfalls
161
+
162
+ This release tightens the public benchmark surface and lands internal usage guidance that the v0.15 dispatch couldn't write.
163
+
164
+ ### Moved
165
+
166
+ - `src/benchmarks/gsm8k/` → `examples/benchmarks/gsm8k/`
167
+ - `src/benchmarks/swebench-lite/` → `examples/benchmarks/swebench-lite/`
168
+
169
+ These are reference implementations of `BenchmarkAdapter`, not core surface. Consumers read them, copy them, adapt them. The novel `routing` benchmark stays in `src/benchmarks/` because it's our own and broadly useful.
170
+
171
+ `src/benchmarks/index.ts` now exports the shared types + the `routing` benchmark only. The previous `gsm8k` and `swebenchLite` namespace exports are gone — import directly from `examples/benchmarks/<name>/index.ts` (or copy the wrapper into your own project).
172
+
173
+ ### Added
174
+
175
+ - `examples/benchmarks/README.md` documents how to use, copy, and extend the example wrappers.
176
+ - Internal agent-eval usage guidance gains production-rigor and pitfalls sections covering the v0.16 primitives.
177
+
178
+ ### Migration
179
+
180
+ If you imported `gsm8k` or `swebenchLite` from `@tangle-network/agent-eval/benchmarks`:
181
+
182
+ ```ts
183
+ // before
184
+ import { gsm8k, swebenchLite } from '@tangle-network/agent-eval/benchmarks'
185
+
186
+ // after — copy the file from examples/benchmarks/<name>/index.ts into your project,
187
+ // or import via relative path from the cloned repo.
188
+ ```
189
+
190
+ The `routing` benchmark and the shared `BenchmarkAdapter` types are unchanged.
191
+
192
+ ## 0.16.0 — naming cleanup
193
+
194
+ The v0.15 primitives were framed as "paper-grade" but most are production-rigor utilities any team needs. This release renames the three reporting helpers and drops the "paper" framing from the public API. Behavior unchanged.
195
+
196
+ ### Renamed
197
+
198
+ - `paperTable` → `summaryTable`
199
+ - `paretoFigure` → `paretoChart`
200
+ - `gainDistributionFigure` → `gainHistogram`
201
+ - `PaperTable` / `PaperTableOptions` / `PaperTableRow` types → `SummaryTable` / `SummaryTableOptions` / `SummaryTableRow`
202
+ - File: `src/paper-report.ts` → `src/summary-report.ts`
203
+
204
+ ### Migration
205
+
206
+ Drop-in: search-and-replace the three function names and the file path. Type names follow the same pattern. No behavior change.
207
+
208
+ ```ts
209
+ // before
210
+ import { paperTable, paretoFigure, gainDistributionFigure } from '@tangle-network/agent-eval'
211
+ // after
212
+ import { summaryTable, paretoChart, gainHistogram } from '@tangle-network/agent-eval'
213
+ ```
214
+
215
+ ## 0.15.0 — paper-grade primitives
216
+
217
+ Substrate for the "Two Loops, Three Roles" paper on multi-level prompt
218
+ optimization with held-out promotion gates.
219
+
220
+ ### Added
221
+
222
+ - **`HeldOutGate`** (`src/promotion-gate.ts`) — first-class held-out
223
+ paired-delta promotion gate. Three checks: minimum productive runs,
224
+ positive lower bound on bootstrap CI of paired holdout median delta,
225
+ bounded overfit-gap relative to baseline. Decisions carry a
226
+ machine-readable `rejectionCode` (`few_runs` | `negative_delta` |
227
+ `overfit_gap`) plus an `evidence` block with every number the gate
228
+ read. Generalizes the inline pattern that lived in
229
+ `redteam/scripts/agent-eval-autoresearch.ts:138–171`.
230
+ - **`RunRecord`** (`src/run-record.ts`) — paper-grade JSON-friendly run
231
+ schema with mandatory fields: `runId`, `experimentId`, `candidateId`,
232
+ `seed`, snapshot-versioned `model`, `promptHash`, `configHash`,
233
+ `commitSha`, `wallMs`, `costUsd`, `tokenUsage`, `outcome`, `splitTag`.
234
+ Runtime validator (`validateRunRecord`, `isRunRecord`,
235
+ `parseRunRecordSafe`, `roundTripRunRecord`) throws on missing fields
236
+ and on bare model aliases without snapshot suffix.
237
+ - **`Researcher`** (`src/researcher.ts`) — stable hook for an
238
+ autonomous-research agent: `inspectFailures` → `proposeChange` →
239
+ `applyChange` → `evaluateChange`. `NoopResearcher` is the
240
+ fail-loud placeholder. Implementations live downstream.
241
+ - **Reference benchmarks** (`src/benchmarks/`) — three adapters that
242
+ share the `BenchmarkAdapter<TItem, TPayload>` shape:
243
+ - `gsm8k`: HF-mirror loader (JSONL via `AGENT_EVAL_GSM8K_PATH`),
244
+ exact-match grading via `parseGsm8kAnswer`.
245
+ - `swebench-lite`: 30-instance subset stub. Loader reads
246
+ `AGENT_EVAL_SWEBENCH_PATH`; grader shells out to
247
+ `AGENT_EVAL_SWEBENCH_GRADER_CMD`. Both fail loud when unset.
248
+ - `routing`: synthetic 16-task router benchmark, ships in the
249
+ package, dependency-free. Format documented in
250
+ `src/benchmarks/routing/README.md`.
251
+ - `deterministicSplit(itemId, seed?)`: stable 60/20/20 split via
252
+ FNV-1a hash. Default seed `agent-eval-v1`.
253
+ - **`summaryTable`, `paretoChart`, `gainHistogram`**
254
+ (`src/summary-report.ts`) — Table 1 + Pareto + gain-distribution specs.
255
+ Returns data structures (markdown table, point lists, histogram bins);
256
+ caller picks the plotting library.
257
+ - **`runCanaries`** (`src/canary.ts`) — three liveness canaries:
258
+ silent judge fallback (consecutive constant-confidence streak),
259
+ judge calibration drift (KS test on confidence distribution), eval-set
260
+ distribution shift (chi-square on category bucket counts).
261
+ - **`pairedBootstrap`, `pairedWilcoxon`, `bhAdjust`**
262
+ (`src/paired-stats.ts`) — paper-style aliases + the missing paired
263
+ bootstrap CI primitive. Deterministic with optional seed.
264
+
265
+ ### Notes
266
+
267
+ - No breaking changes. Every existing module is untouched; new types
268
+ are additive.
269
+ - All new public symbols carry JSDoc.
270
+ - 87 new tests across 7 new test files. 571 total tests pass.
271
+ - See the package docs for usage directives and pitfalls.
272
+
273
+ ## 0.11.0
274
+
275
+ intent-match + flow-layer + deploy-gate + concept complexity
276
+ weighting.
277
+
278
+ ## 0.10.0
279
+
280
+ `LayerResult.diagnostics` + `buildReviewerPrompt` +
281
+ `createDefaultReviewer` + `mergeLayerResults` options.
282
+
283
+ ## 0.9.0
284
+
285
+ `CommandRunner` contract + `multiToolchainLayer` + `Finding.detail`.
286
+
287
+ ## 0.8.x
288
+
289
+ `probeLlm` + `keyword-coverage-judge`. Honestly-absent primitives
290
+ backfilled — `llm-client`, multi-layer verifier, semantic concept judge,
291
+ extractor utilities.
292
+
293
+ ## 0.7.x
294
+
295
+ Extracted muffled-gate scanner; `CostTracker.recordVerdict`. Footgun
296
+ fix: `cwd` belongs in `HarnessConfig`, not the driver constructor.
297
+
298
+ ## 0.6.x
299
+
300
+ Tier 1 (meta-eval correlation, PRM, bisector), Tier 2 (counterfactual,
301
+ cross-trace diff, pre-registration), Tier 3 (self-play, causal
302
+ attribution, active learning, RM export), governance templates.
package/README.md CHANGED
@@ -98,16 +98,20 @@ pip install -e .
98
98
  | `summaryTable`, `paretoChart`, `gainHistogram` | Report-ready structured outputs. |
99
99
  | `KnowledgeRequirement`, `KnowledgeBundle` | Shared contracts for knowledge readiness. |
100
100
 
101
+ `NoopResearcher` is a fail-loud sentinel for wiring tests. Production systems
102
+ should implement `Researcher` directly or use `CallbackResearcher`.
103
+
101
104
  ## Examples
102
105
 
103
- Runnable examples live in the repository's [`examples/`](./examples)
106
+ Runnable examples live in the repository's
107
+ [`examples/`](https://github.com/tangle-network/agent-eval/tree/main/examples)
104
108
  directory. They are not part of the published npm package.
105
109
 
106
- - [`examples/same-sandbox-harness`](./examples/same-sandbox-harness) - run
110
+ - [`examples/same-sandbox-harness`](https://github.com/tangle-network/agent-eval/tree/main/examples/same-sandbox-harness) - run
107
111
  multiple eval passes against the same workspace.
108
- - [`examples/multi-shot-optimization`](./examples/multi-shot-optimization) -
112
+ - [`examples/multi-shot-optimization`](https://github.com/tangle-network/agent-eval/tree/main/examples/multi-shot-optimization) -
109
113
  optimize full agent trajectories with held-out promotion.
110
- - [`examples/benchmarks`](./examples/benchmarks) - benchmark adapter shape and
114
+ - [`examples/benchmarks`](https://github.com/tangle-network/agent-eval/tree/main/examples/benchmarks) - benchmark adapter shape and
111
115
  reference benchmark wrappers.
112
116
 
113
117
  The examples are intentionally kept outside the README so they can be expanded,
@@ -1 +1 @@
1
- export { B as BENCHMARK_SPLIT_SEED, b as BenchmarkAdapter, c as BenchmarkDatasetItem, d as BenchmarkEvaluation, i as deterministicSplit, l as routing } from '../index-CEWY1rmu.js';
1
+ export { B as BENCHMARK_SPLIT_SEED, b as BenchmarkAdapter, c as BenchmarkDatasetItem, d as BenchmarkEvaluation, i as deterministicSplit, l as routing } from '../index-1PZOtZFr.js';
@@ -2,7 +2,7 @@ import {
2
2
  BENCHMARK_SPLIT_SEED,
3
3
  deterministicSplit,
4
4
  routing_exports
5
- } from "../chunk-XDGJUIV2.js";
5
+ } from "../chunk-42I2QC2L.js";
6
6
  import "../chunk-PZ5AY32C.js";
7
7
  export {
8
8
  BENCHMARK_SPLIT_SEED,
@@ -216,4 +216,4 @@ export {
216
216
  routing_exports,
217
217
  benchmarks_exports
218
218
  };
219
- //# sourceMappingURL=chunk-XDGJUIV2.js.map
219
+ //# sourceMappingURL=chunk-42I2QC2L.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/benchmarks/index.ts","../src/benchmarks/types.ts","../src/benchmarks/routing/index.ts","../src/benchmarks/routing/dataset.ts"],"sourcesContent":["/**\n * Reference benchmark wrappers — entry point.\n *\n * Core surface (exported here):\n * - The `BenchmarkAdapter` contract.\n * - `deterministicSplit` + `BENCHMARK_SPLIT_SEED` for split assignment.\n * - `routing` — synthetic 16-task router benchmark. The only novel\n * benchmark we built; ships in the package.\n *\n * Example wrappers (under `examples/benchmarks/`, NOT in the bundle):\n * - `gsm8k` — exact-match math reasoning (HF mirror, dataset\n * not bundled).\n * - `swebench-lite` — 30-instance SWE-Bench subset via an external\n * grader command.\n *\n * The example wrappers are reference implementations of `BenchmarkAdapter`.\n * Read them, copy them, adapt them. They're intentionally not in the main\n * entry — every team will configure them differently.\n */\n\nexport type {\n BenchmarkAdapter,\n BenchmarkDatasetItem,\n BenchmarkEvaluation,\n} from './types'\nexport { deterministicSplit, BENCHMARK_SPLIT_SEED } from './types'\n\nexport * as routing from './routing/index'\n","/**\n * Shared types for the reference benchmark wrappers under\n * `src/benchmarks/`. Each wrapper exports the three functions in\n * `BenchmarkAdapter` plus its own typed `DatasetItem` shape.\n */\n\nimport type { RunSplitTag } from '../run-record'\n\nexport interface BenchmarkDatasetItem<TPayload = unknown> {\n /** Stable dataset-local item id (used for split assignment + paper\n * references). Unique within a benchmark. */\n id: string\n /** Free-form payload. Each benchmark defines its own shape. */\n payload: TPayload\n}\n\nexport interface BenchmarkEvaluation {\n /** [0, 1] score for the response on this item. Exact-match\n * benchmarks use 0/1; partial-credit benchmarks may return\n * fractional values. */\n score: number\n /** Optional bag of raw scoring signals — e.g. 
parsed numeric\n * answer, regex match, judge sub-scores. */\n raw: Record<string, unknown>\n}\n\n/** Common signature implemented by every adapter under `src/benchmarks/*`. */\n// `TPayload` is the per-item payload type; `_TItem` is preserved for\n// downstream type-narrowing extensions (a richer `BenchmarkDatasetItem`\n// subclass that adds e.g. provenance metadata) but is intentionally\n// unused here. `noUnusedLocals` requires the leading underscore.\nexport interface BenchmarkAdapter<_TItem = unknown, TPayload = unknown> {\n /** Load the dataset for the given split. May hit the network on\n * first call but should be cache-friendly. Adapters that don't\n * ship the dataset itself MUST throw a clearly-marked error\n * pointing the caller at the loader script. */\n loadDataset(split: RunSplitTag): Promise<BenchmarkDatasetItem<TPayload>[]>\n /** Score a single response. Pure with respect to the inputs. */\n evaluate(item: BenchmarkDatasetItem<TPayload>, response: string): Promise<BenchmarkEvaluation>\n /** Deterministic split assignment via item id hashing. The\n * fraction of items in each split is implementation-defined but\n * MUST be stable across processes and platforms. */\n assignSplit(itemId: string): RunSplitTag\n}\n\n// ── Deterministic split assignment ───────────────────────────────────\n\n/**\n * 32-bit FNV-1a hash. Stable, allocation-free, deterministic across\n * runtimes. We use it to assign items to splits rather than depending\n * on a polyfilled crypto.subtle path.\n */\nfunction fnv1a32(input: string): number {\n let h = 0x811c9dc5\n for (let i = 0; i < input.length; i++) {\n h ^= input.charCodeAt(i) & 0xff\n h = (h + ((h << 1) + (h << 4) + (h << 7) + (h << 8) + (h << 24))) >>> 0\n }\n return h >>> 0\n}\n\n/** Split-assignment seed shared across all benchmarks. Bumping this\n * value reshuffles every split — do NOT do that lightly. 
*/\nexport const BENCHMARK_SPLIT_SEED = 'agent-eval-v1'\n\n/**\n * Assign an item id to one of `'search' | 'dev' | 'holdout'` using a\n * stable 32-bit hash of `${seed}::${id}`. Default proportions:\n *\n * search: 60% (optimization-readable)\n * dev: 20% (held-out for tuning, leak-on-purpose during dev)\n * holdout:20% (paper-grade held-out, gated reads)\n */\nexport function deterministicSplit(\n itemId: string,\n seed: string = BENCHMARK_SPLIT_SEED,\n): RunSplitTag {\n const h = fnv1a32(`${seed}::${itemId}`)\n const pos = h / 0x100000000\n if (pos < 0.6) return 'search'\n if (pos < 0.8) return 'dev'\n return 'holdout'\n}\n","/**\n * Routing benchmark — synthetic, dependency-free, ships in the\n * package. 16 cross-category items in `dataset.ts`. See\n * `routing/README.md` for the format.\n *\n * `evaluate` does case-insensitive exact match against the canonical\n * route plus declared synonyms. The first valid route token in the\n * response wins; everything else is ignored. Wrong answers also\n * report whether they hit a hard negative — useful when triaging\n * \"always picks the popular route\" failure modes.\n */\n\nimport type {\n BenchmarkAdapter,\n BenchmarkDatasetItem,\n BenchmarkEvaluation,\n} from '../types'\nimport { deterministicSplit } from '../types'\nimport type { RunSplitTag } from '../../run-record'\nimport { ROUTING_DATASET, type RoutingItem } from './dataset'\n\nexport type { RoutingItem }\nexport type RoutingPayload = RoutingItem\nexport type RoutingDatasetItem = BenchmarkDatasetItem<RoutingPayload>\n\nclass RoutingAdapter\n implements BenchmarkAdapter<RoutingDatasetItem, RoutingPayload>\n{\n async loadDataset(split: RunSplitTag): Promise<RoutingDatasetItem[]> {\n return ROUTING_DATASET\n .map((item) => ({ id: item.id, payload: item }))\n .filter((it) => assignSplitImpl(it.id) === split)\n }\n\n async evaluate(\n item: RoutingDatasetItem,\n response: string,\n ): Promise<BenchmarkEvaluation> {\n const tokens = extractRouteTokens(response)\n 
const correct = new Set<string>([item.payload.route, ...item.payload.synonyms].map((s) => s.toLowerCase()))\n const hardNeg = new Set<string>(item.payload.hardNegatives.map((s) => s.toLowerCase()))\n const firstMatch = tokens.find((t) => correct.has(t.toLowerCase())) ?? null\n const firstHardNeg = tokens.find((t) => hardNeg.has(t.toLowerCase())) ?? null\n const score = firstMatch ? 1 : 0\n return {\n score,\n raw: {\n firstToken: tokens[0] ?? null,\n matchedRoute: firstMatch,\n hitHardNegative: Boolean(firstHardNeg),\n hardNegativeRoute: firstHardNeg,\n category: item.payload.category,\n },\n }\n }\n\n assignSplit(itemId: string): RunSplitTag {\n return assignSplitImpl(itemId)\n }\n}\n\nfunction assignSplitImpl(itemId: string): RunSplitTag {\n return deterministicSplit(`routing::${itemId}`)\n}\n\n/**\n * Pull route-shaped tokens out of a model response. Routes look like\n * `category.action` (`fs.write`, `chat.reply`). Bare alphanumerics\n * are not routes, but `category.action` patterns are robust to most\n * model wrappers (JSON output, prose explanations, code fences).\n */\nexport function extractRouteTokens(response: string): string[] {\n const matches = response.match(/[a-z][a-z0-9_]*\\.[a-z][a-z0-9_]*/gi)\n return matches ?? []\n}\n\nconst adapter = new RoutingAdapter()\n\nexport const loadDataset = adapter.loadDataset.bind(adapter)\nexport const evaluate = adapter.evaluate.bind(adapter)\nexport const assignSplit = adapter.assignSplit.bind(adapter)\nexport { RoutingAdapter, ROUTING_DATASET }\n","/**\n * Synthetic routing dataset. 16 tasks across 4 categories. 
Used as a\n * deterministic, dependency-free benchmark for any router that maps a\n * natural-language request to one of a fixed set of route labels.\n *\n * Format (see `routing/README.md` for prose):\n *\n * {\n * id: stable per-task ID (matches across processes).\n * category: one of the four route labels.\n * prompt: the user-facing request the router must classify.\n * route: the ground-truth route the router should pick.\n * synonyms: other strings that count as a correct answer.\n * hardNegatives:close-but-wrong route labels — used to detect the\n * \"always picks the popular route\" failure mode.\n * }\n *\n * The four categories are intentionally cross-domain (file ops,\n * math, search, conversation) so a router that collapses to one\n * category is easy to spot.\n */\n\nexport interface RoutingItem {\n id: string\n category: 'file' | 'math' | 'search' | 'chat'\n prompt: string\n /** Canonical correct route label. */\n route: string\n /** Alternate route labels that also count as correct. */\n synonyms: string[]\n /** Wrong-but-tempting route labels (for analysis, not grading). 
*/\n hardNegatives: string[]\n}\n\nexport const ROUTING_DATASET: RoutingItem[] = [\n {\n id: 'file_001',\n category: 'file',\n prompt: 'Save the meeting notes to /tmp/notes-2025-04.md as markdown.',\n route: 'fs.write',\n synonyms: ['filesystem.write', 'write_file'],\n hardNegatives: ['fs.read', 'chat.reply'],\n },\n {\n id: 'file_002',\n category: 'file',\n prompt: 'Read the contents of /etc/hosts and summarize the entries.',\n route: 'fs.read',\n synonyms: ['filesystem.read', 'read_file'],\n hardNegatives: ['fs.write', 'search.web'],\n },\n {\n id: 'file_003',\n category: 'file',\n prompt: 'List every Python file under src/ recursively.',\n route: 'fs.list',\n synonyms: ['filesystem.list', 'list_files'],\n hardNegatives: ['fs.read', 'search.code'],\n },\n {\n id: 'file_004',\n category: 'file',\n prompt: 'Delete the cached build at .turbo/cache.',\n route: 'fs.delete',\n synonyms: ['filesystem.delete', 'remove_file'],\n hardNegatives: ['fs.write', 'fs.list'],\n },\n {\n id: 'math_001',\n category: 'math',\n prompt: 'What is the integral of 3x^2 + 2x from 0 to 5?',\n route: 'math.integral',\n synonyms: ['calculator.integral', 'math.solve'],\n hardNegatives: ['math.derivative', 'chat.reply'],\n },\n {\n id: 'math_002',\n category: 'math',\n prompt: 'Compute the derivative of sin(x) * cos(x).',\n route: 'math.derivative',\n synonyms: ['calculator.derivative', 'math.solve'],\n hardNegatives: ['math.integral', 'math.algebra'],\n },\n {\n id: 'math_003',\n category: 'math',\n prompt: 'Solve 2x + 7 = 19 for x.',\n route: 'math.algebra',\n synonyms: ['calculator.algebra', 'math.solve'],\n hardNegatives: ['math.derivative', 'math.integral'],\n },\n {\n id: 'math_004',\n category: 'math',\n prompt: 'What is the prime factorization of 360?',\n route: 'math.numbertheory',\n synonyms: ['calculator.factor', 'math.solve'],\n hardNegatives: ['math.algebra', 'search.web'],\n },\n {\n id: 'search_001',\n category: 'search',\n prompt: 'Find recent papers on agent prompt 
optimization with held-out promotion gates.',\n route: 'search.web',\n synonyms: ['web.search', 'search.papers'],\n hardNegatives: ['search.code', 'chat.reply'],\n },\n {\n id: 'search_002',\n category: 'search',\n prompt: 'Search the codebase for every call site of `runProposeReview`.',\n route: 'search.code',\n synonyms: ['code.search', 'grep'],\n hardNegatives: ['search.web', 'fs.read'],\n },\n {\n id: 'search_003',\n category: 'search',\n prompt: 'What is the latest release of the Tangle network on GitHub?',\n route: 'search.web',\n synonyms: ['web.search', 'github.releases'],\n hardNegatives: ['search.code', 'chat.reply'],\n },\n {\n id: 'search_004',\n category: 'search',\n prompt: 'Find all TODO comments in the agent-eval src tree.',\n route: 'search.code',\n synonyms: ['code.search', 'grep'],\n hardNegatives: ['search.web', 'fs.list'],\n },\n {\n id: 'chat_001',\n category: 'chat',\n prompt: 'Hi there, how are you doing today?',\n route: 'chat.reply',\n synonyms: ['conversation.reply'],\n hardNegatives: ['search.web', 'fs.read'],\n },\n {\n id: 'chat_002',\n category: 'chat',\n prompt: 'Please explain the difference between an LLM and a foundation model.',\n route: 'chat.reply',\n synonyms: ['conversation.reply', 'qa.answer'],\n hardNegatives: ['search.web', 'math.algebra'],\n },\n {\n id: 'chat_003',\n category: 'chat',\n prompt: 'Tell me a short joke about distributed systems.',\n route: 'chat.reply',\n synonyms: ['conversation.reply'],\n hardNegatives: ['search.web', 'fs.read'],\n },\n {\n id: 'chat_004',\n category: 'chat',\n prompt: 'Acknowledge my last message with a thumbs up.',\n route: 'chat.reply',\n synonyms: ['conversation.reply', 'react'],\n hardNegatives: ['fs.write', 'search.web'],\n 
},\n]\n"],"mappings":";;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACoDA,SAAS,QAAQ,OAAuB;AACtC,MAAI,IAAI;AACR,WAAS,IAAI,GAAG,IAAI,MAAM,QAAQ,KAAK;AACrC,SAAK,MAAM,WAAW,CAAC,IAAI;AAC3B,QAAK,MAAM,KAAK,MAAM,KAAK,MAAM,KAAK,MAAM,KAAK,MAAM,KAAK,SAAU;AAAA,EACxE;AACA,SAAO,MAAM;AACf;AAIO,IAAM,uBAAuB;AAU7B,SAAS,mBACd,QACA,OAAe,sBACF;AACb,QAAM,IAAI,QAAQ,GAAG,IAAI,KAAK,MAAM,EAAE;AACtC,QAAM,MAAM,IAAI;AAChB,MAAI,MAAM,IAAK,QAAO;AACtB,MAAI,MAAM,IAAK,QAAO;AACtB,SAAO;AACT;;;AClFA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACkCO,IAAM,kBAAiC;AAAA,EAC5C;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,oBAAoB,YAAY;AAAA,IAC3C,eAAe,CAAC,WAAW,YAAY;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,mBAAmB,WAAW;AAAA,IACzC,eAAe,CAAC,YAAY,YAAY;AAAA,EAC1C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,mBAAmB,YAAY;AAAA,IAC1C,eAAe,CAAC,WAAW,aAAa;AAAA,EAC1C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,qBAAqB,aAAa;AAAA,IAC7C,eAAe,CAAC,YAAY,SAAS;AAAA,EACvC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,uBAAuB,YAAY;AAAA,IAC9C,eAAe,CAAC,mBAAmB,YAAY;AAAA,EACjD;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,yBAAyB,YAAY;AAAA,IAChD,eAAe,CAAC,iBAAiB,cAAc;AAAA,EACjD;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,sBAAsB,YAAY;AAAA,IAC7C,eAAe,CAAC,mBAAmB,eAAe;AAAA,EACpD;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,qBAAqB,YAAY;AAAA,IAC5C,eAAe,CAAC,gBAAgB,YAAY;AAAA,EAC9C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,cAAc,eAAe;AAAA,IACxC,eAAe,CAAC,eAAe,YAAY;AAAA,EAC7C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,eAAe,MAAM;AAAA,IAChC,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAA
U;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,cAAc,iBAAiB;AAAA,IAC1C,eAAe,CAAC,eAAe,YAAY;AAAA,EAC7C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,eAAe,MAAM;AAAA,IAChC,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,oBAAoB;AAAA,IAC/B,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,sBAAsB,WAAW;AAAA,IAC5C,eAAe,CAAC,cAAc,cAAc;AAAA,EAC9C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,oBAAoB;AAAA,IAC/B,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,sBAAsB,OAAO;AAAA,IACxC,eAAe,CAAC,YAAY,YAAY;AAAA,EAC1C;AACF;;;AD1IA,IAAM,iBAAN,MAEA;AAAA,EACE,MAAM,YAAY,OAAmD;AACnE,WAAO,gBACJ,IAAI,CAAC,UAAU,EAAE,IAAI,KAAK,IAAI,SAAS,KAAK,EAAE,EAC9C,OAAO,CAAC,OAAO,gBAAgB,GAAG,EAAE,MAAM,KAAK;AAAA,EACpD;AAAA,EAEA,MAAM,SACJ,MACA,UAC8B;AAC9B,UAAM,SAAS,mBAAmB,QAAQ;AAC1C,UAAM,UAAU,IAAI,IAAY,CAAC,KAAK,QAAQ,OAAO,GAAG,KAAK,QAAQ,QAAQ,EAAE,IAAI,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;AAC1G,UAAM,UAAU,IAAI,IAAY,KAAK,QAAQ,cAAc,IAAI,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;AACtF,UAAM,aAAa,OAAO,KAAK,CAAC,MAAM,QAAQ,IAAI,EAAE,YAAY,CAAC,CAAC,KAAK;AACvE,UAAM,eAAe,OAAO,KAAK,CAAC,MAAM,QAAQ,IAAI,EAAE,YAAY,CAAC,CAAC,KAAK;AACzE,UAAM,QAAQ,aAAa,IAAI;AAC/B,WAAO;AAAA,MACL;AAAA,MACA,KAAK;AAAA,QACH,YAAY,OAAO,CAAC,KAAK;AAAA,QACzB,cAAc;AAAA,QACd,iBAAiB,QAAQ,YAAY;AAAA,QACrC,mBAAmB;AAAA,QACnB,UAAU,KAAK,QAAQ;AAAA,MACzB;AAAA,IACF;AAAA,EACF;AAAA,EAEA,YAAY,QAA6B;AACvC,WAAO,gBAAgB,MAAM;AAAA,EAC/B;AACF;AAEA,SAAS,gBAAgB,QAA6B;AACpD,SAAO,mBAAmB,YAAY,MAAM,EAAE;AAChD;AAQO,SAAS,mBAAmB,UAA4B;AAC7D,QAAM,UAAU,SAAS,MAAM,oCAAoC;AACnE,SAAO,WAAW,CAAC;AACrB;AAEA,IAAM,UAAU,IAAI,eAAe;AAE5B,IAAM,cAAc,QAAQ,YAAY,KAAK,OAAO;AACpD,IAAM,WAAW,QAAQ,SAAS,KAAK,OAAO;AAC9C,IAAM,cAAc,QAAQ,YAAY,KAAK,OAAO;","names":[]}
@@ -83,13 +83,21 @@ var ErrorResponseSchema = z.object({
83
83
  }).openapi("ErrorResponse");
84
84
  var WIRE_VERSION = "1.0.0";
85
85
  function hashRubric(rubric) {
86
- const stable = JSON.stringify(rubric, Object.keys(rubric).sort());
86
+ const stable = stableStringify(rubric);
87
87
  let h = 5381;
88
88
  for (let i = 0; i < stable.length; i++) {
89
89
  h = h * 33 ^ stable.charCodeAt(i);
90
90
  }
91
91
  return `${rubric.name}@${(h >>> 0).toString(16).padStart(8, "0")}`;
92
92
  }
93
/**
 * Serialize a JSON-like value to a canonical string suitable for hashing.
 *
 * Object keys are emitted in UTF-16 code-unit order at every nesting level, so
 * structurally equal values always produce the same text regardless of
 * property insertion order or the host's locale. Array order is preserved.
 *
 * Values JSON cannot represent follow `JSON.stringify` semantics: inside an
 * object, a property whose value is `undefined`, a function, or a symbol is
 * omitted; inside an array, or at the top level, it is written as `null`.
 *
 * @param {unknown} value - Value to serialize.
 * @returns {string} Canonical serialization of `value` (always a string).
 */
function stableStringify(value) {
  if (Array.isArray(value)) {
    return `[${value.map((item) => stableStringify(item)).join(",")}]`;
  }
  if (value && typeof value === "object") {
    const entries = Object.entries(value)
      // Drop properties JSON.stringify would drop, so `{ a: undefined }` and
      // `{}` canonicalize to the same text.
      .filter(([, item]) => item !== undefined && typeof item !== "function" && typeof item !== "symbol")
      // Plain code-unit comparison: `localeCompare` depends on the runtime's
      // default locale and ICU data, which would make the output
      // machine-dependent.
      .sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0))
      .map(([key, item]) => `${JSON.stringify(key)}:${stableStringify(item)}`);
    return `{${entries.join(",")}}`;
  }
  // JSON.stringify returns undefined (not a string) for undefined, functions,
  // and symbols; fall back to "null" so callers can always read `.length`.
  return JSON.stringify(value) ?? "null";
}
93
101
 
94
102
  // src/wire/rubrics.ts
95
103
  var ANTI_SLOP = {
@@ -225,6 +233,47 @@ function judgeOutputSchema(rubric) {
225
233
  }
226
234
  };
227
235
  }
236
/**
 * Check a judge's raw output against a rubric and return a cleaned copy.
 *
 * Checks run in this order, and the first failure throws:
 *   1. `value` is a non-null object;
 *   2. `value.dimensions` is a non-array object;
 *   3. every dimension listed in `rubric.dimensions` has a finite numeric
 *      score that is neither below `min` nor above `max`;
 *   4. `failureModes` and `wins` pass `validateIdArray` against the ids
 *      declared in `rubric.failureModes` / `rubric.wins`;
 *   5. `rationale` is a string with non-whitespace content.
 *
 * Only dimensions named by the rubric are copied to the result; other keys in
 * the judge's `dimensions` object are ignored.
 *
 * @param {unknown} value - Raw judge output.
 * @param {object} rubric - Rubric providing `dimensions`, `failureModes`, `wins`.
 * @returns {{dimensions: object, failureModes: string[], wins: string[], rationale: string}}
 * @throws {WireError} Code `"judge_error"`, constructed with `500` and the raw
 *   `value`, when any check fails.
 */
function validateJudgeOutput(value, rubric) {
  // Every rejection shares the same code, the 500 argument, and the raw payload.
  const reject = (message) => new WireError("judge_error", message, 500, value);

  if (typeof value !== "object" || !value) {
    throw reject("Judge returned malformed output.");
  }

  const { dimensions: rawDimensions } = value;
  const isPlainRecord = rawDimensions && typeof rawDimensions === "object" && !Array.isArray(rawDimensions);
  if (!isPlainRecord) {
    throw reject("Judge returned malformed dimensions.");
  }

  const dimensions = {};
  for (const { id, min, max } of rubric.dimensions) {
    const score = rawDimensions[id];
    const acceptable =
      typeof score === "number" && Number.isFinite(score) && !(score < min) && !(score > max);
    if (!acceptable) {
      throw reject(`Judge returned invalid score for dimension "${id}".`);
    }
    dimensions[id] = score;
  }

  const idsOf = (items) => new Set(items.map((item) => item.id));
  const allowedFailures = idsOf(rubric.failureModes);
  const allowedWins = idsOf(rubric.wins);
  const failureModes = validateIdArray(value.failureModes, allowedFailures, "failureModes", value);
  const wins = validateIdArray(value.wins, allowedWins, "wins", value);

  const { rationale } = value;
  if (typeof rationale !== "string" || rationale.trim().length === 0) {
    throw reject("Judge returned missing rationale.");
  }
  return { dimensions, failureModes, wins, rationale };
}
263
/**
 * Validate a judge-supplied list of ids (failure modes or wins).
 *
 * A missing list (`undefined` or `null`) is treated as empty. Otherwise `raw`
 * must be an array whose items are all strings contained in `allowed`.
 * Repeated ids are collapsed, keeping first-occurrence order.
 *
 * @param {unknown} raw - Value taken from the judge output.
 * @param {Set<string>} allowed - Ids the rubric declares.
 * @param {string} field - Field name used in error messages.
 * @param {unknown} original - Full judge output, passed through to the error.
 * @returns {string[]} Validated, de-duplicated ids.
 * @throws {WireError} Code `"judge_error"` when `raw` is a non-array value or
 *   contains a non-string or undeclared id.
 */
function validateIdArray(raw, allowed, field, original) {
  // `== null` matches undefined and null: judges commonly emit null for an
  // empty optional list, and that should not fail the whole request.
  if (raw == null) return [];
  if (!Array.isArray(raw)) {
    throw new WireError("judge_error", `Judge returned non-array ${field}.`, 500, original);
  }
  // A Set keeps insertion order, so this drops repeats without reordering.
  const seen = new Set();
  for (const item of raw) {
    if (typeof item !== "string" || !allowed.has(item)) {
      throw new WireError("judge_error", `Judge returned unknown ${field} id "${String(item)}".`, 500, original);
    }
    seen.add(item);
  }
  return [...seen];
}
228
277
  function compositeScore(dimensions, rubric) {
229
278
  let weighted = 0;
230
279
  let totalWeight = 0;
@@ -273,17 +322,15 @@ async function handleJudge(req) {
273
322
  temperature: 0,
274
323
  timeoutMs: 6e4
275
324
  });
276
- if (!value || typeof value !== "object" || !value.dimensions) {
277
- throw new WireError("judge_error", "Judge returned malformed output.", 500, value);
278
- }
279
- const composite = compositeScore(value.dimensions, rubric);
325
+ const output = validateJudgeOutput(value, rubric);
326
+ const composite = compositeScore(output.dimensions, rubric);
280
327
  const durationMs = Date.now() - startedAt;
281
328
  return {
282
329
  composite,
283
- dimensions: value.dimensions,
284
- failureModes: value.failureModes ?? [],
285
- wins: value.wins ?? [],
286
- rationale: value.rationale,
330
+ dimensions: output.dimensions,
331
+ failureModes: output.failureModes ?? [],
332
+ wins: output.wins ?? [],
333
+ rationale: output.rationale,
287
334
  rubricVersion: hashRubric(rubric),
288
335
  model: result.model,
289
336
  durationMs
@@ -400,7 +447,7 @@ function buildOpenApi(packageVersion) {
400
447
  }
401
448
  });
402
449
  const generator = new OpenApiGeneratorV31(registry.definitions);
403
- return generator.generateDocument({
450
+ const doc = generator.generateDocument({
404
451
  openapi: "3.1.0",
405
452
  info: {
406
453
  title: "@tangle-network/agent-eval \u2014 wire protocol",
@@ -413,6 +460,38 @@ Wire-protocol version: ${WIRE_VERSION}. Bumps on breaking changes to request/res
413
460
  },
414
461
  servers: [{ url: "http://localhost:5005", description: "Local agent-eval serve" }]
415
462
  });
463
+ const rubricRef = { $ref: "#/components/schemas/Rubric" };
464
+ const commonJudgeFields = {
465
+ content: { type: "string", minLength: 1 },
466
+ context: { type: "object", additionalProperties: true },
467
+ model: { type: "string" }
468
+ };
469
+ doc.components ??= {};
470
+ doc.components.schemas ??= {};
471
+ doc.components.schemas.JudgeRequest = {
472
+ oneOf: [
473
+ {
474
+ type: "object",
475
+ additionalProperties: false,
476
+ required: ["rubricName", "content"],
477
+ properties: {
478
+ rubricName: { type: "string", minLength: 1 },
479
+ ...commonJudgeFields
480
+ }
481
+ },
482
+ {
483
+ type: "object",
484
+ additionalProperties: false,
485
+ required: ["rubric", "content"],
486
+ properties: {
487
+ rubric: rubricRef,
488
+ ...commonJudgeFields
489
+ }
490
+ }
491
+ ],
492
+ description: "Judge request. Provide exactly one of rubricName or rubric."
493
+ };
494
+ return doc;
416
495
  }
417
496
 
418
497
  // src/wire/server.ts
@@ -591,4 +670,4 @@ export {
591
670
  runRpcOnce,
592
671
  runRpcBatch
593
672
  };
594
- //# sourceMappingURL=chunk-CJJSB6ZQ.js.map
673
+ //# sourceMappingURL=chunk-LSR4IAYN.js.map