npm - @tangle-network/agent-eval - Versions diffs - 0.20.9 → 0.20.10 - Mend

@tangle-network/agent-eval 0.20.9 → 0.20.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

package/CHANGELOG.md +302 -0
package/README.md +8 -4
package/dist/benchmarks/index.d.ts +1 -1
package/dist/benchmarks/index.js +1 -1
package/dist/{chunk-XDGJUIV2.js → chunk-42I2QC2L.js} +1 -1
package/dist/chunk-42I2QC2L.js.map +1 -0
package/dist/{chunk-CJJSB6ZQ.js → chunk-LSR4IAYN.js} +90 -11
package/dist/chunk-LSR4IAYN.js.map +1 -0
package/dist/cli.js +1 -1
package/dist/{index-CEWY1rmu.d.ts → index-1PZOtZFr.d.ts} +2 -2
package/dist/index.d.ts +2 -2
package/dist/index.js +63 -14
package/dist/index.js.map +1 -1
package/dist/openapi.json +50 -25
package/dist/{sink-fetch-C0B8ximv.d.ts → sink-fetch-B1Yg4Til.d.ts} +1 -1
package/dist/telemetry/file.d.ts +1 -1
package/dist/telemetry/index.d.ts +2 -2
package/dist/telemetry/index.js.map +1 -1
package/dist/wire/index.js +1 -1
package/docs/wire-protocol.md +2 -2
package/package.json +5 -4
package/dist/chunk-CJJSB6ZQ.js.map +0 -1
package/dist/chunk-XDGJUIV2.js.map +0 -1

package/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,302 @@
+# Changelog
+## 0.20.10 — hardening audit follow-up
+### Fixed
+- `hashRubric` now recursively sorts nested rubric fields before hashing, so
+  dimension, failure-mode, and win changes alter `rubricVersion`.
+- Wire judge handling now validates LLM output before returning it: finite
+  dimension scores, rationale, and known failure/win ids are enforced.
+- Control-runtime budgets reject invalid numeric config, and invalid action
+  costs are omitted from step telemetry instead of leaking `NaN`/`Infinity`.
+- Knowledge readiness now treats invalid `validUntil` timestamps as stale.
+- Trace-analyst regex search supports leading `(?i)` and stops scanning once
+  bounded match output is reached.
+- SWE-Bench Lite example wording now reflects the implemented external-grader
+  adapter, with quoted command parsing and timeout coverage.
+### Changed
+- Published package contents now include `CHANGELOG.md`.
+- Public docs now use GitHub URLs for repository-only examples and Python
+  client source.
+- Publish CI now checks npm, Python package, runtime fallback version, and tag
+  version agree before publishing.
+## 0.20.9 — release hygiene and runtime failure fixes
+### Fixed
+- Initial `runAgentControlLoop` observe/validate failures now report the
+  actual observe/validate error even when trace start/end emission also fails.
+- Knowledge readiness recommended actions now honor non-blocking gap
+  acquisition modes such as `ask_user`, `search_web`, `query_connector`, and
+  `inspect_repo`.
+- Npm builds now generate `dist/openapi.json`, and the package exports
+  `@tangle-network/agent-eval/openapi.json`.
+- Npm and Python client versions are locked at `0.20.9`.
+### Added
+- `CallbackResearcher`, a concrete callback-backed implementation of the
+  stable `Researcher` interface for scripts, tests, and small integrations.
+- Public `@tangle-network/agent-eval/benchmarks` subpath for the supported
+  routing benchmark surface.
+- Root MIT `LICENSE`.
+### Changed
+- Raw TypeScript examples are no longer included in the npm package; they remain
+  repository examples to read, copy, and adapt.
+## 0.20.2 — freshness-aware knowledge readiness
+### Added
+- `KnowledgeRequirement.validUntil` and `lastVerifiedAt` for explicit freshness
+  contracts.
+- `scoreKnowledgeReadiness({ now })` support for deterministic freshness gates.
+### Changed
+- Expired knowledge requirements now score as missing even when confidence and
+  evidence are otherwise high.
+## 0.20.0 — knowledge readiness contracts
+### Added
+- First-class knowledge-readiness contracts: `KnowledgeRequirement`,
+  `KnowledgeBundle`, `KnowledgeReadinessReport`, `UserQuestion`, and
+  `DataAcquisitionPlan`.
+- `scoreKnowledgeReadiness`, `blockingKnowledgeEval`,
+  `userQuestionsForKnowledgeGaps`, and `acquisitionPlansForKnowledgeGaps`.
+- Knowledge/data failure classes including `knowledge_readiness_blocked`,
+  `missing_credentials`, `bad_retrieval`, `insufficient_evidence`, and
+  `contradictory_evidence`.
+- `docs/knowledge-readiness.md`, plus documented knowledge-related ASI
+  responsible surfaces for multi-shot optimization.
+## 0.19.1 — release confidence gate
+### Added
+- `evaluateReleaseConfidence`, a conservative release scorecard over corpus
+  coverage, search/holdout run evidence, ASI diagnostics, overfit checks, and
+  cost/latency budgets.
+- `assertReleaseConfidence`, a throwing variant for CI/release scripts.
+- `releaseTraceEvidenceFromMultiShotTrials`, a helper that projects
+  `MultiShotTrialResult` rows into release trace evidence so single-shot and
+  variable multi-shot apps use the same release gate.
+## 0.19.0 — legacy optimizer removal
+### Removed
+- Removed the legacy pairwise prompt optimizer surface:
+  `PromptOptimizer`, `OptimizationLoop`, and their associated root-exported
+  types are gone. The blessed optimization path is now
+  `runMultiShotOptimization` for task trajectories and the steering-specific
+  optimizers for explicit steering tables.
+- Removed the old `PromptVariant` root export. Public callers should use
+  `MultiShotVariant` for multi-shot trajectory optimization or
+  `EvolvableVariant` for the lower-level prompt/code evolution core.
+### Changed
+- Documentation now points optimization users at `runMultiShotOptimization`
+  instead of the removed pairwise prompt optimizer.
+## 0.18.0 — multi-shot optimization
+### Added
+- `runMultiShotOptimization`, the canonical GEPA-style adapter for
+  variable-length agent trajectories. It wraps `runPromptEvolution` while
+  preserving full multi-shot traces, actionable side information, stable paired
+  seeds, score/cost objectives, and optional held-out promotion gating.
+- `trialTraceFromMultiShotTrial`, a bridge from multi-shot trial results into
+  reflective mutation prompts.
+- `ActionableSideInfo`, `MultiShotVariant`, `MultiShotTrace`, `MultiShotRun`,
+  `MultiShotScore`, `MultiShotTrialResult`, `MultiShotMutateAdapter`, and
+  related public types.
+- `docs/multi-shot-optimization.md` and
+  `examples/multi-shot-optimization/index.ts`.
+### Changed
+- The multi-shot result shape explicitly separates `searchBestVariant` from
+  `promotedVariant`. If a holdout gate rejects the search winner, the promoted
+  variant is the baseline.
+- `runMultiShotOptimization` validates release-critical configuration up front:
+  unique variant/scenario ids, positive integer run counts, population size,
+  disjoint search/holdout ids, and a gate baseline key matching the first seed
+  variant.
+## 0.17.2 — agent control runtime
+### Added
+- `runAgentControlLoop`, a generic `observe -> validate -> decide -> act`
+  runtime for agentic tasks with step, wall-clock, and recorded-cost budgets;
+  no-progress and repeated-action stop policies; structured runtime failures;
+  objective/subjective eval helpers; and `TraceStore` emission.
+- `runProposeReviewAsControlLoop`, a bridge preset that expresses
+  propose/verify/review as a specialization of the generic control runtime.
+- Feedback trajectory helpers for turning control-loop runs and user/judge
+  labels into reusable dataset scenarios, optimizer rows, and preference
+  memory.
+- `docs/control-runtime.md`, with integration patterns for tax, legal,
+  agent-builder, and film-agent products.
+### Changed
+- Control runtime trace sink and `onStep` callback failures are now recorded
+  as structured runtime errors without aborting an otherwise valid run.
+- `runProposeReviewAsControlLoop` accepts a caller-provided verifier failure
+  mapper for domain-specific failure classes.
+## 0.17.0 — surface cleanup + usage-guidance pitfalls
+This release tightens the public benchmark surface and lands internal usage guidance that the v0.15 dispatch couldn't write.
+### Moved
+- `src/benchmarks/gsm8k/` → `examples/benchmarks/gsm8k/`
+- `src/benchmarks/swebench-lite/` → `examples/benchmarks/swebench-lite/`
+These are reference implementations of `BenchmarkAdapter`, not core surface. Consumers read them, copy them, adapt them. The novel `routing` benchmark stays in `src/benchmarks/` because it's our own and broadly useful.
+`src/benchmarks/index.ts` now exports the shared types + the `routing` benchmark only. The previous `gsm8k` and `swebenchLite` namespace exports are gone — import directly from `examples/benchmarks/<name>/index.ts` (or copy the wrapper into your own project).
+### Added
+- `examples/benchmarks/README.md` documents how to use, copy, and extend the example wrappers.
+- Internal agent-eval usage guidance gains production-rigor and pitfalls sections covering the v0.16 primitives.
+### Migration
+If you imported `gsm8k` or `swebenchLite` from `@tangle-network/agent-eval/benchmarks`:
+```ts
+// before
+import { gsm8k, swebenchLite } from '@tangle-network/agent-eval/benchmarks'
+// after — copy the file from examples/benchmarks/<name>/index.ts into your project,
+// or import via relative path from the cloned repo.
+```
+The `routing` benchmark and the shared `BenchmarkAdapter` types are unchanged.
+## 0.16.0 — naming cleanup
+The v0.15 primitives were framed as "paper-grade" but most are production-rigor utilities any team needs. This release renames the three reporting helpers and drops the "paper" framing from the public API. Behavior unchanged.
+### Renamed
+- `paperTable` → `summaryTable`
+- `paretoFigure` → `paretoChart`
+- `gainDistributionFigure` → `gainHistogram`
+- `PaperTable` / `PaperTableOptions` / `PaperTableRow` types → `SummaryTable` / `SummaryTableOptions` / `SummaryTableRow`
+- File: `src/paper-report.ts` → `src/summary-report.ts`
+### Migration
+Drop-in: search-and-replace the three function names and the file path. Type names follow the same pattern. No behavior change.
+```ts
+// before
+import { paperTable, paretoFigure, gainDistributionFigure } from '@tangle-network/agent-eval'
+// after
+import { summaryTable, paretoChart, gainHistogram } from '@tangle-network/agent-eval'
+```
+## 0.15.0 — paper-grade primitives
+Substrate for the "Two Loops, Three Roles" paper on multi-level prompt
+optimization with held-out promotion gates.
+### Added
+- **`HeldOutGate`** (`src/promotion-gate.ts`) — first-class held-out
+  paired-delta promotion gate. Three checks: minimum productive runs,
+  positive lower bound on bootstrap CI of paired holdout median delta,
+  bounded overfit-gap relative to baseline. Decisions carry a
+  machine-readable `rejectionCode` (`few_runs` | `negative_delta` |
+  `overfit_gap`) plus an `evidence` block with every number the gate
+  read. Generalizes the inline pattern that lived in
+  `redteam/scripts/agent-eval-autoresearch.ts:138–171`.
+- **`RunRecord`** (`src/run-record.ts`) — paper-grade JSON-friendly run
+  schema with mandatory fields: `runId`, `experimentId`, `candidateId`,
+  `seed`, snapshot-versioned `model`, `promptHash`, `configHash`,
+  `commitSha`, `wallMs`, `costUsd`, `tokenUsage`, `outcome`, `splitTag`.
+  Runtime validator (`validateRunRecord`, `isRunRecord`,
+  `parseRunRecordSafe`, `roundTripRunRecord`) throws on missing fields
+  and on bare model aliases without snapshot suffix.
+- **`Researcher`** (`src/researcher.ts`) — stable hook for an
+  autonomous-research agent: `inspectFailures` → `proposeChange` →
+  `applyChange` → `evaluateChange`. `NoopResearcher` is the
+  fail-loud placeholder. Implementations live downstream.
+- **Reference benchmarks** (`src/benchmarks/`) — three adapters that
+  share the `BenchmarkAdapter<TItem, TPayload>` shape:
+  - `gsm8k`: HF-mirror loader (JSONL via `AGENT_EVAL_GSM8K_PATH`),
+    exact-match grading via `parseGsm8kAnswer`.
+  - `swebench-lite`: 30-instance subset stub. Loader reads
+    `AGENT_EVAL_SWEBENCH_PATH`; grader shells out to
+    `AGENT_EVAL_SWEBENCH_GRADER_CMD`. Both fail loud when unset.
+  - `routing`: synthetic 16-task router benchmark, ships in the
+    package, dependency-free. Format documented in
+    `src/benchmarks/routing/README.md`.
+  - `deterministicSplit(itemId, seed?)`: stable 60/20/20 split via
+    FNV-1a hash. Default seed `agent-eval-v1`.
+- **`summaryTable`, `paretoChart`, `gainHistogram`**
+  (`src/summary-report.ts`) — Table 1 + Pareto + gain-distribution specs.
+  Returns data structures (markdown table, point lists, histogram bins);
+  caller picks the plotting library.
+- **`runCanaries`** (`src/canary.ts`) — three liveness canaries:
+  silent judge fallback (consecutive constant-confidence streak),
+  judge calibration drift (KS test on confidence distribution), eval-set
+  distribution shift (chi-square on category bucket counts).
+- **`pairedBootstrap`, `pairedWilcoxon`, `bhAdjust`**
+  (`src/paired-stats.ts`) — paper-style aliases + the missing paired
+  bootstrap CI primitive. Deterministic with optional seed.
+### Notes
+- No breaking changes. Every existing module is untouched; new types
+  are additive.
+- All new public symbols carry JSDoc.
+- 87 new tests across 7 new test files. 571 total tests pass.
+- See the package docs for usage directives and pitfalls.
+## 0.11.0
+intent-match + flow-layer + deploy-gate + concept complexity
+weighting.
+## 0.10.0
+`LayerResult.diagnostics` + `buildReviewerPrompt` +
+`createDefaultReviewer` + `mergeLayerResults` options.
+## 0.9.0
+`CommandRunner` contract + `multiToolchainLayer` + `Finding.detail`.
+## 0.8.x
+`probeLlm` + `keyword-coverage-judge`. Honestly-absent primitives
+backfilled — `llm-client`, multi-layer verifier, semantic concept judge,
+extractor utilities.
+## 0.7.x
+Extracted muffled-gate scanner; `CostTracker.recordVerdict`. Footgun
+fix: `cwd` belongs in `HarnessConfig`, not the driver constructor.
+## 0.6.x
+Tier 1 (meta-eval correlation, PRM, bisector), Tier 2 (counterfactual,
+cross-trace diff, pre-registration), Tier 3 (self-play, causal
+attribution, active learning, RM export), governance templates.

package/README.md CHANGED Viewed

@@ -98,16 +98,20 @@ pip install -e .
 | `summaryTable`, `paretoChart`, `gainHistogram` | Report-ready structured outputs. |
 | `KnowledgeRequirement`, `KnowledgeBundle` | Shared contracts for knowledge readiness. |
+`NoopResearcher` is a fail-loud sentinel for wiring tests. Production systems
+should implement `Researcher` directly or use `CallbackResearcher`.
 ## Examples
-Runnable examples live in the repository's [`examples/`](./examples)
+Runnable examples live in the repository's
+[`examples/`](https://github.com/tangle-network/agent-eval/tree/main/examples)
 directory. They are not part of the published npm package.
-- [`examples/same-sandbox-harness`](./examples/same-sandbox-harness) - run
+- [`examples/same-sandbox-harness`](https://github.com/tangle-network/agent-eval/tree/main/examples/same-sandbox-harness) - run
   multiple eval passes against the same workspace.
-- [`examples/multi-shot-optimization`](./examples/multi-shot-optimization) -
+- [`examples/multi-shot-optimization`](https://github.com/tangle-network/agent-eval/tree/main/examples/multi-shot-optimization) -
   optimize full agent trajectories with held-out promotion.
-- [`examples/benchmarks`](./examples/benchmarks) - benchmark adapter shape and
+- [`examples/benchmarks`](https://github.com/tangle-network/agent-eval/tree/main/examples/benchmarks) - benchmark adapter shape and
   reference benchmark wrappers.
 The examples are intentionally kept outside the README so they can be expanded,

package/dist/benchmarks/index.d.ts CHANGED Viewed

	@@ -1 +1 @@
1	- export { B as BENCHMARK_SPLIT_SEED, b as BenchmarkAdapter, c as BenchmarkDatasetItem, d as BenchmarkEvaluation, i as deterministicSplit, l as routing } from '../index-CEWY1rmu.js';
1	+ export { B as BENCHMARK_SPLIT_SEED, b as BenchmarkAdapter, c as BenchmarkDatasetItem, d as BenchmarkEvaluation, i as deterministicSplit, l as routing } from '../index-1PZOtZFr.js';

package/dist/benchmarks/index.js CHANGED Viewed

@@ -2,7 +2,7 @@ import {
   BENCHMARK_SPLIT_SEED,
   deterministicSplit,
   routing_exports
-} from "../chunk-XDGJUIV2.js";
+} from "../chunk-42I2QC2L.js";
 import "../chunk-PZ5AY32C.js";
 export {
   BENCHMARK_SPLIT_SEED,

package/dist/{chunk-XDGJUIV2.js → chunk-42I2QC2L.js} RENAMED Viewed

@@ -216,4 +216,4 @@ export {
   routing_exports,
   benchmarks_exports
 };
-//# sourceMappingURL=chunk-XDGJUIV2.js.map
+//# sourceMappingURL=chunk-42I2QC2L.js.map

package/dist/chunk-42I2QC2L.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"sources":["../src/benchmarks/index.ts","../src/benchmarks/types.ts","../src/benchmarks/routing/index.ts","../src/benchmarks/routing/dataset.ts"],"sourcesContent":["/**\n * Reference benchmark wrappers — entry point.\n *\n * Core surface (exported here):\n * - The `BenchmarkAdapter` contract.\n * - `deterministicSplit` + `BENCHMARK_SPLIT_SEED` for split assignment.\n * - `routing` — synthetic 16-task router benchmark. The only novel\n * benchmark we built; ships in the package.\n *\n * Example wrappers (under `examples/benchmarks/`, NOT in the bundle):\n * - `gsm8k` — exact-match math reasoning (HF mirror, dataset\n * not bundled).\n * - `swebench-lite` — 30-instance SWE-Bench subset via an external\n * grader command.\n *\n * The example wrappers are reference implementations of `BenchmarkAdapter`.\n * Read them, copy them, adapt them. They're intentionally not in the main\n * entry — every team will configure them differently.\n */\n\nexport type {\n BenchmarkAdapter,\n BenchmarkDatasetItem,\n BenchmarkEvaluation,\n} from './types'\nexport { deterministicSplit, BENCHMARK_SPLIT_SEED } from './types'\n\nexport * as routing from './routing/index'\n","/**\n * Shared types for the reference benchmark wrappers under\n * `src/benchmarks/`. Each wrapper exports the three functions in\n * `BenchmarkAdapter` plus its own typed `DatasetItem` shape.\n */\n\nimport type { RunSplitTag } from '../run-record'\n\nexport interface BenchmarkDatasetItem<TPayload = unknown> {\n /** Stable dataset-local item id (used for split assignment + paper\n * references). Unique within a benchmark. */\n id: string\n /** Free-form payload. Each benchmark defines its own shape. */\n payload: TPayload\n}\n\nexport interface BenchmarkEvaluation {\n /** [0, 1] score for the response on this item. Exact-match\n * benchmarks use 0/1; partial-credit benchmarks may return\n * fractional values. */\n score: number\n /** Optional bag of raw scoring signals — e.g. 
parsed numeric\n * answer, regex match, judge sub-scores. */\n raw: Record<string, unknown>\n}\n\n/** Common signature implemented by every adapter under `src/benchmarks/*`. */\n// `TPayload` is the per-item payload type; `_TItem` is preserved for\n// downstream type-narrowing extensions (a richer `BenchmarkDatasetItem`\n// subclass that adds e.g. provenance metadata) but is intentionally\n// unused here. `noUnusedLocals` requires the leading underscore.\nexport interface BenchmarkAdapter<_TItem = unknown, TPayload = unknown> {\n /** Load the dataset for the given split. May hit the network on\n * first call but should be cache-friendly. Adapters that don't\n * ship the dataset itself MUST throw a clearly-marked error\n * pointing the caller at the loader script. */\n loadDataset(split: RunSplitTag): Promise<BenchmarkDatasetItem<TPayload>[]>\n /** Score a single response. Pure with respect to the inputs. */\n evaluate(item: BenchmarkDatasetItem<TPayload>, response: string): Promise<BenchmarkEvaluation>\n /** Deterministic split assignment via item id hashing. The\n * fraction of items in each split is implementation-defined but\n * MUST be stable across processes and platforms. */\n assignSplit(itemId: string): RunSplitTag\n}\n\n// ── Deterministic split assignment ───────────────────────────────────\n\n/**\n * 32-bit FNV-1a hash. Stable, allocation-free, deterministic across\n * runtimes. We use it to assign items to splits rather than depending\n * on a polyfilled crypto.subtle path.\n */\nfunction fnv1a32(input: string): number {\n let h = 0x811c9dc5\n for (let i = 0; i < input.length; i++) {\n h ^= input.charCodeAt(i) & 0xff\n h = (h + ((h << 1) + (h << 4) + (h << 7) + (h << 8) + (h << 24))) >>> 0\n }\n return h >>> 0\n}\n\n/** Split-assignment seed shared across all benchmarks. Bumping this\n * value reshuffles every split — do NOT do that lightly. 
*/\nexport const BENCHMARK_SPLIT_SEED = 'agent-eval-v1'\n\n/**\n * Assign an item id to one of `'search' | 'dev' | 'holdout'` using a\n * stable 32-bit hash of `${seed}::${id}`. Default proportions:\n *\n * search: 60% (optimization-readable)\n * dev: 20% (held-out for tuning, leak-on-purpose during dev)\n * holdout:20% (paper-grade held-out, gated reads)\n */\nexport function deterministicSplit(\n itemId: string,\n seed: string = BENCHMARK_SPLIT_SEED,\n): RunSplitTag {\n const h = fnv1a32(`${seed}::${itemId}`)\n const pos = h / 0x100000000\n if (pos < 0.6) return 'search'\n if (pos < 0.8) return 'dev'\n return 'holdout'\n}\n","/**\n * Routing benchmark — synthetic, dependency-free, ships in the\n * package. 16 cross-category items in `dataset.ts`. See\n * `routing/README.md` for the format.\n *\n * `evaluate` does case-insensitive exact match against the canonical\n * route plus declared synonyms. The first valid route token in the\n * response wins; everything else is ignored. Wrong answers also\n * report whether they hit a hard negative — useful when triaging\n * \"always picks the popular route\" failure modes.\n */\n\nimport type {\n BenchmarkAdapter,\n BenchmarkDatasetItem,\n BenchmarkEvaluation,\n} from '../types'\nimport { deterministicSplit } from '../types'\nimport type { RunSplitTag } from '../../run-record'\nimport { ROUTING_DATASET, type RoutingItem } from './dataset'\n\nexport type { RoutingItem }\nexport type RoutingPayload = RoutingItem\nexport type RoutingDatasetItem = BenchmarkDatasetItem<RoutingPayload>\n\nclass RoutingAdapter\n implements BenchmarkAdapter<RoutingDatasetItem, RoutingPayload>\n{\n async loadDataset(split: RunSplitTag): Promise<RoutingDatasetItem[]> {\n return ROUTING_DATASET\n .map((item) => ({ id: item.id, payload: item }))\n .filter((it) => assignSplitImpl(it.id) === split)\n }\n\n async evaluate(\n item: RoutingDatasetItem,\n response: string,\n ): Promise<BenchmarkEvaluation> {\n const tokens = extractRouteTokens(response)\n 
const correct = new Set<string>([item.payload.route, ...item.payload.synonyms].map((s) => s.toLowerCase()))\n const hardNeg = new Set<string>(item.payload.hardNegatives.map((s) => s.toLowerCase()))\n const firstMatch = tokens.find((t) => correct.has(t.toLowerCase())) ?? null\n const firstHardNeg = tokens.find((t) => hardNeg.has(t.toLowerCase())) ?? null\n const score = firstMatch ? 1 : 0\n return {\n score,\n raw: {\n firstToken: tokens[0] ?? null,\n matchedRoute: firstMatch,\n hitHardNegative: Boolean(firstHardNeg),\n hardNegativeRoute: firstHardNeg,\n category: item.payload.category,\n },\n }\n }\n\n assignSplit(itemId: string): RunSplitTag {\n return assignSplitImpl(itemId)\n }\n}\n\nfunction assignSplitImpl(itemId: string): RunSplitTag {\n return deterministicSplit(`routing::${itemId}`)\n}\n\n/**\n * Pull route-shaped tokens out of a model response. Routes look like\n * `category.action` (`fs.write`, `chat.reply`). Bare alphanumerics\n * are not routes, but `category.action` patterns are robust to most\n * model wrappers (JSON output, prose explanations, code fences).\n */\nexport function extractRouteTokens(response: string): string[] {\n const matches = response.match(/[a-z][a-z0-9_]*\\.[a-z][a-z0-9_]*/gi)\n return matches ?? []\n}\n\nconst adapter = new RoutingAdapter()\n\nexport const loadDataset = adapter.loadDataset.bind(adapter)\nexport const evaluate = adapter.evaluate.bind(adapter)\nexport const assignSplit = adapter.assignSplit.bind(adapter)\nexport { RoutingAdapter, ROUTING_DATASET }\n","/**\n * Synthetic routing dataset. 16 tasks across 4 categories. 
Used as a\n * deterministic, dependency-free benchmark for any router that maps a\n * natural-language request to one of a fixed set of route labels.\n *\n * Format (see `routing/README.md` for prose):\n *\n * {\n * id: stable per-task ID (matches across processes).\n * category: one of the four route labels.\n * prompt: the user-facing request the router must classify.\n * route: the ground-truth route the router should pick.\n * synonyms: other strings that count as a correct answer.\n * hardNegatives:close-but-wrong route labels — used to detect the\n * \"always picks the popular route\" failure mode.\n * }\n *\n * The four categories are intentionally cross-domain (file ops,\n * math, search, conversation) so a router that collapses to one\n * category is easy to spot.\n */\n\nexport interface RoutingItem {\n id: string\n category: 'file' | 'math' | 'search' | 'chat'\n prompt: string\n /** Canonical correct route label. */\n route: string\n /** Alternate route labels that also count as correct. */\n synonyms: string[]\n /** Wrong-but-tempting route labels (for analysis, not grading). 
*/\n hardNegatives: string[]\n}\n\nexport const ROUTING_DATASET: RoutingItem[] = [\n {\n id: 'file_001',\n category: 'file',\n prompt: 'Save the meeting notes to /tmp/notes-2025-04.md as markdown.',\n route: 'fs.write',\n synonyms: ['filesystem.write', 'write_file'],\n hardNegatives: ['fs.read', 'chat.reply'],\n },\n {\n id: 'file_002',\n category: 'file',\n prompt: 'Read the contents of /etc/hosts and summarize the entries.',\n route: 'fs.read',\n synonyms: ['filesystem.read', 'read_file'],\n hardNegatives: ['fs.write', 'search.web'],\n },\n {\n id: 'file_003',\n category: 'file',\n prompt: 'List every Python file under src/ recursively.',\n route: 'fs.list',\n synonyms: ['filesystem.list', 'list_files'],\n hardNegatives: ['fs.read', 'search.code'],\n },\n {\n id: 'file_004',\n category: 'file',\n prompt: 'Delete the cached build at .turbo/cache.',\n route: 'fs.delete',\n synonyms: ['filesystem.delete', 'remove_file'],\n hardNegatives: ['fs.write', 'fs.list'],\n },\n {\n id: 'math_001',\n category: 'math',\n prompt: 'What is the integral of 3x^2 + 2x from 0 to 5?',\n route: 'math.integral',\n synonyms: ['calculator.integral', 'math.solve'],\n hardNegatives: ['math.derivative', 'chat.reply'],\n },\n {\n id: 'math_002',\n category: 'math',\n prompt: 'Compute the derivative of sin(x) * cos(x).',\n route: 'math.derivative',\n synonyms: ['calculator.derivative', 'math.solve'],\n hardNegatives: ['math.integral', 'math.algebra'],\n },\n {\n id: 'math_003',\n category: 'math',\n prompt: 'Solve 2x + 7 = 19 for x.',\n route: 'math.algebra',\n synonyms: ['calculator.algebra', 'math.solve'],\n hardNegatives: ['math.derivative', 'math.integral'],\n },\n {\n id: 'math_004',\n category: 'math',\n prompt: 'What is the prime factorization of 360?',\n route: 'math.numbertheory',\n synonyms: ['calculator.factor', 'math.solve'],\n hardNegatives: ['math.algebra', 'search.web'],\n },\n {\n id: 'search_001',\n category: 'search',\n prompt: 'Find recent papers on agent prompt 
optimization with held-out promotion gates.',\n route: 'search.web',\n synonyms: ['web.search', 'search.papers'],\n hardNegatives: ['search.code', 'chat.reply'],\n },\n {\n id: 'search_002',\n category: 'search',\n prompt: 'Search the codebase for every call site of `runProposeReview`.',\n route: 'search.code',\n synonyms: ['code.search', 'grep'],\n hardNegatives: ['search.web', 'fs.read'],\n },\n {\n id: 'search_003',\n category: 'search',\n prompt: 'What is the latest release of the Tangle network on GitHub?',\n route: 'search.web',\n synonyms: ['web.search', 'github.releases'],\n hardNegatives: ['search.code', 'chat.reply'],\n },\n {\n id: 'search_004',\n category: 'search',\n prompt: 'Find all TODO comments in the agent-eval src tree.',\n route: 'search.code',\n synonyms: ['code.search', 'grep'],\n hardNegatives: ['search.web', 'fs.list'],\n },\n {\n id: 'chat_001',\n category: 'chat',\n prompt: 'Hi there, how are you doing today?',\n route: 'chat.reply',\n synonyms: ['conversation.reply'],\n hardNegatives: ['search.web', 'fs.read'],\n },\n {\n id: 'chat_002',\n category: 'chat',\n prompt: 'Please explain the difference between an LLM and a foundation model.',\n route: 'chat.reply',\n synonyms: ['conversation.reply', 'qa.answer'],\n hardNegatives: ['search.web', 'math.algebra'],\n },\n {\n id: 'chat_003',\n category: 'chat',\n prompt: 'Tell me a short joke about distributed systems.',\n route: 'chat.reply',\n synonyms: ['conversation.reply'],\n hardNegatives: ['search.web', 'fs.read'],\n },\n {\n id: 'chat_004',\n category: 'chat',\n prompt: 'Acknowledge my last message with a thumbs up.',\n route: 'chat.reply',\n synonyms: ['conversation.reply', 'react'],\n hardNegatives: ['fs.write', 'search.web'],\n 
},\n]\n"],"mappings":";;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACoDA,SAAS,QAAQ,OAAuB;AACtC,MAAI,IAAI;AACR,WAAS,IAAI,GAAG,IAAI,MAAM,QAAQ,KAAK;AACrC,SAAK,MAAM,WAAW,CAAC,IAAI;AAC3B,QAAK,MAAM,KAAK,MAAM,KAAK,MAAM,KAAK,MAAM,KAAK,MAAM,KAAK,SAAU;AAAA,EACxE;AACA,SAAO,MAAM;AACf;AAIO,IAAM,uBAAuB;AAU7B,SAAS,mBACd,QACA,OAAe,sBACF;AACb,QAAM,IAAI,QAAQ,GAAG,IAAI,KAAK,MAAM,EAAE;AACtC,QAAM,MAAM,IAAI;AAChB,MAAI,MAAM,IAAK,QAAO;AACtB,MAAI,MAAM,IAAK,QAAO;AACtB,SAAO;AACT;;;AClFA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACkCO,IAAM,kBAAiC;AAAA,EAC5C;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,oBAAoB,YAAY;AAAA,IAC3C,eAAe,CAAC,WAAW,YAAY;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,mBAAmB,WAAW;AAAA,IACzC,eAAe,CAAC,YAAY,YAAY;AAAA,EAC1C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,mBAAmB,YAAY;AAAA,IAC1C,eAAe,CAAC,WAAW,aAAa;AAAA,EAC1C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,qBAAqB,aAAa;AAAA,IAC7C,eAAe,CAAC,YAAY,SAAS;AAAA,EACvC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,uBAAuB,YAAY;AAAA,IAC9C,eAAe,CAAC,mBAAmB,YAAY;AAAA,EACjD;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,yBAAyB,YAAY;AAAA,IAChD,eAAe,CAAC,iBAAiB,cAAc;AAAA,EACjD;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,sBAAsB,YAAY;AAAA,IAC7C,eAAe,CAAC,mBAAmB,eAAe;AAAA,EACpD;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,qBAAqB,YAAY;AAAA,IAC5C,eAAe,CAAC,gBAAgB,YAAY;AAAA,EAC9C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,cAAc,eAAe;AAAA,IACxC,eAAe,CAAC,eAAe,YAAY;AAAA,EAC7C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,eAAe,MAAM;AAAA,IAChC,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAA
U;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,cAAc,iBAAiB;AAAA,IAC1C,eAAe,CAAC,eAAe,YAAY;AAAA,EAC7C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,eAAe,MAAM;AAAA,IAChC,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,oBAAoB;AAAA,IAC/B,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,sBAAsB,WAAW;AAAA,IAC5C,eAAe,CAAC,cAAc,cAAc;AAAA,EAC9C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,oBAAoB;AAAA,IAC/B,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,sBAAsB,OAAO;AAAA,IACxC,eAAe,CAAC,YAAY,YAAY;AAAA,EAC1C;AACF;;;AD1IA,IAAM,iBAAN,MAEA;AAAA,EACE,MAAM,YAAY,OAAmD;AACnE,WAAO,gBACJ,IAAI,CAAC,UAAU,EAAE,IAAI,KAAK,IAAI,SAAS,KAAK,EAAE,EAC9C,OAAO,CAAC,OAAO,gBAAgB,GAAG,EAAE,MAAM,KAAK;AAAA,EACpD;AAAA,EAEA,MAAM,SACJ,MACA,UAC8B;AAC9B,UAAM,SAAS,mBAAmB,QAAQ;AAC1C,UAAM,UAAU,IAAI,IAAY,CAAC,KAAK,QAAQ,OAAO,GAAG,KAAK,QAAQ,QAAQ,EAAE,IAAI,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;AAC1G,UAAM,UAAU,IAAI,IAAY,KAAK,QAAQ,cAAc,IAAI,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;AACtF,UAAM,aAAa,OAAO,KAAK,CAAC,MAAM,QAAQ,IAAI,EAAE,YAAY,CAAC,CAAC,KAAK;AACvE,UAAM,eAAe,OAAO,KAAK,CAAC,MAAM,QAAQ,IAAI,EAAE,YAAY,CAAC,CAAC,KAAK;AACzE,UAAM,QAAQ,aAAa,IAAI;AAC/B,WAAO;AAAA,MACL;AAAA,MACA,KAAK;AAAA,QACH,YAAY,OAAO,CAAC,KAAK;AAAA,QACzB,cAAc;AAAA,QACd,iBAAiB,QAAQ,YAAY;AAAA,QACrC,mBAAmB;AAAA,QACnB,UAAU,KAAK,QAAQ;AAAA,MACzB;AAAA,IACF;AAAA,EACF;AAAA,EAEA,YAAY,QAA6B;AACvC,WAAO,gBAAgB,MAAM;AAAA,EAC/B;AACF;AAEA,SAAS,gBAAgB,QAA6B;AACpD,SAAO,mBAAmB,YAAY,MAAM,EAAE;AAChD;AAQO,SAAS,mBAAmB,UAA4B;AAC7D,QAAM,UAAU,SAAS,MAAM,oCAAoC;AACnE,SAAO,WAAW,CAAC;AACrB;AAEA,IAAM,UAAU,IAAI,eAAe;AAE5B,IAAM,cAAc,QAAQ,YAAY,KAAK,OAAO;AACpD,IAAM,WAAW,QAAQ,SAAS,KAAK,OAAO;AAC9C,IAAM,cAAc,QAAQ,YAAY,KAAK,OAAO;","names":[]}

package/dist/{chunk-CJJSB6ZQ.js → chunk-LSR4IAYN.js} RENAMED Viewed

@@ -83,13 +83,21 @@ var ErrorResponseSchema = z.object({
 }).openapi("ErrorResponse");
 var WIRE_VERSION = "1.0.0";
 function hashRubric(rubric) {
-  const stable = JSON.stringify(rubric, Object.keys(rubric).sort());
+  const stable = stableStringify(rubric);
   let h = 5381;
   for (let i = 0; i < stable.length; i++) {
     h = h * 33 ^ stable.charCodeAt(i);
   }
   return `${rubric.name}@${(h >>> 0).toString(16).padStart(8, "0")}`;
 }
+function stableStringify(value) {
+  if (Array.isArray(value)) return `[${value.map((item) => stableStringify(item)).join(",")}]`;
+  if (value && typeof value === "object") {
+    const entries = Object.entries(value).sort(([a], [b]) => a.localeCompare(b)).map(([key, item]) => `${JSON.stringify(key)}:${stableStringify(item)}`);
+    return `{${entries.join(",")}}`;
+  }
+  return JSON.stringify(value);
+}
 // src/wire/rubrics.ts
 var ANTI_SLOP = {
@@ -225,6 +233,47 @@ function judgeOutputSchema(rubric) {
     }
   };
 }
+function validateJudgeOutput(value, rubric) {
+  if (!value || typeof value !== "object") {
+    throw new WireError("judge_error", "Judge returned malformed output.", 500, value);
+  }
+  const raw = value;
+  const rawDimensions = raw.dimensions;
+  if (!rawDimensions || typeof rawDimensions !== "object" || Array.isArray(rawDimensions)) {
+    throw new WireError("judge_error", "Judge returned malformed dimensions.", 500, value);
+  }
+  const dimensions = {};
+  const dimensionRecord = rawDimensions;
+  for (const dim of rubric.dimensions) {
+    const score = dimensionRecord[dim.id];
+    if (typeof score !== "number" || !Number.isFinite(score) || score < dim.min || score > dim.max) {
+      throw new WireError("judge_error", `Judge returned invalid score for dimension "${dim.id}".`, 500, value);
+    }
+    dimensions[dim.id] = score;
+  }
+  const allowedFailures = new Set(rubric.failureModes.map((mode) => mode.id));
+  const allowedWins = new Set(rubric.wins.map((win) => win.id));
+  const failureModes = validateIdArray(raw.failureModes, allowedFailures, "failureModes", value);
+  const wins = validateIdArray(raw.wins, allowedWins, "wins", value);
+  if (typeof raw.rationale !== "string" || raw.rationale.trim().length === 0) {
+    throw new WireError("judge_error", "Judge returned missing rationale.", 500, value);
+  }
+  return { dimensions, failureModes, wins, rationale: raw.rationale };
+}
+function validateIdArray(raw, allowed, field, original) {
+  if (raw === void 0) return [];
+  if (!Array.isArray(raw)) {
+    throw new WireError("judge_error", `Judge returned non-array ${field}.`, 500, original);
+  }
+  const out = [];
+  for (const item of raw) {
+    if (typeof item !== "string" || !allowed.has(item)) {
+      throw new WireError("judge_error", `Judge returned unknown ${field} id "${String(item)}".`, 500, original);
+    }
+    out.push(item);
+  }
+  return out;
+}
 function compositeScore(dimensions, rubric) {
   let weighted = 0;
   let totalWeight = 0;
@@ -273,17 +322,15 @@ async function handleJudge(req) {
     temperature: 0,
     timeoutMs: 6e4
   });
-  if (!value || typeof value !== "object" || !value.dimensions) {
-    throw new WireError("judge_error", "Judge returned malformed output.", 500, value);
-  }
-  const composite = compositeScore(value.dimensions, rubric);
+  const output = validateJudgeOutput(value, rubric);
+  const composite = compositeScore(output.dimensions, rubric);
   const durationMs = Date.now() - startedAt;
   return {
     composite,
-    dimensions: value.dimensions,
-    failureModes: value.failureModes ?? [],
-    wins: value.wins ?? [],
-    rationale: value.rationale,
+    dimensions: output.dimensions,
+    failureModes: output.failureModes ?? [],
+    wins: output.wins ?? [],
+    rationale: output.rationale,
     rubricVersion: hashRubric(rubric),
     model: result.model,
     durationMs
@@ -400,7 +447,7 @@ function buildOpenApi(packageVersion) {
     }
   });
   const generator = new OpenApiGeneratorV31(registry.definitions);
-  return generator.generateDocument({
+  const doc = generator.generateDocument({
     openapi: "3.1.0",
     info: {
       title: "@tangle-network/agent-eval \u2014 wire protocol",
@@ -413,6 +460,38 @@ Wire-protocol version: ${WIRE_VERSION}. Bumps on breaking changes to request/res
     },
     servers: [{ url: "http://localhost:5005", description: "Local agent-eval serve" }]
   });
+  const rubricRef = { $ref: "#/components/schemas/Rubric" };
+  const commonJudgeFields = {
+    content: { type: "string", minLength: 1 },
+    context: { type: "object", additionalProperties: true },
+    model: { type: "string" }
+  };
+  doc.components ??= {};
+  doc.components.schemas ??= {};
+  doc.components.schemas.JudgeRequest = {
+    oneOf: [
+      {
+        type: "object",
+        additionalProperties: false,
+        required: ["rubricName", "content"],
+        properties: {
+          rubricName: { type: "string", minLength: 1 },
+          ...commonJudgeFields
+        }
+      },
+      {
+        type: "object",
+        additionalProperties: false,
+        required: ["rubric", "content"],
+        properties: {
+          rubric: rubricRef,
+          ...commonJudgeFields
+        }
+      }
+    ],
+    description: "Judge request. Provide exactly one of rubricName or rubric."
+  };
+  return doc;
 }
 // src/wire/server.ts
@@ -591,4 +670,4 @@ export {
   runRpcOnce,
   runRpcBatch
 };
-//# sourceMappingURL=chunk-CJJSB6ZQ.js.map
+//# sourceMappingURL=chunk-LSR4IAYN.js.map