@tangle-network/agent-eval 0.20.8 → 0.20.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tangle-network/agent-eval",
3
- "version": "0.20.8",
3
+ "version": "0.20.10",
4
4
  "description": "Trace-first evaluation infrastructure for agent systems: traces, harnesses, verifier pipelines, judges, datasets, gates, optimization, and reporting.",
5
5
  "homepage": "https://github.com/tangle-network/agent-eval#readme",
6
6
  "repository": {
@@ -33,6 +33,14 @@
33
33
  "types": "./dist/wire/index.d.ts",
34
34
  "import": "./dist/wire/index.js",
35
35
  "default": "./dist/wire/index.js"
36
+ },
37
+ "./benchmarks": {
38
+ "types": "./dist/benchmarks/index.d.ts",
39
+ "import": "./dist/benchmarks/index.js",
40
+ "default": "./dist/benchmarks/index.js"
41
+ },
42
+ "./openapi.json": {
43
+ "default": "./dist/openapi.json"
36
44
  }
37
45
  },
38
46
  "bin": {
@@ -41,15 +49,15 @@
41
49
  "files": [
42
50
  "dist",
43
51
  "docs",
44
- "examples"
52
+ "CHANGELOG.md"
45
53
  ],
46
54
  "publishConfig": {
47
55
  "access": "public"
48
56
  },
49
57
  "scripts": {
50
- "build": "tsup",
58
+ "build": "tsup && pnpm openapi",
51
59
  "dev": "tsup --watch",
52
- "prepare": "tsup",
60
+ "prepare": "pnpm build",
53
61
  "test": "vitest run",
54
62
  "test:watch": "vitest",
55
63
  "typecheck": "tsc --noEmit",
@@ -59,7 +67,7 @@
59
67
  "@asteasolutions/zod-to-openapi": "^8.5.0",
60
68
  "@ax-llm/ax": "^19.0.25",
61
69
  "@hono/node-server": "^2.0.0",
62
- "@tangle-network/tcloud": "^0.2.0",
70
+ "@tangle-network/tcloud": "^0.4.6",
63
71
  "hono": "^4.12.15",
64
72
  "zod": "^4.3.6"
65
73
  },
@@ -1 +0,0 @@
1
- {"version":3,"sources":["../src/wire/schemas.ts","../src/wire/rubrics.ts","../src/wire/handlers.ts","../src/wire/openapi.ts","../src/wire/server.ts","../src/wire/rpc.ts"],"sourcesContent":["/**\n * Wire-protocol schemas.\n *\n * These Zod schemas are the contract between the agent-eval runtime and\n * any non-TypeScript client (Python, Rust, Go, …). They get rendered to\n * OpenAPI by `wire/openapi.ts` and code-generators consume that spec to\n * produce typed clients in other languages.\n *\n * Rule: if it's not in this file, it isn't on the wire. Keep names and\n * shapes self-explanatory — every field has a `.describe()` so the\n * generated docs are useful without reading the source.\n */\nimport { extendZodWithOpenApi } from '@asteasolutions/zod-to-openapi'\nimport { z } from 'zod'\n\nextendZodWithOpenApi(z)\n\n// ── Building blocks ─────────────────────────────────────────────────\n\nexport const RubricDimensionSchema = z\n .object({\n id: z\n .string()\n .min(1)\n .describe('Short stable id like \"buyer_quality\" — used as the key in scoring output.'),\n description: z\n .string()\n .min(1)\n .describe('One-line plain-English meaning. Read by humans reviewing low scores.'),\n weight: z\n .number()\n .min(0)\n .default(1)\n .describe('Relative weight in the composite score. Default 1; 0 disables.'),\n min: z.number().default(0).describe('Lower bound of valid score for this dimension.'),\n max: z.number().default(1).describe('Upper bound of valid score for this dimension.'),\n })\n .openapi('RubricDimension')\n\nexport const FailureModeSchema = z\n .object({\n id: z.string().min(1).describe('Short stable id like \"ai-cadence\" — used in detection lists.'),\n description: z.string().min(1).describe('Plain-English description of the failure pattern.'),\n })\n .openapi('FailureMode')\n\n// ── Rubric ──────────────────────────────────────────────────────────\n\nexport const RubricSchema = z\n .object({\n name: z\n .string()\n .min(1)\n .describe('Stable name like \"anti-slop\" — used by clients to invoke this rubric.'),\n description: z\n .string()\n .min(1)\n .describe('What this rubric measures. Shown in /v1/rubrics listing.'),\n systemPrompt: z\n .string()\n .min(1)\n .describe(\n 'Instructs the judging LLM. Should explain the persona (e.g. \"senior engineer reviewing voice\"), what to score on, and what to return.',\n ),\n dimensions: z\n .array(RubricDimensionSchema)\n .min(1)\n .describe('Scoring axes. The composite score is a weighted sum of these.'),\n failureModes: z\n .array(FailureModeSchema)\n .default([])\n .describe('Patterns to detect; each detected mode appears in the result.failureModes list.'),\n wins: z\n .array(FailureModeSchema)\n .default([])\n .describe('Positive patterns; each detected one appears in the result.wins list.'),\n })\n .openapi('Rubric')\n\n// ── Judge call ──────────────────────────────────────────────────────\n\nexport const JudgeRequestSchema = z\n .object({\n rubricName: z\n .string()\n .optional()\n .describe('Use a built-in rubric by name. Mutually exclusive with `rubric`.'),\n rubric: RubricSchema.optional().describe(\n 'Inline rubric definition. Mutually exclusive with `rubricName`.',\n ),\n content: z\n .string()\n .min(1)\n .describe('The text being judged — a tweet, a blog post, a code snippet, anything stringly.'),\n context: z\n .record(z.string(), z.unknown())\n .optional()\n .describe(\n 'Free-form metadata for the rubric to use — analytics, source URL, author, etc. Surfaced to the LLM.',\n ),\n model: z\n .string()\n .optional()\n .describe('Override the judge model (default routes via tcloud). e.g. \"claude-opus-4-7\".'),\n })\n .refine((v) => Boolean(v.rubricName) !== Boolean(v.rubric), {\n message: 'Provide exactly one of `rubricName` or `rubric`.',\n })\n .openapi('JudgeRequest')\n\nexport const JudgeResultSchema = z\n .object({\n composite: z\n .number()\n .min(0)\n .max(1)\n .describe('Weighted combination of dimension scores in 0..1. The single number to gate on.'),\n dimensions: z\n .record(z.string(), z.number())\n .describe('Per-dimension score, keyed by RubricDimension.id.'),\n failureModes: z\n .array(z.string())\n .default([])\n .describe('Failure-mode ids detected in the content (subset of rubric.failureModes ids).'),\n wins: z\n .array(z.string())\n .default([])\n .describe('Win ids detected in the content (subset of rubric.wins ids).'),\n rationale: z\n .string()\n .describe('Plain-English explanation of the score. Surfaced to the human reviewer.'),\n rubricVersion: z\n .string()\n .describe(\n 'Stable hash of the rubric used. Scores are only comparable across runs when this matches.',\n ),\n model: z.string().describe('Model that produced the judgement, for reproducibility.'),\n durationMs: z.number().int().nonnegative().describe('End-to-end wall time for this call.'),\n })\n .openapi('JudgeResult')\n\n// ── Rubric listing ──────────────────────────────────────────────────\n\nexport const RubricInfoSchema = z\n .object({\n name: z.string().describe('Pass this to /v1/judge as `rubricName`.'),\n description: z.string().describe('What this rubric measures.'),\n dimensions: z\n .array(z.object({ id: z.string(), description: z.string(), weight: z.number() }))\n .describe('The scoring axes this rubric uses, with weights.'),\n failureModes: z.array(z.string()).default([]).describe('Failure-mode ids this rubric detects.'),\n rubricVersion: z.string().describe('Stable hash — match this to compare scores across runs.'),\n })\n .openapi('RubricInfo')\n\nexport const ListRubricsResponseSchema = z\n .object({\n rubrics: z.array(RubricInfoSchema),\n })\n .openapi('ListRubricsResponse')\n\n// ── Version / health ────────────────────────────────────────────────\n\nexport const VersionResponseSchema = z\n .object({\n package: z.string().describe('Package name (always \"@tangle-network/agent-eval\").'),\n version: z.string().describe('Semver of the running server. Match your client to this.'),\n wireVersion: z\n .string()\n .describe(\n 'Wire-protocol semver. Bumps separately from package version when the schema changes.',\n ),\n apiSurface: z.array(z.string()).describe('List of supported method names.'),\n })\n .openapi('VersionResponse')\n\nexport const HealthResponseSchema = z\n .object({\n status: z.literal('ok'),\n uptimeSec: z.number(),\n })\n .openapi('HealthResponse')\n\n// ── Errors ──────────────────────────────────────────────────────────\n\nexport const ErrorResponseSchema = z\n .object({\n error: z\n .object({\n code: z\n .string()\n .describe('Machine-readable code: \"validation_error\", \"rubric_not_found\", \"judge_error\".'),\n message: z.string().describe('Human-readable message.'),\n details: z.unknown().optional().describe('Optional structured detail.'),\n })\n .describe('Errors are always wrapped in this shape across all endpoints.'),\n })\n .openapi('ErrorResponse')\n\n// ── Type exports for callers in the same package ────────────────────\n\nexport type RubricDimension = z.infer<typeof RubricDimensionSchema>\nexport type FailureMode = z.infer<typeof FailureModeSchema>\nexport type Rubric = z.infer<typeof RubricSchema>\nexport type JudgeRequest = z.infer<typeof JudgeRequestSchema>\nexport type JudgeResult = z.infer<typeof JudgeResultSchema>\nexport type RubricInfo = z.infer<typeof RubricInfoSchema>\nexport type ListRubricsResponse = z.infer<typeof ListRubricsResponseSchema>\nexport type VersionResponse = z.infer<typeof VersionResponseSchema>\nexport type ErrorResponse = z.infer<typeof ErrorResponseSchema>\n\n// ── Wire-protocol version ───────────────────────────────────────────\n\n/**\n * Bump on any breaking change to a request/response schema.\n * Non-breaking (additive) changes don't require a bump.\n */\nexport const WIRE_VERSION = '1.0.0'\n\n/**\n * Stable hash of a rubric. Used to make scores comparable across runs:\n * if the rubricVersion matches, the rubric was identical.\n */\nexport function hashRubric(rubric: Rubric): string {\n // deterministic stringify (keys sorted) for stable hashing\n const stable = JSON.stringify(rubric, Object.keys(rubric).sort())\n let h = 5381\n for (let i = 0; i < stable.length; i++) {\n h = (h * 33) ^ stable.charCodeAt(i)\n }\n // Unsigned 32-bit hex, prefixed with rubric name + version slot\n return `${rubric.name}@${(h >>> 0).toString(16).padStart(8, '0')}`\n}\n","/**\n * Built-in rubrics shipped with agent-eval.\n *\n * A rubric is a set of scoring axes plus a system prompt that tells the\n * judging LLM how to grade against those axes. Built-in rubrics are\n * curated for use cases that recur across Tangle projects — call them\n * by name from any client.\n *\n * Adding a rubric:\n * 1. Define the Rubric object below with a clear `description` and\n * named `dimensions`.\n * 2. Register it in `BUILTIN_RUBRICS` at the bottom.\n * 3. Add a test in `tests/wire/rubrics.test.ts`.\n *\n * Custom rubrics: callers pass `rubric` inline to /v1/judge instead of\n * `rubricName` — see schemas.ts.\n */\nimport type { Rubric } from './schemas'\nimport { hashRubric } from './schemas'\n\n// ── anti-slop ───────────────────────────────────────────────────────\n// Voice/style judge tuned for technical-buyer audiences. Used by the\n// Postiz autoresearch loop and any content-quality gate.\n\nconst ANTI_SLOP: Rubric = {\n name: 'anti-slop',\n description:\n 'Voice and signal quality for content aimed at senior engineers. Catches AI cadence, marketing tone, and engagement-bait shapes.',\n systemPrompt: `You are evaluating a piece of content written for senior engineers and technical founders.\n\nYou score three things:\n- buyer_quality (0..1): would a senior engineer in the target ICP find this worth their attention? High = specific, earned, technically interesting. Low = generic, hyped, off-target.\n- voice (0..1): does it read like a person who built the thing, or like AI/marketing copy?\n- signal (0..1): does it contain a non-obvious detail, constraint, or claim a reader couldn't get from the public docs?\n\nDetect failure modes (return ids matching):\n- ai-cadence: rule-of-three openings, em-dash flourish, \"Let me explain\", \"Here's the thing\", AI rhythm\n- marketing-tone: \"We're excited to announce\", \"thrilled\", \"delighted\", \"game-changer\", buzzword stack\n- vague-claim: technical claim without a specific component, file, or measurement\n- no-hook: opening doesn't earn attention from the target reader\n- engagement-bait: \"agree?\", \"thoughts?\", listicles, controversy-fishing, hook-detail-pitch\n- off-icp: content shape would attract motivational/grift/hype audiences instead of buyers\n- stale-claim: repeats a positioning line we've used many times this month\n\nDetect wins (return ids matching):\n- specific-component: names a real file, component, or measurement\n- earned-detail: shares a non-obvious detail not derivable from public docs\n- constraint-articulated: names a real tradeoff and the side chosen\n- honest-failure: describes a real failure mode and what was done about it\n\nReturn ONLY JSON matching the response schema. Be conservative — most content has 0-1 wins and 1-2 failure modes, not many of each.`,\n dimensions: [\n {\n id: 'buyer_quality',\n description: 'Would the target buyer find this worth attention?',\n weight: 0.5,\n min: 0,\n max: 1,\n },\n {\n id: 'voice',\n description: 'Does it sound like a builder, not AI or marketing?',\n weight: 0.3,\n min: 0,\n max: 1,\n },\n {\n id: 'signal',\n description: 'Non-obvious detail, constraint, or claim?',\n weight: 0.2,\n min: 0,\n max: 1,\n },\n ],\n failureModes: [\n { id: 'ai-cadence', description: 'AI-rhythm openings and transitions' },\n { id: 'marketing-tone', description: 'Buzzwords, hype, corporate-PR voice' },\n { id: 'vague-claim', description: 'Technical claim without specifics' },\n { id: 'no-hook', description: 'Opening fails to earn attention' },\n { id: 'engagement-bait', description: 'Listicle/controversy/agree-pattern' },\n { id: 'off-icp', description: 'Voice attracts the wrong audience' },\n { id: 'stale-claim', description: 'Reuses an over-used positioning line' },\n ],\n wins: [\n { id: 'specific-component', description: 'Names a real file/component/number' },\n { id: 'earned-detail', description: 'Detail not in public docs' },\n { id: 'constraint-articulated', description: 'Names a real tradeoff' },\n { id: 'honest-failure', description: 'Describes a real failure honestly' },\n ],\n}\n\n// ── Registry ────────────────────────────────────────────────────────\n\nexport const BUILTIN_RUBRICS: Record<string, Rubric> = {\n 'anti-slop': ANTI_SLOP,\n}\n\n/** Get a built-in rubric by name, or undefined. */\nexport function getBuiltinRubric(name: string): Rubric | undefined {\n return BUILTIN_RUBRICS[name]\n}\n\n/** List built-in rubrics with their stable versions. */\nexport function listBuiltinRubrics() {\n return Object.values(BUILTIN_RUBRICS).map((r) => ({\n name: r.name,\n description: r.description,\n dimensions: r.dimensions.map((d) => ({\n id: d.id,\n description: d.description,\n weight: d.weight,\n })),\n failureModes: r.failureModes.map((f) => f.id),\n rubricVersion: hashRubric(r),\n }))\n}\n","/**\n * Pure handler functions — the \"business logic\" behind every wire-protocol\n * method. The HTTP server (`server.ts`) and the stdio RPC (`rpc.ts`) both\n * call these. Tests call these directly without spinning a server.\n *\n * Each handler:\n * - Takes a parsed request (already Zod-validated by the transport).\n * - Returns a result that matches the response schema.\n * - Throws `WireError` for caller-fixable errors (404, 400, 422).\n * - Lets unexpected errors bubble — the transport maps them to 500.\n */\nimport { callLlmJson } from '../llm-client'\nimport { getBuiltinRubric, listBuiltinRubrics } from './rubrics'\nimport {\n hashRubric,\n WIRE_VERSION,\n type JudgeRequest,\n type JudgeResult,\n type ListRubricsResponse,\n type Rubric,\n type VersionResponse,\n} from './schemas'\n\n/** Caller-fixable error. The transport renders this to 4xx + ErrorResponse. */\nexport class WireError extends Error {\n constructor(\n public readonly code: string,\n message: string,\n public readonly status: number = 400,\n public readonly details?: unknown,\n ) {\n super(message)\n this.name = 'WireError'\n }\n}\n\n// ── judge ───────────────────────────────────────────────────────────\n\n/** The JSON schema we ask the judging LLM to fill in. */\nfunction judgeOutputSchema(rubric: Rubric) {\n return {\n name: 'JudgeOutput',\n schema: {\n type: 'object',\n additionalProperties: false,\n properties: {\n dimensions: {\n type: 'object',\n additionalProperties: false,\n properties: Object.fromEntries(\n rubric.dimensions.map((d) => [\n d.id,\n { type: 'number', minimum: d.min, maximum: d.max },\n ]),\n ),\n required: rubric.dimensions.map((d) => d.id),\n },\n failureModes: {\n type: 'array',\n items: { type: 'string', enum: rubric.failureModes.map((f) => f.id) },\n },\n wins: {\n type: 'array',\n items: { type: 'string', enum: rubric.wins.map((w) => w.id) },\n },\n rationale: { type: 'string' },\n },\n required: ['dimensions', 'rationale'],\n } as Record<string, unknown>,\n }\n}\n\ninterface JudgeOutput {\n dimensions: Record<string, number>\n failureModes?: string[]\n wins?: string[]\n rationale: string\n}\n\nfunction compositeScore(dimensions: Record<string, number>, rubric: Rubric): number {\n let weighted = 0\n let totalWeight = 0\n for (const dim of rubric.dimensions) {\n const raw = dimensions[dim.id] ?? 0\n const range = dim.max - dim.min || 1\n const normalized = Math.max(0, Math.min(1, (raw - dim.min) / range))\n weighted += normalized * dim.weight\n totalWeight += dim.weight\n }\n return totalWeight > 0 ? weighted / totalWeight : 0\n}\n\nfunction buildJudgePrompt(content: string, context: unknown): string {\n const ctx = context && Object.keys(context as object).length ? JSON.stringify(context) : ''\n return [\n `CONTENT TO JUDGE:`,\n content,\n '',\n ctx ? `CONTEXT (metadata, analytics, etc.):` : '',\n ctx ? ctx : '',\n ]\n .filter(Boolean)\n .join('\\n')\n}\n\nconst DEFAULT_JUDGE_MODEL = 'claude-sonnet-4-6'\n\nexport async function handleJudge(req: JudgeRequest): Promise<JudgeResult> {\n // Resolve rubric\n let rubric: Rubric\n if (req.rubricName) {\n const found = getBuiltinRubric(req.rubricName)\n if (!found) {\n throw new WireError('rubric_not_found', `No built-in rubric named \"${req.rubricName}\".`, 404)\n }\n rubric = found\n } else if (req.rubric) {\n rubric = req.rubric\n } else {\n // refine() in the schema should already have caught this — defense in depth\n throw new WireError('validation_error', 'Provide either `rubricName` or `rubric`.', 422)\n }\n\n const startedAt = Date.now()\n const model = req.model ?? DEFAULT_JUDGE_MODEL\n\n const { value, result } = await callLlmJson<JudgeOutput>({\n model,\n messages: [\n { role: 'system', content: rubric.systemPrompt },\n { role: 'user', content: buildJudgePrompt(req.content, req.context) },\n ],\n jsonSchema: judgeOutputSchema(rubric),\n temperature: 0.0,\n timeoutMs: 60_000,\n })\n\n // Defensive: ensure dimensions object isn't malformed\n if (!value || typeof value !== 'object' || !value.dimensions) {\n throw new WireError('judge_error', 'Judge returned malformed output.', 500, value)\n }\n\n const composite = compositeScore(value.dimensions, rubric)\n const durationMs = Date.now() - startedAt\n\n return {\n composite,\n dimensions: value.dimensions,\n failureModes: value.failureModes ?? [],\n wins: value.wins ?? [],\n rationale: value.rationale,\n rubricVersion: hashRubric(rubric),\n model: result.model,\n durationMs,\n }\n}\n\n// ── listRubrics ─────────────────────────────────────────────────────\n\nexport function handleListRubrics(): ListRubricsResponse {\n return { rubrics: listBuiltinRubrics() }\n}\n\n// ── version ─────────────────────────────────────────────────────────\n\nimport { readFileSync } from 'node:fs'\nimport { dirname, resolve } from 'node:path'\nimport { fileURLToPath } from 'node:url'\n\nlet CACHED_VERSION: string | undefined\n\nfunction readPackageVersion(): string {\n if (CACHED_VERSION) return CACHED_VERSION\n // Walk up from this file looking for the nearest package.json.\n // In dist/ this is dist/.., in src/wire/ this is ../../package.json.\n const here = dirname(fileURLToPath(import.meta.url))\n const candidates = [\n resolve(here, '..', '..', 'package.json'), // src/wire → repo root\n resolve(here, '..', 'package.json'), // dist → repo root\n ]\n for (const path of candidates) {\n try {\n const pkg = JSON.parse(readFileSync(path, 'utf-8')) as { version?: string }\n if (pkg.version) {\n CACHED_VERSION = pkg.version\n return pkg.version\n }\n } catch {\n // try next\n }\n }\n return '0.0.0-unknown'\n}\n\nexport function handleVersion(): VersionResponse {\n return {\n package: '@tangle-network/agent-eval',\n version: readPackageVersion(),\n wireVersion: WIRE_VERSION,\n apiSurface: ['judge', 'listRubrics', 'version'],\n }\n}\n","/**\n * Build an OpenAPI spec from the wire schemas.\n *\n * The spec is the contract that other-language clients (Python, Rust,\n * Go, …) generate from. There is no hand-written client — clients are\n * derived artifacts of this file plus `schemas.ts`.\n *\n * Run `pnpm openapi` (defined in package.json) to write the spec to\n * `dist/openapi.json`. CI uses that file to regenerate the Python\n * client and gate the dual-publish workflow.\n */\nimport { OpenApiGeneratorV31, OpenAPIRegistry } from '@asteasolutions/zod-to-openapi'\nimport type { OpenAPIObject } from 'openapi3-ts/oas31'\n\nimport {\n ErrorResponseSchema,\n HealthResponseSchema,\n JudgeRequestSchema,\n JudgeResultSchema,\n ListRubricsResponseSchema,\n VersionResponseSchema,\n WIRE_VERSION,\n} from './schemas'\n\nexport function buildOpenApi(packageVersion: string): OpenAPIObject {\n const registry = new OpenAPIRegistry()\n\n // Components — each schema becomes a $ref-able component\n registry.register('JudgeRequest', JudgeRequestSchema)\n registry.register('JudgeResult', JudgeResultSchema)\n registry.register('ListRubricsResponse', ListRubricsResponseSchema)\n registry.register('VersionResponse', VersionResponseSchema)\n registry.register('HealthResponse', HealthResponseSchema)\n registry.register('ErrorResponse', ErrorResponseSchema)\n\n // Routes\n registry.registerPath({\n method: 'post',\n path: '/v1/judge',\n summary: 'Score a piece of content against a rubric',\n description:\n 'Runs the judging LLM with the named (or inline) rubric and returns dimension scores, detected failure modes, wins, and a composite score in 0..1.',\n request: {\n body: {\n content: {\n 'application/json': { schema: JudgeRequestSchema },\n },\n },\n },\n responses: {\n 200: {\n description: 'Successful judgement',\n content: { 'application/json': { schema: JudgeResultSchema } },\n },\n 400: {\n description: 'Validation error',\n content: { 'application/json': { schema: ErrorResponseSchema } },\n },\n 404: {\n description: 'Rubric not found',\n content: { 'application/json': { schema: ErrorResponseSchema } },\n },\n 500: {\n description: 'Judge error',\n content: { 'application/json': { schema: ErrorResponseSchema } },\n },\n },\n })\n\n registry.registerPath({\n method: 'get',\n path: '/v1/rubrics',\n summary: 'List built-in rubrics',\n description:\n 'Returns every rubric registered server-side, with their dimensions and stable rubricVersion hash.',\n responses: {\n 200: {\n description: 'Listing',\n content: { 'application/json': { schema: ListRubricsResponseSchema } },\n },\n },\n })\n\n registry.registerPath({\n method: 'get',\n path: '/v1/version',\n summary: 'Server and wire-protocol version',\n description: 'Match your client version to `version`; check `wireVersion` for compatibility.',\n responses: {\n 200: {\n description: 'Version info',\n content: { 'application/json': { schema: VersionResponseSchema } },\n },\n },\n })\n\n registry.registerPath({\n method: 'get',\n path: '/healthz',\n summary: 'Liveness check',\n responses: {\n 200: {\n description: 'OK',\n content: { 'application/json': { schema: HealthResponseSchema } },\n },\n },\n })\n\n const generator = new OpenApiGeneratorV31(registry.definitions)\n return generator.generateDocument({\n openapi: '3.1.0',\n info: {\n title: '@tangle-network/agent-eval — wire protocol',\n version: packageVersion,\n description: `HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: ${WIRE_VERSION}. Bumps on breaking changes to request/response schemas.`,\n contact: { name: 'Tangle Network', url: 'https://github.com/tangle-network/agent-eval' },\n license: { name: 'MIT' },\n },\n servers: [{ url: 'http://localhost:5005', description: 'Local agent-eval serve' }],\n })\n}\n","/**\n * HTTP transport for the wire protocol.\n *\n * Hono + @hono/node-server. Every endpoint:\n * 1. Validates the request against its Zod schema.\n * 2. Calls the matching handler in `handlers.ts`.\n * 3. Renders 4xx for `WireError` with structured body, 500 for unexpected.\n *\n * The server has no internal state besides the handler imports — restart\n * costs nothing. Run via `agent-eval serve --port 5005`.\n */\nimport { serve, type ServerType } from '@hono/node-server'\nimport { Hono } from 'hono'\nimport { cors } from 'hono/cors'\n\nimport {\n handleJudge,\n handleListRubrics,\n handleVersion,\n WireError,\n} from './handlers'\nimport { buildOpenApi } from './openapi'\nimport { JudgeRequestSchema } from './schemas'\n\nconst STARTED_AT = Date.now()\n\nexport function createApp() {\n const app = new Hono()\n\n app.use('*', cors())\n\n app.onError((err, c) => {\n if (err instanceof WireError) {\n return c.json(\n { error: { code: err.code, message: err.message, details: err.details } },\n err.status as 400 | 404 | 422 | 500,\n )\n }\n // Unexpected — log and return generic 500 without leaking internals.\n console.error('[agent-eval] unhandled error:', err)\n return c.json(\n { error: { code: 'internal_error', message: 'Internal server error.' } },\n 500,\n )\n })\n\n // ── Health ──\n app.get('/healthz', (c) =>\n c.json({ status: 'ok' as const, uptimeSec: (Date.now() - STARTED_AT) / 1000 }),\n )\n\n // ── Version ──\n app.get('/v1/version', (c) => c.json(handleVersion()))\n\n // ── Rubrics ──\n app.get('/v1/rubrics', (c) => c.json(handleListRubrics()))\n\n // ── Judge ──\n app.post('/v1/judge', async (c) => {\n const raw = await c.req.json().catch(() => null)\n if (raw == null) {\n throw new WireError('validation_error', 'Request body must be JSON.', 400)\n }\n const parsed = JudgeRequestSchema.safeParse(raw)\n if (!parsed.success) {\n throw new WireError(\n 'validation_error',\n 'Request did not match JudgeRequest schema.',\n 400,\n parsed.error.issues,\n )\n }\n const result = await handleJudge(parsed.data)\n return c.json(result)\n })\n\n // ── OpenAPI spec ──\n app.get('/openapi.json', (c) => c.json(buildOpenApi(handleVersion().version)))\n\n return app\n}\n\nexport interface ServeOptions {\n /** Default 5005. */\n port?: number\n /** Default '127.0.0.1'. Set to '0.0.0.0' to listen on all interfaces. */\n host?: string\n}\n\nexport function startServer(opts: ServeOptions = {}): ServerType {\n const app = createApp()\n const port = opts.port ?? 5005\n const host = opts.host ?? '127.0.0.1'\n return serve({ fetch: app.fetch, port, hostname: host }, ({ address, port: actualPort }) => {\n // eslint-disable-next-line no-console\n console.log(`[agent-eval] serving on http://${address}:${actualPort}`)\n })\n}\n","/**\n * stdio RPC transport.\n *\n * For batch / cron use without a running server. The Python client falls\n * back to this when no server is reachable.\n *\n * Protocol (line-delimited JSON over stdin/stdout):\n * IN: one JSON object on stdin: {\"method\":\"judge\",\"params\":{...}}\n * OUT: one JSON object on stdout: {\"result\":{...}} or {\"error\":{...}}\n *\n * One request per process invocation. To pipeline many calls, the client\n * writes JSONL to stdin and reads JSONL from stdout — see batch mode below.\n */\nimport { handleJudge, handleListRubrics, handleVersion, WireError } from './handlers'\nimport { JudgeRequestSchema } from './schemas'\n\ninterface RpcRequest {\n method: 'judge' | 'listRubrics' | 'version'\n params?: unknown\n}\n\ninterface RpcSuccess {\n result: unknown\n}\n\ninterface RpcError {\n error: { code: string; message: string; details?: unknown }\n}\n\nexport async function dispatchRpc(req: RpcRequest): Promise<RpcSuccess | RpcError> {\n try {\n switch (req.method) {\n case 'judge': {\n const parsed = JudgeRequestSchema.safeParse(req.params)\n if (!parsed.success) {\n return {\n error: {\n code: 'validation_error',\n message: 'params did not match JudgeRequest schema.',\n details: parsed.error.issues,\n },\n }\n }\n return { result: await handleJudge(parsed.data) }\n }\n case 'listRubrics':\n return { result: handleListRubrics() }\n case 'version':\n return { result: handleVersion() }\n default:\n return {\n error: {\n code: 'unknown_method',\n message: `No such method: ${(req as { method: string }).method}`,\n },\n }\n }\n } catch (err) {\n if (err instanceof WireError) {\n return { error: { code: err.code, message: err.message, details: err.details } }\n }\n const message = err instanceof Error ? err.message : String(err)\n return { error: { code: 'internal_error', message } }\n }\n}\n\n// ── stdin/stdout driver ─────────────────────────────────────────────\n\nasync function readAll(stream: NodeJS.ReadableStream): Promise<string> {\n const chunks: Buffer[] = []\n for await (const chunk of stream) {\n chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk as string))\n }\n return Buffer.concat(chunks).toString('utf-8')\n}\n\n/** Read one JSON request from stdin, write one JSON response to stdout. */\nexport async function runRpcOnce(method?: string): Promise<number> {\n const raw = await readAll(process.stdin)\n let req: RpcRequest\n try {\n const body = JSON.parse(raw)\n req = method ? { method: method as RpcRequest['method'], params: body } : (body as RpcRequest)\n } catch (err) {\n process.stdout.write(\n JSON.stringify({\n error: {\n code: 'parse_error',\n message: `stdin was not valid JSON: ${err instanceof Error ? err.message : String(err)}`,\n },\n }) + '\\n',\n )\n return 1\n }\n const out = await dispatchRpc(req)\n process.stdout.write(JSON.stringify(out) + '\\n')\n return 'error' in out ? 1 : 0\n}\n\n/** Read JSONL requests from stdin, write JSONL responses to stdout. */\nexport async function runRpcBatch(method?: string): Promise<number> {\n const raw = await readAll(process.stdin)\n const lines = raw.split('\\n').filter((l) => l.trim().length > 0)\n let exitCode = 0\n for (const line of lines) {\n let req: RpcRequest\n try {\n const body = JSON.parse(line)\n req = method ? { method: method as RpcRequest['method'], params: body } : (body as RpcRequest)\n } catch (err) {\n process.stdout.write(\n JSON.stringify({\n error: {\n code: 'parse_error',\n message: `line was not valid JSON: ${err instanceof Error ? err.message : String(err)}`,\n },\n }) + '\\n',\n )\n exitCode = 1\n continue\n }\n const out = await dispatchRpc(req)\n process.stdout.write(JSON.stringify(out) + '\\n')\n if ('error' in out) exitCode = 1\n }\n return exitCode\n}\n"],"mappings":";;;;;AAYA,SAAS,4BAA4B;AACrC,SAAS,SAAS;AAElB,qBAAqB,CAAC;AAIf,IAAM,wBAAwB,EAClC,OAAO;AAAA,EACN,IAAI,EACD,OAAO,EACP,IAAI,CAAC,EACL,SAAS,gFAA2E;AAAA,EACvF,aAAa,EACV,OAAO,EACP,IAAI,CAAC,EACL,SAAS,sEAAsE;AAAA,EAClF,QAAQ,EACL,OAAO,EACP,IAAI,CAAC,EACL,QAAQ,CAAC,EACT,SAAS,gEAAgE;AAAA,EAC5E,KAAK,EAAE,OAAO,EAAE,QAAQ,CAAC,EAAE,SAAS,gDAAgD;AAAA,EACpF,KAAK,EAAE,OAAO,EAAE,QAAQ,CAAC,EAAE,SAAS,gDAAgD;AACtF,CAAC,EACA,QAAQ,iBAAiB;AAErB,IAAM,oBAAoB,EAC9B,OAAO;AAAA,EACN,IAAI,EAAE,OAAO,EAAE,IAAI,CAAC,EAAE,SAAS,mEAA8D;AAAA,EAC7F,aAAa,EAAE,OAAO,EAAE,IAAI,CAAC,EAAE,SAAS,mDAAmD;AAC7F,CAAC,EACA,QAAQ,aAAa;AAIjB,IAAM,eAAe,EACzB,OAAO;AAAA,EACN,MAAM,EACH,OAAO,EACP,IAAI,CAAC,EACL,SAAS,4EAAuE;AAAA,EACnF,aAAa,EACV,OAAO,EACP,IAAI,CAAC,EACL,SAAS,0DAA0D;AAAA,EACtE,cAAc,EACX,OAAO,EACP,IAAI,CAAC,EACL;AAAA,IACC;AAAA,EACF;AAAA,EACF,YAAY,EACT,MAAM,qBAAqB,EAC3B,IAAI,CAAC,EACL,SAAS,+DAA+D;AAAA,EAC3E,cAAc,EACX,MAAM,iBAAiB,EACvB,QAAQ,CAAC,CAAC,EACV,SAAS,iFAAiF;AAAA,EAC7F,MAAM,EACH,MAAM,iBAAiB,EACvB,QAAQ,CAAC,CAAC,EACV,SAAS,uEAAuE;AACrF,CAAC,EACA,QAAQ,QAAQ;AAIZ,IAAM,qBAAqB,EAC/B,OAAO;AAAA,EACN,YAAY,EACT,OAAO,EACP,SAAS,EACT,SAAS,kEAAkE;AAAA,EAC9E,QAAQ,aAAa,SAAS,EAAE;AAAA,IAC9B;AAAA,EACF;AAAA,EACA,SAAS,EACN,OAAO,EACP,IAAI,CAAC,EACL,SAAS,uFAAkF;AAAA,EAC9F,SAAS,EACN,OAAO,EAAE,OAAO,GAAG,EAAE,QAAQ,CAAC,EAC9B,SAAS,EACT;AAAA,IACC;AAAA,EACF;AAAA,EACF,OAAO,EACJ,OAAO,EACP,SAAS,EACT,SAAS,+EAA+E;AAC7F,CAAC,EACA,OAAO,CAAC,MAAM,QAAQ,EAAE,UAAU,MAAM,QAAQ,EAAE,MAAM,GAAG;AAAA,EAC1D,SAAS;AACX,CAAC,EACA,QAAQ,cAAc;AAElB,IAAM,oBAAoB,EAC9B,OAAO;AAAA,EACN,WAAW,EACR,OAAO,EACP,IAAI,CAAC,EACL,IAAI,CAAC,EACL,SAAS,iFAAiF;AAAA,EAC7F,YAAY,EACT,OAAO,EAAE,OAAO,GAAG,EAAE,OAAO,CAAC,EAC7B,SAAS,mDAAmD;AAAA,EAC/D,cAAc,EACX,MAAM,EAAE,OAAO,CAAC,EAChB,QAAQ,CAAC,CAAC,EACV,SAAS,+EAA+E;AAAA,EAC3F,MAAM,EACH,MAAM,EAAE,OAAO,CAAC,EAChB,QAAQ,CAAC,CAAC,EACV,SAAS,8DAA8D;AAAA,EAC1E,WAAW,EACR,OAAO,EACP,SAAS,yEAAyE;AAAA,EACrF,eAAe,EACZ,OAAO,EACP;AAAA,IACC;AAAA,EACF;AAAA,EACF,OAAO,EAAE,OAAO,EAAE,SAAS,yDAAyD;AAAA,EACpF,YAAY,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY,EAAE,SAAS,qCAAqC;AAC3F,CAAC,EACA,QAAQ,aAAa;AAIjB,IAAM,mBAAmB,EAC7B,OAAO;AAAA,EACN,MAAM,EAAE,OAAO,EAAE,SAAS,yCAAyC;AAAA,EACnE,aAAa,EAAE,OAAO,EAAE,SAAS,4BAA4B;AAAA,EAC7D,YAAY,EACT,MAAM,EAAE,OAAO,EAAE,IAAI,EAAE,OAAO,GAAG,aAAa,EAAE,OAAO,GAAG,QAAQ,EAAE,OAAO,EAAE,CAAC,CAAC,EAC/E,SAAS,kDAAkD;AAAA,EAC9D,cAAc,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,QAAQ,CAAC,CAAC,EAAE,SAAS,uCAAuC;AAAA,EAC9F,eAAe,EAAE,OAAO,EAAE,SAAS,8DAAyD;AAC9F,CAAC,EACA,QAAQ,YAAY;AAEhB,IAAM,4BAA4B,EACtC,OAAO;AAAA,EACN,SAAS,EAAE,MAAM,gBAAgB;AACnC,CAAC,EACA,QAAQ,qBAAqB;AAIzB,IAAM,wBAAwB,EAClC,OAAO;AAAA,EACN,SAAS,EAAE,OAAO,EAAE,SAAS,qDAAqD;AAAA,EAClF,SAAS,EAAE,OAAO,EAAE,SAAS,0DAA0D;AAAA,EACvF,aAAa,EACV,OAAO,EACP;AAAA,IACC;AAAA,EACF;AAAA,EACF,YAAY,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,SAAS,iCAAiC;AAC5E,CAAC,EACA,QAAQ,iBAAiB;AAErB,IAAM,uBAAuB,EACjC,OAAO;AAAA,EACN,QAAQ,EAAE,QAAQ,IAAI;AAAA,EACtB,WAAW,EAAE,OAAO;AACtB,CAAC,EACA,QAAQ,gBAAgB;AAIpB,IAAM,sBAAsB,EAChC,OAAO;AAAA,EACN,OAAO,EACJ,OAAO;AAAA,IACN,MAAM,EACH,OAAO,EACP,SAAS,+EAA+E;AAAA,IAC3F,SAAS,EAAE,OAAO,EAAE,SAAS,yBAAyB;AAAA,IACtD,SAAS,EAAE,QAAQ,EAAE,SAAS,EAAE,SAAS,6BAA6B;AAAA,EACxE,CAAC,EACA,SAAS,+DAA+D;AAC7E,CAAC,EACA,QAAQ,eAAe;AAoBnB,IAAM,eAAe;AAMrB,SAAS,WAAW,QAAwB;AAEjD,QAAM,SAAS,KAAK,UAAU,QAAQ,OAAO,KAAK,MAAM,EAAE,KAAK,CAAC;AAChE,MAAI,IAAI;AACR,WAAS,IAAI,GAAG,IAAI,OAAO,QAAQ,KAAK;AACtC,QAAK,IAAI,KAAM,OAAO,WAAW,CAAC;AAAA,EACpC;AAEA,SAAO,GAAG,OAAO,IAAI,KAAK,MAAM,GAAG,SAAS,EAAE,EAAE,SAAS,GAAG,GAAG,CAAC;AAClE;;;AChNA,IAAM,YAAoB;AAAA,EACxB,MAAM;AAAA,EACN,aACE;AAAA,EACF,cAAc;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAuBd,YAAY;AAAA,IACV;AAAA,MACE,IAAI;AAAA,MACJ,aAAa;AAAA,MACb,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,KAAK;AAAA,IACP;AAAA,IACA;AAAA,MACE,IAAI;AAAA,MACJ,aAAa;AAAA,MACb,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,KAAK;AAAA,IACP;AAAA,IACA;AAAA,MACE,IAAI;AAAA,MACJ,aAAa;AAAA,MACb,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,KAAK;AAAA,IACP;AAAA,EACF;AAAA,EACA,cAAc;AAAA,IACZ,EAAE,IAAI,cAAc,aAAa,qCAAqC;AAAA,IACtE,EAAE,IAAI,kBAAkB,aAAa,sCAAsC;AAAA,IAC3E,EAAE,IAAI,eAAe,aAAa,oCAAoC;AAAA,IACtE,EAAE,IAAI,WAAW,aAAa,kCAAkC;AAAA,IAChE,EAAE,IAAI,mBAAmB,aAAa,qCAAqC;AAAA,IAC3E,EAAE,IAAI,WAAW,aAAa,oCAAoC;AAAA,IAClE,EAAE,IAAI,eAAe,aAAa,uCAAuC;AAAA,EAC3E;AAAA,EACA,MAAM;AAAA,IACJ,EAAE,IAAI,sBAAsB,aAAa,qCAAqC;AAAA,IAC9E,EAAE,IAAI,iBAAiB,aAAa,4BAA4B;AAAA,IAChE,EAAE,IAAI,0BAA0B,aAAa,wBAAwB;AAAA,IACrE,EAAE,IAAI,kBAAkB,aAAa,oCAAoC;AAAA,EAC3E;AACF;AAIO,IAAM,kBAA0C;AAAA,EACrD,aAAa;AACf;AAGO,SAAS,iBAAiB,MAAkC;AACjE,SAAO,gBAAgB,IAAI;AAC7B;AAGO,SAAS,qBAAqB;AACnC,SAAO,OAAO,OAAO,eAAe,EAAE,IAAI,CAAC,OAAO;AAAA,IAChD,MAAM,EAAE;AAAA,IACR,aAAa,EAAE;AAAA,IACf,YAAY,EAAE,WAAW,IAAI,CAAC,OAAO;AAAA,MACnC,IAAI,EAAE;AAAA,MACN,aAAa,EAAE;AAAA,MACf,QAAQ,EAAE;AAAA,IACZ,EAAE;AAAA,IACF,cAAc,EAAE,aAAa,IAAI,CAAC,MAAM,EAAE,EAAE;AAAA,IAC5C,eAAe,WAAW,CAAC;AAAA,EAC7B,EAAE;AACJ;;;ACkDA,SAAS,oBAAoB;AAC7B,SAAS,SAAS,eAAe;AACjC,SAAS,qBAAqB;AA/IvB,IAAM,YAAN,cAAwB,MAAM;AAAA,EACnC,YACkB,MAChB,SACgB,SAAiB,KACjB,SAChB;AACA,UAAM,OAAO;AALG;AAEA;AACA;AAGhB,SAAK,OAAO;AAAA,EACd;AAAA,EAPkB;AAAA,EAEA;AAAA,EACA;AAKpB;AAKA,SAAS,kBAAkB,QAAgB;AACzC,SAAO;AAAA,IACL,MAAM;AAAA,IACN,QAAQ;AAAA,MACN,MAAM;AAAA,MACN,sBAAsB;AAAA,MACtB,YAAY;AAAA,QACV,YAAY;AAAA,UACV,MAAM;AAAA,UACN,sBAAsB;AAAA,UACtB,YAAY,OAAO;AAAA,YACjB,OAAO,WAAW,IAAI,CAAC,MAAM;AAAA,cAC3B,EAAE;AAAA,cACF,EAAE,MAAM,UAAU,SAAS,EAAE,KAAK,SAAS,EAAE,IAAI;AAAA,YACnD,CAAC;AAAA,UACH;AAAA,UACA,UAAU,OAAO,WAAW,IAAI,CAAC,MAAM,EAAE,EAAE;AAAA,QAC7C;AAAA,QACA,cAAc;AAAA,UACZ,MAAM;AAAA,UACN,OAAO,EAAE,MAAM,UAAU,MAAM,OAAO,aAAa,IAAI,CAAC,MAAM,EAAE,EAAE,EAAE;AAAA,QACtE;AAAA,QACA,MAAM;AAAA,UACJ,MAAM;AAAA,UACN,OAAO,EAAE,MAAM,UAAU,MAAM,OAAO,KAAK,IAAI,CAAC,MAAM,EAAE,EAAE,EAAE;AAAA,QAC9D;AAAA,QACA,WAAW,EAAE,MAAM,SAAS;AAAA,MAC9B;AAAA,MACA,UAAU,CAAC,cAAc,WAAW;AAAA,IACtC;AAAA,EACF;AACF;AASA,SAAS,eAAe,YAAoC,QAAwB;AAClF,MAAI,WAAW;AACf,MAAI,cAAc;AAClB,aAAW,OAAO,OAAO,YAAY;AACnC,UAAM,MAAM,WAAW,IAAI,EAAE,KAAK;AAClC,UAAM,QAAQ,IAAI,MAAM,IAAI,OAAO;AACnC,UAAM,aAAa,KAAK,IAAI,GAAG,KAAK,IAAI,IAAI,MAAM,IAAI,OAAO,KAAK,CAAC;AACnE,gBAAY,aAAa,IAAI;AAC7B,mBAAe,IAAI;AAAA,EACrB;AACA,SAAO,cAAc,IAAI,WAAW,cAAc;AACpD;AAEA,SAAS,iBAAiB,SAAiB,SAA0B;AACnE,QAAM,MAAM,WAAW,OAAO,KAAK,OAAiB,EAAE,SAAS,KAAK,UAAU,OAAO,IAAI;AACzF,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA,MAAM,yCAAyC;AAAA,IAC/C,MAAM,MAAM;AAAA,EACd,EACG,OAAO,OAAO,EACd,KAAK,IAAI;AACd;AAEA,IAAM,sBAAsB;AAE5B,eAAsB,YAAY,KAAyC;AAEzE,MAAI;AACJ,MAAI,IAAI,YAAY;AAClB,UAAM,QAAQ,iBAAiB,IAAI,UAAU;AAC7C,QAAI,CAAC,OAAO;AACV,YAAM,IAAI,UAAU,oBAAoB,6BAA6B,IAAI,UAAU,MAAM,GAAG;AAAA,IAC9F;AACA,aAAS;AAAA,EACX,WAAW,IAAI,QAAQ;AACrB,aAAS,IAAI;AAAA,EACf,OAAO;AAEL,UAAM,IAAI,UAAU,oBAAoB,4CAA4C,GAAG;AAAA,EACzF;AAEA,QAAM,YAAY,KAAK,IAAI;AAC3B,QAAM,QAAQ,IAAI,SAAS;AAE3B,QAAM,EAAE,OAAO,OAAO,IAAI,MAAM,YAAyB;AAAA,IACvD;AAAA,IACA,UAAU;AAAA,MACR,EAAE,MAAM,UAAU,SAAS,OAAO,aAAa;AAAA,MAC/C,EAAE,MAAM,QAAQ,SAAS,iBAAiB,IAAI,SAAS,IAAI,OAAO,EAAE;AAAA,IACtE;AAAA,IACA,YAAY,kBAAkB,MAAM;AAAA,IACpC,aAAa;AAAA,IACb,WAAW;AAAA,EACb,CAAC;AAGD,MAAI,CAAC,SAAS,OAAO,UAAU,YAAY,CAAC,MAAM,YAAY;AAC5D,UAAM,IAAI,UAAU,eAAe,oCAAoC,KAAK,KAAK;AAAA,EACnF;AAEA,QAAM,YAAY,eAAe,MAAM,YAAY,MAAM;AACzD,QAAM,aAAa,KAAK,IAAI,IAAI;AAEhC,SAAO;AAAA,IACL;AAAA,IACA,YAAY,MAAM;AAAA,IAClB,cAAc,MAAM,gBAAgB,CAAC;AAAA,IACrC,MAAM,MAAM,QAAQ,CAAC;AAAA,IACrB,WAAW,MAAM;AAAA,IACjB,eAAe,WAAW,MAAM;AAAA,IAChC,OAAO,OAAO;AAAA,IACd;AAAA,EACF;AACF;AAIO,SAAS,oBAAyC;AACvD,SAAO,EAAE,SAAS,mBAAmB,EAAE;AACzC;AAQA,IAAI;AAEJ,SAAS,qBAA6B;AACpC,MAAI,eAAgB,QAAO;AAG3B,QAAM,OAAO,QAAQ,cAAc,YAAY,GAAG,CAAC;AACnD,QAAM,aAAa;AAAA,IACjB,QAAQ,MAAM,MAAM,MAAM,cAAc;AAAA;AAAA,IACxC,QAAQ,MAAM,MAAM,cAAc;AAAA;AAAA,EACpC;AACA,aAAW,QAAQ,YAAY;AAC7B,QAAI;AACF,YAAM,MAAM,KAAK,MAAM,aAAa,MAAM,OAAO,CAAC;AAClD,UAAI,IAAI,SAAS;AACf,yBAAiB,IAAI;AACrB,eAAO,IAAI;AAAA,MACb;AAAA,IACF,QAAQ;AAAA,IAER;AAAA,EACF;AACA,SAAO;AACT;AAEO,SAAS,gBAAiC;AAC/C,SAAO;AAAA,IACL,SAAS;AAAA,IACT,SAAS,mBAAmB;AAAA,IAC5B,aAAa;AAAA,IACb,YAAY,CAAC,SAAS,eAAe,SAAS;AAAA,EAChD;AACF;;;AC9LA,SAAS,qBAAqB,uBAAuB;AAa9C,SAAS,aAAa,gBAAuC;AAClE,QAAM,WAAW,IAAI,gBAAgB;AAGrC,WAAS,SAAS,gBAAgB,kBAAkB;AACpD,WAAS,SAAS,eAAe,iBAAiB;AAClD,WAAS,SAAS,uBAAuB,yBAAyB;AAClE,WAAS,SAAS,mBAAmB,qBAAqB;AAC1D,WAAS,SAAS,kBAAkB,oBAAoB;AACxD,WAAS,SAAS,iBAAiB,mBAAmB;AAGtD,WAAS,aAAa;AAAA,IACpB,QAAQ;AAAA,IACR,MAAM;AAAA,IACN,SAAS;AAAA,IACT,aACE;AAAA,IACF,SAAS;AAAA,MACP,MAAM;AAAA,QACJ,SAAS;AAAA,UACP,oBAAoB,EAAE,QAAQ,mBAAmB;AAAA,QACnD;AAAA,MACF;AAAA,IACF;AAAA,IACA,WAAW;AAAA,MACT,KAAK;AAAA,QACH,aAAa;AAAA,QACb,SAAS,EAAE,oBAAoB,EAAE,QAAQ,kBAAkB,EAAE;AAAA,MAC/D;AAAA,MACA,KAAK;AAAA,QACH,aAAa;AAAA,QACb,SAAS,EAAE,oBAAoB,EAAE,QAAQ,oBAAoB,EAAE;AAAA,MACjE;AAAA,MACA,KAAK;AAAA,QACH,aAAa;AAAA,QACb,SAAS,EAAE,oBAAoB,EAAE,QAAQ,oBAAoB,EAAE;AAAA,MACjE;AAAA,MACA,KAAK;AAAA,QACH,aAAa;AAAA,QACb,SAAS,EAAE,oBAAoB,EAAE,QAAQ,oBAAoB,EAAE;AAAA,MACjE;AAAA,IACF;AAAA,EACF,CAAC;AAED,WAAS,aAAa;AAAA,IACpB,QAAQ;AAAA,IACR,MAAM;AAAA,IACN,SAAS;AAAA,IACT,aACE;AAAA,IACF,WAAW;AAAA,MACT,KAAK;AAAA,QACH,aAAa;AAAA,QACb,SAAS,EAAE,oBAAoB,EAAE,QAAQ,0BAA0B,EAAE;AAAA,MACvE;AAAA,IACF;AAAA,EACF,CAAC;AAED,WAAS,aAAa;AAAA,IACpB,QAAQ;AAAA,IACR,MAAM;AAAA,IACN,SAAS;AAAA,IACT,aAAa;AAAA,IACb,WAAW;AAAA,MACT,KAAK;AAAA,QACH,aAAa;AAAA,QACb,SAAS,EAAE,oBAAoB,EAAE,QAAQ,sBAAsB,EAAE;AAAA,MACnE;AAAA,IACF;AAAA,EACF,CAAC;AAED,WAAS,aAAa;AAAA,IACpB,QAAQ;AAAA,IACR,MAAM;AAAA,IACN,SAAS;AAAA,IACT,WAAW;AAAA,MACT,KAAK;AAAA,QACH,aAAa;AAAA,QACb,SAAS,EAAE,oBAAoB,EAAE,QAAQ,qBAAqB,EAAE;AAAA,MAClE;AAAA,IACF;AAAA,EACF,CAAC;AAED,QAAM,YAAY,IAAI,oBAAoB,SAAS,WAAW;AAC9D,SAAO,UAAU,iBAAiB;AAAA,IAChC,SAAS;AAAA,IACT,MAAM;AAAA,MACJ,OAAO;AAAA,MACP,SAAS;AAAA,MACT,aAAa;AAAA;AAAA,yBAEM,YAAY;AAAA,MAC/B,SAAS,EAAE,MAAM,kBAAkB,KAAK,+CAA+C;AAAA,MACvF,SAAS,EAAE,MAAM,MAAM;AAAA,IACzB;AAAA,IACA,SAAS,CAAC,EAAE,KAAK,yBAAyB,aAAa,yBAAyB,CAAC;AAAA,EACnF,CAAC;AACH;;;AC/GA,SAAS,aAA8B;AACvC,SAAS,YAAY;AACrB,SAAS,YAAY;AAWrB,IAAM,aAAa,KAAK,IAAI;AAErB,SAAS,YAAY;AAC1B,QAAM,MAAM,IAAI,KAAK;AAErB,MAAI,IAAI,KAAK,KAAK,CAAC;AAEnB,MAAI,QAAQ,CAAC,KAAK,MAAM;AACtB,QAAI,eAAe,WAAW;AAC5B,aAAO,EAAE;AAAA,QACP,EAAE,OAAO,EAAE,MAAM,IAAI,MAAM,SAAS,IAAI,SAAS,SAAS,IAAI,QAAQ,EAAE;AAAA,QACxE,IAAI;AAAA,MACN;AAAA,IACF;AAEA,YAAQ,MAAM,iCAAiC,GAAG;AAClD,WAAO,EAAE;AAAA,MACP,EAAE,OAAO,EAAE,MAAM,kBAAkB,SAAS,yBAAyB,EAAE;AAAA,MACvE;AAAA,IACF;AAAA,EACF,CAAC;AAGD,MAAI;AAAA,IAAI;AAAA,IAAY,CAAC,MACnB,EAAE,KAAK,EAAE,QAAQ,MAAe,YAAY,KAAK,IAAI,IAAI,cAAc,IAAK,CAAC;AAAA,EAC/E;AAGA,MAAI,IAAI,eAAe,CAAC,MAAM,EAAE,KAAK,cAAc,CAAC,CAAC;AAGrD,MAAI,IAAI,eAAe,CAAC,MAAM,EAAE,KAAK,kBAAkB,CAAC,CAAC;AAGzD,MAAI,KAAK,aAAa,OAAO,MAAM;AACjC,UAAM,MAAM,MAAM,EAAE,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI;AAC/C,QAAI,OAAO,MAAM;AACf,YAAM,IAAI,UAAU,oBAAoB,8BAA8B,GAAG;AAAA,IAC3E;AACA,UAAM,SAAS,mBAAmB,UAAU,GAAG;AAC/C,QAAI,CAAC,OAAO,SAAS;AACnB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,QACA;AAAA,QACA,OAAO,MAAM;AAAA,MACf;AAAA,IACF;AACA,UAAM,SAAS,MAAM,YAAY,OAAO,IAAI;AAC5C,WAAO,EAAE,KAAK,MAAM;AAAA,EACtB,CAAC;AAGD,MAAI,IAAI,iBAAiB,CAAC,MAAM,EAAE,KAAK,aAAa,cAAc,EAAE,OAAO,CAAC,CAAC;AAE7E,SAAO;AACT;AASO,SAAS,YAAY,OAAqB,CAAC,GAAe;AAC/D,QAAM,MAAM,UAAU;AACtB,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,OAAO,KAAK,QAAQ;AAC1B,SAAO,MAAM,EAAE,OAAO,IAAI,OAAO,MAAM,UAAU,KAAK,GAAG,CAAC,EAAE,SAAS,MAAM,WAAW,MAAM;AAE1F,YAAQ,IAAI,kCAAkC,OAAO,IAAI,UAAU,EAAE;AAAA,EACvE,CAAC;AACH;;;ACpEA,eAAsB,YAAY,KAAiD;AACjF,MAAI;AACF,YAAQ,IAAI,QAAQ;AAAA,MAClB,KAAK,SAAS;AACZ,cAAM,SAAS,mBAAmB,UAAU,IAAI,MAAM;AACtD,YAAI,CAAC,OAAO,SAAS;AACnB,iBAAO;AAAA,YACL,OAAO;AAAA,cACL,MAAM;AAAA,cACN,SAAS;AAAA,cACT,SAAS,OAAO,MAAM;AAAA,YACxB;AAAA,UACF;AAAA,QACF;AACA,eAAO,EAAE,QAAQ,MAAM,YAAY,OAAO,IAAI,EAAE;AAAA,MAClD;AAAA,MACA,KAAK;AACH,eAAO,EAAE,QAAQ,kBAAkB,EAAE;AAAA,MACvC,KAAK;AACH,eAAO,EAAE,QAAQ,cAAc,EAAE;AAAA,MACnC;AACE,eAAO;AAAA,UACL,OAAO;AAAA,YACL,MAAM;AAAA,YACN,SAAS,mBAAoB,IAA2B,MAAM;AAAA,UAChE;AAAA,QACF;AAAA,IACJ;AAAA,EACF,SAAS,KAAK;AACZ,QAAI,eAAe,WAAW;AAC5B,aAAO,EAAE,OAAO,EAAE,MAAM,IAAI,MAAM,SAAS,IAAI,SAAS,SAAS,IAAI,QAAQ,EAAE;AAAA,IACjF;AACA,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,WAAO,EAAE,OAAO,EAAE,MAAM,kBAAkB,QAAQ,EAAE;AAAA,EACtD;AACF;AAIA,eAAe,QAAQ,QAAgD;AACrE,QAAM,SAAmB,CAAC;AAC1B,mBAAiB,SAAS,QAAQ;AAChC,WAAO,KAAK,OAAO,SAAS,KAAK,IAAI,QAAQ,OAAO,KAAK,KAAe,CAAC;AAAA,EAC3E;AACA,SAAO,OAAO,OAAO,MAAM,EAAE,SAAS,OAAO;AAC/C;AAGA,eAAsB,WAAW,QAAkC;AACjE,QAAM,MAAM,MAAM,QAAQ,QAAQ,KAAK;AACvC,MAAI;AACJ,MAAI;AACF,UAAM,OAAO,KAAK,MAAM,GAAG;AAC3B,UAAM,SAAS,EAAE,QAAwC,QAAQ,KAAK,IAAK;AAAA,EAC7E,SAAS,KAAK;AACZ,YAAQ,OAAO;AAAA,MACb,KAAK,UAAU;AAAA,QACb,OAAO;AAAA,UACL,MAAM;AAAA,UACN,SAAS,6BAA6B,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,QACxF;AAAA,MACF,CAAC,IAAI;AAAA,IACP;AACA,WAAO;AAAA,EACT;AACA,QAAM,MAAM,MAAM,YAAY,GAAG;AACjC,UAAQ,OAAO,MAAM,KAAK,UAAU,GAAG,IAAI,IAAI;AAC/C,SAAO,WAAW,MAAM,IAAI;AAC9B;AAGA,eAAsB,YAAY,QAAkC;AAClE,QAAM,MAAM,MAAM,QAAQ,QAAQ,KAAK;AACvC,QAAM,QAAQ,IAAI,MAAM,IAAI,EAAE,OAAO,CAAC,MAAM,EAAE,KAAK,EAAE,SAAS,CAAC;AAC/D,MAAI,WAAW;AACf,aAAW,QAAQ,OAAO;AACxB,QAAI;AACJ,QAAI;AACF,YAAM,OAAO,KAAK,MAAM,IAAI;AAC5B,YAAM,SAAS,EAAE,QAAwC,QAAQ,KAAK,IAAK;AAAA,IAC7E,SAAS,KAAK;AACZ,cAAQ,OAAO;AAAA,QACb,KAAK,UAAU;AAAA,UACb,OAAO;AAAA,YACL,MAAM;AAAA,YACN,SAAS,4BAA4B,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,UACvF;AAAA,QACF,CAAC,IAAI;AAAA,MACP;AACA,iBAAW;AACX;AAAA,IACF;AACA,UAAM,MAAM,MAAM,YAAY,GAAG;AACjC,YAAQ,OAAO,MAAM,KAAK,UAAU,GAAG,IAAI,IAAI;AAC/C,QAAI,WAAW,IAAK,YAAW;AAAA,EACjC;AACA,SAAO;AACT;","names":[]}
@@ -1,44 +0,0 @@
1
- # Example benchmark wrappers
2
-
3
- Reference implementations of `BenchmarkAdapter` for two public benchmarks. They are NOT bundled — they're intentionally shipped as source you read, copy, and adapt.
4
-
5
- | Wrapper | What it does | Why it's an example, not core |
6
- |---|---|---|
7
- | [`gsm8k/`](./gsm8k) | Exact-match grading on the final numeric answer of GSM8K (Cobbe et al.) | The dataset isn't ours and isn't bundled. The wrapper points to a local JSONL via `AGENT_EVAL_GSM8K_PATH`. |
8
- | [`swebench-lite/`](./swebench-lite) | Pass/fail grading via an external SWE-Bench grader command | The grader is a separate binary; the wrapper stubs the integration via `AGENT_EVAL_SWEBENCH_GRADER_CMD`. |
9
-
10
- The novel benchmark we ship and own — the synthetic routing task — lives in `src/benchmarks/routing/` and IS in the bundle.
11
-
12
- ## Using these wrappers
13
-
14
- Two paths.
15
-
16
- **Option A — read and inline.** Copy the wrapper file into your project. Replace the import paths from `../../../src/benchmarks/types` and `../../../src/run-record` with `@tangle-network/agent-eval`. Done.
17
-
18
- **Option B — import from agent-eval source.** If your project sits in this monorepo (or you've cloned the repo), import directly:
19
-
20
- ```ts
21
- import * as gsm8k from '@tangle-network/agent-eval/examples/benchmarks/gsm8k'
22
- ```
23
-
24
- This requires adding `examples/**/*.ts` to your TypeScript paths. Easier to just copy.
25
-
26
- ## What every BenchmarkAdapter exports
27
-
28
- ```ts
29
- loadDataset(split: 'search' | 'dev' | 'holdout'): Promise<DatasetItem[]>
30
- evaluate(item, response): Promise<{ score: number, raw: Record<string, unknown> }>
31
- assignSplit(itemId: string): 'search' | 'dev' | 'holdout'
32
- ```
33
-
34
- `assignSplit` uses `deterministicSplit(itemId, BENCHMARK_SPLIT_SEED)` — same item gets the same split everywhere. Don't change the seed; it's load-bearing for reproducibility.
35
-
36
- ## Adding a new benchmark
37
-
38
- 1. Create `examples/benchmarks/<your-benchmark>/index.ts`.
39
- 2. Export `loadDataset`, `evaluate`, `assignSplit`. Optionally a typed `Adapter` class.
40
- 3. Use `deterministicSplit` from `@tangle-network/agent-eval` for split assignment.
41
- 4. Fail loud on missing config (env vars, paths). Never default to silent-pass.
42
- 5. Document config requirements in a per-benchmark README.
43
-
44
- If your benchmark is novel and broadly useful, propose moving it into `src/benchmarks/` as core surface (PR welcome). The bar is: novel rubric, reusable across projects, low maintenance burden.
@@ -1,126 +0,0 @@
1
- /**
2
- * GSM8K wrapper — exact-match grading on the final numeric answer.
3
- *
4
- * The dataset itself is NOT bundled. `loadDataset` will:
5
- * 1. read from `process.env.AGENT_EVAL_GSM8K_PATH` if set (a JSONL
6
- * file with `{ id, question, answer }` records — the standard
7
- * HF mirror layout converted to JSONL);
8
- * 2. otherwise throw a clearly-marked error pointing to the loader.
9
- *
10
- * `evaluate` parses the final number out of the response (last
11
- * occurrence of a signed-decimal-or-integer literal, optionally after
12
- * `####`, the GSM8K answer convention) and compares to the ground-
13
- * truth integer. Floating-point comparisons use a 1e-6 tolerance.
14
- */
15
-
16
- import { existsSync, readFileSync } from 'node:fs'
17
-
18
- import type {
19
- BenchmarkAdapter,
20
- BenchmarkDatasetItem,
21
- BenchmarkEvaluation,
22
- } from '../../../src/benchmarks/types'
23
- import { deterministicSplit } from '../../../src/benchmarks/types'
24
- import type { RunSplitTag } from '../../../src/run-record'
25
-
26
- export interface Gsm8kPayload {
27
- question: string
28
- /** Reference answer, post-#### normalization. May be a number or
29
- * a numeric string ("72", "1.5"). */
30
- answer: string
31
- }
32
-
33
- export type Gsm8kItem = BenchmarkDatasetItem<Gsm8kPayload>
34
-
35
- class Gsm8kAdapter implements BenchmarkAdapter<Gsm8kItem, Gsm8kPayload> {
36
- async loadDataset(split: RunSplitTag): Promise<Gsm8kItem[]> {
37
- const path = process.env.AGENT_EVAL_GSM8K_PATH
38
- if (!path) {
39
- throw new Error(
40
- 'GSM8K dataset not provided. Set AGENT_EVAL_GSM8K_PATH to a JSONL file ' +
41
- 'with {id, question, answer} records (the HF GSM8K mirror converted to JSONL).',
42
- )
43
- }
44
- if (!existsSync(path)) {
45
- throw new Error(`AGENT_EVAL_GSM8K_PATH=${path} does not exist`)
46
- }
47
- const items = parseJsonl(path).filter((it) => assignSplitImpl(it.id) === split)
48
- return items
49
- }
50
-
51
- async evaluate(item: Gsm8kItem, response: string): Promise<BenchmarkEvaluation> {
52
- const expected = parseGsm8kAnswer(item.payload.answer)
53
- const observed = parseGsm8kAnswer(response)
54
- if (expected === null) {
55
- // Defensive: the dataset should never ship a non-numeric ref.
56
- return { score: 0, raw: { reason: 'reference_not_numeric', expected: item.payload.answer } }
57
- }
58
- if (observed === null) {
59
- return { score: 0, raw: { reason: 'no_numeric_in_response', expected, observed: null } }
60
- }
61
- const ok = Math.abs(expected - observed) < 1e-6
62
- return { score: ok ? 1 : 0, raw: { expected, observed, exactMatch: ok } }
63
- }
64
-
65
- assignSplit(itemId: string): RunSplitTag {
66
- return assignSplitImpl(itemId)
67
- }
68
- }
69
-
70
- function assignSplitImpl(itemId: string): RunSplitTag {
71
- return deterministicSplit(`gsm8k::${itemId}`)
72
- }
73
-
74
- function parseJsonl(path: string): Gsm8kItem[] {
75
- const raw = readFileSync(path, 'utf8')
76
- const out: Gsm8kItem[] = []
77
- let lineNo = 0
78
- for (const line of raw.split('\n')) {
79
- lineNo++
80
- const trimmed = line.trim()
81
- if (!trimmed) continue
82
- let row: Record<string, unknown>
83
- try {
84
- row = JSON.parse(trimmed) as Record<string, unknown>
85
- } catch (e) {
86
- throw new Error(`GSM8K JSONL parse error at line ${lineNo}: ${(e as Error).message}`)
87
- }
88
- const id = String(row.id ?? `gsm8k_${lineNo}`)
89
- const question = String(row.question ?? '')
90
- const answer = String(row.answer ?? '')
91
- if (!question || !answer) {
92
- throw new Error(`GSM8K JSONL line ${lineNo} missing question/answer`)
93
- }
94
- out.push({ id, payload: { question, answer } })
95
- }
96
- return out
97
- }
98
-
99
- /**
100
- * Parse a GSM8K-style answer. Honors the dataset's `#### N`
101
- * convention (the canonical answer comes after `####`); otherwise
102
- * returns the LAST signed numeric literal in the string.
103
- */
104
- export function parseGsm8kAnswer(text: string): number | null {
105
- if (!text) return null
106
- const afterMarker = text.match(/####\s*(-?\d[\d,]*\.?\d*)/)
107
- if (afterMarker) {
108
- const cleaned = afterMarker[1]!.replace(/,/g, '')
109
- const v = Number(cleaned)
110
- if (Number.isFinite(v)) return v
111
- }
112
- // Last numeric literal anywhere in the string.
113
- const matches = text.match(/-?\d[\d,]*\.?\d*/g)
114
- if (!matches || matches.length === 0) return null
115
- const last = matches[matches.length - 1]!
116
- const cleaned = last.replace(/,/g, '')
117
- const v = Number(cleaned)
118
- return Number.isFinite(v) ? v : null
119
- }
120
-
121
- const adapter = new Gsm8kAdapter()
122
-
123
- export const loadDataset = adapter.loadDataset.bind(adapter)
124
- export const evaluate = adapter.evaluate.bind(adapter)
125
- export const assignSplit = adapter.assignSplit.bind(adapter)
126
- export { Gsm8kAdapter }
@@ -1,178 +0,0 @@
1
- /**
2
- * SWE-Bench Lite wrapper — 30-instance subset.
3
- *
4
- * Status: STUB. The actual SWE-Bench harness needs a Docker host and
5
- * is too heavy to ship inside this package. We expose the contract
6
- * (loadDataset, evaluate, assignSplit) so consumers can plug in their
7
- * own grader without touching call sites.
8
- *
9
- * Wire-up paths in priority order:
10
- *
11
- * 1. `process.env.AGENT_EVAL_SWEBENCH_PATH` → JSONL with the 30
12
- * lite instances + per-instance metadata (instance_id,
13
- * problem_statement, base_commit, repo, FAIL_TO_PASS,
14
- * PASS_TO_PASS).
15
- * 2. `process.env.AGENT_EVAL_SWEBENCH_GRADER_CMD` → executable
16
- * that reads `{instance_id, patch}` JSON on stdin and writes
17
- * `{passed, fail_to_pass_passed, pass_to_pass_passed, log}`
18
- * JSON on stdout. Implementations can shell out to the
19
- * official `swebench` runner here.
20
- *
21
- * If neither is set, every public method throws a clearly-marked
22
- * "not implemented" error. The stub fails LOUD; it never silently
23
- * scores zero.
24
- */
25
-
26
- import { existsSync, readFileSync } from 'node:fs'
27
- import { spawn } from 'node:child_process'
28
-
29
- import type {
30
- BenchmarkAdapter,
31
- BenchmarkDatasetItem,
32
- BenchmarkEvaluation,
33
- } from '../../../src/benchmarks/types'
34
- import { deterministicSplit } from '../../../src/benchmarks/types'
35
- import type { RunSplitTag } from '../../../src/run-record'
36
-
37
- export interface SweBenchLitePayload {
38
- instanceId: string
39
- problemStatement: string
40
- baseCommit: string
41
- repo: string
42
- failToPass: string[]
43
- passToPass: string[]
44
- }
45
-
46
- export type SweBenchLiteItem = BenchmarkDatasetItem<SweBenchLitePayload>
47
-
48
- class SweBenchLiteAdapter
49
- implements BenchmarkAdapter<SweBenchLiteItem, SweBenchLitePayload>
50
- {
51
- async loadDataset(split: RunSplitTag): Promise<SweBenchLiteItem[]> {
52
- const path = process.env.AGENT_EVAL_SWEBENCH_PATH
53
- if (!path) {
54
- throw new Error(
55
- 'SWE-Bench Lite dataset not provided. Set AGENT_EVAL_SWEBENCH_PATH to a JSONL file ' +
56
- 'with the 30 lite instances. STUB: this wrapper does not bundle the dataset; ' +
57
- 'see https://www.swebench.com/lite.html for the canonical source.',
58
- )
59
- }
60
- if (!existsSync(path)) {
61
- throw new Error(`AGENT_EVAL_SWEBENCH_PATH=${path} does not exist`)
62
- }
63
- const all = parseJsonl(path)
64
- return all.filter((it) => assignSplitImpl(it.id) === split)
65
- }
66
-
67
- async evaluate(item: SweBenchLiteItem, response: string): Promise<BenchmarkEvaluation> {
68
- const cmd = process.env.AGENT_EVAL_SWEBENCH_GRADER_CMD
69
- if (!cmd) {
70
- throw new Error(
71
- 'SWE-Bench Lite grader not configured. Set AGENT_EVAL_SWEBENCH_GRADER_CMD to an ' +
72
- 'executable that reads {instance_id, patch} JSON on stdin and writes ' +
73
- '{passed, fail_to_pass_passed, pass_to_pass_passed, log} JSON on stdout. ' +
74
- 'TODO(swebench-lite): bundle a default Docker-based runner once the SDK ' +
75
- 'stabilises (https://github.com/swe-bench/SWE-bench).',
76
- )
77
- }
78
- const stdinPayload = JSON.stringify({ instance_id: item.payload.instanceId, patch: response })
79
- const result = await runGrader(cmd, stdinPayload)
80
- let parsed: Record<string, unknown>
81
- try {
82
- parsed = JSON.parse(result.stdout) as Record<string, unknown>
83
- } catch (e) {
84
- throw new Error(
85
- `SWE-Bench grader emitted non-JSON stdout: ${(e as Error).message}\n` +
86
- `stdout=${result.stdout.slice(0, 400)}\nstderr=${result.stderr.slice(0, 400)}`,
87
- )
88
- }
89
- const passed = Boolean(parsed.passed)
90
- return {
91
- score: passed ? 1 : 0,
92
- raw: {
93
- passed,
94
- failToPassPassed: Boolean(parsed.fail_to_pass_passed),
95
- passToPassPassed: Boolean(parsed.pass_to_pass_passed),
96
- graderLog: typeof parsed.log === 'string' ? parsed.log.slice(0, 4000) : '',
97
- },
98
- }
99
- }
100
-
101
- assignSplit(itemId: string): RunSplitTag {
102
- return assignSplitImpl(itemId)
103
- }
104
- }
105
-
106
- function assignSplitImpl(itemId: string): RunSplitTag {
107
- return deterministicSplit(`swebench-lite::${itemId}`)
108
- }
109
-
110
- function parseJsonl(path: string): SweBenchLiteItem[] {
111
- const raw = readFileSync(path, 'utf8')
112
- const out: SweBenchLiteItem[] = []
113
- let lineNo = 0
114
- for (const line of raw.split('\n')) {
115
- lineNo++
116
- const trimmed = line.trim()
117
- if (!trimmed) continue
118
- const row = JSON.parse(trimmed) as Record<string, unknown>
119
- const instanceId = String(row.instance_id ?? row.instanceId ?? '')
120
- if (!instanceId) {
121
- throw new Error(`swebench-lite line ${lineNo} missing instance_id`)
122
- }
123
- out.push({
124
- id: instanceId,
125
- payload: {
126
- instanceId,
127
- problemStatement: String(row.problem_statement ?? row.problemStatement ?? ''),
128
- baseCommit: String(row.base_commit ?? row.baseCommit ?? ''),
129
- repo: String(row.repo ?? ''),
130
- failToPass: asStringArray(row.FAIL_TO_PASS ?? row.failToPass),
131
- passToPass: asStringArray(row.PASS_TO_PASS ?? row.passToPass),
132
- },
133
- })
134
- }
135
- return out
136
- }
137
-
138
- function asStringArray(v: unknown): string[] {
139
- if (Array.isArray(v)) return v.filter((x): x is string => typeof x === 'string')
140
- if (typeof v === 'string') {
141
- try {
142
- const parsed = JSON.parse(v)
143
- if (Array.isArray(parsed)) return parsed.filter((x): x is string => typeof x === 'string')
144
- } catch {
145
- // Plain string; treat as a single-element list.
146
- return [v]
147
- }
148
- }
149
- return []
150
- }
151
-
152
- function runGrader(cmd: string, stdin: string): Promise<{ stdout: string; stderr: string }> {
153
- return new Promise((resolve, reject) => {
154
- const parts = cmd.split(/\s+/)
155
- const child = spawn(parts[0]!, parts.slice(1), { stdio: ['pipe', 'pipe', 'pipe'] })
156
- let stdout = ''
157
- let stderr = ''
158
- child.stdout.on('data', (b: Buffer) => (stdout += b.toString('utf8')))
159
- child.stderr.on('data', (b: Buffer) => (stderr += b.toString('utf8')))
160
- child.on('error', reject)
161
- child.on('close', (code) => {
162
- if (code !== 0) {
163
- reject(new Error(`grader exited with code ${code}: ${stderr.slice(0, 400)}`))
164
- return
165
- }
166
- resolve({ stdout, stderr })
167
- })
168
- child.stdin.write(stdin)
169
- child.stdin.end()
170
- })
171
- }
172
-
173
- const adapter = new SweBenchLiteAdapter()
174
-
175
- export const loadDataset = adapter.loadDataset.bind(adapter)
176
- export const evaluate = adapter.evaluate.bind(adapter)
177
- export const assignSplit = adapter.assignSplit.bind(adapter)
178
- export { SweBenchLiteAdapter }
@@ -1,114 +0,0 @@
1
- import {
2
- runMultiShotOptimization,
3
- trialTraceFromMultiShotTrial,
4
- type MultiShotVariant,
5
- type RunRecord,
6
- } from '@tangle-network/agent-eval'
7
-
8
- type Payload = {
9
- instruction: string
10
- quality: number
11
- }
12
-
13
- const baseline: MultiShotVariant<Payload> = {
14
- id: 'baseline',
15
- label: 'baseline',
16
- generation: 0,
17
- payload: {
18
- instruction: 'Complete the user task.',
19
- quality: 0.45,
20
- },
21
- }
22
-
23
- const result = await runMultiShotOptimization<Payload>({
24
- runId: 'demo-multi-shot',
25
- target: 'demo-agent-system-prompt',
26
- seedVariants: [baseline],
27
- searchScenarioIds: ['search-brief', 'search-code-review', 'search-research'],
28
- reps: 1,
29
- generations: 2,
30
- populationSize: 2,
31
- scoreConcurrency: 2,
32
- runner: {
33
- async run({ variant, scenarioId }) {
34
- return {
35
- trace: {
36
- scenarioId,
37
- turns: [
38
- { role: 'user', content: `Run ${scenarioId}` },
39
- { role: 'assistant', content: `${variant.payload.instruction} quality=${variant.payload.quality}` },
40
- ],
41
- output: `quality=${variant.payload.quality}`,
42
- },
43
- costUsd: 0.01,
44
- durationMs: 50,
45
- }
46
- },
47
- },
48
- scorer: {
49
- async score({ variant }) {
50
- return {
51
- score: variant.payload.quality,
52
- ok: true,
53
- asi: variant.payload.quality >= 0.8
54
- ? []
55
- : [{
56
- expectationId: 'complete-task',
57
- message: 'The agent did not fully complete the task.',
58
- severity: 'error',
59
- responsibleSurface: 'system-prompt',
60
- suggestion: 'Make completion criteria explicit before final response.',
61
- }],
62
- }
63
- },
64
- },
65
- mutateAdapter: {
66
- async mutate({ parent, bottomTrials, childCount, generation }) {
67
- const traces = bottomTrials.map((trial) => trialTraceFromMultiShotTrial(trial))
68
- const rationale = traces.flatMap((trace) => (trace.expectations ?? []).map((e) => e.phrase)).join('\n')
69
- return Array.from({ length: childCount }, (_, i) => ({
70
- id: `${parent.id}.g${generation}.${i}`,
71
- label: 'completion-focused',
72
- generation,
73
- payload: {
74
- instruction: `${parent.payload.instruction} Verify every requested step before final answer.`,
75
- quality: 0.9,
76
- },
77
- rationale,
78
- }))
79
- },
80
- },
81
- gate: {
82
- holdoutScenarioIds: ['holdout-brief', 'holdout-code-review', 'holdout-research'],
83
- gate: {
84
- baselineKey: 'baseline',
85
- minProductiveRuns: 3,
86
- pairedDeltaThreshold: 0,
87
- seed: 7,
88
- },
89
- toRunRecord: ({ variant, scenarioId, rep, split, seed, trial }): RunRecord => ({
90
- runId: `demo-${variant.id}-${scenarioId}-${rep}-${split}`,
91
- experimentId: scenarioId,
92
- candidateId: variant.id,
93
- seed,
94
- model: 'demo-model@2026-01-01',
95
- promptHash: 'p'.repeat(64),
96
- configHash: 'c'.repeat(64),
97
- commitSha: 'deadbeef',
98
- wallMs: trial.durationMs ?? 0,
99
- costUsd: trial.cost ?? 0,
100
- tokenUsage: { input: 1, output: 1 },
101
- outcome: {
102
- [split === 'holdout' ? 'holdoutScore' : 'searchScore']: trial.score,
103
- raw: { score: trial.score },
104
- },
105
- splitTag: split,
106
- }),
107
- },
108
- })
109
-
110
- console.log({
111
- searchBest: result.searchBestVariant.id,
112
- promoted: result.promotedVariant.id,
113
- gate: result.gate?.decision ?? null,
114
- })