@tangle-network/agent-eval 0.20.9 → 0.20.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +302 -0
- package/README.md +86 -8
- package/dist/benchmarks/index.d.ts +1 -1
- package/dist/benchmarks/index.js +1 -1
- package/dist/{chunk-XDGJUIV2.js → chunk-42I2QC2L.js} +1 -1
- package/dist/chunk-42I2QC2L.js.map +1 -0
- package/dist/{chunk-CJJSB6ZQ.js → chunk-LSR4IAYN.js} +90 -11
- package/dist/chunk-LSR4IAYN.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/{index-CEWY1rmu.d.ts → index-1PZOtZFr.d.ts} +2 -2
- package/dist/index.d.ts +2 -2
- package/dist/index.js +63 -14
- package/dist/index.js.map +1 -1
- package/dist/openapi.json +50 -25
- package/dist/{sink-fetch-C0B8ximv.d.ts → sink-fetch-B1Yg4Til.d.ts} +1 -1
- package/dist/telemetry/file.d.ts +1 -1
- package/dist/telemetry/index.d.ts +2 -2
- package/dist/telemetry/index.js.map +1 -1
- package/dist/wire/index.js +1 -1
- package/docs/product-eval-adoption.md +194 -0
- package/docs/wire-protocol.md +2 -2
- package/package.json +5 -4
- package/dist/chunk-CJJSB6ZQ.js.map +0 -1
- package/dist/chunk-XDGJUIV2.js.map +0 -1
package/dist/openapi.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"openapi": "3.1.0",
|
|
3
3
|
"info": {
|
|
4
4
|
"title": "@tangle-network/agent-eval — wire protocol",
|
|
5
|
-
"version": "0.20.
|
|
5
|
+
"version": "0.20.11",
|
|
6
6
|
"description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
|
|
7
7
|
"contact": {
|
|
8
8
|
"name": "Tangle Network",
|
|
@@ -21,33 +21,58 @@
|
|
|
21
21
|
"components": {
|
|
22
22
|
"schemas": {
|
|
23
23
|
"JudgeRequest": {
|
|
24
|
-
"
|
|
25
|
-
|
|
26
|
-
"rubricName": {
|
|
27
|
-
"type": "string",
|
|
28
|
-
"description": "Use a built-in rubric by name. Mutually exclusive with `rubric`."
|
|
29
|
-
},
|
|
30
|
-
"rubric": {
|
|
31
|
-
"$ref": "#/components/schemas/Rubric"
|
|
32
|
-
},
|
|
33
|
-
"content": {
|
|
34
|
-
"type": "string",
|
|
35
|
-
"minLength": 1,
|
|
36
|
-
"description": "The text being judged — a tweet, a blog post, a code snippet, anything stringly."
|
|
37
|
-
},
|
|
38
|
-
"context": {
|
|
24
|
+
"oneOf": [
|
|
25
|
+
{
|
|
39
26
|
"type": "object",
|
|
40
|
-
"additionalProperties":
|
|
41
|
-
"
|
|
27
|
+
"additionalProperties": false,
|
|
28
|
+
"required": [
|
|
29
|
+
"rubricName",
|
|
30
|
+
"content"
|
|
31
|
+
],
|
|
32
|
+
"properties": {
|
|
33
|
+
"rubricName": {
|
|
34
|
+
"type": "string",
|
|
35
|
+
"minLength": 1
|
|
36
|
+
},
|
|
37
|
+
"content": {
|
|
38
|
+
"type": "string",
|
|
39
|
+
"minLength": 1
|
|
40
|
+
},
|
|
41
|
+
"context": {
|
|
42
|
+
"type": "object",
|
|
43
|
+
"additionalProperties": true
|
|
44
|
+
},
|
|
45
|
+
"model": {
|
|
46
|
+
"type": "string"
|
|
47
|
+
}
|
|
48
|
+
}
|
|
42
49
|
},
|
|
43
|
-
|
|
44
|
-
"type": "
|
|
45
|
-
"
|
|
50
|
+
{
|
|
51
|
+
"type": "object",
|
|
52
|
+
"additionalProperties": false,
|
|
53
|
+
"required": [
|
|
54
|
+
"rubric",
|
|
55
|
+
"content"
|
|
56
|
+
],
|
|
57
|
+
"properties": {
|
|
58
|
+
"rubric": {
|
|
59
|
+
"$ref": "#/components/schemas/Rubric"
|
|
60
|
+
},
|
|
61
|
+
"content": {
|
|
62
|
+
"type": "string",
|
|
63
|
+
"minLength": 1
|
|
64
|
+
},
|
|
65
|
+
"context": {
|
|
66
|
+
"type": "object",
|
|
67
|
+
"additionalProperties": true
|
|
68
|
+
},
|
|
69
|
+
"model": {
|
|
70
|
+
"type": "string"
|
|
71
|
+
}
|
|
72
|
+
}
|
|
46
73
|
}
|
|
47
|
-
|
|
48
|
-
"
|
|
49
|
-
"content"
|
|
50
|
-
]
|
|
74
|
+
],
|
|
75
|
+
"description": "Judge request. Provide exactly one of rubricName or rubric."
|
|
51
76
|
},
|
|
52
77
|
"Rubric": {
|
|
53
78
|
"type": "object",
|
|
@@ -64,7 +64,7 @@ interface TelemetryModel {
|
|
|
64
64
|
* `child_process`. Safe to import from a Cloudflare Worker, Lambda, edge
|
|
65
65
|
* function, or browser extension.
|
|
66
66
|
*
|
|
67
|
-
* For Node-only file persistence, import from '
|
|
67
|
+
* For Node-only file persistence, import from '@tangle-network/agent-eval/telemetry/file'.
|
|
68
68
|
*/
|
|
69
69
|
|
|
70
70
|
interface TelemetrySink {
|
package/dist/telemetry/file.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { T as TelemetryKind, a as TelemetryModel, b as TelemetrySource, c as TelemetrySink } from '../sink-fetch-
|
|
2
|
-
export { F as FanoutTelemetrySink, H as HttpTelemetrySink, I as InMemoryTelemetrySink, N as NullTelemetrySink, d as TELEMETRY_SCHEMA_VERSION, e as TelemetryEnvelope } from '../sink-fetch-
|
|
1
|
+
import { T as TelemetryKind, a as TelemetryModel, b as TelemetrySource, c as TelemetrySink } from '../sink-fetch-B1Yg4Til.js';
|
|
2
|
+
export { F as FanoutTelemetrySink, H as HttpTelemetrySink, I as InMemoryTelemetrySink, N as NullTelemetrySink, d as TELEMETRY_SCHEMA_VERSION, e as TelemetryEnvelope } from '../sink-fetch-B1Yg4Til.js';
|
|
3
3
|
|
|
4
4
|
/**
|
|
5
5
|
* Telemetry client — thin wrapper that builds envelopes from `EmitArgs` and
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/telemetry/schema.ts","../../src/telemetry/sink-fetch.ts","../../src/telemetry/client.ts"],"sourcesContent":["/**\n * Fleet telemetry envelope — agent-eval's portable observability shape.\n *\n * Designed so any consumer (Node CLI, Cloudflare Worker, Lambda, browser\n * extension) can emit structured rows describing one unit of work — a page\n * audit, a tool call, an evolve round, a full agent run — to a central sink.\n *\n * The schema is intentionally a strict superset of agent-eval's `Run` shape\n * so a future TraceStore adapter can promote envelopes into traces without\n * translation.\n */\n\nexport const TELEMETRY_SCHEMA_VERSION = 1\n\n/** Discriminator for the unit of work this envelope describes. */\nexport type TelemetryKind =\n | 'agent-run'\n | 'design-audit-page'\n | 'design-audit-run'\n | 'design-evolve-round'\n | 'design-evolve-run'\n | 'gepa-trial'\n | 'gepa-generation'\n | 'tool-call'\n | 'judge-verdict'\n | 'custom'\n\nexport interface TelemetryEnvelope {\n schemaVersion: typeof TELEMETRY_SCHEMA_VERSION\n envelopeId: string\n runId: string\n timestamp: string\n parentRunId?: string\n\n source: TelemetrySource\n model?: TelemetryModel\n kind: TelemetryKind\n ok: boolean\n durationMs: number\n\n data: Record<string, unknown>\n metrics: Record<string, number>\n tags?: Record<string, string>\n\n error?: string\n}\n\nexport interface TelemetrySource {\n /** Repo identity — basename of cwd plus git remote if discoverable. */\n repo: string\n cwd: string\n gitSha?: string\n gitBranch?: string\n cliVersion: string\n /** What was invoked, e.g. `design-audit`, `bad run`, `gepa --target`. */\n invocation: string\n /** Sanitised argv minus secrets. */\n argv?: string[]\n /**\n * Multi-tenant identity. Set when the consumer runs inside a hosted\n * product so a fleet rollup can group by tenant without leaking customer\n * URLs or PII.\n */\n tenantId?: string\n /** Optional sub-tenant identity (project, suite, walkthrough, customer). */\n customerId?: string\n /** SHA-256 (12 hex) of the API key used to authenticate this run, when applicable. */\n apiKeyHash?: string\n}\n\nexport interface TelemetryModel {\n provider: string\n name: string\n /** SHA-256 (12 hex chars) of the prompt(s) used. */\n promptHash?: string\n /** SHA-256 (12 hex chars) of the composed rubric body, if applicable. */\n rubricHash?: string\n}\n","/**\n * Workers-safe telemetry sinks — only `fetch` and pure JS. No `fs`, no\n * `child_process`. Safe to import from a Cloudflare Worker, Lambda, edge\n * function, or browser extension.\n *\n * For Node-only file persistence, import from './sink-file' instead.\n */\n\nimport type { TelemetryEnvelope } from './schema'\n\nexport interface TelemetrySink {\n emit(envelope: TelemetryEnvelope): Promise<void> | void\n close?(): Promise<void> | void\n}\n\n/** Best-effort POST to a remote collector. Fire-and-forget; never throws. */\nexport class HttpTelemetrySink implements TelemetrySink {\n private inflight = new Set<Promise<void>>()\n\n constructor(\n private readonly endpoint: string,\n private readonly bearer?: string,\n ) {}\n\n emit(envelope: TelemetryEnvelope): void {\n const body = JSON.stringify(envelope)\n const headers: Record<string, string> = { 'content-type': 'application/json' }\n if (this.bearer) headers.authorization = `Bearer ${this.bearer}`\n const promise = fetch(this.endpoint, { method: 'POST', headers, body })\n .then(() => undefined)\n .catch(() => undefined)\n this.inflight.add(promise)\n promise.finally(() => this.inflight.delete(promise))\n }\n\n async close(): Promise<void> {\n await Promise.allSettled(Array.from(this.inflight))\n }\n}\n\n/** Fanout to multiple sinks — failures in one do not affect others. */\nexport class FanoutTelemetrySink implements TelemetrySink {\n constructor(private readonly sinks: TelemetrySink[]) {}\n\n emit(envelope: TelemetryEnvelope): void {\n for (const sink of this.sinks) {\n try {\n const result = sink.emit(envelope)\n if (result && typeof (result as Promise<unknown>).catch === 'function') {\n ;(result as Promise<unknown>).catch(() => undefined)\n }\n } catch {\n // swallow — telemetry must never break a run\n }\n }\n }\n\n async close(): Promise<void> {\n await Promise.allSettled(this.sinks.map((s) => Promise.resolve(s.close?.())))\n }\n}\n\n/** No-op sink — used when telemetry is explicitly disabled. */\nexport class NullTelemetrySink implements TelemetrySink {\n emit(): void {}\n}\n\n/** In-memory sink — useful for tests + downstream adapters. */\nexport class InMemoryTelemetrySink implements TelemetrySink {\n readonly envelopes: TelemetryEnvelope[] = []\n emit(envelope: TelemetryEnvelope): void {\n this.envelopes.push(envelope)\n }\n clear(): void { this.envelopes.length = 0 }\n}\n","/**\n * Telemetry client — thin wrapper that builds envelopes from `EmitArgs` and\n * delegates to a `TelemetrySink`. Pure logic; no I/O. Use this from any\n * runtime — Workers, Node, browser — and choose the sink accordingly.\n *\n * For an opinionated singleton with env-var-driven sink wiring (the bad CLI\n * pattern), see `./node-client.ts`.\n */\n\nimport type { TelemetryEnvelope, TelemetryKind, TelemetryModel, TelemetrySource } from './schema'\nimport { TELEMETRY_SCHEMA_VERSION } from './schema'\nimport type { TelemetrySink } from './sink-fetch'\n\nexport interface EmitArgs {\n kind: TelemetryKind\n runId: string\n parentRunId?: string\n ok: boolean\n durationMs: number\n data?: Record<string, unknown>\n metrics?: Record<string, number>\n tags?: Record<string, string>\n model?: TelemetryModel\n error?: string\n /** Override the source for this envelope. Falls back to `defaultSource`. */\n source?: TelemetrySource\n}\n\nexport class TelemetryClient {\n constructor(\n private readonly sink: TelemetrySink,\n private readonly defaultSource: TelemetrySource,\n ) {}\n\n emit(args: EmitArgs): void {\n const envelope: TelemetryEnvelope = {\n schemaVersion: TELEMETRY_SCHEMA_VERSION,\n envelopeId: makeEnvelopeId(),\n runId: args.runId,\n timestamp: new Date().toISOString(),\n source: args.source ?? this.defaultSource,\n kind: args.kind,\n ok: args.ok,\n durationMs: args.durationMs,\n data: args.data ?? {},\n metrics: args.metrics ?? {},\n ...(args.parentRunId ? { parentRunId: args.parentRunId } : {}),\n ...(args.model ? { model: args.model } : {}),\n ...(args.tags ? { tags: args.tags } : {}),\n ...(args.error ? { error: args.error } : {}),\n }\n try {\n this.sink.emit(envelope)\n } catch {\n // swallow — telemetry never breaks the calling code path\n }\n }\n\n async close(): Promise<void> {\n await this.sink.close?.()\n }\n}\n\n/** Generate a UUIDv4 with whatever crypto is available (Node, Workers, browsers). */\nfunction makeEnvelopeId(): string {\n if (typeof crypto !== 'undefined' && typeof crypto.randomUUID === 'function') {\n return crypto.randomUUID()\n }\n // Last-resort fallback. Lower entropy but never throws.\n return 'env-' + Date.now().toString(36) + '-' + Math.random().toString(36).slice(2, 10)\n}\n\nexport const SECRET_FLAGS = new Set(['--api-key', '--bearer', '--token', '--password'])\n\n/** Strip likely-secret values from argv, preserving structure. */\nexport function sanitiseArgv(argv: string[]): string[] {\n const out: string[] = []\n for (let i = 0; i < argv.length; i++) {\n const a = argv[i]!\n if (SECRET_FLAGS.has(a)) {\n out.push(a, '<redacted>')\n i++\n continue\n }\n if (/^(?:--api-key|--bearer|--token|--password)=/.test(a)) {\n out.push(a.replace(/=.*$/, '=<redacted>'))\n continue\n }\n out.push(a)\n }\n return out\n}\n"],"mappings":";;;AAYO,IAAM,2BAA2B;;;ACIjC,IAAM,oBAAN,MAAiD;AAAA,EAGtD,YACmB,UACA,QACjB;AAFiB;AACA;AAAA,EAChB;AAAA,EAFgB;AAAA,EACA;AAAA,EAJX,WAAW,oBAAI,IAAmB;AAAA,EAO1C,KAAK,UAAmC;AACtC,UAAM,OAAO,KAAK,UAAU,QAAQ;AACpC,UAAM,UAAkC,EAAE,gBAAgB,mBAAmB;AAC7E,QAAI,KAAK,OAAQ,SAAQ,gBAAgB,UAAU,KAAK,MAAM;AAC9D,UAAM,UAAU,MAAM,KAAK,UAAU,EAAE,QAAQ,QAAQ,SAAS,KAAK,CAAC,EACnE,KAAK,MAAM,MAAS,EACpB,MAAM,MAAM,MAAS;AACxB,SAAK,SAAS,IAAI,OAAO;AACzB,YAAQ,QAAQ,MAAM,KAAK,SAAS,OAAO,OAAO,CAAC;AAAA,EACrD;AAAA,EAEA,MAAM,QAAuB;AAC3B,UAAM,QAAQ,WAAW,MAAM,KAAK,KAAK,QAAQ,CAAC;AAAA,EACpD;AACF;AAGO,IAAM,sBAAN,MAAmD;AAAA,EACxD,YAA6B,OAAwB;AAAxB;AAAA,EAAyB;AAAA,EAAzB;AAAA,EAE7B,KAAK,UAAmC;AACtC,eAAW,QAAQ,KAAK,OAAO;AAC7B,UAAI;AACF,cAAM,SAAS,KAAK,KAAK,QAAQ;AACjC,YAAI,UAAU,OAAQ,OAA4B,UAAU,YAAY;AACtE;AAAC,UAAC,OAA4B,MAAM,MAAM,MAAS;AAAA,QACrD;AAAA,MACF,QAAQ;AAAA,MAER;AAAA,IACF;AAAA,EACF;AAAA,EAEA,MAAM,QAAuB;AAC3B,UAAM,QAAQ,WAAW,KAAK,MAAM,IAAI,CAAC,MAAM,QAAQ,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAAA,EAC9E;AACF;AAGO,IAAM,oBAAN,MAAiD;AAAA,EACtD,OAAa;AAAA,EAAC;AAChB;AAGO,IAAM,wBAAN,MAAqD;AAAA,EACjD,YAAiC,CAAC;AAAA,EAC3C,KAAK,UAAmC;AACtC,SAAK,UAAU,KAAK,QAAQ;AAAA,EAC9B;AAAA,EACA,QAAc;AAAE,SAAK,UAAU,SAAS;AAAA,EAAE;AAC5C;;;AC9CO,IAAM,kBAAN,MAAsB;AAAA,EAC3B,YACmB,MACA,eACjB;AAFiB;AACA;AAAA,EAChB;AAAA,EAFgB;AAAA,EACA;AAAA,EAGnB,KAAK,MAAsB;AACzB,UAAM,WAA8B;AAAA,MAClC,eAAe;AAAA,MACf,YAAY,eAAe;AAAA,MAC3B,OAAO,KAAK;AAAA,MACZ,YAAW,oBAAI,KAAK,GAAE,YAAY;AAAA,MAClC,QAAQ,KAAK,UAAU,KAAK;AAAA,MAC5B,MAAM,KAAK;AAAA,MACX,IAAI,KAAK;AAAA,MACT,YAAY,KAAK;AAAA,MACjB,MAAM,KAAK,QAAQ,CAAC;AAAA,MACpB,SAAS,KAAK,WAAW,CAAC;AAAA,MAC1B,GAAI,KAAK,cAAc,EAAE,aAAa,KAAK,YAAY,IAAI,CAAC;AAAA,MAC5D,GAAI,KAAK,QAAQ,EAAE,OAAO,KAAK,MAAM,IAAI,CAAC;AAAA,MAC1C,GAAI,KAAK,OAAO,EAAE,MAAM,KAAK,KAAK,IAAI,CAAC;AAAA,MACvC,GAAI,KAAK,QAAQ,EAAE,OAAO,KAAK,MAAM,IAAI,CAAC;AAAA,IAC5C;AACA,QAAI;AACF,WAAK,KAAK,KAAK,QAAQ;AAAA,IACzB,QAAQ;AAAA,IAER;AAAA,EACF;AAAA,EAEA,MAAM,QAAuB;AAC3B,UAAM,KAAK,KAAK,QAAQ;AAAA,EAC1B;AACF;AAGA,SAAS,iBAAyB;AAChC,MAAI,OAAO,WAAW,eAAe,OAAO,OAAO,eAAe,YAAY;AAC5E,WAAO,OAAO,WAAW;AAAA,EAC3B;AAEA,SAAO,SAAS,KAAK,IAAI,EAAE,SAAS,EAAE,IAAI,MAAM,KAAK,OAAO,EAAE,SAAS,EAAE,EAAE,MAAM,GAAG,EAAE;AACxF;AAEO,IAAM,eAAe,oBAAI,IAAI,CAAC,aAAa,YAAY,WAAW,YAAY,CAAC;AAG/E,SAAS,aAAa,MAA0B;AACrD,QAAM,MAAgB,CAAC;AACvB,WAAS,IAAI,GAAG,IAAI,KAAK,QAAQ,KAAK;AACpC,UAAM,IAAI,KAAK,CAAC;AAChB,QAAI,aAAa,IAAI,CAAC,GAAG;AACvB,UAAI,KAAK,GAAG,YAAY;AACxB;AACA;AAAA,IACF;AACA,QAAI,8CAA8C,KAAK,CAAC,GAAG;AACzD,UAAI,KAAK,EAAE,QAAQ,QAAQ,aAAa,CAAC;AACzC;AAAA,IACF;AACA,QAAI,KAAK,CAAC;AAAA,EACZ;AACA,SAAO;AACT;","names":[]}
|
|
1
|
+
{"version":3,"sources":["../../src/telemetry/schema.ts","../../src/telemetry/sink-fetch.ts","../../src/telemetry/client.ts"],"sourcesContent":["/**\n * Fleet telemetry envelope — agent-eval's portable observability shape.\n *\n * Designed so any consumer (Node CLI, Cloudflare Worker, Lambda, browser\n * extension) can emit structured rows describing one unit of work — a page\n * audit, a tool call, an evolve round, a full agent run — to a central sink.\n *\n * The schema is intentionally a strict superset of agent-eval's `Run` shape\n * so a future TraceStore adapter can promote envelopes into traces without\n * translation.\n */\n\nexport const TELEMETRY_SCHEMA_VERSION = 1\n\n/** Discriminator for the unit of work this envelope describes. */\nexport type TelemetryKind =\n | 'agent-run'\n | 'design-audit-page'\n | 'design-audit-run'\n | 'design-evolve-round'\n | 'design-evolve-run'\n | 'gepa-trial'\n | 'gepa-generation'\n | 'tool-call'\n | 'judge-verdict'\n | 'custom'\n\nexport interface TelemetryEnvelope {\n schemaVersion: typeof TELEMETRY_SCHEMA_VERSION\n envelopeId: string\n runId: string\n timestamp: string\n parentRunId?: string\n\n source: TelemetrySource\n model?: TelemetryModel\n kind: TelemetryKind\n ok: boolean\n durationMs: number\n\n data: Record<string, unknown>\n metrics: Record<string, number>\n tags?: Record<string, string>\n\n error?: string\n}\n\nexport interface TelemetrySource {\n /** Repo identity — basename of cwd plus git remote if discoverable. */\n repo: string\n cwd: string\n gitSha?: string\n gitBranch?: string\n cliVersion: string\n /** What was invoked, e.g. `design-audit`, `bad run`, `gepa --target`. */\n invocation: string\n /** Sanitised argv minus secrets. */\n argv?: string[]\n /**\n * Multi-tenant identity. Set when the consumer runs inside a hosted\n * product so a fleet rollup can group by tenant without leaking customer\n * URLs or PII.\n */\n tenantId?: string\n /** Optional sub-tenant identity (project, suite, walkthrough, customer). */\n customerId?: string\n /** SHA-256 (12 hex) of the API key used to authenticate this run, when applicable. */\n apiKeyHash?: string\n}\n\nexport interface TelemetryModel {\n provider: string\n name: string\n /** SHA-256 (12 hex chars) of the prompt(s) used. */\n promptHash?: string\n /** SHA-256 (12 hex chars) of the composed rubric body, if applicable. */\n rubricHash?: string\n}\n","/**\n * Workers-safe telemetry sinks — only `fetch` and pure JS. No `fs`, no\n * `child_process`. Safe to import from a Cloudflare Worker, Lambda, edge\n * function, or browser extension.\n *\n * For Node-only file persistence, import from '@tangle-network/agent-eval/telemetry/file'.\n */\n\nimport type { TelemetryEnvelope } from './schema'\n\nexport interface TelemetrySink {\n emit(envelope: TelemetryEnvelope): Promise<void> | void\n close?(): Promise<void> | void\n}\n\n/** Best-effort POST to a remote collector. Fire-and-forget; never throws. */\nexport class HttpTelemetrySink implements TelemetrySink {\n private inflight = new Set<Promise<void>>()\n\n constructor(\n private readonly endpoint: string,\n private readonly bearer?: string,\n ) {}\n\n emit(envelope: TelemetryEnvelope): void {\n const body = JSON.stringify(envelope)\n const headers: Record<string, string> = { 'content-type': 'application/json' }\n if (this.bearer) headers.authorization = `Bearer ${this.bearer}`\n const promise = fetch(this.endpoint, { method: 'POST', headers, body })\n .then(() => undefined)\n .catch(() => undefined)\n this.inflight.add(promise)\n promise.finally(() => this.inflight.delete(promise))\n }\n\n async close(): Promise<void> {\n await Promise.allSettled(Array.from(this.inflight))\n }\n}\n\n/** Fanout to multiple sinks — failures in one do not affect others. */\nexport class FanoutTelemetrySink implements TelemetrySink {\n constructor(private readonly sinks: TelemetrySink[]) {}\n\n emit(envelope: TelemetryEnvelope): void {\n for (const sink of this.sinks) {\n try {\n const result = sink.emit(envelope)\n if (result && typeof (result as Promise<unknown>).catch === 'function') {\n ;(result as Promise<unknown>).catch(() => undefined)\n }\n } catch {\n // swallow — telemetry must never break a run\n }\n }\n }\n\n async close(): Promise<void> {\n await Promise.allSettled(this.sinks.map((s) => Promise.resolve(s.close?.())))\n }\n}\n\n/** No-op sink — used when telemetry is explicitly disabled. */\nexport class NullTelemetrySink implements TelemetrySink {\n emit(): void {}\n}\n\n/** In-memory sink — useful for tests + downstream adapters. */\nexport class InMemoryTelemetrySink implements TelemetrySink {\n readonly envelopes: TelemetryEnvelope[] = []\n emit(envelope: TelemetryEnvelope): void {\n this.envelopes.push(envelope)\n }\n clear(): void { this.envelopes.length = 0 }\n}\n","/**\n * Telemetry client — thin wrapper that builds envelopes from `EmitArgs` and\n * delegates to a `TelemetrySink`. Pure logic; no I/O. Use this from any\n * runtime — Workers, Node, browser — and choose the sink accordingly.\n *\n * For an opinionated singleton with env-var-driven sink wiring (the bad CLI\n * pattern), see `./node-client.ts`.\n */\n\nimport type { TelemetryEnvelope, TelemetryKind, TelemetryModel, TelemetrySource } from './schema'\nimport { TELEMETRY_SCHEMA_VERSION } from './schema'\nimport type { TelemetrySink } from './sink-fetch'\n\nexport interface EmitArgs {\n kind: TelemetryKind\n runId: string\n parentRunId?: string\n ok: boolean\n durationMs: number\n data?: Record<string, unknown>\n metrics?: Record<string, number>\n tags?: Record<string, string>\n model?: TelemetryModel\n error?: string\n /** Override the source for this envelope. Falls back to `defaultSource`. */\n source?: TelemetrySource\n}\n\nexport class TelemetryClient {\n constructor(\n private readonly sink: TelemetrySink,\n private readonly defaultSource: TelemetrySource,\n ) {}\n\n emit(args: EmitArgs): void {\n const envelope: TelemetryEnvelope = {\n schemaVersion: TELEMETRY_SCHEMA_VERSION,\n envelopeId: makeEnvelopeId(),\n runId: args.runId,\n timestamp: new Date().toISOString(),\n source: args.source ?? this.defaultSource,\n kind: args.kind,\n ok: args.ok,\n durationMs: args.durationMs,\n data: args.data ?? {},\n metrics: args.metrics ?? {},\n ...(args.parentRunId ? { parentRunId: args.parentRunId } : {}),\n ...(args.model ? { model: args.model } : {}),\n ...(args.tags ? { tags: args.tags } : {}),\n ...(args.error ? { error: args.error } : {}),\n }\n try {\n this.sink.emit(envelope)\n } catch {\n // swallow — telemetry never breaks the calling code path\n }\n }\n\n async close(): Promise<void> {\n await this.sink.close?.()\n }\n}\n\n/** Generate a UUIDv4 with whatever crypto is available (Node, Workers, browsers). */\nfunction makeEnvelopeId(): string {\n if (typeof crypto !== 'undefined' && typeof crypto.randomUUID === 'function') {\n return crypto.randomUUID()\n }\n // Last-resort fallback. Lower entropy but never throws.\n return 'env-' + Date.now().toString(36) + '-' + Math.random().toString(36).slice(2, 10)\n}\n\nexport const SECRET_FLAGS = new Set(['--api-key', '--bearer', '--token', '--password'])\n\n/** Strip likely-secret values from argv, preserving structure. */\nexport function sanitiseArgv(argv: string[]): string[] {\n const out: string[] = []\n for (let i = 0; i < argv.length; i++) {\n const a = argv[i]!\n if (SECRET_FLAGS.has(a)) {\n out.push(a, '<redacted>')\n i++\n continue\n }\n if (/^(?:--api-key|--bearer|--token|--password)=/.test(a)) {\n out.push(a.replace(/=.*$/, '=<redacted>'))\n continue\n }\n out.push(a)\n }\n return out\n}\n"],"mappings":";;;AAYO,IAAM,2BAA2B;;;ACIjC,IAAM,oBAAN,MAAiD;AAAA,EAGtD,YACmB,UACA,QACjB;AAFiB;AACA;AAAA,EAChB;AAAA,EAFgB;AAAA,EACA;AAAA,EAJX,WAAW,oBAAI,IAAmB;AAAA,EAO1C,KAAK,UAAmC;AACtC,UAAM,OAAO,KAAK,UAAU,QAAQ;AACpC,UAAM,UAAkC,EAAE,gBAAgB,mBAAmB;AAC7E,QAAI,KAAK,OAAQ,SAAQ,gBAAgB,UAAU,KAAK,MAAM;AAC9D,UAAM,UAAU,MAAM,KAAK,UAAU,EAAE,QAAQ,QAAQ,SAAS,KAAK,CAAC,EACnE,KAAK,MAAM,MAAS,EACpB,MAAM,MAAM,MAAS;AACxB,SAAK,SAAS,IAAI,OAAO;AACzB,YAAQ,QAAQ,MAAM,KAAK,SAAS,OAAO,OAAO,CAAC;AAAA,EACrD;AAAA,EAEA,MAAM,QAAuB;AAC3B,UAAM,QAAQ,WAAW,MAAM,KAAK,KAAK,QAAQ,CAAC;AAAA,EACpD;AACF;AAGO,IAAM,sBAAN,MAAmD;AAAA,EACxD,YAA6B,OAAwB;AAAxB;AAAA,EAAyB;AAAA,EAAzB;AAAA,EAE7B,KAAK,UAAmC;AACtC,eAAW,QAAQ,KAAK,OAAO;AAC7B,UAAI;AACF,cAAM,SAAS,KAAK,KAAK,QAAQ;AACjC,YAAI,UAAU,OAAQ,OAA4B,UAAU,YAAY;AACtE;AAAC,UAAC,OAA4B,MAAM,MAAM,MAAS;AAAA,QACrD;AAAA,MACF,QAAQ;AAAA,MAER;AAAA,IACF;AAAA,EACF;AAAA,EAEA,MAAM,QAAuB;AAC3B,UAAM,QAAQ,WAAW,KAAK,MAAM,IAAI,CAAC,MAAM,QAAQ,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAAA,EAC9E;AACF;AAGO,IAAM,oBAAN,MAAiD;AAAA,EACtD,OAAa;AAAA,EAAC;AAChB;AAGO,IAAM,wBAAN,MAAqD;AAAA,EACjD,YAAiC,CAAC;AAAA,EAC3C,KAAK,UAAmC;AACtC,SAAK,UAAU,KAAK,QAAQ;AAAA,EAC9B;AAAA,EACA,QAAc;AAAE,SAAK,UAAU,SAAS;AAAA,EAAE;AAC5C;;;AC9CO,IAAM,kBAAN,MAAsB;AAAA,EAC3B,YACmB,MACA,eACjB;AAFiB;AACA;AAAA,EAChB;AAAA,EAFgB;AAAA,EACA;AAAA,EAGnB,KAAK,MAAsB;AACzB,UAAM,WAA8B;AAAA,MAClC,eAAe;AAAA,MACf,YAAY,eAAe;AAAA,MAC3B,OAAO,KAAK;AAAA,MACZ,YAAW,oBAAI,KAAK,GAAE,YAAY;AAAA,MAClC,QAAQ,KAAK,UAAU,KAAK;AAAA,MAC5B,MAAM,KAAK;AAAA,MACX,IAAI,KAAK;AAAA,MACT,YAAY,KAAK;AAAA,MACjB,MAAM,KAAK,QAAQ,CAAC;AAAA,MACpB,SAAS,KAAK,WAAW,CAAC;AAAA,MAC1B,GAAI,KAAK,cAAc,EAAE,aAAa,KAAK,YAAY,IAAI,CAAC;AAAA,MAC5D,GAAI,KAAK,QAAQ,EAAE,OAAO,KAAK,MAAM,IAAI,CAAC;AAAA,MAC1C,GAAI,KAAK,OAAO,EAAE,MAAM,KAAK,KAAK,IAAI,CAAC;AAAA,MACvC,GAAI,KAAK,QAAQ,EAAE,OAAO,KAAK,MAAM,IAAI,CAAC;AAAA,IAC5C;AACA,QAAI;AACF,WAAK,KAAK,KAAK,QAAQ;AAAA,IACzB,QAAQ;AAAA,IAER;AAAA,EACF;AAAA,EAEA,MAAM,QAAuB;AAC3B,UAAM,KAAK,KAAK,QAAQ;AAAA,EAC1B;AACF;AAGA,SAAS,iBAAyB;AAChC,MAAI,OAAO,WAAW,eAAe,OAAO,OAAO,eAAe,YAAY;AAC5E,WAAO,OAAO,WAAW;AAAA,EAC3B;AAEA,SAAO,SAAS,KAAK,IAAI,EAAE,SAAS,EAAE,IAAI,MAAM,KAAK,OAAO,EAAE,SAAS,EAAE,EAAE,MAAM,GAAG,EAAE;AACxF;AAEO,IAAM,eAAe,oBAAI,IAAI,CAAC,aAAa,YAAY,WAAW,YAAY,CAAC;AAG/E,SAAS,aAAa,MAA0B;AACrD,QAAM,MAAgB,CAAC;AACvB,WAAS,IAAI,GAAG,IAAI,KAAK,QAAQ,KAAK;AACpC,UAAM,IAAI,KAAK,CAAC;AAChB,QAAI,aAAa,IAAI,CAAC,GAAG;AACvB,UAAI,KAAK,GAAG,YAAY;AACxB;AACA;AAAA,IACF;AACA,QAAI,8CAA8C,KAAK,CAAC,GAAG;AACzD,UAAI,KAAK,EAAE,QAAQ,QAAQ,aAAa,CAAC;AACzC;AAAA,IACF;AACA,QAAI,KAAK,CAAC;AAAA,EACZ;AACA,SAAO;AACT;","names":[]}
|
package/dist/wire/index.js
CHANGED
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
# Product Eval Adoption
|
|
2
|
+
|
|
3
|
+
This guide is for teams adding `@tangle-network/agent-eval` to a real agent
|
|
4
|
+
product. The package supplies evaluation contracts and runtime primitives. Your
|
|
5
|
+
product supplies the actual workflow adapter, state, credentials, tools, UI, and
|
|
6
|
+
storage.
|
|
7
|
+
|
|
8
|
+
## Goal
|
|
9
|
+
|
|
10
|
+
Use the same loop for production, replay, and optimization:
|
|
11
|
+
|
|
12
|
+
```txt
|
|
13
|
+
real user task
|
|
14
|
+
-> product adapter observes state
|
|
15
|
+
-> validators and judges grade state
|
|
16
|
+
-> control loop decides next action
|
|
17
|
+
-> product agent acts in the real environment
|
|
18
|
+
-> trace + feedback trajectory are stored
|
|
19
|
+
-> datasets and optimizers replay the same adapter
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
If production and eval use different loops, benchmark gains will not transfer.
|
|
23
|
+
|
|
24
|
+
## What The Product Owns
|
|
25
|
+
|
|
26
|
+
The product owns:
|
|
27
|
+
|
|
28
|
+
- task state and domain models
|
|
29
|
+
- credentials, tenant policy, approval, and side-effect rules
|
|
30
|
+
- browser, sandbox, CLI, connector, or voice drivers
|
|
31
|
+
- database and trace persistence
|
|
32
|
+
- user/reviewer feedback collection
|
|
33
|
+
- deployment and live canary routing
|
|
34
|
+
- model gateway configuration
|
|
35
|
+
|
|
36
|
+
`agent-eval` owns:
|
|
37
|
+
|
|
38
|
+
- trace, run, dataset, feedback, and score contracts
|
|
39
|
+
- control-loop mechanics
|
|
40
|
+
- verifier and judge orchestration
|
|
41
|
+
- failure taxonomy
|
|
42
|
+
- paired statistics and holdout gates
|
|
43
|
+
- optimizer inputs and promotion reports
|
|
44
|
+
|
|
45
|
+
## Minimal Production Adapter
|
|
46
|
+
|
|
47
|
+
Start with a small adapter that mirrors one real workflow.
|
|
48
|
+
|
|
49
|
+
```ts
|
|
50
|
+
interface ProductEvalAdapter<TState, TAction> {
|
|
51
|
+
observe(taskId: string): Promise<TState>
|
|
52
|
+
validate(state: TState): Promise<ControlEvalResult[]>
|
|
53
|
+
decide(input: {
|
|
54
|
+
state: TState
|
|
55
|
+
evals: ControlEvalResult[]
|
|
56
|
+
history: unknown[]
|
|
57
|
+
}): Promise<TAction | 'stop'>
|
|
58
|
+
act(taskId: string, action: TAction): Promise<void>
|
|
59
|
+
}
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
Keep the adapter product-owned until at least two products need the same shape.
|
|
63
|
+
|
|
64
|
+
## Validator Order
|
|
65
|
+
|
|
66
|
+
Use deterministic checks before judges.
|
|
67
|
+
|
|
68
|
+
1. **State validity**: schema, required files, required DB rows, required
|
|
69
|
+
connections.
|
|
70
|
+
2. **Runtime gates**: install, build, typecheck, tests, serve, deploy smoke.
|
|
71
|
+
3. **Policy gates**: approvals, side effects, budget, credentials, data
|
|
72
|
+
freshness.
|
|
73
|
+
4. **Behavior gates**: browser flows, API calls, generated app preview, voice
|
|
74
|
+
transcript checks.
|
|
75
|
+
5. **Semantic judges**: intent fit, quality, completeness, safety,
|
|
76
|
+
professional correctness.
|
|
77
|
+
|
|
78
|
+
Semantic judges should never turn a failed build into a pass.
|
|
79
|
+
|
|
80
|
+
## Traces And Feedback
|
|
81
|
+
|
|
82
|
+
Every serious run should record:
|
|
83
|
+
|
|
84
|
+
- task id and scenario id
|
|
85
|
+
- git commit
|
|
86
|
+
- model and provider
|
|
87
|
+
- prompt/config hashes
|
|
88
|
+
- tool calls and retrieval spans
|
|
89
|
+
- build/test/deploy output
|
|
90
|
+
- cost, latency, and token use
|
|
91
|
+
- user/reviewer feedback
|
|
92
|
+
- final outcome and failure class
|
|
93
|
+
|
|
94
|
+
Convert runs into `FeedbackTrajectory` records so normal product usage becomes
|
|
95
|
+
replayable eval data.
|
|
96
|
+
|
|
97
|
+
```txt
|
|
98
|
+
production run -> feedback trajectory -> dataset scenario -> optimizer row
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## Datasets And Holdouts
|
|
102
|
+
|
|
103
|
+
Use four splits:
|
|
104
|
+
|
|
105
|
+
- `train`: optimizer search.
|
|
106
|
+
- `dev`: tuning and threshold selection.
|
|
107
|
+
- `test`: normal reporting.
|
|
108
|
+
- `holdout`: promotion-only gate.
|
|
109
|
+
|
|
110
|
+
Do not inspect or tune against holdout failures during optimization. If a
|
|
111
|
+
holdout failure reveals a real product bug, fix the bug and rotate the holdout
|
|
112
|
+
set with a signed note.
|
|
113
|
+
|
|
114
|
+
## Optimization
|
|
115
|
+
|
|
116
|
+
Use `runMultiShotOptimization()` when the system is a multi-step agent, not a
|
|
117
|
+
single prompt.
|
|
118
|
+
|
|
119
|
+
Good optimization targets:
|
|
120
|
+
|
|
121
|
+
- system prompt
|
|
122
|
+
- tool descriptions
|
|
123
|
+
- retrieval policy
|
|
124
|
+
- data acquisition policy
|
|
125
|
+
- user-question policy
|
|
126
|
+
- evaluator threshold
|
|
127
|
+
- agent topology
|
|
128
|
+
- scaffold/template choice
|
|
129
|
+
|
|
130
|
+
Bad optimization targets:
|
|
131
|
+
|
|
132
|
+
- hidden holdout examples
|
|
133
|
+
- production credentials
|
|
134
|
+
- brittle string checks that do not match user value
|
|
135
|
+
- fake workflows that do not call the product adapter
|
|
136
|
+
|
|
137
|
+
Use actionable side information so the optimizer knows whether a failure belongs
|
|
138
|
+
to prompt, tools, retrieval, data acquisition, sandbox, evaluator, or product
|
|
139
|
+
runtime.
|
|
140
|
+
|
|
141
|
+
## Release Gate
|
|
142
|
+
|
|
143
|
+
A launch or promotion should require:
|
|
144
|
+
|
|
145
|
+
- enough runs for the target risk level
|
|
146
|
+
- paired improvement over the current baseline
|
|
147
|
+
- no critical regression on test
|
|
148
|
+
- holdout pass or explicit rejection
|
|
149
|
+
- cost and latency within budget
|
|
150
|
+
- no unresolved canary or contamination failures
|
|
151
|
+
- trace evidence for representative successes and failures
|
|
152
|
+
- human-readable report with failure clusters and next actions
|
|
153
|
+
|
|
154
|
+
`evaluateReleaseConfidence()` and the paired statistics helpers provide the
|
|
155
|
+
decision data. The product decides the business threshold.
|
|
156
|
+
|
|
157
|
+
## Product Patterns
|
|
158
|
+
|
|
159
|
+
### Coding Or Builder Agent
|
|
160
|
+
|
|
161
|
+
Use sandbox/build/test/serve/browser validators. Add intent and semantic
|
|
162
|
+
concept judges only after the generated app runs.
|
|
163
|
+
|
|
164
|
+
### Browser Agent
|
|
165
|
+
|
|
166
|
+
Record browser steps, screenshots, network errors, console errors, and final
|
|
167
|
+
state. Use deterministic DOM/API assertions before visual or semantic judges.
|
|
168
|
+
|
|
169
|
+
### Domain Agent
|
|
170
|
+
|
|
171
|
+
Use domain fixtures, jurisdiction/date metadata, retrieval spans, and
|
|
172
|
+
professional judges. Fail missing/stale evidence separately from bad reasoning.
|
|
173
|
+
|
|
174
|
+
### Workflow Or Integration Agent
|
|
175
|
+
|
|
176
|
+
Use `@tangle-network/agent-integrations` manifests as readiness inputs. Gate
|
|
177
|
+
missing connections, missing scopes, approval-required writes, and stale tokens
|
|
178
|
+
before blaming the agent prompt.
|
|
179
|
+
|
|
180
|
+
### Voice Agent
|
|
181
|
+
|
|
182
|
+
Record transcript, timing, interruptions, tool calls, and task outcome. Judge
|
|
183
|
+
conversation quality separately from tool success and policy compliance.
|
|
184
|
+
|
|
185
|
+
## Anti-Patterns
|
|
186
|
+
|
|
187
|
+
- Evaluating only final prose for an agent that actually builds, browses, or
|
|
188
|
+
calls tools.
|
|
189
|
+
- Letting an LLM judge override failed tests.
|
|
190
|
+
- Optimizing on examples that users will never hit.
|
|
191
|
+
- Recording traces as logs but never converting them to datasets.
|
|
192
|
+
- Calling every failure a prompt failure when context, data, auth, or runtime
|
|
193
|
+
readiness was missing.
|
|
194
|
+
- Shipping reports without run ids, commits, model ids, or evidence links.
|
package/docs/wire-protocol.md
CHANGED
|
@@ -96,7 +96,7 @@ GET /v1/version
|
|
|
96
96
|
```json
|
|
97
97
|
{
|
|
98
98
|
"package": "@tangle-network/agent-eval",
|
|
99
|
-
"version": "0.20.
|
|
99
|
+
"version": "0.20.10",
|
|
100
100
|
"wireVersion": "1.0.0",
|
|
101
101
|
"apiSurface": ["judge", "listRubrics", "version"]
|
|
102
102
|
}
|
|
@@ -176,7 +176,7 @@ Each invocation is one process — Node startup adds ~500 ms. For more than a fe
|
|
|
176
176
|
|
|
177
177
|
## Clients
|
|
178
178
|
|
|
179
|
-
- **Python**: source lives in [`clients/python`](
|
|
179
|
+
- **Python**: source lives in [`clients/python`](https://github.com/tangle-network/agent-eval/tree/main/clients/python). Auto-detects HTTP, falls back to subprocess. Version-locked to npm.
|
|
180
180
|
- **TypeScript**: import directly from `@tangle-network/agent-eval` (no wire round-trip needed in-process).
|
|
181
181
|
- **Rust / Go / Other**: generate from `dist/openapi.json`. PRs welcome to add an officially-maintained client.
|
|
182
182
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tangle-network/agent-eval",
|
|
3
|
-
"version": "0.20.
|
|
3
|
+
"version": "0.20.11",
|
|
4
4
|
"description": "Trace-first evaluation infrastructure for agent systems: traces, harnesses, verifier pipelines, judges, datasets, gates, optimization, and reporting.",
|
|
5
5
|
"homepage": "https://github.com/tangle-network/agent-eval#readme",
|
|
6
6
|
"repository": {
|
|
@@ -48,19 +48,20 @@
|
|
|
48
48
|
},
|
|
49
49
|
"files": [
|
|
50
50
|
"dist",
|
|
51
|
-
"docs"
|
|
51
|
+
"docs",
|
|
52
|
+
"CHANGELOG.md"
|
|
52
53
|
],
|
|
53
54
|
"publishConfig": {
|
|
54
55
|
"access": "public"
|
|
55
56
|
},
|
|
56
57
|
"scripts": {
|
|
57
|
-
"build": "tsup &&
|
|
58
|
+
"build": "tsup && pnpm openapi",
|
|
58
59
|
"dev": "tsup --watch",
|
|
59
60
|
"prepare": "pnpm build",
|
|
60
61
|
"test": "vitest run",
|
|
61
62
|
"test:watch": "vitest",
|
|
62
63
|
"typecheck": "tsc --noEmit",
|
|
63
|
-
"openapi": "
|
|
64
|
+
"openapi": "node dist/cli.js openapi --out dist/openapi.json"
|
|
64
65
|
},
|
|
65
66
|
"dependencies": {
|
|
66
67
|
"@asteasolutions/zod-to-openapi": "^8.5.0",
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/wire/schemas.ts","../src/wire/rubrics.ts","../src/wire/handlers.ts","../src/wire/openapi.ts","../src/wire/server.ts","../src/wire/rpc.ts"],"sourcesContent":["/**\n * Wire-protocol schemas.\n *\n * These Zod schemas are the contract between the agent-eval runtime and\n * any non-TypeScript client (Python, Rust, Go, …). They get rendered to\n * OpenAPI by `wire/openapi.ts` and code-generators consume that spec to\n * produce typed clients in other languages.\n *\n * Rule: if it's not in this file, it isn't on the wire. Keep names and\n * shapes self-explanatory — every field has a `.describe()` so the\n * generated docs are useful without reading the source.\n */\nimport { extendZodWithOpenApi } from '@asteasolutions/zod-to-openapi'\nimport { z } from 'zod'\n\nextendZodWithOpenApi(z)\n\n// ── Building blocks ─────────────────────────────────────────────────\n\nexport const RubricDimensionSchema = z\n .object({\n id: z\n .string()\n .min(1)\n .describe('Short stable id like \"buyer_quality\" — used as the key in scoring output.'),\n description: z\n .string()\n .min(1)\n .describe('One-line plain-English meaning. Read by humans reviewing low scores.'),\n weight: z\n .number()\n .min(0)\n .default(1)\n .describe('Relative weight in the composite score. Default 1; 0 disables.'),\n min: z.number().default(0).describe('Lower bound of valid score for this dimension.'),\n max: z.number().default(1).describe('Upper bound of valid score for this dimension.'),\n })\n .openapi('RubricDimension')\n\nexport const FailureModeSchema = z\n .object({\n id: z.string().min(1).describe('Short stable id like \"ai-cadence\" — used in detection lists.'),\n description: z.string().min(1).describe('Plain-English description of the failure pattern.'),\n })\n .openapi('FailureMode')\n\n// ── Rubric ──────────────────────────────────────────────────────────\n\nexport const RubricSchema = z\n .object({\n name: z\n .string()\n .min(1)\n .describe('Stable name like \"anti-slop\" — used by clients to invoke this rubric.'),\n description: z\n .string()\n .min(1)\n .describe('What this rubric measures. Shown in /v1/rubrics listing.'),\n systemPrompt: z\n .string()\n .min(1)\n .describe(\n 'Instructs the judging LLM. Should explain the persona (e.g. \"senior engineer reviewing voice\"), what to score on, and what to return.',\n ),\n dimensions: z\n .array(RubricDimensionSchema)\n .min(1)\n .describe('Scoring axes. The composite score is a weighted sum of these.'),\n failureModes: z\n .array(FailureModeSchema)\n .default([])\n .describe('Patterns to detect; each detected mode appears in the result.failureModes list.'),\n wins: z\n .array(FailureModeSchema)\n .default([])\n .describe('Positive patterns; each detected one appears in the result.wins list.'),\n })\n .openapi('Rubric')\n\n// ── Judge call ──────────────────────────────────────────────────────\n\nexport const JudgeRequestSchema = z\n .object({\n rubricName: z\n .string()\n .optional()\n .describe('Use a built-in rubric by name. Mutually exclusive with `rubric`.'),\n rubric: RubricSchema.optional().describe(\n 'Inline rubric definition. Mutually exclusive with `rubricName`.',\n ),\n content: z\n .string()\n .min(1)\n .describe('The text being judged — a tweet, a blog post, a code snippet, anything stringly.'),\n context: z\n .record(z.string(), z.unknown())\n .optional()\n .describe(\n 'Free-form metadata for the rubric to use — analytics, source URL, author, etc. Surfaced to the LLM.',\n ),\n model: z\n .string()\n .optional()\n .describe('Override the judge model (default routes via tcloud). e.g. \"claude-opus-4-7\".'),\n })\n .refine((v) => Boolean(v.rubricName) !== Boolean(v.rubric), {\n message: 'Provide exactly one of `rubricName` or `rubric`.',\n })\n .openapi('JudgeRequest')\n\nexport const JudgeResultSchema = z\n .object({\n composite: z\n .number()\n .min(0)\n .max(1)\n .describe('Weighted combination of dimension scores in 0..1. The single number to gate on.'),\n dimensions: z\n .record(z.string(), z.number())\n .describe('Per-dimension score, keyed by RubricDimension.id.'),\n failureModes: z\n .array(z.string())\n .default([])\n .describe('Failure-mode ids detected in the content (subset of rubric.failureModes ids).'),\n wins: z\n .array(z.string())\n .default([])\n .describe('Win ids detected in the content (subset of rubric.wins ids).'),\n rationale: z\n .string()\n .describe('Plain-English explanation of the score. Surfaced to the human reviewer.'),\n rubricVersion: z\n .string()\n .describe(\n 'Stable hash of the rubric used. Scores are only comparable across runs when this matches.',\n ),\n model: z.string().describe('Model that produced the judgement, for reproducibility.'),\n durationMs: z.number().int().nonnegative().describe('End-to-end wall time for this call.'),\n })\n .openapi('JudgeResult')\n\n// ── Rubric listing ──────────────────────────────────────────────────\n\nexport const RubricInfoSchema = z\n .object({\n name: z.string().describe('Pass this to /v1/judge as `rubricName`.'),\n description: z.string().describe('What this rubric measures.'),\n dimensions: z\n .array(z.object({ id: z.string(), description: z.string(), weight: z.number() }))\n .describe('The scoring axes this rubric uses, with weights.'),\n failureModes: z.array(z.string()).default([]).describe('Failure-mode ids this rubric detects.'),\n rubricVersion: z.string().describe('Stable hash — match this to compare scores across runs.'),\n })\n .openapi('RubricInfo')\n\nexport const ListRubricsResponseSchema = z\n .object({\n rubrics: z.array(RubricInfoSchema),\n })\n .openapi('ListRubricsResponse')\n\n// ── Version / health ────────────────────────────────────────────────\n\nexport const VersionResponseSchema = z\n .object({\n package: z.string().describe('Package name (always \"@tangle-network/agent-eval\").'),\n version: z.string().describe('Semver of the running server. Match your client to this.'),\n wireVersion: z\n .string()\n .describe(\n 'Wire-protocol semver. Bumps separately from package version when the schema changes.',\n ),\n apiSurface: z.array(z.string()).describe('List of supported method names.'),\n })\n .openapi('VersionResponse')\n\nexport const HealthResponseSchema = z\n .object({\n status: z.literal('ok'),\n uptimeSec: z.number(),\n })\n .openapi('HealthResponse')\n\n// ── Errors ──────────────────────────────────────────────────────────\n\nexport const ErrorResponseSchema = z\n .object({\n error: z\n .object({\n code: z\n .string()\n .describe('Machine-readable code: \"validation_error\", \"rubric_not_found\", \"judge_error\".'),\n message: z.string().describe('Human-readable message.'),\n details: z.unknown().optional().describe('Optional structured detail.'),\n })\n .describe('Errors are always wrapped in this shape across all endpoints.'),\n })\n .openapi('ErrorResponse')\n\n// ── Type exports for callers in the same package ────────────────────\n\nexport type RubricDimension = z.infer<typeof RubricDimensionSchema>\nexport type FailureMode = z.infer<typeof FailureModeSchema>\nexport type Rubric = z.infer<typeof RubricSchema>\nexport type JudgeRequest = z.infer<typeof JudgeRequestSchema>\nexport type JudgeResult = z.infer<typeof JudgeResultSchema>\nexport type RubricInfo = z.infer<typeof RubricInfoSchema>\nexport type ListRubricsResponse = z.infer<typeof ListRubricsResponseSchema>\nexport type VersionResponse = z.infer<typeof VersionResponseSchema>\nexport type ErrorResponse = z.infer<typeof ErrorResponseSchema>\n\n// ── Wire-protocol version ───────────────────────────────────────────\n\n/**\n * Bump on any breaking change to a request/response schema.\n * Non-breaking (additive) changes don't require a bump.\n */\nexport const WIRE_VERSION = '1.0.0'\n\n/**\n * Stable hash of a rubric. Used to make scores comparable across runs:\n * if the rubricVersion matches, the rubric was identical.\n */\nexport function hashRubric(rubric: Rubric): string {\n // deterministic stringify (keys sorted) for stable hashing\n const stable = JSON.stringify(rubric, Object.keys(rubric).sort())\n let h = 5381\n for (let i = 0; i < stable.length; i++) {\n h = (h * 33) ^ stable.charCodeAt(i)\n }\n // Unsigned 32-bit hex, prefixed with rubric name + version slot\n return `${rubric.name}@${(h >>> 0).toString(16).padStart(8, '0')}`\n}\n","/**\n * Built-in rubrics shipped with agent-eval.\n *\n * A rubric is a set of scoring axes plus a system prompt that tells the\n * judging LLM how to grade against those axes. Built-in rubrics are\n * curated for use cases that recur across Tangle projects — call them\n * by name from any client.\n *\n * Adding a rubric:\n * 1. Define the Rubric object below with a clear `description` and\n * named `dimensions`.\n * 2. Register it in `BUILTIN_RUBRICS` at the bottom.\n * 3. Add a test in `tests/wire/rubrics.test.ts`.\n *\n * Custom rubrics: callers pass `rubric` inline to /v1/judge instead of\n * `rubricName` — see schemas.ts.\n */\nimport type { Rubric } from './schemas'\nimport { hashRubric } from './schemas'\n\n// ── anti-slop ───────────────────────────────────────────────────────\n// Voice/style judge tuned for technical-buyer audiences. Used by the\n// Postiz autoresearch loop and any content-quality gate.\n\nconst ANTI_SLOP: Rubric = {\n name: 'anti-slop',\n description:\n 'Voice and signal quality for content aimed at senior engineers. Catches AI cadence, marketing tone, and engagement-bait shapes.',\n systemPrompt: `You are evaluating a piece of content written for senior engineers and technical founders.\n\nYou score three things:\n- buyer_quality (0..1): would a senior engineer in the target ICP find this worth their attention? High = specific, earned, technically interesting. Low = generic, hyped, off-target.\n- voice (0..1): does it read like a person who built the thing, or like AI/marketing copy?\n- signal (0..1): does it contain a non-obvious detail, constraint, or claim a reader couldn't get from the public docs?\n\nDetect failure modes (return ids matching):\n- ai-cadence: rule-of-three openings, em-dash flourish, \"Let me explain\", \"Here's the thing\", AI rhythm\n- marketing-tone: \"We're excited to announce\", \"thrilled\", \"delighted\", \"game-changer\", buzzword stack\n- vague-claim: technical claim without a specific component, file, or measurement\n- no-hook: opening doesn't earn attention from the target reader\n- engagement-bait: \"agree?\", \"thoughts?\", listicles, controversy-fishing, hook-detail-pitch\n- off-icp: content shape would attract motivational/grift/hype audiences instead of buyers\n- stale-claim: repeats a positioning line we've used many times this month\n\nDetect wins (return ids matching):\n- specific-component: names a real file, component, or measurement\n- earned-detail: shares a non-obvious detail not derivable from public docs\n- constraint-articulated: names a real tradeoff and the side chosen\n- honest-failure: describes a real failure mode and what was done about it\n\nReturn ONLY JSON matching the response schema. Be conservative — most content has 0-1 wins and 1-2 failure modes, not many of each.`,\n dimensions: [\n {\n id: 'buyer_quality',\n description: 'Would the target buyer find this worth attention?',\n weight: 0.5,\n min: 0,\n max: 1,\n },\n {\n id: 'voice',\n description: 'Does it sound like a builder, not AI or marketing?',\n weight: 0.3,\n min: 0,\n max: 1,\n },\n {\n id: 'signal',\n description: 'Non-obvious detail, constraint, or claim?',\n weight: 0.2,\n min: 0,\n max: 1,\n },\n ],\n failureModes: [\n { id: 'ai-cadence', description: 'AI-rhythm openings and transitions' },\n { id: 'marketing-tone', description: 'Buzzwords, hype, corporate-PR voice' },\n { id: 'vague-claim', description: 'Technical claim without specifics' },\n { id: 'no-hook', description: 'Opening fails to earn attention' },\n { id: 'engagement-bait', description: 'Listicle/controversy/agree-pattern' },\n { id: 'off-icp', description: 'Voice attracts the wrong audience' },\n { id: 'stale-claim', description: 'Reuses an over-used positioning line' },\n ],\n wins: [\n { id: 'specific-component', description: 'Names a real file/component/number' },\n { id: 'earned-detail', description: 'Detail not in public docs' },\n { id: 'constraint-articulated', description: 'Names a real tradeoff' },\n { id: 'honest-failure', description: 'Describes a real failure honestly' },\n ],\n}\n\n// ── Registry ────────────────────────────────────────────────────────\n\nexport const BUILTIN_RUBRICS: Record<string, Rubric> = {\n 'anti-slop': ANTI_SLOP,\n}\n\n/** Get a built-in rubric by name, or undefined. */\nexport function getBuiltinRubric(name: string): Rubric | undefined {\n return BUILTIN_RUBRICS[name]\n}\n\n/** List built-in rubrics with their stable versions. */\nexport function listBuiltinRubrics() {\n return Object.values(BUILTIN_RUBRICS).map((r) => ({\n name: r.name,\n description: r.description,\n dimensions: r.dimensions.map((d) => ({\n id: d.id,\n description: d.description,\n weight: d.weight,\n })),\n failureModes: r.failureModes.map((f) => f.id),\n rubricVersion: hashRubric(r),\n }))\n}\n","/**\n * Pure handler functions — the \"business logic\" behind every wire-protocol\n * method. The HTTP server (`server.ts`) and the stdio RPC (`rpc.ts`) both\n * call these. Tests call these directly without spinning a server.\n *\n * Each handler:\n * - Takes a parsed request (already Zod-validated by the transport).\n * - Returns a result that matches the response schema.\n * - Throws `WireError` for caller-fixable errors (404, 400, 422).\n * - Lets unexpected errors bubble — the transport maps them to 500.\n */\nimport { callLlmJson } from '../llm-client'\nimport { getBuiltinRubric, listBuiltinRubrics } from './rubrics'\nimport {\n hashRubric,\n WIRE_VERSION,\n type JudgeRequest,\n type JudgeResult,\n type ListRubricsResponse,\n type Rubric,\n type VersionResponse,\n} from './schemas'\n\n/** Caller-fixable error. The transport renders this to 4xx + ErrorResponse. */\nexport class WireError extends Error {\n constructor(\n public readonly code: string,\n message: string,\n public readonly status: number = 400,\n public readonly details?: unknown,\n ) {\n super(message)\n this.name = 'WireError'\n }\n}\n\n// ── judge ───────────────────────────────────────────────────────────\n\n/** The JSON schema we ask the judging LLM to fill in. */\nfunction judgeOutputSchema(rubric: Rubric) {\n return {\n name: 'JudgeOutput',\n schema: {\n type: 'object',\n additionalProperties: false,\n properties: {\n dimensions: {\n type: 'object',\n additionalProperties: false,\n properties: Object.fromEntries(\n rubric.dimensions.map((d) => [\n d.id,\n { type: 'number', minimum: d.min, maximum: d.max },\n ]),\n ),\n required: rubric.dimensions.map((d) => d.id),\n },\n failureModes: {\n type: 'array',\n items: { type: 'string', enum: rubric.failureModes.map((f) => f.id) },\n },\n wins: {\n type: 'array',\n items: { type: 'string', enum: rubric.wins.map((w) => w.id) },\n },\n rationale: { type: 'string' },\n },\n required: ['dimensions', 'rationale'],\n } as Record<string, unknown>,\n }\n}\n\ninterface JudgeOutput {\n dimensions: Record<string, number>\n failureModes?: string[]\n wins?: string[]\n rationale: string\n}\n\nfunction compositeScore(dimensions: Record<string, number>, rubric: Rubric): number {\n let weighted = 0\n let totalWeight = 0\n for (const dim of rubric.dimensions) {\n const raw = dimensions[dim.id] ?? 0\n const range = dim.max - dim.min || 1\n const normalized = Math.max(0, Math.min(1, (raw - dim.min) / range))\n weighted += normalized * dim.weight\n totalWeight += dim.weight\n }\n return totalWeight > 0 ? weighted / totalWeight : 0\n}\n\nfunction buildJudgePrompt(content: string, context: unknown): string {\n const ctx = context && Object.keys(context as object).length ? JSON.stringify(context) : ''\n return [\n `CONTENT TO JUDGE:`,\n content,\n '',\n ctx ? `CONTEXT (metadata, analytics, etc.):` : '',\n ctx ? ctx : '',\n ]\n .filter(Boolean)\n .join('\\n')\n}\n\nconst DEFAULT_JUDGE_MODEL = 'claude-sonnet-4-6'\n\nexport async function handleJudge(req: JudgeRequest): Promise<JudgeResult> {\n // Resolve rubric\n let rubric: Rubric\n if (req.rubricName) {\n const found = getBuiltinRubric(req.rubricName)\n if (!found) {\n throw new WireError('rubric_not_found', `No built-in rubric named \"${req.rubricName}\".`, 404)\n }\n rubric = found\n } else if (req.rubric) {\n rubric = req.rubric\n } else {\n // refine() in the schema should already have caught this — defense in depth\n throw new WireError('validation_error', 'Provide either `rubricName` or `rubric`.', 422)\n }\n\n const startedAt = Date.now()\n const model = req.model ?? DEFAULT_JUDGE_MODEL\n\n const { value, result } = await callLlmJson<JudgeOutput>({\n model,\n messages: [\n { role: 'system', content: rubric.systemPrompt },\n { role: 'user', content: buildJudgePrompt(req.content, req.context) },\n ],\n jsonSchema: judgeOutputSchema(rubric),\n temperature: 0.0,\n timeoutMs: 60_000,\n })\n\n // Defensive: ensure dimensions object isn't malformed\n if (!value || typeof value !== 'object' || !value.dimensions) {\n throw new WireError('judge_error', 'Judge returned malformed output.', 500, value)\n }\n\n const composite = compositeScore(value.dimensions, rubric)\n const durationMs = Date.now() - startedAt\n\n return {\n composite,\n dimensions: value.dimensions,\n failureModes: value.failureModes ?? [],\n wins: value.wins ?? [],\n rationale: value.rationale,\n rubricVersion: hashRubric(rubric),\n model: result.model,\n durationMs,\n }\n}\n\n// ── listRubrics ─────────────────────────────────────────────────────\n\nexport function handleListRubrics(): ListRubricsResponse {\n return { rubrics: listBuiltinRubrics() }\n}\n\n// ── version ─────────────────────────────────────────────────────────\n\nimport { readFileSync } from 'node:fs'\nimport { dirname, resolve } from 'node:path'\nimport { fileURLToPath } from 'node:url'\n\nlet CACHED_VERSION: string | undefined\n\nfunction readPackageVersion(): string {\n if (CACHED_VERSION) return CACHED_VERSION\n // Walk up from this file looking for the nearest package.json.\n // In dist/ this is dist/.., in src/wire/ this is ../../package.json.\n const here = dirname(fileURLToPath(import.meta.url))\n const candidates = [\n resolve(here, '..', '..', 'package.json'), // src/wire → repo root\n resolve(here, '..', 'package.json'), // dist → repo root\n ]\n for (const path of candidates) {\n try {\n const pkg = JSON.parse(readFileSync(path, 'utf-8')) as { version?: string }\n if (pkg.version) {\n CACHED_VERSION = pkg.version\n return pkg.version\n }\n } catch {\n // try next\n }\n }\n return '0.0.0-unknown'\n}\n\nexport function handleVersion(): VersionResponse {\n return {\n package: '@tangle-network/agent-eval',\n version: readPackageVersion(),\n wireVersion: WIRE_VERSION,\n apiSurface: ['judge', 'listRubrics', 'version'],\n }\n}\n","/**\n * Build an OpenAPI spec from the wire schemas.\n *\n * The spec is the contract that other-language clients (Python, Rust,\n * Go, …) generate from. There is no hand-written client — clients are\n * derived artifacts of this file plus `schemas.ts`.\n *\n * Run `pnpm openapi` (defined in package.json) to write the spec to\n * `dist/openapi.json`. CI uses that file to regenerate the Python\n * client and gate the dual-publish workflow.\n */\nimport { OpenApiGeneratorV31, OpenAPIRegistry } from '@asteasolutions/zod-to-openapi'\nimport type { OpenAPIObject } from 'openapi3-ts/oas31'\n\nimport {\n ErrorResponseSchema,\n HealthResponseSchema,\n JudgeRequestSchema,\n JudgeResultSchema,\n ListRubricsResponseSchema,\n VersionResponseSchema,\n WIRE_VERSION,\n} from './schemas'\n\nexport function buildOpenApi(packageVersion: string): OpenAPIObject {\n const registry = new OpenAPIRegistry()\n\n // Components — each schema becomes a $ref-able component\n registry.register('JudgeRequest', JudgeRequestSchema)\n registry.register('JudgeResult', JudgeResultSchema)\n registry.register('ListRubricsResponse', ListRubricsResponseSchema)\n registry.register('VersionResponse', VersionResponseSchema)\n registry.register('HealthResponse', HealthResponseSchema)\n registry.register('ErrorResponse', ErrorResponseSchema)\n\n // Routes\n registry.registerPath({\n method: 'post',\n path: '/v1/judge',\n summary: 'Score a piece of content against a rubric',\n description:\n 'Runs the judging LLM with the named (or inline) rubric and returns dimension scores, detected failure modes, wins, and a composite score in 0..1.',\n request: {\n body: {\n content: {\n 'application/json': { schema: JudgeRequestSchema },\n },\n },\n },\n responses: {\n 200: {\n description: 'Successful judgement',\n content: { 'application/json': { schema: JudgeResultSchema } },\n },\n 400: {\n description: 'Validation error',\n content: { 'application/json': { schema: ErrorResponseSchema } },\n },\n 404: {\n description: 'Rubric not found',\n content: { 'application/json': { schema: ErrorResponseSchema } },\n },\n 500: {\n description: 'Judge error',\n content: { 'application/json': { schema: ErrorResponseSchema } },\n },\n },\n })\n\n registry.registerPath({\n method: 'get',\n path: '/v1/rubrics',\n summary: 'List built-in rubrics',\n description:\n 'Returns every rubric registered server-side, with their dimensions and stable rubricVersion hash.',\n responses: {\n 200: {\n description: 'Listing',\n content: { 'application/json': { schema: ListRubricsResponseSchema } },\n },\n },\n })\n\n registry.registerPath({\n method: 'get',\n path: '/v1/version',\n summary: 'Server and wire-protocol version',\n description: 'Match your client version to `version`; check `wireVersion` for compatibility.',\n responses: {\n 200: {\n description: 'Version info',\n content: { 'application/json': { schema: VersionResponseSchema } },\n },\n },\n })\n\n registry.registerPath({\n method: 'get',\n path: '/healthz',\n summary: 'Liveness check',\n responses: {\n 200: {\n description: 'OK',\n content: { 'application/json': { schema: HealthResponseSchema } },\n },\n },\n })\n\n const generator = new OpenApiGeneratorV31(registry.definitions)\n return generator.generateDocument({\n openapi: '3.1.0',\n info: {\n title: '@tangle-network/agent-eval — wire protocol',\n version: packageVersion,\n description: `HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: ${WIRE_VERSION}. Bumps on breaking changes to request/response schemas.`,\n contact: { name: 'Tangle Network', url: 'https://github.com/tangle-network/agent-eval' },\n license: { name: 'MIT' },\n },\n servers: [{ url: 'http://localhost:5005', description: 'Local agent-eval serve' }],\n })\n}\n","/**\n * HTTP transport for the wire protocol.\n *\n * Hono + @hono/node-server. Every endpoint:\n * 1. Validates the request against its Zod schema.\n * 2. Calls the matching handler in `handlers.ts`.\n * 3. Renders 4xx for `WireError` with structured body, 500 for unexpected.\n *\n * The server has no internal state besides the handler imports — restart\n * costs nothing. Run via `agent-eval serve --port 5005`.\n */\nimport { serve, type ServerType } from '@hono/node-server'\nimport { Hono } from 'hono'\nimport { cors } from 'hono/cors'\n\nimport {\n handleJudge,\n handleListRubrics,\n handleVersion,\n WireError,\n} from './handlers'\nimport { buildOpenApi } from './openapi'\nimport { JudgeRequestSchema } from './schemas'\n\nconst STARTED_AT = Date.now()\n\nexport function createApp() {\n const app = new Hono()\n\n app.use('*', cors())\n\n app.onError((err, c) => {\n if (err instanceof WireError) {\n return c.json(\n { error: { code: err.code, message: err.message, details: err.details } },\n err.status as 400 | 404 | 422 | 500,\n )\n }\n // Unexpected — log and return generic 500 without leaking internals.\n console.error('[agent-eval] unhandled error:', err)\n return c.json(\n { error: { code: 'internal_error', message: 'Internal server error.' } },\n 500,\n )\n })\n\n // ── Health ──\n app.get('/healthz', (c) =>\n c.json({ status: 'ok' as const, uptimeSec: (Date.now() - STARTED_AT) / 1000 }),\n )\n\n // ── Version ──\n app.get('/v1/version', (c) => c.json(handleVersion()))\n\n // ── Rubrics ──\n app.get('/v1/rubrics', (c) => c.json(handleListRubrics()))\n\n // ── Judge ──\n app.post('/v1/judge', async (c) => {\n const raw = await c.req.json().catch(() => null)\n if (raw == null) {\n throw new WireError('validation_error', 'Request body must be JSON.', 400)\n }\n const parsed = JudgeRequestSchema.safeParse(raw)\n if (!parsed.success) {\n throw new WireError(\n 'validation_error',\n 'Request did not match JudgeRequest schema.',\n 400,\n parsed.error.issues,\n )\n }\n const result = await handleJudge(parsed.data)\n return c.json(result)\n })\n\n // ── OpenAPI spec ──\n app.get('/openapi.json', (c) => c.json(buildOpenApi(handleVersion().version)))\n\n return app\n}\n\nexport interface ServeOptions {\n /** Default 5005. */\n port?: number\n /** Default '127.0.0.1'. Set to '0.0.0.0' to listen on all interfaces. */\n host?: string\n}\n\nexport function startServer(opts: ServeOptions = {}): ServerType {\n const app = createApp()\n const port = opts.port ?? 5005\n const host = opts.host ?? '127.0.0.1'\n return serve({ fetch: app.fetch, port, hostname: host }, ({ address, port: actualPort }) => {\n // eslint-disable-next-line no-console\n console.log(`[agent-eval] serving on http://${address}:${actualPort}`)\n })\n}\n","/**\n * stdio RPC transport.\n *\n * For batch / cron use without a running server. The Python client falls\n * back to this when no server is reachable.\n *\n * Protocol (line-delimited JSON over stdin/stdout):\n * IN: one JSON object on stdin: {\"method\":\"judge\",\"params\":{...}}\n * OUT: one JSON object on stdout: {\"result\":{...}} or {\"error\":{...}}\n *\n * One request per process invocation. To pipeline many calls, the client\n * writes JSONL to stdin and reads JSONL from stdout — see batch mode below.\n */\nimport { handleJudge, handleListRubrics, handleVersion, WireError } from './handlers'\nimport { JudgeRequestSchema } from './schemas'\n\ninterface RpcRequest {\n method: 'judge' | 'listRubrics' | 'version'\n params?: unknown\n}\n\ninterface RpcSuccess {\n result: unknown\n}\n\ninterface RpcError {\n error: { code: string; message: string; details?: unknown }\n}\n\nexport async function dispatchRpc(req: RpcRequest): Promise<RpcSuccess | RpcError> {\n try {\n switch (req.method) {\n case 'judge': {\n const parsed = JudgeRequestSchema.safeParse(req.params)\n if (!parsed.success) {\n return {\n error: {\n code: 'validation_error',\n message: 'params did not match JudgeRequest schema.',\n details: parsed.error.issues,\n },\n }\n }\n return { result: await handleJudge(parsed.data) }\n }\n case 'listRubrics':\n return { result: handleListRubrics() }\n case 'version':\n return { result: handleVersion() }\n default:\n return {\n error: {\n code: 'unknown_method',\n message: `No such method: ${(req as { method: string }).method}`,\n },\n }\n }\n } catch (err) {\n if (err instanceof WireError) {\n return { error: { code: err.code, message: err.message, details: err.details } }\n }\n const message = err instanceof Error ? err.message : String(err)\n return { error: { code: 'internal_error', message } }\n }\n}\n\n// ── stdin/stdout driver ─────────────────────────────────────────────\n\nasync function readAll(stream: NodeJS.ReadableStream): Promise<string> {\n const chunks: Buffer[] = []\n for await (const chunk of stream) {\n chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk as string))\n }\n return Buffer.concat(chunks).toString('utf-8')\n}\n\n/** Read one JSON request from stdin, write one JSON response to stdout. */\nexport async function runRpcOnce(method?: string): Promise<number> {\n const raw = await readAll(process.stdin)\n let req: RpcRequest\n try {\n const body = JSON.parse(raw)\n req = method ? { method: method as RpcRequest['method'], params: body } : (body as RpcRequest)\n } catch (err) {\n process.stdout.write(\n JSON.stringify({\n error: {\n code: 'parse_error',\n message: `stdin was not valid JSON: ${err instanceof Error ? err.message : String(err)}`,\n },\n }) + '\\n',\n )\n return 1\n }\n const out = await dispatchRpc(req)\n process.stdout.write(JSON.stringify(out) + '\\n')\n return 'error' in out ? 1 : 0\n}\n\n/** Read JSONL requests from stdin, write JSONL responses to stdout. */\nexport async function runRpcBatch(method?: string): Promise<number> {\n const raw = await readAll(process.stdin)\n const lines = raw.split('\\n').filter((l) => l.trim().length > 0)\n let exitCode = 0\n for (const line of lines) {\n let req: RpcRequest\n try {\n const body = JSON.parse(line)\n req = method ? { method: method as RpcRequest['method'], params: body } : (body as RpcRequest)\n } catch (err) {\n process.stdout.write(\n JSON.stringify({\n error: {\n code: 'parse_error',\n message: `line was not valid JSON: ${err instanceof Error ? err.message : String(err)}`,\n },\n }) + '\\n',\n )\n exitCode = 1\n continue\n }\n const out = await dispatchRpc(req)\n process.stdout.write(JSON.stringify(out) + '\\n')\n if ('error' in out) exitCode = 1\n }\n return exitCode\n}\n"],"mappings":";;;;;AAYA,SAAS,4BAA4B;AACrC,SAAS,SAAS;AAElB,qBAAqB,CAAC;AAIf,IAAM,wBAAwB,EAClC,OAAO;AAAA,EACN,IAAI,EACD,OAAO,EACP,IAAI,CAAC,EACL,SAAS,gFAA2E;AAAA,EACvF,aAAa,EACV,OAAO,EACP,IAAI,CAAC,EACL,SAAS,sEAAsE;AAAA,EAClF,QAAQ,EACL,OAAO,EACP,IAAI,CAAC,EACL,QAAQ,CAAC,EACT,SAAS,gEAAgE;AAAA,EAC5E,KAAK,EAAE,OAAO,EAAE,QAAQ,CAAC,EAAE,SAAS,gDAAgD;AAAA,EACpF,KAAK,EAAE,OAAO,EAAE,QAAQ,CAAC,EAAE,SAAS,gDAAgD;AACtF,CAAC,EACA,QAAQ,iBAAiB;AAErB,IAAM,oBAAoB,EAC9B,OAAO;AAAA,EACN,IAAI,EAAE,OAAO,EAAE,IAAI,CAAC,EAAE,SAAS,mEAA8D;AAAA,EAC7F,aAAa,EAAE,OAAO,EAAE,IAAI,CAAC,EAAE,SAAS,mDAAmD;AAC7F,CAAC,EACA,QAAQ,aAAa;AAIjB,IAAM,eAAe,EACzB,OAAO;AAAA,EACN,MAAM,EACH,OAAO,EACP,IAAI,CAAC,EACL,SAAS,4EAAuE;AAAA,EACnF,aAAa,EACV,OAAO,EACP,IAAI,CAAC,EACL,SAAS,0DAA0D;AAAA,EACtE,cAAc,EACX,OAAO,EACP,IAAI,CAAC,EACL;AAAA,IACC;AAAA,EACF;AAAA,EACF,YAAY,EACT,MAAM,qBAAqB,EAC3B,IAAI,CAAC,EACL,SAAS,+DAA+D;AAAA,EAC3E,cAAc,EACX,MAAM,iBAAiB,EACvB,QAAQ,CAAC,CAAC,EACV,SAAS,iFAAiF;AAAA,EAC7F,MAAM,EACH,MAAM,iBAAiB,EACvB,QAAQ,CAAC,CAAC,EACV,SAAS,uEAAuE;AACrF,CAAC,EACA,QAAQ,QAAQ;AAIZ,IAAM,qBAAqB,EAC/B,OAAO;AAAA,EACN,YAAY,EACT,OAAO,EACP,SAAS,EACT,SAAS,kEAAkE;AAAA,EAC9E,QAAQ,aAAa,SAAS,EAAE;AAAA,IAC9B;AAAA,EACF;AAAA,EACA,SAAS,EACN,OAAO,EACP,IAAI,CAAC,EACL,SAAS,uFAAkF;AAAA,EAC9F,SAAS,EACN,OAAO,EAAE,OAAO,GAAG,EAAE,QAAQ,CAAC,EAC9B,SAAS,EACT;AAAA,IACC;AAAA,EACF;AAAA,EACF,OAAO,EACJ,OAAO,EACP,SAAS,EACT,SAAS,+EAA+E;AAC7F,CAAC,EACA,OAAO,CAAC,MAAM,QAAQ,EAAE,UAAU,MAAM,QAAQ,EAAE,MAAM,GAAG;AAAA,EAC1D,SAAS;AACX,CAAC,EACA,QAAQ,cAAc;AAElB,IAAM,oBAAoB,EAC9B,OAAO;AAAA,EACN,WAAW,EACR,OAAO,EACP,IAAI,CAAC,EACL,IAAI,CAAC,EACL,SAAS,iFAAiF;AAAA,EAC7F,YAAY,EACT,OAAO,EAAE,OAAO,GAAG,EAAE,OAAO,CAAC,EAC7B,SAAS,mDAAmD;AAAA,EAC/D,cAAc,EACX,MAAM,EAAE,OAAO,CAAC,EAChB,QAAQ,CAAC,CAAC,EACV,SAAS,+EAA+E;AAAA,EAC3F,MAAM,EACH,MAAM,EAAE,OAAO,CAAC,EAChB,QAAQ,CAAC,CAAC,EACV,SAAS,8DAA8D;AAAA,EAC1E,WAAW,EACR,OAAO,EACP,SAAS,yEAAyE;AAAA,EACrF,eAAe,EACZ,OAAO,EACP;AAAA,IACC;AAAA,EACF;AAAA,EACF,OAAO,EAAE,OAAO,EAAE,SAAS,yDAAyD;AAAA,EACpF,YAAY,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY,EAAE,SAAS,qCAAqC;AAC3F,CAAC,EACA,QAAQ,aAAa;AAIjB,IAAM,mBAAmB,EAC7B,OAAO;AAAA,EACN,MAAM,EAAE,OAAO,EAAE,SAAS,yCAAyC;AAAA,EACnE,aAAa,EAAE,OAAO,EAAE,SAAS,4BAA4B;AAAA,EAC7D,YAAY,EACT,MAAM,EAAE,OAAO,EAAE,IAAI,EAAE,OAAO,GAAG,aAAa,EAAE,OAAO,GAAG,QAAQ,EAAE,OAAO,EAAE,CAAC,CAAC,EAC/E,SAAS,kDAAkD;AAAA,EAC9D,cAAc,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,QAAQ,CAAC,CAAC,EAAE,SAAS,uCAAuC;AAAA,EAC9F,eAAe,EAAE,OAAO,EAAE,SAAS,8DAAyD;AAC9F,CAAC,EACA,QAAQ,YAAY;AAEhB,IAAM,4BAA4B,EACtC,OAAO;AAAA,EACN,SAAS,EAAE,MAAM,gBAAgB;AACnC,CAAC,EACA,QAAQ,qBAAqB;AAIzB,IAAM,wBAAwB,EAClC,OAAO;AAAA,EACN,SAAS,EAAE,OAAO,EAAE,SAAS,qDAAqD;AAAA,EAClF,SAAS,EAAE,OAAO,EAAE,SAAS,0DAA0D;AAAA,EACvF,aAAa,EACV,OAAO,EACP;AAAA,IACC;AAAA,EACF;AAAA,EACF,YAAY,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,SAAS,iCAAiC;AAC5E,CAAC,EACA,QAAQ,iBAAiB;AAErB,IAAM,uBAAuB,EACjC,OAAO;AAAA,EACN,QAAQ,EAAE,QAAQ,IAAI;AAAA,EACtB,WAAW,EAAE,OAAO;AACtB,CAAC,EACA,QAAQ,gBAAgB;AAIpB,IAAM,sBAAsB,EAChC,OAAO;AAAA,EACN,OAAO,EACJ,OAAO;AAAA,IACN,MAAM,EACH,OAAO,EACP,SAAS,+EAA+E;AAAA,IAC3F,SAAS,EAAE,OAAO,EAAE,SAAS,yBAAyB;AAAA,IACtD,SAAS,EAAE,QAAQ,EAAE,SAAS,EAAE,SAAS,6BAA6B;AAAA,EACxE,CAAC,EACA,SAAS,+DAA+D;AAC7E,CAAC,EACA,QAAQ,eAAe;AAoBnB,IAAM,eAAe;AAMrB,SAAS,WAAW,QAAwB;AAEjD,QAAM,SAAS,KAAK,UAAU,QAAQ,OAAO,KAAK,MAAM,EAAE,KAAK,CAAC;AAChE,MAAI,IAAI;AACR,WAAS,IAAI,GAAG,IAAI,OAAO,QAAQ,KAAK;AACtC,QAAK,IAAI,KAAM,OAAO,WAAW,CAAC;AAAA,EACpC;AAEA,SAAO,GAAG,OAAO,IAAI,KAAK,MAAM,GAAG,SAAS,EAAE,EAAE,SAAS,GAAG,GAAG,CAAC;AAClE;;;AChNA,IAAM,YAAoB;AAAA,EACxB,MAAM;AAAA,EACN,aACE;AAAA,EACF,cAAc;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAuBd,YAAY;AAAA,IACV;AAAA,MACE,IAAI;AAAA,MACJ,aAAa;AAAA,MACb,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,KAAK;AAAA,IACP;AAAA,IACA;AAAA,MACE,IAAI;AAAA,MACJ,aAAa;AAAA,MACb,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,KAAK;AAAA,IACP;AAAA,IACA;AAAA,MACE,IAAI;AAAA,MACJ,aAAa;AAAA,MACb,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,KAAK;AAAA,IACP;AAAA,EACF;AAAA,EACA,cAAc;AAAA,IACZ,EAAE,IAAI,cAAc,aAAa,qCAAqC;AAAA,IACtE,EAAE,IAAI,kBAAkB,aAAa,sCAAsC;AAAA,IAC3E,EAAE,IAAI,eAAe,aAAa,oCAAoC;AAAA,IACtE,EAAE,IAAI,WAAW,aAAa,kCAAkC;AAAA,IAChE,EAAE,IAAI,mBAAmB,aAAa,qCAAqC;AAAA,IAC3E,EAAE,IAAI,WAAW,aAAa,oCAAoC;AAAA,IAClE,EAAE,IAAI,eAAe,aAAa,uCAAuC;AAAA,EAC3E;AAAA,EACA,MAAM;AAAA,IACJ,EAAE,IAAI,sBAAsB,aAAa,qCAAqC;AAAA,IAC9E,EAAE,IAAI,iBAAiB,aAAa,4BAA4B;AAAA,IAChE,EAAE,IAAI,0BAA0B,aAAa,wBAAwB;AAAA,IACrE,EAAE,IAAI,kBAAkB,aAAa,oCAAoC;AAAA,EAC3E;AACF;AAIO,IAAM,kBAA0C;AAAA,EACrD,aAAa;AACf;AAGO,SAAS,iBAAiB,MAAkC;AACjE,SAAO,gBAAgB,IAAI;AAC7B;AAGO,SAAS,qBAAqB;AACnC,SAAO,OAAO,OAAO,eAAe,EAAE,IAAI,CAAC,OAAO;AAAA,IAChD,MAAM,EAAE;AAAA,IACR,aAAa,EAAE;AAAA,IACf,YAAY,EAAE,WAAW,IAAI,CAAC,OAAO;AAAA,MACnC,IAAI,EAAE;AAAA,MACN,aAAa,EAAE;AAAA,MACf,QAAQ,EAAE;AAAA,IACZ,EAAE;AAAA,IACF,cAAc,EAAE,aAAa,IAAI,CAAC,MAAM,EAAE,EAAE;AAAA,IAC5C,eAAe,WAAW,CAAC;AAAA,EAC7B,EAAE;AACJ;;;ACkDA,SAAS,oBAAoB;AAC7B,SAAS,SAAS,eAAe;AACjC,SAAS,qBAAqB;AA/IvB,IAAM,YAAN,cAAwB,MAAM;AAAA,EACnC,YACkB,MAChB,SACgB,SAAiB,KACjB,SAChB;AACA,UAAM,OAAO;AALG;AAEA;AACA;AAGhB,SAAK,OAAO;AAAA,EACd;AAAA,EAPkB;AAAA,EAEA;AAAA,EACA;AAKpB;AAKA,SAAS,kBAAkB,QAAgB;AACzC,SAAO;AAAA,IACL,MAAM;AAAA,IACN,QAAQ;AAAA,MACN,MAAM;AAAA,MACN,sBAAsB;AAAA,MACtB,YAAY;AAAA,QACV,YAAY;AAAA,UACV,MAAM;AAAA,UACN,sBAAsB;AAAA,UACtB,YAAY,OAAO;AAAA,YACjB,OAAO,WAAW,IAAI,CAAC,MAAM;AAAA,cAC3B,EAAE;AAAA,cACF,EAAE,MAAM,UAAU,SAAS,EAAE,KAAK,SAAS,EAAE,IAAI;AAAA,YACnD,CAAC;AAAA,UACH;AAAA,UACA,UAAU,OAAO,WAAW,IAAI,CAAC,MAAM,EAAE,EAAE;AAAA,QAC7C;AAAA,QACA,cAAc;AAAA,UACZ,MAAM;AAAA,UACN,OAAO,EAAE,MAAM,UAAU,MAAM,OAAO,aAAa,IAAI,CAAC,MAAM,EAAE,EAAE,EAAE;AAAA,QACtE;AAAA,QACA,MAAM;AAAA,UACJ,MAAM;AAAA,UACN,OAAO,EAAE,MAAM,UAAU,MAAM,OAAO,KAAK,IAAI,CAAC,MAAM,EAAE,EAAE,EAAE;AAAA,QAC9D;AAAA,QACA,WAAW,EAAE,MAAM,SAAS;AAAA,MAC9B;AAAA,MACA,UAAU,CAAC,cAAc,WAAW;AAAA,IACtC;AAAA,EACF;AACF;AASA,SAAS,eAAe,YAAoC,QAAwB;AAClF,MAAI,WAAW;AACf,MAAI,cAAc;AAClB,aAAW,OAAO,OAAO,YAAY;AACnC,UAAM,MAAM,WAAW,IAAI,EAAE,KAAK;AAClC,UAAM,QAAQ,IAAI,MAAM,IAAI,OAAO;AACnC,UAAM,aAAa,KAAK,IAAI,GAAG,KAAK,IAAI,IAAI,MAAM,IAAI,OAAO,KAAK,CAAC;AACnE,gBAAY,aAAa,IAAI;AAC7B,mBAAe,IAAI;AAAA,EACrB;AACA,SAAO,cAAc,IAAI,WAAW,cAAc;AACpD;AAEA,SAAS,iBAAiB,SAAiB,SAA0B;AACnE,QAAM,MAAM,WAAW,OAAO,KAAK,OAAiB,EAAE,SAAS,KAAK,UAAU,OAAO,IAAI;AACzF,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA,MAAM,yCAAyC;AAAA,IAC/C,MAAM,MAAM;AAAA,EACd,EACG,OAAO,OAAO,EACd,KAAK,IAAI;AACd;AAEA,IAAM,sBAAsB;AAE5B,eAAsB,YAAY,KAAyC;AAEzE,MAAI;AACJ,MAAI,IAAI,YAAY;AAClB,UAAM,QAAQ,iBAAiB,IAAI,UAAU;AAC7C,QAAI,CAAC,OAAO;AACV,YAAM,IAAI,UAAU,oBAAoB,6BAA6B,IAAI,UAAU,MAAM,GAAG;AAAA,IAC9F;AACA,aAAS;AAAA,EACX,WAAW,IAAI,QAAQ;AACrB,aAAS,IAAI;AAAA,EACf,OAAO;AAEL,UAAM,IAAI,UAAU,oBAAoB,4CAA4C,GAAG;AAAA,EACzF;AAEA,QAAM,YAAY,KAAK,IAAI;AAC3B,QAAM,QAAQ,IAAI,SAAS;AAE3B,QAAM,EAAE,OAAO,OAAO,IAAI,MAAM,YAAyB;AAAA,IACvD;AAAA,IACA,UAAU;AAAA,MACR,EAAE,MAAM,UAAU,SAAS,OAAO,aAAa;AAAA,MAC/C,EAAE,MAAM,QAAQ,SAAS,iBAAiB,IAAI,SAAS,IAAI,OAAO,EAAE;AAAA,IACtE;AAAA,IACA,YAAY,kBAAkB,MAAM;AAAA,IACpC,aAAa;AAAA,IACb,WAAW;AAAA,EACb,CAAC;AAGD,MAAI,CAAC,SAAS,OAAO,UAAU,YAAY,CAAC,MAAM,YAAY;AAC5D,UAAM,IAAI,UAAU,eAAe,oCAAoC,KAAK,KAAK;AAAA,EACnF;AAEA,QAAM,YAAY,eAAe,MAAM,YAAY,MAAM;AACzD,QAAM,aAAa,KAAK,IAAI,IAAI;AAEhC,SAAO;AAAA,IACL;AAAA,IACA,YAAY,MAAM;AAAA,IAClB,cAAc,MAAM,gBAAgB,CAAC;AAAA,IACrC,MAAM,MAAM,QAAQ,CAAC;AAAA,IACrB,WAAW,MAAM;AAAA,IACjB,eAAe,WAAW,MAAM;AAAA,IAChC,OAAO,OAAO;AAAA,IACd;AAAA,EACF;AACF;AAIO,SAAS,oBAAyC;AACvD,SAAO,EAAE,SAAS,mBAAmB,EAAE;AACzC;AAQA,IAAI;AAEJ,SAAS,qBAA6B;AACpC,MAAI,eAAgB,QAAO;AAG3B,QAAM,OAAO,QAAQ,cAAc,YAAY,GAAG,CAAC;AACnD,QAAM,aAAa;AAAA,IACjB,QAAQ,MAAM,MAAM,MAAM,cAAc;AAAA;AAAA,IACxC,QAAQ,MAAM,MAAM,cAAc;AAAA;AAAA,EACpC;AACA,aAAW,QAAQ,YAAY;AAC7B,QAAI;AACF,YAAM,MAAM,KAAK,MAAM,aAAa,MAAM,OAAO,CAAC;AAClD,UAAI,IAAI,SAAS;AACf,yBAAiB,IAAI;AACrB,eAAO,IAAI;AAAA,MACb;AAAA,IACF,QAAQ;AAAA,IAER;AAAA,EACF;AACA,SAAO;AACT;AAEO,SAAS,gBAAiC;AAC/C,SAAO;AAAA,IACL,SAAS;AAAA,IACT,SAAS,mBAAmB;AAAA,IAC5B,aAAa;AAAA,IACb,YAAY,CAAC,SAAS,eAAe,SAAS;AAAA,EAChD;AACF;;;AC9LA,SAAS,qBAAqB,uBAAuB;AAa9C,SAAS,aAAa,gBAAuC;AAClE,QAAM,WAAW,IAAI,gBAAgB;AAGrC,WAAS,SAAS,gBAAgB,kBAAkB;AACpD,WAAS,SAAS,eAAe,iBAAiB;AAClD,WAAS,SAAS,uBAAuB,yBAAyB;AAClE,WAAS,SAAS,mBAAmB,qBAAqB;AAC1D,WAAS,SAAS,kBAAkB,oBAAoB;AACxD,WAAS,SAAS,iBAAiB,mBAAmB;AAGtD,WAAS,aAAa;AAAA,IACpB,QAAQ;AAAA,IACR,MAAM;AAAA,IACN,SAAS;AAAA,IACT,aACE;AAAA,IACF,SAAS;AAAA,MACP,MAAM;AAAA,QACJ,SAAS;AAAA,UACP,oBAAoB,EAAE,QAAQ,mBAAmB;AAAA,QACnD;AAAA,MACF;AAAA,IACF;AAAA,IACA,WAAW;AAAA,MACT,KAAK;AAAA,QACH,aAAa;AAAA,QACb,SAAS,EAAE,oBAAoB,EAAE,QAAQ,kBAAkB,EAAE;AAAA,MAC/D;AAAA,MACA,KAAK;AAAA,QACH,aAAa;AAAA,QACb,SAAS,EAAE,oBAAoB,EAAE,QAAQ,oBAAoB,EAAE;AAAA,MACjE;AAAA,MACA,KAAK;AAAA,QACH,aAAa;AAAA,QACb,SAAS,EAAE,oBAAoB,EAAE,QAAQ,oBAAoB,EAAE;AAAA,MACjE;AAAA,MACA,KAAK;AAAA,QACH,aAAa;AAAA,QACb,SAAS,EAAE,oBAAoB,EAAE,QAAQ,oBAAoB,EAAE;AAAA,MACjE;AAAA,IACF;AAAA,EACF,CAAC;AAED,WAAS,aAAa;AAAA,IACpB,QAAQ;AAAA,IACR,MAAM;AAAA,IACN,SAAS;AAAA,IACT,aACE;AAAA,IACF,WAAW;AAAA,MACT,KAAK;AAAA,QACH,aAAa;AAAA,QACb,SAAS,EAAE,oBAAoB,EAAE,QAAQ,0BAA0B,EAAE;AAAA,MACvE;AAAA,IACF;AAAA,EACF,CAAC;AAED,WAAS,aAAa;AAAA,IACpB,QAAQ;AAAA,IACR,MAAM;AAAA,IACN,SAAS;AAAA,IACT,aAAa;AAAA,IACb,WAAW;AAAA,MACT,KAAK;AAAA,QACH,aAAa;AAAA,QACb,SAAS,EAAE,oBAAoB,EAAE,QAAQ,sBAAsB,EAAE;AAAA,MACnE;AAAA,IACF;AAAA,EACF,CAAC;AAED,WAAS,aAAa;AAAA,IACpB,QAAQ;AAAA,IACR,MAAM;AAAA,IACN,SAAS;AAAA,IACT,WAAW;AAAA,MACT,KAAK;AAAA,QACH,aAAa;AAAA,QACb,SAAS,EAAE,oBAAoB,EAAE,QAAQ,qBAAqB,EAAE;AAAA,MAClE;AAAA,IACF;AAAA,EACF,CAAC;AAED,QAAM,YAAY,IAAI,oBAAoB,SAAS,WAAW;AAC9D,SAAO,UAAU,iBAAiB;AAAA,IAChC,SAAS;AAAA,IACT,MAAM;AAAA,MACJ,OAAO;AAAA,MACP,SAAS;AAAA,MACT,aAAa;AAAA;AAAA,yBAEM,YAAY;AAAA,MAC/B,SAAS,EAAE,MAAM,kBAAkB,KAAK,+CAA+C;AAAA,MACvF,SAAS,EAAE,MAAM,MAAM;AAAA,IACzB;AAAA,IACA,SAAS,CAAC,EAAE,KAAK,yBAAyB,aAAa,yBAAyB,CAAC;AAAA,EACnF,CAAC;AACH;;;AC/GA,SAAS,aAA8B;AACvC,SAAS,YAAY;AACrB,SAAS,YAAY;AAWrB,IAAM,aAAa,KAAK,IAAI;AAErB,SAAS,YAAY;AAC1B,QAAM,MAAM,IAAI,KAAK;AAErB,MAAI,IAAI,KAAK,KAAK,CAAC;AAEnB,MAAI,QAAQ,CAAC,KAAK,MAAM;AACtB,QAAI,eAAe,WAAW;AAC5B,aAAO,EAAE;AAAA,QACP,EAAE,OAAO,EAAE,MAAM,IAAI,MAAM,SAAS,IAAI,SAAS,SAAS,IAAI,QAAQ,EAAE;AAAA,QACxE,IAAI;AAAA,MACN;AAAA,IACF;AAEA,YAAQ,MAAM,iCAAiC,GAAG;AAClD,WAAO,EAAE;AAAA,MACP,EAAE,OAAO,EAAE,MAAM,kBAAkB,SAAS,yBAAyB,EAAE;AAAA,MACvE;AAAA,IACF;AAAA,EACF,CAAC;AAGD,MAAI;AAAA,IAAI;AAAA,IAAY,CAAC,MACnB,EAAE,KAAK,EAAE,QAAQ,MAAe,YAAY,KAAK,IAAI,IAAI,cAAc,IAAK,CAAC;AAAA,EAC/E;AAGA,MAAI,IAAI,eAAe,CAAC,MAAM,EAAE,KAAK,cAAc,CAAC,CAAC;AAGrD,MAAI,IAAI,eAAe,CAAC,MAAM,EAAE,KAAK,kBAAkB,CAAC,CAAC;AAGzD,MAAI,KAAK,aAAa,OAAO,MAAM;AACjC,UAAM,MAAM,MAAM,EAAE,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI;AAC/C,QAAI,OAAO,MAAM;AACf,YAAM,IAAI,UAAU,oBAAoB,8BAA8B,GAAG;AAAA,IAC3E;AACA,UAAM,SAAS,mBAAmB,UAAU,GAAG;AAC/C,QAAI,CAAC,OAAO,SAAS;AACnB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,QACA;AAAA,QACA,OAAO,MAAM;AAAA,MACf;AAAA,IACF;AACA,UAAM,SAAS,MAAM,YAAY,OAAO,IAAI;AAC5C,WAAO,EAAE,KAAK,MAAM;AAAA,EACtB,CAAC;AAGD,MAAI,IAAI,iBAAiB,CAAC,MAAM,EAAE,KAAK,aAAa,cAAc,EAAE,OAAO,CAAC,CAAC;AAE7E,SAAO;AACT;AASO,SAAS,YAAY,OAAqB,CAAC,GAAe;AAC/D,QAAM,MAAM,UAAU;AACtB,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,OAAO,KAAK,QAAQ;AAC1B,SAAO,MAAM,EAAE,OAAO,IAAI,OAAO,MAAM,UAAU,KAAK,GAAG,CAAC,EAAE,SAAS,MAAM,WAAW,MAAM;AAE1F,YAAQ,IAAI,kCAAkC,OAAO,IAAI,UAAU,EAAE;AAAA,EACvE,CAAC;AACH;;;ACpEA,eAAsB,YAAY,KAAiD;AACjF,MAAI;AACF,YAAQ,IAAI,QAAQ;AAAA,MAClB,KAAK,SAAS;AACZ,cAAM,SAAS,mBAAmB,UAAU,IAAI,MAAM;AACtD,YAAI,CAAC,OAAO,SAAS;AACnB,iBAAO;AAAA,YACL,OAAO;AAAA,cACL,MAAM;AAAA,cACN,SAAS;AAAA,cACT,SAAS,OAAO,MAAM;AAAA,YACxB;AAAA,UACF;AAAA,QACF;AACA,eAAO,EAAE,QAAQ,MAAM,YAAY,OAAO,IAAI,EAAE;AAAA,MAClD;AAAA,MACA,KAAK;AACH,eAAO,EAAE,QAAQ,kBAAkB,EAAE;AAAA,MACvC,KAAK;AACH,eAAO,EAAE,QAAQ,cAAc,EAAE;AAAA,MACnC;AACE,eAAO;AAAA,UACL,OAAO;AAAA,YACL,MAAM;AAAA,YACN,SAAS,mBAAoB,IAA2B,MAAM;AAAA,UAChE;AAAA,QACF;AAAA,IACJ;AAAA,EACF,SAAS,KAAK;AACZ,QAAI,eAAe,WAAW;AAC5B,aAAO,EAAE,OAAO,EAAE,MAAM,IAAI,MAAM,SAAS,IAAI,SAAS,SAAS,IAAI,QAAQ,EAAE;AAAA,IACjF;AACA,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,WAAO,EAAE,OAAO,EAAE,MAAM,kBAAkB,QAAQ,EAAE;AAAA,EACtD;AACF;AAIA,eAAe,QAAQ,QAAgD;AACrE,QAAM,SAAmB,CAAC;AAC1B,mBAAiB,SAAS,QAAQ;AAChC,WAAO,KAAK,OAAO,SAAS,KAAK,IAAI,QAAQ,OAAO,KAAK,KAAe,CAAC;AAAA,EAC3E;AACA,SAAO,OAAO,OAAO,MAAM,EAAE,SAAS,OAAO;AAC/C;AAGA,eAAsB,WAAW,QAAkC;AACjE,QAAM,MAAM,MAAM,QAAQ,QAAQ,KAAK;AACvC,MAAI;AACJ,MAAI;AACF,UAAM,OAAO,KAAK,MAAM,GAAG;AAC3B,UAAM,SAAS,EAAE,QAAwC,QAAQ,KAAK,IAAK;AAAA,EAC7E,SAAS,KAAK;AACZ,YAAQ,OAAO;AAAA,MACb,KAAK,UAAU;AAAA,QACb,OAAO;AAAA,UACL,MAAM;AAAA,UACN,SAAS,6BAA6B,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,QACxF;AAAA,MACF,CAAC,IAAI;AAAA,IACP;AACA,WAAO;AAAA,EACT;AACA,QAAM,MAAM,MAAM,YAAY,GAAG;AACjC,UAAQ,OAAO,MAAM,KAAK,UAAU,GAAG,IAAI,IAAI;AAC/C,SAAO,WAAW,MAAM,IAAI;AAC9B;AAGA,eAAsB,YAAY,QAAkC;AAClE,QAAM,MAAM,MAAM,QAAQ,QAAQ,KAAK;AACvC,QAAM,QAAQ,IAAI,MAAM,IAAI,EAAE,OAAO,CAAC,MAAM,EAAE,KAAK,EAAE,SAAS,CAAC;AAC/D,MAAI,WAAW;AACf,aAAW,QAAQ,OAAO;AACxB,QAAI;AACJ,QAAI;AACF,YAAM,OAAO,KAAK,MAAM,IAAI;AAC5B,YAAM,SAAS,EAAE,QAAwC,QAAQ,KAAK,IAAK;AAAA,IAC7E,SAAS,KAAK;AACZ,cAAQ,OAAO;AAAA,QACb,KAAK,UAAU;AAAA,UACb,OAAO;AAAA,YACL,MAAM;AAAA,YACN,SAAS,4BAA4B,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,UACvF;AAAA,QACF,CAAC,IAAI;AAAA,MACP;AACA,iBAAW;AACX;AAAA,IACF;AACA,UAAM,MAAM,MAAM,YAAY,GAAG;AACjC,YAAQ,OAAO,MAAM,KAAK,UAAU,GAAG,IAAI,IAAI;AAC/C,QAAI,WAAW,IAAK,YAAW;AAAA,EACjC;AACA,SAAO;AACT;","names":[]}
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/benchmarks/index.ts","../src/benchmarks/types.ts","../src/benchmarks/routing/index.ts","../src/benchmarks/routing/dataset.ts"],"sourcesContent":["/**\n * Reference benchmark wrappers — entry point.\n *\n * Core surface (exported here):\n * - The `BenchmarkAdapter` contract.\n * - `deterministicSplit` + `BENCHMARK_SPLIT_SEED` for split assignment.\n * - `routing` — synthetic 16-task router benchmark. The only novel\n * benchmark we built; ships in the package.\n *\n * Example wrappers (under `examples/benchmarks/`, NOT in the bundle):\n * - `gsm8k` — exact-match math reasoning (HF mirror, dataset\n * not bundled).\n * - `swebench-lite` — 30-instance SWE-Bench subset (stub; needs an\n * external grader).\n *\n * The example wrappers are reference implementations of `BenchmarkAdapter`.\n * Read them, copy them, adapt them. They're intentionally not in the main\n * entry — every team will configure them differently.\n */\n\nexport type {\n BenchmarkAdapter,\n BenchmarkDatasetItem,\n BenchmarkEvaluation,\n} from './types'\nexport { deterministicSplit, BENCHMARK_SPLIT_SEED } from './types'\n\nexport * as routing from './routing/index'\n","/**\n * Shared types for the reference benchmark wrappers under\n * `src/benchmarks/`. Each wrapper exports the three functions in\n * `BenchmarkAdapter` plus its own typed `DatasetItem` shape.\n */\n\nimport type { RunSplitTag } from '../run-record'\n\nexport interface BenchmarkDatasetItem<TPayload = unknown> {\n /** Stable dataset-local item id (used for split assignment + paper\n * references). Unique within a benchmark. */\n id: string\n /** Free-form payload. Each benchmark defines its own shape. */\n payload: TPayload\n}\n\nexport interface BenchmarkEvaluation {\n /** [0, 1] score for the response on this item. Exact-match\n * benchmarks use 0/1; partial-credit benchmarks may return\n * fractional values. */\n score: number\n /** Optional bag of raw scoring signals — e.g. parsed numeric\n * answer, regex match, judge sub-scores. */\n raw: Record<string, unknown>\n}\n\n/** Common signature implemented by every adapter under `src/benchmarks/*`. */\n// `TPayload` is the per-item payload type; `_TItem` is preserved for\n// downstream type-narrowing extensions (a richer `BenchmarkDatasetItem`\n// subclass that adds e.g. provenance metadata) but is intentionally\n// unused here. `noUnusedLocals` requires the leading underscore.\nexport interface BenchmarkAdapter<_TItem = unknown, TPayload = unknown> {\n /** Load the dataset for the given split. May hit the network on\n * first call but should be cache-friendly. Adapters that don't\n * ship the dataset itself MUST throw a clearly-marked error\n * pointing the caller at the loader script. */\n loadDataset(split: RunSplitTag): Promise<BenchmarkDatasetItem<TPayload>[]>\n /** Score a single response. Pure with respect to the inputs. */\n evaluate(item: BenchmarkDatasetItem<TPayload>, response: string): Promise<BenchmarkEvaluation>\n /** Deterministic split assignment via item id hashing. The\n * fraction of items in each split is implementation-defined but\n * MUST be stable across processes and platforms. */\n assignSplit(itemId: string): RunSplitTag\n}\n\n// ── Deterministic split assignment ───────────────────────────────────\n\n/**\n * 32-bit FNV-1a hash. Stable, allocation-free, deterministic across\n * runtimes. We use it to assign items to splits rather than depending\n * on a polyfilled crypto.subtle path.\n */\nfunction fnv1a32(input: string): number {\n let h = 0x811c9dc5\n for (let i = 0; i < input.length; i++) {\n h ^= input.charCodeAt(i) & 0xff\n h = (h + ((h << 1) + (h << 4) + (h << 7) + (h << 8) + (h << 24))) >>> 0\n }\n return h >>> 0\n}\n\n/** Split-assignment seed shared across all benchmarks. Bumping this\n * value reshuffles every split — do NOT do that lightly. */\nexport const BENCHMARK_SPLIT_SEED = 'agent-eval-v1'\n\n/**\n * Assign an item id to one of `'search' | 'dev' | 'holdout'` using a\n * stable 32-bit hash of `${seed}::${id}`. Default proportions:\n *\n * search: 60% (optimization-readable)\n * dev: 20% (held-out for tuning, leak-on-purpose during dev)\n * holdout:20% (paper-grade held-out, gated reads)\n */\nexport function deterministicSplit(\n itemId: string,\n seed: string = BENCHMARK_SPLIT_SEED,\n): RunSplitTag {\n const h = fnv1a32(`${seed}::${itemId}`)\n const pos = h / 0x100000000\n if (pos < 0.6) return 'search'\n if (pos < 0.8) return 'dev'\n return 'holdout'\n}\n","/**\n * Routing benchmark — synthetic, dependency-free, ships in the\n * package. 16 cross-category items in `dataset.ts`. See\n * `routing/README.md` for the format.\n *\n * `evaluate` does case-insensitive exact match against the canonical\n * route plus declared synonyms. The first valid route token in the\n * response wins; everything else is ignored. Wrong answers also\n * report whether they hit a hard negative — useful when triaging\n * \"always picks the popular route\" failure modes.\n */\n\nimport type {\n BenchmarkAdapter,\n BenchmarkDatasetItem,\n BenchmarkEvaluation,\n} from '../types'\nimport { deterministicSplit } from '../types'\nimport type { RunSplitTag } from '../../run-record'\nimport { ROUTING_DATASET, type RoutingItem } from './dataset'\n\nexport type { RoutingItem }\nexport type RoutingPayload = RoutingItem\nexport type RoutingDatasetItem = BenchmarkDatasetItem<RoutingPayload>\n\nclass RoutingAdapter\n implements BenchmarkAdapter<RoutingDatasetItem, RoutingPayload>\n{\n async loadDataset(split: RunSplitTag): Promise<RoutingDatasetItem[]> {\n return ROUTING_DATASET\n .map((item) => ({ id: item.id, payload: item }))\n .filter((it) => assignSplitImpl(it.id) === split)\n }\n\n async evaluate(\n item: RoutingDatasetItem,\n response: string,\n ): Promise<BenchmarkEvaluation> {\n const tokens = extractRouteTokens(response)\n const correct = new Set<string>([item.payload.route, ...item.payload.synonyms].map((s) => s.toLowerCase()))\n const hardNeg = new Set<string>(item.payload.hardNegatives.map((s) => s.toLowerCase()))\n const firstMatch = tokens.find((t) => correct.has(t.toLowerCase())) ?? null\n const firstHardNeg = tokens.find((t) => hardNeg.has(t.toLowerCase())) ?? null\n const score = firstMatch ? 1 : 0\n return {\n score,\n raw: {\n firstToken: tokens[0] ?? null,\n matchedRoute: firstMatch,\n hitHardNegative: Boolean(firstHardNeg),\n hardNegativeRoute: firstHardNeg,\n category: item.payload.category,\n },\n }\n }\n\n assignSplit(itemId: string): RunSplitTag {\n return assignSplitImpl(itemId)\n }\n}\n\nfunction assignSplitImpl(itemId: string): RunSplitTag {\n return deterministicSplit(`routing::${itemId}`)\n}\n\n/**\n * Pull route-shaped tokens out of a model response. Routes look like\n * `category.action` (`fs.write`, `chat.reply`). Bare alphanumerics\n * are not routes, but `category.action` patterns are robust to most\n * model wrappers (JSON output, prose explanations, code fences).\n */\nexport function extractRouteTokens(response: string): string[] {\n const matches = response.match(/[a-z][a-z0-9_]*\\.[a-z][a-z0-9_]*/gi)\n return matches ?? []\n}\n\nconst adapter = new RoutingAdapter()\n\nexport const loadDataset = adapter.loadDataset.bind(adapter)\nexport const evaluate = adapter.evaluate.bind(adapter)\nexport const assignSplit = adapter.assignSplit.bind(adapter)\nexport { RoutingAdapter, ROUTING_DATASET }\n","/**\n * Synthetic routing dataset. 16 tasks across 4 categories. Used as a\n * deterministic, dependency-free benchmark for any router that maps a\n * natural-language request to one of a fixed set of route labels.\n *\n * Format (see `routing/README.md` for prose):\n *\n * {\n * id: stable per-task ID (matches across processes).\n * category: one of the four route labels.\n * prompt: the user-facing request the router must classify.\n * route: the ground-truth route the router should pick.\n * synonyms: other strings that count as a correct answer.\n * hardNegatives:close-but-wrong route labels — used to detect the\n * \"always picks the popular route\" failure mode.\n * }\n *\n * The four categories are intentionally cross-domain (file ops,\n * math, search, conversation) so a router that collapses to one\n * category is easy to spot.\n */\n\nexport interface RoutingItem {\n id: string\n category: 'file' | 'math' | 'search' | 'chat'\n prompt: string\n /** Canonical correct route label. */\n route: string\n /** Alternate route labels that also count as correct. */\n synonyms: string[]\n /** Wrong-but-tempting route labels (for analysis, not grading). */\n hardNegatives: string[]\n}\n\nexport const ROUTING_DATASET: RoutingItem[] = [\n {\n id: 'file_001',\n category: 'file',\n prompt: 'Save the meeting notes to /tmp/notes-2025-04.md as markdown.',\n route: 'fs.write',\n synonyms: ['filesystem.write', 'write_file'],\n hardNegatives: ['fs.read', 'chat.reply'],\n },\n {\n id: 'file_002',\n category: 'file',\n prompt: 'Read the contents of /etc/hosts and summarize the entries.',\n route: 'fs.read',\n synonyms: ['filesystem.read', 'read_file'],\n hardNegatives: ['fs.write', 'search.web'],\n },\n {\n id: 'file_003',\n category: 'file',\n prompt: 'List every Python file under src/ recursively.',\n route: 'fs.list',\n synonyms: ['filesystem.list', 'list_files'],\n hardNegatives: ['fs.read', 'search.code'],\n },\n {\n id: 'file_004',\n category: 'file',\n prompt: 'Delete the cached build at .turbo/cache.',\n route: 'fs.delete',\n synonyms: ['filesystem.delete', 'remove_file'],\n hardNegatives: ['fs.write', 'fs.list'],\n },\n {\n id: 'math_001',\n category: 'math',\n prompt: 'What is the integral of 3x^2 + 2x from 0 to 5?',\n route: 'math.integral',\n synonyms: ['calculator.integral', 'math.solve'],\n hardNegatives: ['math.derivative', 'chat.reply'],\n },\n {\n id: 'math_002',\n category: 'math',\n prompt: 'Compute the derivative of sin(x) * cos(x).',\n route: 'math.derivative',\n synonyms: ['calculator.derivative', 'math.solve'],\n hardNegatives: ['math.integral', 'math.algebra'],\n },\n {\n id: 'math_003',\n category: 'math',\n prompt: 'Solve 2x + 7 = 19 for x.',\n route: 'math.algebra',\n synonyms: ['calculator.algebra', 'math.solve'],\n hardNegatives: ['math.derivative', 'math.integral'],\n },\n {\n id: 'math_004',\n category: 'math',\n prompt: 'What is the prime factorization of 360?',\n route: 'math.numbertheory',\n synonyms: ['calculator.factor', 'math.solve'],\n hardNegatives: ['math.algebra', 'search.web'],\n },\n {\n id: 'search_001',\n category: 'search',\n prompt: 'Find recent papers on agent prompt optimization with held-out promotion gates.',\n route: 'search.web',\n synonyms: ['web.search', 'search.papers'],\n hardNegatives: ['search.code', 'chat.reply'],\n },\n {\n id: 'search_002',\n category: 'search',\n prompt: 'Search the codebase for every call site of `runProposeReview`.',\n route: 'search.code',\n synonyms: ['code.search', 'grep'],\n hardNegatives: ['search.web', 'fs.read'],\n },\n {\n id: 'search_003',\n category: 'search',\n prompt: 'What is the latest release of the Tangle network on GitHub?',\n route: 'search.web',\n synonyms: ['web.search', 'github.releases'],\n hardNegatives: ['search.code', 'chat.reply'],\n },\n {\n id: 'search_004',\n category: 'search',\n prompt: 'Find all TODO comments in the agent-eval src tree.',\n route: 'search.code',\n synonyms: ['code.search', 'grep'],\n hardNegatives: ['search.web', 'fs.list'],\n },\n {\n id: 'chat_001',\n category: 'chat',\n prompt: 'Hi there, how are you doing today?',\n route: 'chat.reply',\n synonyms: ['conversation.reply'],\n hardNegatives: ['search.web', 'fs.read'],\n },\n {\n id: 'chat_002',\n category: 'chat',\n prompt: 'Please explain the difference between an LLM and a foundation model.',\n route: 'chat.reply',\n synonyms: ['conversation.reply', 'qa.answer'],\n hardNegatives: ['search.web', 'math.algebra'],\n },\n {\n id: 'chat_003',\n category: 'chat',\n prompt: 'Tell me a short joke about distributed systems.',\n route: 'chat.reply',\n synonyms: ['conversation.reply'],\n hardNegatives: ['search.web', 'fs.read'],\n },\n {\n id: 'chat_004',\n category: 'chat',\n prompt: 'Acknowledge my last message with a thumbs up.',\n route: 'chat.reply',\n synonyms: ['conversation.reply', 'react'],\n hardNegatives: ['fs.write', 'search.web'],\n },\n]\n"],"mappings":";;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACoDA,SAAS,QAAQ,OAAuB;AACtC,MAAI,IAAI;AACR,WAAS,IAAI,GAAG,IAAI,MAAM,QAAQ,KAAK;AACrC,SAAK,MAAM,WAAW,CAAC,IAAI;AAC3B,QAAK,MAAM,KAAK,MAAM,KAAK,MAAM,KAAK,MAAM,KAAK,MAAM,KAAK,SAAU;AAAA,EACxE;AACA,SAAO,MAAM;AACf;AAIO,IAAM,uBAAuB;AAU7B,SAAS,mBACd,QACA,OAAe,sBACF;AACb,QAAM,IAAI,QAAQ,GAAG,IAAI,KAAK,MAAM,EAAE;AACtC,QAAM,MAAM,IAAI;AAChB,MAAI,MAAM,IAAK,QAAO;AACtB,MAAI,MAAM,IAAK,QAAO;AACtB,SAAO;AACT;;;AClFA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACkCO,IAAM,kBAAiC;AAAA,EAC5C;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,oBAAoB,YAAY;AAAA,IAC3C,eAAe,CAAC,WAAW,YAAY;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,mBAAmB,WAAW;AAAA,IACzC,eAAe,CAAC,YAAY,YAAY;AAAA,EAC1C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,mBAAmB,YAAY;AAAA,IAC1C,eAAe,CAAC,WAAW,aAAa;AAAA,EAC1C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,qBAAqB,aAAa;AAAA,IAC7C,eAAe,CAAC,YAAY,SAAS;AAAA,EACvC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,uBAAuB,YAAY;AAAA,IAC9C,eAAe,CAAC,mBAAmB,YAAY;AAAA,EACjD;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,yBAAyB,YAAY;AAAA,IAChD,eAAe,CAAC,iBAAiB,cAAc;AAAA,EACjD;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,sBAAsB,YAAY;AAAA,IAC7C,eAAe,CAAC,mBAAmB,eAAe;AAAA,EACpD;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,qBAAqB,YAAY;AAAA,IAC5C,eAAe,CAAC,gBAAgB,YAAY;AAAA,EAC9C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,cAAc,eAAe;AAAA,IACxC,eAAe,CAAC,eAAe,YAAY;AAAA,EAC7C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,eAAe,MAAM;AAAA,IAChC,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,cAAc,iBAAiB;AAAA,IAC1C,eAAe,CAAC,eAAe,YAAY;AAAA,EAC7C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,eAAe,MAAM;AAAA,IAChC,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,oBAAoB;AAAA,IAC/B,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,sBAAsB,WAAW;AAAA,IAC5C,eAAe,CAAC,cAAc,cAAc;AAAA,EAC9C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,oBAAoB;AAAA,IAC/B,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,sBAAsB,OAAO;AAAA,IACxC,eAAe,CAAC,YAAY,YAAY;AAAA,EAC1C;AACF;;;AD1IA,IAAM,iBAAN,MAEA;AAAA,EACE,MAAM,YAAY,OAAmD;AACnE,WAAO,gBACJ,IAAI,CAAC,UAAU,EAAE,IAAI,KAAK,IAAI,SAAS,KAAK,EAAE,EAC9C,OAAO,CAAC,OAAO,gBAAgB,GAAG,EAAE,MAAM,KAAK;AAAA,EACpD;AAAA,EAEA,MAAM,SACJ,MACA,UAC8B;AAC9B,UAAM,SAAS,mBAAmB,QAAQ;AAC1C,UAAM,UAAU,IAAI,IAAY,CAAC,KAAK,QAAQ,OAAO,GAAG,KAAK,QAAQ,QAAQ,EAAE,IAAI,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;AAC1G,UAAM,UAAU,IAAI,IAAY,KAAK,QAAQ,cAAc,IAAI,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;AACtF,UAAM,aAAa,OAAO,KAAK,CAAC,MAAM,QAAQ,IAAI,EAAE,YAAY,CAAC,CAAC,KAAK;AACvE,UAAM,eAAe,OAAO,KAAK,CAAC,MAAM,QAAQ,IAAI,EAAE,YAAY,CAAC,CAAC,KAAK;AACzE,UAAM,QAAQ,aAAa,IAAI;AAC/B,WAAO;AAAA,MACL;AAAA,MACA,KAAK;AAAA,QACH,YAAY,OAAO,CAAC,KAAK;AAAA,QACzB,cAAc;AAAA,QACd,iBAAiB,QAAQ,YAAY;AAAA,QACrC,mBAAmB;AAAA,QACnB,UAAU,KAAK,QAAQ;AAAA,MACzB;AAAA,IACF;AAAA,EACF;AAAA,EAEA,YAAY,QAA6B;AACvC,WAAO,gBAAgB,MAAM;AAAA,EAC/B;AACF;AAEA,SAAS,gBAAgB,QAA6B;AACpD,SAAO,mBAAmB,YAAY,MAAM,EAAE;AAChD;AAQO,SAAS,mBAAmB,UAA4B;AAC7D,QAAM,UAAU,SAAS,MAAM,oCAAoC;AACnE,SAAO,WAAW,CAAC;AACrB;AAEA,IAAM,UAAU,IAAI,eAAe;AAE5B,IAAM,cAAc,QAAQ,YAAY,KAAK,OAAO;AACpD,IAAM,WAAW,QAAQ,SAAS,KAAK,OAAO;AAC9C,IAAM,cAAc,QAAQ,YAAY,KAAK,OAAO;","names":[]}
|