@tangle-network/agent-eval 0.14.2 → 0.16.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -168,6 +168,51 @@ The `MutationTelemetry`, `LineageRecorder`, and `CostLedger` pass into the `code
168
168
 
169
169
  For the full primitive surface and rationale, read each module's JSDoc — `prompt-evolution.ts`, `composite-mutator.ts`, `sandbox-pool.ts`, `code-mutator.ts`, `reflective-mutation.ts`, `evolution-telemetry.ts`.
170
170
 
171
+ ## v0.16 highlights — production-rigor primitives
172
+
173
+ These are the primitives any team running prompt-optimization in production needs, regardless of whether they're writing a paper. v0.15 shipped them under "paper-grade" naming; v0.16 corrects that — they're production-first, paper-grade as a side effect.
174
+
175
+ - `HeldOutGate` — held-out paired-delta gate with `few_runs` /
176
+ `negative_delta` / `overfit_gap` rejection codes and a full evidence
177
+ block on every decision. Sits alongside the existing bootstrap-CI
178
+ `promotion-gate.ts`: that one asks "is this real or noise?", this one
179
+ asks "is this a real win on held-out data and not overfit?". Use both.
180
+ - `RunRecord` — typed run schema with mandatory snapshot-pinned `model`,
181
+ `promptHash`, `configHash`, `commitSha`, `costUsd`, `splitTag`.
182
+ Runtime validator throws on missing fields. Reproducibility falls
183
+ out for free.
184
+ - `pairedBootstrap`, `pairedWilcoxon`, `bhAdjust` — statistical
185
+ primitives every rigorous A/B test needs. Already-existing primitives
186
+ are re-exported under paper-style aliases.
187
+ - `runCanaries` — silent judge-fallback, calibration drift (KS test),
188
+ distribution shift (chi-square). Catches the failure mode where your
189
+ judge silently degrades to a constant-0.30 confidence and you ship
190
+ configs graded by a stub.
191
+ - `summaryTable`, `paretoChart`, `gainHistogram` — A/B reporting
192
+ helpers. `summaryTable` emits markdown with means + 95% bootstrap
193
+ CIs + paired Wilcoxon p (BH-adjusted) + Cohen's d. Useful for both
194
+ internal status reports and paper Table 1s.
195
+ - `Researcher` — stable interface for an external agent that drives the
196
+ meta-loop (`inspectFailures` → `proposeChange` → `applyChange` →
197
+ `evaluateChange`). Ships a `NoopResearcher` as a placeholder; real
198
+ implementations live downstream.
199
+ - `benchmarks/{gsm8k,swebench-lite,routing}` — reference benchmark
200
+ wrappers behind one `BenchmarkAdapter` shape, with deterministic
201
+ splits and fail-loud env-var configuration. Mostly for reproducible
202
+ comparisons; not core surface.
203
+
204
+ ### v0.16 changes from v0.15
205
+
206
+ - Renamed `paperTable` → `summaryTable`, `paretoFigure` → `paretoChart`,
207
+ `gainDistributionFigure` → `gainHistogram`. Underlying semantics
208
+ unchanged. Type names follow (`SummaryTable`, `SummaryTableOptions`,
209
+ `SummaryTableRow`).
210
+ - File: `src/paper-report.ts` → `src/summary-report.ts`.
211
+ - Dropped the "paper-grade" framing — the primitives are production-first.
212
+
213
+ See `CHANGELOG.md` for the full list. `.claude/skills/agent-eval/SKILL.md`
214
+ covers usage directives and pitfalls.
215
+
171
216
  ## Tech stack
172
217
 
173
218
  - TypeScript strict, no semicolons, single quotes, 2-space indent
@@ -0,0 +1,10 @@
1
+ var __defProp = Object.defineProperty;
2
+ var __export = (target, all) => {
3
+ for (var name in all)
4
+ __defProp(target, name, { get: all[name], enumerable: true });
5
+ };
6
+
7
+ export {
8
+ __export
9
+ };
10
+ //# sourceMappingURL=chunk-PZ5AY32C.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}
package/dist/cli.js CHANGED
@@ -7,6 +7,7 @@ import {
7
7
  startServer
8
8
  } from "./chunk-OZPRSK4A.js";
9
9
  import "./chunk-ITN4YOZY.js";
10
+ import "./chunk-PZ5AY32C.js";
10
11
 
11
12
  // src/cli.ts
12
13
  import { writeFileSync } from "fs";
package/dist/cli.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n/**\n * agent-eval CLI.\n *\n * agent-eval serve [--port 5005] [--host 127.0.0.1]\n * agent-eval rpc <method> # one request from stdin → one response on stdout\n * agent-eval rpc-batch <method> # JSONL stdin → JSONL stdout\n * agent-eval openapi [--out path] # write OpenAPI spec\n * agent-eval version\n *\n * <method> is one of: judge, listRubrics, version. When omitted, the\n * stdin payload must be a full {method, params} envelope.\n */\nimport { writeFileSync } from 'node:fs'\n\nimport { buildOpenApi } from './wire/openapi'\nimport { handleVersion } from './wire/handlers'\nimport { runRpcBatch, runRpcOnce } from './wire/rpc'\nimport { startServer } from './wire/server'\n\ninterface Args {\n command: string\n positional: string[]\n flags: Record<string, string>\n}\n\nfunction parseArgs(argv: string[]): Args {\n const [command, ...rest] = argv\n const positional: string[] = []\n const flags: Record<string, string> = {}\n for (let i = 0; i < rest.length; i++) {\n const tok = rest[i]\n if (tok.startsWith('--')) {\n const key = tok.slice(2)\n const next = rest[i + 1]\n if (next != null && !next.startsWith('--')) {\n flags[key] = next\n i++\n } else {\n flags[key] = 'true'\n }\n } else {\n positional.push(tok)\n }\n }\n return { command: command ?? 'help', positional, flags }\n}\n\nconst HELP = `agent-eval — wire-protocol entry point.\n\nCommands:\n serve [--port 5005] [--host 127.0.0.1]\n Start the HTTP server. POST /v1/judge, GET /v1/rubrics, GET /v1/version, GET /openapi.json.\n rpc <method>\n Read one JSON object from stdin (the params for <method>), write one\n JSON object to stdout. 
Method ∈ {judge, listRubrics, version}.\n rpc-batch <method>\n Like 'rpc' but JSONL in / JSONL out.\n openapi [--out openapi.json]\n Write the OpenAPI 3.1 spec.\n version\n Print server + wire-protocol version JSON.\n\nWithout arguments, prints this help.`\n\nasync function main(): Promise<number> {\n const { command, positional, flags } = parseArgs(process.argv.slice(2))\n\n switch (command) {\n case 'serve': {\n const port = Number(flags.port ?? 5005)\n const host = flags.host ?? '127.0.0.1'\n const server = startServer({ port, host })\n // Keep process alive on SIGINT/SIGTERM\n const shutdown = (sig: string) => {\n // eslint-disable-next-line no-console\n console.log(`[agent-eval] received ${sig}, shutting down`)\n server.close(() => process.exit(0))\n // Force exit after 5s if close hangs\n setTimeout(() => process.exit(1), 5000).unref()\n }\n process.on('SIGINT', () => shutdown('SIGINT'))\n process.on('SIGTERM', () => shutdown('SIGTERM'))\n // Block forever\n await new Promise(() => {})\n return 0\n }\n case 'rpc': {\n const [method] = positional\n return await runRpcOnce(method)\n }\n case 'rpc-batch': {\n const [method] = positional\n return await runRpcBatch(method)\n }\n case 'openapi': {\n const out = flags.out ?? 
'openapi.json'\n const spec = buildOpenApi(handleVersion().version)\n writeFileSync(out, JSON.stringify(spec, null, 2) + '\\n', 'utf-8')\n // eslint-disable-next-line no-console\n console.log(`[agent-eval] wrote OpenAPI 3.1 spec to ${out}`)\n return 0\n }\n case 'version': {\n process.stdout.write(JSON.stringify(handleVersion(), null, 2) + '\\n')\n return 0\n }\n case 'help':\n case '--help':\n case '-h':\n case '':\n process.stdout.write(HELP + '\\n')\n return 0\n default:\n process.stderr.write(`unknown command: ${command}\\n${HELP}\\n`)\n return 1\n }\n}\n\nmain()\n .then((code) => process.exit(code))\n .catch((err) => {\n // eslint-disable-next-line no-console\n console.error('[agent-eval] cli error:', err)\n process.exit(1)\n })\n"],"mappings":";;;;;;;;;;;AAaA,SAAS,qBAAqB;AAa9B,SAAS,UAAU,MAAsB;AACvC,QAAM,CAAC,SAAS,GAAG,IAAI,IAAI;AAC3B,QAAM,aAAuB,CAAC;AAC9B,QAAM,QAAgC,CAAC;AACvC,WAAS,IAAI,GAAG,IAAI,KAAK,QAAQ,KAAK;AACpC,UAAM,MAAM,KAAK,CAAC;AAClB,QAAI,IAAI,WAAW,IAAI,GAAG;AACxB,YAAM,MAAM,IAAI,MAAM,CAAC;AACvB,YAAM,OAAO,KAAK,IAAI,CAAC;AACvB,UAAI,QAAQ,QAAQ,CAAC,KAAK,WAAW,IAAI,GAAG;AAC1C,cAAM,GAAG,IAAI;AACb;AAAA,MACF,OAAO;AACL,cAAM,GAAG,IAAI;AAAA,MACf;AAAA,IACF,OAAO;AACL,iBAAW,KAAK,GAAG;AAAA,IACrB;AAAA,EACF;AACA,SAAO,EAAE,SAAS,WAAW,QAAQ,YAAY,MAAM;AACzD;AAEA,IAAM,OAAO;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAiBb,eAAe,OAAwB;AACrC,QAAM,EAAE,SAAS,YAAY,MAAM,IAAI,UAAU,QAAQ,KAAK,MAAM,CAAC,CAAC;AAEtE,UAAQ,SAAS;AAAA,IACf,KAAK,SAAS;AACZ,YAAM,OAAO,OAAO,MAAM,QAAQ,IAAI;AACtC,YAAM,OAAO,MAAM,QAAQ;AAC3B,YAAM,SAAS,YAAY,EAAE,MAAM,KAAK,CAAC;AAEzC,YAAM,WAAW,CAAC,QAAgB;AAEhC,gBAAQ,IAAI,yBAAyB,GAAG,iBAAiB;AACzD,eAAO,MAAM,MAAM,QAAQ,KAAK,CAAC,CAAC;AAElC,mBAAW,MAAM,QAAQ,KAAK,CAAC,GAAG,GAAI,EAAE,MAAM;AAAA,MAChD;AACA,cAAQ,GAAG,UAAU,MAAM,SAAS,QAAQ,CAAC;AAC7C,cAAQ,GAAG,WAAW,MAAM,SAAS,SAAS,CAAC;AAE/C,YAAM,IAAI,QAAQ,MAAM;AAAA,MAAC,CAAC;AAC1B,aAAO;AAAA,IACT;AAAA,IACA,KAAK,OAAO;AACV,YAAM,CAAC,MAAM,IAAI;AACjB,aAAO,MAAM,WAAW,MAAM;AAAA,IAChC;AAAA,IACA,KAAK,aAAa;AAChB,Y
AAM,CAAC,MAAM,IAAI;AACjB,aAAO,MAAM,YAAY,MAAM;AAAA,IACjC;AAAA,IACA,KAAK,WAAW;AACd,YAAM,MAAM,MAAM,OAAO;AACzB,YAAM,OAAO,aAAa,cAAc,EAAE,OAAO;AACjD,oBAAc,KAAK,KAAK,UAAU,MAAM,MAAM,CAAC,IAAI,MAAM,OAAO;AAEhE,cAAQ,IAAI,0CAA0C,GAAG,EAAE;AAC3D,aAAO;AAAA,IACT;AAAA,IACA,KAAK,WAAW;AACd,cAAQ,OAAO,MAAM,KAAK,UAAU,cAAc,GAAG,MAAM,CAAC,IAAI,IAAI;AACpE,aAAO;AAAA,IACT;AAAA,IACA,KAAK;AAAA,IACL,KAAK;AAAA,IACL,KAAK;AAAA,IACL,KAAK;AACH,cAAQ,OAAO,MAAM,OAAO,IAAI;AAChC,aAAO;AAAA,IACT;AACE,cAAQ,OAAO,MAAM,oBAAoB,OAAO;AAAA,EAAK,IAAI;AAAA,CAAI;AAC7D,aAAO;AAAA,EACX;AACF;AAEA,KAAK,EACF,KAAK,CAAC,SAAS,QAAQ,KAAK,IAAI,CAAC,EACjC,MAAM,CAAC,QAAQ;AAEd,UAAQ,MAAM,2BAA2B,GAAG;AAC5C,UAAQ,KAAK,CAAC;AAChB,CAAC;","names":[]}
1
+ {"version":3,"sources":["../src/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n/**\n * agent-eval CLI.\n *\n * agent-eval serve [--port 5005] [--host 127.0.0.1]\n * agent-eval rpc <method> # one request from stdin → one response on stdout\n * agent-eval rpc-batch <method> # JSONL stdin → JSONL stdout\n * agent-eval openapi [--out path] # write OpenAPI spec\n * agent-eval version\n *\n * <method> is one of: judge, listRubrics, version. When omitted, the\n * stdin payload must be a full {method, params} envelope.\n */\nimport { writeFileSync } from 'node:fs'\n\nimport { buildOpenApi } from './wire/openapi'\nimport { handleVersion } from './wire/handlers'\nimport { runRpcBatch, runRpcOnce } from './wire/rpc'\nimport { startServer } from './wire/server'\n\ninterface Args {\n command: string\n positional: string[]\n flags: Record<string, string>\n}\n\nfunction parseArgs(argv: string[]): Args {\n const [command, ...rest] = argv\n const positional: string[] = []\n const flags: Record<string, string> = {}\n for (let i = 0; i < rest.length; i++) {\n const tok = rest[i]\n if (tok.startsWith('--')) {\n const key = tok.slice(2)\n const next = rest[i + 1]\n if (next != null && !next.startsWith('--')) {\n flags[key] = next\n i++\n } else {\n flags[key] = 'true'\n }\n } else {\n positional.push(tok)\n }\n }\n return { command: command ?? 'help', positional, flags }\n}\n\nconst HELP = `agent-eval — wire-protocol entry point.\n\nCommands:\n serve [--port 5005] [--host 127.0.0.1]\n Start the HTTP server. POST /v1/judge, GET /v1/rubrics, GET /v1/version, GET /openapi.json.\n rpc <method>\n Read one JSON object from stdin (the params for <method>), write one\n JSON object to stdout. 
Method ∈ {judge, listRubrics, version}.\n rpc-batch <method>\n Like 'rpc' but JSONL in / JSONL out.\n openapi [--out openapi.json]\n Write the OpenAPI 3.1 spec.\n version\n Print server + wire-protocol version JSON.\n\nWithout arguments, prints this help.`\n\nasync function main(): Promise<number> {\n const { command, positional, flags } = parseArgs(process.argv.slice(2))\n\n switch (command) {\n case 'serve': {\n const port = Number(flags.port ?? 5005)\n const host = flags.host ?? '127.0.0.1'\n const server = startServer({ port, host })\n // Keep process alive on SIGINT/SIGTERM\n const shutdown = (sig: string) => {\n // eslint-disable-next-line no-console\n console.log(`[agent-eval] received ${sig}, shutting down`)\n server.close(() => process.exit(0))\n // Force exit after 5s if close hangs\n setTimeout(() => process.exit(1), 5000).unref()\n }\n process.on('SIGINT', () => shutdown('SIGINT'))\n process.on('SIGTERM', () => shutdown('SIGTERM'))\n // Block forever\n await new Promise(() => {})\n return 0\n }\n case 'rpc': {\n const [method] = positional\n return await runRpcOnce(method)\n }\n case 'rpc-batch': {\n const [method] = positional\n return await runRpcBatch(method)\n }\n case 'openapi': {\n const out = flags.out ?? 
'openapi.json'\n const spec = buildOpenApi(handleVersion().version)\n writeFileSync(out, JSON.stringify(spec, null, 2) + '\\n', 'utf-8')\n // eslint-disable-next-line no-console\n console.log(`[agent-eval] wrote OpenAPI 3.1 spec to ${out}`)\n return 0\n }\n case 'version': {\n process.stdout.write(JSON.stringify(handleVersion(), null, 2) + '\\n')\n return 0\n }\n case 'help':\n case '--help':\n case '-h':\n case '':\n process.stdout.write(HELP + '\\n')\n return 0\n default:\n process.stderr.write(`unknown command: ${command}\\n${HELP}\\n`)\n return 1\n }\n}\n\nmain()\n .then((code) => process.exit(code))\n .catch((err) => {\n // eslint-disable-next-line no-console\n console.error('[agent-eval] cli error:', err)\n process.exit(1)\n })\n"],"mappings":";;;;;;;;;;;;AAaA,SAAS,qBAAqB;AAa9B,SAAS,UAAU,MAAsB;AACvC,QAAM,CAAC,SAAS,GAAG,IAAI,IAAI;AAC3B,QAAM,aAAuB,CAAC;AAC9B,QAAM,QAAgC,CAAC;AACvC,WAAS,IAAI,GAAG,IAAI,KAAK,QAAQ,KAAK;AACpC,UAAM,MAAM,KAAK,CAAC;AAClB,QAAI,IAAI,WAAW,IAAI,GAAG;AACxB,YAAM,MAAM,IAAI,MAAM,CAAC;AACvB,YAAM,OAAO,KAAK,IAAI,CAAC;AACvB,UAAI,QAAQ,QAAQ,CAAC,KAAK,WAAW,IAAI,GAAG;AAC1C,cAAM,GAAG,IAAI;AACb;AAAA,MACF,OAAO;AACL,cAAM,GAAG,IAAI;AAAA,MACf;AAAA,IACF,OAAO;AACL,iBAAW,KAAK,GAAG;AAAA,IACrB;AAAA,EACF;AACA,SAAO,EAAE,SAAS,WAAW,QAAQ,YAAY,MAAM;AACzD;AAEA,IAAM,OAAO;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAiBb,eAAe,OAAwB;AACrC,QAAM,EAAE,SAAS,YAAY,MAAM,IAAI,UAAU,QAAQ,KAAK,MAAM,CAAC,CAAC;AAEtE,UAAQ,SAAS;AAAA,IACf,KAAK,SAAS;AACZ,YAAM,OAAO,OAAO,MAAM,QAAQ,IAAI;AACtC,YAAM,OAAO,MAAM,QAAQ;AAC3B,YAAM,SAAS,YAAY,EAAE,MAAM,KAAK,CAAC;AAEzC,YAAM,WAAW,CAAC,QAAgB;AAEhC,gBAAQ,IAAI,yBAAyB,GAAG,iBAAiB;AACzD,eAAO,MAAM,MAAM,QAAQ,KAAK,CAAC,CAAC;AAElC,mBAAW,MAAM,QAAQ,KAAK,CAAC,GAAG,GAAI,EAAE,MAAM;AAAA,MAChD;AACA,cAAQ,GAAG,UAAU,MAAM,SAAS,QAAQ,CAAC;AAC7C,cAAQ,GAAG,WAAW,MAAM,SAAS,SAAS,CAAC;AAE/C,YAAM,IAAI,QAAQ,MAAM;AAAA,MAAC,CAAC;AAC1B,aAAO;AAAA,IACT;AAAA,IACA,KAAK,OAAO;AACV,YAAM,CAAC,MAAM,IAAI;AACjB,aAAO,MAAM,WAAW,MAAM;AAAA,IAChC;AAAA,IACA,KAAK,aAAa;AAChB,
YAAM,CAAC,MAAM,IAAI;AACjB,aAAO,MAAM,YAAY,MAAM;AAAA,IACjC;AAAA,IACA,KAAK,WAAW;AACd,YAAM,MAAM,MAAM,OAAO;AACzB,YAAM,OAAO,aAAa,cAAc,EAAE,OAAO;AACjD,oBAAc,KAAK,KAAK,UAAU,MAAM,MAAM,CAAC,IAAI,MAAM,OAAO;AAEhE,cAAQ,IAAI,0CAA0C,GAAG,EAAE;AAC3D,aAAO;AAAA,IACT;AAAA,IACA,KAAK,WAAW;AACd,cAAQ,OAAO,MAAM,KAAK,UAAU,cAAc,GAAG,MAAM,CAAC,IAAI,IAAI;AACpE,aAAO;AAAA,IACT;AAAA,IACA,KAAK;AAAA,IACL,KAAK;AAAA,IACL,KAAK;AAAA,IACL,KAAK;AACH,cAAQ,OAAO,MAAM,OAAO,IAAI;AAChC,aAAO;AAAA,IACT;AACE,cAAQ,OAAO,MAAM,oBAAoB,OAAO;AAAA,EAAK,IAAI;AAAA,CAAI;AAC7D,aAAO;AAAA,EACX;AACF;AAEA,KAAK,EACF,KAAK,CAAC,SAAS,QAAQ,KAAK,IAAI,CAAC,EACjC,MAAM,CAAC,QAAQ;AAEd,UAAQ,MAAM,2BAA2B,GAAG;AAC5C,UAAQ,KAAK,CAAC;AAChB,CAAC;","names":[]}