npm - @tangle-network/agent-eval - Versions diffs - 0.14.2 → 0.16.1 - Mend

@tangle-network/agent-eval 0.14.2 → 0.16.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

package/README.md +45 -0
package/dist/chunk-PZ5AY32C.js +10 -0
package/dist/chunk-PZ5AY32C.js.map +1 -0
package/dist/cli.js +1 -0
package/dist/cli.js.map +1 -1
package/dist/index.d.ts +963 -4
package/dist/index.js +1456 -132
package/dist/index.js.map +1 -1
package/dist/telemetry/file.js +2 -0
package/dist/telemetry/file.js.map +1 -1
package/dist/telemetry/index.js +2 -0
package/dist/telemetry/index.js.map +1 -1
package/dist/wire/index.js +1 -0
package/package.json +10 -12

package/README.md CHANGED Viewed

@@ -168,6 +168,51 @@ The `MutationTelemetry`, `LineageRecorder`, and `CostLedger` pass into the `code
 For the full primitive surface and rationale, read each module's JSDoc — `prompt-evolution.ts`, `composite-mutator.ts`, `sandbox-pool.ts`, `code-mutator.ts`, `reflective-mutation.ts`, `evolution-telemetry.ts`.
+## v0.16 highlights — production-rigor primitives
+These are the primitives any team running prompt-optimization in production needs, regardless of whether they're writing a paper. v0.15 shipped them under "paper-grade" naming; v0.16 corrects that — they're production-first, paper-grade as a side effect.
+- `HeldOutGate` — held-out paired-delta gate with `few_runs` /
+  `negative_delta` / `overfit_gap` rejection codes and a full evidence
+  block on every decision. Sits alongside the existing bootstrap-CI
+  `promotion-gate.ts`: that one asks "is this real or noise?", this one
+  asks "is this a real win on held-out data and not overfit?". Use both.
+- `RunRecord` — typed run schema with mandatory snapshot-pinned `model`,
+  `promptHash`, `configHash`, `commitSha`, `costUsd`, `splitTag`.
+  Runtime validator throws on missing fields. Reproducibility falls
+  out for free.
+- `pairedBootstrap`, `pairedWilcoxon`, `bhAdjust` — statistical
+  primitives every rigorous A/B test needs. Pre-existing primitives
+  are re-exported under paper-style aliases.
+- `runCanaries` — silent judge-fallback, calibration drift (KS test),
+  distribution shift (chi-square). Catches the failure mode where your
+  judge silently degrades to a constant-0.30 confidence and you ship
+  configs graded by a stub.
+- `summaryTable`, `paretoChart`, `gainHistogram` — A/B reporting
+  helpers. `summaryTable` emits markdown with means + 95% bootstrap
+  CIs + paired Wilcoxon p (BH-adjusted) + Cohen's d. Useful for both
+  internal status reports and paper Table 1s.
+- `Researcher` — stable interface for an external agent that drives the
+  meta-loop (`inspectFailures` → `proposeChange` → `applyChange` →
+  `evaluateChange`). Ships a `NoopResearcher` as a placeholder; real
+  implementations live downstream.
+- `benchmarks/{gsm8k,swebench-lite,routing}` — reference benchmark
+  wrappers behind one `BenchmarkAdapter` shape, with deterministic
+  splits and fail-loud env-var configuration. Mostly for reproducible
+  comparisons; not core surface.
+### v0.16 changes from v0.15
+- Renamed `paperTable` → `summaryTable`, `paretoFigure` → `paretoChart`,
+  `gainDistributionFigure` → `gainHistogram`. Underlying semantics
+  unchanged. Type names follow (`SummaryTable`, `SummaryTableOptions`,
+  `SummaryTableRow`).
+- File: `src/paper-report.ts` → `src/summary-report.ts`.
+- Drop the "paper-grade" framing — the primitives are production-first.
+See `CHANGELOG.md` for the full list. `.claude/skills/agent-eval/SKILL.md`
+covers usage directives and pitfalls.
 ## Tech stack
 - TypeScript strict, no semicolons, single quotes, 2-space indent

package/dist/chunk-PZ5AY32C.js ADDED Viewed

@@ -0,0 +1,10 @@
+var __defProp = Object.defineProperty;
+var __export = (target, all) => {
+  for (var name in all)
+    __defProp(target, name, { get: all[name], enumerable: true });
+};
+export {
+  __export
+};
+//# sourceMappingURL=chunk-PZ5AY32C.js.map

package/dist/chunk-PZ5AY32C.js.map ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}

package/dist/cli.js CHANGED Viewed

@@ -7,6 +7,7 @@ import {
   startServer
 } from "./chunk-OZPRSK4A.js";
 import "./chunk-ITN4YOZY.js";
+import "./chunk-PZ5AY32C.js";
 // src/cli.ts
 import { writeFileSync } from "fs";

package/dist/cli.js.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"sources":["../src/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n/*\n agent-eval CLI.\n \n agent-eval serve [--port 5005] [--host 127.0.0.1]\n * agent-eval rpc <method> # one request from stdin → one response on stdout\n * agent-eval rpc-batch <method> # JSONL stdin → JSONL stdout\n * agent-eval openapi [--out path] # write OpenAPI spec\n * agent-eval version\n \n <method> is one of: judge, listRubrics, version. When omitted, the\n * stdin payload must be a full {method, params} envelope.\n */\nimport { writeFileSync } from 'node:fs'\n\nimport { buildOpenApi } from './wire/openapi'\nimport { handleVersion } from './wire/handlers'\nimport { runRpcBatch, runRpcOnce } from './wire/rpc'\nimport { startServer } from './wire/server'\n\ninterface Args {\n command: string\n positional: string[]\n flags: Record<string, string>\n}\n\nfunction parseArgs(argv: string[]): Args {\n const [command, ...rest] = argv\n const positional: string[] = []\n const flags: Record<string, string> = {}\n for (let i = 0; i < rest.length; i++) {\n const tok = rest[i]\n if (tok.startsWith('--')) {\n const key = tok.slice(2)\n const next = rest[i + 1]\n if (next != null && !next.startsWith('--')) {\n flags[key] = next\n i++\n } else {\n flags[key] = 'true'\n }\n } else {\n positional.push(tok)\n }\n }\n return { command: command ?? 'help', positional, flags }\n}\n\nconst HELP = `agent-eval — wire-protocol entry point.\n\nCommands:\n serve [--port 5005] [--host 127.0.0.1]\n Start the HTTP server. POST /v1/judge, GET /v1/rubrics, GET /v1/version, GET /openapi.json.\n rpc <method>\n Read one JSON object from stdin (the params for <method>), write one\n JSON object to stdout. 
Method ∈ {judge, listRubrics, version}.\n rpc-batch <method>\n Like 'rpc' but JSONL in / JSONL out.\n openapi [--out openapi.json]\n Write the OpenAPI 3.1 spec.\n version\n Print server + wire-protocol version JSON.\n\nWithout arguments, prints this help.`\n\nasync function main(): Promise<number> {\n const { command, positional, flags } = parseArgs(process.argv.slice(2))\n\n switch (command) {\n case 'serve': {\n const port = Number(flags.port ?? 5005)\n const host = flags.host ?? '127.0.0.1'\n const server = startServer({ port, host })\n // Keep process alive on SIGINT/SIGTERM\n const shutdown = (sig: string) => {\n // eslint-disable-next-line no-console\n console.log(`[agent-eval] received ${sig}, shutting down`)\n server.close(() => process.exit(0))\n // Force exit after 5s if close hangs\n setTimeout(() => process.exit(1), 5000).unref()\n }\n process.on('SIGINT', () => shutdown('SIGINT'))\n process.on('SIGTERM', () => shutdown('SIGTERM'))\n // Block forever\n await new Promise(() => {})\n return 0\n }\n case 'rpc': {\n const [method] = positional\n return await runRpcOnce(method)\n }\n case 'rpc-batch': {\n const [method] = positional\n return await runRpcBatch(method)\n }\n case 'openapi': {\n const out = flags.out ?? 
'openapi.json'\n const spec = buildOpenApi(handleVersion().version)\n writeFileSync(out, JSON.stringify(spec, null, 2) + '\\n', 'utf-8')\n // eslint-disable-next-line no-console\n console.log(`[agent-eval] wrote OpenAPI 3.1 spec to ${out}`)\n return 0\n }\n case 'version': {\n process.stdout.write(JSON.stringify(handleVersion(), null, 2) + '\\n')\n return 0\n }\n case 'help':\n case '--help':\n case '-h':\n case '':\n process.stdout.write(HELP + '\\n')\n return 0\n default:\n process.stderr.write(`unknown command: ${command}\\n${HELP}\\n`)\n return 1\n }\n}\n\nmain()\n .then((code) => process.exit(code))\n .catch((err) => {\n // eslint-disable-next-line no-console\n console.error('[agent-eval] cli error:', err)\n process.exit(1)\n })\n"],"mappings":"~~;;;;;;;;;;;~~AAaA,SAAS,qBAAqB;AAa9B,SAAS,UAAU,MAAsB;AACvC,QAAM,CAAC,SAAS,GAAG,IAAI,IAAI;AAC3B,QAAM,aAAuB,CAAC;AAC9B,QAAM,QAAgC,CAAC;AACvC,WAAS,IAAI,GAAG,IAAI,KAAK,QAAQ,KAAK;AACpC,UAAM,MAAM,KAAK,CAAC;AAClB,QAAI,IAAI,WAAW,IAAI,GAAG;AACxB,YAAM,MAAM,IAAI,MAAM,CAAC;AACvB,YAAM,OAAO,KAAK,IAAI,CAAC;AACvB,UAAI,QAAQ,QAAQ,CAAC,KAAK,WAAW,IAAI,GAAG;AAC1C,cAAM,GAAG,IAAI;AACb;AAAA,MACF,OAAO;AACL,cAAM,GAAG,IAAI;AAAA,MACf;AAAA,IACF,OAAO;AACL,iBAAW,KAAK,GAAG;AAAA,IACrB;AAAA,EACF;AACA,SAAO,EAAE,SAAS,WAAW,QAAQ,YAAY,MAAM;AACzD;AAEA,IAAM,OAAO;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAiBb,eAAe,OAAwB;AACrC,QAAM,EAAE,SAAS,YAAY,MAAM,IAAI,UAAU,QAAQ,KAAK,MAAM,CAAC,CAAC;AAEtE,UAAQ,SAAS;AAAA,IACf,KAAK,SAAS;AACZ,YAAM,OAAO,OAAO,MAAM,QAAQ,IAAI;AACtC,YAAM,OAAO,MAAM,QAAQ;AAC3B,YAAM,SAAS,YAAY,EAAE,MAAM,KAAK,CAAC;AAEzC,YAAM,WAAW,CAAC,QAAgB;AAEhC,gBAAQ,IAAI,yBAAyB,GAAG,iBAAiB;AACzD,eAAO,MAAM,MAAM,QAAQ,KAAK,CAAC,CAAC;AAElC,mBAAW,MAAM,QAAQ,KAAK,CAAC,GAAG,GAAI,EAAE,MAAM;AAAA,MAChD;AACA,cAAQ,GAAG,UAAU,MAAM,SAAS,QAAQ,CAAC;AAC7C,cAAQ,GAAG,WAAW,MAAM,SAAS,SAAS,CAAC;AAE/C,YAAM,IAAI,QAAQ,MAAM;AAAA,MAAC,CAAC;AAC1B,aAAO;AAAA,IACT;AAAA,IACA,KAAK,OAAO;AACV,YAAM,CAAC,MAAM,IAAI;AACjB,aAAO,MAAM,WAAW,MAAM;AAAA,IAChC;AAAA,IACA,KAAK,aAAa;AAC
hB,YAAM,CAAC,MAAM,IAAI;AACjB,aAAO,MAAM,YAAY,MAAM;AAAA,IACjC;AAAA,IACA,KAAK,WAAW;AACd,YAAM,MAAM,MAAM,OAAO;AACzB,YAAM,OAAO,aAAa,cAAc,EAAE,OAAO;AACjD,oBAAc,KAAK,KAAK,UAAU,MAAM,MAAM,CAAC,IAAI,MAAM,OAAO;AAEhE,cAAQ,IAAI,0CAA0C,GAAG,EAAE;AAC3D,aAAO;AAAA,IACT;AAAA,IACA,KAAK,WAAW;AACd,cAAQ,OAAO,MAAM,KAAK,UAAU,cAAc,GAAG,MAAM,CAAC,IAAI,IAAI;AACpE,aAAO;AAAA,IACT;AAAA,IACA,KAAK;AAAA,IACL,KAAK;AAAA,IACL,KAAK;AAAA,IACL,KAAK;AACH,cAAQ,OAAO,MAAM,OAAO,IAAI;AAChC,aAAO;AAAA,IACT;AACE,cAAQ,OAAO,MAAM,oBAAoB,OAAO;AAAA,EAAK,IAAI;AAAA,CAAI;AAC7D,aAAO;AAAA,EACX;AACF;AAEA,KAAK,EACF,KAAK,CAAC,SAAS,QAAQ,KAAK,IAAI,CAAC,EACjC,MAAM,CAAC,QAAQ;AAEd,UAAQ,MAAM,2BAA2B,GAAG;AAC5C,UAAQ,KAAK,CAAC;AAChB,CAAC;","names":[]}
1	+ {"version":3,"sources":["../src/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n/*\n agent-eval CLI.\n \n agent-eval serve [--port 5005] [--host 127.0.0.1]\n * agent-eval rpc <method> # one request from stdin → one response on stdout\n * agent-eval rpc-batch <method> # JSONL stdin → JSONL stdout\n * agent-eval openapi [--out path] # write OpenAPI spec\n * agent-eval version\n \n <method> is one of: judge, listRubrics, version. When omitted, the\n * stdin payload must be a full {method, params} envelope.\n */\nimport { writeFileSync } from 'node:fs'\n\nimport { buildOpenApi } from './wire/openapi'\nimport { handleVersion } from './wire/handlers'\nimport { runRpcBatch, runRpcOnce } from './wire/rpc'\nimport { startServer } from './wire/server'\n\ninterface Args {\n command: string\n positional: string[]\n flags: Record<string, string>\n}\n\nfunction parseArgs(argv: string[]): Args {\n const [command, ...rest] = argv\n const positional: string[] = []\n const flags: Record<string, string> = {}\n for (let i = 0; i < rest.length; i++) {\n const tok = rest[i]\n if (tok.startsWith('--')) {\n const key = tok.slice(2)\n const next = rest[i + 1]\n if (next != null && !next.startsWith('--')) {\n flags[key] = next\n i++\n } else {\n flags[key] = 'true'\n }\n } else {\n positional.push(tok)\n }\n }\n return { command: command ?? 'help', positional, flags }\n}\n\nconst HELP = `agent-eval — wire-protocol entry point.\n\nCommands:\n serve [--port 5005] [--host 127.0.0.1]\n Start the HTTP server. POST /v1/judge, GET /v1/rubrics, GET /v1/version, GET /openapi.json.\n rpc <method>\n Read one JSON object from stdin (the params for <method>), write one\n JSON object to stdout. 
Method ∈ {judge, listRubrics, version}.\n rpc-batch <method>\n Like 'rpc' but JSONL in / JSONL out.\n openapi [--out openapi.json]\n Write the OpenAPI 3.1 spec.\n version\n Print server + wire-protocol version JSON.\n\nWithout arguments, prints this help.`\n\nasync function main(): Promise<number> {\n const { command, positional, flags } = parseArgs(process.argv.slice(2))\n\n switch (command) {\n case 'serve': {\n const port = Number(flags.port ?? 5005)\n const host = flags.host ?? '127.0.0.1'\n const server = startServer({ port, host })\n // Keep process alive on SIGINT/SIGTERM\n const shutdown = (sig: string) => {\n // eslint-disable-next-line no-console\n console.log(`[agent-eval] received ${sig}, shutting down`)\n server.close(() => process.exit(0))\n // Force exit after 5s if close hangs\n setTimeout(() => process.exit(1), 5000).unref()\n }\n process.on('SIGINT', () => shutdown('SIGINT'))\n process.on('SIGTERM', () => shutdown('SIGTERM'))\n // Block forever\n await new Promise(() => {})\n return 0\n }\n case 'rpc': {\n const [method] = positional\n return await runRpcOnce(method)\n }\n case 'rpc-batch': {\n const [method] = positional\n return await runRpcBatch(method)\n }\n case 'openapi': {\n const out = flags.out ?? 
'openapi.json'\n const spec = buildOpenApi(handleVersion().version)\n writeFileSync(out, JSON.stringify(spec, null, 2) + '\\n', 'utf-8')\n // eslint-disable-next-line no-console\n console.log(`[agent-eval] wrote OpenAPI 3.1 spec to ${out}`)\n return 0\n }\n case 'version': {\n process.stdout.write(JSON.stringify(handleVersion(), null, 2) + '\\n')\n return 0\n }\n case 'help':\n case '--help':\n case '-h':\n case '':\n process.stdout.write(HELP + '\\n')\n return 0\n default:\n process.stderr.write(`unknown command: ${command}\\n${HELP}\\n`)\n return 1\n }\n}\n\nmain()\n .then((code) => process.exit(code))\n .catch((err) => {\n // eslint-disable-next-line no-console\n console.error('[agent-eval] cli error:', err)\n process.exit(1)\n })\n"],"mappings":";;;;;;;;;;;;AAaA,SAAS,qBAAqB;AAa9B,SAAS,UAAU,MAAsB;AACvC,QAAM,CAAC,SAAS,GAAG,IAAI,IAAI;AAC3B,QAAM,aAAuB,CAAC;AAC9B,QAAM,QAAgC,CAAC;AACvC,WAAS,IAAI,GAAG,IAAI,KAAK,QAAQ,KAAK;AACpC,UAAM,MAAM,KAAK,CAAC;AAClB,QAAI,IAAI,WAAW,IAAI,GAAG;AACxB,YAAM,MAAM,IAAI,MAAM,CAAC;AACvB,YAAM,OAAO,KAAK,IAAI,CAAC;AACvB,UAAI,QAAQ,QAAQ,CAAC,KAAK,WAAW,IAAI,GAAG;AAC1C,cAAM,GAAG,IAAI;AACb;AAAA,MACF,OAAO;AACL,cAAM,GAAG,IAAI;AAAA,MACf;AAAA,IACF,OAAO;AACL,iBAAW,KAAK,GAAG;AAAA,IACrB;AAAA,EACF;AACA,SAAO,EAAE,SAAS,WAAW,QAAQ,YAAY,MAAM;AACzD;AAEA,IAAM,OAAO;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAiBb,eAAe,OAAwB;AACrC,QAAM,EAAE,SAAS,YAAY,MAAM,IAAI,UAAU,QAAQ,KAAK,MAAM,CAAC,CAAC;AAEtE,UAAQ,SAAS;AAAA,IACf,KAAK,SAAS;AACZ,YAAM,OAAO,OAAO,MAAM,QAAQ,IAAI;AACtC,YAAM,OAAO,MAAM,QAAQ;AAC3B,YAAM,SAAS,YAAY,EAAE,MAAM,KAAK,CAAC;AAEzC,YAAM,WAAW,CAAC,QAAgB;AAEhC,gBAAQ,IAAI,yBAAyB,GAAG,iBAAiB;AACzD,eAAO,MAAM,MAAM,QAAQ,KAAK,CAAC,CAAC;AAElC,mBAAW,MAAM,QAAQ,KAAK,CAAC,GAAG,GAAI,EAAE,MAAM;AAAA,MAChD;AACA,cAAQ,GAAG,UAAU,MAAM,SAAS,QAAQ,CAAC;AAC7C,cAAQ,GAAG,WAAW,MAAM,SAAS,SAAS,CAAC;AAE/C,YAAM,IAAI,QAAQ,MAAM;AAAA,MAAC,CAAC;AAC1B,aAAO;AAAA,IACT;AAAA,IACA,KAAK,OAAO;AACV,YAAM,CAAC,MAAM,IAAI;AACjB,aAAO,MAAM,WAAW,MAAM;AAAA,IAChC;AAAA,IACA,KAAK,aAAa;AAChB,
YAAM,CAAC,MAAM,IAAI;AACjB,aAAO,MAAM,YAAY,MAAM;AAAA,IACjC;AAAA,IACA,KAAK,WAAW;AACd,YAAM,MAAM,MAAM,OAAO;AACzB,YAAM,OAAO,aAAa,cAAc,EAAE,OAAO;AACjD,oBAAc,KAAK,KAAK,UAAU,MAAM,MAAM,CAAC,IAAI,MAAM,OAAO;AAEhE,cAAQ,IAAI,0CAA0C,GAAG,EAAE;AAC3D,aAAO;AAAA,IACT;AAAA,IACA,KAAK,WAAW;AACd,cAAQ,OAAO,MAAM,KAAK,UAAU,cAAc,GAAG,MAAM,CAAC,IAAI,IAAI;AACpE,aAAO;AAAA,IACT;AAAA,IACA,KAAK;AAAA,IACL,KAAK;AAAA,IACL,KAAK;AAAA,IACL,KAAK;AACH,cAAQ,OAAO,MAAM,OAAO,IAAI;AAChC,aAAO;AAAA,IACT;AACE,cAAQ,OAAO,MAAM,oBAAoB,OAAO;AAAA,EAAK,IAAI;AAAA,CAAI;AAC7D,aAAO;AAAA,EACX;AACF;AAEA,KAAK,EACF,KAAK,CAAC,SAAS,QAAQ,KAAK,IAAI,CAAC,EACjC,MAAM,CAAC,QAAQ;AAEd,UAAQ,MAAM,2BAA2B,GAAG;AAC5C,UAAQ,KAAK,CAAC;AAChB,CAAC;","names":[]}