@tangle-network/agent-eval 0.60.0 → 0.61.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. package/CHANGELOG.md +21 -0
  2. package/dist/adapters/http.d.ts +1 -1
  3. package/dist/adapters/langchain.d.ts +1 -1
  4. package/dist/adapters/otel.d.ts +5 -5
  5. package/dist/agent-profile-9J9hxdm2.d.ts +114 -0
  6. package/dist/benchmarks/index.d.ts +3 -3
  7. package/dist/builder-eval/index.js +2 -2
  8. package/dist/campaign/index.d.ts +151 -11
  9. package/dist/campaign/index.js +211 -10
  10. package/dist/campaign/index.js.map +1 -1
  11. package/dist/{chunk-QDOSODID.js → chunk-3B7Y5AUR.js} +2 -2
  12. package/dist/{chunk-QYJT52YW.js → chunk-3BFEG2F6.js} +1 -1
  13. package/dist/chunk-3BFEG2F6.js.map +1 -0
  14. package/dist/{chunk-J4DIMSRK.js → chunk-6EKXFFGQ.js} +2 -2
  15. package/dist/{chunk-63EPZQUZ.js → chunk-6REHLN5J.js} +2 -2
  16. package/dist/{chunk-GM476SZU.js → chunk-AIWHLG7J.js} +5 -5
  17. package/dist/{chunk-AIXHUIHG.js → chunk-B26KI423.js} +3 -3
  18. package/dist/{chunk-NCK5QLGT.js → chunk-F3SRAAZO.js} +2 -2
  19. package/dist/{chunk-GBHRUAOF.js → chunk-GMXHLSLL.js} +2 -2
  20. package/dist/{chunk-VXNVVBZO.js → chunk-IHDHUN2X.js} +2 -2
  21. package/dist/{chunk-S3SDD56V.js → chunk-ITBRCT73.js} +2 -2
  22. package/dist/{chunk-OLIBRKRD.js → chunk-KX6F6NCG.js} +2 -2
  23. package/dist/{chunk-NOPYCRNG.js → chunk-OLULBECP.js} +13 -2
  24. package/dist/chunk-OLULBECP.js.map +1 -0
  25. package/dist/chunk-PQV2TKC3.js +27 -0
  26. package/dist/chunk-PQV2TKC3.js.map +1 -0
  27. package/dist/{chunk-UBPIXOC4.js → chunk-SBCB6VZY.js} +2 -2
  28. package/dist/{chunk-LBSXXH56.js → chunk-SUGME4OT.js} +5 -5
  29. package/dist/chunk-SUGME4OT.js.map +1 -0
  30. package/dist/{chunk-YTMXBHFM.js → chunk-T375SUOZ.js} +2 -2
  31. package/dist/{chunk-PIEAE33T.js → chunk-Z4ZCBC7M.js} +2 -2
  32. package/dist/cli.js +3 -3
  33. package/dist/contract/index.d.ts +13 -13
  34. package/dist/contract/index.js +7 -7
  35. package/dist/{control-DjEgwWNo.d.ts → control-Bf8owbuG.d.ts} +2 -2
  36. package/dist/control.d.ts +5 -5
  37. package/dist/control.js +3 -3
  38. package/dist/{dataset-BlwAtYYf.d.ts → dataset-B2kL-fSM.d.ts} +1 -1
  39. package/dist/{errors-mje_cKOs.d.ts → errors-Dwqw-T_m.d.ts} +1 -1
  40. package/dist/{feedback-trajectory-DpUmE90J.d.ts → feedback-trajectory-8hKC5EOb.d.ts} +1 -1
  41. package/dist/governance/index.d.ts +3 -3
  42. package/dist/hosted/index.d.ts +5 -5
  43. package/dist/{index-wlaiph9Y.d.ts → index-Bvk35ils.d.ts} +1 -1
  44. package/dist/{index-BIkvdkSU.d.ts → index-D9dwa00f.d.ts} +2 -2
  45. package/dist/index.d.ts +24 -132
  46. package/dist/index.js +16 -29
  47. package/dist/index.js.map +1 -1
  48. package/dist/{integrity-CfXjSqEv.d.ts → integrity-CJzrpUua.d.ts} +1 -1
  49. package/dist/{llm-client-BXVRUZyX.d.ts → llm-client-DbjLfz-K.d.ts} +1 -1
  50. package/dist/meta-eval/index.d.ts +3 -3
  51. package/dist/openapi.json +1 -1
  52. package/dist/pipelines/index.js +3 -3
  53. package/dist/{provenance-BM8vmMBa.d.ts → provenance-D0WeCXt1.d.ts} +5 -5
  54. package/dist/{red-team-CrC5MZYd.d.ts → red-team-DW9Ca_tj.d.ts} +1 -1
  55. package/dist/{registry-DK9kqXvb.d.ts → registry-qmbYT3Eo.d.ts} +2 -2
  56. package/dist/{release-report-DmPjIce3.d.ts → release-report-DszkgvJ3.d.ts} +3 -3
  57. package/dist/reporting.d.ts +6 -6
  58. package/dist/reporting.js +4 -4
  59. package/dist/{researcher-JP8EvnLv.d.ts → researcher-BaVsy0sW.d.ts} +4 -4
  60. package/dist/rl.d.ts +9 -9
  61. package/dist/rl.js +7 -7
  62. package/dist/{rubric-predictive-validity-B3qNa4aY.d.ts → rubric-predictive-validity-DgBHWsh7.d.ts} +1 -1
  63. package/dist/run-campaign-HXPJAUZ3.js +10 -0
  64. package/dist/{run-record-etiCMsUq.d.ts → run-record-DgUVo5pw.d.ts} +1 -1
  65. package/dist/{summary-report-DLxh4yWk.d.ts → summary-report-BQvXpvaR.d.ts} +1 -1
  66. package/dist/traces.d.ts +2 -2
  67. package/dist/traces.js +3 -3
  68. package/dist/{types-VCIXx_yo.d.ts → types-Beb6KPqZ.d.ts} +21 -1
  69. package/dist/wire/index.d.ts +3 -3
  70. package/dist/wire/index.js +3 -3
  71. package/package.json +12 -25
  72. package/dist/chunk-LBSXXH56.js.map +0 -1
  73. package/dist/chunk-NOPYCRNG.js.map +0 -1
  74. package/dist/chunk-QYJT52YW.js.map +0 -1
  75. package/dist/run-campaign-5XENUKRF.js +0 -10
  76. /package/dist/{chunk-QDOSODID.js.map → chunk-3B7Y5AUR.js.map} +0 -0
  77. /package/dist/{chunk-J4DIMSRK.js.map → chunk-6EKXFFGQ.js.map} +0 -0
  78. /package/dist/{chunk-63EPZQUZ.js.map → chunk-6REHLN5J.js.map} +0 -0
  79. /package/dist/{chunk-GM476SZU.js.map → chunk-AIWHLG7J.js.map} +0 -0
  80. /package/dist/{chunk-AIXHUIHG.js.map → chunk-B26KI423.js.map} +0 -0
  81. /package/dist/{chunk-NCK5QLGT.js.map → chunk-F3SRAAZO.js.map} +0 -0
  82. /package/dist/{chunk-GBHRUAOF.js.map → chunk-GMXHLSLL.js.map} +0 -0
  83. /package/dist/{chunk-VXNVVBZO.js.map → chunk-IHDHUN2X.js.map} +0 -0
  84. /package/dist/{chunk-S3SDD56V.js.map → chunk-ITBRCT73.js.map} +0 -0
  85. /package/dist/{chunk-OLIBRKRD.js.map → chunk-KX6F6NCG.js.map} +0 -0
  86. /package/dist/{chunk-UBPIXOC4.js.map → chunk-SBCB6VZY.js.map} +0 -0
  87. /package/dist/{chunk-YTMXBHFM.js.map → chunk-T375SUOZ.js.map} +0 -0
  88. /package/dist/{chunk-PIEAE33T.js.map → chunk-Z4ZCBC7M.js.map} +0 -0
  89. /package/dist/{run-campaign-5XENUKRF.js.map → run-campaign-HXPJAUZ3.js.map} +0 -0
@@ -1 +1 @@
1
- {"version":3,"sources":["../../src/campaign/labeled-store/fs-adapter.ts","../../src/campaign/worktree/index.ts"],"sourcesContent":["/**\n * @experimental\n *\n * Filesystem `LabeledScenarioStore` adapter. The default capture sink for\n * traces + eval artifacts. Production deployments typically swap for a\n * Turso/SQLite adapter (same interface).\n *\n * Records land as one JSONL file per source under `<root>/<source>.jsonl`.\n * Each line is a `LabeledScenarioRecord`. Append-only — no in-place edits.\n *\n * Safety properties enforced at write-time:\n *\n * - **Provenance required**: writes without `source`, `sourceVersionHash`,\n * `capturedAt`, `redactionStatus` are rejected. Closes the alignment\n * reviewer's data-poisoning gap.\n * - **Per-source rate limits**: optional `rateLimitBucket` + `maxWritesPerMinute`\n * stops a single tenant/source from flooding the store.\n *\n * Safety properties enforced at sample-time:\n *\n * - **Required split + capturedBefore**: substrate refuses to sample without\n * an explicit `split` ('train' | 'test') AND a temporal cutoff. Eliminates\n * accidental train/test contamination.\n * - **Default training-source filter**: when the store is sampled with\n * `split: 'train'`, production-trace records are EXCLUDED unless the\n * caller passes `filter.source: 'production-trace'` explicitly. Closes\n * the contamination-by-default gap flagged by the senior eval engineer.\n */\n\nimport { createHash } from 'node:crypto'\nimport { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs'\nimport { join } from 'node:path'\nimport type {\n LabeledScenarioRecord,\n LabeledScenarioSampleArgs,\n LabeledScenarioSource,\n LabeledScenarioStore,\n LabeledScenarioWrite,\n LabelTrust,\n} from '../types'\nimport { labelTrustRank } from '../types'\n\nexport interface FsLabeledScenarioStoreOptions {\n /** Root directory for JSONL files. Created if missing. */\n root: string\n /** Per-source rate limit. When set, writes exceeding the cap are rejected\n * with a typed error. Default: no limit. */\n maxWritesPerMinutePerBucket?: number\n /** Test seam — override `Date.now()` for deterministic tests. */\n now?: () => number\n}\n\nexport class LabeledScenarioStoreError extends Error {\n constructor(\n public readonly code: string,\n message: string,\n ) {\n super(message)\n this.name = 'LabeledScenarioStoreError'\n }\n}\n\ninterface RateLimitState {\n bucket: string\n windowStartMs: number\n count: number\n}\n\nexport class FsLabeledScenarioStore implements LabeledScenarioStore {\n private readonly now: () => number\n private readonly rateLimits = new Map<string, RateLimitState>()\n\n constructor(private readonly options: FsLabeledScenarioStoreOptions) {\n if (!existsSync(options.root)) mkdirSync(options.root, { recursive: true })\n this.now = options.now ?? Date.now\n }\n\n async observe(write: LabeledScenarioWrite): Promise<void> {\n this.assertProvenance(write)\n this.assertRateLimit(write)\n const record = this.toRecord(write)\n const path = this.pathForSource(write.source)\n const line = `${JSON.stringify(record)}\\n`\n // Append atomically. For high-throughput a writev-friendly buffered\n // implementation lands in the Turso adapter; FS adapter is for tests +\n // local dev + small workloads.\n appendLine(path, line)\n }\n\n async sample(args: LabeledScenarioSampleArgs): Promise<LabeledScenarioRecord[]> {\n if (!args.split) {\n throw new LabeledScenarioStoreError(\n 'split_required',\n 'sample() requires an explicit `split` (train | test) — substrate refuses ambiguous reads',\n )\n }\n if (!args.capturedBefore) {\n throw new LabeledScenarioStoreError(\n 'capturedBefore_required',\n 'sample() requires an explicit `capturedBefore` timestamp for temporal-split discipline',\n )\n }\n\n const all: LabeledScenarioRecord[] = []\n for (const source of ALL_SOURCES) {\n // Default training-source filter: when sampling train, EXCLUDE\n // production-trace records unless the caller asks for them.\n if (args.split === 'train' && source === 'production-trace') {\n const explicit = sourceFilterContains(args.filter?.source, 'production-trace')\n if (!explicit) continue\n }\n const path = this.pathForSource(source)\n if (!existsSync(path)) continue\n const lines = readFileSync(path, 'utf8').split('\\n').filter(Boolean)\n for (const line of lines) {\n let record: LabeledScenarioRecord\n try {\n record = JSON.parse(line) as LabeledScenarioRecord\n } catch {\n continue\n }\n if (!matchesFilter(record, args, source)) continue\n all.push(record)\n }\n }\n\n // Deterministic order: by capturedAt ascending, then recordHash.\n all.sort((a, b) => {\n if (a.capturedAt !== b.capturedAt) return a.capturedAt.localeCompare(b.capturedAt)\n return a.recordHash.localeCompare(b.recordHash)\n })\n\n return all.slice(0, args.count)\n }\n\n async size(): Promise<{\n train: number\n test: number\n bySource: Record<string, number>\n byTrust: Record<LabelTrust, number>\n }> {\n const bySource: Record<string, number> = {}\n const byTrust: Record<LabelTrust, number> = {\n unverified: 0,\n 'verified-signal': 0,\n 'human-rated': 0,\n }\n let total = 0\n for (const source of ALL_SOURCES) {\n const path = this.pathForSource(source)\n if (!existsSync(path)) {\n bySource[source] = 0\n continue\n }\n const lines = readFileSync(path, 'utf8').split('\\n').filter(Boolean)\n bySource[source] = lines.length\n total += lines.length\n for (const line of lines) {\n let trust: LabelTrust = 'unverified'\n try {\n trust = (JSON.parse(line) as LabeledScenarioRecord).labelTrust ?? 'unverified'\n } catch {\n // A malformed line counts as unverified — never silently gold.\n }\n byTrust[trust] += 1\n }\n }\n // FS adapter doesn't track split assignments per-record (split is\n // computed at sample-time based on `capturedBefore`). For size(), we\n // report `train`+`test` as the same total — split is a sampling concept.\n return { train: total, test: total, bySource, byTrust }\n }\n\n private assertProvenance(write: LabeledScenarioWrite): void {\n if (!write.source) {\n throw new LabeledScenarioStoreError(\n 'missing_source',\n 'LabeledScenarioWrite requires `source`',\n )\n }\n if (!write.sourceVersionHash || write.sourceVersionHash.length === 0) {\n throw new LabeledScenarioStoreError(\n 'missing_source_version',\n 'LabeledScenarioWrite requires `sourceVersionHash` (git sha or substrate version)',\n )\n }\n if (!write.capturedAt) {\n throw new LabeledScenarioStoreError(\n 'missing_captured_at',\n 'LabeledScenarioWrite requires `capturedAt` ISO timestamp',\n )\n }\n if (!write.redactionStatus) {\n throw new LabeledScenarioStoreError(\n 'missing_redaction_status',\n 'LabeledScenarioWrite requires explicit `redactionStatus` — raw / redacted-pii / redacted-secrets / fully-redacted',\n )\n }\n if (!ALL_SOURCES.includes(write.source)) {\n throw new LabeledScenarioStoreError(\n 'unknown_source',\n `LabeledScenarioWrite.source must be one of: ${ALL_SOURCES.join(', ')}`,\n )\n }\n }\n\n private assertRateLimit(write: LabeledScenarioWrite): void {\n const cap = this.options.maxWritesPerMinutePerBucket\n if (!cap || !write.rateLimitBucket) return\n const now = this.now()\n const windowMs = 60_000\n let state = this.rateLimits.get(write.rateLimitBucket)\n if (!state || now - state.windowStartMs >= windowMs) {\n state = { bucket: write.rateLimitBucket, windowStartMs: now, count: 0 }\n this.rateLimits.set(write.rateLimitBucket, state)\n }\n if (state.count >= cap) {\n throw new LabeledScenarioStoreError(\n 'rate_limit_exceeded',\n `LabeledScenarioStore: bucket ${write.rateLimitBucket} exceeded ${cap} writes/min`,\n )\n }\n state.count += 1\n }\n\n private toRecord(write: LabeledScenarioWrite): LabeledScenarioRecord {\n const recordHash = sha256(\n JSON.stringify({\n id: write.scenario.id,\n src: write.source,\n at: write.capturedAt,\n ver: write.sourceVersionHash,\n }),\n )\n // FS adapter assigns split at sample-time, but we cache a hint here\n // based on capturedAt vs the world's \"now\" — sampler overrides this.\n return {\n ...write,\n recordHash,\n split: 'train',\n }\n }\n\n private pathForSource(source: string): string {\n return join(this.options.root, `${source}.jsonl`)\n }\n}\n\nconst ALL_SOURCES: LabeledScenarioWrite['source'][] = [\n 'production-trace',\n 'eval-run',\n 'manual',\n 'red-team',\n 'synthetic',\n]\n\nfunction sourceFilterContains(\n filter: LabeledScenarioSource | LabeledScenarioSource[] | undefined,\n needle: LabeledScenarioSource,\n): boolean {\n if (!filter) return false\n if (Array.isArray(filter)) return filter.includes(needle)\n return filter === needle\n}\n\nfunction matchesFilter(\n record: LabeledScenarioRecord,\n args: LabeledScenarioSampleArgs,\n source: string,\n): boolean {\n // Temporal cutoff — train must be capturedAt < capturedBefore.\n if (args.split === 'train' && record.capturedAt >= args.capturedBefore) return false\n if (args.split === 'test' && record.capturedAt < args.capturedBefore) return false\n\n const f = args.filter\n if (!f) return true\n if (f.kind && record.scenario.kind !== f.kind) return false\n if (f.source) {\n const sources = Array.isArray(f.source) ? f.source : [f.source]\n if (!sources.includes(source as never)) return false\n }\n if (f.minComposite !== undefined || f.maxComposite !== undefined) {\n const composites = Object.values(record.judgeScores).map((s) => s.composite)\n const max = composites.length === 0 ? 0 : Math.max(...composites)\n if (f.minComposite !== undefined && max < f.minComposite) return false\n if (f.maxComposite !== undefined && max > f.maxComposite) return false\n }\n if (f.minTrust !== undefined && labelTrustRank(record.labelTrust) < labelTrustRank(f.minTrust)) {\n return false\n }\n return true\n}\n\nfunction sha256(input: string): string {\n return createHash('sha256').update(input).digest('hex').slice(0, 16)\n}\n\nfunction appendLine(path: string, line: string): void {\n if (existsSync(path)) {\n const existing = readFileSync(path, 'utf8')\n writeFileSync(path, existing + line)\n } else {\n writeFileSync(path, line)\n }\n}\n","/**\n * @experimental\n *\n * VCS-pluggable worktree adapter. One improvement = one worktree, PR-like\n * (multiple commits allowed). A code-tier driver's `propose()` creates a\n * worktree, an agent commits the change into it, and `finalize()` returns a\n * `CodeSurface{ worktreeRef }` the measurement checks out to run the worker\n * against the changed code. On promotion the worktree becomes the PR branch.\n *\n * The interface is VCS-agnostic so a future `jj` ([jj-vcs](https://github.com/jj-vcs/jj))\n * adapter can slot in without touching driver code. Only the git adapter\n * ships today. See `docs/design/self-improvement-engine.md`.\n */\n\nimport { execFileSync } from 'node:child_process'\nimport { existsSync } from 'node:fs'\nimport { basename, isAbsolute, join } from 'node:path'\nimport type { CodeSurface } from '../types'\n\nexport interface Worktree {\n /** Absolute path to the checked-out worktree directory. */\n path: string\n /** The branch the worktree is on (becomes the PR branch on promotion). */\n branch: string\n /** The ref the worktree was forked from. */\n baseRef: string\n}\n\nexport interface WorktreeAdapter {\n /** Create an isolated worktree on a fresh branch off `baseRef`. */\n create(opts: { baseRef: string; label: string }): Promise<Worktree>\n /** Commit any pending changes in the worktree, then return a CodeSurface\n * pointing at it. The agent has already written its change into\n * `worktree.path` by the time this is called. */\n finalize(worktree: Worktree, summary: string): Promise<CodeSurface>\n /** Remove the worktree (and its branch) — called for losing candidates. */\n discard(worktree: Worktree): Promise<void>\n}\n\nexport class WorktreeAdapterError extends Error {\n constructor(\n message: string,\n readonly cause?: unknown,\n ) {\n super(message)\n this.name = 'WorktreeAdapterError'\n }\n}\n\nexport interface GitWorktreeAdapterOptions {\n /** Repo root the worktrees fork from. */\n repoRoot: string\n /** Directory worktrees are created under. Default: `<repoRoot>/.worktrees`. */\n worktreeDir?: string\n /** Branch-name prefix. Default: `improve`. */\n branchPrefix?: string\n /** Test seam — defaults to a real `git` runner. */\n git?: (args: string[], cwd: string) => string\n}\n\nfunction defaultGit(args: string[], cwd: string): string {\n try {\n return execFileSync('git', args, { cwd, encoding: 'utf8' }).trim()\n } catch (err) {\n const stderr =\n err && typeof err === 'object' && 'stderr' in err\n ? String((err as { stderr: unknown }).stderr)\n : ''\n throw new WorktreeAdapterError(`git ${args.join(' ')} failed: ${stderr || String(err)}`, err)\n }\n}\n\n/** Slugify a label into a branch-safe segment. */\nfunction slug(label: string): string {\n return (\n label\n .toLowerCase()\n .replace(/[^a-z0-9]+/g, '-')\n .replace(/^-+|-+$/g, '')\n .slice(0, 48) || 'candidate'\n )\n}\n\nexport function gitWorktreeAdapter(opts: GitWorktreeAdapterOptions): WorktreeAdapter {\n const git = opts.git ?? defaultGit\n const worktreeDir = opts.worktreeDir ?? join(opts.repoRoot, '.worktrees')\n const branchPrefix = opts.branchPrefix ?? 'improve'\n\n return {\n async create({ baseRef, label }) {\n const id = `${slug(label)}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 6)}`\n const branch = `${branchPrefix}/${id}`\n const path = join(worktreeDir, id)\n git(['worktree', 'add', '-b', branch, path, baseRef], opts.repoRoot)\n return { path, branch, baseRef }\n },\n\n async finalize(worktree, summary) {\n // Stage + commit any pending changes the agent left in the worktree.\n // A no-op commit is refused by git, so only commit when the tree is dirty.\n const status = git(['status', '--porcelain'], worktree.path)\n if (status.length > 0) {\n git(['add', '-A'], worktree.path)\n git(['commit', '-m', summary], worktree.path)\n }\n return {\n kind: 'code',\n worktreeRef: worktree.path,\n baseRef: worktree.baseRef,\n summary,\n }\n },\n\n async discard(worktree) {\n // Remove the worktree, then delete its branch. Force-remove because the\n // worktree may hold uncommitted experiment state we're discarding.\n git(['worktree', 'remove', '--force', worktree.path], opts.repoRoot)\n git(['branch', '-D', worktree.branch], opts.repoRoot)\n },\n }\n}\n\n/** Resolve a `CodeSurface`'s worktreeRef to a directory the measurement can\n * run the worker in. A path ref is returned as-is; anything else is treated\n * as a ref under the adapter's worktree dir. */\nexport function resolveWorktreePath(surface: CodeSurface, worktreeDir?: string): string {\n if (isAbsolute(surface.worktreeRef) && existsSync(surface.worktreeRef)) return surface.worktreeRef\n if (worktreeDir) return join(worktreeDir, basename(surface.worktreeRef))\n return surface.worktreeRef\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AA6BA,SAAS,kBAAkB;AAC3B,SAAS,YAAY,WAAW,cAAc,qBAAqB;AACnE,SAAS,YAAY;AAqBd,IAAM,4BAAN,cAAwC,MAAM;AAAA,EACnD,YACkB,MAChB,SACA;AACA,UAAM,OAAO;AAHG;AAIhB,SAAK,OAAO;AAAA,EACd;AAAA,EALkB;AAMpB;AAQO,IAAM,yBAAN,MAA6D;AAAA,EAIlE,YAA6B,SAAwC;AAAxC;AAC3B,QAAI,CAAC,WAAW,QAAQ,IAAI,EAAG,WAAU,QAAQ,MAAM,EAAE,WAAW,KAAK,CAAC;AAC1E,SAAK,MAAM,QAAQ,OAAO,KAAK;AAAA,EACjC;AAAA,EAH6B;AAAA,EAHZ;AAAA,EACA,aAAa,oBAAI,IAA4B;AAAA,EAO9D,MAAM,QAAQ,OAA4C;AACxD,SAAK,iBAAiB,KAAK;AAC3B,SAAK,gBAAgB,KAAK;AAC1B,UAAM,SAAS,KAAK,SAAS,KAAK;AAClC,UAAM,OAAO,KAAK,cAAc,MAAM,MAAM;AAC5C,UAAM,OAAO,GAAG,KAAK,UAAU,MAAM,CAAC;AAAA;AAItC,eAAW,MAAM,IAAI;AAAA,EACvB;AAAA,EAEA,MAAM,OAAO,MAAmE;AAC9E,QAAI,CAAC,KAAK,OAAO;AACf,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,KAAK,gBAAgB;AACxB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AAEA,UAAM,MAA+B,CAAC;AACtC,eAAW,UAAU,aAAa;AAGhC,UAAI,KAAK,UAAU,WAAW,WAAW,oBAAoB;AAC3D,cAAM,WAAW,qBAAqB,KAAK,QAAQ,QAAQ,kBAAkB;AAC7E,YAAI,CAAC,SAAU;AAAA,MACjB;AACA,YAAM,OAAO,KAAK,cAAc,MAAM;AACtC,UAAI,CAAC,WAAW,IAAI,EAAG;AACvB,YAAM,QAAQ,aAAa,MAAM,MAAM,EAAE,MAAM,IAAI,EAAE,OAAO,OAAO;AACnE,iBAAW,QAAQ,OAAO;AACxB,YAAI;AACJ,YAAI;AACF,mBAAS,KAAK,MAAM,IAAI;AAAA,QAC1B,QAAQ;AACN;AAAA,QACF;AACA,YAAI,CAAC,cAAc,QAAQ,MAAM,MAAM,EAAG;AAC1C,YAAI,KAAK,MAAM;AAAA,MACjB;AAAA,IACF;AAGA,QAAI,KAAK,CAAC,GAAG,MAAM;AACjB,UAAI,EAAE,eAAe,EAAE,WAAY,QAAO,EAAE,WAAW,cAAc,EAAE,UAAU;AACjF,aAAO,EAAE,WAAW,cAAc,EAAE,UAAU;AAAA,IAChD,CAAC;AAED,WAAO,IAAI,MAAM,GAAG,KAAK,KAAK;AAAA,EAChC;AAAA,EAEA,MAAM,OAKH;AACD,UAAM,WAAmC,CAAC;AAC1C,UAAM,UAAsC;AAAA,MAC1C,YAAY;AAAA,MACZ,mBAAmB;AAAA,MACnB,eAAe;AAAA,IACjB;AACA,QAAI,QAAQ;AACZ,eAAW,UAAU,aAAa;AAChC,YAAM,OAAO,KAAK,cAAc,MAAM;AACtC,UAAI,CAAC,WAAW,IAAI,GAAG;AACrB,iBAAS,MAAM,IAAI;AACnB;AAAA,MACF;AACA,YAAM,QAAQ,aAAa,MAAM,MAAM,EAAE,MAAM,IAAI,EAAE,OAAO,OAAO;AACnE,eAAS,MAAM,IAAI,MAAM;AACzB,eAAS,MAAM;AACf,iBAAW,QAAQ,OAAO;AACxB,YAAI,QAAoB;AACxB,YAAI;AACF,kBAAS,KAAK,MAAM,IAAI,EAA4B,cAAc;AAAA,QACpE,QAAQ;AAAA,QAER;AACA,gBAAQ,KAAK,KAAK;AAAA,MACpB;AAAA,IACF;AAIA,WAAO,EAAE,OAAO,OAAO,MAAM,OAAO,UAAU,QAAQ;AAAA,EACxD;AAAA,EAEQ,iBAAiB,OAAmC;AAC1D,QAAI,CAAC,MAAM,QAAQ;AACjB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,qBAAqB,MAAM,kBAAkB,WAAW,GAAG;AACpE,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,YAAY;AACrB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,iBAAiB;AAC1B,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,YAAY,SAAS,MAAM,MAAM,GAAG;AACvC,YAAM,IAAI;AAAA,QACR;AAAA,QACA,+CAA+C,YAAY,KAAK,IAAI,CAAC;AAAA,MACvE;AAAA,IACF;AAAA,EACF;AAAA,EAEQ,gBAAgB,OAAmC;AACzD,UAAM,MAAM,KAAK,QAAQ;AACzB,QAAI,CAAC,OAAO,CAAC,MAAM,gBAAiB;AACpC,UAAM,MAAM,KAAK,IAAI;AACrB,UAAM,WAAW;AACjB,QAAI,QAAQ,KAAK,WAAW,IAAI,MAAM,eAAe;AACrD,QAAI,CAAC,SAAS,MAAM,MAAM,iBAAiB,UAAU;AACnD,cAAQ,EAAE,QAAQ,MAAM,iBAAiB,eAAe,KAAK,OAAO,EAAE;AACtE,WAAK,WAAW,IAAI,MAAM,iBAAiB,KAAK;AAAA,IAClD;AACA,QAAI,MAAM,SAAS,KAAK;AACtB,YAAM,IAAI;AAAA,QACR;AAAA,QACA,gCAAgC,MAAM,eAAe,aAAa,GAAG;AAAA,MACvE;AAAA,IACF;AACA,UAAM,SAAS;AAAA,EACjB;AAAA,EAEQ,SAAS,OAAoD;AACnE,UAAM,aAAa;AAAA,MACjB,KAAK,UAAU;AAAA,QACb,IAAI,MAAM,SAAS;AAAA,QACnB,KAAK,MAAM;AAAA,QACX,IAAI,MAAM;AAAA,QACV,KAAK,MAAM;AAAA,MACb,CAAC;AAAA,IACH;AAGA,WAAO;AAAA,MACL,GAAG;AAAA,MACH;AAAA,MACA,OAAO;AAAA,IACT;AAAA,EACF;AAAA,EAEQ,cAAc,QAAwB;AAC5C,WAAO,KAAK,KAAK,QAAQ,MAAM,GAAG,MAAM,QAAQ;AAAA,EAClD;AACF;AAEA,IAAM,cAAgD;AAAA,EACpD;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAEA,SAAS,qBACP,QACA,QACS;AACT,MAAI,CAAC,OAAQ,QAAO;AACpB,MAAI,MAAM,QAAQ,MAAM,EAAG,QAAO,OAAO,SAAS,MAAM;AACxD,SAAO,WAAW;AACpB;AAEA,SAAS,cACP,QACA,MACA,QACS;AAET,MAAI,KAAK,UAAU,WAAW,OAAO,cAAc,KAAK,eAAgB,QAAO;AAC/E,MAAI,KAAK,UAAU,UAAU,OAAO,aAAa,KAAK,eAAgB,QAAO;AAE7E,QAAM,IAAI,KAAK;AACf,MAAI,CAAC,EAAG,QAAO;AACf,MAAI,EAAE,QAAQ,OAAO,SAAS,SAAS,EAAE,KAAM,QAAO;AACtD,MAAI,EAAE,QAAQ;AACZ,UAAM,UAAU,MAAM,QAAQ,EAAE,MAAM,IAAI,EAAE,SAAS,CAAC,EAAE,MAAM;AAC9D,QAAI,CAAC,QAAQ,SAAS,MAAe,EAAG,QAAO;AAAA,EACjD;AACA,MAAI,EAAE,iBAAiB,UAAa,EAAE,iBAAiB,QAAW;AAChE,UAAM,aAAa,OAAO,OAAO,OAAO,WAAW,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS;AAC3E,UAAM,MAAM,WAAW,WAAW,IAAI,IAAI,KAAK,IAAI,GAAG,UAAU;AAChE,QAAI,EAAE,iBAAiB,UAAa,MAAM,EAAE,aAAc,QAAO;AACjE,QAAI,EAAE,iBAAiB,UAAa,MAAM,EAAE,aAAc,QAAO;AAAA,EACnE;AACA,MAAI,EAAE,aAAa,UAAa,eAAe,OAAO,UAAU,IAAI,eAAe,EAAE,QAAQ,GAAG;AAC9F,WAAO;AAAA,EACT;AACA,SAAO;AACT;AAEA,SAAS,OAAO,OAAuB;AACrC,SAAO,WAAW,QAAQ,EAAE,OAAO,KAAK,EAAE,OAAO,KAAK,EAAE,MAAM,GAAG,EAAE;AACrE;AAEA,SAAS,WAAW,MAAc,MAAoB;AACpD,MAAI,WAAW,IAAI,GAAG;AACpB,UAAM,WAAW,aAAa,MAAM,MAAM;AAC1C,kBAAc,MAAM,WAAW,IAAI;AAAA,EACrC,OAAO;AACL,kBAAc,MAAM,IAAI;AAAA,EAC1B;AACF;;;AClSA,SAAS,oBAAoB;AAC7B,SAAS,cAAAA,mBAAkB;AAC3B,SAAS,UAAU,YAAY,QAAAC,aAAY;AAuBpC,IAAM,uBAAN,cAAmC,MAAM;AAAA,EAC9C,YACE,SACS,OACT;AACA,UAAM,OAAO;AAFJ;AAGT,SAAK,OAAO;AAAA,EACd;AAAA,EAJW;AAKb;AAaA,SAAS,WAAW,MAAgB,KAAqB;AACvD,MAAI;AACF,WAAO,aAAa,OAAO,MAAM,EAAE,KAAK,UAAU,OAAO,CAAC,EAAE,KAAK;AAAA,EACnE,SAAS,KAAK;AACZ,UAAM,SACJ,OAAO,OAAO,QAAQ,YAAY,YAAY,MAC1C,OAAQ,IAA4B,MAAM,IAC1C;AACN,UAAM,IAAI,qBAAqB,OAAO,KAAK,KAAK,GAAG,CAAC,YAAY,UAAU,OAAO,GAAG,CAAC,IAAI,GAAG;AAAA,EAC9F;AACF;AAGA,SAAS,KAAK,OAAuB;AACnC,SACE,MACG,YAAY,EACZ,QAAQ,eAAe,GAAG,EAC1B,QAAQ,YAAY,EAAE,EACtB,MAAM,GAAG,EAAE,KAAK;AAEvB;AAEO,SAAS,mBAAmB,MAAkD;AACnF,QAAM,MAAM,KAAK,OAAO;AACxB,QAAM,cAAc,KAAK,eAAeA,MAAK,KAAK,UAAU,YAAY;AACxE,QAAM,eAAe,KAAK,gBAAgB;AAE1C,SAAO;AAAA,IACL,MAAM,OAAO,EAAE,SAAS,MAAM,GAAG;AAC/B,YAAM,KAAK,GAAG,KAAK,KAAK,CAAC,IAAI,KAAK,IAAI,EAAE,SAAS,EAAE,CAAC,IAAI,KAAK,OAAO,EAAE,SAAS,EAAE,EAAE,MAAM,GAAG,CAAC,CAAC;AAC9F,YAAM,SAAS,GAAG,YAAY,IAAI,EAAE;AACpC,YAAM,OAAOA,MAAK,aAAa,EAAE;AACjC,UAAI,CAAC,YAAY,OAAO,MAAM,QAAQ,MAAM,OAAO,GAAG,KAAK,QAAQ;AACnE,aAAO,EAAE,MAAM,QAAQ,QAAQ;AAAA,IACjC;AAAA,IAEA,MAAM,SAAS,UAAU,SAAS;AAGhC,YAAM,SAAS,IAAI,CAAC,UAAU,aAAa,GAAG,SAAS,IAAI;AAC3D,UAAI,OAAO,SAAS,GAAG;AACrB,YAAI,CAAC,OAAO,IAAI,GAAG,SAAS,IAAI;AAChC,YAAI,CAAC,UAAU,MAAM,OAAO,GAAG,SAAS,IAAI;AAAA,MAC9C;AACA,aAAO;AAAA,QACL,MAAM;AAAA,QACN,aAAa,SAAS;AAAA,QACtB,SAAS,SAAS;AAAA,QAClB;AAAA,MACF;AAAA,IACF;AAAA,IAEA,MAAM,QAAQ,UAAU;AAGtB,UAAI,CAAC,YAAY,UAAU,WAAW,SAAS,IAAI,GAAG,KAAK,QAAQ;AACnE,UAAI,CAAC,UAAU,MAAM,SAAS,MAAM,GAAG,KAAK,QAAQ;AAAA,IACtD;AAAA,EACF;AACF;AAKO,SAAS,oBAAoB,SAAsB,aAA8B;AACtF,MAAI,WAAW,QAAQ,WAAW,KAAKD,YAAW,QAAQ,WAAW,EAAG,QAAO,QAAQ;AACvF,MAAI,YAAa,QAAOC,MAAK,aAAa,SAAS,QAAQ,WAAW,CAAC;AACvE,SAAO,QAAQ;AACjB;","names":["existsSync","join"]}
1
+ {"version":3,"sources":["../../src/campaign/labeled-store/fs-adapter.ts","../../src/campaign/presets/run-profile-matrix.ts","../../src/campaign/worktree/index.ts"],"sourcesContent":["/**\n * @experimental\n *\n * Filesystem `LabeledScenarioStore` adapter. The default capture sink for\n * traces + eval artifacts. Production deployments typically swap for a\n * Turso/SQLite adapter (same interface).\n *\n * Records land as one JSONL file per source under `<root>/<source>.jsonl`.\n * Each line is a `LabeledScenarioRecord`. Append-only — no in-place edits.\n *\n * Safety properties enforced at write-time:\n *\n * - **Provenance required**: writes without `source`, `sourceVersionHash`,\n * `capturedAt`, `redactionStatus` are rejected. Closes the alignment\n * reviewer's data-poisoning gap.\n * - **Per-source rate limits**: optional `rateLimitBucket` + `maxWritesPerMinute`\n * stops a single tenant/source from flooding the store.\n *\n * Safety properties enforced at sample-time:\n *\n * - **Required split + capturedBefore**: substrate refuses to sample without\n * an explicit `split` ('train' | 'test') AND a temporal cutoff. Eliminates\n * accidental train/test contamination.\n * - **Default training-source filter**: when the store is sampled with\n * `split: 'train'`, production-trace records are EXCLUDED unless the\n * caller passes `filter.source: 'production-trace'` explicitly. Closes\n * the contamination-by-default gap flagged by the senior eval engineer.\n */\n\nimport { createHash } from 'node:crypto'\nimport { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs'\nimport { join } from 'node:path'\nimport type {\n LabeledScenarioRecord,\n LabeledScenarioSampleArgs,\n LabeledScenarioSource,\n LabeledScenarioStore,\n LabeledScenarioWrite,\n LabelTrust,\n} from '../types'\nimport { labelTrustRank } from '../types'\n\nexport interface FsLabeledScenarioStoreOptions {\n /** Root directory for JSONL files. Created if missing. */\n root: string\n /** Per-source rate limit. When set, writes exceeding the cap are rejected\n * with a typed error. Default: no limit. */\n maxWritesPerMinutePerBucket?: number\n /** Test seam — override `Date.now()` for deterministic tests. */\n now?: () => number\n}\n\nexport class LabeledScenarioStoreError extends Error {\n constructor(\n public readonly code: string,\n message: string,\n ) {\n super(message)\n this.name = 'LabeledScenarioStoreError'\n }\n}\n\ninterface RateLimitState {\n bucket: string\n windowStartMs: number\n count: number\n}\n\nexport class FsLabeledScenarioStore implements LabeledScenarioStore {\n private readonly now: () => number\n private readonly rateLimits = new Map<string, RateLimitState>()\n\n constructor(private readonly options: FsLabeledScenarioStoreOptions) {\n if (!existsSync(options.root)) mkdirSync(options.root, { recursive: true })\n this.now = options.now ?? Date.now\n }\n\n async observe(write: LabeledScenarioWrite): Promise<void> {\n this.assertProvenance(write)\n this.assertRateLimit(write)\n const record = this.toRecord(write)\n const path = this.pathForSource(write.source)\n const line = `${JSON.stringify(record)}\\n`\n // Append atomically. For high-throughput a writev-friendly buffered\n // implementation lands in the Turso adapter; FS adapter is for tests +\n // local dev + small workloads.\n appendLine(path, line)\n }\n\n async sample(args: LabeledScenarioSampleArgs): Promise<LabeledScenarioRecord[]> {\n if (!args.split) {\n throw new LabeledScenarioStoreError(\n 'split_required',\n 'sample() requires an explicit `split` (train | test) — substrate refuses ambiguous reads',\n )\n }\n if (!args.capturedBefore) {\n throw new LabeledScenarioStoreError(\n 'capturedBefore_required',\n 'sample() requires an explicit `capturedBefore` timestamp for temporal-split discipline',\n )\n }\n\n const all: LabeledScenarioRecord[] = []\n for (const source of ALL_SOURCES) {\n // Default training-source filter: when sampling train, EXCLUDE\n // production-trace records unless the caller asks for them.\n if (args.split === 'train' && source === 'production-trace') {\n const explicit = sourceFilterContains(args.filter?.source, 'production-trace')\n if (!explicit) continue\n }\n const path = this.pathForSource(source)\n if (!existsSync(path)) continue\n const lines = readFileSync(path, 'utf8').split('\\n').filter(Boolean)\n for (const line of lines) {\n let record: LabeledScenarioRecord\n try {\n record = JSON.parse(line) as LabeledScenarioRecord\n } catch {\n continue\n }\n if (!matchesFilter(record, args, source)) continue\n all.push(record)\n }\n }\n\n // Deterministic order: by capturedAt ascending, then recordHash.\n all.sort((a, b) => {\n if (a.capturedAt !== b.capturedAt) return a.capturedAt.localeCompare(b.capturedAt)\n return a.recordHash.localeCompare(b.recordHash)\n })\n\n return all.slice(0, args.count)\n }\n\n async size(): Promise<{\n train: number\n test: number\n bySource: Record<string, number>\n byTrust: Record<LabelTrust, number>\n }> {\n const bySource: Record<string, number> = {}\n const byTrust: Record<LabelTrust, number> = {\n unverified: 0,\n 'verified-signal': 0,\n 'human-rated': 0,\n }\n let total = 0\n for (const source of ALL_SOURCES) {\n const path = this.pathForSource(source)\n if (!existsSync(path)) {\n bySource[source] = 0\n continue\n }\n const lines = readFileSync(path, 'utf8').split('\\n').filter(Boolean)\n bySource[source] = lines.length\n total += lines.length\n for (const line of lines) {\n let trust: LabelTrust = 'unverified'\n try {\n trust = (JSON.parse(line) as LabeledScenarioRecord).labelTrust ?? 'unverified'\n } catch {\n // A malformed line counts as unverified — never silently gold.\n }\n byTrust[trust] += 1\n }\n }\n // FS adapter doesn't track split assignments per-record (split is\n // computed at sample-time based on `capturedBefore`). For size(), we\n // report `train`+`test` as the same total — split is a sampling concept.\n return { train: total, test: total, bySource, byTrust }\n }\n\n private assertProvenance(write: LabeledScenarioWrite): void {\n if (!write.source) {\n throw new LabeledScenarioStoreError(\n 'missing_source',\n 'LabeledScenarioWrite requires `source`',\n )\n }\n if (!write.sourceVersionHash || write.sourceVersionHash.length === 0) {\n throw new LabeledScenarioStoreError(\n 'missing_source_version',\n 'LabeledScenarioWrite requires `sourceVersionHash` (git sha or substrate version)',\n )\n }\n if (!write.capturedAt) {\n throw new LabeledScenarioStoreError(\n 'missing_captured_at',\n 'LabeledScenarioWrite requires `capturedAt` ISO timestamp',\n )\n }\n if (!write.redactionStatus) {\n throw new LabeledScenarioStoreError(\n 'missing_redaction_status',\n 'LabeledScenarioWrite requires explicit `redactionStatus` — raw / redacted-pii / redacted-secrets / fully-redacted',\n )\n }\n if (!ALL_SOURCES.includes(write.source)) {\n throw new LabeledScenarioStoreError(\n 'unknown_source',\n `LabeledScenarioWrite.source must be one of: ${ALL_SOURCES.join(', ')}`,\n )\n }\n }\n\n private assertRateLimit(write: LabeledScenarioWrite): void {\n const cap = this.options.maxWritesPerMinutePerBucket\n if (!cap || !write.rateLimitBucket) return\n const now = this.now()\n const windowMs = 60_000\n let state = this.rateLimits.get(write.rateLimitBucket)\n if (!state || now - state.windowStartMs >= windowMs) {\n state = { bucket: write.rateLimitBucket, windowStartMs: now, count: 0 }\n this.rateLimits.set(write.rateLimitBucket, state)\n }\n if (state.count >= cap) {\n throw new LabeledScenarioStoreError(\n 'rate_limit_exceeded',\n `LabeledScenarioStore: bucket ${write.rateLimitBucket} exceeded ${cap} writes/min`,\n )\n }\n state.count += 1\n }\n\n private toRecord(write: LabeledScenarioWrite): LabeledScenarioRecord {\n const recordHash = sha256(\n JSON.stringify({\n id: write.scenario.id,\n src: write.source,\n at: write.capturedAt,\n ver: write.sourceVersionHash,\n }),\n )\n // FS adapter assigns split at sample-time, but we cache a hint here\n // based on capturedAt vs the world's \"now\" — sampler overrides this.\n return {\n ...write,\n recordHash,\n split: 'train',\n }\n }\n\n private pathForSource(source: string): string {\n return join(this.options.root, `${source}.jsonl`)\n }\n}\n\nconst ALL_SOURCES: LabeledScenarioWrite['source'][] = [\n 'production-trace',\n 'eval-run',\n 'manual',\n 'red-team',\n 'synthetic',\n]\n\nfunction sourceFilterContains(\n filter: LabeledScenarioSource | LabeledScenarioSource[] | undefined,\n needle: LabeledScenarioSource,\n): boolean {\n if (!filter) return false\n if (Array.isArray(filter)) return filter.includes(needle)\n return filter === needle\n}\n\nfunction matchesFilter(\n record: LabeledScenarioRecord,\n args: LabeledScenarioSampleArgs,\n source: string,\n): boolean {\n // Temporal cutoff — train must be capturedAt < capturedBefore.\n if (args.split === 'train' && record.capturedAt >= args.capturedBefore) return false\n if (args.split === 'test' && record.capturedAt < args.capturedBefore) return false\n\n const f = args.filter\n if (!f) return true\n if (f.kind && record.scenario.kind !== f.kind) return false\n if (f.source) {\n const sources = Array.isArray(f.source) ? f.source : [f.source]\n if (!sources.includes(source as never)) return false\n }\n if (f.minComposite !== undefined || f.maxComposite !== undefined) {\n const composites = Object.values(record.judgeScores).map((s) => s.composite)\n const max = composites.length === 0 ? 0 : Math.max(...composites)\n if (f.minComposite !== undefined && max < f.minComposite) return false\n if (f.maxComposite !== undefined && max > f.maxComposite) return false\n }\n if (f.minTrust !== undefined && labelTrustRank(record.labelTrust) < labelTrustRank(f.minTrust)) {\n return false\n }\n return true\n}\n\nfunction sha256(input: string): string {\n return createHash('sha256').update(input).digest('hex').slice(0, 16)\n}\n\nfunction appendLine(path: string, line: string): void {\n if (existsSync(path)) {\n const existing = readFileSync(path, 'utf8')\n writeFileSync(path, existing + line)\n } else {\n writeFileSync(path, line)\n }\n}\n","/**\n * @experimental\n *\n * `runProfileMatrix` — the missing keystone between `runAgentMatrix` and the\n * backend-integrity guard.\n *\n * The gap it closes: `runAgentMatrix` is a topology-opaque scheduler whose\n * cells return a bare `{ output, verdict, costUsd }` — no `tokenUsage`, not a\n * `RunRecord`. `assertRealBackend` / `summarizeBackendIntegrity` key on\n * `RunRecord.tokenUsage`, so they cannot run on a raw matrix result. Every\n * consumer therefore hand-writes the same bridge: fan a profile × scenario\n * cartesian, call dispatch, fabricate a `RunRecord` with token usage, thread it\n * back, run the integrity guard. That hand-rolled bridge is exactly the pile of\n * bespoke `eval:*` scripts the adoption skills keep trying (and failing) to\n * forbid.\n *\n * `runProfileMatrix` IS that bridge, once:\n *\n * - axis 3 (PROFILE) = `profiles: AgentProfile[]`\n * - axis 1 (PERSONA/SCENARIO) = `scenarios: Scenario[]` (each scenario carries\n * its persona; `personaOf` groups them for the `byPersona` pivot)\n * - the scoring axis = `judges`\n *\n * It runs `runCampaign` once per profile (reusing its seeds, reps, bootstrap\n * CIs, resumability, and the `LabeledScenarioStore` capture flywheel), maps\n * every cell to a validated `RunRecord` carrying the real `tokenUsage` the\n * dispatch reported via `ctx.cost.observeTokens`, and runs `assertRealBackend`\n * BY CONSTRUCTION before returning — so a stub-backend run fails loudly instead\n * of reporting a clean 0/N leaderboard.\n *\n * Dispatch contract: a dispatch that calls an LLM MUST report usage via\n * `ctx.cost.observeTokens({ input, output })` (and cost via `ctx.cost.observe`).\n * A dispatch that reports zero tokens is indistinguishable from a stub and the\n * integrity guard treats it as one.\n */\n\nimport { createHash } from 'node:crypto'\nimport { join } from 'node:path'\nimport { type AgentProfile, agentProfileHash } from '../../agent-profile'\nimport { AgentEvalError } from '../../errors'\nimport {\n assertRealBackend,\n type BackendIntegrityReport,\n summarizeBackendIntegrity,\n} from '../../integrity/backend-integrity'\nimport {\n type RunOutcome,\n type RunRecord,\n type RunSplitTag,\n validateRunRecord,\n} from '../../run-record'\nimport { runCampaign } from '../run-campaign'\nimport type { CampaignStorage } from '../storage'\nimport type {\n CampaignCellResult,\n CampaignResult,\n DispatchContext,\n JudgeConfig,\n LabeledScenarioSource,\n LabeledScenarioStore,\n Scenario,\n} from '../types'\n\n/** Thrown when the matrix is misconfigured (no profiles, a profile whose model\n * lacks a snapshot version, etc.). Distinct from `BackendIntegrityError`,\n * which signals a stub backend at run time. */\nexport class ProfileMatrixError extends AgentEvalError {\n constructor(message: string) {\n super('profile_matrix', message)\n }\n}\n\n/** Dispatch for one cell: render `profile` against `scenario`, returning the\n * artifact the judges score. Report LLM usage via `ctx.cost.observeTokens`\n * and `ctx.cost.observe` — the integrity guard depends on it. */\nexport type ProfileDispatchFn<TScenario extends Scenario, TArtifact> = (\n profile: AgentProfile,\n scenario: TScenario,\n ctx: DispatchContext,\n) => Promise<TArtifact>\n\nexport interface RunProfileMatrixOptions<TScenario extends Scenario, TArtifact> {\n /** Axis 3 — the agent-under-test configurations. Each is one column. */\n profiles: AgentProfile[]\n /** Axis 1 — the persona/scenario corpus, run against every profile. */\n scenarios: TScenario[]\n /** Renders one (profile, scenario) cell. */\n dispatch: ProfileDispatchFn<TScenario, TArtifact>\n /** The scoring axis. */\n judges?: JudgeConfig<TArtifact, TScenario>[]\n /** Where each profile's campaign writes artifacts/traces. One subdir per\n * profile. */\n runDir: string\n /** Git SHA the harness ran from — stamped onto every RunRecord (mandatory\n * for paper-grade records). */\n commitSha: string\n /** Logical experiment id shared across the whole matrix so the promotion\n * gate can pair profiles on matched scenarios. Default: a hash of the\n * profile + scenario ids. */\n experimentId?: string\n /** Which split these runs belong to. Default `'search'`. */\n splitTag?: RunSplitTag\n /** Replicates per (profile, scenario) cell for CI bands. Default 1. */\n reps?: number\n /** Campaign seed (per profile). Default 42. */\n seed?: number\n /**\n * Backend-integrity posture, enforced AFTER the matrix completes:\n * - `'assert'` (default) — throw `BackendIntegrityError` if the run was a\n * stub (and, with `allowMixed:false`, if it was mixed).\n * - `'warn'` — log the verdict but never throw.\n * - `'off'` — skip the guard entirely (only for offline/replay analysis).\n */\n integrity?: 'assert' | 'warn' | 'off'\n /** Forwarded to `assertRealBackend`. Default true (tolerate partial 429\n * cascades); set false for strict CI gates. */\n allowMixed?: boolean\n /** Max concurrent cells WITHIN each profile's campaign. Default 2.\n * Profiles run sequentially so the cost ceiling is honored deterministically. */\n maxConcurrency?: number\n /** Cumulative USD cap per profile campaign. */\n costCeiling?: number\n /** Capture flywheel — forwarded to each campaign. */\n labeledStore?: LabeledScenarioStore | 'off'\n captureSource?: LabeledScenarioSource\n /** Storage backend. Default `fsCampaignStorage`. Pass\n * `inMemoryCampaignStorage()` for edge/CF-Worker/test runs. */\n storage?: CampaignStorage\n /** Test seam — override the wall clock. */\n now?: () => Date\n /** Optional persona key per scenario — drives the `byPersona` pivot. When\n * unset, `byPersona` is omitted. */\n personaOf?: (scenario: TScenario) => string\n /** Validate every produced RunRecord with `validateRunRecord` (fail-loud).\n * Default true — catches bad model snapshots and non-finite judge dims at\n * the boundary instead of letting them poison downstream analysis. */\n validate?: boolean\n}\n\nexport interface ProfileSummary {\n profileId: string\n profileHash: string\n model: string\n /** RunRecords produced for this profile (= scenarios × reps). */\n records: number\n /** Mean composite across this profile's records. */\n meanComposite: number\n totalCostUsd: number\n /** Per-profile integrity verdict — surfaces a single profile that ran stub\n * even when the matrix as a whole looks real. */\n integrity: BackendIntegrityReport\n}\n\nexport interface ScenarioRollup {\n meanComposite: number\n n: number\n}\n\nexport interface RunProfileMatrixResult<TArtifact, TScenario extends Scenario> {\n matrixId: string\n experimentId: string\n /** One RunRecord per (profile, scenario, rep) cell — the integrity-checked,\n * paper-grade output. Feed straight into `analyzeRuns`, `HeldOutGate`,\n * scorecards, the hosted wire format. */\n records: RunRecord[]\n byProfile: Record<string, ProfileSummary>\n byScenario: Record<string, ScenarioRollup>\n /** Present only when `personaOf` was supplied. */\n byPersona?: Record<string, ScenarioRollup>\n /** Whole-matrix integrity report (the one `integrity:'assert'` enforces). */\n integrity: BackendIntegrityReport\n /** The raw per-profile campaign results, keyed by profile id. */\n campaigns: Record<string, CampaignResult<TArtifact, TScenario>>\n}\n\nfunction sanitize(id: string): string {\n return id.replace(/[^a-zA-Z0-9_-]/g, '_')\n}\n\nfunction sha(input: unknown): string {\n return createHash('sha256').update(JSON.stringify(input)).digest('hex')\n}\n\nfunction mean(xs: number[]): number {\n return xs.length === 0 ? 0 : xs.reduce((a, b) => a + b, 0) / xs.length\n}\n\nfunction cellComposite(cell: CampaignCellResult<unknown>): number {\n const composites = Object.values(cell.judgeScores).map((s) => s.composite)\n return composites.length === 0 ? 0 : mean(composites)\n}\n\ninterface BuildRecordArgs<TArtifact> {\n cell: CampaignCellResult<TArtifact>\n profile: AgentProfile\n profileHash: string\n configHash: string\n experimentId: string\n splitTag: RunSplitTag\n commitSha: string\n matrixId: string\n}\n\nfunction buildRunRecord<TArtifact>(args: BuildRecordArgs<TArtifact>): RunRecord {\n const { cell, profile, profileHash, configHash, experimentId, splitTag, commitSha, matrixId } =\n args\n const composite = cellComposite(cell)\n\n // Flatten judge dimensions (judge-prefixed to avoid collisions) into raw.\n const raw: Record<string, number> = { composite }\n const perJudge: Record<string, Record<string, number>> = {}\n const dimAccum: Record<string, number[]> = {}\n const notes: string[] = []\n for (const [judgeName, js] of Object.entries(cell.judgeScores)) {\n perJudge[judgeName] = { ...js.dimensions }\n for (const [dim, value] of Object.entries(js.dimensions)) {\n raw[`${judgeName}.${dim}`] = value\n ;(dimAccum[dim] ??= []).push(value)\n }\n if (js.notes) notes.push(`${judgeName}: ${js.notes}`)\n }\n const perDimMean: Record<string, number> = {}\n for (const [dim, values] of Object.entries(dimAccum)) perDimMean[dim] = mean(values)\n\n const outcome: RunOutcome =\n splitTag === 'holdout' ? { holdoutScore: composite, raw } : { searchScore: composite, raw }\n if (Object.keys(perJudge).length > 0) {\n outcome.judgeScores = {\n perJudge,\n perDimMean,\n composite,\n ...(notes.length > 0 ? { notes: notes.join(' | ') } : {}),\n }\n }\n\n return {\n runId: `${matrixId}:${profile.id}:${cell.cellId}`,\n experimentId,\n candidateId: profile.id,\n seed: cell.seed,\n model: profile.model,\n promptHash: profileHash,\n configHash,\n commitSha,\n wallMs: cell.durationMs,\n costUsd: cell.costUsd,\n tokenUsage: cell.tokenUsage,\n outcome,\n splitTag,\n scenarioId: cell.scenarioId,\n ...(cell.error ? { failureMode: cell.error } : {}),\n }\n}\n\nexport async function runProfileMatrix<TScenario extends Scenario, TArtifact>(\n opts: RunProfileMatrixOptions<TScenario, TArtifact>,\n): Promise<RunProfileMatrixResult<TArtifact, TScenario>> {\n if (opts.profiles.length === 0) throw new ProfileMatrixError('profiles must not be empty')\n if (opts.scenarios.length === 0) throw new ProfileMatrixError('scenarios must not be empty')\n\n const splitTag = opts.splitTag ?? 'search'\n const seed = opts.seed ?? 42\n const validate = opts.validate ?? true\n const integrityMode = opts.integrity ?? 'assert'\n const profileIds = opts.profiles.map((p) => p.id)\n const experimentId =\n opts.experimentId ??\n `pm_${sha({ profileIds, scenarios: opts.scenarios.map((s) => s.id) }).slice(0, 16)}`\n const matrixId = `mtx_${sha({ experimentId, profileIds, seed, splitTag }).slice(0, 16)}`\n\n // Preflight: every profile must hash (non-empty model) AND its model must\n // carry a snapshot version, BEFORE any LLM spend. A probe record run through\n // validateRunRecord catches both in the exact place they'd otherwise surface\n // far downstream.\n for (const profile of opts.profiles) {\n const profileHash = agentProfileHash(profile)\n try {\n validateRunRecord({\n runId: `${matrixId}:${profile.id}:probe`,\n experimentId,\n candidateId: profile.id,\n seed,\n model: profile.model,\n promptHash: profileHash,\n configHash: profileHash,\n commitSha: opts.commitSha,\n wallMs: 0,\n costUsd: 0,\n tokenUsage: { input: 0, output: 0 },\n outcome:\n splitTag === 'holdout' ? { holdoutScore: 0, raw: {} } : { searchScore: 0, raw: {} },\n splitTag,\n })\n } catch (err) {\n throw new ProfileMatrixError(\n `profile '${profile.id}' is not recordable: ${err instanceof Error ? err.message : String(err)}`,\n )\n }\n }\n\n const records: RunRecord[] = []\n const campaigns: Record<string, CampaignResult<TArtifact, TScenario>> = {}\n const byProfile: Record<string, ProfileSummary> = {}\n\n for (const profile of opts.profiles) {\n const profileHash = agentProfileHash(profile)\n const configHash = sha({\n profile: profileHash,\n judges: (opts.judges ?? []).map((j) => j.name),\n seed,\n splitTag,\n })\n\n // Bind the profile into a campaign dispatch. Name it so the campaign's\n // manifest hash is stable + distinct per profile.\n const dispatch = (scenario: TScenario, ctx: DispatchContext): Promise<TArtifact> =>\n opts.dispatch(profile, scenario, ctx)\n Object.defineProperty(dispatch, 'name', { value: `profile_${sanitize(profile.id)}` })\n\n const campaign = await runCampaign<TScenario, TArtifact>({\n scenarios: opts.scenarios,\n dispatch,\n judges: opts.judges,\n seed,\n reps: opts.reps,\n maxConcurrency: opts.maxConcurrency,\n costCeiling: opts.costCeiling,\n labeledStore: opts.labeledStore,\n captureSource: opts.captureSource,\n storage: opts.storage,\n now: opts.now,\n runDir: join(opts.runDir, sanitize(profile.id)),\n })\n campaigns[profile.id] = campaign\n\n const profileRecords: RunRecord[] = []\n for (const cell of campaign.cells) {\n const record = buildRunRecord({\n cell,\n profile,\n profileHash,\n configHash,\n experimentId,\n splitTag,\n commitSha: opts.commitSha,\n matrixId,\n })\n if (validate) validateRunRecord(record)\n profileRecords.push(record)\n records.push(record)\n }\n\n byProfile[profile.id] = {\n profileId: profile.id,\n profileHash,\n model: profile.model,\n records: profileRecords.length,\n meanComposite: mean(profileRecords.map(compositeOf)),\n totalCostUsd: profileRecords.reduce((a, r) => a + r.costUsd, 0),\n integrity: summarizeBackendIntegrity(profileRecords),\n }\n }\n\n // Integrity by construction — the whole point of the primitive.\n const integrity = summarizeBackendIntegrity(records)\n if (integrityMode === 'assert') {\n assertRealBackend(records, { allowMixed: opts.allowMixed ?? true })\n } else if (integrityMode === 'warn' && integrity.verdict !== 'real') {\n // eslint-disable-next-line no-console\n console.warn(\n `[runProfileMatrix] backend integrity: ${integrity.verdict} — ${integrity.diagnosis}`,\n )\n }\n\n // Pivots.\n const byScenario = rollup(records, (r) => r.scenarioId)\n const byPersona = opts.personaOf\n ? rollupByPersona(records, opts.scenarios, opts.personaOf)\n : undefined\n\n return { matrixId, experimentId, records, byProfile, byScenario, byPersona, integrity, campaigns }\n}\n\n/** Composite for a produced RunRecord (the split score it carries). */\nfunction compositeOf(r: RunRecord): number {\n return r.outcome.holdoutScore ?? r.outcome.searchScore ?? 0\n}\n\nfunction rollup(\n records: RunRecord[],\n keyOf: (r: RunRecord) => string | undefined,\n): Record<string, ScenarioRollup> {\n const groups = new Map<string, number[]>()\n for (const r of records) {\n const key = keyOf(r)\n if (key === undefined) continue\n const arr = groups.get(key) ?? []\n arr.push(compositeOf(r))\n groups.set(key, arr)\n }\n const out: Record<string, ScenarioRollup> = {}\n for (const [key, xs] of groups) out[key] = { meanComposite: mean(xs), n: xs.length }\n return out\n}\n\nfunction rollupByPersona<TScenario extends Scenario>(\n records: RunRecord[],\n scenarios: TScenario[],\n personaOf: (s: TScenario) => string,\n): Record<string, ScenarioRollup> {\n const personaByScenarioId = new Map<string, string>()\n for (const s of scenarios) personaByScenarioId.set(s.id, personaOf(s))\n return rollup(records, (r) => (r.scenarioId ? personaByScenarioId.get(r.scenarioId) : undefined))\n}\n","/**\n * @experimental\n *\n * VCS-pluggable worktree adapter. One improvement = one worktree, PR-like\n * (multiple commits allowed). A code-tier driver's `propose()` creates a\n * worktree, an agent commits the change into it, and `finalize()` returns a\n * `CodeSurface{ worktreeRef }` the measurement checks out to run the worker\n * against the changed code. On promotion the worktree becomes the PR branch.\n *\n * The interface is VCS-agnostic so a future `jj` ([jj-vcs](https://github.com/jj-vcs/jj))\n * adapter can slot in without touching driver code. Only the git adapter\n * ships today. See `docs/design/self-improvement-engine.md`.\n */\n\nimport { execFileSync } from 'node:child_process'\nimport { existsSync } from 'node:fs'\nimport { basename, isAbsolute, join } from 'node:path'\nimport type { CodeSurface } from '../types'\n\nexport interface Worktree {\n /** Absolute path to the checked-out worktree directory. */\n path: string\n /** The branch the worktree is on (becomes the PR branch on promotion). */\n branch: string\n /** The ref the worktree was forked from. */\n baseRef: string\n}\n\nexport interface WorktreeAdapter {\n /** Create an isolated worktree on a fresh branch off `baseRef`. */\n create(opts: { baseRef: string; label: string }): Promise<Worktree>\n /** Commit any pending changes in the worktree, then return a CodeSurface\n * pointing at it. The agent has already written its change into\n * `worktree.path` by the time this is called. */\n finalize(worktree: Worktree, summary: string): Promise<CodeSurface>\n /** Remove the worktree (and its branch) — called for losing candidates. */\n discard(worktree: Worktree): Promise<void>\n}\n\nexport class WorktreeAdapterError extends Error {\n constructor(\n message: string,\n readonly cause?: unknown,\n ) {\n super(message)\n this.name = 'WorktreeAdapterError'\n }\n}\n\nexport interface GitWorktreeAdapterOptions {\n /** Repo root the worktrees fork from. */\n repoRoot: string\n /** Directory worktrees are created under. Default: `<repoRoot>/.worktrees`. */\n worktreeDir?: string\n /** Branch-name prefix. Default: `improve`. */\n branchPrefix?: string\n /** Test seam — defaults to a real `git` runner. */\n git?: (args: string[], cwd: string) => string\n}\n\nfunction defaultGit(args: string[], cwd: string): string {\n try {\n return execFileSync('git', args, { cwd, encoding: 'utf8' }).trim()\n } catch (err) {\n const stderr =\n err && typeof err === 'object' && 'stderr' in err\n ? String((err as { stderr: unknown }).stderr)\n : ''\n throw new WorktreeAdapterError(`git ${args.join(' ')} failed: ${stderr || String(err)}`, err)\n }\n}\n\n/** Slugify a label into a branch-safe segment. */\nfunction slug(label: string): string {\n return (\n label\n .toLowerCase()\n .replace(/[^a-z0-9]+/g, '-')\n .replace(/^-+|-+$/g, '')\n .slice(0, 48) || 'candidate'\n )\n}\n\nexport function gitWorktreeAdapter(opts: GitWorktreeAdapterOptions): WorktreeAdapter {\n const git = opts.git ?? defaultGit\n const worktreeDir = opts.worktreeDir ?? join(opts.repoRoot, '.worktrees')\n const branchPrefix = opts.branchPrefix ?? 'improve'\n\n return {\n async create({ baseRef, label }) {\n const id = `${slug(label)}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 6)}`\n const branch = `${branchPrefix}/${id}`\n const path = join(worktreeDir, id)\n git(['worktree', 'add', '-b', branch, path, baseRef], opts.repoRoot)\n return { path, branch, baseRef }\n },\n\n async finalize(worktree, summary) {\n // Stage + commit any pending changes the agent left in the worktree.\n // A no-op commit is refused by git, so only commit when the tree is dirty.\n const status = git(['status', '--porcelain'], worktree.path)\n if (status.length > 0) {\n git(['add', '-A'], worktree.path)\n git(['commit', '-m', summary], worktree.path)\n }\n return {\n kind: 'code',\n worktreeRef: worktree.path,\n baseRef: worktree.baseRef,\n summary,\n }\n },\n\n async discard(worktree) {\n // Remove the worktree, then delete its branch. Force-remove because the\n // worktree may hold uncommitted experiment state we're discarding.\n git(['worktree', 'remove', '--force', worktree.path], opts.repoRoot)\n git(['branch', '-D', worktree.branch], opts.repoRoot)\n },\n }\n}\n\n/** Resolve a `CodeSurface`'s worktreeRef to a directory the measurement can\n * run the worker in. A path ref is returned as-is; anything else is treated\n * as a ref under the adapter's worktree dir. */\nexport function resolveWorktreePath(surface: CodeSurface, worktreeDir?: string): string {\n if (isAbsolute(surface.worktreeRef) && existsSync(surface.worktreeRef)) return surface.worktreeRef\n if (worktreeDir) return join(worktreeDir, basename(surface.worktreeRef))\n return surface.worktreeRef\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AA6BA,SAAS,kBAAkB;AAC3B,SAAS,YAAY,WAAW,cAAc,qBAAqB;AACnE,SAAS,YAAY;AAqBd,IAAM,4BAAN,cAAwC,MAAM;AAAA,EACnD,YACkB,MAChB,SACA;AACA,UAAM,OAAO;AAHG;AAIhB,SAAK,OAAO;AAAA,EACd;AAAA,EALkB;AAMpB;AAQO,IAAM,yBAAN,MAA6D;AAAA,EAIlE,YAA6B,SAAwC;AAAxC;AAC3B,QAAI,CAAC,WAAW,QAAQ,IAAI,EAAG,WAAU,QAAQ,MAAM,EAAE,WAAW,KAAK,CAAC;AAC1E,SAAK,MAAM,QAAQ,OAAO,KAAK;AAAA,EACjC;AAAA,EAH6B;AAAA,EAHZ;AAAA,EACA,aAAa,oBAAI,IAA4B;AAAA,EAO9D,MAAM,QAAQ,OAA4C;AACxD,SAAK,iBAAiB,KAAK;AAC3B,SAAK,gBAAgB,KAAK;AAC1B,UAAM,SAAS,KAAK,SAAS,KAAK;AAClC,UAAM,OAAO,KAAK,cAAc,MAAM,MAAM;AAC5C,UAAM,OAAO,GAAG,KAAK,UAAU,MAAM,CAAC;AAAA;AAItC,eAAW,MAAM,IAAI;AAAA,EACvB;AAAA,EAEA,MAAM,OAAO,MAAmE;AAC9E,QAAI,CAAC,KAAK,OAAO;AACf,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,KAAK,gBAAgB;AACxB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AAEA,UAAM,MAA+B,CAAC;AACtC,eAAW,UAAU,aAAa;AAGhC,UAAI,KAAK,UAAU,WAAW,WAAW,oBAAoB;AAC3D,cAAM,WAAW,qBAAqB,KAAK,QAAQ,QAAQ,kBAAkB;AAC7E,YAAI,CAAC,SAAU;AAAA,MACjB;AACA,YAAM,OAAO,KAAK,cAAc,MAAM;AACtC,UAAI,CAAC,WAAW,IAAI,EAAG;AACvB,YAAM,QAAQ,aAAa,MAAM,MAAM,EAAE,MAAM,IAAI,EAAE,OAAO,OAAO;AACnE,iBAAW,QAAQ,OAAO;AACxB,YAAI;AACJ,YAAI;AACF,mBAAS,KAAK,MAAM,IAAI;AAAA,QAC1B,QAAQ;AACN;AAAA,QACF;AACA,YAAI,CAAC,cAAc,QAAQ,MAAM,MAAM,EAAG;AAC1C,YAAI,KAAK,MAAM;AAAA,MACjB;AAAA,IACF;AAGA,QAAI,KAAK,CAAC,GAAG,MAAM;AACjB,UAAI,EAAE,eAAe,EAAE,WAAY,QAAO,EAAE,WAAW,cAAc,EAAE,UAAU;AACjF,aAAO,EAAE,WAAW,cAAc,EAAE,UAAU;AAAA,IAChD,CAAC;AAED,WAAO,IAAI,MAAM,GAAG,KAAK,KAAK;AAAA,EAChC;AAAA,EAEA,MAAM,OAKH;AACD,UAAM,WAAmC,CAAC;AAC1C,UAAM,UAAsC;AAAA,MAC1C,YAAY;AAAA,MACZ,mBAAmB;AAAA,MACnB,eAAe;AAAA,IACjB;AACA,QAAI,QAAQ;AACZ,eAAW,UAAU,aAAa;AAChC,YAAM,OAAO,KAAK,cAAc,MAAM;AACtC,UAAI,CAAC,WAAW,IAAI,GAAG;AACrB,iBAAS,MAAM,IAAI;AACnB;AAAA,MACF;AACA,YAAM,QAAQ,aAAa,MAAM,MAAM,EAAE,MAAM,IAAI,EAAE,OAAO,OAAO;AACnE,eAAS,MAAM,IAAI,MAAM;AACzB,eAAS,MAAM;AACf,iBAAW,QAAQ,OAAO;AACxB,YAAI,QAAoB;AACxB,YAAI;AACF,kBAAS,KAAK,MAAM,IAAI,EAA4B,cAAc;AAAA,QACpE,QAAQ;AAAA,QAER;AACA,gBAAQ,KAAK,KAAK;AAAA,MACpB;AAAA,IACF;AAIA,WAAO,EAAE,OAAO,OAAO,MAAM,OAAO,UAAU,QAAQ;AAAA,EACxD;AAAA,EAEQ,iBAAiB,OAAmC;AAC1D,QAAI,CAAC,MAAM,QAAQ;AACjB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,qBAAqB,MAAM,kBAAkB,WAAW,GAAG;AACpE,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,YAAY;AACrB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,iBAAiB;AAC1B,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,YAAY,SAAS,MAAM,MAAM,GAAG;AACvC,YAAM,IAAI;AAAA,QACR;AAAA,QACA,+CAA+C,YAAY,KAAK,IAAI,CAAC;AAAA,MACvE;AAAA,IACF;AAAA,EACF;AAAA,EAEQ,gBAAgB,OAAmC;AACzD,UAAM,MAAM,KAAK,QAAQ;AACzB,QAAI,CAAC,OAAO,CAAC,MAAM,gBAAiB;AACpC,UAAM,MAAM,KAAK,IAAI;AACrB,UAAM,WAAW;AACjB,QAAI,QAAQ,KAAK,WAAW,IAAI,MAAM,eAAe;AACrD,QAAI,CAAC,SAAS,MAAM,MAAM,iBAAiB,UAAU;AACnD,cAAQ,EAAE,QAAQ,MAAM,iBAAiB,eAAe,KAAK,OAAO,EAAE;AACtE,WAAK,WAAW,IAAI,MAAM,iBAAiB,KAAK;AAAA,IAClD;AACA,QAAI,MAAM,SAAS,KAAK;AACtB,YAAM,IAAI;AAAA,QACR;AAAA,QACA,gCAAgC,MAAM,eAAe,aAAa,GAAG;AAAA,MACvE;AAAA,IACF;AACA,UAAM,SAAS;AAAA,EACjB;AAAA,EAEQ,SAAS,OAAoD;AACnE,UAAM,aAAa;AAAA,MACjB,KAAK,UAAU;AAAA,QACb,IAAI,MAAM,SAAS;AAAA,QACnB,KAAK,MAAM;AAAA,QACX,IAAI,MAAM;AAAA,QACV,KAAK,MAAM;AAAA,MACb,CAAC;AAAA,IACH;AAGA,WAAO;AAAA,MACL,GAAG;AAAA,MACH;AAAA,MACA,OAAO;AAAA,IACT;AAAA,EACF;AAAA,EAEQ,cAAc,QAAwB;AAC5C,WAAO,KAAK,KAAK,QAAQ,MAAM,GAAG,MAAM,QAAQ;AAAA,EAClD;AACF;AAEA,IAAM,cAAgD;AAAA,EACpD;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAEA,SAAS,qBACP,QACA,QACS;AACT,MAAI,CAAC,OAAQ,QAAO;AACpB,MAAI,MAAM,QAAQ,MAAM,EAAG,QAAO,OAAO,SAAS,MAAM;AACxD,SAAO,WAAW;AACpB;AAEA,SAAS,cACP,QACA,MACA,QACS;AAET,MAAI,KAAK,UAAU,WAAW,OAAO,cAAc,KAAK,eAAgB,QAAO;AAC/E,MAAI,KAAK,UAAU,UAAU,OAAO,aAAa,KAAK,eAAgB,QAAO;AAE7E,QAAM,IAAI,KAAK;AACf,MAAI,CAAC,EAAG,QAAO;AACf,MAAI,EAAE,QAAQ,OAAO,SAAS,SAAS,EAAE,KAAM,QAAO;AACtD,MAAI,EAAE,QAAQ;AACZ,UAAM,UAAU,MAAM,QAAQ,EAAE,MAAM,IAAI,EAAE,SAAS,CAAC,EAAE,MAAM;AAC9D,QAAI,CAAC,QAAQ,SAAS,MAAe,EAAG,QAAO;AAAA,EACjD;AACA,MAAI,EAAE,iBAAiB,UAAa,EAAE,iBAAiB,QAAW;AAChE,UAAM,aAAa,OAAO,OAAO,OAAO,WAAW,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS;AAC3E,UAAM,MAAM,WAAW,WAAW,IAAI,IAAI,KAAK,IAAI,GAAG,UAAU;AAChE,QAAI,EAAE,iBAAiB,UAAa,MAAM,EAAE,aAAc,QAAO;AACjE,QAAI,EAAE,iBAAiB,UAAa,MAAM,EAAE,aAAc,QAAO;AAAA,EACnE;AACA,MAAI,EAAE,aAAa,UAAa,eAAe,OAAO,UAAU,IAAI,eAAe,EAAE,QAAQ,GAAG;AAC9F,WAAO;AAAA,EACT;AACA,SAAO;AACT;AAEA,SAAS,OAAO,OAAuB;AACrC,SAAO,WAAW,QAAQ,EAAE,OAAO,KAAK,EAAE,OAAO,KAAK,EAAE,MAAM,GAAG,EAAE;AACrE;AAEA,SAAS,WAAW,MAAc,MAAoB;AACpD,MAAI,WAAW,IAAI,GAAG;AACpB,UAAM,WAAW,aAAa,MAAM,MAAM;AAC1C,kBAAc,MAAM,WAAW,IAAI;AAAA,EACrC,OAAO;AACL,kBAAc,MAAM,IAAI;AAAA,EAC1B;AACF;;;AC5QA,SAAS,cAAAA,mBAAkB;AAC3B,SAAS,QAAAC,aAAY;AA6Bd,IAAM,qBAAN,cAAiC,eAAe;AAAA,EACrD,YAAY,SAAiB;AAC3B,UAAM,kBAAkB,OAAO;AAAA,EACjC;AACF;AAyGA,SAAS,SAAS,IAAoB;AACpC,SAAO,GAAG,QAAQ,mBAAmB,GAAG;AAC1C;AAEA,SAAS,IAAI,OAAwB;AACnC,SAAOC,YAAW,QAAQ,EAAE,OAAO,KAAK,UAAU,KAAK,CAAC,EAAE,OAAO,KAAK;AACxE;AAEA,SAAS,KAAK,IAAsB;AAClC,SAAO,GAAG,WAAW,IAAI,IAAI,GAAG,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,GAAG;AAClE;AAEA,SAAS,cAAc,MAA2C;AAChE,QAAM,aAAa,OAAO,OAAO,KAAK,WAAW,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS;AACzE,SAAO,WAAW,WAAW,IAAI,IAAI,KAAK,UAAU;AACtD;AAaA,SAAS,eAA0B,MAA6C;AAC9E,QAAM,EAAE,MAAM,SAAS,aAAa,YAAY,cAAc,UAAU,WAAW,SAAS,IAC1F;AACF,QAAM,YAAY,cAAc,IAAI;AAGpC,QAAM,MAA8B,EAAE,UAAU;AAChD,QAAM,WAAmD,CAAC;AAC1D,QAAM,WAAqC,CAAC;AAC5C,QAAM,QAAkB,CAAC;AACzB,aAAW,CAAC,WAAW,EAAE,KAAK,OAAO,QAAQ,KAAK,WAAW,GAAG;AAC9D,aAAS,SAAS,IAAI,EAAE,GAAG,GAAG,WAAW;AACzC,eAAW,CAAC,KAAK,KAAK,KAAK,OAAO,QAAQ,GAAG,UAAU,GAAG;AACxD,UAAI,GAAG,SAAS,IAAI,GAAG,EAAE,IAAI;AAC5B,OAAC,SAAS,GAAG,MAAM,CAAC,GAAG,KAAK,KAAK;AAAA,IACpC;AACA,QAAI,GAAG,MAAO,OAAM,KAAK,GAAG,SAAS,KAAK,GAAG,KAAK,EAAE;AAAA,EACtD;AACA,QAAM,aAAqC,CAAC;AAC5C,aAAW,CAAC,KAAK,MAAM,KAAK,OAAO,QAAQ,QAAQ,EAAG,YAAW,GAAG,IAAI,KAAK,MAAM;AAEnF,QAAM,UACJ,aAAa,YAAY,EAAE,cAAc,WAAW,IAAI,IAAI,EAAE,aAAa,WAAW,IAAI;AAC5F,MAAI,OAAO,KAAK,QAAQ,EAAE,SAAS,GAAG;AACpC,YAAQ,cAAc;AAAA,MACpB;AAAA,MACA;AAAA,MACA;AAAA,MACA,GAAI,MAAM,SAAS,IAAI,EAAE,OAAO,MAAM,KAAK,KAAK,EAAE,IAAI,CAAC;AAAA,IACzD;AAAA,EACF;AAEA,SAAO;AAAA,IACL,OAAO,GAAG,QAAQ,IAAI,QAAQ,EAAE,IAAI,KAAK,MAAM;AAAA,IAC/C;AAAA,IACA,aAAa,QAAQ;AAAA,IACrB,MAAM,KAAK;AAAA,IACX,OAAO,QAAQ;AAAA,IACf,YAAY;AAAA,IACZ;AAAA,IACA;AAAA,IACA,QAAQ,KAAK;AAAA,IACb,SAAS,KAAK;AAAA,IACd,YAAY,KAAK;AAAA,IACjB;AAAA,IACA;AAAA,IACA,YAAY,KAAK;AAAA,IACjB,GAAI,KAAK,QAAQ,EAAE,aAAa,KAAK,MAAM,IAAI,CAAC;AAAA,EAClD;AACF;AAEA,eAAsB,iBACpB,MACuD;AACvD,MAAI,KAAK,SAAS,WAAW,EAAG,OAAM,IAAI,mBAAmB,4BAA4B;AACzF,MAAI,KAAK,UAAU,WAAW,EAAG,OAAM,IAAI,mBAAmB,6BAA6B;AAE3F,QAAM,WAAW,KAAK,YAAY;AAClC,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,WAAW,KAAK,YAAY;AAClC,QAAM,gBAAgB,KAAK,aAAa;AACxC,QAAM,aAAa,KAAK,SAAS,IAAI,CAAC,MAAM,EAAE,EAAE;AAChD,QAAM,eACJ,KAAK,gBACL,MAAM,IAAI,EAAE,YAAY,WAAW,KAAK,UAAU,IAAI,CAAC,MAAM,EAAE,EAAE,EAAE,CAAC,EAAE,MAAM,GAAG,EAAE,CAAC;AACpF,QAAM,WAAW,OAAO,IAAI,EAAE,cAAc,YAAY,MAAM,SAAS,CAAC,EAAE,MAAM,GAAG,EAAE,CAAC;AAMtF,aAAW,WAAW,KAAK,UAAU;AACnC,UAAM,cAAc,iBAAiB,OAAO;AAC5C,QAAI;AACF,wBAAkB;AAAA,QAChB,OAAO,GAAG,QAAQ,IAAI,QAAQ,EAAE;AAAA,QAChC;AAAA,QACA,aAAa,QAAQ;AAAA,QACrB;AAAA,QACA,OAAO,QAAQ;AAAA,QACf,YAAY;AAAA,QACZ,YAAY;AAAA,QACZ,WAAW,KAAK;AAAA,QAChB,QAAQ;AAAA,QACR,SAAS;AAAA,QACT,YAAY,EAAE,OAAO,GAAG,QAAQ,EAAE;AAAA,QAClC,SACE,aAAa,YAAY,EAAE,cAAc,GAAG,KAAK,CAAC,EAAE,IAAI,EAAE,aAAa,GAAG,KAAK,CAAC,EAAE;AAAA,QACpF;AAAA,MACF,CAAC;AAAA,IACH,SAAS,KAAK;AACZ,YAAM,IAAI;AAAA,QACR,YAAY,QAAQ,EAAE,wBAAwB,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,MAChG;AAAA,IACF;AAAA,EACF;AAEA,QAAM,UAAuB,CAAC;AAC9B,QAAM,YAAkE,CAAC;AACzE,QAAM,YAA4C,CAAC;AAEnD,aAAW,WAAW,KAAK,UAAU;AACnC,UAAM,cAAc,iBAAiB,OAAO;AAC5C,UAAM,aAAa,IAAI;AAAA,MACrB,SAAS;AAAA,MACT,SAAS,KAAK,UAAU,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,IAAI;AAAA,MAC7C;AAAA,MACA;AAAA,IACF,CAAC;AAID,UAAM,WAAW,CAAC,UAAqB,QACrC,KAAK,SAAS,SAAS,UAAU,GAAG;AACtC,WAAO,eAAe,UAAU,QAAQ,EAAE,OAAO,WAAW,SAAS,QAAQ,EAAE,CAAC,GAAG,CAAC;AAEpF,UAAM,WAAW,MAAM,YAAkC;AAAA,MACvD,WAAW,KAAK;AAAA,MAChB;AAAA,MACA,QAAQ,KAAK;AAAA,MACb;AAAA,MACA,MAAM,KAAK;AAAA,MACX,gBAAgB,KAAK;AAAA,MACrB,aAAa,KAAK;AAAA,MAClB,cAAc,KAAK;AAAA,MACnB,eAAe,KAAK;AAAA,MACpB,SAAS,KAAK;AAAA,MACd,KAAK,KAAK;AAAA,MACV,QAAQC,MAAK,KAAK,QAAQ,SAAS,QAAQ,EAAE,CAAC;AAAA,IAChD,CAAC;AACD,cAAU,QAAQ,EAAE,IAAI;AAExB,UAAM,iBAA8B,CAAC;AACrC,eAAW,QAAQ,SAAS,OAAO;AACjC,YAAM,SAAS,eAAe;AAAA,QAC5B;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA,WAAW,KAAK;AAAA,QAChB;AAAA,MACF,CAAC;AACD,UAAI,SAAU,mBAAkB,MAAM;AACtC,qBAAe,KAAK,MAAM;AAC1B,cAAQ,KAAK,MAAM;AAAA,IACrB;AAEA,cAAU,QAAQ,EAAE,IAAI;AAAA,MACtB,WAAW,QAAQ;AAAA,MACnB;AAAA,MACA,OAAO,QAAQ;AAAA,MACf,SAAS,eAAe;AAAA,MACxB,eAAe,KAAK,eAAe,IAAI,WAAW,CAAC;AAAA,MACnD,cAAc,eAAe,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,SAAS,CAAC;AAAA,MAC9D,WAAW,0BAA0B,cAAc;AAAA,IACrD;AAAA,EACF;AAGA,QAAM,YAAY,0BAA0B,OAAO;AACnD,MAAI,kBAAkB,UAAU;AAC9B,sBAAkB,SAAS,EAAE,YAAY,KAAK,cAAc,KAAK,CAAC;AAAA,EACpE,WAAW,kBAAkB,UAAU,UAAU,YAAY,QAAQ;AAEnE,YAAQ;AAAA,MACN,yCAAyC,UAAU,OAAO,WAAM,UAAU,SAAS;AAAA,IACrF;AAAA,EACF;AAGA,QAAM,aAAa,OAAO,SAAS,CAAC,MAAM,EAAE,UAAU;AACtD,QAAM,YAAY,KAAK,YACnB,gBAAgB,SAAS,KAAK,WAAW,KAAK,SAAS,IACvD;AAEJ,SAAO,EAAE,UAAU,cAAc,SAAS,WAAW,YAAY,WAAW,WAAW,UAAU;AACnG;AAGA,SAAS,YAAY,GAAsB;AACzC,SAAO,EAAE,QAAQ,gBAAgB,EAAE,QAAQ,eAAe;AAC5D;AAEA,SAAS,OACP,SACA,OACgC;AAChC,QAAM,SAAS,oBAAI,IAAsB;AACzC,aAAW,KAAK,SAAS;AACvB,UAAM,MAAM,MAAM,CAAC;AACnB,QAAI,QAAQ,OAAW;AACvB,UAAM,MAAM,OAAO,IAAI,GAAG,KAAK,CAAC;AAChC,QAAI,KAAK,YAAY,CAAC,CAAC;AACvB,WAAO,IAAI,KAAK,GAAG;AAAA,EACrB;AACA,QAAM,MAAsC,CAAC;AAC7C,aAAW,CAAC,KAAK,EAAE,KAAK,OAAQ,KAAI,GAAG,IAAI,EAAE,eAAe,KAAK,EAAE,GAAG,GAAG,GAAG,OAAO;AACnF,SAAO;AACT;AAEA,SAAS,gBACP,SACA,WACA,WACgC;AAChC,QAAM,sBAAsB,oBAAI,IAAoB;AACpD,aAAW,KAAK,UAAW,qBAAoB,IAAI,EAAE,IAAI,UAAU,CAAC,CAAC;AACrE,SAAO,OAAO,SAAS,CAAC,MAAO,EAAE,aAAa,oBAAoB,IAAI,EAAE,UAAU,IAAI,MAAU;AAClG;;;AC/YA,SAAS,oBAAoB;AAC7B,SAAS,cAAAC,mBAAkB;AAC3B,SAAS,UAAU,YAAY,QAAAC,aAAY;AAuBpC,IAAM,uBAAN,cAAmC,MAAM;AAAA,EAC9C,YACE,SACS,OACT;AACA,UAAM,OAAO;AAFJ;AAGT,SAAK,OAAO;AAAA,EACd;AAAA,EAJW;AAKb;AAaA,SAAS,WAAW,MAAgB,KAAqB;AACvD,MAAI;AACF,WAAO,aAAa,OAAO,MAAM,EAAE,KAAK,UAAU,OAAO,CAAC,EAAE,KAAK;AAAA,EACnE,SAAS,KAAK;AACZ,UAAM,SACJ,OAAO,OAAO,QAAQ,YAAY,YAAY,MAC1C,OAAQ,IAA4B,MAAM,IAC1C;AACN,UAAM,IAAI,qBAAqB,OAAO,KAAK,KAAK,GAAG,CAAC,YAAY,UAAU,OAAO,GAAG,CAAC,IAAI,GAAG;AAAA,EAC9F;AACF;AAGA,SAAS,KAAK,OAAuB;AACnC,SACE,MACG,YAAY,EACZ,QAAQ,eAAe,GAAG,EAC1B,QAAQ,YAAY,EAAE,EACtB,MAAM,GAAG,EAAE,KAAK;AAEvB;AAEO,SAAS,mBAAmB,MAAkD;AACnF,QAAM,MAAM,KAAK,OAAO;AACxB,QAAM,cAAc,KAAK,eAAeA,MAAK,KAAK,UAAU,YAAY;AACxE,QAAM,eAAe,KAAK,gBAAgB;AAE1C,SAAO;AAAA,IACL,MAAM,OAAO,EAAE,SAAS,MAAM,GAAG;AAC/B,YAAM,KAAK,GAAG,KAAK,KAAK,CAAC,IAAI,KAAK,IAAI,EAAE,SAAS,EAAE,CAAC,IAAI,KAAK,OAAO,EAAE,SAAS,EAAE,EAAE,MAAM,GAAG,CAAC,CAAC;AAC9F,YAAM,SAAS,GAAG,YAAY,IAAI,EAAE;AACpC,YAAM,OAAOA,MAAK,aAAa,EAAE;AACjC,UAAI,CAAC,YAAY,OAAO,MAAM,QAAQ,MAAM,OAAO,GAAG,KAAK,QAAQ;AACnE,aAAO,EAAE,MAAM,QAAQ,QAAQ;AAAA,IACjC;AAAA,IAEA,MAAM,SAAS,UAAU,SAAS;AAGhC,YAAM,SAAS,IAAI,CAAC,UAAU,aAAa,GAAG,SAAS,IAAI;AAC3D,UAAI,OAAO,SAAS,GAAG;AACrB,YAAI,CAAC,OAAO,IAAI,GAAG,SAAS,IAAI;AAChC,YAAI,CAAC,UAAU,MAAM,OAAO,GAAG,SAAS,IAAI;AAAA,MAC9C;AACA,aAAO;AAAA,QACL,MAAM;AAAA,QACN,aAAa,SAAS;AAAA,QACtB,SAAS,SAAS;AAAA,QAClB;AAAA,MACF;AAAA,IACF;AAAA,IAEA,MAAM,QAAQ,UAAU;AAGtB,UAAI,CAAC,YAAY,UAAU,WAAW,SAAS,IAAI,GAAG,KAAK,QAAQ;AACnE,UAAI,CAAC,UAAU,MAAM,SAAS,MAAM,GAAG,KAAK,QAAQ;AAAA,IACtD;AAAA,EACF;AACF;AAKO,SAAS,oBAAoB,SAAsB,aAA8B;AACtF,MAAI,WAAW,QAAQ,WAAW,KAAKD,YAAW,QAAQ,WAAW,EAAG,QAAO,QAAQ;AACvF,MAAI,YAAa,QAAOC,MAAK,aAAa,SAAS,QAAQ,WAAW,CAAC;AACvE,SAAO,QAAQ;AACjB;","names":["createHash","join","createHash","join","existsSync","join"]}
@@ -1,6 +1,6 @@
1
1
  import {
2
2
  cohensD
3
- } from "./chunk-S3SDD56V.js";
3
+ } from "./chunk-ITBRCT73.js";
4
4
  import {
5
5
  argHash,
6
6
  groupBy,
@@ -551,4 +551,4 @@ export {
551
551
  iqr,
552
552
  welchsTTest
553
553
  };
554
- //# sourceMappingURL=chunk-QDOSODID.js.map
554
+ //# sourceMappingURL=chunk-3B7Y5AUR.js.map
@@ -54,4 +54,4 @@ export {
54
54
  VerificationError,
55
55
  ReplayError
56
56
  };
57
- //# sourceMappingURL=chunk-QYJT52YW.js.map
57
+ //# sourceMappingURL=chunk-3BFEG2F6.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/errors.ts"],"sourcesContent":["/**\n * Error taxonomy for `@tangle-network/agent-eval`.\n *\n * Every error this package throws as part of its *public contract* extends\n * `AgentEvalError`. Consumers can pattern-match by `instanceof <Subclass>` or\n * by the stable string `code` carried on the base class.\n *\n * The codes are stable across minor versions; new codes can be added, but\n * existing codes never change meaning. New subclasses are non-breaking.\n *\n * Internal invariant guards (`throw new Error('this should never happen')`)\n * remain plain `Error`s on purpose — they're programmer-mistake assertions,\n * not consumer-catchable contract failures.\n */\n\nexport type AgentEvalErrorCode =\n | 'validation'\n | 'not_found'\n | 'config'\n | 'capture_integrity'\n | 'judge'\n | 'verification'\n | 'replay'\n | 'backend_integrity'\n | 'profile_matrix'\n\nexport class AgentEvalError extends Error {\n /** Stable string code. Survives minification; safe to switch on. */\n readonly code: AgentEvalErrorCode\n\n constructor(code: AgentEvalErrorCode, message: string, options?: { cause?: unknown }) {\n super(message, options)\n this.name = this.constructor.name\n this.code = code\n }\n}\n\n/** Caller passed invalid arguments (out of range, mutually-exclusive options, bad shape). */\nexport class ValidationError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('validation', message, options)\n }\n}\n\n/** A named resource (run, span, rubric, scenario, dataset row, route) does not exist. */\nexport class NotFoundError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('not_found', message, options)\n }\n}\n\n/** Configuration missing or malformed (`HOME` unset, required image not supplied, env var absent). */\nexport class ConfigError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('config', message, options)\n }\n}\n\n/**\n * A run is missing the artifacts a launch-grade check requires:\n * raw HTTP capture absent, no LLM spans, route assertion failed, run-end\n * assertion tripped. Block ship on this; do not catch and move on.\n */\nexport class CaptureIntegrityError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('capture_integrity', message, options)\n }\n}\n\n/** A judge call failed in a way that's not retryable: schema parse failure, bad rubric, conflicting dimensions. */\nexport class JudgeError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('judge', message, options)\n }\n}\n\n/** A verifier signalled a hard failure (compile, test, schema) — distinct from a low judge score. */\nexport class VerificationError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('verification', message, options)\n }\n}\n\n/** Replay cache cannot satisfy a request: miss with no fallback, sink lacks list(), unsupported URL. */\nexport class ReplayError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('replay', message, options)\n }\n}\n"],"mappings":";AA0BO,IAAM,iBAAN,cAA6B,MAAM;AAAA;AAAA,EAE/B;AAAA,EAET,YAAY,MAA0B,SAAiB,SAA+B;AACpF,UAAM,SAAS,OAAO;AACtB,SAAK,OAAO,KAAK,YAAY;AAC7B,SAAK,OAAO;AAAA,EACd;AACF;AAGO,IAAM,kBAAN,cAA8B,eAAe;AAAA,EAClD,YAAY,SAAiB,SAA+B;AAC1D,UAAM,cAAc,SAAS,OAAO;AAAA,EACtC;AACF;AAGO,IAAM,gBAAN,cAA4B,eAAe;AAAA,EAChD,YAAY,SAAiB,SAA+B;AAC1D,UAAM,aAAa,SAAS,OAAO;AAAA,EACrC;AACF;AAGO,IAAM,cAAN,cAA0B,eAAe;AAAA,EAC9C,YAAY,SAAiB,SAA+B;AAC1D,UAAM,UAAU,SAAS,OAAO;AAAA,EAClC;AACF;AAOO,IAAM,wBAAN,cAAoC,eAAe;AAAA,EACxD,YAAY,SAAiB,SAA+B;AAC1D,UAAM,qBAAqB,SAAS,OAAO;AAAA,EAC7C;AACF;AAGO,IAAM,aAAN,cAAyB,eAAe;AAAA,EAC7C,YAAY,SAAiB,SAA+B;AAC1D,UAAM,SAAS,SAAS,OAAO;AAAA,EACjC;AACF;AAGO,IAAM,oBAAN,cAAgC,eAAe;AAAA,EACpD,YAAY,SAAiB,SAA+B;AAC1D,UAAM,gBAAgB,SAAS,OAAO;AAAA,EACxC;AACF;AAGO,IAAM,cAAN,cAA0B,eAAe;AAAA,EAC9C,YAAY,SAAiB,SAA+B;AAC1D,UAAM,UAAU,SAAS,OAAO;AAAA,EAClC;AACF;","names":[]}
@@ -4,7 +4,7 @@ import {
4
4
  } from "./chunk-NCRFYPS3.js";
5
5
  import {
6
6
  validateRunRecord
7
- } from "./chunk-NCK5QLGT.js";
7
+ } from "./chunk-F3SRAAZO.js";
8
8
  import {
9
9
  TraceEmitter
10
10
  } from "./chunk-TVVP3ZZQ.js";
@@ -610,4 +610,4 @@ export {
610
610
  runProposeReviewAsControlLoop,
611
611
  controlFailureClassFromVerification
612
612
  };
613
- //# sourceMappingURL=chunk-J4DIMSRK.js.map
613
+ //# sourceMappingURL=chunk-6EKXFFGQ.js.map
@@ -1,6 +1,6 @@
1
1
  import {
2
2
  callLlmJson
3
- } from "./chunk-VXNVVBZO.js";
3
+ } from "./chunk-IHDHUN2X.js";
4
4
 
5
5
  // src/wire/schemas.ts
6
6
  import { extendZodWithOpenApi } from "@asteasolutions/zod-to-openapi";
@@ -1002,4 +1002,4 @@ export {
1002
1002
  startServer,
1003
1003
  startServerAsync
1004
1004
  };
1005
- //# sourceMappingURL=chunk-63EPZQUZ.js.map
1005
+ //# sourceMappingURL=chunk-6REHLN5J.js.map
@@ -2,14 +2,14 @@ import {
2
2
  buildAgentProfileCell,
3
3
  validateRunRecord,
4
4
  verifyAgentProfileCell
5
- } from "./chunk-NCK5QLGT.js";
5
+ } from "./chunk-F3SRAAZO.js";
6
6
  import {
7
7
  researchReport
8
- } from "./chunk-OLIBRKRD.js";
8
+ } from "./chunk-KX6F6NCG.js";
9
9
  import {
10
10
  RunIntegrityError,
11
11
  assertRunCaptured
12
- } from "./chunk-UBPIXOC4.js";
12
+ } from "./chunk-SBCB6VZY.js";
13
13
  import {
14
14
  TraceEmitter
15
15
  } from "./chunk-TVVP3ZZQ.js";
@@ -19,7 +19,7 @@ import {
19
19
  } from "./chunk-VSMTAMNK.js";
20
20
  import {
21
21
  assertLlmRoute
22
- } from "./chunk-VXNVVBZO.js";
22
+ } from "./chunk-IHDHUN2X.js";
23
23
  import {
24
24
  FileSystemRawProviderSink
25
25
  } from "./chunk-PC4UYEBM.js";
@@ -329,4 +329,4 @@ function defaultRunId(params) {
329
329
  export {
330
330
  runEvalCampaign
331
331
  };
332
- //# sourceMappingURL=chunk-GM476SZU.js.map
332
+ //# sourceMappingURL=chunk-AIWHLG7J.js.map
@@ -1,9 +1,9 @@
1
1
  import {
2
2
  summaryTable
3
- } from "./chunk-OLIBRKRD.js";
3
+ } from "./chunk-KX6F6NCG.js";
4
4
  import {
5
5
  VerificationError
6
- } from "./chunk-QYJT52YW.js";
6
+ } from "./chunk-3BFEG2F6.js";
7
7
 
8
8
  // src/release-confidence.ts
9
9
  var DEFAULT_THRESHOLDS = {
@@ -574,4 +574,4 @@ export {
574
574
  judgeReplayGate,
575
575
  renderReleaseReport
576
576
  };
577
- //# sourceMappingURL=chunk-AIXHUIHG.js.map
577
+ //# sourceMappingURL=chunk-B26KI423.js.map
@@ -3,7 +3,7 @@ import {
3
3
  } from "./chunk-VSMTAMNK.js";
4
4
  import {
5
5
  ValidationError
6
- } from "./chunk-QYJT52YW.js";
6
+ } from "./chunk-3BFEG2F6.js";
7
7
 
8
8
  // src/agent-profile-cell.ts
9
9
  var AgentProfileCellValidationError = class extends ValidationError {
@@ -538,4 +538,4 @@ export {
538
538
  parseRunRecordSafe,
539
539
  roundTripRunRecord
540
540
  };
541
- //# sourceMappingURL=chunk-NCK5QLGT.js.map
541
+ //# sourceMappingURL=chunk-F3SRAAZO.js.map
@@ -4,7 +4,7 @@ import {
4
4
  import {
5
5
  AgentEvalError,
6
6
  ValidationError
7
- } from "./chunk-QYJT52YW.js";
7
+ } from "./chunk-3BFEG2F6.js";
8
8
 
9
9
  // src/integrity/backend-integrity.ts
10
10
  var BackendIntegrityError = class extends AgentEvalError {
@@ -864,4 +864,4 @@ export {
864
864
  buildReflectionPrompt,
865
865
  parseReflectionResponse
866
866
  };
867
- //# sourceMappingURL=chunk-GBHRUAOF.js.map
867
+ //# sourceMappingURL=chunk-GMXHLSLL.js.map
@@ -5,7 +5,7 @@ import {
5
5
  import {
6
6
  AgentEvalError,
7
7
  CaptureIntegrityError
8
- } from "./chunk-QYJT52YW.js";
8
+ } from "./chunk-3BFEG2F6.js";
9
9
 
10
10
  // src/llm-client.ts
11
11
  var LlmCallError = class extends AgentEvalError {
@@ -494,4 +494,4 @@ export {
494
494
  probeLlm,
495
495
  LlmClient
496
496
  };
497
- //# sourceMappingURL=chunk-VXNVVBZO.js.map
497
+ //# sourceMappingURL=chunk-IHDHUN2X.js.map
@@ -1,6 +1,6 @@
1
1
  import {
2
2
  ValidationError
3
- } from "./chunk-QYJT52YW.js";
3
+ } from "./chunk-3BFEG2F6.js";
4
4
 
5
5
  // src/judge-calibration.ts
6
6
  function calibrateJudge(golden, candidate) {
@@ -928,4 +928,4 @@ export {
928
928
  benjaminiHochberg,
929
929
  pairedBootstrap
930
930
  };
931
- //# sourceMappingURL=chunk-S3SDD56V.js.map
931
+ //# sourceMappingURL=chunk-ITBRCT73.js.map
@@ -5,7 +5,7 @@ import {
5
5
  pairedBootstrap,
6
6
  pairedMde,
7
7
  wilcoxonSignedRank
8
- } from "./chunk-S3SDD56V.js";
8
+ } from "./chunk-ITBRCT73.js";
9
9
  import {
10
10
  canonicalize,
11
11
  hashJson
@@ -877,4 +877,4 @@ export {
877
877
  RESEARCH_REPORT_HARD_PAIR_FLOOR,
878
878
  researchReport
879
879
  };
880
- //# sourceMappingURL=chunk-OLIBRKRD.js.map
880
+ //# sourceMappingURL=chunk-KX6F6NCG.js.map
@@ -1,6 +1,6 @@
1
1
  import {
2
2
  confidenceInterval
3
- } from "./chunk-S3SDD56V.js";
3
+ } from "./chunk-ITBRCT73.js";
4
4
 
5
5
  // src/campaign/run-campaign.ts
6
6
  import { createHash } from "crypto";
@@ -187,13 +187,22 @@ async function executeCell(args) {
187
187
  }
188
188
  };
189
189
  let costSoFar = 0;
190
+ const tokensSoFar = { input: 0, output: 0 };
190
191
  const cost = {
191
192
  observe(amount, source) {
192
193
  costSoFar += amount;
193
194
  trace.span(`cost.${source}`, { amountUsd: amount }).end();
194
195
  },
196
+ observeTokens(usage) {
197
+ tokensSoFar.input += usage.input;
198
+ tokensSoFar.output += usage.output;
199
+ if (usage.cached) tokensSoFar.cached = (tokensSoFar.cached ?? 0) + usage.cached;
200
+ },
195
201
  current() {
196
202
  return costSoFar;
203
+ },
204
+ tokens() {
205
+ return { ...tokensSoFar };
197
206
  }
198
207
  };
199
208
  const placement = args.opts.cellPlacement?.({
@@ -241,6 +250,7 @@ async function executeCell(args) {
241
250
  artifact: artifact ?? null,
242
251
  judgeScores,
243
252
  costUsd: costSoFar,
253
+ tokenUsage: { ...tokensSoFar },
244
254
  durationMs: Date.now() - startMs,
245
255
  seed: args.slot.cellSeed,
246
256
  cached: false,
@@ -287,6 +297,7 @@ function skippedCell(slot, reason) {
287
297
  artifact: null,
288
298
  judgeScores: {},
289
299
  costUsd: 0,
300
+ tokenUsage: { input: 0, output: 0 },
290
301
  durationMs: 0,
291
302
  seed: slot.cellSeed,
292
303
  cached: false,
@@ -363,4 +374,4 @@ export {
363
374
  inMemoryCampaignStorage,
364
375
  runCampaign
365
376
  };
366
- //# sourceMappingURL=chunk-NOPYCRNG.js.map
377
+ //# sourceMappingURL=chunk-OLULBECP.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/campaign/run-campaign.ts","../src/campaign/storage.ts"],"sourcesContent":["/**\n * @experimental\n *\n * `runCampaign` — Pass A substrate primitive. ONE function that orchestrates\n * scenarios → dispatch → artifacts → judges → aggregates, with full\n * reproducibility (seed + manifest hash), cell-level resumability, bootstrap\n * CIs, and the `LabeledScenarioStore` capture flywheel.\n *\n * Improvement loops (optimizer / gate / autoOnPromote) ride on top of this\n * primitive but live in `presets/run-improvement-loop.ts`. This file keeps\n * the core orchestrator minimal — Phase 1 of the Pass A track.\n */\n\nimport { createHash } from 'node:crypto'\nimport { join } from 'node:path'\nimport { confidenceInterval } from '../statistics'\nimport { type CampaignStorage, fsCampaignStorage } from './storage'\nimport type {\n CampaignAggregates,\n CampaignArtifactWriter,\n CampaignCellResult,\n CampaignCostMeter,\n CampaignResult,\n CampaignTokenUsage,\n CampaignTraceWriter,\n DispatchContext,\n DispatchFn,\n JudgeAggregate,\n JudgeConfig,\n JudgeScore,\n LabeledScenarioStore,\n Scenario,\n ScenarioAggregate,\n TraceSpan,\n} from './types'\n\nexport interface RunCampaignOptions<TScenario extends Scenario, TArtifact> {\n scenarios: TScenario[]\n dispatch: DispatchFn<TScenario, TArtifact>\n judges?: JudgeConfig<TArtifact, TScenario>[]\n /** Required for reproducibility. Default 42. */\n seed?: number\n /** Per-scenario replicates for CI bands. Default 1; raise to 5+ for\n * bootstrap-tight intervals on critical eval. */\n reps?: number\n /** When true (default), completed cells are cached by\n * (manifestHash, scenarioId, rep, generation). Re-runs skip cached cells. */\n resumable?: boolean\n /** Optional store — when present, every artifact + judge score is captured\n * with the configured `captureSource`. Capture is default ON; pass `'off'`\n * to disable. */\n labeledStore?: LabeledScenarioStore | 'off'\n captureSource?: 'production-trace' | 'eval-run' | 'manual' | 'red-team' | 'synthetic'\n captureSourceVersionHash?: string\n /** Wall-clock cost cap across all cells. Cells beyond ceiling are skipped. */\n costCeiling?: number\n /** Max concurrent cells. Default 2. */\n maxConcurrency?: number\n /** Required: where artifacts + traces land. */\n runDir: string\n /** Tracing posture. Default is the substrate's `FileSystemTraceStore` rooted\n * at `<runDir>/traces/`. `'off'` disables capture entirely — substrate\n * refuses this when the caller wires `autoOnPromote !== 'none'`. */\n tracing?: 'on' | 'off'\n /** Test seam — override the wall clock for deterministic tests. */\n now?: () => Date\n /** Test seam — override per-cell trace writer factory. */\n buildTraceWriter?: (cellId: string, dir: string) => CampaignTraceWriter\n /** Storage backend for run/cell dirs, the resumability cache, artifacts,\n * and trace spans. Default: the Node filesystem (`fsCampaignStorage`).\n * Pass `inMemoryCampaignStorage()` to run in a filesystem-less runtime\n * (Cloudflare Workers, Deno, edge) — the `CampaignResult` is still\n * produced; artifacts/traces just aren't persisted to disk. */\n storage?: CampaignStorage\n /**\n * Optional per-cell placement strategy. Returns an opaque string the\n * substrate forwards as `ctx.placement` to the Dispatch — placement-aware\n * Dispatches (e.g. `httpDispatch` from `/adapters/http`) use it to route\n * each cell to the right worker, region, or sandbox. When unset, every\n * cell receives `ctx.placement = undefined` and behaves identically to\n * the in-process case.\n *\n * @example\n * cellPlacement: ({ scenario }) => scenario.tags?.includes('eu') ? 'eu-west' : 'us-east'\n */\n cellPlacement?: (input: {\n scenario: TScenario\n rep: number\n generation?: number\n }) => string | undefined\n}\n\nexport async function runCampaign<TScenario extends Scenario, TArtifact>(\n opts: RunCampaignOptions<TScenario, TArtifact>,\n): Promise<CampaignResult<TArtifact, TScenario>> {\n const seed = opts.seed ?? 42\n const reps = opts.reps ?? 1\n const resumable = opts.resumable ?? true\n const maxConcurrency = opts.maxConcurrency ?? 2\n const now = opts.now ?? (() => new Date())\n const judges = opts.judges ?? []\n const storage = opts.storage ?? fsCampaignStorage()\n\n storage.ensureDir(opts.runDir)\n\n const manifestHash = computeManifestHash({\n scenarios: opts.scenarios,\n judges: judges as unknown as JudgeConfig<unknown>[],\n dispatchRef: opts.dispatch.name || 'anonymous',\n seed,\n reps,\n })\n\n const startedAt = now()\n const cells: CampaignCellResult<TArtifact>[] = []\n const artifactsByPath: Record<string, string> = {}\n\n // Build the cell schedule (scenario × rep).\n const schedule: Array<{ scenario: TScenario; rep: number; cellId: string; cellSeed: number }> = []\n let cellIndex = 0\n for (const scenario of opts.scenarios) {\n for (let rep = 0; rep < reps; rep++) {\n const cellId = `${scenario.id}:${rep}`\n const cellSeed = seed + cellIndex\n schedule.push({ scenario, rep, cellId, cellSeed })\n cellIndex += 1\n }\n }\n\n // Concurrency-limited execution.\n let totalCostUsd = 0\n let costCeilingReached = false\n const abortController = new AbortController()\n // Concurrency lanes that drain the cell schedule. Named \"lanes\" — not\n // \"workers\" — to avoid clashing with the taxonomy's worker (= the agent\n // harness in a sandbox, invoked behind `dispatch`). See loop-taxonomy.md.\n const lanes: Promise<void>[] = []\n let nextIdx = 0\n const cellsRef = cells\n\n for (let i = 0; i < maxConcurrency; i++) {\n lanes.push(\n (async () => {\n while (true) {\n const myIdx = nextIdx++\n if (myIdx >= schedule.length) return\n const slot = schedule[myIdx]!\n if (costCeilingReached) {\n cellsRef.push(skippedCell(slot, 'cost_ceiling_reached'))\n continue\n }\n const result = await executeCell({\n slot,\n opts,\n manifestHash,\n resumable,\n now,\n storage,\n buildTraceWriter: opts.buildTraceWriter ?? defaultBuildTraceWriter(storage),\n signal: abortController.signal,\n })\n cellsRef.push(result.cell)\n totalCostUsd += result.cell.costUsd\n Object.assign(artifactsByPath, result.artifactsByPath)\n if (opts.costCeiling !== undefined && totalCostUsd >= opts.costCeiling) {\n costCeilingReached = true\n }\n // Capture into LabeledScenarioStore unless explicitly disabled.\n if (opts.labeledStore && opts.labeledStore !== 'off' && !result.cell.error) {\n await captureToStore({\n store: opts.labeledStore,\n cell: result.cell,\n scenario: slot.scenario,\n opts,\n now,\n }).catch((err) => {\n // Capture failures are non-fatal — log but don't crash the campaign.\n // (Trace would normally land here.)\n console.warn(\n `[runCampaign] capture failed for ${result.cell.cellId}: ${err instanceof Error ? err.message : String(err)}`,\n )\n })\n }\n }\n })(),\n )\n }\n await Promise.all(lanes)\n\n const endedAt = now()\n cellsRef.sort((a, b) => a.cellId.localeCompare(b.cellId))\n\n const aggregates = computeAggregates(\n cellsRef,\n judges as unknown as JudgeConfig<TArtifact>[],\n seed,\n )\n\n return {\n manifestHash,\n seed,\n startedAt: startedAt.toISOString(),\n endedAt: endedAt.toISOString(),\n durationMs: endedAt.getTime() - startedAt.getTime(),\n cells: cellsRef,\n aggregates,\n runDir: opts.runDir,\n artifactsByPath,\n scenarios: opts.scenarios.map((s) => ({ id: s.id, kind: s.kind })),\n }\n}\n\n// ── Internals ─────────────────────────────────────────────────────────\n\ninterface ExecuteCellArgs<TScenario extends Scenario, TArtifact> {\n slot: { scenario: TScenario; rep: number; cellId: string; cellSeed: number }\n opts: RunCampaignOptions<TScenario, TArtifact>\n manifestHash: string\n resumable: boolean\n now: () => Date\n storage: CampaignStorage\n buildTraceWriter: (cellId: string, dir: string) => CampaignTraceWriter\n signal: AbortSignal\n}\n\nasync function executeCell<TScenario extends Scenario, TArtifact>(\n args: ExecuteCellArgs<TScenario, TArtifact>,\n): Promise<{ cell: CampaignCellResult<TArtifact>; artifactsByPath: Record<string, string> }> {\n const storage = args.storage\n const cellDir = join(args.opts.runDir, args.slot.cellId.replace(/[^a-zA-Z0-9_-]/g, '_'))\n storage.ensureDir(cellDir)\n\n // Resumability: cache key = (manifestHash, scenarioId, rep)\n const cachePath = join(cellDir, 'cached-result.json')\n if (args.resumable) {\n const raw = storage.read(cachePath)\n if (raw !== undefined) {\n try {\n const cached = JSON.parse(raw) as CampaignCellResult<TArtifact>\n if (cached.cellId === args.slot.cellId) {\n return { cell: { ...cached, cached: true }, artifactsByPath: {} }\n }\n } catch {\n // Corrupt cache — fall through to re-run.\n }\n }\n }\n\n const startMs = Date.now()\n const trace = args.buildTraceWriter(args.slot.cellId, cellDir)\n const artifactsByPath: Record<string, string> = {}\n const artifacts: CampaignArtifactWriter = {\n async write(path, content) {\n const fullPath = join(cellDir, path)\n storage.ensureDir(join(fullPath, '..'))\n storage.write(fullPath, content)\n artifactsByPath[`${args.slot.cellId}/${path}`] = fullPath\n return fullPath\n },\n async writeJson(path, value) {\n return artifacts.write(path, JSON.stringify(value, null, 2))\n },\n }\n let costSoFar = 0\n const tokensSoFar: CampaignTokenUsage = { input: 0, output: 0 }\n const cost: CampaignCostMeter = {\n observe(amount, source) {\n costSoFar += amount\n trace.span(`cost.${source}`, { amountUsd: amount }).end()\n },\n observeTokens(usage) {\n tokensSoFar.input += usage.input\n tokensSoFar.output += usage.output\n if (usage.cached) tokensSoFar.cached = (tokensSoFar.cached ?? 0) + usage.cached\n },\n current() {\n return costSoFar\n },\n tokens() {\n return { ...tokensSoFar }\n },\n }\n\n const placement = args.opts.cellPlacement?.({\n scenario: args.slot.scenario,\n rep: args.slot.rep,\n })\n\n const ctx: DispatchContext = {\n cellId: args.slot.cellId,\n rep: args.slot.rep,\n seed: args.slot.cellSeed,\n signal: args.signal,\n trace,\n artifacts,\n cost,\n placement,\n }\n\n let artifact: TArtifact | undefined\n let errorMessage: string | undefined\n try {\n artifact = await args.opts.dispatch(args.slot.scenario, ctx)\n } catch (err) {\n errorMessage = err instanceof Error ? err.message : String(err)\n }\n\n // Run judges (only if we have an artifact). A judge that throws invalidates\n // the cell — recorded as `error`, NOT folded into a fake composite:0 (a fake\n // zero is indistinguishable from a real zero and poisons every aggregate).\n const judgeScores: Record<string, JudgeScore> = {}\n if (artifact !== undefined) {\n for (const judge of args.opts.judges ?? []) {\n if (judge.appliesTo && !judge.appliesTo(args.slot.scenario)) continue\n try {\n judgeScores[judge.name] = await runJudgeCell(judge, {\n artifact,\n scenario: args.slot.scenario,\n signal: args.signal,\n })\n } catch (err) {\n errorMessage = `judge '${judge.name}' failed: ${err instanceof Error ? err.message : String(err)}`\n break\n }\n }\n }\n\n await trace.flush()\n\n const cell: CampaignCellResult<TArtifact> = {\n cellId: args.slot.cellId,\n scenarioId: args.slot.scenario.id,\n rep: args.slot.rep,\n artifact: (artifact ?? null) as TArtifact,\n judgeScores,\n costUsd: costSoFar,\n tokenUsage: { ...tokensSoFar },\n durationMs: Date.now() - startMs,\n seed: args.slot.cellSeed,\n cached: false,\n error: errorMessage,\n }\n\n if (!errorMessage && args.resumable) {\n storage.write(cachePath, JSON.stringify(cell))\n }\n\n return { cell, artifactsByPath }\n}\n\nasync function runJudgeCell<TArtifact, TScenario extends Scenario>(\n judge: JudgeConfig<TArtifact, TScenario>,\n input: { artifact: TArtifact; scenario: TScenario; signal: AbortSignal },\n): Promise<JudgeScore> {\n return judge.score(input)\n}\n\nfunction defaultBuildTraceWriter(\n storage: CampaignStorage,\n): (cellId: string, dir: string) => CampaignTraceWriter {\n return (cellId, dir) => {\n const spans: Array<Record<string, unknown>> = []\n return {\n span(name, attributes) {\n const startMs = Date.now()\n const record: Record<string, unknown> = { name, cellId, startMs, ...(attributes ?? {}) }\n const finish: TraceSpan = {\n end(endAttrs) {\n record.durationMs = Date.now() - startMs\n if (endAttrs) Object.assign(record, endAttrs)\n spans.push(record)\n },\n setAttribute(key, value) {\n record[key] = value\n },\n }\n return finish\n },\n async flush() {\n storage.write(join(dir, 'spans.jsonl'), spans.map((s) => JSON.stringify(s)).join('\\n'))\n },\n }\n }\n}\n\nfunction skippedCell<TScenario extends Scenario, TArtifact>(\n slot: { scenario: TScenario; rep: number; cellId: string; cellSeed: number },\n reason: string,\n): CampaignCellResult<TArtifact> {\n return {\n cellId: slot.cellId,\n scenarioId: slot.scenario.id,\n rep: slot.rep,\n artifact: null as unknown as TArtifact,\n judgeScores: {},\n costUsd: 0,\n tokenUsage: { input: 0, output: 0 },\n durationMs: 0,\n seed: slot.cellSeed,\n cached: false,\n error: `skipped: ${reason}`,\n }\n}\n\ninterface CaptureArgs<TScenario extends Scenario, TArtifact> {\n store: LabeledScenarioStore\n cell: CampaignCellResult<TArtifact>\n scenario: TScenario\n opts: RunCampaignOptions<TScenario, TArtifact>\n now: () => Date\n}\n\nasync function captureToStore<TScenario extends Scenario, TArtifact>(\n args: CaptureArgs<TScenario, TArtifact>,\n): Promise<void> {\n await args.store.observe({\n scenario: args.scenario,\n artifact: args.cell.artifact,\n judgeScores: args.cell.judgeScores,\n source: args.opts.captureSource ?? 'eval-run',\n sourceVersionHash: args.opts.captureSourceVersionHash ?? 'unknown',\n capturedAt: args.now().toISOString(),\n redactionStatus: 'raw',\n })\n}\n\n// ── Aggregates + manifest hash ────────────────────────────────────────\n\nfunction computeManifestHash(input: {\n scenarios: Scenario[]\n judges: JudgeConfig<unknown>[]\n dispatchRef: string\n seed: number\n reps: number\n}): string {\n const canonical = {\n scenarios: input.scenarios.map((s) => ({ id: s.id, kind: s.kind })),\n judges: input.judges.map((j) => ({ name: j.name, dims: j.dimensions.map((d) => d.key) })),\n dispatch: input.dispatchRef,\n seed: input.seed,\n reps: input.reps,\n }\n return createHash('sha256').update(JSON.stringify(canonical)).digest('hex')\n}\n\nfunction computeAggregates<TArtifact>(\n cells: CampaignCellResult<TArtifact>[],\n judges: JudgeConfig<TArtifact>[],\n seed: number,\n): CampaignAggregates {\n const byJudge: Record<string, JudgeAggregate> = {}\n for (const judge of judges) {\n const scores: number[] = []\n for (const cell of cells) {\n const s = cell.judgeScores[judge.name]\n if (s !== undefined) scores.push(s.composite)\n }\n byJudge[judge.name] = aggregate(scores, seed)\n }\n const byScenario: Record<string, ScenarioAggregate> = {}\n const scenarioGroups = new Map<string, number[]>()\n for (const cell of cells) {\n const composites = Object.values(cell.judgeScores).map((s) => s.composite)\n if (composites.length === 0) continue\n const mean = composites.reduce((a, b) => a + b, 0) / composites.length\n const arr = scenarioGroups.get(cell.scenarioId) ?? []\n arr.push(mean)\n scenarioGroups.set(cell.scenarioId, arr)\n }\n for (const [scenarioId, samples] of scenarioGroups) {\n const ag = aggregate(samples, seed)\n byScenario[scenarioId] = { meanComposite: ag.mean, ci95: ag.ci95, n: ag.n }\n }\n return {\n byJudge,\n byScenario,\n totalCostUsd: cells.reduce((a, c) => a + c.costUsd, 0),\n cellsExecuted: cells.filter((c) => !c.error).length,\n cellsSkipped: cells.filter((c) => c.error?.startsWith('skipped:')).length,\n cellsCached: cells.filter((c) => c.cached).length,\n cellsFailed: cells.filter((c) => c.error && !c.error.startsWith('skipped:')).length,\n }\n}\n\n// Percentile bootstrap CI95 via seeded resampling. Deterministic for a given\n// seed — same campaign re-run produces identical CI bands. Falls back to\n// degenerate intervals at n<=1 (the bootstrap is undefined there).\nfunction aggregate(samples: number[], seed: number): JudgeAggregate {\n const n = samples.length\n if (n === 0) return { mean: 0, stdev: 0, ci95: [0, 0], n: 0 }\n const mean = samples.reduce((a, b) => a + b, 0) / n\n const variance = samples.reduce((a, b) => a + (b - mean) ** 2, 0) / Math.max(1, n - 1)\n const stdev = Math.sqrt(variance)\n const ci = confidenceInterval(samples, 0.95, { seed, resamples: 1000 })\n return { mean, stdev, ci95: [ci.lower, ci.upper], n }\n}\n","import { createRequire } from 'node:module'\n\n/**\n * @experimental\n *\n * `CampaignStorage` — the filesystem seam `runCampaign` writes through\n * (run/cell dirs, the resumability cache, per-cell artifacts, trace spans).\n *\n * The default (`fsCampaignStorage`) is the Node filesystem — identical\n * behavior to the inline `node:fs` calls it replaces, so existing CLI\n * consumers are unaffected. `inMemoryCampaignStorage` keeps everything in a\n * `Map`, so the substrate runs in environments WITHOUT a filesystem\n * (Cloudflare Workers, Deno Deploy, other edge runtimes) — the campaign\n * still produces its `CampaignResult` (cells + aggregates) in memory;\n * artifacts/traces simply aren't persisted to disk.\n *\n * Paths are opaque keys to the in-memory adapter — it does not parse them,\n * so the same `join(...)`-built paths work unchanged across both adapters.\n */\nexport interface CampaignStorage {\n /** Ensure a directory exists (recursive). No-op for in-memory. */\n ensureDir(dir: string): void\n /** Does this path exist (as a written file or an ensured dir)? */\n exists(path: string): boolean\n /** Read a UTF-8 file; `undefined` when missing or unreadable. */\n read(path: string): string | undefined\n /** Write a file (string or bytes). Parent dir is assumed ensured. */\n write(path: string, content: string | Uint8Array): void\n}\n\n/** Node-filesystem storage — the default. Lazily requires `node:fs` so the\n * module imports cleanly in non-Node runtimes (where the caller passes\n * `inMemoryCampaignStorage` instead and never constructs this).\n *\n * `createRequire(import.meta.url)` is the ESM-native lazy require — a bare\n * `require` is a ReferenceError under `\"type\": \"module\"`, which is exactly\n * the shape this package publishes. */\nexport function fsCampaignStorage(): CampaignStorage {\n const nodeRequire = createRequire(import.meta.url)\n const { existsSync, mkdirSync, readFileSync, writeFileSync } = nodeRequire(\n 'node:fs',\n ) as typeof import('node:fs')\n return {\n ensureDir(dir) {\n if (!existsSync(dir)) mkdirSync(dir, { recursive: true })\n },\n exists(path) {\n return existsSync(path)\n },\n read(path) {\n try {\n return readFileSync(path, 'utf8')\n } catch {\n return undefined\n }\n },\n write(path, content) {\n writeFileSync(path, content as Uint8Array)\n },\n }\n}\n\n/** In-memory storage for filesystem-less runtimes. Artifacts + trace spans\n * live in a `Map` for the duration of the run; the `CampaignResult` is\n * fully populated, but nothing is persisted to disk. */\nexport function inMemoryCampaignStorage(): CampaignStorage {\n const files = new Map<string, string | Uint8Array>()\n const dirs = new Set<string>()\n return {\n ensureDir(dir) {\n dirs.add(dir)\n },\n exists(path) {\n return files.has(path) || dirs.has(path)\n },\n read(path) {\n const value = files.get(path)\n if (value === undefined) return undefined\n return typeof value === 'string' ? value : new TextDecoder().decode(value)\n },\n write(path, content) {\n files.set(path, content)\n },\n }\n}\n"],"mappings":";;;;;AAaA,SAAS,kBAAkB;AAC3B,SAAS,YAAY;;;ACdrB,SAAS,qBAAqB;AAqCvB,SAAS,oBAAqC;AACnD,QAAM,cAAc,cAAc,YAAY,GAAG;AACjD,QAAM,EAAE,YAAY,WAAW,cAAc,cAAc,IAAI;AAAA,IAC7D;AAAA,EACF;AACA,SAAO;AAAA,IACL,UAAU,KAAK;AACb,UAAI,CAAC,WAAW,GAAG,EAAG,WAAU,KAAK,EAAE,WAAW,KAAK,CAAC;AAAA,IAC1D;AAAA,IACA,OAAO,MAAM;AACX,aAAO,WAAW,IAAI;AAAA,IACxB;AAAA,IACA,KAAK,MAAM;AACT,UAAI;AACF,eAAO,aAAa,MAAM,MAAM;AAAA,MAClC,QAAQ;AACN,eAAO;AAAA,MACT;AAAA,IACF;AAAA,IACA,MAAM,MAAM,SAAS;AACnB,oBAAc,MAAM,OAAqB;AAAA,IAC3C;AAAA,EACF;AACF;AAKO,SAAS,0BAA2C;AACzD,QAAM,QAAQ,oBAAI,IAAiC;AACnD,QAAM,OAAO,oBAAI,IAAY;AAC7B,SAAO;AAAA,IACL,UAAU,KAAK;AACb,WAAK,IAAI,GAAG;AAAA,IACd;AAAA,IACA,OAAO,MAAM;AACX,aAAO,MAAM,IAAI,IAAI,KAAK,KAAK,IAAI,IAAI;AAAA,IACzC;AAAA,IACA,KAAK,MAAM;AACT,YAAM,QAAQ,MAAM,IAAI,IAAI;AAC5B,UAAI,UAAU,OAAW,QAAO;AAChC,aAAO,OAAO,UAAU,WAAW,QAAQ,IAAI,YAAY,EAAE,OAAO,KAAK;AAAA,IAC3E;AAAA,IACA,MAAM,MAAM,SAAS;AACnB,YAAM,IAAI,MAAM,OAAO;AAAA,IACzB;AAAA,EACF;AACF;;;ADQA,eAAsB,YACpB,MAC+C;AAC/C,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,YAAY,KAAK,aAAa;AACpC,QAAM,iBAAiB,KAAK,kBAAkB;AAC9C,QAAM,MAAM,KAAK,QAAQ,MAAM,oBAAI,KAAK;AACxC,QAAM,SAAS,KAAK,UAAU,CAAC;AAC/B,QAAM,UAAU,KAAK,WAAW,kBAAkB;AAElD,UAAQ,UAAU,KAAK,MAAM;AAE7B,QAAM,eAAe,oBAAoB;AAAA,IACvC,WAAW,KAAK;AAAA,IAChB;AAAA,IACA,aAAa,KAAK,SAAS,QAAQ;AAAA,IACnC;AAAA,IACA;AAAA,EACF,CAAC;AAED,QAAM,YAAY,IAAI;AACtB,QAAM,QAAyC,CAAC;AAChD,QAAM,kBAA0C,CAAC;AAGjD,QAAM,WAA0F,CAAC;AACjG,MAAI,YAAY;AAChB,aAAW,YAAY,KAAK,WAAW;AACrC,aAAS,MAAM,GAAG,MAAM,MAAM,OAAO;AACnC,YAAM,SAAS,GAAG,SAAS,EAAE,IAAI,GAAG;AACpC,YAAM,WAAW,OAAO;AACxB,eAAS,KAAK,EAAE,UAAU,KAAK,QAAQ,SAAS,CAAC;AACjD,mBAAa;AAAA,IACf;AAAA,EACF;AAGA,MAAI,eAAe;AACnB,MAAI,qBAAqB;AACzB,QAAM,kBAAkB,IAAI,gBAAgB;AAI5C,QAAM,QAAyB,CAAC;AAChC,MAAI,UAAU;AACd,QAAM,WAAW;AAEjB,WAAS,IAAI,GAAG,IAAI,gBAAgB,KAAK;AACvC,UAAM;AAAA,OACH,YAAY;AACX,eAAO,MAAM;AACX,gBAAM,QAAQ;AACd,cAAI,SAAS,SAAS,OAAQ;AAC9B,gBAAM,OAAO,SAAS,KAAK;AAC3B,cAAI,oBAAoB;AACtB,qBAAS,KAAK,YAAY,MAAM,sBAAsB,CAAC;AACvD;AAAA,UACF;AACA,gBAAM,SAAS,MAAM,YAAY;AAAA,YAC/B;AAAA,YACA;AAAA,YACA;AAAA,YACA;AAAA,YACA;AAAA,YACA;AAAA,YACA,kBAAkB,KAAK,oBAAoB,wBAAwB,OAAO;AAAA,YAC1E,QAAQ,gBAAgB;AAAA,UAC1B,CAAC;AACD,mBAAS,KAAK,OAAO,IAAI;AACzB,0BAAgB,OAAO,KAAK;AAC5B,iBAAO,OAAO,iBAAiB,OAAO,eAAe;AACrD,cAAI,KAAK,gBAAgB,UAAa,gBAAgB,KAAK,aAAa;AACtE,iCAAqB;AAAA,UACvB;AAEA,cAAI,KAAK,gBAAgB,KAAK,iBAAiB,SAAS,CAAC,OAAO,KAAK,OAAO;AAC1E,kBAAM,eAAe;AAAA,cACnB,OAAO,KAAK;AAAA,cACZ,MAAM,OAAO;AAAA,cACb,UAAU,KAAK;AAAA,cACf;AAAA,cACA;AAAA,YACF,CAAC,EAAE,MAAM,CAAC,QAAQ;AAGhB,sBAAQ;AAAA,gBACN,oCAAoC,OAAO,KAAK,MAAM,KAAK,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,cAC7G;AAAA,YACF,CAAC;AAAA,UACH;AAAA,QACF;AAAA,MACF,GAAG;AAAA,IACL;AAAA,EACF;AACA,QAAM,QAAQ,IAAI,KAAK;AAEvB,QAAM,UAAU,IAAI;AACpB,WAAS,KAAK,CAAC,GAAG,MAAM,EAAE,OAAO,cAAc,EAAE,MAAM,CAAC;AAExD,QAAM,aAAa;AAAA,IACjB;AAAA,IACA;AAAA,IACA;AAAA,EACF;AAEA,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,WAAW,UAAU,YAAY;AAAA,IACjC,SAAS,QAAQ,YAAY;AAAA,IAC7B,YAAY,QAAQ,QAAQ,IAAI,UAAU,QAAQ;AAAA,IAClD,OAAO;AAAA,IACP;AAAA,IACA,QAAQ,KAAK;AAAA,IACb;AAAA,IACA,WAAW,KAAK,UAAU,IAAI,CAAC,OAAO,EAAE,IAAI,EAAE,IAAI,MAAM,EAAE,KAAK,EAAE;AAAA,EACnE;AACF;AAeA,eAAe,YACb,MAC2F;AAC3F,QAAM,UAAU,KAAK;AACrB,QAAM,UAAU,KAAK,KAAK,KAAK,QAAQ,KAAK,KAAK,OAAO,QAAQ,mBAAmB,GAAG,CAAC;AACvF,UAAQ,UAAU,OAAO;AAGzB,QAAM,YAAY,KAAK,SAAS,oBAAoB;AACpD,MAAI,KAAK,WAAW;AAClB,UAAM,MAAM,QAAQ,KAAK,SAAS;AAClC,QAAI,QAAQ,QAAW;AACrB,UAAI;AACF,cAAM,SAAS,KAAK,MAAM,GAAG;AAC7B,YAAI,OAAO,WAAW,KAAK,KAAK,QAAQ;AACtC,iBAAO,EAAE,MAAM,EAAE,GAAG,QAAQ,QAAQ,KAAK,GAAG,iBAAiB,CAAC,EAAE;AAAA,QAClE;AAAA,MACF,QAAQ;AAAA,MAER;AAAA,IACF;AAAA,EACF;AAEA,QAAM,UAAU,KAAK,IAAI;AACzB,QAAM,QAAQ,KAAK,iBAAiB,KAAK,KAAK,QAAQ,OAAO;AAC7D,QAAM,kBAA0C,CAAC;AACjD,QAAM,YAAoC;AAAA,IACxC,MAAM,MAAM,MAAM,SAAS;AACzB,YAAM,WAAW,KAAK,SAAS,IAAI;AACnC,cAAQ,UAAU,KAAK,UAAU,IAAI,CAAC;AACtC,cAAQ,MAAM,UAAU,OAAO;AAC/B,sBAAgB,GAAG,KAAK,KAAK,MAAM,IAAI,IAAI,EAAE,IAAI;AACjD,aAAO;AAAA,IACT;AAAA,IACA,MAAM,UAAU,MAAM,OAAO;AAC3B,aAAO,UAAU,MAAM,MAAM,KAAK,UAAU,OAAO,MAAM,CAAC,CAAC;AAAA,IAC7D;AAAA,EACF;AACA,MAAI,YAAY;AAChB,QAAM,cAAkC,EAAE,OAAO,GAAG,QAAQ,EAAE;AAC9D,QAAM,OAA0B;AAAA,IAC9B,QAAQ,QAAQ,QAAQ;AACtB,mBAAa;AACb,YAAM,KAAK,QAAQ,MAAM,IAAI,EAAE,WAAW,OAAO,CAAC,EAAE,IAAI;AAAA,IAC1D;AAAA,IACA,cAAc,OAAO;AACnB,kBAAY,SAAS,MAAM;AAC3B,kBAAY,UAAU,MAAM;AAC5B,UAAI,MAAM,OAAQ,aAAY,UAAU,YAAY,UAAU,KAAK,MAAM;AAAA,IAC3E;AAAA,IACA,UAAU;AACR,aAAO;AAAA,IACT;AAAA,IACA,SAAS;AACP,aAAO,EAAE,GAAG,YAAY;AAAA,IAC1B;AAAA,EACF;AAEA,QAAM,YAAY,KAAK,KAAK,gBAAgB;AAAA,IAC1C,UAAU,KAAK,KAAK;AAAA,IACpB,KAAK,KAAK,KAAK;AAAA,EACjB,CAAC;AAED,QAAM,MAAuB;AAAA,IAC3B,QAAQ,KAAK,KAAK;AAAA,IAClB,KAAK,KAAK,KAAK;AAAA,IACf,MAAM,KAAK,KAAK;AAAA,IAChB,QAAQ,KAAK;AAAA,IACb;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AAEA,MAAI;AACJ,MAAI;AACJ,MAAI;AACF,eAAW,MAAM,KAAK,KAAK,SAAS,KAAK,KAAK,UAAU,GAAG;AAAA,EAC7D,SAAS,KAAK;AACZ,mBAAe,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAAA,EAChE;AAKA,QAAM,cAA0C,CAAC;AACjD,MAAI,aAAa,QAAW;AAC1B,eAAW,SAAS,KAAK,KAAK,UAAU,CAAC,GAAG;AAC1C,UAAI,MAAM,aAAa,CAAC,MAAM,UAAU,KAAK,KAAK,QAAQ,EAAG;AAC7D,UAAI;AACF,oBAAY,MAAM,IAAI,IAAI,MAAM,aAAa,OAAO;AAAA,UAClD;AAAA,UACA,UAAU,KAAK,KAAK;AAAA,UACpB,QAAQ,KAAK;AAAA,QACf,CAAC;AAAA,MACH,SAAS,KAAK;AACZ,uBAAe,UAAU,MAAM,IAAI,aAAa,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAChG;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,QAAM,MAAM,MAAM;AAElB,QAAM,OAAsC;AAAA,IAC1C,QAAQ,KAAK,KAAK;AAAA,IAClB,YAAY,KAAK,KAAK,SAAS;AAAA,IAC/B,KAAK,KAAK,KAAK;AAAA,IACf,UAAW,YAAY;AAAA,IACvB;AAAA,IACA,SAAS;AAAA,IACT,YAAY,EAAE,GAAG,YAAY;AAAA,IAC7B,YAAY,KAAK,IAAI,IAAI;AAAA,IACzB,MAAM,KAAK,KAAK;AAAA,IAChB,QAAQ;AAAA,IACR,OAAO;AAAA,EACT;AAEA,MAAI,CAAC,gBAAgB,KAAK,WAAW;AACnC,YAAQ,MAAM,WAAW,KAAK,UAAU,IAAI,CAAC;AAAA,EAC/C;AAEA,SAAO,EAAE,MAAM,gBAAgB;AACjC;AAEA,eAAe,aACb,OACA,OACqB;AACrB,SAAO,MAAM,MAAM,KAAK;AAC1B;AAEA,SAAS,wBACP,SACsD;AACtD,SAAO,CAAC,QAAQ,QAAQ;AACtB,UAAM,QAAwC,CAAC;AAC/C,WAAO;AAAA,MACL,KAAK,MAAM,YAAY;AACrB,cAAM,UAAU,KAAK,IAAI;AACzB,cAAM,SAAkC,EAAE,MAAM,QAAQ,SAAS,GAAI,cAAc,CAAC,EAAG;AACvF,cAAM,SAAoB;AAAA,UACxB,IAAI,UAAU;AACZ,mBAAO,aAAa,KAAK,IAAI,IAAI;AACjC,gBAAI,SAAU,QAAO,OAAO,QAAQ,QAAQ;AAC5C,kBAAM,KAAK,MAAM;AAAA,UACnB;AAAA,UACA,aAAa,KAAK,OAAO;AACvB,mBAAO,GAAG,IAAI;AAAA,UAChB;AAAA,QACF;AACA,eAAO;AAAA,MACT;AAAA,MACA,MAAM,QAAQ;AACZ,gBAAQ,MAAM,KAAK,KAAK,aAAa,GAAG,MAAM,IAAI,CAAC,MAAM,KAAK,UAAU,CAAC,CAAC,EAAE,KAAK,IAAI,CAAC;AAAA,MACxF;AAAA,IACF;AAAA,EACF;AACF;AAEA,SAAS,YACP,MACA,QAC+B;AAC/B,SAAO;AAAA,IACL,QAAQ,KAAK;AAAA,IACb,YAAY,KAAK,SAAS;AAAA,IAC1B,KAAK,KAAK;AAAA,IACV,UAAU;AAAA,IACV,aAAa,CAAC;AAAA,IACd,SAAS;AAAA,IACT,YAAY,EAAE,OAAO,GAAG,QAAQ,EAAE;AAAA,IAClC,YAAY;AAAA,IACZ,MAAM,KAAK;AAAA,IACX,QAAQ;AAAA,IACR,OAAO,YAAY,MAAM;AAAA,EAC3B;AACF;AAUA,eAAe,eACb,MACe;AACf,QAAM,KAAK,MAAM,QAAQ;AAAA,IACvB,UAAU,KAAK;AAAA,IACf,UAAU,KAAK,KAAK;AAAA,IACpB,aAAa,KAAK,KAAK;AAAA,IACvB,QAAQ,KAAK,KAAK,iBAAiB;AAAA,IACnC,mBAAmB,KAAK,KAAK,4BAA4B;AAAA,IACzD,YAAY,KAAK,IAAI,EAAE,YAAY;AAAA,IACnC,iBAAiB;AAAA,EACnB,CAAC;AACH;AAIA,SAAS,oBAAoB,OAMlB;AACT,QAAM,YAAY;AAAA,IAChB,WAAW,MAAM,UAAU,IAAI,CAAC,OAAO,EAAE,IAAI,EAAE,IAAI,MAAM,EAAE,KAAK,EAAE;AAAA,IAClE,QAAQ,MAAM,OAAO,IAAI,CAAC,OAAO,EAAE,MAAM,EAAE,MAAM,MAAM,EAAE,WAAW,IAAI,CAAC,MAAM,EAAE,GAAG,EAAE,EAAE;AAAA,IACxF,UAAU,MAAM;AAAA,IAChB,MAAM,MAAM;AAAA,IACZ,MAAM,MAAM;AAAA,EACd;AACA,SAAO,WAAW,QAAQ,EAAE,OAAO,KAAK,UAAU,SAAS,CAAC,EAAE,OAAO,KAAK;AAC5E;AAEA,SAAS,kBACP,OACA,QACA,MACoB;AACpB,QAAM,UAA0C,CAAC;AACjD,aAAW,SAAS,QAAQ;AAC1B,UAAM,SAAmB,CAAC;AAC1B,eAAW,QAAQ,OAAO;AACxB,YAAM,IAAI,KAAK,YAAY,MAAM,IAAI;AACrC,UAAI,MAAM,OAAW,QAAO,KAAK,EAAE,SAAS;AAAA,IAC9C;AACA,YAAQ,MAAM,IAAI,IAAI,UAAU,QAAQ,IAAI;AAAA,EAC9C;AACA,QAAM,aAAgD,CAAC;AACvD,QAAM,iBAAiB,oBAAI,IAAsB;AACjD,aAAW,QAAQ,OAAO;AACxB,UAAM,aAAa,OAAO,OAAO,KAAK,WAAW,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS;AACzE,QAAI,WAAW,WAAW,EAAG;AAC7B,UAAM,OAAO,WAAW,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,WAAW;AAChE,UAAM,MAAM,eAAe,IAAI,KAAK,UAAU,KAAK,CAAC;AACpD,QAAI,KAAK,IAAI;AACb,mBAAe,IAAI,KAAK,YAAY,GAAG;AAAA,EACzC;AACA,aAAW,CAAC,YAAY,OAAO,KAAK,gBAAgB;AAClD,UAAM,KAAK,UAAU,SAAS,IAAI;AAClC,eAAW,UAAU,IAAI,EAAE,eAAe,GAAG,MAAM,MAAM,GAAG,MAAM,GAAG,GAAG,EAAE;AAAA,EAC5E;AACA,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,cAAc,MAAM,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,SAAS,CAAC;AAAA,IACrD,eAAe,MAAM,OAAO,CAAC,MAAM,CAAC,EAAE,KAAK,EAAE;AAAA,IAC7C,cAAc,MAAM,OAAO,CAAC,MAAM,EAAE,OAAO,WAAW,UAAU,CAAC,EAAE;AAAA,IACnE,aAAa,MAAM,OAAO,CAAC,MAAM,EAAE,MAAM,EAAE;AAAA,IAC3C,aAAa,MAAM,OAAO,CAAC,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,WAAW,UAAU,CAAC,EAAE;AAAA,EAC/E;AACF;AAKA,SAAS,UAAU,SAAmB,MAA8B;AAClE,QAAM,IAAI,QAAQ;AAClB,MAAI,MAAM,EAAG,QAAO,EAAE,MAAM,GAAG,OAAO,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,GAAG,EAAE;AAC5D,QAAM,OAAO,QAAQ,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI;AAClD,QAAM,WAAW,QAAQ,OAAO,CAAC,GAAG,MAAM,KAAK,IAAI,SAAS,GAAG,CAAC,IAAI,KAAK,IAAI,GAAG,IAAI,CAAC;AACrF,QAAM,QAAQ,KAAK,KAAK,QAAQ;AAChC,QAAM,KAAK,mBAAmB,SAAS,MAAM,EAAE,MAAM,WAAW,IAAK,CAAC;AACtE,SAAO,EAAE,MAAM,OAAO,MAAM,CAAC,GAAG,OAAO,GAAG,KAAK,GAAG,EAAE;AACtD;","names":[]}
@@ -0,0 +1,27 @@
1
+ import {
2
+ canonicalize
3
+ } from "./chunk-VSMTAMNK.js";
4
+ import {
5
+ ValidationError
6
+ } from "./chunk-3BFEG2F6.js";
7
+
8
+ // src/agent-profile.ts
9
+ import { createHash } from "crypto";
10
+ function agentProfileHash(profile) {
11
+ if (typeof profile.model !== "string" || profile.model.trim().length === 0) {
12
+ throw new ValidationError(`AgentProfile "${profile.id}" has no model \u2014 cannot hash`);
13
+ }
14
+ const behaviour = {
15
+ model: profile.model.trim(),
16
+ skills: [...profile.skills ?? []].sort(),
17
+ promptVersion: profile.promptVersion ?? null,
18
+ tools: [...profile.tools ?? []].sort(),
19
+ metadata: profile.metadata ?? {}
20
+ };
21
+ return createHash("sha256").update(JSON.stringify(canonicalize(behaviour))).digest("hex");
22
+ }
23
+
24
+ export {
25
+ agentProfileHash
26
+ };
27
+ //# sourceMappingURL=chunk-PQV2TKC3.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/agent-profile.ts"],"sourcesContent":["/**\n * @stable\n *\n * AgentProfile — the eval harness's unit of variation.\n *\n * A profile pins everything that changes agent behaviour for a benchmark\n * cell: the model, the active skills, the prompt version, the available\n * tools. Vary the profile — swap a model, add a skill — and re-run the suite\n * to benchmark the change. The scorecard keys a cell on\n * `(scenarioId, profileHash)`, so the model is not a separate axis: it lives\n * inside the profile, and two profiles with the same model but different\n * skills are different cells.\n *\n * `agentProfileHash` is the profile's behaviour identity. Two profiles that\n * produce the same agent behaviour share a hash (and a scorecard cell);\n * reordering `skills` or `tools` does not change it; the human-facing `id`\n * label does not affect it.\n */\n\nimport { createHash } from 'node:crypto'\nimport { ValidationError } from './errors'\nimport { canonicalize } from './pre-registration'\n\nexport interface AgentProfile {\n /** Human-facing label, e.g. `sonnet-legal-skills-v3`. Not part of the hash. */\n id: string\n /** Model snapshot id this profile pins, e.g. `claude-sonnet-4-6@2025-04-15`. */\n model: string\n /** Skill ids/versions active in this profile — the primary behaviour lever. */\n skills?: string[]\n /** Prompt version identifier. */\n promptVersion?: string\n /** Tool ids available to the agent. */\n tools?: string[]\n /** Any other behaviour-bearing knobs that should fingerprint into the hash. */\n metadata?: Record<string, string | number | boolean>\n}\n\n/**\n * Deterministic behaviour identity of a profile — a sha256 over the\n * behaviour-bearing fields. `skills` and `tools` are order-insensitive; the\n * `id` label is excluded. Throws on a profile with no `model` — an unkeyable\n * profile must fail loud rather than collapse into a blank-model cell.\n */\nexport function agentProfileHash(profile: AgentProfile): string {\n if (typeof profile.model !== 'string' || profile.model.trim().length === 0) {\n throw new ValidationError(`AgentProfile \"${profile.id}\" has no model — cannot hash`)\n }\n const behaviour = {\n model: profile.model.trim(),\n skills: [...(profile.skills ?? [])].sort(),\n promptVersion: profile.promptVersion ?? null,\n tools: [...(profile.tools ?? [])].sort(),\n metadata: profile.metadata ?? {},\n }\n return createHash('sha256')\n .update(JSON.stringify(canonicalize(behaviour)))\n .digest('hex')\n}\n"],"mappings":";;;;;;;;AAmBA,SAAS,kBAAkB;AAyBpB,SAAS,iBAAiB,SAA+B;AAC9D,MAAI,OAAO,QAAQ,UAAU,YAAY,QAAQ,MAAM,KAAK,EAAE,WAAW,GAAG;AAC1E,UAAM,IAAI,gBAAgB,iBAAiB,QAAQ,EAAE,mCAA8B;AAAA,EACrF;AACA,QAAM,YAAY;AAAA,IAChB,OAAO,QAAQ,MAAM,KAAK;AAAA,IAC1B,QAAQ,CAAC,GAAI,QAAQ,UAAU,CAAC,CAAE,EAAE,KAAK;AAAA,IACzC,eAAe,QAAQ,iBAAiB;AAAA,IACxC,OAAO,CAAC,GAAI,QAAQ,SAAS,CAAC,CAAE,EAAE,KAAK;AAAA,IACvC,UAAU,QAAQ,YAAY,CAAC;AAAA,EACjC;AACA,SAAO,WAAW,QAAQ,EACvB,OAAO,KAAK,UAAU,aAAa,SAAS,CAAC,CAAC,EAC9C,OAAO,KAAK;AACjB;","names":[]}
@@ -1,6 +1,6 @@
1
1
  import {
2
2
  CaptureIntegrityError
3
- } from "./chunk-QYJT52YW.js";
3
+ } from "./chunk-3BFEG2F6.js";
4
4
 
5
5
  // src/trace/integrity.ts
6
6
  var RunIntegrityError = class extends CaptureIntegrityError {
@@ -121,4 +121,4 @@ export {
121
121
  assertRunCaptured,
122
122
  throwIfRunIncomplete
123
123
  };
124
- //# sourceMappingURL=chunk-UBPIXOC4.js.map
124
+ //# sourceMappingURL=chunk-SBCB6VZY.js.map
@@ -1,19 +1,19 @@
1
1
  import {
2
2
  runCampaign
3
- } from "./chunk-NOPYCRNG.js";
3
+ } from "./chunk-OLULBECP.js";
4
4
  import {
5
5
  buildReflectionPrompt,
6
6
  parseReflectionResponse,
7
7
  runCanaries,
8
8
  scoreRedTeamOutput,
9
9
  summarizeBackendIntegrity
10
- } from "./chunk-GBHRUAOF.js";
10
+ } from "./chunk-GMXHLSLL.js";
11
11
  import {
12
12
  detectRewardHacking
13
13
  } from "./chunk-YV7J7X5N.js";
14
14
  import {
15
15
  callLlm
16
- } from "./chunk-VXNVVBZO.js";
16
+ } from "./chunk-IHDHUN2X.js";
17
17
 
18
18
  // src/campaign/auto-pr.ts
19
19
  import { execSync } from "child_process";
@@ -612,7 +612,7 @@ async function runImprovementLoop(opts) {
612
612
  throw new Error("runImprovementLoop: autoOnPromote='pr' requires ghOwner + ghRepo.");
613
613
  }
614
614
  const optimization = await runOptimization(opts);
615
- const { runCampaign: runCampaign2 } = await import("./run-campaign-5XENUKRF.js");
615
+ const { runCampaign: runCampaign2 } = await import("./run-campaign-HXPJAUZ3.js");
616
616
  const baselineOnHoldout = await runCampaign2({
617
617
  ...opts,
618
618
  scenarios: opts.holdoutScenarios,
@@ -923,4 +923,4 @@ export {
923
923
  provenanceSpansPath,
924
924
  emitLoopProvenance
925
925
  };
926
- //# sourceMappingURL=chunk-LBSXXH56.js.map
926
+ //# sourceMappingURL=chunk-SUGME4OT.js.map