@tangle-network/agent-eval 0.61.0 → 0.62.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +14 -8
- package/dist/adapters/http.d.ts +4 -1
- package/dist/adapters/langchain.d.ts +4 -1
- package/dist/adapters/otel.d.ts +4 -4
- package/dist/{agent-profile-9J9hxdm2.d.ts → agent-profile-DzcPHR1Z.d.ts} +1 -1
- package/dist/benchmarks/index.d.ts +2 -2
- package/dist/campaign/index.d.ts +8 -8
- package/dist/campaign/index.js +4 -3
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-OLULBECP.js → chunk-7TPYV2ER.js} +27 -1
- package/dist/chunk-7TPYV2ER.js.map +1 -0
- package/dist/{chunk-SUGME4OT.js → chunk-CV2BS2OV.js} +7 -5
- package/dist/chunk-CV2BS2OV.js.map +1 -0
- package/dist/chunk-E22YUOAL.js +111 -0
- package/dist/chunk-E22YUOAL.js.map +1 -0
- package/dist/{chunk-GMXHLSLL.js → chunk-SS2SOBBT.js} +1 -106
- package/dist/chunk-SS2SOBBT.js.map +1 -0
- package/dist/contract/index.d.ts +9 -9
- package/dist/contract/index.js +4 -3
- package/dist/contract/index.js.map +1 -1
- package/dist/{control-Bf8owbuG.d.ts → control-DxvZeV5X.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/hosted/index.d.ts +4 -4
- package/dist/{index-Bvk35ils.d.ts → index-DsnOpCO6.d.ts} +1 -1
- package/dist/{index-D9dwa00f.d.ts → index-DxfmYUjC.d.ts} +2 -2
- package/dist/index.d.ts +98 -14
- package/dist/index.js +324 -45
- package/dist/index.js.map +1 -1
- package/dist/meta-eval/index.d.ts +2 -2
- package/dist/multishot/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/dist/{provenance-D0WeCXt1.d.ts → provenance-CYBV9Ox6.d.ts} +14 -3
- package/dist/{registry-qmbYT3Eo.d.ts → registry-DPly4_hZ.d.ts} +1 -1
- package/dist/{release-report-DszkgvJ3.d.ts → release-report-DGoeObZT.d.ts} +2 -2
- package/dist/reporting.d.ts +4 -4
- package/dist/{researcher-BaVsy0sW.d.ts → researcher-WJvIpX3L.d.ts} +2 -2
- package/dist/rl.d.ts +6 -6
- package/dist/{rubric-predictive-validity-DgBHWsh7.d.ts → rubric-predictive-validity-D_4BSXGV.d.ts} +1 -1
- package/dist/{run-campaign-HXPJAUZ3.js → run-campaign-5J3ED2UJ.js} +3 -2
- package/dist/{run-record-DgUVo5pw.d.ts → run-record-BgTFzO2r.d.ts} +1 -1
- package/dist/{summary-report-BQvXpvaR.d.ts → summary-report-ByiOUrHj.d.ts} +1 -1
- package/dist/{types-Beb6KPqZ.d.ts → types-DH22o8hM.d.ts} +15 -11
- package/package.json +1 -1
- package/dist/chunk-GMXHLSLL.js.map +0 -1
- package/dist/chunk-OLULBECP.js.map +0 -1
- package/dist/chunk-SUGME4OT.js.map +0 -1
- /package/dist/{run-campaign-HXPJAUZ3.js.map → run-campaign-5J3ED2UJ.js.map} +0 -0
package/CHANGELOG.md
CHANGED
|
@@ -4,24 +4,30 @@ All notable changes to `@tangle-network/agent-eval` and its sibling `agent-eval-
|
|
|
4
4
|
|
|
5
5
|
---
|
|
6
6
|
|
|
7
|
-
## [0.
|
|
7
|
+
## [0.62.0] — 2026-05-30 — eval↔runtime boundary hardening (honest cost meter + per-cell stub guard)
|
|
8
|
+
|
|
9
|
+
From the agent-eval ↔ agent-runtime boundary critique. Builds on `runProfileMatrix` (0.61.0).
|
|
10
|
+
|
|
11
|
+
### Fixed
|
|
12
|
+
|
|
13
|
+
- **`CampaignCostMeter` docstring no longer lies.** It claimed "Substrate auto-tracks LLM costs via the cost-ledger backend hooks" — false (the meter mutates only on explicit `observe`/`observeTokens`), and it contradicted `observeTokens`' own doc. That doc was the root cause of consumers skipping `observeTokens`, getting `{0,0}` stub cells, and building `RunRecord`s on a side-channel. The doc now states plainly: nothing is captured automatically; the dispatch MUST report.
|
|
8
14
|
|
|
9
15
|
### Added
|
|
10
16
|
|
|
11
|
-
- **`
|
|
12
|
-
- **`ProfileMatrixError`** — thrown at preflight (before any LLM spend) when a profile's model lacks a snapshot version or the profile/scenario lists are empty.
|
|
17
|
+
- **`runCampaign({ expectUsage })`** — per-cell stub guard, the early/fine-grained sibling of batch `assertRealBackend`. A cell that produced an artifact but reported `costUsd === 0` AND zero tokens is a stub. Modes: `'warn'` (default, non-breaking), `'assert'` (throw `BackendIntegrityError` on the first stub cell), `'off'` (replay/offline). Errored/skipped cells and deterministic judge-only runs are not flagged.
|
|
13
18
|
|
|
14
|
-
###
|
|
19
|
+
### Changed
|
|
15
20
|
|
|
16
|
-
-
|
|
21
|
+
- **`CampaignTokenUsage` is now `type CampaignTokenUsage = RunTokenUsage`** (one source of truth; a field added to `RunTokenUsage` is a compile error here, not silent drift across the three hand-synced copies the audit found).
|
|
22
|
+
- **multishot aliases sandbox's `AgentProfile` → `SandboxAgentProfile`** so it no longer collides with the eval-harness `AgentProfile` the root exports.
|
|
17
23
|
|
|
18
|
-
###
|
|
24
|
+
### Boundary
|
|
19
25
|
|
|
20
|
-
|
|
26
|
+
- **`tests/boundary-integrity.test.ts`** — mechanically enforces the zero-upward-dependency rule (agent-eval must never import agent-runtime/agent-knowledge). The CLAUDE.md rule was prose-only; it is now a red build.
|
|
21
27
|
|
|
22
28
|
### Notes
|
|
23
29
|
|
|
24
|
-
Pure additive surface (
|
|
30
|
+
Pure additive/doc surface (`expectUsage` defaults to non-breaking `'warn'`). Full suite 1538/1538 green. Consumer-side: agent-runtime `loopDispatch` (0.32.0) turns the whole seam into one un-mis-wireable call.
|
|
25
31
|
|
|
26
32
|
---
|
|
27
33
|
|
package/dist/adapters/http.d.ts
CHANGED
|
@@ -1,4 +1,7 @@
|
|
|
1
|
-
import { S as Scenario, D as DispatchFn, b as DispatchContext } from '../types-
|
|
1
|
+
import { S as Scenario, D as DispatchFn, b as DispatchContext } from '../types-DH22o8hM.js';
|
|
2
|
+
import '../run-record-BgTFzO2r.js';
|
|
3
|
+
import '../errors-Dwqw-T_m.js';
|
|
4
|
+
import '../schema-m0gsnbt3.js';
|
|
2
5
|
|
|
3
6
|
/**
|
|
4
7
|
* # `@tangle-network/agent-eval/adapters/http` — distributed Dispatch over HTTP.
|
|
@@ -1,4 +1,7 @@
|
|
|
1
|
-
import { S as Scenario, J as JudgeScore, D as DispatchFn, a as JudgeConfig } from '../types-
|
|
1
|
+
import { S as Scenario, J as JudgeScore, D as DispatchFn, a as JudgeConfig } from '../types-DH22o8hM.js';
|
|
2
|
+
import '../run-record-BgTFzO2r.js';
|
|
3
|
+
import '../errors-Dwqw-T_m.js';
|
|
4
|
+
import '../schema-m0gsnbt3.js';
|
|
2
5
|
|
|
3
6
|
/**
|
|
4
7
|
* # `@tangle-network/agent-eval/adapters/langchain` — wrap any LangChain
|
package/dist/adapters/otel.d.ts
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
import { T as TraceSpanEvent, H as HostedClient } from '../index-
|
|
2
|
-
import '../types-
|
|
3
|
-
import '../
|
|
4
|
-
import '../run-record-DgUVo5pw.js';
|
|
1
|
+
import { T as TraceSpanEvent, H as HostedClient } from '../index-DxfmYUjC.js';
|
|
2
|
+
import '../types-DH22o8hM.js';
|
|
3
|
+
import '../run-record-BgTFzO2r.js';
|
|
5
4
|
import '../errors-Dwqw-T_m.js';
|
|
6
5
|
import '../schema-m0gsnbt3.js';
|
|
6
|
+
import '../summary-report-ByiOUrHj.js';
|
|
7
7
|
import '../failure-cluster-CL7IVgkJ.js';
|
|
8
8
|
import '../store-CKUAgsJz.js';
|
|
9
9
|
import '../judge-calibration-DilmB3Ml.js';
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as deterministicSplit, e as routing } from '../index-
|
|
2
|
-
import '../run-record-
|
|
1
|
+
export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as deterministicSplit, e as routing } from '../index-DsnOpCO6.js';
|
|
2
|
+
import '../run-record-BgTFzO2r.js';
|
|
3
3
|
import '../errors-Dwqw-T_m.js';
|
|
4
4
|
import '../schema-m0gsnbt3.js';
|
package/dist/campaign/index.d.ts
CHANGED
|
@@ -1,18 +1,18 @@
|
|
|
1
|
-
import { C as CampaignStorage } from '../provenance-
|
|
2
|
-
export { B as BuildLoopProvenanceArgs, D as DefaultProductionGateOptions, E as EmitLoopProvenanceArgs, a as EmitLoopProvenanceResult, b as EvolutionaryDriverOptions, G as GepaDriverConstraints, c as GepaDriverOptions, H as HeldOutGateOptions, L as LoopProvenanceBackend, d as LoopProvenanceCandidate, e as LoopProvenanceRecord, O as OpenAutoPrOptions, f as OpenAutoPrResult, R as RunCampaignOptions, g as RunEvalOptions, h as RunImprovementLoopOptions, i as RunImprovementLoopResult, j as RunOptimizationOptions, k as RunOptimizationResult, l as buildLoopProvenanceRecord, m as composeGate, n as countSentenceEdits, o as defaultProductionGate, p as defaultRenderDiff, q as emitLoopProvenance, r as evolutionaryDriver, s as extractH2Sections, t as fsCampaignStorage, u as gepaDriver, v as heldOutGate, w as inMemoryCampaignStorage, x as loopProvenanceSpans, y as openAutoPr, z as provenanceRecordPath, A as provenanceSpansPath, F as runCampaign, I as runEval, J as runImprovementLoop, K as runOptimization, M as surfaceContentHash, N as surfaceHash } from '../provenance-
|
|
3
|
-
import { L as LabeledScenarioStore, c as LabeledScenarioWrite, d as LabeledScenarioSampleArgs, e as LabeledScenarioRecord, f as LabelTrust, S as Scenario, b as DispatchContext, a as JudgeConfig, g as LabeledScenarioSource, C as CampaignResult, h as CodeSurface } from '../types-
|
|
4
|
-
export { i as CampaignAggregates, j as CampaignArtifactWriter, k as CampaignCellResult, l as CampaignCostMeter, m as CampaignTokenUsage, n as CampaignTraceWriter, D as DispatchFn, G as Gate, o as GateContext, p as GateDecision, q as GateResult, r as GenerationCandidate, s as GenerationRecord, I as ImprovementDriver, t as JudgeAggregate, u as JudgeDimension, J as JudgeScore, M as MutableSurface, v as Mutator, O as OptimizerConfig, P as ProposeContext, w as ProposedCandidate, R as RedactionStatus, x as ScenarioAggregate, y as SessionScript, T as TraceSpan, z as isProposedCandidate, A as labelTrustRank } from '../types-
|
|
5
|
-
import { A as AgentProfile, B as BackendIntegrityReport } from '../agent-profile-
|
|
1
|
+
import { C as CampaignStorage } from '../provenance-CYBV9Ox6.js';
|
|
2
|
+
export { B as BuildLoopProvenanceArgs, D as DefaultProductionGateOptions, E as EmitLoopProvenanceArgs, a as EmitLoopProvenanceResult, b as EvolutionaryDriverOptions, G as GepaDriverConstraints, c as GepaDriverOptions, H as HeldOutGateOptions, L as LoopProvenanceBackend, d as LoopProvenanceCandidate, e as LoopProvenanceRecord, O as OpenAutoPrOptions, f as OpenAutoPrResult, R as RunCampaignOptions, g as RunEvalOptions, h as RunImprovementLoopOptions, i as RunImprovementLoopResult, j as RunOptimizationOptions, k as RunOptimizationResult, l as buildLoopProvenanceRecord, m as composeGate, n as countSentenceEdits, o as defaultProductionGate, p as defaultRenderDiff, q as emitLoopProvenance, r as evolutionaryDriver, s as extractH2Sections, t as fsCampaignStorage, u as gepaDriver, v as heldOutGate, w as inMemoryCampaignStorage, x as loopProvenanceSpans, y as openAutoPr, z as provenanceRecordPath, A as provenanceSpansPath, F as runCampaign, I as runEval, J as runImprovementLoop, K as runOptimization, M as surfaceContentHash, N as surfaceHash } from '../provenance-CYBV9Ox6.js';
|
|
3
|
+
import { L as LabeledScenarioStore, c as LabeledScenarioWrite, d as LabeledScenarioSampleArgs, e as LabeledScenarioRecord, f as LabelTrust, S as Scenario, b as DispatchContext, a as JudgeConfig, g as LabeledScenarioSource, C as CampaignResult, h as CodeSurface } from '../types-DH22o8hM.js';
|
|
4
|
+
export { i as CampaignAggregates, j as CampaignArtifactWriter, k as CampaignCellResult, l as CampaignCostMeter, m as CampaignTokenUsage, n as CampaignTraceWriter, D as DispatchFn, G as Gate, o as GateContext, p as GateDecision, q as GateResult, r as GenerationCandidate, s as GenerationRecord, I as ImprovementDriver, t as JudgeAggregate, u as JudgeDimension, J as JudgeScore, M as MutableSurface, v as Mutator, O as OptimizerConfig, P as ProposeContext, w as ProposedCandidate, R as RedactionStatus, x as ScenarioAggregate, y as SessionScript, T as TraceSpan, z as isProposedCandidate, A as labelTrustRank } from '../types-DH22o8hM.js';
|
|
5
|
+
import { A as AgentProfile, B as BackendIntegrityReport } from '../agent-profile-DzcPHR1Z.js';
|
|
6
6
|
import { A as AgentEvalError } from '../errors-Dwqw-T_m.js';
|
|
7
|
-
import {
|
|
7
|
+
import { b as RunSplitTag, R as RunRecord } from '../run-record-BgTFzO2r.js';
|
|
8
8
|
import '../llm-client-DbjLfz-K.js';
|
|
9
9
|
import '../raw-provider-sink-C46HDghv.js';
|
|
10
10
|
import '../red-team-DW9Ca_tj.js';
|
|
11
11
|
import '../dataset-B2kL-fSM.js';
|
|
12
12
|
import '../store-CKUAgsJz.js';
|
|
13
13
|
import '../schema-m0gsnbt3.js';
|
|
14
|
-
import '../index-
|
|
15
|
-
import '../summary-report-
|
|
14
|
+
import '../index-DxfmYUjC.js';
|
|
15
|
+
import '../summary-report-ByiOUrHj.js';
|
|
16
16
|
import '../failure-cluster-CL7IVgkJ.js';
|
|
17
17
|
import '../judge-calibration-DilmB3Ml.js';
|
|
18
18
|
|
package/dist/campaign/index.js
CHANGED
|
@@ -20,19 +20,20 @@ import {
|
|
|
20
20
|
runOptimization,
|
|
21
21
|
surfaceContentHash,
|
|
22
22
|
surfaceHash
|
|
23
|
-
} from "../chunk-
|
|
23
|
+
} from "../chunk-CV2BS2OV.js";
|
|
24
24
|
import {
|
|
25
25
|
fsCampaignStorage,
|
|
26
26
|
inMemoryCampaignStorage,
|
|
27
27
|
runCampaign
|
|
28
|
-
} from "../chunk-
|
|
28
|
+
} from "../chunk-7TPYV2ER.js";
|
|
29
29
|
import {
|
|
30
30
|
agentProfileHash
|
|
31
31
|
} from "../chunk-PQV2TKC3.js";
|
|
32
|
+
import "../chunk-SS2SOBBT.js";
|
|
32
33
|
import {
|
|
33
34
|
assertRealBackend,
|
|
34
35
|
summarizeBackendIntegrity
|
|
35
|
-
} from "../chunk-
|
|
36
|
+
} from "../chunk-E22YUOAL.js";
|
|
36
37
|
import "../chunk-YV7J7X5N.js";
|
|
37
38
|
import {
|
|
38
39
|
validateRunRecord
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/campaign/labeled-store/fs-adapter.ts","../../src/campaign/presets/run-profile-matrix.ts","../../src/campaign/worktree/index.ts"],"sourcesContent":["/**\n * @experimental\n *\n * Filesystem `LabeledScenarioStore` adapter. The default capture sink for\n * traces + eval artifacts. Production deployments typically swap for a\n * Turso/SQLite adapter (same interface).\n *\n * Records land as one JSONL file per source under `<root>/<source>.jsonl`.\n * Each line is a `LabeledScenarioRecord`. Append-only — no in-place edits.\n *\n * Safety properties enforced at write-time:\n *\n * - **Provenance required**: writes without `source`, `sourceVersionHash`,\n * `capturedAt`, `redactionStatus` are rejected. Closes the alignment\n * reviewer's data-poisoning gap.\n * - **Per-source rate limits**: optional `rateLimitBucket` + `maxWritesPerMinute`\n * stops a single tenant/source from flooding the store.\n *\n * Safety properties enforced at sample-time:\n *\n * - **Required split + capturedBefore**: substrate refuses to sample without\n * an explicit `split` ('train' | 'test') AND a temporal cutoff. Eliminates\n * accidental train/test contamination.\n * - **Default training-source filter**: when the store is sampled with\n * `split: 'train'`, production-trace records are EXCLUDED unless the\n * caller passes `filter.source: 'production-trace'` explicitly. Closes\n * the contamination-by-default gap flagged by the senior eval engineer.\n */\n\nimport { createHash } from 'node:crypto'\nimport { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs'\nimport { join } from 'node:path'\nimport type {\n LabeledScenarioRecord,\n LabeledScenarioSampleArgs,\n LabeledScenarioSource,\n LabeledScenarioStore,\n LabeledScenarioWrite,\n LabelTrust,\n} from '../types'\nimport { labelTrustRank } from '../types'\n\nexport interface FsLabeledScenarioStoreOptions {\n /** Root directory for JSONL files. Created if missing. 
*/\n root: string\n /** Per-source rate limit. When set, writes exceeding the cap are rejected\n * with a typed error. Default: no limit. */\n maxWritesPerMinutePerBucket?: number\n /** Test seam — override `Date.now()` for deterministic tests. */\n now?: () => number\n}\n\nexport class LabeledScenarioStoreError extends Error {\n constructor(\n public readonly code: string,\n message: string,\n ) {\n super(message)\n this.name = 'LabeledScenarioStoreError'\n }\n}\n\ninterface RateLimitState {\n bucket: string\n windowStartMs: number\n count: number\n}\n\nexport class FsLabeledScenarioStore implements LabeledScenarioStore {\n private readonly now: () => number\n private readonly rateLimits = new Map<string, RateLimitState>()\n\n constructor(private readonly options: FsLabeledScenarioStoreOptions) {\n if (!existsSync(options.root)) mkdirSync(options.root, { recursive: true })\n this.now = options.now ?? Date.now\n }\n\n async observe(write: LabeledScenarioWrite): Promise<void> {\n this.assertProvenance(write)\n this.assertRateLimit(write)\n const record = this.toRecord(write)\n const path = this.pathForSource(write.source)\n const line = `${JSON.stringify(record)}\\n`\n // Append atomically. 
For high-throughput a writev-friendly buffered\n // implementation lands in the Turso adapter; FS adapter is for tests +\n // local dev + small workloads.\n appendLine(path, line)\n }\n\n async sample(args: LabeledScenarioSampleArgs): Promise<LabeledScenarioRecord[]> {\n if (!args.split) {\n throw new LabeledScenarioStoreError(\n 'split_required',\n 'sample() requires an explicit `split` (train | test) — substrate refuses ambiguous reads',\n )\n }\n if (!args.capturedBefore) {\n throw new LabeledScenarioStoreError(\n 'capturedBefore_required',\n 'sample() requires an explicit `capturedBefore` timestamp for temporal-split discipline',\n )\n }\n\n const all: LabeledScenarioRecord[] = []\n for (const source of ALL_SOURCES) {\n // Default training-source filter: when sampling train, EXCLUDE\n // production-trace records unless the caller asks for them.\n if (args.split === 'train' && source === 'production-trace') {\n const explicit = sourceFilterContains(args.filter?.source, 'production-trace')\n if (!explicit) continue\n }\n const path = this.pathForSource(source)\n if (!existsSync(path)) continue\n const lines = readFileSync(path, 'utf8').split('\\n').filter(Boolean)\n for (const line of lines) {\n let record: LabeledScenarioRecord\n try {\n record = JSON.parse(line) as LabeledScenarioRecord\n } catch {\n continue\n }\n if (!matchesFilter(record, args, source)) continue\n all.push(record)\n }\n }\n\n // Deterministic order: by capturedAt ascending, then recordHash.\n all.sort((a, b) => {\n if (a.capturedAt !== b.capturedAt) return a.capturedAt.localeCompare(b.capturedAt)\n return a.recordHash.localeCompare(b.recordHash)\n })\n\n return all.slice(0, args.count)\n }\n\n async size(): Promise<{\n train: number\n test: number\n bySource: Record<string, number>\n byTrust: Record<LabelTrust, number>\n }> {\n const bySource: Record<string, number> = {}\n const byTrust: Record<LabelTrust, number> = {\n unverified: 0,\n 'verified-signal': 0,\n 'human-rated': 0,\n }\n let 
total = 0\n for (const source of ALL_SOURCES) {\n const path = this.pathForSource(source)\n if (!existsSync(path)) {\n bySource[source] = 0\n continue\n }\n const lines = readFileSync(path, 'utf8').split('\\n').filter(Boolean)\n bySource[source] = lines.length\n total += lines.length\n for (const line of lines) {\n let trust: LabelTrust = 'unverified'\n try {\n trust = (JSON.parse(line) as LabeledScenarioRecord).labelTrust ?? 'unverified'\n } catch {\n // A malformed line counts as unverified — never silently gold.\n }\n byTrust[trust] += 1\n }\n }\n // FS adapter doesn't track split assignments per-record (split is\n // computed at sample-time based on `capturedBefore`). For size(), we\n // report `train`+`test` as the same total — split is a sampling concept.\n return { train: total, test: total, bySource, byTrust }\n }\n\n private assertProvenance(write: LabeledScenarioWrite): void {\n if (!write.source) {\n throw new LabeledScenarioStoreError(\n 'missing_source',\n 'LabeledScenarioWrite requires `source`',\n )\n }\n if (!write.sourceVersionHash || write.sourceVersionHash.length === 0) {\n throw new LabeledScenarioStoreError(\n 'missing_source_version',\n 'LabeledScenarioWrite requires `sourceVersionHash` (git sha or substrate version)',\n )\n }\n if (!write.capturedAt) {\n throw new LabeledScenarioStoreError(\n 'missing_captured_at',\n 'LabeledScenarioWrite requires `capturedAt` ISO timestamp',\n )\n }\n if (!write.redactionStatus) {\n throw new LabeledScenarioStoreError(\n 'missing_redaction_status',\n 'LabeledScenarioWrite requires explicit `redactionStatus` — raw / redacted-pii / redacted-secrets / fully-redacted',\n )\n }\n if (!ALL_SOURCES.includes(write.source)) {\n throw new LabeledScenarioStoreError(\n 'unknown_source',\n `LabeledScenarioWrite.source must be one of: ${ALL_SOURCES.join(', ')}`,\n )\n }\n }\n\n private assertRateLimit(write: LabeledScenarioWrite): void {\n const cap = this.options.maxWritesPerMinutePerBucket\n if (!cap || 
!write.rateLimitBucket) return\n const now = this.now()\n const windowMs = 60_000\n let state = this.rateLimits.get(write.rateLimitBucket)\n if (!state || now - state.windowStartMs >= windowMs) {\n state = { bucket: write.rateLimitBucket, windowStartMs: now, count: 0 }\n this.rateLimits.set(write.rateLimitBucket, state)\n }\n if (state.count >= cap) {\n throw new LabeledScenarioStoreError(\n 'rate_limit_exceeded',\n `LabeledScenarioStore: bucket ${write.rateLimitBucket} exceeded ${cap} writes/min`,\n )\n }\n state.count += 1\n }\n\n private toRecord(write: LabeledScenarioWrite): LabeledScenarioRecord {\n const recordHash = sha256(\n JSON.stringify({\n id: write.scenario.id,\n src: write.source,\n at: write.capturedAt,\n ver: write.sourceVersionHash,\n }),\n )\n // FS adapter assigns split at sample-time, but we cache a hint here\n // based on capturedAt vs the world's \"now\" — sampler overrides this.\n return {\n ...write,\n recordHash,\n split: 'train',\n }\n }\n\n private pathForSource(source: string): string {\n return join(this.options.root, `${source}.jsonl`)\n }\n}\n\nconst ALL_SOURCES: LabeledScenarioWrite['source'][] = [\n 'production-trace',\n 'eval-run',\n 'manual',\n 'red-team',\n 'synthetic',\n]\n\nfunction sourceFilterContains(\n filter: LabeledScenarioSource | LabeledScenarioSource[] | undefined,\n needle: LabeledScenarioSource,\n): boolean {\n if (!filter) return false\n if (Array.isArray(filter)) return filter.includes(needle)\n return filter === needle\n}\n\nfunction matchesFilter(\n record: LabeledScenarioRecord,\n args: LabeledScenarioSampleArgs,\n source: string,\n): boolean {\n // Temporal cutoff — train must be capturedAt < capturedBefore.\n if (args.split === 'train' && record.capturedAt >= args.capturedBefore) return false\n if (args.split === 'test' && record.capturedAt < args.capturedBefore) return false\n\n const f = args.filter\n if (!f) return true\n if (f.kind && record.scenario.kind !== f.kind) return false\n if (f.source) {\n const 
sources = Array.isArray(f.source) ? f.source : [f.source]\n if (!sources.includes(source as never)) return false\n }\n if (f.minComposite !== undefined || f.maxComposite !== undefined) {\n const composites = Object.values(record.judgeScores).map((s) => s.composite)\n const max = composites.length === 0 ? 0 : Math.max(...composites)\n if (f.minComposite !== undefined && max < f.minComposite) return false\n if (f.maxComposite !== undefined && max > f.maxComposite) return false\n }\n if (f.minTrust !== undefined && labelTrustRank(record.labelTrust) < labelTrustRank(f.minTrust)) {\n return false\n }\n return true\n}\n\nfunction sha256(input: string): string {\n return createHash('sha256').update(input).digest('hex').slice(0, 16)\n}\n\nfunction appendLine(path: string, line: string): void {\n if (existsSync(path)) {\n const existing = readFileSync(path, 'utf8')\n writeFileSync(path, existing + line)\n } else {\n writeFileSync(path, line)\n }\n}\n","/**\n * @experimental\n *\n * `runProfileMatrix` — the missing keystone between `runAgentMatrix` and the\n * backend-integrity guard.\n *\n * The gap it closes: `runAgentMatrix` is a topology-opaque scheduler whose\n * cells return a bare `{ output, verdict, costUsd }` — no `tokenUsage`, not a\n * `RunRecord`. `assertRealBackend` / `summarizeBackendIntegrity` key on\n * `RunRecord.tokenUsage`, so they cannot run on a raw matrix result. Every\n * consumer therefore hand-writes the same bridge: fan a profile × scenario\n * cartesian, call dispatch, fabricate a `RunRecord` with token usage, thread it\n * back, run the integrity guard. 
That hand-rolled bridge is exactly the pile of\n * bespoke `eval:*` scripts the adoption skills keep trying (and failing) to\n * forbid.\n *\n * `runProfileMatrix` IS that bridge, once:\n *\n * - axis 3 (PROFILE) = `profiles: AgentProfile[]`\n * - axis 1 (PERSONA/SCENARIO) = `scenarios: Scenario[]` (each scenario carries\n * its persona; `personaOf` groups them for the `byPersona` pivot)\n * - the scoring axis = `judges`\n *\n * It runs `runCampaign` once per profile (reusing its seeds, reps, bootstrap\n * CIs, resumability, and the `LabeledScenarioStore` capture flywheel), maps\n * every cell to a validated `RunRecord` carrying the real `tokenUsage` the\n * dispatch reported via `ctx.cost.observeTokens`, and runs `assertRealBackend`\n * BY CONSTRUCTION before returning — so a stub-backend run fails loudly instead\n * of reporting a clean 0/N leaderboard.\n *\n * Dispatch contract: a dispatch that calls an LLM MUST report usage via\n * `ctx.cost.observeTokens({ input, output })` (and cost via `ctx.cost.observe`).\n * A dispatch that reports zero tokens is indistinguishable from a stub and the\n * integrity guard treats it as one.\n */\n\nimport { createHash } from 'node:crypto'\nimport { join } from 'node:path'\nimport { type AgentProfile, agentProfileHash } from '../../agent-profile'\nimport { AgentEvalError } from '../../errors'\nimport {\n assertRealBackend,\n type BackendIntegrityReport,\n summarizeBackendIntegrity,\n} from '../../integrity/backend-integrity'\nimport {\n type RunOutcome,\n type RunRecord,\n type RunSplitTag,\n validateRunRecord,\n} from '../../run-record'\nimport { runCampaign } from '../run-campaign'\nimport type { CampaignStorage } from '../storage'\nimport type {\n CampaignCellResult,\n CampaignResult,\n DispatchContext,\n JudgeConfig,\n LabeledScenarioSource,\n LabeledScenarioStore,\n Scenario,\n} from '../types'\n\n/** Thrown when the matrix is misconfigured (no profiles, a profile whose model\n * lacks a snapshot version, etc.). 
Distinct from `BackendIntegrityError`,\n * which signals a stub backend at run time. */\nexport class ProfileMatrixError extends AgentEvalError {\n constructor(message: string) {\n super('profile_matrix', message)\n }\n}\n\n/** Dispatch for one cell: render `profile` against `scenario`, returning the\n * artifact the judges score. Report LLM usage via `ctx.cost.observeTokens`\n * and `ctx.cost.observe` — the integrity guard depends on it. */\nexport type ProfileDispatchFn<TScenario extends Scenario, TArtifact> = (\n profile: AgentProfile,\n scenario: TScenario,\n ctx: DispatchContext,\n) => Promise<TArtifact>\n\nexport interface RunProfileMatrixOptions<TScenario extends Scenario, TArtifact> {\n /** Axis 3 — the agent-under-test configurations. Each is one column. */\n profiles: AgentProfile[]\n /** Axis 1 — the persona/scenario corpus, run against every profile. */\n scenarios: TScenario[]\n /** Renders one (profile, scenario) cell. */\n dispatch: ProfileDispatchFn<TScenario, TArtifact>\n /** The scoring axis. */\n judges?: JudgeConfig<TArtifact, TScenario>[]\n /** Where each profile's campaign writes artifacts/traces. One subdir per\n * profile. */\n runDir: string\n /** Git SHA the harness ran from — stamped onto every RunRecord (mandatory\n * for paper-grade records). */\n commitSha: string\n /** Logical experiment id shared across the whole matrix so the promotion\n * gate can pair profiles on matched scenarios. Default: a hash of the\n * profile + scenario ids. */\n experimentId?: string\n /** Which split these runs belong to. Default `'search'`. */\n splitTag?: RunSplitTag\n /** Replicates per (profile, scenario) cell for CI bands. Default 1. */\n reps?: number\n /** Campaign seed (per profile). Default 42. 
*/\n seed?: number\n /**\n * Backend-integrity posture, enforced AFTER the matrix completes:\n * - `'assert'` (default) — throw `BackendIntegrityError` if the run was a\n * stub (and, with `allowMixed:false`, if it was mixed).\n * - `'warn'` — log the verdict but never throw.\n * - `'off'` — skip the guard entirely (only for offline/replay analysis).\n */\n integrity?: 'assert' | 'warn' | 'off'\n /** Forwarded to `assertRealBackend`. Default true (tolerate partial 429\n * cascades); set false for strict CI gates. */\n allowMixed?: boolean\n /** Max concurrent cells WITHIN each profile's campaign. Default 2.\n * Profiles run sequentially so the cost ceiling is honored deterministically. */\n maxConcurrency?: number\n /** Cumulative USD cap per profile campaign. */\n costCeiling?: number\n /** Capture flywheel — forwarded to each campaign. */\n labeledStore?: LabeledScenarioStore | 'off'\n captureSource?: LabeledScenarioSource\n /** Storage backend. Default `fsCampaignStorage`. Pass\n * `inMemoryCampaignStorage()` for edge/CF-Worker/test runs. */\n storage?: CampaignStorage\n /** Test seam — override the wall clock. */\n now?: () => Date\n /** Optional persona key per scenario — drives the `byPersona` pivot. When\n * unset, `byPersona` is omitted. */\n personaOf?: (scenario: TScenario) => string\n /** Validate every produced RunRecord with `validateRunRecord` (fail-loud).\n * Default true — catches bad model snapshots and non-finite judge dims at\n * the boundary instead of letting them poison downstream analysis. */\n validate?: boolean\n}\n\nexport interface ProfileSummary {\n profileId: string\n profileHash: string\n model: string\n /** RunRecords produced for this profile (= scenarios × reps). */\n records: number\n /** Mean composite across this profile's records. */\n meanComposite: number\n totalCostUsd: number\n /** Per-profile integrity verdict — surfaces a single profile that ran stub\n * even when the matrix as a whole looks real. 
*/\n integrity: BackendIntegrityReport\n}\n\nexport interface ScenarioRollup {\n meanComposite: number\n n: number\n}\n\nexport interface RunProfileMatrixResult<TArtifact, TScenario extends Scenario> {\n matrixId: string\n experimentId: string\n /** One RunRecord per (profile, scenario, rep) cell — the integrity-checked,\n * paper-grade output. Feed straight into `analyzeRuns`, `HeldOutGate`,\n * scorecards, the hosted wire format. */\n records: RunRecord[]\n byProfile: Record<string, ProfileSummary>\n byScenario: Record<string, ScenarioRollup>\n /** Present only when `personaOf` was supplied. */\n byPersona?: Record<string, ScenarioRollup>\n /** Whole-matrix integrity report (the one `integrity:'assert'` enforces). */\n integrity: BackendIntegrityReport\n /** The raw per-profile campaign results, keyed by profile id. */\n campaigns: Record<string, CampaignResult<TArtifact, TScenario>>\n}\n\nfunction sanitize(id: string): string {\n return id.replace(/[^a-zA-Z0-9_-]/g, '_')\n}\n\nfunction sha(input: unknown): string {\n return createHash('sha256').update(JSON.stringify(input)).digest('hex')\n}\n\nfunction mean(xs: number[]): number {\n return xs.length === 0 ? 0 : xs.reduce((a, b) => a + b, 0) / xs.length\n}\n\nfunction cellComposite(cell: CampaignCellResult<unknown>): number {\n const composites = Object.values(cell.judgeScores).map((s) => s.composite)\n return composites.length === 0 ? 
0 : mean(composites)\n}\n\ninterface BuildRecordArgs<TArtifact> {\n cell: CampaignCellResult<TArtifact>\n profile: AgentProfile\n profileHash: string\n configHash: string\n experimentId: string\n splitTag: RunSplitTag\n commitSha: string\n matrixId: string\n}\n\nfunction buildRunRecord<TArtifact>(args: BuildRecordArgs<TArtifact>): RunRecord {\n const { cell, profile, profileHash, configHash, experimentId, splitTag, commitSha, matrixId } =\n args\n const composite = cellComposite(cell)\n\n // Flatten judge dimensions (judge-prefixed to avoid collisions) into raw.\n const raw: Record<string, number> = { composite }\n const perJudge: Record<string, Record<string, number>> = {}\n const dimAccum: Record<string, number[]> = {}\n const notes: string[] = []\n for (const [judgeName, js] of Object.entries(cell.judgeScores)) {\n perJudge[judgeName] = { ...js.dimensions }\n for (const [dim, value] of Object.entries(js.dimensions)) {\n raw[`${judgeName}.${dim}`] = value\n ;(dimAccum[dim] ??= []).push(value)\n }\n if (js.notes) notes.push(`${judgeName}: ${js.notes}`)\n }\n const perDimMean: Record<string, number> = {}\n for (const [dim, values] of Object.entries(dimAccum)) perDimMean[dim] = mean(values)\n\n const outcome: RunOutcome =\n splitTag === 'holdout' ? { holdoutScore: composite, raw } : { searchScore: composite, raw }\n if (Object.keys(perJudge).length > 0) {\n outcome.judgeScores = {\n perJudge,\n perDimMean,\n composite,\n ...(notes.length > 0 ? { notes: notes.join(' | ') } : {}),\n }\n }\n\n return {\n runId: `${matrixId}:${profile.id}:${cell.cellId}`,\n experimentId,\n candidateId: profile.id,\n seed: cell.seed,\n model: profile.model,\n promptHash: profileHash,\n configHash,\n commitSha,\n wallMs: cell.durationMs,\n costUsd: cell.costUsd,\n tokenUsage: cell.tokenUsage,\n outcome,\n splitTag,\n scenarioId: cell.scenarioId,\n ...(cell.error ? 
{ failureMode: cell.error } : {}),\n }\n}\n\nexport async function runProfileMatrix<TScenario extends Scenario, TArtifact>(\n opts: RunProfileMatrixOptions<TScenario, TArtifact>,\n): Promise<RunProfileMatrixResult<TArtifact, TScenario>> {\n if (opts.profiles.length === 0) throw new ProfileMatrixError('profiles must not be empty')\n if (opts.scenarios.length === 0) throw new ProfileMatrixError('scenarios must not be empty')\n\n const splitTag = opts.splitTag ?? 'search'\n const seed = opts.seed ?? 42\n const validate = opts.validate ?? true\n const integrityMode = opts.integrity ?? 'assert'\n const profileIds = opts.profiles.map((p) => p.id)\n const experimentId =\n opts.experimentId ??\n `pm_${sha({ profileIds, scenarios: opts.scenarios.map((s) => s.id) }).slice(0, 16)}`\n const matrixId = `mtx_${sha({ experimentId, profileIds, seed, splitTag }).slice(0, 16)}`\n\n // Preflight: every profile must hash (non-empty model) AND its model must\n // carry a snapshot version, BEFORE any LLM spend. A probe record run through\n // validateRunRecord catches both in the exact place they'd otherwise surface\n // far downstream.\n for (const profile of opts.profiles) {\n const profileHash = agentProfileHash(profile)\n try {\n validateRunRecord({\n runId: `${matrixId}:${profile.id}:probe`,\n experimentId,\n candidateId: profile.id,\n seed,\n model: profile.model,\n promptHash: profileHash,\n configHash: profileHash,\n commitSha: opts.commitSha,\n wallMs: 0,\n costUsd: 0,\n tokenUsage: { input: 0, output: 0 },\n outcome:\n splitTag === 'holdout' ? { holdoutScore: 0, raw: {} } : { searchScore: 0, raw: {} },\n splitTag,\n })\n } catch (err) {\n throw new ProfileMatrixError(\n `profile '${profile.id}' is not recordable: ${err instanceof Error ? 
err.message : String(err)}`,\n )\n }\n }\n\n const records: RunRecord[] = []\n const campaigns: Record<string, CampaignResult<TArtifact, TScenario>> = {}\n const byProfile: Record<string, ProfileSummary> = {}\n\n for (const profile of opts.profiles) {\n const profileHash = agentProfileHash(profile)\n const configHash = sha({\n profile: profileHash,\n judges: (opts.judges ?? []).map((j) => j.name),\n seed,\n splitTag,\n })\n\n // Bind the profile into a campaign dispatch. Name it so the campaign's\n // manifest hash is stable + distinct per profile.\n const dispatch = (scenario: TScenario, ctx: DispatchContext): Promise<TArtifact> =>\n opts.dispatch(profile, scenario, ctx)\n Object.defineProperty(dispatch, 'name', { value: `profile_${sanitize(profile.id)}` })\n\n const campaign = await runCampaign<TScenario, TArtifact>({\n scenarios: opts.scenarios,\n dispatch,\n judges: opts.judges,\n seed,\n reps: opts.reps,\n maxConcurrency: opts.maxConcurrency,\n costCeiling: opts.costCeiling,\n labeledStore: opts.labeledStore,\n captureSource: opts.captureSource,\n storage: opts.storage,\n now: opts.now,\n runDir: join(opts.runDir, sanitize(profile.id)),\n })\n campaigns[profile.id] = campaign\n\n const profileRecords: RunRecord[] = []\n for (const cell of campaign.cells) {\n const record = buildRunRecord({\n cell,\n profile,\n profileHash,\n configHash,\n experimentId,\n splitTag,\n commitSha: opts.commitSha,\n matrixId,\n })\n if (validate) validateRunRecord(record)\n profileRecords.push(record)\n records.push(record)\n }\n\n byProfile[profile.id] = {\n profileId: profile.id,\n profileHash,\n model: profile.model,\n records: profileRecords.length,\n meanComposite: mean(profileRecords.map(compositeOf)),\n totalCostUsd: profileRecords.reduce((a, r) => a + r.costUsd, 0),\n integrity: summarizeBackendIntegrity(profileRecords),\n }\n }\n\n // Integrity by construction — the whole point of the primitive.\n const integrity = summarizeBackendIntegrity(records)\n if (integrityMode === 
'assert') {\n assertRealBackend(records, { allowMixed: opts.allowMixed ?? true })\n } else if (integrityMode === 'warn' && integrity.verdict !== 'real') {\n // eslint-disable-next-line no-console\n console.warn(\n `[runProfileMatrix] backend integrity: ${integrity.verdict} — ${integrity.diagnosis}`,\n )\n }\n\n // Pivots.\n const byScenario = rollup(records, (r) => r.scenarioId)\n const byPersona = opts.personaOf\n ? rollupByPersona(records, opts.scenarios, opts.personaOf)\n : undefined\n\n return { matrixId, experimentId, records, byProfile, byScenario, byPersona, integrity, campaigns }\n}\n\n/** Composite for a produced RunRecord (the split score it carries). */\nfunction compositeOf(r: RunRecord): number {\n return r.outcome.holdoutScore ?? r.outcome.searchScore ?? 0\n}\n\nfunction rollup(\n records: RunRecord[],\n keyOf: (r: RunRecord) => string | undefined,\n): Record<string, ScenarioRollup> {\n const groups = new Map<string, number[]>()\n for (const r of records) {\n const key = keyOf(r)\n if (key === undefined) continue\n const arr = groups.get(key) ?? []\n arr.push(compositeOf(r))\n groups.set(key, arr)\n }\n const out: Record<string, ScenarioRollup> = {}\n for (const [key, xs] of groups) out[key] = { meanComposite: mean(xs), n: xs.length }\n return out\n}\n\nfunction rollupByPersona<TScenario extends Scenario>(\n records: RunRecord[],\n scenarios: TScenario[],\n personaOf: (s: TScenario) => string,\n): Record<string, ScenarioRollup> {\n const personaByScenarioId = new Map<string, string>()\n for (const s of scenarios) personaByScenarioId.set(s.id, personaOf(s))\n return rollup(records, (r) => (r.scenarioId ? personaByScenarioId.get(r.scenarioId) : undefined))\n}\n","/**\n * @experimental\n *\n * VCS-pluggable worktree adapter. One improvement = one worktree, PR-like\n * (multiple commits allowed). 
A code-tier driver's `propose()` creates a\n * worktree, an agent commits the change into it, and `finalize()` returns a\n * `CodeSurface{ worktreeRef }` the measurement checks out to run the worker\n * against the changed code. On promotion the worktree becomes the PR branch.\n *\n * The interface is VCS-agnostic so a future `jj` ([jj-vcs](https://github.com/jj-vcs/jj))\n * adapter can slot in without touching driver code. Only the git adapter\n * ships today. See `docs/design/self-improvement-engine.md`.\n */\n\nimport { execFileSync } from 'node:child_process'\nimport { existsSync } from 'node:fs'\nimport { basename, isAbsolute, join } from 'node:path'\nimport type { CodeSurface } from '../types'\n\nexport interface Worktree {\n /** Absolute path to the checked-out worktree directory. */\n path: string\n /** The branch the worktree is on (becomes the PR branch on promotion). */\n branch: string\n /** The ref the worktree was forked from. */\n baseRef: string\n}\n\nexport interface WorktreeAdapter {\n /** Create an isolated worktree on a fresh branch off `baseRef`. */\n create(opts: { baseRef: string; label: string }): Promise<Worktree>\n /** Commit any pending changes in the worktree, then return a CodeSurface\n * pointing at it. The agent has already written its change into\n * `worktree.path` by the time this is called. */\n finalize(worktree: Worktree, summary: string): Promise<CodeSurface>\n /** Remove the worktree (and its branch) — called for losing candidates. */\n discard(worktree: Worktree): Promise<void>\n}\n\nexport class WorktreeAdapterError extends Error {\n constructor(\n message: string,\n readonly cause?: unknown,\n ) {\n super(message)\n this.name = 'WorktreeAdapterError'\n }\n}\n\nexport interface GitWorktreeAdapterOptions {\n /** Repo root the worktrees fork from. */\n repoRoot: string\n /** Directory worktrees are created under. Default: `<repoRoot>/.worktrees`. */\n worktreeDir?: string\n /** Branch-name prefix. Default: `improve`. 
*/\n branchPrefix?: string\n /** Test seam — defaults to a real `git` runner. */\n git?: (args: string[], cwd: string) => string\n}\n\nfunction defaultGit(args: string[], cwd: string): string {\n try {\n return execFileSync('git', args, { cwd, encoding: 'utf8' }).trim()\n } catch (err) {\n const stderr =\n err && typeof err === 'object' && 'stderr' in err\n ? String((err as { stderr: unknown }).stderr)\n : ''\n throw new WorktreeAdapterError(`git ${args.join(' ')} failed: ${stderr || String(err)}`, err)\n }\n}\n\n/** Slugify a label into a branch-safe segment. */\nfunction slug(label: string): string {\n return (\n label\n .toLowerCase()\n .replace(/[^a-z0-9]+/g, '-')\n .replace(/^-+|-+$/g, '')\n .slice(0, 48) || 'candidate'\n )\n}\n\nexport function gitWorktreeAdapter(opts: GitWorktreeAdapterOptions): WorktreeAdapter {\n const git = opts.git ?? defaultGit\n const worktreeDir = opts.worktreeDir ?? join(opts.repoRoot, '.worktrees')\n const branchPrefix = opts.branchPrefix ?? 'improve'\n\n return {\n async create({ baseRef, label }) {\n const id = `${slug(label)}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 6)}`\n const branch = `${branchPrefix}/${id}`\n const path = join(worktreeDir, id)\n git(['worktree', 'add', '-b', branch, path, baseRef], opts.repoRoot)\n return { path, branch, baseRef }\n },\n\n async finalize(worktree, summary) {\n // Stage + commit any pending changes the agent left in the worktree.\n // A no-op commit is refused by git, so only commit when the tree is dirty.\n const status = git(['status', '--porcelain'], worktree.path)\n if (status.length > 0) {\n git(['add', '-A'], worktree.path)\n git(['commit', '-m', summary], worktree.path)\n }\n return {\n kind: 'code',\n worktreeRef: worktree.path,\n baseRef: worktree.baseRef,\n summary,\n }\n },\n\n async discard(worktree) {\n // Remove the worktree, then delete its branch. 
Force-remove because the\n // worktree may hold uncommitted experiment state we're discarding.\n git(['worktree', 'remove', '--force', worktree.path], opts.repoRoot)\n git(['branch', '-D', worktree.branch], opts.repoRoot)\n },\n }\n}\n\n/** Resolve a `CodeSurface`'s worktreeRef to a directory the measurement can\n * run the worker in. A path ref is returned as-is; anything else is treated\n * as a ref under the adapter's worktree dir. */\nexport function resolveWorktreePath(surface: CodeSurface, worktreeDir?: string): string {\n if (isAbsolute(surface.worktreeRef) && existsSync(surface.worktreeRef)) return surface.worktreeRef\n if (worktreeDir) return join(worktreeDir, basename(surface.worktreeRef))\n return surface.worktreeRef\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AA6BA,SAAS,kBAAkB;AAC3B,SAAS,YAAY,WAAW,cAAc,qBAAqB;AACnE,SAAS,YAAY;AAqBd,IAAM,4BAAN,cAAwC,MAAM;AAAA,EACnD,YACkB,MAChB,SACA;AACA,UAAM,OAAO;AAHG;AAIhB,SAAK,OAAO;AAAA,EACd;AAAA,EALkB;AAMpB;AAQO,IAAM,yBAAN,MAA6D;AAAA,EAIlE,YAA6B,SAAwC;AAAxC;AAC3B,QAAI,CAAC,WAAW,QAAQ,IAAI,EAAG,WAAU,QAAQ,MAAM,EAAE,WAAW,KAAK,CAAC;AAC1E,SAAK,MAAM,QAAQ,OAAO,KAAK;AAAA,EACjC;AAAA,EAH6B;AAAA,EAHZ;AAAA,EACA,aAAa,oBAAI,IAA4B;AAAA,EAO9D,MAAM,QAAQ,OAA4C;AACxD,SAAK,iBAAiB,KAAK;AAC3B,SAAK,gBAAgB,KAAK;AAC1B,UAAM,SAAS,KAAK,SAAS,KAAK;AAClC,UAAM,OAAO,KAAK,cAAc,MAAM,MAAM;AAC5C,UAAM,OAAO,GAAG,KAAK,UAAU,MAAM,CAAC;AAAA;AAItC,eAAW,MAAM,IAAI;AAAA,EACvB;AAAA,EAEA,MAAM,OAAO,MAAmE;AAC9E,QAAI,CAAC,KAAK,OAAO;AACf,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,KAAK,gBAAgB;AACxB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AAEA,UAAM,MAA+B,CAAC;AACtC,eAAW,UAAU,aAAa;AAGhC,UAAI,KAAK,UAAU,WAAW,WAAW,oBAAoB;AAC3D,cAAM,WAAW,qBAAqB,KAAK,QAAQ,QAAQ,kBAAkB;AAC7E,YAAI,CAAC,SAAU;AAAA,MACjB;AACA,YAAM,OAAO,KAAK,cAAc,MAAM;AACtC,UAAI,CAAC,WAAW,IAAI,EAAG;AACvB,YAAM,QAAQ,aAAa,MAAM,MAAM,EAAE,MAAM,IAAI,EAAE,OAAO,OAAO;AACnE,iBAAW,QAAQ,OAAO;AACxB,YAAI;AACJ,YAAI;AACF,mBAAS,KAAK,MAAM,IAAI;AAAA,QAC1B,QAAQ;AACN;AAAA,QACF;AACA,YAA
I,CAAC,cAAc,QAAQ,MAAM,MAAM,EAAG;AAC1C,YAAI,KAAK,MAAM;AAAA,MACjB;AAAA,IACF;AAGA,QAAI,KAAK,CAAC,GAAG,MAAM;AACjB,UAAI,EAAE,eAAe,EAAE,WAAY,QAAO,EAAE,WAAW,cAAc,EAAE,UAAU;AACjF,aAAO,EAAE,WAAW,cAAc,EAAE,UAAU;AAAA,IAChD,CAAC;AAED,WAAO,IAAI,MAAM,GAAG,KAAK,KAAK;AAAA,EAChC;AAAA,EAEA,MAAM,OAKH;AACD,UAAM,WAAmC,CAAC;AAC1C,UAAM,UAAsC;AAAA,MAC1C,YAAY;AAAA,MACZ,mBAAmB;AAAA,MACnB,eAAe;AAAA,IACjB;AACA,QAAI,QAAQ;AACZ,eAAW,UAAU,aAAa;AAChC,YAAM,OAAO,KAAK,cAAc,MAAM;AACtC,UAAI,CAAC,WAAW,IAAI,GAAG;AACrB,iBAAS,MAAM,IAAI;AACnB;AAAA,MACF;AACA,YAAM,QAAQ,aAAa,MAAM,MAAM,EAAE,MAAM,IAAI,EAAE,OAAO,OAAO;AACnE,eAAS,MAAM,IAAI,MAAM;AACzB,eAAS,MAAM;AACf,iBAAW,QAAQ,OAAO;AACxB,YAAI,QAAoB;AACxB,YAAI;AACF,kBAAS,KAAK,MAAM,IAAI,EAA4B,cAAc;AAAA,QACpE,QAAQ;AAAA,QAER;AACA,gBAAQ,KAAK,KAAK;AAAA,MACpB;AAAA,IACF;AAIA,WAAO,EAAE,OAAO,OAAO,MAAM,OAAO,UAAU,QAAQ;AAAA,EACxD;AAAA,EAEQ,iBAAiB,OAAmC;AAC1D,QAAI,CAAC,MAAM,QAAQ;AACjB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,qBAAqB,MAAM,kBAAkB,WAAW,GAAG;AACpE,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,YAAY;AACrB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,iBAAiB;AAC1B,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,YAAY,SAAS,MAAM,MAAM,GAAG;AACvC,YAAM,IAAI;AAAA,QACR;AAAA,QACA,+CAA+C,YAAY,KAAK,IAAI,CAAC;AAAA,MACvE;AAAA,IACF;AAAA,EACF;AAAA,EAEQ,gBAAgB,OAAmC;AACzD,UAAM,MAAM,KAAK,QAAQ;AACzB,QAAI,CAAC,OAAO,CAAC,MAAM,gBAAiB;AACpC,UAAM,MAAM,KAAK,IAAI;AACrB,UAAM,WAAW;AACjB,QAAI,QAAQ,KAAK,WAAW,IAAI,MAAM,eAAe;AACrD,QAAI,CAAC,SAAS,MAAM,MAAM,iBAAiB,UAAU;AACnD,cAAQ,EAAE,QAAQ,MAAM,iBAAiB,eAAe,KAAK,OAAO,EAAE;AACtE,WAAK,WAAW,IAAI,MAAM,iBAAiB,KAAK;AAAA,IAClD;AACA,QAAI,MAAM,SAAS,KAAK;AACtB,YAAM,IAAI;AAAA,QACR;AAAA,QACA,gCAAgC,MAAM,eAAe,aAAa,GAAG;AAAA,MACvE;AAAA,IACF;AACA,UAAM,SAAS;AAAA,EACjB;AAAA,EAEQ,SAAS,OAAoD;AACnE,UAAM,aAAa;AAAA,MACjB,KAAK,UAAU;AAAA,QACb,IAAI,MAAM,SAAS;AAAA,QACnB,KAAK,MAAM;AAAA,QACX,IAAI,MAAM;AAAA,QACV,KAAK,MAAM;AAAA,MACb,CAAC;AAAA,IACH;AAGA,WAAO;AAAA,MACL,GAAG;AAAA,MACH;AAAA,MACA,
OAAO;AAAA,IACT;AAAA,EACF;AAAA,EAEQ,cAAc,QAAwB;AAC5C,WAAO,KAAK,KAAK,QAAQ,MAAM,GAAG,MAAM,QAAQ;AAAA,EAClD;AACF;AAEA,IAAM,cAAgD;AAAA,EACpD;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAEA,SAAS,qBACP,QACA,QACS;AACT,MAAI,CAAC,OAAQ,QAAO;AACpB,MAAI,MAAM,QAAQ,MAAM,EAAG,QAAO,OAAO,SAAS,MAAM;AACxD,SAAO,WAAW;AACpB;AAEA,SAAS,cACP,QACA,MACA,QACS;AAET,MAAI,KAAK,UAAU,WAAW,OAAO,cAAc,KAAK,eAAgB,QAAO;AAC/E,MAAI,KAAK,UAAU,UAAU,OAAO,aAAa,KAAK,eAAgB,QAAO;AAE7E,QAAM,IAAI,KAAK;AACf,MAAI,CAAC,EAAG,QAAO;AACf,MAAI,EAAE,QAAQ,OAAO,SAAS,SAAS,EAAE,KAAM,QAAO;AACtD,MAAI,EAAE,QAAQ;AACZ,UAAM,UAAU,MAAM,QAAQ,EAAE,MAAM,IAAI,EAAE,SAAS,CAAC,EAAE,MAAM;AAC9D,QAAI,CAAC,QAAQ,SAAS,MAAe,EAAG,QAAO;AAAA,EACjD;AACA,MAAI,EAAE,iBAAiB,UAAa,EAAE,iBAAiB,QAAW;AAChE,UAAM,aAAa,OAAO,OAAO,OAAO,WAAW,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS;AAC3E,UAAM,MAAM,WAAW,WAAW,IAAI,IAAI,KAAK,IAAI,GAAG,UAAU;AAChE,QAAI,EAAE,iBAAiB,UAAa,MAAM,EAAE,aAAc,QAAO;AACjE,QAAI,EAAE,iBAAiB,UAAa,MAAM,EAAE,aAAc,QAAO;AAAA,EACnE;AACA,MAAI,EAAE,aAAa,UAAa,eAAe,OAAO,UAAU,IAAI,eAAe,EAAE,QAAQ,GAAG;AAC9F,WAAO;AAAA,EACT;AACA,SAAO;AACT;AAEA,SAAS,OAAO,OAAuB;AACrC,SAAO,WAAW,QAAQ,EAAE,OAAO,KAAK,EAAE,OAAO,KAAK,EAAE,MAAM,GAAG,EAAE;AACrE;AAEA,SAAS,WAAW,MAAc,MAAoB;AACpD,MAAI,WAAW,IAAI,GAAG;AACpB,UAAM,WAAW,aAAa,MAAM,MAAM;AAC1C,kBAAc,MAAM,WAAW,IAAI;AAAA,EACrC,OAAO;AACL,kBAAc,MAAM,IAAI;AAAA,EAC1B;AACF;;;AC5QA,SAAS,cAAAA,mBAAkB;AAC3B,SAAS,QAAAC,aAAY;AA6Bd,IAAM,qBAAN,cAAiC,eAAe;AAAA,EACrD,YAAY,SAAiB;AAC3B,UAAM,kBAAkB,OAAO;AAAA,EACjC;AACF;AAyGA,SAAS,SAAS,IAAoB;AACpC,SAAO,GAAG,QAAQ,mBAAmB,GAAG;AAC1C;AAEA,SAAS,IAAI,OAAwB;AACnC,SAAOC,YAAW,QAAQ,EAAE,OAAO,KAAK,UAAU,KAAK,CAAC,EAAE,OAAO,KAAK;AACxE;AAEA,SAAS,KAAK,IAAsB;AAClC,SAAO,GAAG,WAAW,IAAI,IAAI,GAAG,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,GAAG;AAClE;AAEA,SAAS,cAAc,MAA2C;AAChE,QAAM,aAAa,OAAO,OAAO,KAAK,WAAW,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS;AACzE,SAAO,WAAW,WAAW,IAAI,IAAI,KAAK,UAAU;AACtD;AAaA,SAAS,eAA0B,MAA6C;AAC9E,QAAM,EAAE,MAAM,SAAS,aAAa,YAAY,cAAc,UAAU,WAAW,SAAS,IAC1F;AACF,QAAM,YAAY,cAAc,IAAI;AAGpC,QAAM,MAA8B,EAAE,UAAU;AAChD,QAAM,WAAmD,CAAC;AA
C1D,QAAM,WAAqC,CAAC;AAC5C,QAAM,QAAkB,CAAC;AACzB,aAAW,CAAC,WAAW,EAAE,KAAK,OAAO,QAAQ,KAAK,WAAW,GAAG;AAC9D,aAAS,SAAS,IAAI,EAAE,GAAG,GAAG,WAAW;AACzC,eAAW,CAAC,KAAK,KAAK,KAAK,OAAO,QAAQ,GAAG,UAAU,GAAG;AACxD,UAAI,GAAG,SAAS,IAAI,GAAG,EAAE,IAAI;AAC5B,OAAC,SAAS,GAAG,MAAM,CAAC,GAAG,KAAK,KAAK;AAAA,IACpC;AACA,QAAI,GAAG,MAAO,OAAM,KAAK,GAAG,SAAS,KAAK,GAAG,KAAK,EAAE;AAAA,EACtD;AACA,QAAM,aAAqC,CAAC;AAC5C,aAAW,CAAC,KAAK,MAAM,KAAK,OAAO,QAAQ,QAAQ,EAAG,YAAW,GAAG,IAAI,KAAK,MAAM;AAEnF,QAAM,UACJ,aAAa,YAAY,EAAE,cAAc,WAAW,IAAI,IAAI,EAAE,aAAa,WAAW,IAAI;AAC5F,MAAI,OAAO,KAAK,QAAQ,EAAE,SAAS,GAAG;AACpC,YAAQ,cAAc;AAAA,MACpB;AAAA,MACA;AAAA,MACA;AAAA,MACA,GAAI,MAAM,SAAS,IAAI,EAAE,OAAO,MAAM,KAAK,KAAK,EAAE,IAAI,CAAC;AAAA,IACzD;AAAA,EACF;AAEA,SAAO;AAAA,IACL,OAAO,GAAG,QAAQ,IAAI,QAAQ,EAAE,IAAI,KAAK,MAAM;AAAA,IAC/C;AAAA,IACA,aAAa,QAAQ;AAAA,IACrB,MAAM,KAAK;AAAA,IACX,OAAO,QAAQ;AAAA,IACf,YAAY;AAAA,IACZ;AAAA,IACA;AAAA,IACA,QAAQ,KAAK;AAAA,IACb,SAAS,KAAK;AAAA,IACd,YAAY,KAAK;AAAA,IACjB;AAAA,IACA;AAAA,IACA,YAAY,KAAK;AAAA,IACjB,GAAI,KAAK,QAAQ,EAAE,aAAa,KAAK,MAAM,IAAI,CAAC;AAAA,EAClD;AACF;AAEA,eAAsB,iBACpB,MACuD;AACvD,MAAI,KAAK,SAAS,WAAW,EAAG,OAAM,IAAI,mBAAmB,4BAA4B;AACzF,MAAI,KAAK,UAAU,WAAW,EAAG,OAAM,IAAI,mBAAmB,6BAA6B;AAE3F,QAAM,WAAW,KAAK,YAAY;AAClC,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,WAAW,KAAK,YAAY;AAClC,QAAM,gBAAgB,KAAK,aAAa;AACxC,QAAM,aAAa,KAAK,SAAS,IAAI,CAAC,MAAM,EAAE,EAAE;AAChD,QAAM,eACJ,KAAK,gBACL,MAAM,IAAI,EAAE,YAAY,WAAW,KAAK,UAAU,IAAI,CAAC,MAAM,EAAE,EAAE,EAAE,CAAC,EAAE,MAAM,GAAG,EAAE,CAAC;AACpF,QAAM,WAAW,OAAO,IAAI,EAAE,cAAc,YAAY,MAAM,SAAS,CAAC,EAAE,MAAM,GAAG,EAAE,CAAC;AAMtF,aAAW,WAAW,KAAK,UAAU;AACnC,UAAM,cAAc,iBAAiB,OAAO;AAC5C,QAAI;AACF,wBAAkB;AAAA,QAChB,OAAO,GAAG,QAAQ,IAAI,QAAQ,EAAE;AAAA,QAChC;AAAA,QACA,aAAa,QAAQ;AAAA,QACrB;AAAA,QACA,OAAO,QAAQ;AAAA,QACf,YAAY;AAAA,QACZ,YAAY;AAAA,QACZ,WAAW,KAAK;AAAA,QAChB,QAAQ;AAAA,QACR,SAAS;AAAA,QACT,YAAY,EAAE,OAAO,GAAG,QAAQ,EAAE;AAAA,QAClC,SACE,aAAa,YAAY,EAAE,cAAc,GAAG,KAAK,CAAC,EAAE,IAAI,EAAE,aAAa,GAAG,KAAK,CAAC,EAAE;AAAA,QACpF;AAAA,MACF,CAAC;AAAA,IACH,SAAS,KAAK;AACZ,YAAM,IA
AI;AAAA,QACR,YAAY,QAAQ,EAAE,wBAAwB,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,MAChG;AAAA,IACF;AAAA,EACF;AAEA,QAAM,UAAuB,CAAC;AAC9B,QAAM,YAAkE,CAAC;AACzE,QAAM,YAA4C,CAAC;AAEnD,aAAW,WAAW,KAAK,UAAU;AACnC,UAAM,cAAc,iBAAiB,OAAO;AAC5C,UAAM,aAAa,IAAI;AAAA,MACrB,SAAS;AAAA,MACT,SAAS,KAAK,UAAU,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,IAAI;AAAA,MAC7C;AAAA,MACA;AAAA,IACF,CAAC;AAID,UAAM,WAAW,CAAC,UAAqB,QACrC,KAAK,SAAS,SAAS,UAAU,GAAG;AACtC,WAAO,eAAe,UAAU,QAAQ,EAAE,OAAO,WAAW,SAAS,QAAQ,EAAE,CAAC,GAAG,CAAC;AAEpF,UAAM,WAAW,MAAM,YAAkC;AAAA,MACvD,WAAW,KAAK;AAAA,MAChB;AAAA,MACA,QAAQ,KAAK;AAAA,MACb;AAAA,MACA,MAAM,KAAK;AAAA,MACX,gBAAgB,KAAK;AAAA,MACrB,aAAa,KAAK;AAAA,MAClB,cAAc,KAAK;AAAA,MACnB,eAAe,KAAK;AAAA,MACpB,SAAS,KAAK;AAAA,MACd,KAAK,KAAK;AAAA,MACV,QAAQC,MAAK,KAAK,QAAQ,SAAS,QAAQ,EAAE,CAAC;AAAA,IAChD,CAAC;AACD,cAAU,QAAQ,EAAE,IAAI;AAExB,UAAM,iBAA8B,CAAC;AACrC,eAAW,QAAQ,SAAS,OAAO;AACjC,YAAM,SAAS,eAAe;AAAA,QAC5B;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA,WAAW,KAAK;AAAA,QAChB;AAAA,MACF,CAAC;AACD,UAAI,SAAU,mBAAkB,MAAM;AACtC,qBAAe,KAAK,MAAM;AAC1B,cAAQ,KAAK,MAAM;AAAA,IACrB;AAEA,cAAU,QAAQ,EAAE,IAAI;AAAA,MACtB,WAAW,QAAQ;AAAA,MACnB;AAAA,MACA,OAAO,QAAQ;AAAA,MACf,SAAS,eAAe;AAAA,MACxB,eAAe,KAAK,eAAe,IAAI,WAAW,CAAC;AAAA,MACnD,cAAc,eAAe,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,SAAS,CAAC;AAAA,MAC9D,WAAW,0BAA0B,cAAc;AAAA,IACrD;AAAA,EACF;AAGA,QAAM,YAAY,0BAA0B,OAAO;AACnD,MAAI,kBAAkB,UAAU;AAC9B,sBAAkB,SAAS,EAAE,YAAY,KAAK,cAAc,KAAK,CAAC;AAAA,EACpE,WAAW,kBAAkB,UAAU,UAAU,YAAY,QAAQ;AAEnE,YAAQ;AAAA,MACN,yCAAyC,UAAU,OAAO,WAAM,UAAU,SAAS;AAAA,IACrF;AAAA,EACF;AAGA,QAAM,aAAa,OAAO,SAAS,CAAC,MAAM,EAAE,UAAU;AACtD,QAAM,YAAY,KAAK,YACnB,gBAAgB,SAAS,KAAK,WAAW,KAAK,SAAS,IACvD;AAEJ,SAAO,EAAE,UAAU,cAAc,SAAS,WAAW,YAAY,WAAW,WAAW,UAAU;AACnG;AAGA,SAAS,YAAY,GAAsB;AACzC,SAAO,EAAE,QAAQ,gBAAgB,EAAE,QAAQ,eAAe;AAC5D;AAEA,SAAS,OACP,SACA,OACgC;AAChC,QAAM,SAAS,oBAAI,IAAsB;AACzC,aAAW,KAAK,SAAS;AACvB,UAAM,MAAM,MAAM,CAAC;AACnB,QAAI,QAAQ,OAAW;AACvB,UAAM,MAAM,OAAO,IAAI,GAAG,KAAK,CAAC;AAChC,QAAI,KAAK,YAAY,CAAC,CAAC;AACvB,WAAO,IAAI,KAAK,GAAG;AAAA,EACrB
;AACA,QAAM,MAAsC,CAAC;AAC7C,aAAW,CAAC,KAAK,EAAE,KAAK,OAAQ,KAAI,GAAG,IAAI,EAAE,eAAe,KAAK,EAAE,GAAG,GAAG,GAAG,OAAO;AACnF,SAAO;AACT;AAEA,SAAS,gBACP,SACA,WACA,WACgC;AAChC,QAAM,sBAAsB,oBAAI,IAAoB;AACpD,aAAW,KAAK,UAAW,qBAAoB,IAAI,EAAE,IAAI,UAAU,CAAC,CAAC;AACrE,SAAO,OAAO,SAAS,CAAC,MAAO,EAAE,aAAa,oBAAoB,IAAI,EAAE,UAAU,IAAI,MAAU;AAClG;;;AC/YA,SAAS,oBAAoB;AAC7B,SAAS,cAAAC,mBAAkB;AAC3B,SAAS,UAAU,YAAY,QAAAC,aAAY;AAuBpC,IAAM,uBAAN,cAAmC,MAAM;AAAA,EAC9C,YACE,SACS,OACT;AACA,UAAM,OAAO;AAFJ;AAGT,SAAK,OAAO;AAAA,EACd;AAAA,EAJW;AAKb;AAaA,SAAS,WAAW,MAAgB,KAAqB;AACvD,MAAI;AACF,WAAO,aAAa,OAAO,MAAM,EAAE,KAAK,UAAU,OAAO,CAAC,EAAE,KAAK;AAAA,EACnE,SAAS,KAAK;AACZ,UAAM,SACJ,OAAO,OAAO,QAAQ,YAAY,YAAY,MAC1C,OAAQ,IAA4B,MAAM,IAC1C;AACN,UAAM,IAAI,qBAAqB,OAAO,KAAK,KAAK,GAAG,CAAC,YAAY,UAAU,OAAO,GAAG,CAAC,IAAI,GAAG;AAAA,EAC9F;AACF;AAGA,SAAS,KAAK,OAAuB;AACnC,SACE,MACG,YAAY,EACZ,QAAQ,eAAe,GAAG,EAC1B,QAAQ,YAAY,EAAE,EACtB,MAAM,GAAG,EAAE,KAAK;AAEvB;AAEO,SAAS,mBAAmB,MAAkD;AACnF,QAAM,MAAM,KAAK,OAAO;AACxB,QAAM,cAAc,KAAK,eAAeA,MAAK,KAAK,UAAU,YAAY;AACxE,QAAM,eAAe,KAAK,gBAAgB;AAE1C,SAAO;AAAA,IACL,MAAM,OAAO,EAAE,SAAS,MAAM,GAAG;AAC/B,YAAM,KAAK,GAAG,KAAK,KAAK,CAAC,IAAI,KAAK,IAAI,EAAE,SAAS,EAAE,CAAC,IAAI,KAAK,OAAO,EAAE,SAAS,EAAE,EAAE,MAAM,GAAG,CAAC,CAAC;AAC9F,YAAM,SAAS,GAAG,YAAY,IAAI,EAAE;AACpC,YAAM,OAAOA,MAAK,aAAa,EAAE;AACjC,UAAI,CAAC,YAAY,OAAO,MAAM,QAAQ,MAAM,OAAO,GAAG,KAAK,QAAQ;AACnE,aAAO,EAAE,MAAM,QAAQ,QAAQ;AAAA,IACjC;AAAA,IAEA,MAAM,SAAS,UAAU,SAAS;AAGhC,YAAM,SAAS,IAAI,CAAC,UAAU,aAAa,GAAG,SAAS,IAAI;AAC3D,UAAI,OAAO,SAAS,GAAG;AACrB,YAAI,CAAC,OAAO,IAAI,GAAG,SAAS,IAAI;AAChC,YAAI,CAAC,UAAU,MAAM,OAAO,GAAG,SAAS,IAAI;AAAA,MAC9C;AACA,aAAO;AAAA,QACL,MAAM;AAAA,QACN,aAAa,SAAS;AAAA,QACtB,SAAS,SAAS;AAAA,QAClB;AAAA,MACF;AAAA,IACF;AAAA,IAEA,MAAM,QAAQ,UAAU;AAGtB,UAAI,CAAC,YAAY,UAAU,WAAW,SAAS,IAAI,GAAG,KAAK,QAAQ;AACnE,UAAI,CAAC,UAAU,MAAM,SAAS,MAAM,GAAG,KAAK,QAAQ;AAAA,IACtD;AAAA,EACF;AACF;AAKO,SAAS,oBAAoB,SAAsB,aAA8B;AACtF,MAAI,WAAW,QAAQ,WAAW,KAAKD,YAAW,QAAQ,WAAW,EAAG,QAAO,QAAQ;AACvF,MAAI,YAAa,QAAOC,MAAK,aAAa,SAAS,QAAQ,WAAW,CAAC;AACvE,
SAAO,QAAQ;AACjB;","names":["createHash","join","createHash","join","existsSync","join"]}
|
|
1
|
+
{"version":3,"sources":["../../src/campaign/labeled-store/fs-adapter.ts","../../src/campaign/presets/run-profile-matrix.ts","../../src/campaign/worktree/index.ts"],"sourcesContent":["/**\n * @experimental\n *\n * Filesystem `LabeledScenarioStore` adapter. The default capture sink for\n * traces + eval artifacts. Production deployments typically swap for a\n * Turso/SQLite adapter (same interface).\n *\n * Records land as one JSONL file per source under `<root>/<source>.jsonl`.\n * Each line is a `LabeledScenarioRecord`. Append-only — no in-place edits.\n *\n * Safety properties enforced at write-time:\n *\n * - **Provenance required**: writes without `source`, `sourceVersionHash`,\n * `capturedAt`, `redactionStatus` are rejected. Closes the alignment\n * reviewer's data-poisoning gap.\n * - **Per-source rate limits**: optional `rateLimitBucket` + `maxWritesPerMinute`\n * stops a single tenant/source from flooding the store.\n *\n * Safety properties enforced at sample-time:\n *\n * - **Required split + capturedBefore**: substrate refuses to sample without\n * an explicit `split` ('train' | 'test') AND a temporal cutoff. Eliminates\n * accidental train/test contamination.\n * - **Default training-source filter**: when the store is sampled with\n * `split: 'train'`, production-trace records are EXCLUDED unless the\n * caller passes `filter.source: 'production-trace'` explicitly. Closes\n * the contamination-by-default gap flagged by the senior eval engineer.\n */\n\nimport { createHash } from 'node:crypto'\nimport { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs'\nimport { join } from 'node:path'\nimport type {\n LabeledScenarioRecord,\n LabeledScenarioSampleArgs,\n LabeledScenarioSource,\n LabeledScenarioStore,\n LabeledScenarioWrite,\n LabelTrust,\n} from '../types'\nimport { labelTrustRank } from '../types'\n\nexport interface FsLabeledScenarioStoreOptions {\n /** Root directory for JSONL files. Created if missing. 
*/\n root: string\n /** Per-source rate limit. When set, writes exceeding the cap are rejected\n * with a typed error. Default: no limit. */\n maxWritesPerMinutePerBucket?: number\n /** Test seam — override `Date.now()` for deterministic tests. */\n now?: () => number\n}\n\nexport class LabeledScenarioStoreError extends Error {\n constructor(\n public readonly code: string,\n message: string,\n ) {\n super(message)\n this.name = 'LabeledScenarioStoreError'\n }\n}\n\ninterface RateLimitState {\n bucket: string\n windowStartMs: number\n count: number\n}\n\nexport class FsLabeledScenarioStore implements LabeledScenarioStore {\n private readonly now: () => number\n private readonly rateLimits = new Map<string, RateLimitState>()\n\n constructor(private readonly options: FsLabeledScenarioStoreOptions) {\n if (!existsSync(options.root)) mkdirSync(options.root, { recursive: true })\n this.now = options.now ?? Date.now\n }\n\n async observe(write: LabeledScenarioWrite): Promise<void> {\n this.assertProvenance(write)\n this.assertRateLimit(write)\n const record = this.toRecord(write)\n const path = this.pathForSource(write.source)\n const line = `${JSON.stringify(record)}\\n`\n // Append atomically. 
For high-throughput a writev-friendly buffered\n // implementation lands in the Turso adapter; FS adapter is for tests +\n // local dev + small workloads.\n appendLine(path, line)\n }\n\n async sample(args: LabeledScenarioSampleArgs): Promise<LabeledScenarioRecord[]> {\n if (!args.split) {\n throw new LabeledScenarioStoreError(\n 'split_required',\n 'sample() requires an explicit `split` (train | test) — substrate refuses ambiguous reads',\n )\n }\n if (!args.capturedBefore) {\n throw new LabeledScenarioStoreError(\n 'capturedBefore_required',\n 'sample() requires an explicit `capturedBefore` timestamp for temporal-split discipline',\n )\n }\n\n const all: LabeledScenarioRecord[] = []\n for (const source of ALL_SOURCES) {\n // Default training-source filter: when sampling train, EXCLUDE\n // production-trace records unless the caller asks for them.\n if (args.split === 'train' && source === 'production-trace') {\n const explicit = sourceFilterContains(args.filter?.source, 'production-trace')\n if (!explicit) continue\n }\n const path = this.pathForSource(source)\n if (!existsSync(path)) continue\n const lines = readFileSync(path, 'utf8').split('\\n').filter(Boolean)\n for (const line of lines) {\n let record: LabeledScenarioRecord\n try {\n record = JSON.parse(line) as LabeledScenarioRecord\n } catch {\n continue\n }\n if (!matchesFilter(record, args, source)) continue\n all.push(record)\n }\n }\n\n // Deterministic order: by capturedAt ascending, then recordHash.\n all.sort((a, b) => {\n if (a.capturedAt !== b.capturedAt) return a.capturedAt.localeCompare(b.capturedAt)\n return a.recordHash.localeCompare(b.recordHash)\n })\n\n return all.slice(0, args.count)\n }\n\n async size(): Promise<{\n train: number\n test: number\n bySource: Record<string, number>\n byTrust: Record<LabelTrust, number>\n }> {\n const bySource: Record<string, number> = {}\n const byTrust: Record<LabelTrust, number> = {\n unverified: 0,\n 'verified-signal': 0,\n 'human-rated': 0,\n }\n let 
total = 0\n for (const source of ALL_SOURCES) {\n const path = this.pathForSource(source)\n if (!existsSync(path)) {\n bySource[source] = 0\n continue\n }\n const lines = readFileSync(path, 'utf8').split('\\n').filter(Boolean)\n bySource[source] = lines.length\n total += lines.length\n for (const line of lines) {\n let trust: LabelTrust = 'unverified'\n try {\n trust = (JSON.parse(line) as LabeledScenarioRecord).labelTrust ?? 'unverified'\n } catch {\n // A malformed line counts as unverified — never silently gold.\n }\n byTrust[trust] += 1\n }\n }\n // FS adapter doesn't track split assignments per-record (split is\n // computed at sample-time based on `capturedBefore`). For size(), we\n // report `train`+`test` as the same total — split is a sampling concept.\n return { train: total, test: total, bySource, byTrust }\n }\n\n private assertProvenance(write: LabeledScenarioWrite): void {\n if (!write.source) {\n throw new LabeledScenarioStoreError(\n 'missing_source',\n 'LabeledScenarioWrite requires `source`',\n )\n }\n if (!write.sourceVersionHash || write.sourceVersionHash.length === 0) {\n throw new LabeledScenarioStoreError(\n 'missing_source_version',\n 'LabeledScenarioWrite requires `sourceVersionHash` (git sha or substrate version)',\n )\n }\n if (!write.capturedAt) {\n throw new LabeledScenarioStoreError(\n 'missing_captured_at',\n 'LabeledScenarioWrite requires `capturedAt` ISO timestamp',\n )\n }\n if (!write.redactionStatus) {\n throw new LabeledScenarioStoreError(\n 'missing_redaction_status',\n 'LabeledScenarioWrite requires explicit `redactionStatus` — raw / redacted-pii / redacted-secrets / fully-redacted',\n )\n }\n if (!ALL_SOURCES.includes(write.source)) {\n throw new LabeledScenarioStoreError(\n 'unknown_source',\n `LabeledScenarioWrite.source must be one of: ${ALL_SOURCES.join(', ')}`,\n )\n }\n }\n\n private assertRateLimit(write: LabeledScenarioWrite): void {\n const cap = this.options.maxWritesPerMinutePerBucket\n if (!cap || 
!write.rateLimitBucket) return\n const now = this.now()\n const windowMs = 60_000\n let state = this.rateLimits.get(write.rateLimitBucket)\n if (!state || now - state.windowStartMs >= windowMs) {\n state = { bucket: write.rateLimitBucket, windowStartMs: now, count: 0 }\n this.rateLimits.set(write.rateLimitBucket, state)\n }\n if (state.count >= cap) {\n throw new LabeledScenarioStoreError(\n 'rate_limit_exceeded',\n `LabeledScenarioStore: bucket ${write.rateLimitBucket} exceeded ${cap} writes/min`,\n )\n }\n state.count += 1\n }\n\n private toRecord(write: LabeledScenarioWrite): LabeledScenarioRecord {\n const recordHash = sha256(\n JSON.stringify({\n id: write.scenario.id,\n src: write.source,\n at: write.capturedAt,\n ver: write.sourceVersionHash,\n }),\n )\n // FS adapter assigns split at sample-time, but we cache a hint here\n // based on capturedAt vs the world's \"now\" — sampler overrides this.\n return {\n ...write,\n recordHash,\n split: 'train',\n }\n }\n\n private pathForSource(source: string): string {\n return join(this.options.root, `${source}.jsonl`)\n }\n}\n\nconst ALL_SOURCES: LabeledScenarioWrite['source'][] = [\n 'production-trace',\n 'eval-run',\n 'manual',\n 'red-team',\n 'synthetic',\n]\n\nfunction sourceFilterContains(\n filter: LabeledScenarioSource | LabeledScenarioSource[] | undefined,\n needle: LabeledScenarioSource,\n): boolean {\n if (!filter) return false\n if (Array.isArray(filter)) return filter.includes(needle)\n return filter === needle\n}\n\nfunction matchesFilter(\n record: LabeledScenarioRecord,\n args: LabeledScenarioSampleArgs,\n source: string,\n): boolean {\n // Temporal cutoff — train must be capturedAt < capturedBefore.\n if (args.split === 'train' && record.capturedAt >= args.capturedBefore) return false\n if (args.split === 'test' && record.capturedAt < args.capturedBefore) return false\n\n const f = args.filter\n if (!f) return true\n if (f.kind && record.scenario.kind !== f.kind) return false\n if (f.source) {\n const 
sources = Array.isArray(f.source) ? f.source : [f.source]\n if (!sources.includes(source as never)) return false\n }\n if (f.minComposite !== undefined || f.maxComposite !== undefined) {\n const composites = Object.values(record.judgeScores).map((s) => s.composite)\n const max = composites.length === 0 ? 0 : Math.max(...composites)\n if (f.minComposite !== undefined && max < f.minComposite) return false\n if (f.maxComposite !== undefined && max > f.maxComposite) return false\n }\n if (f.minTrust !== undefined && labelTrustRank(record.labelTrust) < labelTrustRank(f.minTrust)) {\n return false\n }\n return true\n}\n\nfunction sha256(input: string): string {\n return createHash('sha256').update(input).digest('hex').slice(0, 16)\n}\n\nfunction appendLine(path: string, line: string): void {\n if (existsSync(path)) {\n const existing = readFileSync(path, 'utf8')\n writeFileSync(path, existing + line)\n } else {\n writeFileSync(path, line)\n }\n}\n","/**\n * @experimental\n *\n * `runProfileMatrix` — the missing keystone between `runAgentMatrix` and the\n * backend-integrity guard.\n *\n * The gap it closes: `runAgentMatrix` is a topology-opaque scheduler whose\n * cells return a bare `{ output, verdict, costUsd }` — no `tokenUsage`, not a\n * `RunRecord`. `assertRealBackend` / `summarizeBackendIntegrity` key on\n * `RunRecord.tokenUsage`, so they cannot run on a raw matrix result. Every\n * consumer therefore hand-writes the same bridge: fan a profile × scenario\n * cartesian, call dispatch, fabricate a `RunRecord` with token usage, thread it\n * back, run the integrity guard. 
That hand-rolled bridge is exactly the pile of\n * bespoke `eval:*` scripts the adoption skills keep trying (and failing) to\n * forbid.\n *\n * `runProfileMatrix` IS that bridge, once:\n *\n * - axis 3 (PROFILE) = `profiles: AgentProfile[]`\n * - axis 1 (PERSONA/SCENARIO) = `scenarios: Scenario[]` (each scenario carries\n * its persona; `personaOf` groups them for the `byPersona` pivot)\n * - the scoring axis = `judges`\n *\n * It runs `runCampaign` once per profile (reusing its seeds, reps, bootstrap\n * CIs, resumability, and the `LabeledScenarioStore` capture flywheel), maps\n * every cell to a validated `RunRecord` carrying the real `tokenUsage` the\n * dispatch reported via `ctx.cost.observeTokens`, and runs `assertRealBackend`\n * BY CONSTRUCTION before returning — so a stub-backend run fails loudly instead\n * of reporting a clean 0/N leaderboard.\n *\n * Dispatch contract: a dispatch that calls an LLM MUST report usage via\n * `ctx.cost.observeTokens({ input, output })` (and cost via `ctx.cost.observe`).\n * A dispatch that reports zero tokens is indistinguishable from a stub and the\n * integrity guard treats it as one.\n */\n\nimport { createHash } from 'node:crypto'\nimport { join } from 'node:path'\nimport { type AgentProfile, agentProfileHash } from '../../agent-profile'\nimport { AgentEvalError } from '../../errors'\nimport {\n assertRealBackend,\n type BackendIntegrityReport,\n summarizeBackendIntegrity,\n} from '../../integrity/backend-integrity'\nimport {\n type RunOutcome,\n type RunRecord,\n type RunSplitTag,\n validateRunRecord,\n} from '../../run-record'\nimport { runCampaign } from '../run-campaign'\nimport type { CampaignStorage } from '../storage'\nimport type {\n CampaignCellResult,\n CampaignResult,\n DispatchContext,\n JudgeConfig,\n LabeledScenarioSource,\n LabeledScenarioStore,\n Scenario,\n} from '../types'\n\n/** Thrown when the matrix is misconfigured (no profiles, a profile whose model\n * lacks a snapshot version, etc.). 
Distinct from `BackendIntegrityError`,\n * which signals a stub backend at run time. */\nexport class ProfileMatrixError extends AgentEvalError {\n constructor(message: string) {\n super('profile_matrix', message)\n }\n}\n\n/** Dispatch for one cell: render `profile` against `scenario`, returning the\n * artifact the judges score. Report LLM usage via `ctx.cost.observeTokens`\n * and `ctx.cost.observe` — the integrity guard depends on it. */\nexport type ProfileDispatchFn<TScenario extends Scenario, TArtifact> = (\n profile: AgentProfile,\n scenario: TScenario,\n ctx: DispatchContext,\n) => Promise<TArtifact>\n\nexport interface RunProfileMatrixOptions<TScenario extends Scenario, TArtifact> {\n /** Axis 3 — the agent-under-test configurations. Each is one column. */\n profiles: AgentProfile[]\n /** Axis 1 — the persona/scenario corpus, run against every profile. */\n scenarios: TScenario[]\n /** Renders one (profile, scenario) cell. */\n dispatch: ProfileDispatchFn<TScenario, TArtifact>\n /** The scoring axis. */\n judges?: JudgeConfig<TArtifact, TScenario>[]\n /** Where each profile's campaign writes artifacts/traces. One subdir per\n * profile. */\n runDir: string\n /** Git SHA the harness ran from — stamped onto every RunRecord (mandatory\n * for paper-grade records). */\n commitSha: string\n /** Logical experiment id shared across the whole matrix so the promotion\n * gate can pair profiles on matched scenarios. Default: a hash of the\n * profile + scenario ids. */\n experimentId?: string\n /** Which split these runs belong to. Default `'search'`. */\n splitTag?: RunSplitTag\n /** Replicates per (profile, scenario) cell for CI bands. Default 1. */\n reps?: number\n /** Campaign seed (per profile). Default 42. 
*/\n seed?: number\n /**\n * Backend-integrity posture, enforced AFTER the matrix completes:\n * - `'assert'` (default) — throw `BackendIntegrityError` if the run was a\n * stub (and, with `allowMixed:false`, if it was mixed).\n * - `'warn'` — log the verdict but never throw.\n * - `'off'` — skip the guard entirely (only for offline/replay analysis).\n */\n integrity?: 'assert' | 'warn' | 'off'\n /** Forwarded to `assertRealBackend`. Default true (tolerate partial 429\n * cascades); set false for strict CI gates. */\n allowMixed?: boolean\n /** Max concurrent cells WITHIN each profile's campaign. Default 2.\n * Profiles run sequentially so the cost ceiling is honored deterministically. */\n maxConcurrency?: number\n /** Cumulative USD cap per profile campaign. */\n costCeiling?: number\n /** Capture flywheel — forwarded to each campaign. */\n labeledStore?: LabeledScenarioStore | 'off'\n captureSource?: LabeledScenarioSource\n /** Storage backend. Default `fsCampaignStorage`. Pass\n * `inMemoryCampaignStorage()` for edge/CF-Worker/test runs. */\n storage?: CampaignStorage\n /** Test seam — override the wall clock. */\n now?: () => Date\n /** Optional persona key per scenario — drives the `byPersona` pivot. When\n * unset, `byPersona` is omitted. */\n personaOf?: (scenario: TScenario) => string\n /** Validate every produced RunRecord with `validateRunRecord` (fail-loud).\n * Default true — catches bad model snapshots and non-finite judge dims at\n * the boundary instead of letting them poison downstream analysis. */\n validate?: boolean\n}\n\nexport interface ProfileSummary {\n profileId: string\n profileHash: string\n model: string\n /** RunRecords produced for this profile (= scenarios × reps). */\n records: number\n /** Mean composite across this profile's records. */\n meanComposite: number\n totalCostUsd: number\n /** Per-profile integrity verdict — surfaces a single profile that ran stub\n * even when the matrix as a whole looks real. 
*/\n integrity: BackendIntegrityReport\n}\n\nexport interface ScenarioRollup {\n meanComposite: number\n n: number\n}\n\nexport interface RunProfileMatrixResult<TArtifact, TScenario extends Scenario> {\n matrixId: string\n experimentId: string\n /** One RunRecord per (profile, scenario, rep) cell — the integrity-checked,\n * paper-grade output. Feed straight into `analyzeRuns`, `HeldOutGate`,\n * scorecards, the hosted wire format. */\n records: RunRecord[]\n byProfile: Record<string, ProfileSummary>\n byScenario: Record<string, ScenarioRollup>\n /** Present only when `personaOf` was supplied. */\n byPersona?: Record<string, ScenarioRollup>\n /** Whole-matrix integrity report (the one `integrity:'assert'` enforces). */\n integrity: BackendIntegrityReport\n /** The raw per-profile campaign results, keyed by profile id. */\n campaigns: Record<string, CampaignResult<TArtifact, TScenario>>\n}\n\nfunction sanitize(id: string): string {\n return id.replace(/[^a-zA-Z0-9_-]/g, '_')\n}\n\nfunction sha(input: unknown): string {\n return createHash('sha256').update(JSON.stringify(input)).digest('hex')\n}\n\nfunction mean(xs: number[]): number {\n return xs.length === 0 ? 0 : xs.reduce((a, b) => a + b, 0) / xs.length\n}\n\nfunction cellComposite(cell: CampaignCellResult<unknown>): number {\n const composites = Object.values(cell.judgeScores).map((s) => s.composite)\n return composites.length === 0 ? 
0 : mean(composites)\n}\n\ninterface BuildRecordArgs<TArtifact> {\n cell: CampaignCellResult<TArtifact>\n profile: AgentProfile\n profileHash: string\n configHash: string\n experimentId: string\n splitTag: RunSplitTag\n commitSha: string\n matrixId: string\n}\n\nfunction buildRunRecord<TArtifact>(args: BuildRecordArgs<TArtifact>): RunRecord {\n const { cell, profile, profileHash, configHash, experimentId, splitTag, commitSha, matrixId } =\n args\n const composite = cellComposite(cell)\n\n // Flatten judge dimensions (judge-prefixed to avoid collisions) into raw.\n const raw: Record<string, number> = { composite }\n const perJudge: Record<string, Record<string, number>> = {}\n const dimAccum: Record<string, number[]> = {}\n const notes: string[] = []\n for (const [judgeName, js] of Object.entries(cell.judgeScores)) {\n perJudge[judgeName] = { ...js.dimensions }\n for (const [dim, value] of Object.entries(js.dimensions)) {\n raw[`${judgeName}.${dim}`] = value\n ;(dimAccum[dim] ??= []).push(value)\n }\n if (js.notes) notes.push(`${judgeName}: ${js.notes}`)\n }\n const perDimMean: Record<string, number> = {}\n for (const [dim, values] of Object.entries(dimAccum)) perDimMean[dim] = mean(values)\n\n const outcome: RunOutcome =\n splitTag === 'holdout' ? { holdoutScore: composite, raw } : { searchScore: composite, raw }\n if (Object.keys(perJudge).length > 0) {\n outcome.judgeScores = {\n perJudge,\n perDimMean,\n composite,\n ...(notes.length > 0 ? { notes: notes.join(' | ') } : {}),\n }\n }\n\n return {\n runId: `${matrixId}:${profile.id}:${cell.cellId}`,\n experimentId,\n candidateId: profile.id,\n seed: cell.seed,\n model: profile.model,\n promptHash: profileHash,\n configHash,\n commitSha,\n wallMs: cell.durationMs,\n costUsd: cell.costUsd,\n tokenUsage: cell.tokenUsage,\n outcome,\n splitTag,\n scenarioId: cell.scenarioId,\n ...(cell.error ? 
{ failureMode: cell.error } : {}),\n }\n}\n\nexport async function runProfileMatrix<TScenario extends Scenario, TArtifact>(\n opts: RunProfileMatrixOptions<TScenario, TArtifact>,\n): Promise<RunProfileMatrixResult<TArtifact, TScenario>> {\n if (opts.profiles.length === 0) throw new ProfileMatrixError('profiles must not be empty')\n if (opts.scenarios.length === 0) throw new ProfileMatrixError('scenarios must not be empty')\n\n const splitTag = opts.splitTag ?? 'search'\n const seed = opts.seed ?? 42\n const validate = opts.validate ?? true\n const integrityMode = opts.integrity ?? 'assert'\n const profileIds = opts.profiles.map((p) => p.id)\n const experimentId =\n opts.experimentId ??\n `pm_${sha({ profileIds, scenarios: opts.scenarios.map((s) => s.id) }).slice(0, 16)}`\n const matrixId = `mtx_${sha({ experimentId, profileIds, seed, splitTag }).slice(0, 16)}`\n\n // Preflight: every profile must hash (non-empty model) AND its model must\n // carry a snapshot version, BEFORE any LLM spend. A probe record run through\n // validateRunRecord catches both in the exact place they'd otherwise surface\n // far downstream.\n for (const profile of opts.profiles) {\n const profileHash = agentProfileHash(profile)\n try {\n validateRunRecord({\n runId: `${matrixId}:${profile.id}:probe`,\n experimentId,\n candidateId: profile.id,\n seed,\n model: profile.model,\n promptHash: profileHash,\n configHash: profileHash,\n commitSha: opts.commitSha,\n wallMs: 0,\n costUsd: 0,\n tokenUsage: { input: 0, output: 0 },\n outcome:\n splitTag === 'holdout' ? { holdoutScore: 0, raw: {} } : { searchScore: 0, raw: {} },\n splitTag,\n })\n } catch (err) {\n throw new ProfileMatrixError(\n `profile '${profile.id}' is not recordable: ${err instanceof Error ? 
err.message : String(err)}`,\n )\n }\n }\n\n const records: RunRecord[] = []\n const campaigns: Record<string, CampaignResult<TArtifact, TScenario>> = {}\n const byProfile: Record<string, ProfileSummary> = {}\n\n for (const profile of opts.profiles) {\n const profileHash = agentProfileHash(profile)\n const configHash = sha({\n profile: profileHash,\n judges: (opts.judges ?? []).map((j) => j.name),\n seed,\n splitTag,\n })\n\n // Bind the profile into a campaign dispatch. Name it so the campaign's\n // manifest hash is stable + distinct per profile.\n const dispatch = (scenario: TScenario, ctx: DispatchContext): Promise<TArtifact> =>\n opts.dispatch(profile, scenario, ctx)\n Object.defineProperty(dispatch, 'name', { value: `profile_${sanitize(profile.id)}` })\n\n const campaign = await runCampaign<TScenario, TArtifact>({\n scenarios: opts.scenarios,\n dispatch,\n judges: opts.judges,\n seed,\n reps: opts.reps,\n maxConcurrency: opts.maxConcurrency,\n costCeiling: opts.costCeiling,\n labeledStore: opts.labeledStore,\n captureSource: opts.captureSource,\n storage: opts.storage,\n now: opts.now,\n runDir: join(opts.runDir, sanitize(profile.id)),\n })\n campaigns[profile.id] = campaign\n\n const profileRecords: RunRecord[] = []\n for (const cell of campaign.cells) {\n const record = buildRunRecord({\n cell,\n profile,\n profileHash,\n configHash,\n experimentId,\n splitTag,\n commitSha: opts.commitSha,\n matrixId,\n })\n if (validate) validateRunRecord(record)\n profileRecords.push(record)\n records.push(record)\n }\n\n byProfile[profile.id] = {\n profileId: profile.id,\n profileHash,\n model: profile.model,\n records: profileRecords.length,\n meanComposite: mean(profileRecords.map(compositeOf)),\n totalCostUsd: profileRecords.reduce((a, r) => a + r.costUsd, 0),\n integrity: summarizeBackendIntegrity(profileRecords),\n }\n }\n\n // Integrity by construction — the whole point of the primitive.\n const integrity = summarizeBackendIntegrity(records)\n if (integrityMode === 
'assert') {\n assertRealBackend(records, { allowMixed: opts.allowMixed ?? true })\n } else if (integrityMode === 'warn' && integrity.verdict !== 'real') {\n // eslint-disable-next-line no-console\n console.warn(\n `[runProfileMatrix] backend integrity: ${integrity.verdict} — ${integrity.diagnosis}`,\n )\n }\n\n // Pivots.\n const byScenario = rollup(records, (r) => r.scenarioId)\n const byPersona = opts.personaOf\n ? rollupByPersona(records, opts.scenarios, opts.personaOf)\n : undefined\n\n return { matrixId, experimentId, records, byProfile, byScenario, byPersona, integrity, campaigns }\n}\n\n/** Composite for a produced RunRecord (the split score it carries). */\nfunction compositeOf(r: RunRecord): number {\n return r.outcome.holdoutScore ?? r.outcome.searchScore ?? 0\n}\n\nfunction rollup(\n records: RunRecord[],\n keyOf: (r: RunRecord) => string | undefined,\n): Record<string, ScenarioRollup> {\n const groups = new Map<string, number[]>()\n for (const r of records) {\n const key = keyOf(r)\n if (key === undefined) continue\n const arr = groups.get(key) ?? []\n arr.push(compositeOf(r))\n groups.set(key, arr)\n }\n const out: Record<string, ScenarioRollup> = {}\n for (const [key, xs] of groups) out[key] = { meanComposite: mean(xs), n: xs.length }\n return out\n}\n\nfunction rollupByPersona<TScenario extends Scenario>(\n records: RunRecord[],\n scenarios: TScenario[],\n personaOf: (s: TScenario) => string,\n): Record<string, ScenarioRollup> {\n const personaByScenarioId = new Map<string, string>()\n for (const s of scenarios) personaByScenarioId.set(s.id, personaOf(s))\n return rollup(records, (r) => (r.scenarioId ? personaByScenarioId.get(r.scenarioId) : undefined))\n}\n","/**\n * @experimental\n *\n * VCS-pluggable worktree adapter. One improvement = one worktree, PR-like\n * (multiple commits allowed). 
A code-tier driver's `propose()` creates a\n * worktree, an agent commits the change into it, and `finalize()` returns a\n * `CodeSurface{ worktreeRef }` the measurement checks out to run the worker\n * against the changed code. On promotion the worktree becomes the PR branch.\n *\n * The interface is VCS-agnostic so a future `jj` ([jj-vcs](https://github.com/jj-vcs/jj))\n * adapter can slot in without touching driver code. Only the git adapter\n * ships today. See `docs/design/self-improvement-engine.md`.\n */\n\nimport { execFileSync } from 'node:child_process'\nimport { existsSync } from 'node:fs'\nimport { basename, isAbsolute, join } from 'node:path'\nimport type { CodeSurface } from '../types'\n\nexport interface Worktree {\n /** Absolute path to the checked-out worktree directory. */\n path: string\n /** The branch the worktree is on (becomes the PR branch on promotion). */\n branch: string\n /** The ref the worktree was forked from. */\n baseRef: string\n}\n\nexport interface WorktreeAdapter {\n /** Create an isolated worktree on a fresh branch off `baseRef`. */\n create(opts: { baseRef: string; label: string }): Promise<Worktree>\n /** Commit any pending changes in the worktree, then return a CodeSurface\n * pointing at it. The agent has already written its change into\n * `worktree.path` by the time this is called. */\n finalize(worktree: Worktree, summary: string): Promise<CodeSurface>\n /** Remove the worktree (and its branch) — called for losing candidates. */\n discard(worktree: Worktree): Promise<void>\n}\n\nexport class WorktreeAdapterError extends Error {\n constructor(\n message: string,\n readonly cause?: unknown,\n ) {\n super(message)\n this.name = 'WorktreeAdapterError'\n }\n}\n\nexport interface GitWorktreeAdapterOptions {\n /** Repo root the worktrees fork from. */\n repoRoot: string\n /** Directory worktrees are created under. Default: `<repoRoot>/.worktrees`. */\n worktreeDir?: string\n /** Branch-name prefix. Default: `improve`. 
*/\n branchPrefix?: string\n /** Test seam — defaults to a real `git` runner. */\n git?: (args: string[], cwd: string) => string\n}\n\nfunction defaultGit(args: string[], cwd: string): string {\n try {\n return execFileSync('git', args, { cwd, encoding: 'utf8' }).trim()\n } catch (err) {\n const stderr =\n err && typeof err === 'object' && 'stderr' in err\n ? String((err as { stderr: unknown }).stderr)\n : ''\n throw new WorktreeAdapterError(`git ${args.join(' ')} failed: ${stderr || String(err)}`, err)\n }\n}\n\n/** Slugify a label into a branch-safe segment. */\nfunction slug(label: string): string {\n return (\n label\n .toLowerCase()\n .replace(/[^a-z0-9]+/g, '-')\n .replace(/^-+|-+$/g, '')\n .slice(0, 48) || 'candidate'\n )\n}\n\nexport function gitWorktreeAdapter(opts: GitWorktreeAdapterOptions): WorktreeAdapter {\n const git = opts.git ?? defaultGit\n const worktreeDir = opts.worktreeDir ?? join(opts.repoRoot, '.worktrees')\n const branchPrefix = opts.branchPrefix ?? 'improve'\n\n return {\n async create({ baseRef, label }) {\n const id = `${slug(label)}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 6)}`\n const branch = `${branchPrefix}/${id}`\n const path = join(worktreeDir, id)\n git(['worktree', 'add', '-b', branch, path, baseRef], opts.repoRoot)\n return { path, branch, baseRef }\n },\n\n async finalize(worktree, summary) {\n // Stage + commit any pending changes the agent left in the worktree.\n // A no-op commit is refused by git, so only commit when the tree is dirty.\n const status = git(['status', '--porcelain'], worktree.path)\n if (status.length > 0) {\n git(['add', '-A'], worktree.path)\n git(['commit', '-m', summary], worktree.path)\n }\n return {\n kind: 'code',\n worktreeRef: worktree.path,\n baseRef: worktree.baseRef,\n summary,\n }\n },\n\n async discard(worktree) {\n // Remove the worktree, then delete its branch. 
Force-remove because the\n // worktree may hold uncommitted experiment state we're discarding.\n git(['worktree', 'remove', '--force', worktree.path], opts.repoRoot)\n git(['branch', '-D', worktree.branch], opts.repoRoot)\n },\n }\n}\n\n/** Resolve a `CodeSurface`'s worktreeRef to a directory the measurement can\n * run the worker in. A path ref is returned as-is; anything else is treated\n * as a ref under the adapter's worktree dir. */\nexport function resolveWorktreePath(surface: CodeSurface, worktreeDir?: string): string {\n if (isAbsolute(surface.worktreeRef) && existsSync(surface.worktreeRef)) return surface.worktreeRef\n if (worktreeDir) return join(worktreeDir, basename(surface.worktreeRef))\n return surface.worktreeRef\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AA6BA,SAAS,kBAAkB;AAC3B,SAAS,YAAY,WAAW,cAAc,qBAAqB;AACnE,SAAS,YAAY;AAqBd,IAAM,4BAAN,cAAwC,MAAM;AAAA,EACnD,YACkB,MAChB,SACA;AACA,UAAM,OAAO;AAHG;AAIhB,SAAK,OAAO;AAAA,EACd;AAAA,EALkB;AAMpB;AAQO,IAAM,yBAAN,MAA6D;AAAA,EAIlE,YAA6B,SAAwC;AAAxC;AAC3B,QAAI,CAAC,WAAW,QAAQ,IAAI,EAAG,WAAU,QAAQ,MAAM,EAAE,WAAW,KAAK,CAAC;AAC1E,SAAK,MAAM,QAAQ,OAAO,KAAK;AAAA,EACjC;AAAA,EAH6B;AAAA,EAHZ;AAAA,EACA,aAAa,oBAAI,IAA4B;AAAA,EAO9D,MAAM,QAAQ,OAA4C;AACxD,SAAK,iBAAiB,KAAK;AAC3B,SAAK,gBAAgB,KAAK;AAC1B,UAAM,SAAS,KAAK,SAAS,KAAK;AAClC,UAAM,OAAO,KAAK,cAAc,MAAM,MAAM;AAC5C,UAAM,OAAO,GAAG,KAAK,UAAU,MAAM,CAAC;AAAA;AAItC,eAAW,MAAM,IAAI;AAAA,EACvB;AAAA,EAEA,MAAM,OAAO,MAAmE;AAC9E,QAAI,CAAC,KAAK,OAAO;AACf,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,KAAK,gBAAgB;AACxB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AAEA,UAAM,MAA+B,CAAC;AACtC,eAAW,UAAU,aAAa;AAGhC,UAAI,KAAK,UAAU,WAAW,WAAW,oBAAoB;AAC3D,cAAM,WAAW,qBAAqB,KAAK,QAAQ,QAAQ,kBAAkB;AAC7E,YAAI,CAAC,SAAU;AAAA,MACjB;AACA,YAAM,OAAO,KAAK,cAAc,MAAM;AACtC,UAAI,CAAC,WAAW,IAAI,EAAG;AACvB,YAAM,QAAQ,aAAa,MAAM,MAAM,EAAE,MAAM,IAAI,EAAE,OAAO,OAAO;AACnE,iBAAW,QAAQ,OAAO;AACxB,YAAI;AACJ,YAAI;AACF,mBAAS,KAAK,MAAM,IAAI;AAAA,QAC1B,QAAQ;AACN;AAAA,QACF;AACA,YA
AI,CAAC,cAAc,QAAQ,MAAM,MAAM,EAAG;AAC1C,YAAI,KAAK,MAAM;AAAA,MACjB;AAAA,IACF;AAGA,QAAI,KAAK,CAAC,GAAG,MAAM;AACjB,UAAI,EAAE,eAAe,EAAE,WAAY,QAAO,EAAE,WAAW,cAAc,EAAE,UAAU;AACjF,aAAO,EAAE,WAAW,cAAc,EAAE,UAAU;AAAA,IAChD,CAAC;AAED,WAAO,IAAI,MAAM,GAAG,KAAK,KAAK;AAAA,EAChC;AAAA,EAEA,MAAM,OAKH;AACD,UAAM,WAAmC,CAAC;AAC1C,UAAM,UAAsC;AAAA,MAC1C,YAAY;AAAA,MACZ,mBAAmB;AAAA,MACnB,eAAe;AAAA,IACjB;AACA,QAAI,QAAQ;AACZ,eAAW,UAAU,aAAa;AAChC,YAAM,OAAO,KAAK,cAAc,MAAM;AACtC,UAAI,CAAC,WAAW,IAAI,GAAG;AACrB,iBAAS,MAAM,IAAI;AACnB;AAAA,MACF;AACA,YAAM,QAAQ,aAAa,MAAM,MAAM,EAAE,MAAM,IAAI,EAAE,OAAO,OAAO;AACnE,eAAS,MAAM,IAAI,MAAM;AACzB,eAAS,MAAM;AACf,iBAAW,QAAQ,OAAO;AACxB,YAAI,QAAoB;AACxB,YAAI;AACF,kBAAS,KAAK,MAAM,IAAI,EAA4B,cAAc;AAAA,QACpE,QAAQ;AAAA,QAER;AACA,gBAAQ,KAAK,KAAK;AAAA,MACpB;AAAA,IACF;AAIA,WAAO,EAAE,OAAO,OAAO,MAAM,OAAO,UAAU,QAAQ;AAAA,EACxD;AAAA,EAEQ,iBAAiB,OAAmC;AAC1D,QAAI,CAAC,MAAM,QAAQ;AACjB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,qBAAqB,MAAM,kBAAkB,WAAW,GAAG;AACpE,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,YAAY;AACrB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,iBAAiB;AAC1B,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,YAAY,SAAS,MAAM,MAAM,GAAG;AACvC,YAAM,IAAI;AAAA,QACR;AAAA,QACA,+CAA+C,YAAY,KAAK,IAAI,CAAC;AAAA,MACvE;AAAA,IACF;AAAA,EACF;AAAA,EAEQ,gBAAgB,OAAmC;AACzD,UAAM,MAAM,KAAK,QAAQ;AACzB,QAAI,CAAC,OAAO,CAAC,MAAM,gBAAiB;AACpC,UAAM,MAAM,KAAK,IAAI;AACrB,UAAM,WAAW;AACjB,QAAI,QAAQ,KAAK,WAAW,IAAI,MAAM,eAAe;AACrD,QAAI,CAAC,SAAS,MAAM,MAAM,iBAAiB,UAAU;AACnD,cAAQ,EAAE,QAAQ,MAAM,iBAAiB,eAAe,KAAK,OAAO,EAAE;AACtE,WAAK,WAAW,IAAI,MAAM,iBAAiB,KAAK;AAAA,IAClD;AACA,QAAI,MAAM,SAAS,KAAK;AACtB,YAAM,IAAI;AAAA,QACR;AAAA,QACA,gCAAgC,MAAM,eAAe,aAAa,GAAG;AAAA,MACvE;AAAA,IACF;AACA,UAAM,SAAS;AAAA,EACjB;AAAA,EAEQ,SAAS,OAAoD;AACnE,UAAM,aAAa;AAAA,MACjB,KAAK,UAAU;AAAA,QACb,IAAI,MAAM,SAAS;AAAA,QACnB,KAAK,MAAM;AAAA,QACX,IAAI,MAAM;AAAA,QACV,KAAK,MAAM;AAAA,MACb,CAAC;AAAA,IACH;AAGA,WAAO;AAAA,MACL,GAAG;AAAA,MACH;AAAA,MACA
,OAAO;AAAA,IACT;AAAA,EACF;AAAA,EAEQ,cAAc,QAAwB;AAC5C,WAAO,KAAK,KAAK,QAAQ,MAAM,GAAG,MAAM,QAAQ;AAAA,EAClD;AACF;AAEA,IAAM,cAAgD;AAAA,EACpD;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAEA,SAAS,qBACP,QACA,QACS;AACT,MAAI,CAAC,OAAQ,QAAO;AACpB,MAAI,MAAM,QAAQ,MAAM,EAAG,QAAO,OAAO,SAAS,MAAM;AACxD,SAAO,WAAW;AACpB;AAEA,SAAS,cACP,QACA,MACA,QACS;AAET,MAAI,KAAK,UAAU,WAAW,OAAO,cAAc,KAAK,eAAgB,QAAO;AAC/E,MAAI,KAAK,UAAU,UAAU,OAAO,aAAa,KAAK,eAAgB,QAAO;AAE7E,QAAM,IAAI,KAAK;AACf,MAAI,CAAC,EAAG,QAAO;AACf,MAAI,EAAE,QAAQ,OAAO,SAAS,SAAS,EAAE,KAAM,QAAO;AACtD,MAAI,EAAE,QAAQ;AACZ,UAAM,UAAU,MAAM,QAAQ,EAAE,MAAM,IAAI,EAAE,SAAS,CAAC,EAAE,MAAM;AAC9D,QAAI,CAAC,QAAQ,SAAS,MAAe,EAAG,QAAO;AAAA,EACjD;AACA,MAAI,EAAE,iBAAiB,UAAa,EAAE,iBAAiB,QAAW;AAChE,UAAM,aAAa,OAAO,OAAO,OAAO,WAAW,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS;AAC3E,UAAM,MAAM,WAAW,WAAW,IAAI,IAAI,KAAK,IAAI,GAAG,UAAU;AAChE,QAAI,EAAE,iBAAiB,UAAa,MAAM,EAAE,aAAc,QAAO;AACjE,QAAI,EAAE,iBAAiB,UAAa,MAAM,EAAE,aAAc,QAAO;AAAA,EACnE;AACA,MAAI,EAAE,aAAa,UAAa,eAAe,OAAO,UAAU,IAAI,eAAe,EAAE,QAAQ,GAAG;AAC9F,WAAO;AAAA,EACT;AACA,SAAO;AACT;AAEA,SAAS,OAAO,OAAuB;AACrC,SAAO,WAAW,QAAQ,EAAE,OAAO,KAAK,EAAE,OAAO,KAAK,EAAE,MAAM,GAAG,EAAE;AACrE;AAEA,SAAS,WAAW,MAAc,MAAoB;AACpD,MAAI,WAAW,IAAI,GAAG;AACpB,UAAM,WAAW,aAAa,MAAM,MAAM;AAC1C,kBAAc,MAAM,WAAW,IAAI;AAAA,EACrC,OAAO;AACL,kBAAc,MAAM,IAAI;AAAA,EAC1B;AACF;;;AC5QA,SAAS,cAAAA,mBAAkB;AAC3B,SAAS,QAAAC,aAAY;AA6Bd,IAAM,qBAAN,cAAiC,eAAe;AAAA,EACrD,YAAY,SAAiB;AAC3B,UAAM,kBAAkB,OAAO;AAAA,EACjC;AACF;AAyGA,SAAS,SAAS,IAAoB;AACpC,SAAO,GAAG,QAAQ,mBAAmB,GAAG;AAC1C;AAEA,SAAS,IAAI,OAAwB;AACnC,SAAOC,YAAW,QAAQ,EAAE,OAAO,KAAK,UAAU,KAAK,CAAC,EAAE,OAAO,KAAK;AACxE;AAEA,SAAS,KAAK,IAAsB;AAClC,SAAO,GAAG,WAAW,IAAI,IAAI,GAAG,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,GAAG;AAClE;AAEA,SAAS,cAAc,MAA2C;AAChE,QAAM,aAAa,OAAO,OAAO,KAAK,WAAW,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS;AACzE,SAAO,WAAW,WAAW,IAAI,IAAI,KAAK,UAAU;AACtD;AAaA,SAAS,eAA0B,MAA6C;AAC9E,QAAM,EAAE,MAAM,SAAS,aAAa,YAAY,cAAc,UAAU,WAAW,SAAS,IAC1F;AACF,QAAM,YAAY,cAAc,IAAI;AAGpC,QAAM,MAA8B,EAAE,UAAU;AAChD,QAAM,WAAmD,CAAC;A
AC1D,QAAM,WAAqC,CAAC;AAC5C,QAAM,QAAkB,CAAC;AACzB,aAAW,CAAC,WAAW,EAAE,KAAK,OAAO,QAAQ,KAAK,WAAW,GAAG;AAC9D,aAAS,SAAS,IAAI,EAAE,GAAG,GAAG,WAAW;AACzC,eAAW,CAAC,KAAK,KAAK,KAAK,OAAO,QAAQ,GAAG,UAAU,GAAG;AACxD,UAAI,GAAG,SAAS,IAAI,GAAG,EAAE,IAAI;AAC5B,OAAC,SAAS,GAAG,MAAM,CAAC,GAAG,KAAK,KAAK;AAAA,IACpC;AACA,QAAI,GAAG,MAAO,OAAM,KAAK,GAAG,SAAS,KAAK,GAAG,KAAK,EAAE;AAAA,EACtD;AACA,QAAM,aAAqC,CAAC;AAC5C,aAAW,CAAC,KAAK,MAAM,KAAK,OAAO,QAAQ,QAAQ,EAAG,YAAW,GAAG,IAAI,KAAK,MAAM;AAEnF,QAAM,UACJ,aAAa,YAAY,EAAE,cAAc,WAAW,IAAI,IAAI,EAAE,aAAa,WAAW,IAAI;AAC5F,MAAI,OAAO,KAAK,QAAQ,EAAE,SAAS,GAAG;AACpC,YAAQ,cAAc;AAAA,MACpB;AAAA,MACA;AAAA,MACA;AAAA,MACA,GAAI,MAAM,SAAS,IAAI,EAAE,OAAO,MAAM,KAAK,KAAK,EAAE,IAAI,CAAC;AAAA,IACzD;AAAA,EACF;AAEA,SAAO;AAAA,IACL,OAAO,GAAG,QAAQ,IAAI,QAAQ,EAAE,IAAI,KAAK,MAAM;AAAA,IAC/C;AAAA,IACA,aAAa,QAAQ;AAAA,IACrB,MAAM,KAAK;AAAA,IACX,OAAO,QAAQ;AAAA,IACf,YAAY;AAAA,IACZ;AAAA,IACA;AAAA,IACA,QAAQ,KAAK;AAAA,IACb,SAAS,KAAK;AAAA,IACd,YAAY,KAAK;AAAA,IACjB;AAAA,IACA;AAAA,IACA,YAAY,KAAK;AAAA,IACjB,GAAI,KAAK,QAAQ,EAAE,aAAa,KAAK,MAAM,IAAI,CAAC;AAAA,EAClD;AACF;AAEA,eAAsB,iBACpB,MACuD;AACvD,MAAI,KAAK,SAAS,WAAW,EAAG,OAAM,IAAI,mBAAmB,4BAA4B;AACzF,MAAI,KAAK,UAAU,WAAW,EAAG,OAAM,IAAI,mBAAmB,6BAA6B;AAE3F,QAAM,WAAW,KAAK,YAAY;AAClC,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,WAAW,KAAK,YAAY;AAClC,QAAM,gBAAgB,KAAK,aAAa;AACxC,QAAM,aAAa,KAAK,SAAS,IAAI,CAAC,MAAM,EAAE,EAAE;AAChD,QAAM,eACJ,KAAK,gBACL,MAAM,IAAI,EAAE,YAAY,WAAW,KAAK,UAAU,IAAI,CAAC,MAAM,EAAE,EAAE,EAAE,CAAC,EAAE,MAAM,GAAG,EAAE,CAAC;AACpF,QAAM,WAAW,OAAO,IAAI,EAAE,cAAc,YAAY,MAAM,SAAS,CAAC,EAAE,MAAM,GAAG,EAAE,CAAC;AAMtF,aAAW,WAAW,KAAK,UAAU;AACnC,UAAM,cAAc,iBAAiB,OAAO;AAC5C,QAAI;AACF,wBAAkB;AAAA,QAChB,OAAO,GAAG,QAAQ,IAAI,QAAQ,EAAE;AAAA,QAChC;AAAA,QACA,aAAa,QAAQ;AAAA,QACrB;AAAA,QACA,OAAO,QAAQ;AAAA,QACf,YAAY;AAAA,QACZ,YAAY;AAAA,QACZ,WAAW,KAAK;AAAA,QAChB,QAAQ;AAAA,QACR,SAAS;AAAA,QACT,YAAY,EAAE,OAAO,GAAG,QAAQ,EAAE;AAAA,QAClC,SACE,aAAa,YAAY,EAAE,cAAc,GAAG,KAAK,CAAC,EAAE,IAAI,EAAE,aAAa,GAAG,KAAK,CAAC,EAAE;AAAA,QACpF;AAAA,MACF,CAAC;AAAA,IACH,SAAS,KAAK;AACZ,YAAM,I
AAI;AAAA,QACR,YAAY,QAAQ,EAAE,wBAAwB,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,MAChG;AAAA,IACF;AAAA,EACF;AAEA,QAAM,UAAuB,CAAC;AAC9B,QAAM,YAAkE,CAAC;AACzE,QAAM,YAA4C,CAAC;AAEnD,aAAW,WAAW,KAAK,UAAU;AACnC,UAAM,cAAc,iBAAiB,OAAO;AAC5C,UAAM,aAAa,IAAI;AAAA,MACrB,SAAS;AAAA,MACT,SAAS,KAAK,UAAU,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,IAAI;AAAA,MAC7C;AAAA,MACA;AAAA,IACF,CAAC;AAID,UAAM,WAAW,CAAC,UAAqB,QACrC,KAAK,SAAS,SAAS,UAAU,GAAG;AACtC,WAAO,eAAe,UAAU,QAAQ,EAAE,OAAO,WAAW,SAAS,QAAQ,EAAE,CAAC,GAAG,CAAC;AAEpF,UAAM,WAAW,MAAM,YAAkC;AAAA,MACvD,WAAW,KAAK;AAAA,MAChB;AAAA,MACA,QAAQ,KAAK;AAAA,MACb;AAAA,MACA,MAAM,KAAK;AAAA,MACX,gBAAgB,KAAK;AAAA,MACrB,aAAa,KAAK;AAAA,MAClB,cAAc,KAAK;AAAA,MACnB,eAAe,KAAK;AAAA,MACpB,SAAS,KAAK;AAAA,MACd,KAAK,KAAK;AAAA,MACV,QAAQC,MAAK,KAAK,QAAQ,SAAS,QAAQ,EAAE,CAAC;AAAA,IAChD,CAAC;AACD,cAAU,QAAQ,EAAE,IAAI;AAExB,UAAM,iBAA8B,CAAC;AACrC,eAAW,QAAQ,SAAS,OAAO;AACjC,YAAM,SAAS,eAAe;AAAA,QAC5B;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA,WAAW,KAAK;AAAA,QAChB;AAAA,MACF,CAAC;AACD,UAAI,SAAU,mBAAkB,MAAM;AACtC,qBAAe,KAAK,MAAM;AAC1B,cAAQ,KAAK,MAAM;AAAA,IACrB;AAEA,cAAU,QAAQ,EAAE,IAAI;AAAA,MACtB,WAAW,QAAQ;AAAA,MACnB;AAAA,MACA,OAAO,QAAQ;AAAA,MACf,SAAS,eAAe;AAAA,MACxB,eAAe,KAAK,eAAe,IAAI,WAAW,CAAC;AAAA,MACnD,cAAc,eAAe,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,SAAS,CAAC;AAAA,MAC9D,WAAW,0BAA0B,cAAc;AAAA,IACrD;AAAA,EACF;AAGA,QAAM,YAAY,0BAA0B,OAAO;AACnD,MAAI,kBAAkB,UAAU;AAC9B,sBAAkB,SAAS,EAAE,YAAY,KAAK,cAAc,KAAK,CAAC;AAAA,EACpE,WAAW,kBAAkB,UAAU,UAAU,YAAY,QAAQ;AAEnE,YAAQ;AAAA,MACN,yCAAyC,UAAU,OAAO,WAAM,UAAU,SAAS;AAAA,IACrF;AAAA,EACF;AAGA,QAAM,aAAa,OAAO,SAAS,CAAC,MAAM,EAAE,UAAU;AACtD,QAAM,YAAY,KAAK,YACnB,gBAAgB,SAAS,KAAK,WAAW,KAAK,SAAS,IACvD;AAEJ,SAAO,EAAE,UAAU,cAAc,SAAS,WAAW,YAAY,WAAW,WAAW,UAAU;AACnG;AAGA,SAAS,YAAY,GAAsB;AACzC,SAAO,EAAE,QAAQ,gBAAgB,EAAE,QAAQ,eAAe;AAC5D;AAEA,SAAS,OACP,SACA,OACgC;AAChC,QAAM,SAAS,oBAAI,IAAsB;AACzC,aAAW,KAAK,SAAS;AACvB,UAAM,MAAM,MAAM,CAAC;AACnB,QAAI,QAAQ,OAAW;AACvB,UAAM,MAAM,OAAO,IAAI,GAAG,KAAK,CAAC;AAChC,QAAI,KAAK,YAAY,CAAC,CAAC;AACvB,WAAO,IAAI,KAAK,GAAG;AAAA,EACr
B;AACA,QAAM,MAAsC,CAAC;AAC7C,aAAW,CAAC,KAAK,EAAE,KAAK,OAAQ,KAAI,GAAG,IAAI,EAAE,eAAe,KAAK,EAAE,GAAG,GAAG,GAAG,OAAO;AACnF,SAAO;AACT;AAEA,SAAS,gBACP,SACA,WACA,WACgC;AAChC,QAAM,sBAAsB,oBAAI,IAAoB;AACpD,aAAW,KAAK,UAAW,qBAAoB,IAAI,EAAE,IAAI,UAAU,CAAC,CAAC;AACrE,SAAO,OAAO,SAAS,CAAC,MAAO,EAAE,aAAa,oBAAoB,IAAI,EAAE,UAAU,IAAI,MAAU;AAClG;;;AC/YA,SAAS,oBAAoB;AAC7B,SAAS,cAAAC,mBAAkB;AAC3B,SAAS,UAAU,YAAY,QAAAC,aAAY;AAuBpC,IAAM,uBAAN,cAAmC,MAAM;AAAA,EAC9C,YACE,SACS,OACT;AACA,UAAM,OAAO;AAFJ;AAGT,SAAK,OAAO;AAAA,EACd;AAAA,EAJW;AAKb;AAaA,SAAS,WAAW,MAAgB,KAAqB;AACvD,MAAI;AACF,WAAO,aAAa,OAAO,MAAM,EAAE,KAAK,UAAU,OAAO,CAAC,EAAE,KAAK;AAAA,EACnE,SAAS,KAAK;AACZ,UAAM,SACJ,OAAO,OAAO,QAAQ,YAAY,YAAY,MAC1C,OAAQ,IAA4B,MAAM,IAC1C;AACN,UAAM,IAAI,qBAAqB,OAAO,KAAK,KAAK,GAAG,CAAC,YAAY,UAAU,OAAO,GAAG,CAAC,IAAI,GAAG;AAAA,EAC9F;AACF;AAGA,SAAS,KAAK,OAAuB;AACnC,SACE,MACG,YAAY,EACZ,QAAQ,eAAe,GAAG,EAC1B,QAAQ,YAAY,EAAE,EACtB,MAAM,GAAG,EAAE,KAAK;AAEvB;AAEO,SAAS,mBAAmB,MAAkD;AACnF,QAAM,MAAM,KAAK,OAAO;AACxB,QAAM,cAAc,KAAK,eAAeA,MAAK,KAAK,UAAU,YAAY;AACxE,QAAM,eAAe,KAAK,gBAAgB;AAE1C,SAAO;AAAA,IACL,MAAM,OAAO,EAAE,SAAS,MAAM,GAAG;AAC/B,YAAM,KAAK,GAAG,KAAK,KAAK,CAAC,IAAI,KAAK,IAAI,EAAE,SAAS,EAAE,CAAC,IAAI,KAAK,OAAO,EAAE,SAAS,EAAE,EAAE,MAAM,GAAG,CAAC,CAAC;AAC9F,YAAM,SAAS,GAAG,YAAY,IAAI,EAAE;AACpC,YAAM,OAAOA,MAAK,aAAa,EAAE;AACjC,UAAI,CAAC,YAAY,OAAO,MAAM,QAAQ,MAAM,OAAO,GAAG,KAAK,QAAQ;AACnE,aAAO,EAAE,MAAM,QAAQ,QAAQ;AAAA,IACjC;AAAA,IAEA,MAAM,SAAS,UAAU,SAAS;AAGhC,YAAM,SAAS,IAAI,CAAC,UAAU,aAAa,GAAG,SAAS,IAAI;AAC3D,UAAI,OAAO,SAAS,GAAG;AACrB,YAAI,CAAC,OAAO,IAAI,GAAG,SAAS,IAAI;AAChC,YAAI,CAAC,UAAU,MAAM,OAAO,GAAG,SAAS,IAAI;AAAA,MAC9C;AACA,aAAO;AAAA,QACL,MAAM;AAAA,QACN,aAAa,SAAS;AAAA,QACtB,SAAS,SAAS;AAAA,QAClB;AAAA,MACF;AAAA,IACF;AAAA,IAEA,MAAM,QAAQ,UAAU;AAGtB,UAAI,CAAC,YAAY,UAAU,WAAW,SAAS,IAAI,GAAG,KAAK,QAAQ;AACnE,UAAI,CAAC,UAAU,MAAM,SAAS,MAAM,GAAG,KAAK,QAAQ;AAAA,IACtD;AAAA,EACF;AACF;AAKO,SAAS,oBAAoB,SAAsB,aAA8B;AACtF,MAAI,WAAW,QAAQ,WAAW,KAAKD,YAAW,QAAQ,WAAW,EAAG,QAAO,QAAQ;AACvF,MAAI,YAAa,QAAOC,MAAK,aAAa,SAAS,QAAQ,WAAW,CAAC;AACvE
,SAAO,QAAQ;AACjB;","names":["createHash","join","createHash","join","existsSync","join"]}
|
|
@@ -1,3 +1,6 @@
|
|
|
1
|
+
import {
|
|
2
|
+
BackendIntegrityError
|
|
3
|
+
} from "./chunk-E22YUOAL.js";
|
|
1
4
|
import {
|
|
2
5
|
confidenceInterval
|
|
3
6
|
} from "./chunk-ITBRCT73.js";
|
|
@@ -111,6 +114,7 @@ async function runCampaign(opts) {
|
|
|
111
114
|
signal: abortController.signal
|
|
112
115
|
});
|
|
113
116
|
cellsRef.push(result.cell);
|
|
117
|
+
enforceCellUsage(result.cell, opts.expectUsage ?? "warn");
|
|
114
118
|
totalCostUsd += result.cell.costUsd;
|
|
115
119
|
Object.assign(artifactsByPath, result.artifactsByPath);
|
|
116
120
|
if (opts.costCeiling !== void 0 && totalCostUsd >= opts.costCeiling) {
|
|
@@ -261,6 +265,28 @@ async function executeCell(args) {
|
|
|
261
265
|
}
|
|
262
266
|
return { cell, artifactsByPath };
|
|
263
267
|
}
|
|
268
|
+
function enforceCellUsage(cell, mode) {
|
|
269
|
+
if (mode === "off" || cell.error) return;
|
|
270
|
+
if (cell.artifact === null || cell.artifact === void 0) return;
|
|
271
|
+
const zeroTokens = cell.tokenUsage.input === 0 && cell.tokenUsage.output === 0;
|
|
272
|
+
if (cell.costUsd !== 0 || !zeroTokens) return;
|
|
273
|
+
const msg = `cell '${cell.cellId}' produced an artifact but reported zero cost and zero tokens \u2014 the dispatch never reported LLM usage via ctx.cost.observe/observeTokens (a stub cell)`;
|
|
274
|
+
if (mode === "assert") {
|
|
275
|
+
const report = {
|
|
276
|
+
totalRecords: 1,
|
|
277
|
+
stubRecords: 1,
|
|
278
|
+
realRecords: 0,
|
|
279
|
+
uncostedRecords: 0,
|
|
280
|
+
totalInputTokens: 0,
|
|
281
|
+
totalOutputTokens: 0,
|
|
282
|
+
totalCostUsd: 0,
|
|
283
|
+
verdict: "stub",
|
|
284
|
+
diagnosis: msg
|
|
285
|
+
};
|
|
286
|
+
throw new BackendIntegrityError(`expectUsage: ${msg}`, report);
|
|
287
|
+
}
|
|
288
|
+
console.warn(`[runCampaign] expectUsage: ${msg}`);
|
|
289
|
+
}
|
|
264
290
|
async function runJudgeCell(judge, input) {
|
|
265
291
|
return judge.score(input);
|
|
266
292
|
}
|
|
@@ -374,4 +400,4 @@ export {
|
|
|
374
400
|
inMemoryCampaignStorage,
|
|
375
401
|
runCampaign
|
|
376
402
|
};
|
|
377
|
-
//# sourceMappingURL=chunk-
|
|
403
|
+
//# sourceMappingURL=chunk-7TPYV2ER.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/campaign/run-campaign.ts","../src/campaign/storage.ts"],"sourcesContent":["/**\n * @experimental\n *\n * `runCampaign` — Pass A substrate primitive. ONE function that orchestrates\n * scenarios → dispatch → artifacts → judges → aggregates, with full\n * reproducibility (seed + manifest hash), cell-level resumability, bootstrap\n * CIs, and the `LabeledScenarioStore` capture flywheel.\n *\n * Improvement loops (optimizer / gate / autoOnPromote) ride on top of this\n * primitive but live in `presets/run-improvement-loop.ts`. This file keeps\n * the core orchestrator minimal — Phase 1 of the Pass A track.\n */\n\nimport { createHash } from 'node:crypto'\nimport { join } from 'node:path'\nimport { BackendIntegrityError, type BackendIntegrityReport } from '../integrity/backend-integrity'\nimport { confidenceInterval } from '../statistics'\nimport { type CampaignStorage, fsCampaignStorage } from './storage'\nimport type {\n CampaignAggregates,\n CampaignArtifactWriter,\n CampaignCellResult,\n CampaignCostMeter,\n CampaignResult,\n CampaignTokenUsage,\n CampaignTraceWriter,\n DispatchContext,\n DispatchFn,\n JudgeAggregate,\n JudgeConfig,\n JudgeScore,\n LabeledScenarioStore,\n Scenario,\n ScenarioAggregate,\n TraceSpan,\n} from './types'\n\nexport interface RunCampaignOptions<TScenario extends Scenario, TArtifact> {\n scenarios: TScenario[]\n dispatch: DispatchFn<TScenario, TArtifact>\n judges?: JudgeConfig<TArtifact, TScenario>[]\n /** Required for reproducibility. Default 42. */\n seed?: number\n /** Per-scenario replicates for CI bands. Default 1; raise to 5+ for\n * bootstrap-tight intervals on critical eval. */\n reps?: number\n /** When true (default), completed cells are cached by\n * (manifestHash, scenarioId, rep, generation). Re-runs skip cached cells. */\n resumable?: boolean\n /** Optional store — when present, every artifact + judge score is captured\n * with the configured `captureSource`. 
Capture is default ON; pass `'off'`\n * to disable. */\n labeledStore?: LabeledScenarioStore | 'off'\n captureSource?: 'production-trace' | 'eval-run' | 'manual' | 'red-team' | 'synthetic'\n captureSourceVersionHash?: string\n /** Wall-clock cost cap across all cells. Cells beyond ceiling are skipped. */\n costCeiling?: number\n /** Max concurrent cells. Default 2. */\n maxConcurrency?: number\n /** Required: where artifacts + traces land. */\n runDir: string\n /** Tracing posture. Default is the substrate's `FileSystemTraceStore` rooted\n * at `<runDir>/traces/`. `'off'` disables capture entirely — substrate\n * refuses this when the caller wires `autoOnPromote !== 'none'`. */\n tracing?: 'on' | 'off'\n /**\n * Per-cell usage expectation — the early, fine-grained sibling of the\n * batch `assertRealBackend` guard. A cell that produced an artifact (no\n * error) but reported `costUsd === 0` AND zero tokens is a stub: the\n * dispatch never reported LLM activity via `ctx.cost`. Modes:\n * - `'warn'` (default) — log the offending cell loudly, keep going.\n * - `'assert'` — throw `BackendIntegrityError` on the first such cell\n * (fail-fast; recommended for CI campaigns expecting real LLM calls).\n * - `'off'` — no check (replay / deterministic-only / offline analysis).\n */\n expectUsage?: 'assert' | 'warn' | 'off'\n /** Test seam — override the wall clock for deterministic tests. */\n now?: () => Date\n /** Test seam — override per-cell trace writer factory. */\n buildTraceWriter?: (cellId: string, dir: string) => CampaignTraceWriter\n /** Storage backend for run/cell dirs, the resumability cache, artifacts,\n * and trace spans. Default: the Node filesystem (`fsCampaignStorage`).\n * Pass `inMemoryCampaignStorage()` to run in a filesystem-less runtime\n * (Cloudflare Workers, Deno, edge) — the `CampaignResult` is still\n * produced; artifacts/traces just aren't persisted to disk. */\n storage?: CampaignStorage\n /**\n * Optional per-cell placement strategy. 
Returns an opaque string the\n * substrate forwards as `ctx.placement` to the Dispatch — placement-aware\n * Dispatches (e.g. `httpDispatch` from `/adapters/http`) use it to route\n * each cell to the right worker, region, or sandbox. When unset, every\n * cell receives `ctx.placement = undefined` and behaves identically to\n * the in-process case.\n *\n * @example\n * cellPlacement: ({ scenario }) => scenario.tags?.includes('eu') ? 'eu-west' : 'us-east'\n */\n cellPlacement?: (input: {\n scenario: TScenario\n rep: number\n generation?: number\n }) => string | undefined\n}\n\nexport async function runCampaign<TScenario extends Scenario, TArtifact>(\n opts: RunCampaignOptions<TScenario, TArtifact>,\n): Promise<CampaignResult<TArtifact, TScenario>> {\n const seed = opts.seed ?? 42\n const reps = opts.reps ?? 1\n const resumable = opts.resumable ?? true\n const maxConcurrency = opts.maxConcurrency ?? 2\n const now = opts.now ?? (() => new Date())\n const judges = opts.judges ?? []\n const storage = opts.storage ?? 
fsCampaignStorage()\n\n storage.ensureDir(opts.runDir)\n\n const manifestHash = computeManifestHash({\n scenarios: opts.scenarios,\n judges: judges as unknown as JudgeConfig<unknown>[],\n dispatchRef: opts.dispatch.name || 'anonymous',\n seed,\n reps,\n })\n\n const startedAt = now()\n const cells: CampaignCellResult<TArtifact>[] = []\n const artifactsByPath: Record<string, string> = {}\n\n // Build the cell schedule (scenario × rep).\n const schedule: Array<{ scenario: TScenario; rep: number; cellId: string; cellSeed: number }> = []\n let cellIndex = 0\n for (const scenario of opts.scenarios) {\n for (let rep = 0; rep < reps; rep++) {\n const cellId = `${scenario.id}:${rep}`\n const cellSeed = seed + cellIndex\n schedule.push({ scenario, rep, cellId, cellSeed })\n cellIndex += 1\n }\n }\n\n // Concurrency-limited execution.\n let totalCostUsd = 0\n let costCeilingReached = false\n const abortController = new AbortController()\n // Concurrency lanes that drain the cell schedule. Named \"lanes\" — not\n // \"workers\" — to avoid clashing with the taxonomy's worker (= the agent\n // harness in a sandbox, invoked behind `dispatch`). See loop-taxonomy.md.\n const lanes: Promise<void>[] = []\n let nextIdx = 0\n const cellsRef = cells\n\n for (let i = 0; i < maxConcurrency; i++) {\n lanes.push(\n (async () => {\n while (true) {\n const myIdx = nextIdx++\n if (myIdx >= schedule.length) return\n const slot = schedule[myIdx]!\n if (costCeilingReached) {\n cellsRef.push(skippedCell(slot, 'cost_ceiling_reached'))\n continue\n }\n const result = await executeCell({\n slot,\n opts,\n manifestHash,\n resumable,\n now,\n storage,\n buildTraceWriter: opts.buildTraceWriter ?? defaultBuildTraceWriter(storage),\n signal: abortController.signal,\n })\n cellsRef.push(result.cell)\n enforceCellUsage(result.cell, opts.expectUsage ?? 
'warn')\n totalCostUsd += result.cell.costUsd\n Object.assign(artifactsByPath, result.artifactsByPath)\n if (opts.costCeiling !== undefined && totalCostUsd >= opts.costCeiling) {\n costCeilingReached = true\n }\n // Capture into LabeledScenarioStore unless explicitly disabled.\n if (opts.labeledStore && opts.labeledStore !== 'off' && !result.cell.error) {\n await captureToStore({\n store: opts.labeledStore,\n cell: result.cell,\n scenario: slot.scenario,\n opts,\n now,\n }).catch((err) => {\n // Capture failures are non-fatal — log but don't crash the campaign.\n // (Trace would normally land here.)\n console.warn(\n `[runCampaign] capture failed for ${result.cell.cellId}: ${err instanceof Error ? err.message : String(err)}`,\n )\n })\n }\n }\n })(),\n )\n }\n await Promise.all(lanes)\n\n const endedAt = now()\n cellsRef.sort((a, b) => a.cellId.localeCompare(b.cellId))\n\n const aggregates = computeAggregates(\n cellsRef,\n judges as unknown as JudgeConfig<TArtifact>[],\n seed,\n )\n\n return {\n manifestHash,\n seed,\n startedAt: startedAt.toISOString(),\n endedAt: endedAt.toISOString(),\n durationMs: endedAt.getTime() - startedAt.getTime(),\n cells: cellsRef,\n aggregates,\n runDir: opts.runDir,\n artifactsByPath,\n scenarios: opts.scenarios.map((s) => ({ id: s.id, kind: s.kind })),\n }\n}\n\n// ── Internals ─────────────────────────────────────────────────────────\n\ninterface ExecuteCellArgs<TScenario extends Scenario, TArtifact> {\n slot: { scenario: TScenario; rep: number; cellId: string; cellSeed: number }\n opts: RunCampaignOptions<TScenario, TArtifact>\n manifestHash: string\n resumable: boolean\n now: () => Date\n storage: CampaignStorage\n buildTraceWriter: (cellId: string, dir: string) => CampaignTraceWriter\n signal: AbortSignal\n}\n\nasync function executeCell<TScenario extends Scenario, TArtifact>(\n args: ExecuteCellArgs<TScenario, TArtifact>,\n): Promise<{ cell: CampaignCellResult<TArtifact>; artifactsByPath: Record<string, string> }> {\n const 
storage = args.storage\n const cellDir = join(args.opts.runDir, args.slot.cellId.replace(/[^a-zA-Z0-9_-]/g, '_'))\n storage.ensureDir(cellDir)\n\n // Resumability: cache key = (manifestHash, scenarioId, rep)\n const cachePath = join(cellDir, 'cached-result.json')\n if (args.resumable) {\n const raw = storage.read(cachePath)\n if (raw !== undefined) {\n try {\n const cached = JSON.parse(raw) as CampaignCellResult<TArtifact>\n if (cached.cellId === args.slot.cellId) {\n return { cell: { ...cached, cached: true }, artifactsByPath: {} }\n }\n } catch {\n // Corrupt cache — fall through to re-run.\n }\n }\n }\n\n const startMs = Date.now()\n const trace = args.buildTraceWriter(args.slot.cellId, cellDir)\n const artifactsByPath: Record<string, string> = {}\n const artifacts: CampaignArtifactWriter = {\n async write(path, content) {\n const fullPath = join(cellDir, path)\n storage.ensureDir(join(fullPath, '..'))\n storage.write(fullPath, content)\n artifactsByPath[`${args.slot.cellId}/${path}`] = fullPath\n return fullPath\n },\n async writeJson(path, value) {\n return artifacts.write(path, JSON.stringify(value, null, 2))\n },\n }\n let costSoFar = 0\n const tokensSoFar: CampaignTokenUsage = { input: 0, output: 0 }\n const cost: CampaignCostMeter = {\n observe(amount, source) {\n costSoFar += amount\n trace.span(`cost.${source}`, { amountUsd: amount }).end()\n },\n observeTokens(usage) {\n tokensSoFar.input += usage.input\n tokensSoFar.output += usage.output\n if (usage.cached) tokensSoFar.cached = (tokensSoFar.cached ?? 
0) + usage.cached\n },\n current() {\n return costSoFar\n },\n tokens() {\n return { ...tokensSoFar }\n },\n }\n\n const placement = args.opts.cellPlacement?.({\n scenario: args.slot.scenario,\n rep: args.slot.rep,\n })\n\n const ctx: DispatchContext = {\n cellId: args.slot.cellId,\n rep: args.slot.rep,\n seed: args.slot.cellSeed,\n signal: args.signal,\n trace,\n artifacts,\n cost,\n placement,\n }\n\n let artifact: TArtifact | undefined\n let errorMessage: string | undefined\n try {\n artifact = await args.opts.dispatch(args.slot.scenario, ctx)\n } catch (err) {\n errorMessage = err instanceof Error ? err.message : String(err)\n }\n\n // Run judges (only if we have an artifact). A judge that throws invalidates\n // the cell — recorded as `error`, NOT folded into a fake composite:0 (a fake\n // zero is indistinguishable from a real zero and poisons every aggregate).\n const judgeScores: Record<string, JudgeScore> = {}\n if (artifact !== undefined) {\n for (const judge of args.opts.judges ?? []) {\n if (judge.appliesTo && !judge.appliesTo(args.slot.scenario)) continue\n try {\n judgeScores[judge.name] = await runJudgeCell(judge, {\n artifact,\n scenario: args.slot.scenario,\n signal: args.signal,\n })\n } catch (err) {\n errorMessage = `judge '${judge.name}' failed: ${err instanceof Error ? err.message : String(err)}`\n break\n }\n }\n }\n\n await trace.flush()\n\n const cell: CampaignCellResult<TArtifact> = {\n cellId: args.slot.cellId,\n scenarioId: args.slot.scenario.id,\n rep: args.slot.rep,\n artifact: (artifact ?? null) as TArtifact,\n judgeScores,\n costUsd: costSoFar,\n tokenUsage: { ...tokensSoFar },\n durationMs: Date.now() - startMs,\n seed: args.slot.cellSeed,\n cached: false,\n error: errorMessage,\n }\n\n if (!errorMessage && args.resumable) {\n storage.write(cachePath, JSON.stringify(cell))\n }\n\n return { cell, artifactsByPath }\n}\n\n/**\n * Per-cell stub guard. 
A cell that produced an artifact (no error) but reported\n * `costUsd === 0` AND zero tokens means the dispatch never called `ctx.cost` —\n * i.e. it ran against a stub or silently dropped its usage. `'warn'` logs it,\n * `'assert'` throws (fail-fast), `'off'` skips. An errored/skipped cell or a\n * deterministic judge-only run that genuinely made no LLM call is not flagged.\n */\nfunction enforceCellUsage<TArtifact>(\n cell: CampaignCellResult<TArtifact>,\n mode: 'assert' | 'warn' | 'off',\n): void {\n if (mode === 'off' || cell.error) return\n if (cell.artifact === null || cell.artifact === undefined) return\n const zeroTokens = cell.tokenUsage.input === 0 && cell.tokenUsage.output === 0\n if (cell.costUsd !== 0 || !zeroTokens) return\n const msg = `cell '${cell.cellId}' produced an artifact but reported zero cost and zero tokens — the dispatch never reported LLM usage via ctx.cost.observe/observeTokens (a stub cell)`\n if (mode === 'assert') {\n const report: BackendIntegrityReport = {\n totalRecords: 1,\n stubRecords: 1,\n realRecords: 0,\n uncostedRecords: 0,\n totalInputTokens: 0,\n totalOutputTokens: 0,\n totalCostUsd: 0,\n verdict: 'stub',\n diagnosis: msg,\n }\n throw new BackendIntegrityError(`expectUsage: ${msg}`, report)\n }\n // eslint-disable-next-line no-console\n console.warn(`[runCampaign] expectUsage: ${msg}`)\n}\n\nasync function runJudgeCell<TArtifact, TScenario extends Scenario>(\n judge: JudgeConfig<TArtifact, TScenario>,\n input: { artifact: TArtifact; scenario: TScenario; signal: AbortSignal },\n): Promise<JudgeScore> {\n return judge.score(input)\n}\n\nfunction defaultBuildTraceWriter(\n storage: CampaignStorage,\n): (cellId: string, dir: string) => CampaignTraceWriter {\n return (cellId, dir) => {\n const spans: Array<Record<string, unknown>> = []\n return {\n span(name, attributes) {\n const startMs = Date.now()\n const record: Record<string, unknown> = { name, cellId, startMs, ...(attributes ?? 
{}) }\n const finish: TraceSpan = {\n end(endAttrs) {\n record.durationMs = Date.now() - startMs\n if (endAttrs) Object.assign(record, endAttrs)\n spans.push(record)\n },\n setAttribute(key, value) {\n record[key] = value\n },\n }\n return finish\n },\n async flush() {\n storage.write(join(dir, 'spans.jsonl'), spans.map((s) => JSON.stringify(s)).join('\\n'))\n },\n }\n }\n}\n\nfunction skippedCell<TScenario extends Scenario, TArtifact>(\n slot: { scenario: TScenario; rep: number; cellId: string; cellSeed: number },\n reason: string,\n): CampaignCellResult<TArtifact> {\n return {\n cellId: slot.cellId,\n scenarioId: slot.scenario.id,\n rep: slot.rep,\n artifact: null as unknown as TArtifact,\n judgeScores: {},\n costUsd: 0,\n tokenUsage: { input: 0, output: 0 },\n durationMs: 0,\n seed: slot.cellSeed,\n cached: false,\n error: `skipped: ${reason}`,\n }\n}\n\ninterface CaptureArgs<TScenario extends Scenario, TArtifact> {\n store: LabeledScenarioStore\n cell: CampaignCellResult<TArtifact>\n scenario: TScenario\n opts: RunCampaignOptions<TScenario, TArtifact>\n now: () => Date\n}\n\nasync function captureToStore<TScenario extends Scenario, TArtifact>(\n args: CaptureArgs<TScenario, TArtifact>,\n): Promise<void> {\n await args.store.observe({\n scenario: args.scenario,\n artifact: args.cell.artifact,\n judgeScores: args.cell.judgeScores,\n source: args.opts.captureSource ?? 'eval-run',\n sourceVersionHash: args.opts.captureSourceVersionHash ?? 
'unknown',\n capturedAt: args.now().toISOString(),\n redactionStatus: 'raw',\n })\n}\n\n// ── Aggregates + manifest hash ────────────────────────────────────────\n\nfunction computeManifestHash(input: {\n scenarios: Scenario[]\n judges: JudgeConfig<unknown>[]\n dispatchRef: string\n seed: number\n reps: number\n}): string {\n const canonical = {\n scenarios: input.scenarios.map((s) => ({ id: s.id, kind: s.kind })),\n judges: input.judges.map((j) => ({ name: j.name, dims: j.dimensions.map((d) => d.key) })),\n dispatch: input.dispatchRef,\n seed: input.seed,\n reps: input.reps,\n }\n return createHash('sha256').update(JSON.stringify(canonical)).digest('hex')\n}\n\nfunction computeAggregates<TArtifact>(\n cells: CampaignCellResult<TArtifact>[],\n judges: JudgeConfig<TArtifact>[],\n seed: number,\n): CampaignAggregates {\n const byJudge: Record<string, JudgeAggregate> = {}\n for (const judge of judges) {\n const scores: number[] = []\n for (const cell of cells) {\n const s = cell.judgeScores[judge.name]\n if (s !== undefined) scores.push(s.composite)\n }\n byJudge[judge.name] = aggregate(scores, seed)\n }\n const byScenario: Record<string, ScenarioAggregate> = {}\n const scenarioGroups = new Map<string, number[]>()\n for (const cell of cells) {\n const composites = Object.values(cell.judgeScores).map((s) => s.composite)\n if (composites.length === 0) continue\n const mean = composites.reduce((a, b) => a + b, 0) / composites.length\n const arr = scenarioGroups.get(cell.scenarioId) ?? 
[]\n arr.push(mean)\n scenarioGroups.set(cell.scenarioId, arr)\n }\n for (const [scenarioId, samples] of scenarioGroups) {\n const ag = aggregate(samples, seed)\n byScenario[scenarioId] = { meanComposite: ag.mean, ci95: ag.ci95, n: ag.n }\n }\n return {\n byJudge,\n byScenario,\n totalCostUsd: cells.reduce((a, c) => a + c.costUsd, 0),\n cellsExecuted: cells.filter((c) => !c.error).length,\n cellsSkipped: cells.filter((c) => c.error?.startsWith('skipped:')).length,\n cellsCached: cells.filter((c) => c.cached).length,\n cellsFailed: cells.filter((c) => c.error && !c.error.startsWith('skipped:')).length,\n }\n}\n\n// Percentile bootstrap CI95 via seeded resampling. Deterministic for a given\n// seed — same campaign re-run produces identical CI bands. Falls back to\n// degenerate intervals at n<=1 (the bootstrap is undefined there).\nfunction aggregate(samples: number[], seed: number): JudgeAggregate {\n const n = samples.length\n if (n === 0) return { mean: 0, stdev: 0, ci95: [0, 0], n: 0 }\n const mean = samples.reduce((a, b) => a + b, 0) / n\n const variance = samples.reduce((a, b) => a + (b - mean) ** 2, 0) / Math.max(1, n - 1)\n const stdev = Math.sqrt(variance)\n const ci = confidenceInterval(samples, 0.95, { seed, resamples: 1000 })\n return { mean, stdev, ci95: [ci.lower, ci.upper], n }\n}\n","import { createRequire } from 'node:module'\n\n/**\n * @experimental\n *\n * `CampaignStorage` — the filesystem seam `runCampaign` writes through\n * (run/cell dirs, the resumability cache, per-cell artifacts, trace spans).\n *\n * The default (`fsCampaignStorage`) is the Node filesystem — identical\n * behavior to the inline `node:fs` calls it replaces, so existing CLI\n * consumers are unaffected. 
`inMemoryCampaignStorage` keeps everything in a\n * `Map`, so the substrate runs in environments WITHOUT a filesystem\n * (Cloudflare Workers, Deno Deploy, other edge runtimes) — the campaign\n * still produces its `CampaignResult` (cells + aggregates) in memory;\n * artifacts/traces simply aren't persisted to disk.\n *\n * Paths are opaque keys to the in-memory adapter — it does not parse them,\n * so the same `join(...)`-built paths work unchanged across both adapters.\n */\nexport interface CampaignStorage {\n /** Ensure a directory exists (recursive). No-op for in-memory. */\n ensureDir(dir: string): void\n /** Does this path exist (as a written file or an ensured dir)? */\n exists(path: string): boolean\n /** Read a UTF-8 file; `undefined` when missing or unreadable. */\n read(path: string): string | undefined\n /** Write a file (string or bytes). Parent dir is assumed ensured. */\n write(path: string, content: string | Uint8Array): void\n}\n\n/** Node-filesystem storage — the default. Lazily requires `node:fs` so the\n * module imports cleanly in non-Node runtimes (where the caller passes\n * `inMemoryCampaignStorage` instead and never constructs this).\n *\n * `createRequire(import.meta.url)` is the ESM-native lazy require — a bare\n * `require` is a ReferenceError under `\"type\": \"module\"`, which is exactly\n * the shape this package publishes. */\nexport function fsCampaignStorage(): CampaignStorage {\n const nodeRequire = createRequire(import.meta.url)\n const { existsSync, mkdirSync, readFileSync, writeFileSync } = nodeRequire(\n 'node:fs',\n ) as typeof import('node:fs')\n return {\n ensureDir(dir) {\n if (!existsSync(dir)) mkdirSync(dir, { recursive: true })\n },\n exists(path) {\n return existsSync(path)\n },\n read(path) {\n try {\n return readFileSync(path, 'utf8')\n } catch {\n return undefined\n }\n },\n write(path, content) {\n writeFileSync(path, content as Uint8Array)\n },\n }\n}\n\n/** In-memory storage for filesystem-less runtimes. 
Artifacts + trace spans\n * live in a `Map` for the duration of the run; the `CampaignResult` is\n * fully populated, but nothing is persisted to disk. */\nexport function inMemoryCampaignStorage(): CampaignStorage {\n const files = new Map<string, string | Uint8Array>()\n const dirs = new Set<string>()\n return {\n ensureDir(dir) {\n dirs.add(dir)\n },\n exists(path) {\n return files.has(path) || dirs.has(path)\n },\n read(path) {\n const value = files.get(path)\n if (value === undefined) return undefined\n return typeof value === 'string' ? value : new TextDecoder().decode(value)\n },\n write(path, content) {\n files.set(path, content)\n },\n }\n}\n"],"mappings":";;;;;;;;AAaA,SAAS,kBAAkB;AAC3B,SAAS,YAAY;;;ACdrB,SAAS,qBAAqB;AAqCvB,SAAS,oBAAqC;AACnD,QAAM,cAAc,cAAc,YAAY,GAAG;AACjD,QAAM,EAAE,YAAY,WAAW,cAAc,cAAc,IAAI;AAAA,IAC7D;AAAA,EACF;AACA,SAAO;AAAA,IACL,UAAU,KAAK;AACb,UAAI,CAAC,WAAW,GAAG,EAAG,WAAU,KAAK,EAAE,WAAW,KAAK,CAAC;AAAA,IAC1D;AAAA,IACA,OAAO,MAAM;AACX,aAAO,WAAW,IAAI;AAAA,IACxB;AAAA,IACA,KAAK,MAAM;AACT,UAAI;AACF,eAAO,aAAa,MAAM,MAAM;AAAA,MAClC,QAAQ;AACN,eAAO;AAAA,MACT;AAAA,IACF;AAAA,IACA,MAAM,MAAM,SAAS;AACnB,oBAAc,MAAM,OAAqB;AAAA,IAC3C;AAAA,EACF;AACF;AAKO,SAAS,0BAA2C;AACzD,QAAM,QAAQ,oBAAI,IAAiC;AACnD,QAAM,OAAO,oBAAI,IAAY;AAC7B,SAAO;AAAA,IACL,UAAU,KAAK;AACb,WAAK,IAAI,GAAG;AAAA,IACd;AAAA,IACA,OAAO,MAAM;AACX,aAAO,MAAM,IAAI,IAAI,KAAK,KAAK,IAAI,IAAI;AAAA,IACzC;AAAA,IACA,KAAK,MAAM;AACT,YAAM,QAAQ,MAAM,IAAI,IAAI;AAC5B,UAAI,UAAU,OAAW,QAAO;AAChC,aAAO,OAAO,UAAU,WAAW,QAAQ,IAAI,YAAY,EAAE,OAAO,KAAK;AAAA,IAC3E;AAAA,IACA,MAAM,MAAM,SAAS;AACnB,YAAM,IAAI,MAAM,OAAO;AAAA,IACzB;AAAA,EACF;AACF;;;ADoBA,eAAsB,YACpB,MAC+C;AAC/C,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,YAAY,KAAK,aAAa;AACpC,QAAM,iBAAiB,KAAK,kBAAkB;AAC9C,QAAM,MAAM,KAAK,QAAQ,MAAM,oBAAI,KAAK;AACxC,QAAM,SAAS,KAAK,UAAU,CAAC;AAC/B,QAAM,UAAU,KAAK,WAAW,kBAAkB;AAElD,UAAQ,UAAU,KAAK,MAAM;AAE7B,QAAM,eAAe,oBAAoB;AAAA,IACvC,WAAW,KAAK;AAAA,IAChB;AAAA,IACA,aAAa,KAAK,SAAS,QAAQ;AAAA,IACnC;AAAA,IACA;AAAA,EACF,CAAC;AAED,QAAM,
YAAY,IAAI;AACtB,QAAM,QAAyC,CAAC;AAChD,QAAM,kBAA0C,CAAC;AAGjD,QAAM,WAA0F,CAAC;AACjG,MAAI,YAAY;AAChB,aAAW,YAAY,KAAK,WAAW;AACrC,aAAS,MAAM,GAAG,MAAM,MAAM,OAAO;AACnC,YAAM,SAAS,GAAG,SAAS,EAAE,IAAI,GAAG;AACpC,YAAM,WAAW,OAAO;AACxB,eAAS,KAAK,EAAE,UAAU,KAAK,QAAQ,SAAS,CAAC;AACjD,mBAAa;AAAA,IACf;AAAA,EACF;AAGA,MAAI,eAAe;AACnB,MAAI,qBAAqB;AACzB,QAAM,kBAAkB,IAAI,gBAAgB;AAI5C,QAAM,QAAyB,CAAC;AAChC,MAAI,UAAU;AACd,QAAM,WAAW;AAEjB,WAAS,IAAI,GAAG,IAAI,gBAAgB,KAAK;AACvC,UAAM;AAAA,OACH,YAAY;AACX,eAAO,MAAM;AACX,gBAAM,QAAQ;AACd,cAAI,SAAS,SAAS,OAAQ;AAC9B,gBAAM,OAAO,SAAS,KAAK;AAC3B,cAAI,oBAAoB;AACtB,qBAAS,KAAK,YAAY,MAAM,sBAAsB,CAAC;AACvD;AAAA,UACF;AACA,gBAAM,SAAS,MAAM,YAAY;AAAA,YAC/B;AAAA,YACA;AAAA,YACA;AAAA,YACA;AAAA,YACA;AAAA,YACA;AAAA,YACA,kBAAkB,KAAK,oBAAoB,wBAAwB,OAAO;AAAA,YAC1E,QAAQ,gBAAgB;AAAA,UAC1B,CAAC;AACD,mBAAS,KAAK,OAAO,IAAI;AACzB,2BAAiB,OAAO,MAAM,KAAK,eAAe,MAAM;AACxD,0BAAgB,OAAO,KAAK;AAC5B,iBAAO,OAAO,iBAAiB,OAAO,eAAe;AACrD,cAAI,KAAK,gBAAgB,UAAa,gBAAgB,KAAK,aAAa;AACtE,iCAAqB;AAAA,UACvB;AAEA,cAAI,KAAK,gBAAgB,KAAK,iBAAiB,SAAS,CAAC,OAAO,KAAK,OAAO;AAC1E,kBAAM,eAAe;AAAA,cACnB,OAAO,KAAK;AAAA,cACZ,MAAM,OAAO;AAAA,cACb,UAAU,KAAK;AAAA,cACf;AAAA,cACA;AAAA,YACF,CAAC,EAAE,MAAM,CAAC,QAAQ;AAGhB,sBAAQ;AAAA,gBACN,oCAAoC,OAAO,KAAK,MAAM,KAAK,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,cAC7G;AAAA,YACF,CAAC;AAAA,UACH;AAAA,QACF;AAAA,MACF,GAAG;AAAA,IACL;AAAA,EACF;AACA,QAAM,QAAQ,IAAI,KAAK;AAEvB,QAAM,UAAU,IAAI;AACpB,WAAS,KAAK,CAAC,GAAG,MAAM,EAAE,OAAO,cAAc,EAAE,MAAM,CAAC;AAExD,QAAM,aAAa;AAAA,IACjB;AAAA,IACA;AAAA,IACA;AAAA,EACF;AAEA,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,WAAW,UAAU,YAAY;AAAA,IACjC,SAAS,QAAQ,YAAY;AAAA,IAC7B,YAAY,QAAQ,QAAQ,IAAI,UAAU,QAAQ;AAAA,IAClD,OAAO;AAAA,IACP;AAAA,IACA,QAAQ,KAAK;AAAA,IACb;AAAA,IACA,WAAW,KAAK,UAAU,IAAI,CAAC,OAAO,EAAE,IAAI,EAAE,IAAI,MAAM,EAAE,KAAK,EAAE;AAAA,EACnE;AACF;AAeA,eAAe,YACb,MAC2F;AAC3F,QAAM,UAAU,KAAK;AACrB,QAAM,UAAU,KAAK,KAAK,KAAK,QAAQ,KAAK,KAAK,OAAO,QAAQ,mBAAmB,GAAG,CAAC;AACvF,UAAQ,UAAU,OAAO;AAGzB,QAAM,YAAY,KAAK,SAAS,oBAAoB;AACpD,MAAI,KAAK,WAAW;AAClB,UAAM,MAAM,QAAQ,KAAK,
SAAS;AAClC,QAAI,QAAQ,QAAW;AACrB,UAAI;AACF,cAAM,SAAS,KAAK,MAAM,GAAG;AAC7B,YAAI,OAAO,WAAW,KAAK,KAAK,QAAQ;AACtC,iBAAO,EAAE,MAAM,EAAE,GAAG,QAAQ,QAAQ,KAAK,GAAG,iBAAiB,CAAC,EAAE;AAAA,QAClE;AAAA,MACF,QAAQ;AAAA,MAER;AAAA,IACF;AAAA,EACF;AAEA,QAAM,UAAU,KAAK,IAAI;AACzB,QAAM,QAAQ,KAAK,iBAAiB,KAAK,KAAK,QAAQ,OAAO;AAC7D,QAAM,kBAA0C,CAAC;AACjD,QAAM,YAAoC;AAAA,IACxC,MAAM,MAAM,MAAM,SAAS;AACzB,YAAM,WAAW,KAAK,SAAS,IAAI;AACnC,cAAQ,UAAU,KAAK,UAAU,IAAI,CAAC;AACtC,cAAQ,MAAM,UAAU,OAAO;AAC/B,sBAAgB,GAAG,KAAK,KAAK,MAAM,IAAI,IAAI,EAAE,IAAI;AACjD,aAAO;AAAA,IACT;AAAA,IACA,MAAM,UAAU,MAAM,OAAO;AAC3B,aAAO,UAAU,MAAM,MAAM,KAAK,UAAU,OAAO,MAAM,CAAC,CAAC;AAAA,IAC7D;AAAA,EACF;AACA,MAAI,YAAY;AAChB,QAAM,cAAkC,EAAE,OAAO,GAAG,QAAQ,EAAE;AAC9D,QAAM,OAA0B;AAAA,IAC9B,QAAQ,QAAQ,QAAQ;AACtB,mBAAa;AACb,YAAM,KAAK,QAAQ,MAAM,IAAI,EAAE,WAAW,OAAO,CAAC,EAAE,IAAI;AAAA,IAC1D;AAAA,IACA,cAAc,OAAO;AACnB,kBAAY,SAAS,MAAM;AAC3B,kBAAY,UAAU,MAAM;AAC5B,UAAI,MAAM,OAAQ,aAAY,UAAU,YAAY,UAAU,KAAK,MAAM;AAAA,IAC3E;AAAA,IACA,UAAU;AACR,aAAO;AAAA,IACT;AAAA,IACA,SAAS;AACP,aAAO,EAAE,GAAG,YAAY;AAAA,IAC1B;AAAA,EACF;AAEA,QAAM,YAAY,KAAK,KAAK,gBAAgB;AAAA,IAC1C,UAAU,KAAK,KAAK;AAAA,IACpB,KAAK,KAAK,KAAK;AAAA,EACjB,CAAC;AAED,QAAM,MAAuB;AAAA,IAC3B,QAAQ,KAAK,KAAK;AAAA,IAClB,KAAK,KAAK,KAAK;AAAA,IACf,MAAM,KAAK,KAAK;AAAA,IAChB,QAAQ,KAAK;AAAA,IACb;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AAEA,MAAI;AACJ,MAAI;AACJ,MAAI;AACF,eAAW,MAAM,KAAK,KAAK,SAAS,KAAK,KAAK,UAAU,GAAG;AAAA,EAC7D,SAAS,KAAK;AACZ,mBAAe,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAAA,EAChE;AAKA,QAAM,cAA0C,CAAC;AACjD,MAAI,aAAa,QAAW;AAC1B,eAAW,SAAS,KAAK,KAAK,UAAU,CAAC,GAAG;AAC1C,UAAI,MAAM,aAAa,CAAC,MAAM,UAAU,KAAK,KAAK,QAAQ,EAAG;AAC7D,UAAI;AACF,oBAAY,MAAM,IAAI,IAAI,MAAM,aAAa,OAAO;AAAA,UAClD;AAAA,UACA,UAAU,KAAK,KAAK;AAAA,UACpB,QAAQ,KAAK;AAAA,QACf,CAAC;AAAA,MACH,SAAS,KAAK;AACZ,uBAAe,UAAU,MAAM,IAAI,aAAa,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAChG;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,QAAM,MAAM,MAAM;AAElB,QAAM,OAAsC;AAAA,IAC1C,QAAQ,KAAK,KAAK;AAAA,IAClB,YAAY,KAAK,KAAK,SAAS;AAAA,IAC/B,KAAK,KAAK,KAAK;AAAA,IACf,UAAW,YAAY;AAAA,IA
CvB;AAAA,IACA,SAAS;AAAA,IACT,YAAY,EAAE,GAAG,YAAY;AAAA,IAC7B,YAAY,KAAK,IAAI,IAAI;AAAA,IACzB,MAAM,KAAK,KAAK;AAAA,IAChB,QAAQ;AAAA,IACR,OAAO;AAAA,EACT;AAEA,MAAI,CAAC,gBAAgB,KAAK,WAAW;AACnC,YAAQ,MAAM,WAAW,KAAK,UAAU,IAAI,CAAC;AAAA,EAC/C;AAEA,SAAO,EAAE,MAAM,gBAAgB;AACjC;AASA,SAAS,iBACP,MACA,MACM;AACN,MAAI,SAAS,SAAS,KAAK,MAAO;AAClC,MAAI,KAAK,aAAa,QAAQ,KAAK,aAAa,OAAW;AAC3D,QAAM,aAAa,KAAK,WAAW,UAAU,KAAK,KAAK,WAAW,WAAW;AAC7E,MAAI,KAAK,YAAY,KAAK,CAAC,WAAY;AACvC,QAAM,MAAM,SAAS,KAAK,MAAM;AAChC,MAAI,SAAS,UAAU;AACrB,UAAM,SAAiC;AAAA,MACrC,cAAc;AAAA,MACd,aAAa;AAAA,MACb,aAAa;AAAA,MACb,iBAAiB;AAAA,MACjB,kBAAkB;AAAA,MAClB,mBAAmB;AAAA,MACnB,cAAc;AAAA,MACd,SAAS;AAAA,MACT,WAAW;AAAA,IACb;AACA,UAAM,IAAI,sBAAsB,gBAAgB,GAAG,IAAI,MAAM;AAAA,EAC/D;AAEA,UAAQ,KAAK,8BAA8B,GAAG,EAAE;AAClD;AAEA,eAAe,aACb,OACA,OACqB;AACrB,SAAO,MAAM,MAAM,KAAK;AAC1B;AAEA,SAAS,wBACP,SACsD;AACtD,SAAO,CAAC,QAAQ,QAAQ;AACtB,UAAM,QAAwC,CAAC;AAC/C,WAAO;AAAA,MACL,KAAK,MAAM,YAAY;AACrB,cAAM,UAAU,KAAK,IAAI;AACzB,cAAM,SAAkC,EAAE,MAAM,QAAQ,SAAS,GAAI,cAAc,CAAC,EAAG;AACvF,cAAM,SAAoB;AAAA,UACxB,IAAI,UAAU;AACZ,mBAAO,aAAa,KAAK,IAAI,IAAI;AACjC,gBAAI,SAAU,QAAO,OAAO,QAAQ,QAAQ;AAC5C,kBAAM,KAAK,MAAM;AAAA,UACnB;AAAA,UACA,aAAa,KAAK,OAAO;AACvB,mBAAO,GAAG,IAAI;AAAA,UAChB;AAAA,QACF;AACA,eAAO;AAAA,MACT;AAAA,MACA,MAAM,QAAQ;AACZ,gBAAQ,MAAM,KAAK,KAAK,aAAa,GAAG,MAAM,IAAI,CAAC,MAAM,KAAK,UAAU,CAAC,CAAC,EAAE,KAAK,IAAI,CAAC;AAAA,MACxF;AAAA,IACF;AAAA,EACF;AACF;AAEA,SAAS,YACP,MACA,QAC+B;AAC/B,SAAO;AAAA,IACL,QAAQ,KAAK;AAAA,IACb,YAAY,KAAK,SAAS;AAAA,IAC1B,KAAK,KAAK;AAAA,IACV,UAAU;AAAA,IACV,aAAa,CAAC;AAAA,IACd,SAAS;AAAA,IACT,YAAY,EAAE,OAAO,GAAG,QAAQ,EAAE;AAAA,IAClC,YAAY;AAAA,IACZ,MAAM,KAAK;AAAA,IACX,QAAQ;AAAA,IACR,OAAO,YAAY,MAAM;AAAA,EAC3B;AACF;AAUA,eAAe,eACb,MACe;AACf,QAAM,KAAK,MAAM,QAAQ;AAAA,IACvB,UAAU,KAAK;AAAA,IACf,UAAU,KAAK,KAAK;AAAA,IACpB,aAAa,KAAK,KAAK;AAAA,IACvB,QAAQ,KAAK,KAAK,iBAAiB;AAAA,IACnC,mBAAmB,KAAK,KAAK,4BAA4B;AAAA,IACzD,YAAY,KAAK,IAAI,EAAE,YAAY;AAAA,IACnC,iBAAiB;AAAA,EACnB,CAAC;AACH;AAIA,SAAS,oBAAoB,OAMlB;AACT,QAAM,YAAY;AAAA,IAChB,WAAW,MAAM,UAAU,
IAAI,CAAC,OAAO,EAAE,IAAI,EAAE,IAAI,MAAM,EAAE,KAAK,EAAE;AAAA,IAClE,QAAQ,MAAM,OAAO,IAAI,CAAC,OAAO,EAAE,MAAM,EAAE,MAAM,MAAM,EAAE,WAAW,IAAI,CAAC,MAAM,EAAE,GAAG,EAAE,EAAE;AAAA,IACxF,UAAU,MAAM;AAAA,IAChB,MAAM,MAAM;AAAA,IACZ,MAAM,MAAM;AAAA,EACd;AACA,SAAO,WAAW,QAAQ,EAAE,OAAO,KAAK,UAAU,SAAS,CAAC,EAAE,OAAO,KAAK;AAC5E;AAEA,SAAS,kBACP,OACA,QACA,MACoB;AACpB,QAAM,UAA0C,CAAC;AACjD,aAAW,SAAS,QAAQ;AAC1B,UAAM,SAAmB,CAAC;AAC1B,eAAW,QAAQ,OAAO;AACxB,YAAM,IAAI,KAAK,YAAY,MAAM,IAAI;AACrC,UAAI,MAAM,OAAW,QAAO,KAAK,EAAE,SAAS;AAAA,IAC9C;AACA,YAAQ,MAAM,IAAI,IAAI,UAAU,QAAQ,IAAI;AAAA,EAC9C;AACA,QAAM,aAAgD,CAAC;AACvD,QAAM,iBAAiB,oBAAI,IAAsB;AACjD,aAAW,QAAQ,OAAO;AACxB,UAAM,aAAa,OAAO,OAAO,KAAK,WAAW,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS;AACzE,QAAI,WAAW,WAAW,EAAG;AAC7B,UAAM,OAAO,WAAW,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,WAAW;AAChE,UAAM,MAAM,eAAe,IAAI,KAAK,UAAU,KAAK,CAAC;AACpD,QAAI,KAAK,IAAI;AACb,mBAAe,IAAI,KAAK,YAAY,GAAG;AAAA,EACzC;AACA,aAAW,CAAC,YAAY,OAAO,KAAK,gBAAgB;AAClD,UAAM,KAAK,UAAU,SAAS,IAAI;AAClC,eAAW,UAAU,IAAI,EAAE,eAAe,GAAG,MAAM,MAAM,GAAG,MAAM,GAAG,GAAG,EAAE;AAAA,EAC5E;AACA,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,cAAc,MAAM,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,SAAS,CAAC;AAAA,IACrD,eAAe,MAAM,OAAO,CAAC,MAAM,CAAC,EAAE,KAAK,EAAE;AAAA,IAC7C,cAAc,MAAM,OAAO,CAAC,MAAM,EAAE,OAAO,WAAW,UAAU,CAAC,EAAE;AAAA,IACnE,aAAa,MAAM,OAAO,CAAC,MAAM,EAAE,MAAM,EAAE;AAAA,IAC3C,aAAa,MAAM,OAAO,CAAC,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,WAAW,UAAU,CAAC,EAAE;AAAA,EAC/E;AACF;AAKA,SAAS,UAAU,SAAmB,MAA8B;AAClE,QAAM,IAAI,QAAQ;AAClB,MAAI,MAAM,EAAG,QAAO,EAAE,MAAM,GAAG,OAAO,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,GAAG,EAAE;AAC5D,QAAM,OAAO,QAAQ,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI;AAClD,QAAM,WAAW,QAAQ,OAAO,CAAC,GAAG,MAAM,KAAK,IAAI,SAAS,GAAG,CAAC,IAAI,KAAK,IAAI,GAAG,IAAI,CAAC;AACrF,QAAM,QAAQ,KAAK,KAAK,QAAQ;AAChC,QAAM,KAAK,mBAAmB,SAAS,MAAM,EAAE,MAAM,WAAW,IAAK,CAAC;AACtE,SAAO,EAAE,MAAM,OAAO,MAAM,CAAC,GAAG,OAAO,GAAG,KAAK,GAAG,EAAE;AACtD;","names":[]}
|
|
@@ -1,13 +1,15 @@
|
|
|
1
1
|
import {
|
|
2
2
|
runCampaign
|
|
3
|
-
} from "./chunk-
|
|
3
|
+
} from "./chunk-7TPYV2ER.js";
|
|
4
4
|
import {
|
|
5
5
|
buildReflectionPrompt,
|
|
6
6
|
parseReflectionResponse,
|
|
7
7
|
runCanaries,
|
|
8
|
-
scoreRedTeamOutput
|
|
8
|
+
scoreRedTeamOutput
|
|
9
|
+
} from "./chunk-SS2SOBBT.js";
|
|
10
|
+
import {
|
|
9
11
|
summarizeBackendIntegrity
|
|
10
|
-
} from "./chunk-
|
|
12
|
+
} from "./chunk-E22YUOAL.js";
|
|
11
13
|
import {
|
|
12
14
|
detectRewardHacking
|
|
13
15
|
} from "./chunk-YV7J7X5N.js";
|
|
@@ -612,7 +614,7 @@ async function runImprovementLoop(opts) {
|
|
|
612
614
|
throw new Error("runImprovementLoop: autoOnPromote='pr' requires ghOwner + ghRepo.");
|
|
613
615
|
}
|
|
614
616
|
const optimization = await runOptimization(opts);
|
|
615
|
-
const { runCampaign: runCampaign2 } = await import("./run-campaign-
|
|
617
|
+
const { runCampaign: runCampaign2 } = await import("./run-campaign-5J3ED2UJ.js");
|
|
616
618
|
const baselineOnHoldout = await runCampaign2({
|
|
617
619
|
...opts,
|
|
618
620
|
scenarios: opts.holdoutScenarios,
|
|
@@ -923,4 +925,4 @@ export {
|
|
|
923
925
|
provenanceSpansPath,
|
|
924
926
|
emitLoopProvenance
|
|
925
927
|
};
|
|
926
|
-
//# sourceMappingURL=chunk-
|
|
928
|
+
//# sourceMappingURL=chunk-CV2BS2OV.js.map
|