@tangle-network/agent-eval 0.53.0 → 0.54.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/http.d.ts +1 -1
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/otel.d.ts +7 -6
- package/dist/{baseline-4R5deP0N.d.ts → baseline-DE36-Np7.d.ts} +1 -1
- package/dist/benchmarks/index.d.ts +3 -2
- package/dist/builder-eval/index.d.ts +4 -3
- package/dist/campaign/index.d.ts +9 -7
- package/dist/campaign/index.js +33 -4
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-L7XMNXLO.js → chunk-J4DIMSRK.js} +2 -2
- package/dist/{chunk-BWZEGTES.js → chunk-NCK5QLGT.js} +1 -1
- package/dist/chunk-NCK5QLGT.js.map +1 -0
- package/dist/{chunk-5KSDYBYH.js → chunk-YXTT6GSZ.js} +2 -2
- package/dist/contract/index.d.ts +13 -12
- package/dist/contract/index.js +25 -0
- package/dist/contract/index.js.map +1 -1
- package/dist/{control-ojEWkMfJ.d.ts → control-DjEgwWNo.d.ts} +6 -5
- package/dist/{control-runtime-BZ_lVLYW.d.ts → control-runtime-DuFBYg7A.d.ts} +3 -2
- package/dist/control.d.ts +7 -6
- package/dist/control.js +2 -2
- package/dist/{emitter-DP_cSSiw.d.ts → emitter-DEZwY14K.d.ts} +2 -1
- package/dist/{failure-cluster-Cw65_5FY.d.ts → failure-cluster-CL7IVgkJ.d.ts} +2 -1
- package/dist/{feedback-trajectory-BSxqEpu7.d.ts → feedback-trajectory-DpUmE90J.d.ts} +1 -1
- package/dist/governance/index.d.ts +3 -2
- package/dist/hosted/index.d.ts +7 -6
- package/dist/{index-C7RhhEME.d.ts → index-D2nT6_KT.d.ts} +20 -2
- package/dist/{index-0pu_fBwZ.d.ts → index-wlaiph9Y.d.ts} +1 -1
- package/dist/index.d.ts +31 -29
- package/dist/index.js +3 -3
- package/dist/{integrity-CTDhR1Sg.d.ts → integrity-CfXjSqEv.d.ts} +1 -1
- package/dist/knowledge/index.d.ts +4 -3
- package/dist/meta-eval/index.d.ts +4 -3
- package/dist/openapi.json +1 -1
- package/dist/pipelines/index.d.ts +7 -6
- package/dist/prm/index.d.ts +5 -4
- package/dist/{query-DODUYdPg.d.ts → query-CqTxMwDw.d.ts} +2 -1
- package/dist/{red-team-30II1T4o.d.ts → red-team-CrC5MZYd.d.ts} +1 -1
- package/dist/{registry-8KAs18kY.d.ts → registry-BSWy0rvH.d.ts} +1 -1
- package/dist/{release-report-DSu0DWy8.d.ts → release-report-B6l5fi7T.d.ts} +2 -2
- package/dist/reporting.d.ts +7 -6
- package/dist/{researcher-LZD0qHEa.d.ts → researcher-D4AZjxNa.d.ts} +5 -5
- package/dist/rl.d.ts +11 -10
- package/dist/rl.js +2 -2
- package/dist/{rubric-D5tjHNJQ.d.ts → rubric-BOfxn4ja.d.ts} +3 -2
- package/dist/{rubric-predictive-validity-ByZEC3BX.d.ts → rubric-predictive-validity-B3qNa4aY.d.ts} +1 -1
- package/dist/{run-improvement-loop-Cc7oZlRP.d.ts → run-improvement-loop-BhfdjrMY.d.ts} +3 -3
- package/dist/{run-record-BGY6bHRh.d.ts → run-record-etiCMsUq.d.ts} +11 -3
- package/dist/{store-Db2Bv8Cf.d.ts → schema-m0gsnbt3.d.ts} +1 -99
- package/dist/store-CKUAgsJz.d.ts +101 -0
- package/dist/{summary-report-B7gNRX-r.d.ts → summary-report-DLxh4yWk.d.ts} +2 -2
- package/dist/{test-graded-scenario-B2kWEdh9.d.ts → test-graded-scenario-BdVaPyHT.d.ts} +3 -2
- package/dist/traces.d.ts +7 -6
- package/dist/{trajectory-CnoBo-JY.d.ts → trajectory-GEdXJCL5.d.ts} +2 -1
- package/dist/{types-Dbj5gu8n.d.ts → types-BgrxOJSf.d.ts} +31 -1
- package/dist/wire/index.d.ts +5 -4
- package/docs/pilot/README.md +62 -0
- package/docs/pilot/customer-checklist.md +90 -0
- package/docs/pilot/integration-foreign-stack.md +296 -0
- package/docs/pilot/integration-tangle-stack.md +248 -0
- package/docs/pilot/one-pager.md +161 -0
- package/docs/pilot/sample-insight-report.json +172 -0
- package/docs/research/research-roadmap.md +204 -0
- package/package.json +1 -1
- package/dist/chunk-BWZEGTES.js.map +0 -1
- /package/dist/{chunk-L7XMNXLO.js.map → chunk-J4DIMSRK.js.map} +0 -0
- /package/dist/{chunk-5KSDYBYH.js.map → chunk-YXTT6GSZ.js.map} +0 -0
package/dist/adapters/http.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { S as Scenario, D as DispatchFn, b as DispatchContext } from '../types-
|
|
1
|
+
import { S as Scenario, D as DispatchFn, b as DispatchContext } from '../types-BgrxOJSf.js';
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
4
|
* # `@tangle-network/agent-eval/adapters/http` — distributed Dispatch over HTTP.
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { S as Scenario, J as JudgeScore, D as DispatchFn, a as JudgeConfig } from '../types-
|
|
1
|
+
import { S as Scenario, J as JudgeScore, D as DispatchFn, a as JudgeConfig } from '../types-BgrxOJSf.js';
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
4
|
* # `@tangle-network/agent-eval/adapters/langchain` — wrap any LangChain
|
package/dist/adapters/otel.d.ts
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
|
-
import { T as TraceSpanEvent, H as HostedClient } from '../index-
|
|
2
|
-
import '../types-
|
|
3
|
-
import '../summary-report-
|
|
4
|
-
import '../run-record-
|
|
1
|
+
import { T as TraceSpanEvent, H as HostedClient } from '../index-D2nT6_KT.js';
|
|
2
|
+
import '../types-BgrxOJSf.js';
|
|
3
|
+
import '../summary-report-DLxh4yWk.js';
|
|
4
|
+
import '../run-record-etiCMsUq.js';
|
|
5
5
|
import '../errors-mje_cKOs.js';
|
|
6
|
-
import '../
|
|
7
|
-
import '../
|
|
6
|
+
import '../schema-m0gsnbt3.js';
|
|
7
|
+
import '../failure-cluster-CL7IVgkJ.js';
|
|
8
|
+
import '../store-CKUAgsJz.js';
|
|
8
9
|
import '../judge-calibration-DilmB3Ml.js';
|
|
9
10
|
|
|
10
11
|
/**
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
-
export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as deterministicSplit, e as routing } from '../index-
|
|
2
|
-
import '../run-record-
|
|
1
|
+
export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as deterministicSplit, e as routing } from '../index-wlaiph9Y.js';
|
|
2
|
+
import '../run-record-etiCMsUq.js';
|
|
3
3
|
import '../errors-mje_cKOs.js';
|
|
4
|
+
import '../schema-m0gsnbt3.js';
|
|
@@ -1,6 +1,7 @@
|
|
|
1
|
-
import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult, T as TestGradedScenario, b as TestGradedRunResult } from '../test-graded-scenario-
|
|
2
|
-
import { T as TraceEmitter } from '../emitter-
|
|
3
|
-
import {
|
|
1
|
+
import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult, T as TestGradedScenario, b as TestGradedRunResult } from '../test-graded-scenario-BdVaPyHT.js';
|
|
2
|
+
import { T as TraceEmitter } from '../emitter-DEZwY14K.js';
|
|
3
|
+
import { R as Run } from '../schema-m0gsnbt3.js';
|
|
4
|
+
import { T as TraceStore } from '../store-CKUAgsJz.js';
|
|
4
5
|
|
|
5
6
|
/**
|
|
6
7
|
* BuilderSession — ties a builder-of-builders workflow together.
|
package/dist/campaign/index.d.ts
CHANGED
|
@@ -1,13 +1,14 @@
|
|
|
1
|
-
export { C as CampaignStorage, D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, G as GepaDriverConstraints, a as GepaDriverOptions, H as HeldOutGateOptions, O as OpenAutoPrOptions, b as OpenAutoPrResult, R as RunCampaignOptions, c as RunEvalOptions, d as RunImprovementLoopOptions, e as RunImprovementLoopResult, f as RunOptimizationOptions, g as RunOptimizationResult, h as composeGate, i as countSentenceEdits, j as defaultProductionGate, k as evolutionaryDriver, l as extractH2Sections, m as fsCampaignStorage, n as gepaDriver, o as heldOutGate, p as inMemoryCampaignStorage, q as openAutoPr, r as runCampaign, s as runEval, t as runImprovementLoop, u as runOptimization, v as surfaceHash } from '../run-improvement-loop-
|
|
2
|
-
import { L as LabeledScenarioStore, c as LabeledScenarioWrite, d as LabeledScenarioSampleArgs, e as LabeledScenarioRecord, C as CodeSurface } from '../types-
|
|
3
|
-
export {
|
|
1
|
+
export { C as CampaignStorage, D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, G as GepaDriverConstraints, a as GepaDriverOptions, H as HeldOutGateOptions, O as OpenAutoPrOptions, b as OpenAutoPrResult, R as RunCampaignOptions, c as RunEvalOptions, d as RunImprovementLoopOptions, e as RunImprovementLoopResult, f as RunOptimizationOptions, g as RunOptimizationResult, h as composeGate, i as countSentenceEdits, j as defaultProductionGate, k as evolutionaryDriver, l as extractH2Sections, m as fsCampaignStorage, n as gepaDriver, o as heldOutGate, p as inMemoryCampaignStorage, q as openAutoPr, r as runCampaign, s as runEval, t as runImprovementLoop, u as runOptimization, v as surfaceHash } from '../run-improvement-loop-BhfdjrMY.js';
|
|
2
|
+
import { L as LabeledScenarioStore, c as LabeledScenarioWrite, d as LabeledScenarioSampleArgs, e as LabeledScenarioRecord, f as LabelTrust, C as CodeSurface } from '../types-BgrxOJSf.js';
|
|
3
|
+
export { g as CampaignAggregates, h as CampaignArtifactWriter, i as CampaignCellResult, j as CampaignCostMeter, k as CampaignResult, l as CampaignTraceWriter, b as DispatchContext, D as DispatchFn, G as Gate, m as GateContext, n as GateDecision, o as GateResult, p as GenerationCandidate, q as GenerationRecord, I as ImprovementDriver, r as JudgeAggregate, a as JudgeConfig, s as JudgeDimension, J as JudgeScore, t as LabeledScenarioSource, M as MutableSurface, u as Mutator, O as OptimizerConfig, P as ProposeContext, R as RedactionStatus, S as Scenario, v as ScenarioAggregate, w as SessionScript, T as TraceSpan, x as labelTrustRank } from '../types-BgrxOJSf.js';
|
|
4
4
|
import '../llm-client-BXVRUZyX.js';
|
|
5
5
|
import '../errors-mje_cKOs.js';
|
|
6
6
|
import '../raw-provider-sink-C46HDghv.js';
|
|
7
|
-
import '../red-team-
|
|
7
|
+
import '../red-team-CrC5MZYd.js';
|
|
8
8
|
import '../dataset-BlwAtYYf.js';
|
|
9
|
-
import '../store-
|
|
10
|
-
import '../
|
|
9
|
+
import '../store-CKUAgsJz.js';
|
|
10
|
+
import '../schema-m0gsnbt3.js';
|
|
11
|
+
import '../run-record-etiCMsUq.js';
|
|
11
12
|
|
|
12
13
|
/**
|
|
13
14
|
* @experimental
|
|
@@ -62,6 +63,7 @@ declare class FsLabeledScenarioStore implements LabeledScenarioStore {
|
|
|
62
63
|
train: number;
|
|
63
64
|
test: number;
|
|
64
65
|
bySource: Record<string, number>;
|
|
66
|
+
byTrust: Record<LabelTrust, number>;
|
|
65
67
|
}>;
|
|
66
68
|
private assertProvenance;
|
|
67
69
|
private assertRateLimit;
|
|
@@ -124,4 +126,4 @@ declare function gitWorktreeAdapter(opts: GitWorktreeAdapterOptions): WorktreeAd
|
|
|
124
126
|
* as a ref under the adapter's worktree dir. */
|
|
125
127
|
declare function resolveWorktreePath(surface: CodeSurface, worktreeDir?: string): string;
|
|
126
128
|
|
|
127
|
-
export { CodeSurface, FsLabeledScenarioStore, type FsLabeledScenarioStoreOptions, type GitWorktreeAdapterOptions, LabeledScenarioRecord, LabeledScenarioSampleArgs, LabeledScenarioStore, LabeledScenarioStoreError, LabeledScenarioWrite, type Worktree, type WorktreeAdapter, WorktreeAdapterError, gitWorktreeAdapter, resolveWorktreePath };
|
|
129
|
+
export { CodeSurface, FsLabeledScenarioStore, type FsLabeledScenarioStoreOptions, type GitWorktreeAdapterOptions, LabelTrust, LabeledScenarioRecord, LabeledScenarioSampleArgs, LabeledScenarioStore, LabeledScenarioStoreError, LabeledScenarioWrite, type Worktree, type WorktreeAdapter, WorktreeAdapterError, gitWorktreeAdapter, resolveWorktreePath };
|
package/dist/campaign/index.js
CHANGED
|
@@ -30,6 +30,18 @@ import "../chunk-NSBPE2FW.js";
|
|
|
30
30
|
import { createHash } from "crypto";
|
|
31
31
|
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
32
32
|
import { join } from "path";
|
|
33
|
+
|
|
34
|
+
// src/campaign/types.ts
|
|
35
|
+
var LABEL_TRUST_RANK = {
|
|
36
|
+
unverified: 0,
|
|
37
|
+
"verified-signal": 1,
|
|
38
|
+
"human-rated": 2
|
|
39
|
+
};
|
|
40
|
+
function labelTrustRank(trust) {
|
|
41
|
+
return LABEL_TRUST_RANK[trust ?? "unverified"];
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
// src/campaign/labeled-store/fs-adapter.ts
|
|
33
45
|
var LabeledScenarioStoreError = class extends Error {
|
|
34
46
|
constructor(code, message) {
|
|
35
47
|
super(message);
|
|
@@ -97,6 +109,11 @@ var FsLabeledScenarioStore = class {
|
|
|
97
109
|
}
|
|
98
110
|
async size() {
|
|
99
111
|
const bySource = {};
|
|
112
|
+
const byTrust = {
|
|
113
|
+
unverified: 0,
|
|
114
|
+
"verified-signal": 0,
|
|
115
|
+
"human-rated": 0
|
|
116
|
+
};
|
|
100
117
|
let total = 0;
|
|
101
118
|
for (const source of ALL_SOURCES) {
|
|
102
119
|
const path = this.pathForSource(source);
|
|
@@ -104,11 +121,19 @@ var FsLabeledScenarioStore = class {
|
|
|
104
121
|
bySource[source] = 0;
|
|
105
122
|
continue;
|
|
106
123
|
}
|
|
107
|
-
const
|
|
108
|
-
bySource[source] =
|
|
109
|
-
total +=
|
|
124
|
+
const lines = readFileSync(path, "utf8").split("\n").filter(Boolean);
|
|
125
|
+
bySource[source] = lines.length;
|
|
126
|
+
total += lines.length;
|
|
127
|
+
for (const line of lines) {
|
|
128
|
+
let trust = "unverified";
|
|
129
|
+
try {
|
|
130
|
+
trust = JSON.parse(line).labelTrust ?? "unverified";
|
|
131
|
+
} catch {
|
|
132
|
+
}
|
|
133
|
+
byTrust[trust] += 1;
|
|
134
|
+
}
|
|
110
135
|
}
|
|
111
|
-
return { train: total, test: total, bySource };
|
|
136
|
+
return { train: total, test: total, bySource, byTrust };
|
|
112
137
|
}
|
|
113
138
|
assertProvenance(write) {
|
|
114
139
|
if (!write.source) {
|
|
@@ -207,6 +232,9 @@ function matchesFilter(record, args, source) {
|
|
|
207
232
|
if (f.minComposite !== void 0 && max < f.minComposite) return false;
|
|
208
233
|
if (f.maxComposite !== void 0 && max > f.maxComposite) return false;
|
|
209
234
|
}
|
|
235
|
+
if (f.minTrust !== void 0 && labelTrustRank(record.labelTrust) < labelTrustRank(f.minTrust)) {
|
|
236
|
+
return false;
|
|
237
|
+
}
|
|
210
238
|
return true;
|
|
211
239
|
}
|
|
212
240
|
function sha256(input) {
|
|
@@ -294,6 +322,7 @@ export {
|
|
|
294
322
|
gitWorktreeAdapter,
|
|
295
323
|
heldOutGate,
|
|
296
324
|
inMemoryCampaignStorage,
|
|
325
|
+
labelTrustRank,
|
|
297
326
|
openAutoPr,
|
|
298
327
|
resolveWorktreePath,
|
|
299
328
|
runCampaign,
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/campaign/labeled-store/fs-adapter.ts","../../src/campaign/worktree/index.ts"],"sourcesContent":["/**\n * @experimental\n *\n * Filesystem `LabeledScenarioStore` adapter. The default capture sink for\n * traces + eval artifacts. Production deployments typically swap for a\n * Turso/SQLite adapter (same interface).\n *\n * Records land as one JSONL file per source under `<root>/<source>.jsonl`.\n * Each line is a `LabeledScenarioRecord`. Append-only — no in-place edits.\n *\n * Safety properties enforced at write-time:\n *\n * - **Provenance required**: writes without `source`, `sourceVersionHash`,\n * `capturedAt`, `redactionStatus` are rejected. Closes the alignment\n * reviewer's data-poisoning gap.\n * - **Per-source rate limits**: optional `rateLimitBucket` + `maxWritesPerMinute`\n * stops a single tenant/source from flooding the store.\n *\n * Safety properties enforced at sample-time:\n *\n * - **Required split + capturedBefore**: substrate refuses to sample without\n * an explicit `split` ('train' | 'test') AND a temporal cutoff. Eliminates\n * accidental train/test contamination.\n * - **Default training-source filter**: when the store is sampled with\n * `split: 'train'`, production-trace records are EXCLUDED unless the\n * caller passes `filter.source: 'production-trace'` explicitly. Closes\n * the contamination-by-default gap flagged by the senior eval engineer.\n */\n\nimport { createHash } from 'node:crypto'\nimport { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs'\nimport { join } from 'node:path'\nimport type {\n LabeledScenarioRecord,\n LabeledScenarioSampleArgs,\n LabeledScenarioSource,\n LabeledScenarioStore,\n LabeledScenarioWrite,\n} from '../types'\n\nexport interface FsLabeledScenarioStoreOptions {\n /** Root directory for JSONL files. Created if missing. */\n root: string\n /** Per-source rate limit. When set, writes exceeding the cap are rejected\n * with a typed error. Default: no limit. */\n maxWritesPerMinutePerBucket?: number\n /** Test seam — override `Date.now()` for deterministic tests. */\n now?: () => number\n}\n\nexport class LabeledScenarioStoreError extends Error {\n constructor(\n public readonly code: string,\n message: string,\n ) {\n super(message)\n this.name = 'LabeledScenarioStoreError'\n }\n}\n\ninterface RateLimitState {\n bucket: string\n windowStartMs: number\n count: number\n}\n\nexport class FsLabeledScenarioStore implements LabeledScenarioStore {\n private readonly now: () => number\n private readonly rateLimits = new Map<string, RateLimitState>()\n\n constructor(private readonly options: FsLabeledScenarioStoreOptions) {\n if (!existsSync(options.root)) mkdirSync(options.root, { recursive: true })\n this.now = options.now ?? Date.now\n }\n\n async observe(write: LabeledScenarioWrite): Promise<void> {\n this.assertProvenance(write)\n this.assertRateLimit(write)\n const record = this.toRecord(write)\n const path = this.pathForSource(write.source)\n const line = `${JSON.stringify(record)}\\n`\n // Append atomically. For high-throughput a writev-friendly buffered\n // implementation lands in the Turso adapter; FS adapter is for tests +\n // local dev + small workloads.\n appendLine(path, line)\n }\n\n async sample(args: LabeledScenarioSampleArgs): Promise<LabeledScenarioRecord[]> {\n if (!args.split) {\n throw new LabeledScenarioStoreError(\n 'split_required',\n 'sample() requires an explicit `split` (train | test) — substrate refuses ambiguous reads',\n )\n }\n if (!args.capturedBefore) {\n throw new LabeledScenarioStoreError(\n 'capturedBefore_required',\n 'sample() requires an explicit `capturedBefore` timestamp for temporal-split discipline',\n )\n }\n\n const all: LabeledScenarioRecord[] = []\n for (const source of ALL_SOURCES) {\n // Default training-source filter: when sampling train, EXCLUDE\n // production-trace records unless the caller asks for them.\n if (args.split === 'train' && source === 'production-trace') {\n const explicit = sourceFilterContains(args.filter?.source, 'production-trace')\n if (!explicit) continue\n }\n const path = this.pathForSource(source)\n if (!existsSync(path)) continue\n const lines = readFileSync(path, 'utf8').split('\\n').filter(Boolean)\n for (const line of lines) {\n let record: LabeledScenarioRecord\n try {\n record = JSON.parse(line) as LabeledScenarioRecord\n } catch {\n continue\n }\n if (!matchesFilter(record, args, source)) continue\n all.push(record)\n }\n }\n\n // Deterministic order: by capturedAt ascending, then recordHash.\n all.sort((a, b) => {\n if (a.capturedAt !== b.capturedAt) return a.capturedAt.localeCompare(b.capturedAt)\n return a.recordHash.localeCompare(b.recordHash)\n })\n\n return all.slice(0, args.count)\n }\n\n async size(): Promise<{ train: number; test: number; bySource: Record<string, number> }> {\n const bySource: Record<string, number> = {}\n let total = 0\n for (const source of ALL_SOURCES) {\n const path = this.pathForSource(source)\n if (!existsSync(path)) {\n bySource[source] = 0\n continue\n }\n const count = readFileSync(path, 'utf8').split('\\n').filter(Boolean).length\n bySource[source] = count\n total += count\n }\n // FS adapter doesn't track split assignments per-record (split is\n // computed at sample-time based on `capturedBefore`). For size(), we\n // report `train`+`test` as the same total — split is a sampling concept.\n return { train: total, test: total, bySource }\n }\n\n private assertProvenance(write: LabeledScenarioWrite): void {\n if (!write.source) {\n throw new LabeledScenarioStoreError(\n 'missing_source',\n 'LabeledScenarioWrite requires `source`',\n )\n }\n if (!write.sourceVersionHash || write.sourceVersionHash.length === 0) {\n throw new LabeledScenarioStoreError(\n 'missing_source_version',\n 'LabeledScenarioWrite requires `sourceVersionHash` (git sha or substrate version)',\n )\n }\n if (!write.capturedAt) {\n throw new LabeledScenarioStoreError(\n 'missing_captured_at',\n 'LabeledScenarioWrite requires `capturedAt` ISO timestamp',\n )\n }\n if (!write.redactionStatus) {\n throw new LabeledScenarioStoreError(\n 'missing_redaction_status',\n 'LabeledScenarioWrite requires explicit `redactionStatus` — raw / redacted-pii / redacted-secrets / fully-redacted',\n )\n }\n if (!ALL_SOURCES.includes(write.source)) {\n throw new LabeledScenarioStoreError(\n 'unknown_source',\n `LabeledScenarioWrite.source must be one of: ${ALL_SOURCES.join(', ')}`,\n )\n }\n }\n\n private assertRateLimit(write: LabeledScenarioWrite): void {\n const cap = this.options.maxWritesPerMinutePerBucket\n if (!cap || !write.rateLimitBucket) return\n const now = this.now()\n const windowMs = 60_000\n let state = this.rateLimits.get(write.rateLimitBucket)\n if (!state || now - state.windowStartMs >= windowMs) {\n state = { bucket: write.rateLimitBucket, windowStartMs: now, count: 0 }\n this.rateLimits.set(write.rateLimitBucket, state)\n }\n if (state.count >= cap) {\n throw new LabeledScenarioStoreError(\n 'rate_limit_exceeded',\n `LabeledScenarioStore: bucket ${write.rateLimitBucket} exceeded ${cap} writes/min`,\n )\n }\n state.count += 1\n }\n\n private toRecord(write: LabeledScenarioWrite): LabeledScenarioRecord {\n const recordHash = sha256(\n JSON.stringify({\n id: write.scenario.id,\n src: write.source,\n at: write.capturedAt,\n ver: write.sourceVersionHash,\n }),\n )\n // FS adapter assigns split at sample-time, but we cache a hint here\n // based on capturedAt vs the world's \"now\" — sampler overrides this.\n return {\n ...write,\n recordHash,\n split: 'train',\n }\n }\n\n private pathForSource(source: string): string {\n return join(this.options.root, `${source}.jsonl`)\n }\n}\n\nconst ALL_SOURCES: LabeledScenarioWrite['source'][] = [\n 'production-trace',\n 'eval-run',\n 'manual',\n 'red-team',\n 'synthetic',\n]\n\nfunction sourceFilterContains(\n filter: LabeledScenarioSource | LabeledScenarioSource[] | undefined,\n needle: LabeledScenarioSource,\n): boolean {\n if (!filter) return false\n if (Array.isArray(filter)) return filter.includes(needle)\n return filter === needle\n}\n\nfunction matchesFilter(\n record: LabeledScenarioRecord,\n args: LabeledScenarioSampleArgs,\n source: string,\n): boolean {\n // Temporal cutoff — train must be capturedAt < capturedBefore.\n if (args.split === 'train' && record.capturedAt >= args.capturedBefore) return false\n if (args.split === 'test' && record.capturedAt < args.capturedBefore) return false\n\n const f = args.filter\n if (!f) return true\n if (f.kind && record.scenario.kind !== f.kind) return false\n if (f.source) {\n const sources = Array.isArray(f.source) ? f.source : [f.source]\n if (!sources.includes(source as never)) return false\n }\n if (f.minComposite !== undefined || f.maxComposite !== undefined) {\n const composites = Object.values(record.judgeScores).map((s) => s.composite)\n const max = composites.length === 0 ? 0 : Math.max(...composites)\n if (f.minComposite !== undefined && max < f.minComposite) return false\n if (f.maxComposite !== undefined && max > f.maxComposite) return false\n }\n return true\n}\n\nfunction sha256(input: string): string {\n return createHash('sha256').update(input).digest('hex').slice(0, 16)\n}\n\nfunction appendLine(path: string, line: string): void {\n if (existsSync(path)) {\n const existing = readFileSync(path, 'utf8')\n writeFileSync(path, existing + line)\n } else {\n writeFileSync(path, line)\n }\n}\n","/**\n * @experimental\n *\n * VCS-pluggable worktree adapter. One improvement = one worktree, PR-like\n * (multiple commits allowed). A code-tier driver's `propose()` creates a\n * worktree, an agent commits the change into it, and `finalize()` returns a\n * `CodeSurface{ worktreeRef }` the measurement checks out to run the worker\n * against the changed code. On promotion the worktree becomes the PR branch.\n *\n * The interface is VCS-agnostic so a future `jj` ([jj-vcs](https://github.com/jj-vcs/jj))\n * adapter can slot in without touching driver code. Only the git adapter\n * ships today. See `docs/design/self-improvement-engine.md`.\n */\n\nimport { execFileSync } from 'node:child_process'\nimport { existsSync } from 'node:fs'\nimport { basename, isAbsolute, join } from 'node:path'\nimport type { CodeSurface } from '../types'\n\nexport interface Worktree {\n /** Absolute path to the checked-out worktree directory. */\n path: string\n /** The branch the worktree is on (becomes the PR branch on promotion). */\n branch: string\n /** The ref the worktree was forked from. */\n baseRef: string\n}\n\nexport interface WorktreeAdapter {\n /** Create an isolated worktree on a fresh branch off `baseRef`. */\n create(opts: { baseRef: string; label: string }): Promise<Worktree>\n /** Commit any pending changes in the worktree, then return a CodeSurface\n * pointing at it. The agent has already written its change into\n * `worktree.path` by the time this is called. */\n finalize(worktree: Worktree, summary: string): Promise<CodeSurface>\n /** Remove the worktree (and its branch) — called for losing candidates. */\n discard(worktree: Worktree): Promise<void>\n}\n\nexport class WorktreeAdapterError extends Error {\n constructor(\n message: string,\n readonly cause?: unknown,\n ) {\n super(message)\n this.name = 'WorktreeAdapterError'\n }\n}\n\nexport interface GitWorktreeAdapterOptions {\n /** Repo root the worktrees fork from. */\n repoRoot: string\n /** Directory worktrees are created under. Default: `<repoRoot>/.worktrees`. */\n worktreeDir?: string\n /** Branch-name prefix. Default: `improve`. */\n branchPrefix?: string\n /** Test seam — defaults to a real `git` runner. */\n git?: (args: string[], cwd: string) => string\n}\n\nfunction defaultGit(args: string[], cwd: string): string {\n try {\n return execFileSync('git', args, { cwd, encoding: 'utf8' }).trim()\n } catch (err) {\n const stderr =\n err && typeof err === 'object' && 'stderr' in err\n ? String((err as { stderr: unknown }).stderr)\n : ''\n throw new WorktreeAdapterError(`git ${args.join(' ')} failed: ${stderr || String(err)}`, err)\n }\n}\n\n/** Slugify a label into a branch-safe segment. */\nfunction slug(label: string): string {\n return (\n label\n .toLowerCase()\n .replace(/[^a-z0-9]+/g, '-')\n .replace(/^-+|-+$/g, '')\n .slice(0, 48) || 'candidate'\n )\n}\n\nexport function gitWorktreeAdapter(opts: GitWorktreeAdapterOptions): WorktreeAdapter {\n const git = opts.git ?? defaultGit\n const worktreeDir = opts.worktreeDir ?? join(opts.repoRoot, '.worktrees')\n const branchPrefix = opts.branchPrefix ?? 'improve'\n\n return {\n async create({ baseRef, label }) {\n const id = `${slug(label)}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 6)}`\n const branch = `${branchPrefix}/${id}`\n const path = join(worktreeDir, id)\n git(['worktree', 'add', '-b', branch, path, baseRef], opts.repoRoot)\n return { path, branch, baseRef }\n },\n\n async finalize(worktree, summary) {\n // Stage + commit any pending changes the agent left in the worktree.\n // A no-op commit is refused by git, so only commit when the tree is dirty.\n const status = git(['status', '--porcelain'], worktree.path)\n if (status.length > 0) {\n git(['add', '-A'], worktree.path)\n git(['commit', '-m', summary], worktree.path)\n }\n return {\n kind: 'code',\n worktreeRef: worktree.path,\n baseRef: worktree.baseRef,\n summary,\n }\n },\n\n async discard(worktree) {\n // Remove the worktree, then delete its branch. Force-remove because the\n // worktree may hold uncommitted experiment state we're discarding.\n git(['worktree', 'remove', '--force', worktree.path], opts.repoRoot)\n git(['branch', '-D', worktree.branch], opts.repoRoot)\n },\n }\n}\n\n/** Resolve a `CodeSurface`'s worktreeRef to a directory the measurement can\n * run the worker in. A path ref is returned as-is; anything else is treated\n * as a ref under the adapter's worktree dir. */\nexport function resolveWorktreePath(surface: CodeSurface, worktreeDir?: string): string {\n if (isAbsolute(surface.worktreeRef) && existsSync(surface.worktreeRef)) return surface.worktreeRef\n if (worktreeDir) return join(worktreeDir, basename(surface.worktreeRef))\n return surface.worktreeRef\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;AA6BA,SAAS,kBAAkB;AAC3B,SAAS,YAAY,WAAW,cAAc,qBAAqB;AACnE,SAAS,YAAY;AAmBd,IAAM,4BAAN,cAAwC,MAAM;AAAA,EACnD,YACkB,MAChB,SACA;AACA,UAAM,OAAO;AAHG;AAIhB,SAAK,OAAO;AAAA,EACd;AAAA,EALkB;AAMpB;AAQO,IAAM,yBAAN,MAA6D;AAAA,EAIlE,YAA6B,SAAwC;AAAxC;AAC3B,QAAI,CAAC,WAAW,QAAQ,IAAI,EAAG,WAAU,QAAQ,MAAM,EAAE,WAAW,KAAK,CAAC;AAC1E,SAAK,MAAM,QAAQ,OAAO,KAAK;AAAA,EACjC;AAAA,EAH6B;AAAA,EAHZ;AAAA,EACA,aAAa,oBAAI,IAA4B;AAAA,EAO9D,MAAM,QAAQ,OAA4C;AACxD,SAAK,iBAAiB,KAAK;AAC3B,SAAK,gBAAgB,KAAK;AAC1B,UAAM,SAAS,KAAK,SAAS,KAAK;AAClC,UAAM,OAAO,KAAK,cAAc,MAAM,MAAM;AAC5C,UAAM,OAAO,GAAG,KAAK,UAAU,MAAM,CAAC;AAAA;AAItC,eAAW,MAAM,IAAI;AAAA,EACvB;AAAA,EAEA,MAAM,OAAO,MAAmE;AAC9E,QAAI,CAAC,KAAK,OAAO;AACf,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,KAAK,gBAAgB;AACxB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AAEA,UAAM,MAA+B,CAAC;AACtC,eAAW,UAAU,aAAa;AAGhC,UAAI,KAAK,UAAU,WAAW,WAAW,oBAAoB;AAC3D,cAAM,WAAW,qBAAqB,KAAK,QAAQ,QAAQ,kBAAkB;AAC7E,YAAI,CAAC,SAAU;AAAA,MACjB;AACA,YAAM,OAAO,KAAK,cAAc,MAAM;AACtC,UAAI,CAAC,WAAW,IAAI,EAAG;AACvB,YAAM,QAAQ,aAAa,MAAM,MAAM,EAAE,MAAM,IAAI,EAAE,OAAO,OAAO;AACnE,iBAAW,QAAQ,OAAO;AACxB,YAAI;AACJ,YAAI;AACF,mBAAS,KAAK,MAAM,IAAI;AAAA,QAC1B,QAAQ;AACN;AAAA,QACF;AACA,YAAI,CAAC,cAAc,QAAQ,MAAM,MAAM,EAAG;AAC1C,YAAI,KAAK,MAAM;AAAA,MACjB;AAAA,IACF;AAGA,QAAI,KAAK,CAAC,GAAG,MAAM;AACjB,UAAI,EAAE,eAAe,EAAE,WAAY,QAAO,EAAE,WAAW,cAAc,EAAE,UAAU;AACjF,aAAO,EAAE,WAAW,cAAc,EAAE,UAAU;AAAA,IAChD,CAAC;AAED,WAAO,IAAI,MAAM,GAAG,KAAK,KAAK;AAAA,EAChC;AAAA,EAEA,MAAM,OAAmF;AACvF,UAAM,WAAmC,CAAC;AAC1C,QAAI,QAAQ;AACZ,eAAW,UAAU,aAAa;AAChC,YAAM,OAAO,KAAK,cAAc,MAAM;AACtC,UAAI,CAAC,WAAW,IAAI,GAAG;AACrB,iBAAS,MAAM,IAAI;AACnB;AAAA,MACF;AACA,YAAM,QAAQ,aAAa,MAAM,MAAM,EAAE,MAAM,IAAI,EAAE,OAAO,OAAO,EAAE;AACrE,eAAS,MAAM,IAAI;AACnB,eAAS;AAAA,IACX;AAIA,WAAO,EAAE,OAAO,OAAO,MAAM,OAAO,SAAS;AAAA,EAC/C;AAAA,EAEQ,iBAAiB,OAAmC;AAC1D,QAAI,CAAC,MAAM,QAAQ;AACjB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,qBAAqB,MAAM,kBAAkB,WAAW,GAAG;AACpE,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,YAAY;AACrB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,iBAAiB;AAC1B,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,YAAY,SAAS,MAAM,MAAM,GAAG;AACvC,YAAM,IAAI;AAAA,QACR;AAAA,QACA,+CAA+C,YAAY,KAAK,IAAI,CAAC;AAAA,MACvE;AAAA,IACF;AAAA,EACF;AAAA,EAEQ,gBAAgB,OAAmC;AACzD,UAAM,MAAM,KAAK,QAAQ;AACzB,QAAI,CAAC,OAAO,CAAC,MAAM,gBAAiB;AACpC,UAAM,MAAM,KAAK,IAAI;AACrB,UAAM,WAAW;AACjB,QAAI,QAAQ,KAAK,WAAW,IAAI,MAAM,eAAe;AACrD,QAAI,CAAC,SAAS,MAAM,MAAM,iBAAiB,UAAU;AACnD,cAAQ,EAAE,QAAQ,MAAM,iBAAiB,eAAe,KAAK,OAAO,EAAE;AACtE,WAAK,WAAW,IAAI,MAAM,iBAAiB,KAAK;AAAA,IAClD;AACA,QAAI,MAAM,SAAS,KAAK;AACtB,YAAM,IAAI;AAAA,QACR;AAAA,QACA,gCAAgC,MAAM,eAAe,aAAa,GAAG;AAAA,MACvE;AAAA,IACF;AACA,UAAM,SAAS;AAAA,EACjB;AAAA,EAEQ,SAAS,OAAoD;AACnE,UAAM,aAAa;AAAA,MACjB,KAAK,UAAU;AAAA,QACb,IAAI,MAAM,SAAS;AAAA,QACnB,KAAK,MAAM;AAAA,QACX,IAAI,MAAM;AAAA,QACV,KAAK,MAAM;AAAA,MACb,CAAC;AAAA,IACH;AAGA,WAAO;AAAA,MACL,GAAG;AAAA,MACH;AAAA,MACA,OAAO;AAAA,IACT;AAAA,EACF;AAAA,EAEQ,cAAc,QAAwB;AAC5C,WAAO,KAAK,KAAK,QAAQ,MAAM,GAAG,MAAM,QAAQ;AAAA,EAClD;AACF;AAEA,IAAM,cAAgD;AAAA,EACpD;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAEA,SAAS,qBACP,QACA,QACS;AACT,MAAI,CAAC,OAAQ,QAAO;AACpB,MAAI,MAAM,QAAQ,MAAM,EAAG,QAAO,OAAO,SAAS,MAAM;AACxD,SAAO,WAAW;AACpB;AAEA,SAAS,cACP,QACA,MACA,QACS;AAET,MAAI,KAAK,UAAU,WAAW,OAAO,cAAc,KAAK,eAAgB,QAAO;AAC/E,MAAI,KAAK,UAAU,UAAU,OAAO,aAAa,KAAK,eAAgB,QAAO;AAE7E,QAAM,IAAI,KAAK;AACf,MAAI,CAAC,EAAG,QAAO;AACf,MAAI,EAAE,QAAQ,OAAO,SAAS,SAAS,EAAE,KAAM,QAAO;AACtD,MAAI,EAAE,QAAQ;AACZ,UAAM,UAAU,MAAM,QAAQ,EAAE,MAAM,IAAI,EAAE,SAAS,CAAC,EAAE,MAAM;AAC9D,QAAI,CAAC,QAAQ,SAAS,MAAe,EAAG,QAAO;AAAA,EACjD;AACA,MAAI,EAAE,iBAAiB,UAAa,EAAE,iBAAiB,QAAW;AAChE,UAAM,aAAa,OAAO,OAAO,OAAO,WAAW,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS;AAC3E,UAAM,MAAM,WAAW,WAAW,IAAI,IAAI,KAAK,IAAI,GAAG,UAAU;AAChE,QAAI,EAAE,iBAAiB,UAAa,MAAM,EAAE,aAAc,QAAO;AACjE,QAAI,EAAE,iBAAiB,UAAa,MAAM,EAAE,aAAc,QAAO;AAAA,EACnE;AACA,SAAO;AACT;AAEA,SAAS,OAAO,OAAuB;AACrC,SAAO,WAAW,QAAQ,EAAE,OAAO,KAAK,EAAE,OAAO,KAAK,EAAE,MAAM,GAAG,EAAE;AACrE;AAEA,SAAS,WAAW,MAAc,MAAoB;AACpD,MAAI,WAAW,IAAI,GAAG;AACpB,UAAM,WAAW,aAAa,MAAM,MAAM;AAC1C,kBAAc,MAAM,WAAW,IAAI;AAAA,EACrC,OAAO;AACL,kBAAc,MAAM,IAAI;AAAA,EAC1B;AACF;;;AC1QA,SAAS,oBAAoB;AAC7B,SAAS,cAAAA,mBAAkB;AAC3B,SAAS,UAAU,YAAY,QAAAC,aAAY;AAuBpC,IAAM,uBAAN,cAAmC,MAAM;AAAA,EAC9C,YACE,SACS,OACT;AACA,UAAM,OAAO;AAFJ;AAGT,SAAK,OAAO;AAAA,EACd;AAAA,EAJW;AAKb;AAaA,SAAS,WAAW,MAAgB,KAAqB;AACvD,MAAI;AACF,WAAO,aAAa,OAAO,MAAM,EAAE,KAAK,UAAU,OAAO,CAAC,EAAE,KAAK;AAAA,EACnE,SAAS,KAAK;AACZ,UAAM,SACJ,OAAO,OAAO,QAAQ,YAAY,YAAY,MAC1C,OAAQ,IAA4B,MAAM,IAC1C;AACN,UAAM,IAAI,qBAAqB,OAAO,KAAK,KAAK,GAAG,CAAC,YAAY,UAAU,OAAO,GAAG,CAAC,IAAI,GAAG;AAAA,EAC9F;AACF;AAGA,SAAS,KAAK,OAAuB;AACnC,SACE,MACG,YAAY,EACZ,QAAQ,eAAe,GAAG,EAC1B,QAAQ,YAAY,EAAE,EACtB,MAAM,GAAG,EAAE,KAAK;AAEvB;AAEO,SAAS,mBAAmB,MAAkD;AACnF,QAAM,MAAM,KAAK,OAAO;AACxB,QAAM,cAAc,KAAK,eAAeA,MAAK,KAAK,UAAU,YAAY;AACxE,QAAM,eAAe,KAAK,gBAAgB;AAE1C,SAAO;AAAA,IACL,MAAM,OAAO,EAAE,SAAS,MAAM,GAAG;AAC/B,YAAM,KAAK,GAAG,KAAK,KAAK,CAAC,IAAI,KAAK,IAAI,EAAE,SAAS,EAAE,CAAC,IAAI,KAAK,OAAO,EAAE,SAAS,EAAE,EAAE,MAAM,GAAG,CAAC,CAAC;AAC9F,YAAM,SAAS,GAAG,YAAY,IAAI,EAAE;AACpC,YAAM,OAAOA,MAAK,aAAa,EAAE;AACjC,UAAI,CAAC,YAAY,OAAO,MAAM,QAAQ,MAAM,OAAO,GAAG,KAAK,QAAQ;AACnE,aAAO,EAAE,MAAM,QAAQ,QAAQ;AAAA,IACjC;AAAA,IAEA,MAAM,SAAS,UAAU,SAAS;AAGhC,YAAM,SAAS,IAAI,CAAC,UAAU,aAAa,GAAG,SAAS,IAAI;AAC3D,UAAI,OAAO,SAAS,GAAG;AACrB,YAAI,CAAC,OAAO,IAAI,GAAG,SAAS,IAAI;AAChC,YAAI,CAAC,UAAU,MAAM,OAAO,GAAG,SAAS,IAAI;AAAA,MAC9C;AACA,aAAO;AAAA,QACL,MAAM;AAAA,QACN,aAAa,SAAS;AAAA,QACtB,SAAS,SAAS;AAAA,QAClB;AAAA,MACF;AAAA,IACF;AAAA,IAEA,MAAM,QAAQ,UAAU;AAGtB,UAAI,CAAC,YAAY,UAAU,WAAW,SAAS,IAAI,GAAG,KAAK,QAAQ;AACnE,UAAI,CAAC,UAAU,MAAM,SAAS,MAAM,GAAG,KAAK,QAAQ;AAAA,IACtD;AAAA,EACF;AACF;AAKO,SAAS,oBAAoB,SAAsB,aAA8B;AACtF,MAAI,WAAW,QAAQ,WAAW,KAAKD,YAAW,QAAQ,WAAW,EAAG,QAAO,QAAQ;AACvF,MAAI,YAAa,QAAOC,MAAK,aAAa,SAAS,QAAQ,WAAW,CAAC;AACvE,SAAO,QAAQ;AACjB;","names":["existsSync","join"]}
|
|
1
|
+
{"version":3,"sources":["../../src/campaign/labeled-store/fs-adapter.ts","../../src/campaign/types.ts","../../src/campaign/worktree/index.ts"],"sourcesContent":["/**\n * @experimental\n *\n * Filesystem `LabeledScenarioStore` adapter. The default capture sink for\n * traces + eval artifacts. Production deployments typically swap for a\n * Turso/SQLite adapter (same interface).\n *\n * Records land as one JSONL file per source under `<root>/<source>.jsonl`.\n * Each line is a `LabeledScenarioRecord`. Append-only — no in-place edits.\n *\n * Safety properties enforced at write-time:\n *\n * - **Provenance required**: writes without `source`, `sourceVersionHash`,\n * `capturedAt`, `redactionStatus` are rejected. Closes the alignment\n * reviewer's data-poisoning gap.\n * - **Per-source rate limits**: optional `rateLimitBucket` + `maxWritesPerMinute`\n * stops a single tenant/source from flooding the store.\n *\n * Safety properties enforced at sample-time:\n *\n * - **Required split + capturedBefore**: substrate refuses to sample without\n * an explicit `split` ('train' | 'test') AND a temporal cutoff. Eliminates\n * accidental train/test contamination.\n * - **Default training-source filter**: when the store is sampled with\n * `split: 'train'`, production-trace records are EXCLUDED unless the\n * caller passes `filter.source: 'production-trace'` explicitly. Closes\n * the contamination-by-default gap flagged by the senior eval engineer.\n */\n\nimport { createHash } from 'node:crypto'\nimport { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs'\nimport { join } from 'node:path'\nimport type {\n LabeledScenarioRecord,\n LabeledScenarioSampleArgs,\n LabeledScenarioSource,\n LabeledScenarioStore,\n LabeledScenarioWrite,\n LabelTrust,\n} from '../types'\nimport { labelTrustRank } from '../types'\n\nexport interface FsLabeledScenarioStoreOptions {\n /** Root directory for JSONL files. Created if missing. */\n root: string\n /** Per-source rate limit. When set, writes exceeding the cap are rejected\n * with a typed error. Default: no limit. */\n maxWritesPerMinutePerBucket?: number\n /** Test seam — override `Date.now()` for deterministic tests. */\n now?: () => number\n}\n\nexport class LabeledScenarioStoreError extends Error {\n constructor(\n public readonly code: string,\n message: string,\n ) {\n super(message)\n this.name = 'LabeledScenarioStoreError'\n }\n}\n\ninterface RateLimitState {\n bucket: string\n windowStartMs: number\n count: number\n}\n\nexport class FsLabeledScenarioStore implements LabeledScenarioStore {\n private readonly now: () => number\n private readonly rateLimits = new Map<string, RateLimitState>()\n\n constructor(private readonly options: FsLabeledScenarioStoreOptions) {\n if (!existsSync(options.root)) mkdirSync(options.root, { recursive: true })\n this.now = options.now ?? Date.now\n }\n\n async observe(write: LabeledScenarioWrite): Promise<void> {\n this.assertProvenance(write)\n this.assertRateLimit(write)\n const record = this.toRecord(write)\n const path = this.pathForSource(write.source)\n const line = `${JSON.stringify(record)}\\n`\n // Append atomically. For high-throughput a writev-friendly buffered\n // implementation lands in the Turso adapter; FS adapter is for tests +\n // local dev + small workloads.\n appendLine(path, line)\n }\n\n async sample(args: LabeledScenarioSampleArgs): Promise<LabeledScenarioRecord[]> {\n if (!args.split) {\n throw new LabeledScenarioStoreError(\n 'split_required',\n 'sample() requires an explicit `split` (train | test) — substrate refuses ambiguous reads',\n )\n }\n if (!args.capturedBefore) {\n throw new LabeledScenarioStoreError(\n 'capturedBefore_required',\n 'sample() requires an explicit `capturedBefore` timestamp for temporal-split discipline',\n )\n }\n\n const all: LabeledScenarioRecord[] = []\n for (const source of ALL_SOURCES) {\n // Default training-source filter: when sampling train, EXCLUDE\n // production-trace records unless the caller asks for them.\n if (args.split === 'train' && source === 'production-trace') {\n const explicit = sourceFilterContains(args.filter?.source, 'production-trace')\n if (!explicit) continue\n }\n const path = this.pathForSource(source)\n if (!existsSync(path)) continue\n const lines = readFileSync(path, 'utf8').split('\\n').filter(Boolean)\n for (const line of lines) {\n let record: LabeledScenarioRecord\n try {\n record = JSON.parse(line) as LabeledScenarioRecord\n } catch {\n continue\n }\n if (!matchesFilter(record, args, source)) continue\n all.push(record)\n }\n }\n\n // Deterministic order: by capturedAt ascending, then recordHash.\n all.sort((a, b) => {\n if (a.capturedAt !== b.capturedAt) return a.capturedAt.localeCompare(b.capturedAt)\n return a.recordHash.localeCompare(b.recordHash)\n })\n\n return all.slice(0, args.count)\n }\n\n async size(): Promise<{\n train: number\n test: number\n bySource: Record<string, number>\n byTrust: Record<LabelTrust, number>\n }> {\n const bySource: Record<string, number> = {}\n const byTrust: Record<LabelTrust, number> = {\n unverified: 0,\n 'verified-signal': 0,\n 'human-rated': 0,\n }\n let total = 0\n for (const source of ALL_SOURCES) {\n const path = this.pathForSource(source)\n if (!existsSync(path)) {\n bySource[source] = 0\n continue\n }\n const lines = readFileSync(path, 'utf8').split('\\n').filter(Boolean)\n bySource[source] = lines.length\n total += lines.length\n for (const line of lines) {\n let trust: LabelTrust = 'unverified'\n try {\n trust = (JSON.parse(line) as LabeledScenarioRecord).labelTrust ?? 'unverified'\n } catch {\n // A malformed line counts as unverified — never silently gold.\n }\n byTrust[trust] += 1\n }\n }\n // FS adapter doesn't track split assignments per-record (split is\n // computed at sample-time based on `capturedBefore`). For size(), we\n // report `train`+`test` as the same total — split is a sampling concept.\n return { train: total, test: total, bySource, byTrust }\n }\n\n private assertProvenance(write: LabeledScenarioWrite): void {\n if (!write.source) {\n throw new LabeledScenarioStoreError(\n 'missing_source',\n 'LabeledScenarioWrite requires `source`',\n )\n }\n if (!write.sourceVersionHash || write.sourceVersionHash.length === 0) {\n throw new LabeledScenarioStoreError(\n 'missing_source_version',\n 'LabeledScenarioWrite requires `sourceVersionHash` (git sha or substrate version)',\n )\n }\n if (!write.capturedAt) {\n throw new LabeledScenarioStoreError(\n 'missing_captured_at',\n 'LabeledScenarioWrite requires `capturedAt` ISO timestamp',\n )\n }\n if (!write.redactionStatus) {\n throw new LabeledScenarioStoreError(\n 'missing_redaction_status',\n 'LabeledScenarioWrite requires explicit `redactionStatus` — raw / redacted-pii / redacted-secrets / fully-redacted',\n )\n }\n if (!ALL_SOURCES.includes(write.source)) {\n throw new LabeledScenarioStoreError(\n 'unknown_source',\n `LabeledScenarioWrite.source must be one of: ${ALL_SOURCES.join(', ')}`,\n )\n }\n }\n\n private assertRateLimit(write: LabeledScenarioWrite): void {\n const cap = this.options.maxWritesPerMinutePerBucket\n if (!cap || !write.rateLimitBucket) return\n const now = this.now()\n const windowMs = 60_000\n let state = this.rateLimits.get(write.rateLimitBucket)\n if (!state || now - state.windowStartMs >= windowMs) {\n state = { bucket: write.rateLimitBucket, windowStartMs: now, count: 0 }\n this.rateLimits.set(write.rateLimitBucket, state)\n }\n if (state.count >= cap) {\n throw new LabeledScenarioStoreError(\n 'rate_limit_exceeded',\n `LabeledScenarioStore: bucket ${write.rateLimitBucket} exceeded ${cap} writes/min`,\n )\n }\n state.count += 1\n }\n\n private toRecord(write: LabeledScenarioWrite): LabeledScenarioRecord {\n const recordHash = sha256(\n JSON.stringify({\n id: write.scenario.id,\n src: write.source,\n at: write.capturedAt,\n ver: write.sourceVersionHash,\n }),\n )\n // FS adapter assigns split at sample-time, but we cache a hint here\n // based on capturedAt vs the world's \"now\" — sampler overrides this.\n return {\n ...write,\n recordHash,\n split: 'train',\n }\n }\n\n private pathForSource(source: string): string {\n return join(this.options.root, `${source}.jsonl`)\n }\n}\n\nconst ALL_SOURCES: LabeledScenarioWrite['source'][] = [\n 'production-trace',\n 'eval-run',\n 'manual',\n 'red-team',\n 'synthetic',\n]\n\nfunction sourceFilterContains(\n filter: LabeledScenarioSource | LabeledScenarioSource[] | undefined,\n needle: LabeledScenarioSource,\n): boolean {\n if (!filter) return false\n if (Array.isArray(filter)) return filter.includes(needle)\n return filter === needle\n}\n\nfunction matchesFilter(\n record: LabeledScenarioRecord,\n args: LabeledScenarioSampleArgs,\n source: string,\n): boolean {\n // Temporal cutoff — train must be capturedAt < capturedBefore.\n if (args.split === 'train' && record.capturedAt >= args.capturedBefore) return false\n if (args.split === 'test' && record.capturedAt < args.capturedBefore) return false\n\n const f = args.filter\n if (!f) return true\n if (f.kind && record.scenario.kind !== f.kind) return false\n if (f.source) {\n const sources = Array.isArray(f.source) ? f.source : [f.source]\n if (!sources.includes(source as never)) return false\n }\n if (f.minComposite !== undefined || f.maxComposite !== undefined) {\n const composites = Object.values(record.judgeScores).map((s) => s.composite)\n const max = composites.length === 0 ? 0 : Math.max(...composites)\n if (f.minComposite !== undefined && max < f.minComposite) return false\n if (f.maxComposite !== undefined && max > f.maxComposite) return false\n }\n if (f.minTrust !== undefined && labelTrustRank(record.labelTrust) < labelTrustRank(f.minTrust)) {\n return false\n }\n return true\n}\n\nfunction sha256(input: string): string {\n return createHash('sha256').update(input).digest('hex').slice(0, 16)\n}\n\nfunction appendLine(path: string, line: string): void {\n if (existsSync(path)) {\n const existing = readFileSync(path, 'utf8')\n writeFileSync(path, existing + line)\n } else {\n writeFileSync(path, line)\n }\n}\n","/**\n * @experimental\n *\n * Pass A substrate types — `runCampaign` is the one primitive every\n * eval flow composes from. Three contracts in this file:\n *\n * - `Scenario` input set\n * - `DispatchFn` how to run one scenario → artifact\n * - `CampaignResult` defined output schema (the contract downstream tools depend on)\n *\n * Three more lifted from earlier substrate work (re-exported):\n *\n * - `JudgeConfig` pluggable dimensional scorer (0.38)\n * - `Mutator` optimization-loop surface mutator\n * - `Gate` promotion gate (`HeldOutGate` and friends adapt to this)\n *\n * No new architecture vs 0.38 — Pass A formalizes the shapes so consumers\n * can build dashboards / CI gates / regression diffs against a stable schema.\n */\n\n/** @experimental Stable identifier + kind tag for any scenario. Consumers\n * extend with their per-domain payload (persona, task, requirement, ...). */\nexport interface Scenario {\n id: string\n kind: string\n tags?: string[]\n}\n\n/** @experimental Context handed to every dispatch invocation. Scoped — every\n * trace/span carries the cellId, every artifact write lands under the cell's\n * artifact root, the cost meter accumulates per cell. */\nexport interface DispatchContext {\n cellId: string\n rep: number\n generation?: number\n seed: number\n signal: AbortSignal\n trace: CampaignTraceWriter\n artifacts: CampaignArtifactWriter\n cost: CampaignCostMeter\n /** Populated when this run is part of a multi-cycle improvement loop. */\n cycleId?: string\n /** Populated when the substrate resumed from a prior cache hit. */\n resumedFrom?: string\n /**\n * Opaque placement key supplied by `RunCampaignOptions.cellPlacement`.\n * The substrate forwards it through unchanged; placement-aware Dispatch\n * implementations (e.g. `httpDispatch` from `/adapters/http`) read it to\n * route the cell to the right worker / region / sandbox. `undefined`\n * when no placement strategy is configured.\n */\n placement?: string\n}\n\n/** @experimental One function: scenario + ctx → artifact. Dispatcher chooses\n * whether to call `runMultishot`, `runLoop`, raw `streamPrompt`, anything. */\nexport type DispatchFn<TScenario extends Scenario, TArtifact> = (\n scenario: TScenario,\n ctx: DispatchContext,\n) => Promise<TArtifact>\n\n// ── Sessions ──────────────────────────────────────────────────────────\n\n/** @experimental One session within a multi-session journey. Dispatch is\n * invoked once per session in order; state from prior session's artifact\n * is exposed via `ctx.priorSessionArtifact`. */\nexport interface SessionScript<TScenario, TArtifact> {\n id: string\n intent: string\n maxTurns?: number\n /** When true, knowledge accumulated this session persists to next. */\n affectsKnowledge?: boolean\n /** Optional per-session persona evolution — called after the session\n * resolves. Returns the persona shape used by the NEXT session. */\n evolveAfterSession?: (artifact: TArtifact, sessionIndex: number, scenario: TScenario) => TScenario\n}\n\n// ── Judges (re-export 0.38 shape) ─────────────────────────────────────\n\nexport interface JudgeDimension {\n /** JSON field name + score key. */\n key: string\n /** Description shown in the judge's user prompt. */\n description: string\n}\n\n/** @experimental Pluggable dimensional scorer. `score` is the contract:\n * given an artifact + scenario, return a `JudgeScore`. This is deliberately a\n * function, not a fixed LLM-prompt shape — real consumers judge with\n * ensembles, deterministic checks, or a single LLM call, and the substrate\n * must not constrain that. The `llmJudge()` helper builds a `score` that does\n * one LLM call for the common case. `appliesTo` lets a judge run only on\n * scenarios that match (e.g. a legal-citation judge only on legal scenarios). */\nexport interface JudgeConfig<TArtifact, TScenario extends Scenario = Scenario> {\n name: string\n dimensions: JudgeDimension[]\n /** Score one artifact. Throw on failure — a thrown judge is recorded as a\n * failed cell, never silently folded into a zero. */\n score(input: {\n artifact: TArtifact\n scenario: TScenario\n signal: AbortSignal\n }): JudgeScore | Promise<JudgeScore>\n appliesTo?: (scenario: TScenario) => boolean\n}\n\nexport interface JudgeScore {\n dimensions: Record<string, number>\n composite: number\n notes: string\n}\n\n// ── Optimization (population + generations + mutator) ─────────────────\n\n/** @experimental A tier-4 code surface — a candidate change to the agent's\n * IMPLEMENTATION, not its prompt. Produced by autoresearch (reads codebase +\n * trace findings → opens a worktree). Measured by checking out `worktreeRef`\n * and running the worker against the changed code. See the improvement-tier\n * table in `docs/design/loop-taxonomy.md`. */\nexport interface CodeSurface {\n kind: 'code'\n /** Worktree path or git ref holding the candidate code change. The\n * consumer's `dispatchWithSurface` checks this out before running. */\n worktreeRef: string\n /** Base ref the change is measured against. Default: the repo's main. */\n baseRef?: string\n /** Human summary of what changed — rendered into the auto-PR body. */\n summary?: string\n}\n\n/** @experimental The mutable surface a driver proposes. Tiers (see\n * `docs/design/loop-taxonomy.md`):\n * - `string` — tiers 1-2: system-prompt addendum / serialized tool\n * config. Cheap, reversible, text-diffable.\n * - `CodeSurface` — tier 4: an implementation change behind a worktree ref.\n * Tier 3 (knowledge) is owned by agent-knowledge and rides its own adapter,\n * not this type. */\nexport type MutableSurface = string | CodeSurface\n\n/** @experimental Stateless surface mutation — given findings + current\n * surface, return N candidate surfaces. Pure transform, no generation\n * awareness. Reflective-mutation, `runMultiShotOptimization`, `AxGEPA`\n * conform. Wrapped by `evolutionaryDriver` to become an `ImprovementDriver`. */\nexport interface Mutator<TFindings = unknown> {\n kind: string\n mutate(args: {\n findings: TFindings[]\n currentSurface: MutableSurface\n populationSize: number\n signal: AbortSignal\n }): Promise<MutableSurface[]>\n}\n\n/** @experimental Everything a driver's `propose()` may read to plan the next\n * batch of candidates. The first six fields are always present; the rest are\n * optional context the loop supplies when available, so cheap drivers\n * (`evolutionaryDriver`) can ignore them while a code-tier agentic generator\n * consumes the research report + dataset to drive a coding harness.\n * See `docs/design/self-improvement-engine.md`. */\nexport interface ProposeContext<TFindings = unknown> {\n currentSurface: MutableSurface\n history: GenerationRecord[]\n findings: TFindings[]\n /** BREADTH: how many candidate surfaces to return this generation. */\n populationSize: number\n generation: number\n signal: AbortSignal\n /** The Phase-2 research report (analyst findings + diff), produced AFTER the\n * trace analysts run. Opaque to the substrate — the driver that consumes it\n * types it. See the phase diagram in self-improvement-engine.md. */\n report?: unknown\n /** Handle to all captured data — the driver samples traces / artifacts /\n * rewards here to ground its proposals. */\n dataset?: LabeledScenarioStore\n /** DEPTH: max iterations the agentic generator may take per candidate.\n * 1 = single-shot; >1 = it may iterate on its own change before handing it\n * back to be measured. */\n maxImprovementShots?: number\n}\n\n/** @experimental A surface-improvement strategy — the DRIVER of the\n * improvement loop. Given the current best surface, the history of what's\n * been tried + scored, and any external findings, propose the next batch of\n * candidate surfaces to measure. Optionally decide to stop early.\n *\n * The evolutionary mutator (`evolutionaryDriver`, here) and agent-runtime's\n * `improvementDriver` (with reflective / agentic generators) both conform —\n * drivers of the SAME loop, not separate loops. The loop body\n * (`runOptimization`) and the gated promotion shell (`runImprovementLoop`)\n * are driver-agnostic. */\nexport interface ImprovementDriver<TFindings = unknown> {\n kind: string\n /** Plan: propose N candidate surfaces for the next generation. */\n propose(ctx: ProposeContext<TFindings>): Promise<MutableSurface[]>\n /** Decide: stop early when the driver judges the search converged or\n * exhausted. Default (omitted) runs all `maxGenerations`. */\n decide?(args: { history: GenerationRecord[] }): { stop: boolean; reason?: string }\n}\n\nexport interface OptimizerConfig {\n driver: ImprovementDriver\n populationSize: number\n maxGenerations: number\n surfaceExtractor: (profile: unknown) => MutableSurface\n}\n\n// ── Gates ─────────────────────────────────────────────────────────────\n\n/** @experimental Five-valued verdict taxonomy (MOSS-paper alignment). */\nexport type GateDecision = 'ship' | 'hold' | 'need_more_work' | 'model_ceiling' | 'arch_ceiling'\n\nexport interface GateContext<TArtifact, TScenario extends Scenario> {\n candidateArtifacts: Map<string, TArtifact>\n baselineArtifacts?: Map<string, TArtifact>\n /** Candidate (winner) judge scores, keyed by cellId. */\n judgeScores: Map<string, Record<string, JudgeScore>>\n /** Baseline judge scores, keyed by cellId. SEPARATE from `judgeScores` —\n * baseline + candidate share cellIds (same scenarios), so a single map\n * cannot represent both. A gate computing a holdout delta MUST read\n * candidate from `judgeScores` and baseline from here. */\n baselineJudgeScores?: Map<string, Record<string, JudgeScore>>\n scenarios: TScenario[]\n cost: { candidate: number; baseline: number }\n signal: AbortSignal\n}\n\nexport interface GateResult {\n decision: GateDecision\n reasons: string[]\n contributingGates: Array<{ name: string; passed: boolean; detail: unknown }>\n delta?: number\n}\n\n/** @experimental Composable promotion gate. */\nexport interface Gate<TArtifact = unknown, TScenario extends Scenario = Scenario> {\n name: string\n decide(ctx: GateContext<TArtifact, TScenario>): Promise<GateResult>\n}\n\n// ── Tracing / artifacts / cost ────────────────────────────────────────\n\n/** @experimental Scoped trace writer handed to each dispatch — every span\n * auto-tagged with the cellId so traces filter cleanly. */\nexport interface CampaignTraceWriter {\n span(name: string, attributes?: Record<string, unknown>): TraceSpan\n flush(): Promise<void>\n}\n\nexport interface TraceSpan {\n end(attributes?: Record<string, unknown>): void\n setAttribute(key: string, value: unknown): void\n}\n\n/** @experimental Scoped artifact writer — `write(path, content)` lands under\n * `<runDir>/<cellId>/<path>`. */\nexport interface CampaignArtifactWriter {\n write(path: string, content: string | Uint8Array): Promise<string>\n writeJson(path: string, value: unknown): Promise<string>\n}\n\n/** @experimental Cell-scoped cost meter. Substrate auto-tracks LLM costs\n * via the cost-ledger backend hooks; consumers can record additional\n * spend (sandbox time, tool costs) via `observe`. */\nexport interface CampaignCostMeter {\n observe(amountUsd: number, source: string): void\n current(): number\n}\n\n// ── LabeledScenarioStore ──────────────────────────────────────────────\n\n/** @experimental Source tag — required on every store write. Used by the\n * default training-source filter (production-trace samples NOT used as\n * training scenarios unless explicitly opted in). */\nexport type LabeledScenarioSource =\n | 'production-trace'\n | 'eval-run'\n | 'manual'\n | 'red-team'\n | 'synthetic'\n\nexport type RedactionStatus = 'raw' | 'redacted-pii' | 'redacted-secrets' | 'fully-redacted'\n\n/** How much a label can be trusted to evaluate against — the gold-admission\n * gate. Strictly ordered: a record qualifies for a `minTrust` filter when its\n * trust rank is >= the requested rank.\n *\n * - `unverified` — label is a heuristic (e.g. raw outcome success/fail).\n * Fine as corpus; MUST NOT enter a gold set that lift\n * numbers are computed against.\n * - `verified-signal` — an external signal confirmed the outcome (PR merged,\n * tests green, user did not retry, downstream check).\n * - `human-rated` — a human explicitly rated or corrected the artifact.\n *\n * Absent on a write ⇒ treated as `unverified` (fail-closed: a writer must\n * explicitly assert trust to make a record gold-eligible — it never happens\n * by accident). */\nexport type LabelTrust = 'unverified' | 'verified-signal' | 'human-rated'\n\nconst LABEL_TRUST_RANK: Record<LabelTrust, number> = {\n unverified: 0,\n 'verified-signal': 1,\n 'human-rated': 2,\n}\n\n/** Ordinal rank for a label-trust tier; absent ⇒ `unverified` (rank 0). */\nexport function labelTrustRank(trust: LabelTrust | undefined): number {\n return LABEL_TRUST_RANK[trust ?? 'unverified']\n}\n\n/** @experimental Required-provenance write. The store rejects writes that\n * lack provenance — a default-on flywheel without provenance is the\n * data-poisoning vector flagged in the alignment review. */\nexport interface LabeledScenarioWrite<TScenario extends Scenario = Scenario, TArtifact = unknown> {\n scenario: TScenario\n artifact: TArtifact\n judgeScores: Record<string, JudgeScore>\n source: LabeledScenarioSource\n sourceVersionHash: string\n capturedAt: string\n redactionStatus: RedactionStatus\n /** Gold-admission trust tier. Absent ⇒ `unverified` (fail-closed): the\n * record is corpus, never gold. A writer must explicitly assert\n * `verified-signal` or `human-rated` to make it eligible for a gold\n * sample. See {@link LabelTrust}. */\n labelTrust?: LabelTrust\n /** Optional per-source rate-limit bucket key (e.g., the tenant id). */\n rateLimitBucket?: string\n}\n\nexport interface LabeledScenarioRecord<TScenario extends Scenario = Scenario, TArtifact = unknown>\n extends LabeledScenarioWrite<TScenario, TArtifact> {\n /** Stable hash of (scenario.id, source, capturedAt, sourceVersionHash). */\n recordHash: string\n /** Substrate-assigned split — train if captured before the campaign's\n * `temporalCutoff`, test if after. Explicit override allowed via filter. */\n split: 'train' | 'test'\n}\n\nexport interface LabeledScenarioSampleArgs {\n count: number\n /** REQUIRED — substrate refuses to sample without an explicit split. */\n split: 'train' | 'test'\n /** REQUIRED — only records captured before this timestamp are returned.\n * Enforces temporal split discipline (test scenarios captured AFTER train\n * cannot enter the training pool). */\n capturedBefore: string\n filter?: {\n kind?: string\n source?: LabeledScenarioSource | LabeledScenarioSource[]\n minComposite?: number\n maxComposite?: number\n /** Gold gate: only records whose trust rank is >= this tier are\n * returned. `sample({ split: 'test', minTrust: 'verified-signal' })` is\n * the canonical \"give me the gold set\" call. Absent ⇒ no trust gate\n * (corpus-level read). */\n minTrust?: LabelTrust\n }\n}\n\nexport interface LabeledScenarioStore {\n observe(write: LabeledScenarioWrite): Promise<void>\n sample(args: LabeledScenarioSampleArgs): Promise<LabeledScenarioRecord[]>\n size(): Promise<{\n train: number\n test: number\n bySource: Record<string, number>\n /** Count by trust tier — tells the flywheel how much gold it has\n * accumulated vs. raw corpus. */\n byTrust: Record<LabelTrust, number>\n }>\n}\n\n// ── The CampaignResult schema (the downstream-tools contract) ─────────\n\nexport interface CampaignCellResult<TArtifact> {\n cellId: string\n scenarioId: string\n rep: number\n generation?: number\n artifact: TArtifact\n judgeScores: Record<string, JudgeScore>\n costUsd: number\n durationMs: number\n seed: number\n cached: boolean\n error?: string\n}\n\nexport interface JudgeAggregate {\n mean: number\n stdev: number\n ci95: [number, number]\n n: number\n}\n\nexport interface ScenarioAggregate {\n meanComposite: number\n ci95: [number, number]\n n: number\n}\n\nexport interface GenerationRecord {\n generationIndex: number\n candidates: GenerationCandidate[]\n promoted: string[]\n}\n\n/** One scored candidate surface in a generation. `dimensions` + `scenarios`\n * let a reflective `ImprovementDriver` ground its next proposal on WHICH\n * dimensions the candidate is weakest on and WHICH scenarios it best/worst\n * handled — the evidence a blind `Mutator` cannot see. */\nexport interface GenerationCandidate {\n surfaceHash: string\n composite: number\n ci95: [number, number]\n /** Mean score per judge dimension across all cells (scenarios × reps ×\n * judges that reported the dimension). */\n dimensions: Record<string, number>\n /** Per-scenario composite (mean over reps + judges). */\n scenarios: Array<{ scenarioId: string; composite: number }>\n}\n\nexport interface CampaignAggregates {\n byJudge: Record<string, JudgeAggregate>\n byScenario: Record<string, ScenarioAggregate>\n totalCostUsd: number\n cellsExecuted: number\n cellsSkipped: number\n cellsCached: number\n cellsFailed: number\n}\n\nexport interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scenario> {\n /** sha256(scenarios, judges, dispatch source ref, optimizer config, seed). Stable identity for reruns. */\n manifestHash: string\n seed: number\n startedAt: string\n endedAt: string\n durationMs: number\n cells: Array<CampaignCellResult<TArtifact>>\n aggregates: CampaignAggregates\n optimization?: {\n generations: GenerationRecord[]\n winnerSurfaceHash?: string\n }\n gate?: GateResult\n prUrl?: string\n runDir: string\n artifactsByPath: Record<string, string>\n /** Substrate strips the input scenarios to id+kind for the result manifest;\n * consumers needing full payload look it up via the original input. The\n * type parameter `TScenario` is propagated for downstream consumers that\n * want narrowed types when extending `CampaignResult`. */\n scenarios: Array<Pick<TScenario, 'id' | 'kind'>>\n}\n","/**\n * @experimental\n *\n * VCS-pluggable worktree adapter. One improvement = one worktree, PR-like\n * (multiple commits allowed). A code-tier driver's `propose()` creates a\n * worktree, an agent commits the change into it, and `finalize()` returns a\n * `CodeSurface{ worktreeRef }` the measurement checks out to run the worker\n * against the changed code. On promotion the worktree becomes the PR branch.\n *\n * The interface is VCS-agnostic so a future `jj` ([jj-vcs](https://github.com/jj-vcs/jj))\n * adapter can slot in without touching driver code. Only the git adapter\n * ships today. See `docs/design/self-improvement-engine.md`.\n */\n\nimport { execFileSync } from 'node:child_process'\nimport { existsSync } from 'node:fs'\nimport { basename, isAbsolute, join } from 'node:path'\nimport type { CodeSurface } from '../types'\n\nexport interface Worktree {\n /** Absolute path to the checked-out worktree directory. */\n path: string\n /** The branch the worktree is on (becomes the PR branch on promotion). */\n branch: string\n /** The ref the worktree was forked from. */\n baseRef: string\n}\n\nexport interface WorktreeAdapter {\n /** Create an isolated worktree on a fresh branch off `baseRef`. */\n create(opts: { baseRef: string; label: string }): Promise<Worktree>\n /** Commit any pending changes in the worktree, then return a CodeSurface\n * pointing at it. The agent has already written its change into\n * `worktree.path` by the time this is called. */\n finalize(worktree: Worktree, summary: string): Promise<CodeSurface>\n /** Remove the worktree (and its branch) — called for losing candidates. */\n discard(worktree: Worktree): Promise<void>\n}\n\nexport class WorktreeAdapterError extends Error {\n constructor(\n message: string,\n readonly cause?: unknown,\n ) {\n super(message)\n this.name = 'WorktreeAdapterError'\n }\n}\n\nexport interface GitWorktreeAdapterOptions {\n /** Repo root the worktrees fork from. */\n repoRoot: string\n /** Directory worktrees are created under. Default: `<repoRoot>/.worktrees`. */\n worktreeDir?: string\n /** Branch-name prefix. Default: `improve`. */\n branchPrefix?: string\n /** Test seam — defaults to a real `git` runner. */\n git?: (args: string[], cwd: string) => string\n}\n\nfunction defaultGit(args: string[], cwd: string): string {\n try {\n return execFileSync('git', args, { cwd, encoding: 'utf8' }).trim()\n } catch (err) {\n const stderr =\n err && typeof err === 'object' && 'stderr' in err\n ? String((err as { stderr: unknown }).stderr)\n : ''\n throw new WorktreeAdapterError(`git ${args.join(' ')} failed: ${stderr || String(err)}`, err)\n }\n}\n\n/** Slugify a label into a branch-safe segment. */\nfunction slug(label: string): string {\n return (\n label\n .toLowerCase()\n .replace(/[^a-z0-9]+/g, '-')\n .replace(/^-+|-+$/g, '')\n .slice(0, 48) || 'candidate'\n )\n}\n\nexport function gitWorktreeAdapter(opts: GitWorktreeAdapterOptions): WorktreeAdapter {\n const git = opts.git ?? defaultGit\n const worktreeDir = opts.worktreeDir ?? join(opts.repoRoot, '.worktrees')\n const branchPrefix = opts.branchPrefix ?? 'improve'\n\n return {\n async create({ baseRef, label }) {\n const id = `${slug(label)}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 6)}`\n const branch = `${branchPrefix}/${id}`\n const path = join(worktreeDir, id)\n git(['worktree', 'add', '-b', branch, path, baseRef], opts.repoRoot)\n return { path, branch, baseRef }\n },\n\n async finalize(worktree, summary) {\n // Stage + commit any pending changes the agent left in the worktree.\n // A no-op commit is refused by git, so only commit when the tree is dirty.\n const status = git(['status', '--porcelain'], worktree.path)\n if (status.length > 0) {\n git(['add', '-A'], worktree.path)\n git(['commit', '-m', summary], worktree.path)\n }\n return {\n kind: 'code',\n worktreeRef: worktree.path,\n baseRef: worktree.baseRef,\n summary,\n }\n },\n\n async discard(worktree) {\n // Remove the worktree, then delete its branch. Force-remove because the\n // worktree may hold uncommitted experiment state we're discarding.\n git(['worktree', 'remove', '--force', worktree.path], opts.repoRoot)\n git(['branch', '-D', worktree.branch], opts.repoRoot)\n },\n }\n}\n\n/** Resolve a `CodeSurface`'s worktreeRef to a directory the measurement can\n * run the worker in. A path ref is returned as-is; anything else is treated\n * as a ref under the adapter's worktree dir. */\nexport function resolveWorktreePath(surface: CodeSurface, worktreeDir?: string): string {\n if (isAbsolute(surface.worktreeRef) && existsSync(surface.worktreeRef)) return surface.worktreeRef\n if (worktreeDir) return join(worktreeDir, basename(surface.worktreeRef))\n return surface.worktreeRef\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;AA6BA,SAAS,kBAAkB;AAC3B,SAAS,YAAY,WAAW,cAAc,qBAAqB;AACnE,SAAS,YAAY;;;AC2QrB,IAAM,mBAA+C;AAAA,EACnD,YAAY;AAAA,EACZ,mBAAmB;AAAA,EACnB,eAAe;AACjB;AAGO,SAAS,eAAe,OAAuC;AACpE,SAAO,iBAAiB,SAAS,YAAY;AAC/C;;;AD/PO,IAAM,4BAAN,cAAwC,MAAM;AAAA,EACnD,YACkB,MAChB,SACA;AACA,UAAM,OAAO;AAHG;AAIhB,SAAK,OAAO;AAAA,EACd;AAAA,EALkB;AAMpB;AAQO,IAAM,yBAAN,MAA6D;AAAA,EAIlE,YAA6B,SAAwC;AAAxC;AAC3B,QAAI,CAAC,WAAW,QAAQ,IAAI,EAAG,WAAU,QAAQ,MAAM,EAAE,WAAW,KAAK,CAAC;AAC1E,SAAK,MAAM,QAAQ,OAAO,KAAK;AAAA,EACjC;AAAA,EAH6B;AAAA,EAHZ;AAAA,EACA,aAAa,oBAAI,IAA4B;AAAA,EAO9D,MAAM,QAAQ,OAA4C;AACxD,SAAK,iBAAiB,KAAK;AAC3B,SAAK,gBAAgB,KAAK;AAC1B,UAAM,SAAS,KAAK,SAAS,KAAK;AAClC,UAAM,OAAO,KAAK,cAAc,MAAM,MAAM;AAC5C,UAAM,OAAO,GAAG,KAAK,UAAU,MAAM,CAAC;AAAA;AAItC,eAAW,MAAM,IAAI;AAAA,EACvB;AAAA,EAEA,MAAM,OAAO,MAAmE;AAC9E,QAAI,CAAC,KAAK,OAAO;AACf,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,KAAK,gBAAgB;AACxB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AAEA,UAAM,MAA+B,CAAC;AACtC,eAAW,UAAU,aAAa;AAGhC,UAAI,KAAK,UAAU,WAAW,WAAW,oBAAoB;AAC3D,cAAM,WAAW,qBAAqB,KAAK,QAAQ,QAAQ,kBAAkB;AAC7E,YAAI,CAAC,SAAU;AAAA,MACjB;AACA,YAAM,OAAO,KAAK,cAAc,MAAM;AACtC,UAAI,CAAC,WAAW,IAAI,EAAG;AACvB,YAAM,QAAQ,aAAa,MAAM,MAAM,EAAE,MAAM,IAAI,EAAE,OAAO,OAAO;AACnE,iBAAW,QAAQ,OAAO;AACxB,YAAI;AACJ,YAAI;AACF,mBAAS,KAAK,MAAM,IAAI;AAAA,QAC1B,QAAQ;AACN;AAAA,QACF;AACA,YAAI,CAAC,cAAc,QAAQ,MAAM,MAAM,EAAG;AAC1C,YAAI,KAAK,MAAM;AAAA,MACjB;AAAA,IACF;AAGA,QAAI,KAAK,CAAC,GAAG,MAAM;AACjB,UAAI,EAAE,eAAe,EAAE,WAAY,QAAO,EAAE,WAAW,cAAc,EAAE,UAAU;AACjF,aAAO,EAAE,WAAW,cAAc,EAAE,UAAU;AAAA,IAChD,CAAC;AAED,WAAO,IAAI,MAAM,GAAG,KAAK,KAAK;AAAA,EAChC;AAAA,EAEA,MAAM,OAKH;AACD,UAAM,WAAmC,CAAC;AAC1C,UAAM,UAAsC;AAAA,MAC1C,YAAY;AAAA,MACZ,mBAAmB;AAAA,MACnB,eAAe;AAAA,IACjB;AACA,QAAI,QAAQ;AACZ,eAAW,UAAU,aAAa;AAChC,YAAM,OAAO,KAAK,cAAc,MAAM;AACtC,UAAI,CAAC,WAAW,IAAI,GAAG;AACrB,iBAAS,MAAM,IAAI;AACnB;AAAA,MACF;AACA,YAAM,QAAQ,aAAa,MAAM,MAAM,EAAE,MAAM,IAAI,EAAE,OAAO,OAAO;AACnE,eAAS,MAAM,IAAI,MAAM;AACzB,eAAS,MAAM;AACf,iBAAW,QAAQ,OAAO;AACxB,YAAI,QAAoB;AACxB,YAAI;AACF,kBAAS,KAAK,MAAM,IAAI,EAA4B,cAAc;AAAA,QACpE,QAAQ;AAAA,QAER;AACA,gBAAQ,KAAK,KAAK;AAAA,MACpB;AAAA,IACF;AAIA,WAAO,EAAE,OAAO,OAAO,MAAM,OAAO,UAAU,QAAQ;AAAA,EACxD;AAAA,EAEQ,iBAAiB,OAAmC;AAC1D,QAAI,CAAC,MAAM,QAAQ;AACjB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,qBAAqB,MAAM,kBAAkB,WAAW,GAAG;AACpE,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,YAAY;AACrB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,iBAAiB;AAC1B,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,YAAY,SAAS,MAAM,MAAM,GAAG;AACvC,YAAM,IAAI;AAAA,QACR;AAAA,QACA,+CAA+C,YAAY,KAAK,IAAI,CAAC;AAAA,MACvE;AAAA,IACF;AAAA,EACF;AAAA,EAEQ,gBAAgB,OAAmC;AACzD,UAAM,MAAM,KAAK,QAAQ;AACzB,QAAI,CAAC,OAAO,CAAC,MAAM,gBAAiB;AACpC,UAAM,MAAM,KAAK,IAAI;AACrB,UAAM,WAAW;AACjB,QAAI,QAAQ,KAAK,WAAW,IAAI,MAAM,eAAe;AACrD,QAAI,CAAC,SAAS,MAAM,MAAM,iBAAiB,UAAU;AACnD,cAAQ,EAAE,QAAQ,MAAM,iBAAiB,eAAe,KAAK,OAAO,EAAE;AACtE,WAAK,WAAW,IAAI,MAAM,iBAAiB,KAAK;AAAA,IAClD;AACA,QAAI,MAAM,SAAS,KAAK;AACtB,YAAM,IAAI;AAAA,QACR;AAAA,QACA,gCAAgC,MAAM,eAAe,aAAa,GAAG;AAAA,MACvE;AAAA,IACF;AACA,UAAM,SAAS;AAAA,EACjB;AAAA,EAEQ,SAAS,OAAoD;AACnE,UAAM,aAAa;AAAA,MACjB,KAAK,UAAU;AAAA,QACb,IAAI,MAAM,SAAS;AAAA,QACnB,KAAK,MAAM;AAAA,QACX,IAAI,MAAM;AAAA,QACV,KAAK,MAAM;AAAA,MACb,CAAC;AAAA,IACH;AAGA,WAAO;AAAA,MACL,GAAG;AAAA,MACH;AAAA,MACA,OAAO;AAAA,IACT;AAAA,EACF;AAAA,EAEQ,cAAc,QAAwB;AAC5C,WAAO,KAAK,KAAK,QAAQ,MAAM,GAAG,MAAM,QAAQ;AAAA,EAClD;AACF;AAEA,IAAM,cAAgD;AAAA,EACpD;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAEA,SAAS,qBACP,QACA,QACS;AACT,MAAI,CAAC,OAAQ,QAAO;AACpB,MAAI,MAAM,QAAQ,MAAM,EAAG,QAAO,OAAO,SAAS,MAAM;AACxD,SAAO,WAAW;AACpB;AAEA,SAAS,cACP,QACA,MACA,QACS;AAET,MAAI,KAAK,UAAU,WAAW,OAAO,cAAc,KAAK,eAAgB,QAAO;AAC/E,MAAI,KAAK,UAAU,UAAU,OAAO,aAAa,KAAK,eAAgB,QAAO;AAE7E,QAAM,IAAI,KAAK;AACf,MAAI,CAAC,EAAG,QAAO;AACf,MAAI,EAAE,QAAQ,OAAO,SAAS,SAAS,EAAE,KAAM,QAAO;AACtD,MAAI,EAAE,QAAQ;AACZ,UAAM,UAAU,MAAM,QAAQ,EAAE,MAAM,IAAI,EAAE,SAAS,CAAC,EAAE,MAAM;AAC9D,QAAI,CAAC,QAAQ,SAAS,MAAe,EAAG,QAAO;AAAA,EACjD;AACA,MAAI,EAAE,iBAAiB,UAAa,EAAE,iBAAiB,QAAW;AAChE,UAAM,aAAa,OAAO,OAAO,OAAO,WAAW,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS;AAC3E,UAAM,MAAM,WAAW,WAAW,IAAI,IAAI,KAAK,IAAI,GAAG,UAAU;AAChE,QAAI,EAAE,iBAAiB,UAAa,MAAM,EAAE,aAAc,QAAO;AACjE,QAAI,EAAE,iBAAiB,UAAa,MAAM,EAAE,aAAc,QAAO;AAAA,EACnE;AACA,MAAI,EAAE,aAAa,UAAa,eAAe,OAAO,UAAU,IAAI,eAAe,EAAE,QAAQ,GAAG;AAC9F,WAAO;AAAA,EACT;AACA,SAAO;AACT;AAEA,SAAS,OAAO,OAAuB;AACrC,SAAO,WAAW,QAAQ,EAAE,OAAO,KAAK,EAAE,OAAO,KAAK,EAAE,MAAM,GAAG,EAAE;AACrE;AAEA,SAAS,WAAW,MAAc,MAAoB;AACpD,MAAI,WAAW,IAAI,GAAG;AACpB,UAAM,WAAW,aAAa,MAAM,MAAM;AAC1C,kBAAc,MAAM,WAAW,IAAI;AAAA,EACrC,OAAO;AACL,kBAAc,MAAM,IAAI;AAAA,EAC1B;AACF;;;AElSA,SAAS,oBAAoB;AAC7B,SAAS,cAAAA,mBAAkB;AAC3B,SAAS,UAAU,YAAY,QAAAC,aAAY;AAuBpC,IAAM,uBAAN,cAAmC,MAAM;AAAA,EAC9C,YACE,SACS,OACT;AACA,UAAM,OAAO;AAFJ;AAGT,SAAK,OAAO;AAAA,EACd;AAAA,EAJW;AAKb;AAaA,SAAS,WAAW,MAAgB,KAAqB;AACvD,MAAI;AACF,WAAO,aAAa,OAAO,MAAM,EAAE,KAAK,UAAU,OAAO,CAAC,EAAE,KAAK;AAAA,EACnE,SAAS,KAAK;AACZ,UAAM,SACJ,OAAO,OAAO,QAAQ,YAAY,YAAY,MAC1C,OAAQ,IAA4B,MAAM,IAC1C;AACN,UAAM,IAAI,qBAAqB,OAAO,KAAK,KAAK,GAAG,CAAC,YAAY,UAAU,OAAO,GAAG,CAAC,IAAI,GAAG;AAAA,EAC9F;AACF;AAGA,SAAS,KAAK,OAAuB;AACnC,SACE,MACG,YAAY,EACZ,QAAQ,eAAe,GAAG,EAC1B,QAAQ,YAAY,EAAE,EACtB,MAAM,GAAG,EAAE,KAAK;AAEvB;AAEO,SAAS,mBAAmB,MAAkD;AACnF,QAAM,MAAM,KAAK,OAAO;AACxB,QAAM,cAAc,KAAK,eAAeA,MAAK,KAAK,UAAU,YAAY;AACxE,QAAM,eAAe,KAAK,gBAAgB;AAE1C,SAAO;AAAA,IACL,MAAM,OAAO,EAAE,SAAS,MAAM,GAAG;AAC/B,YAAM,KAAK,GAAG,KAAK,KAAK,CAAC,IAAI,KAAK,IAAI,EAAE,SAAS,EAAE,CAAC,IAAI,KAAK,OAAO,EAAE,SAAS,EAAE,EAAE,MAAM,GAAG,CAAC,CAAC;AAC9F,YAAM,SAAS,GAAG,YAAY,IAAI,EAAE;AACpC,YAAM,OAAOA,MAAK,aAAa,EAAE;AACjC,UAAI,CAAC,YAAY,OAAO,MAAM,QAAQ,MAAM,OAAO,GAAG,KAAK,QAAQ;AACnE,aAAO,EAAE,MAAM,QAAQ,QAAQ;AAAA,IACjC;AAAA,IAEA,MAAM,SAAS,UAAU,SAAS;AAGhC,YAAM,SAAS,IAAI,CAAC,UAAU,aAAa,GAAG,SAAS,IAAI;AAC3D,UAAI,OAAO,SAAS,GAAG;AACrB,YAAI,CAAC,OAAO,IAAI,GAAG,SAAS,IAAI;AAChC,YAAI,CAAC,UAAU,MAAM,OAAO,GAAG,SAAS,IAAI;AAAA,MAC9C;AACA,aAAO;AAAA,QACL,MAAM;AAAA,QACN,aAAa,SAAS;AAAA,QACtB,SAAS,SAAS;AAAA,QAClB;AAAA,MACF;AAAA,IACF;AAAA,IAEA,MAAM,QAAQ,UAAU;AAGtB,UAAI,CAAC,YAAY,UAAU,WAAW,SAAS,IAAI,GAAG,KAAK,QAAQ;AACnE,UAAI,CAAC,UAAU,MAAM,SAAS,MAAM,GAAG,KAAK,QAAQ;AAAA,IACtD;AAAA,EACF;AACF;AAKO,SAAS,oBAAoB,SAAsB,aAA8B;AACtF,MAAI,WAAW,QAAQ,WAAW,KAAKD,YAAW,QAAQ,WAAW,EAAG,QAAO,QAAQ;AACvF,MAAI,YAAa,QAAOC,MAAK,aAAa,SAAS,QAAQ,WAAW,CAAC;AACvE,SAAO,QAAQ;AACjB;","names":["existsSync","join"]}
|
|
@@ -4,7 +4,7 @@ import {
|
|
|
4
4
|
} from "./chunk-NCRFYPS3.js";
|
|
5
5
|
import {
|
|
6
6
|
validateRunRecord
|
|
7
|
-
} from "./chunk-
|
|
7
|
+
} from "./chunk-NCK5QLGT.js";
|
|
8
8
|
import {
|
|
9
9
|
TraceEmitter
|
|
10
10
|
} from "./chunk-TVVP3ZZQ.js";
|
|
@@ -610,4 +610,4 @@ export {
|
|
|
610
610
|
runProposeReviewAsControlLoop,
|
|
611
611
|
controlFailureClassFromVerification
|
|
612
612
|
};
|
|
613
|
-
//# sourceMappingURL=chunk-
|
|
613
|
+
//# sourceMappingURL=chunk-J4DIMSRK.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/agent-profile-cell.ts","../src/run-record.ts"],"sourcesContent":["import { ValidationError } from './errors'\nimport { hashJson } from './pre-registration'\n\nexport type AgentProfileCellSchemaVersion = 'agent-profile-cell/v1'\n\nexport type AgentProfileJson =\n | string\n | number\n | boolean\n | null\n | AgentProfileJson[]\n | { [key: string]: AgentProfileJson }\n\nexport type AgentProfileDimensionValue = string | number | boolean | null\n\nexport interface AgentProfileSource {\n /** Runtime/profile contract being fingerprinted, e.g. `sandbox-agent-profile`. */\n kind: string\n /** sha256 over the canonical source profile object. */\n hash: string\n}\n\nexport interface AgentProfileSourceInput {\n kind: string\n /** Precomputed sha256 for callers that already sign their profile artifact. */\n hash?: string\n /** Full canonical runtime profile; hashed and then discarded from the cell. */\n profile?: AgentProfileJson\n}\n\nexport interface AgentProfileHarness {\n id: string\n version?: string\n hash?: string\n}\n\nexport interface AgentProfileCellInput {\n profileId: string\n sourceProfile: AgentProfileSourceInput\n harness?: AgentProfileHarness\n model?: string\n promptHash?: string\n dimensions?: Record<string, AgentProfileDimensionValue>\n}\n\nexport interface AgentProfileCell {\n schemaVersion: AgentProfileCellSchemaVersion\n cellId: string\n profileId: string\n sourceProfile: AgentProfileSource\n harness?: AgentProfileHarness\n model?: string\n promptHash?: string\n dimensions?: Record<string, AgentProfileDimensionValue>\n}\n\nexport class AgentProfileCellValidationError extends ValidationError {\n readonly path: string\n constructor(message: string, path = '') {\n super(path ? `${message} (at ${path})` : message)\n this.path = path\n }\n}\n\nconst SHA256_HEX = /^[0-9a-f]{64}$/\nconst CELL_ID = /^agent-profile-cell:sha256:[0-9a-f]{64}$/\n\nexport async function buildAgentProfileCell(\n input: AgentProfileCellInput,\n): Promise<AgentProfileCell> {\n const material = await normalizeAgentProfileCellInput(input)\n const cellId = `agent-profile-cell:sha256:${await hashJson(material)}`\n return { ...material, cellId }\n}\n\nexport function agentProfileCellHashMaterial(\n cell: AgentProfileCell,\n): Omit<AgentProfileCell, 'cellId'> {\n const { cellId: _cellId, ...material } = cell\n void _cellId\n return normalizeAgentProfileCell(material)\n}\n\nexport async function verifyAgentProfileCell(cell: AgentProfileCell): Promise<boolean> {\n validateAgentProfileCell(cell)\n return (\n cell.cellId ===\n `agent-profile-cell:sha256:${await hashJson(agentProfileCellHashMaterial(cell))}`\n )\n}\n\nexport function validateAgentProfileCell(input: unknown): AgentProfileCell {\n if (input === null || typeof input !== 'object') {\n throw new AgentProfileCellValidationError('expected object')\n }\n const obj = input as Record<string, unknown>\n expectLiteral(obj.schemaVersion, 'agent-profile-cell/v1', 'schemaVersion')\n if (typeof obj.cellId !== 'string' || !CELL_ID.test(obj.cellId)) {\n throw new AgentProfileCellValidationError(\n 'cellId must match agent-profile-cell:sha256:<64 lowercase hex chars>',\n 'cellId',\n )\n }\n expectString(obj.profileId, 'profileId')\n validateSource(obj.sourceProfile, 'sourceProfile')\n if (obj.harness !== undefined) validateHarness(obj.harness, 'harness')\n if (obj.model !== undefined) expectString(obj.model, 'model')\n if (obj.promptHash !== undefined) expectString(obj.promptHash, 'promptHash')\n if (obj.dimensions !== undefined) validateDimensions(obj.dimensions, 'dimensions')\n return input as AgentProfileCell\n}\n\nexport function requireAgentProfileCell(record: {\n runId: string\n agentProfile?: AgentProfileCell\n}): AgentProfileCell {\n if (!record.agentProfile) {\n throw new AgentProfileCellValidationError(\n `run \"${record.runId}\" is missing agentProfile; profile-cell grouping requires explicit profile identity`,\n 'agentProfile',\n )\n }\n return validateAgentProfileCell(record.agentProfile)\n}\n\nexport function agentProfileCellKey(record: {\n runId: string\n agentProfile?: AgentProfileCell\n}): string {\n return requireAgentProfileCell(record).cellId\n}\n\nexport async function assertRunAgentProfileCell(record: {\n runId: string\n model: string\n promptHash: string\n agentProfile?: AgentProfileCell\n}): Promise<AgentProfileCell> {\n const profile = requireAgentProfileCell(record)\n if (!(await verifyAgentProfileCell(profile))) {\n throw new AgentProfileCellValidationError(\n `run \"${record.runId}\" has an agentProfile.cellId that does not match its content`,\n 'agentProfile.cellId',\n )\n }\n if (profile.model !== undefined && profile.model !== record.model) {\n throw new AgentProfileCellValidationError(\n `run \"${record.runId}\" agentProfile.model \"${profile.model}\" does not match model \"${record.model}\"`,\n 'agentProfile.model',\n )\n }\n if (profile.promptHash !== undefined && profile.promptHash !== record.promptHash) {\n throw new AgentProfileCellValidationError(\n `run \"${record.runId}\" agentProfile.promptHash \"${profile.promptHash}\" does not match promptHash \"${record.promptHash}\"`,\n 'agentProfile.promptHash',\n )\n }\n return profile\n}\n\nexport function groupRunsByAgentProfileCell<\n T extends { runId: string; agentProfile?: AgentProfileCell },\n>(records: readonly T[]): Map<string, T[]> {\n const groups = new Map<string, T[]>()\n for (const record of records) {\n const key = agentProfileCellKey(record)\n const bucket = groups.get(key)\n if (bucket) bucket.push(record)\n else groups.set(key, [record])\n }\n return groups\n}\n\nasync function normalizeAgentProfileCellInput(\n input: AgentProfileCellInput,\n): Promise<Omit<AgentProfileCell, 'cellId'>> {\n return normalizeAgentProfileCell({\n schemaVersion: 'agent-profile-cell/v1',\n profileId: input.profileId,\n sourceProfile: await normalizeSourceInput(input.sourceProfile),\n harness: input.harness,\n model: input.model,\n promptHash: input.promptHash,\n dimensions: input.dimensions,\n })\n}\n\nfunction normalizeAgentProfileCell(\n input: Omit<AgentProfileCell, 'cellId'>,\n): Omit<AgentProfileCell, 'cellId'> {\n return compactObject({\n schemaVersion: 'agent-profile-cell/v1' as const,\n profileId: requireNonEmpty(input.profileId, 'profileId'),\n sourceProfile: normalizeSource(input.sourceProfile),\n harness: input.harness ? normalizeHarness(input.harness, 'harness') : undefined,\n model: optionalNonEmpty(input.model, 'model'),\n promptHash: optionalNonEmpty(input.promptHash, 'promptHash'),\n dimensions: input.dimensions\n ? nonEmptyRecord(normalizeDimensions(input.dimensions))\n : undefined,\n })\n}\n\nasync function normalizeSourceInput(input: AgentProfileSourceInput): Promise<AgentProfileSource> {\n const kind = requireNonEmpty(input.kind, 'sourceProfile.kind')\n if (input.hash !== undefined && input.profile !== undefined) {\n throw new AgentProfileCellValidationError(\n 'sourceProfile must provide either hash or profile, not both',\n 'sourceProfile',\n )\n }\n if (input.hash !== undefined) {\n return { kind, hash: requireSha256Hex(input.hash, 'sourceProfile.hash') }\n }\n if (input.profile === undefined) {\n throw new AgentProfileCellValidationError(\n 'sourceProfile must provide hash or profile',\n 'sourceProfile',\n )\n }\n assertJson(input.profile, 'sourceProfile.profile')\n return { kind, hash: await hashJson(input.profile) }\n}\n\nfunction normalizeSource(input: AgentProfileSource): AgentProfileSource {\n return {\n kind: requireNonEmpty(input.kind, 'sourceProfile.kind'),\n hash: requireSha256Hex(input.hash, 'sourceProfile.hash'),\n }\n}\n\nfunction normalizeHarness(input: AgentProfileHarness, path: string): AgentProfileHarness {\n return compactObject({\n id: requireNonEmpty(input.id, `${path}.id`),\n version: optionalNonEmpty(input.version, `${path}.version`),\n hash: optionalNonEmpty(input.hash, `${path}.hash`),\n })\n}\n\nfunction normalizeDimensions(\n input: Record<string, AgentProfileDimensionValue>,\n): Record<string, AgentProfileDimensionValue> {\n const out: Record<string, AgentProfileDimensionValue> = {}\n for (const key of Object.keys(input).sort()) {\n const value = input[key]\n requireNonEmpty(key, 'dimensions.<key>')\n if (\n value !== null &&\n typeof value !== 'string' &&\n typeof value !== 'number' &&\n typeof value !== 'boolean'\n ) {\n throw new AgentProfileCellValidationError(\n 'expected primitive dimension value',\n `dimensions.${key}`,\n )\n }\n if (typeof value === 'number' && !Number.isFinite(value)) {\n throw new AgentProfileCellValidationError('expected finite number', `dimensions.${key}`)\n }\n out[key] = value\n }\n return out\n}\n\nfunction compactObject<T extends Record<string, unknown>>(input: T): T {\n const out: Record<string, unknown> = {}\n for (const [key, value] of Object.entries(input)) {\n if (value !== undefined) out[key] = value\n }\n return out as T\n}\n\nfunction nonEmptyRecord<T extends Record<string, unknown>>(input: T): T | undefined {\n return Object.keys(input).length > 0 ? input : undefined\n}\n\nfunction validateSource(value: unknown, path: string): void {\n if (value === null || typeof value !== 'object' || Array.isArray(value)) {\n throw new AgentProfileCellValidationError('expected object', path)\n }\n const rec = value as Record<string, unknown>\n expectString(rec.kind, `${path}.kind`)\n requireSha256Hex(rec.hash, `${path}.hash`)\n}\n\nfunction validateHarness(value: unknown, path: string): void {\n if (value === null || typeof value !== 'object' || Array.isArray(value)) {\n throw new AgentProfileCellValidationError('expected object', path)\n }\n const rec = value as Record<string, unknown>\n expectString(rec.id, `${path}.id`)\n if (rec.version !== undefined) expectString(rec.version, `${path}.version`)\n if (rec.hash !== undefined) expectString(rec.hash, `${path}.hash`)\n}\n\nfunction validateDimensions(value: unknown, path: string): void {\n if (value === null || typeof value !== 'object' || Array.isArray(value)) {\n throw new AgentProfileCellValidationError('expected object', path)\n }\n normalizeDimensions(value as Record<string, AgentProfileDimensionValue>)\n}\n\nfunction assertJson(value: AgentProfileJson, path: string): void {\n if (value === null) return\n const type = typeof value\n if (type === 'string' || type === 'boolean') return\n if (type === 'number') {\n if (!Number.isFinite(value)) {\n throw new AgentProfileCellValidationError('expected finite number', path)\n }\n return\n }\n if (Array.isArray(value)) {\n value.forEach((item, index) => {\n assertJson(item, `${path}[${index}]`)\n })\n return\n }\n if (type === 'object') {\n for (const [key, nested] of Object.entries(value)) {\n requireNonEmpty(key, `${path}.<key>`)\n assertJson(nested, `${path}.${key}`)\n }\n return\n }\n throw new AgentProfileCellValidationError('expected JSON-compatible value', path)\n}\n\nfunction expectLiteral(value: unknown, expected: string, path: string): void {\n if (value !== expected) {\n throw new AgentProfileCellValidationError(`expected ${expected}`, path)\n }\n}\n\nfunction expectString(value: unknown, path: string): void {\n if (typeof value !== 'string' || value.length === 0) {\n throw new AgentProfileCellValidationError('expected non-empty string', path)\n }\n}\n\nfunction requireNonEmpty(value: string, path: string): string {\n if (typeof value !== 'string' || value.length === 0) {\n throw new AgentProfileCellValidationError('expected non-empty string', path)\n }\n return value\n}\n\nfunction optionalNonEmpty(value: string | undefined, path: string): string | undefined {\n if (value === undefined) return undefined\n return requireNonEmpty(value, path)\n}\n\nfunction requireSha256Hex(value: unknown, path: string): string {\n if (typeof value !== 'string' || !SHA256_HEX.test(value)) {\n throw new AgentProfileCellValidationError('expected 64 lowercase sha256 hex chars', path)\n }\n return value\n}\n\n// ── Consumer helpers ─────────────────────────────────────────────────\n//\n// Two pieces of boilerplate every product consuming `buildAgentProfileCell`\n// has been duplicating (gtm-agent #137, blueprint-agent #1756/#1757):\n//\n// 1. A `JSON.parse(JSON.stringify(value))` helper that canonicalizes an\n// arbitrary sandbox-SDK `AgentProfile` into the recursive\n// `AgentProfileJson` shape, with a fail-loud error when the profile\n// is not JSON-serializable.\n//\n// 2. The magic string `'sandbox-agent-profile'` for `sourceProfile.kind`.\n//\n// Both belong here so the cross-product cell join (same canonical profile\n// hashes to the same `sourceProfile.hash` across products) is enforced by\n// the type system, not by every consumer remembering to do it right.\n// See blueprint-agent issue tangle-network/agent-eval#82.\n\n/** Canonical `sourceProfile.kind` values. Two products fingerprinting the\n * same canonical profile MUST use the same kind for their cells to share\n * `sourceProfile.hash`. Extend rather than create new strings — adding a\n * new kind is a deliberate cross-product schema change. */\nexport const AGENT_PROFILE_KINDS = {\n /** A profile declared via `defineAgentProfile(...)` from\n * `@tangle-network/sandbox`. The default kind for sandbox-hosted\n * products (gtm-agent, blueprint-agent, sandbox, evals). */\n SANDBOX_AGENT_PROFILE: 'sandbox-agent-profile',\n} as const\n\nexport type AgentProfileKind = (typeof AGENT_PROFILE_KINDS)[keyof typeof AGENT_PROFILE_KINDS]\n\n/** Canonicalize an arbitrary value into `AgentProfileJson` by JSON\n * round-trip. Throws when the value contains anything not representable\n * as JSON (functions, BigInt, cycles) — non-portable profiles fail loud\n * rather than silently dropping fields. */\nexport function toAgentProfileJson(value: unknown): AgentProfileJson {\n let serialized: string | undefined\n try {\n serialized = JSON.stringify(value)\n } catch (err) {\n throw new AgentProfileCellValidationError(\n `agent profile must be JSON-serializable: ${err instanceof Error ? err.message : String(err)}`,\n 'sourceProfile.profile',\n )\n }\n if (serialized === undefined) {\n throw new AgentProfileCellValidationError(\n 'agent profile must be JSON-serializable (got undefined after JSON.stringify)',\n 'sourceProfile.profile',\n )\n }\n return JSON.parse(serialized) as AgentProfileJson\n}\n\n/** Minimal shape required of any sandbox-SDK `AgentProfile` — anything\n * with a non-empty `name` and `version` plus JSON-serializable contents.\n * Compatible with `defineAgentProfile(...)` output from\n * `@tangle-network/sandbox`; products that have not yet declared a real\n * profile can pass a `{ name, version, ...metadata }` stub. */\nexport interface SandboxAgentProfileLike {\n name: string\n version: string\n [key: string]: unknown\n}\n\n/** Higher-level helper that hard-codes the canonical\n * `sandbox-agent-profile` kind plus the JSON canonicalization. Equivalent\n * to calling `buildAgentProfileCell` with `profileId = \\`${name}@${version}\\``\n * and `sourceProfile = { kind: SANDBOX_AGENT_PROFILE, profile: <round-tripped> }`.\n *\n * Use this from any product consuming a sandbox-SDK `AgentProfile`; the\n * manual `buildAgentProfileCell` call is reserved for advanced cases\n * (custom kinds, pre-computed source hashes, alternate profileId\n * conventions). */\nexport async function buildSandboxAgentProfileCell(\n profile: SandboxAgentProfileLike,\n input: Omit<AgentProfileCellInput, 'profileId' | 'sourceProfile'>,\n): Promise<AgentProfileCell> {\n if (!profile || typeof profile !== 'object') {\n throw new AgentProfileCellValidationError('sandbox AgentProfile must be an object', 'profile')\n }\n if (typeof profile.name !== 'string' || profile.name.length === 0) {\n throw new AgentProfileCellValidationError(\n 'sandbox AgentProfile must have a non-empty `name`',\n 'profile.name',\n )\n }\n if (typeof profile.version !== 'string' || profile.version.length === 0) {\n throw new AgentProfileCellValidationError(\n 'sandbox AgentProfile must have a non-empty `version`',\n 'profile.version',\n )\n }\n return buildAgentProfileCell({\n ...input,\n profileId: `${profile.name}@${profile.version}`,\n sourceProfile: {\n kind: AGENT_PROFILE_KINDS.SANDBOX_AGENT_PROFILE,\n profile: toAgentProfileJson(profile),\n },\n })\n}\n","/**\n * Paper-grade RunRecord schema + runtime validator.\n *\n * Every run that participates in a promotion gate, paper table, or\n * researcher loop SHOULD be recorded as a `RunRecord`. The mandatory\n * fields are exactly those the paper \"Two Loops, Three Roles\" requires\n * for reproducibility: who/what/when/cost/seed/hash, plus the search vs\n * holdout split tag and either a `searchScore` or a `holdoutScore`.\n *\n * This is intentionally NOT a replacement for the rich `Run` /\n * `ProposeReviewReport` / `ScenarioResult` types already in the\n * package. Those are runtime structures with full provenance. A\n * `RunRecord` is the analysis-time projection — the JSON-friendly\n * row you'd put in a parquet file or paste into a notebook.\n *\n * Validate at the boundary:\n *\n * const rec = validateRunRecord(rawJson) // throws on missing\n * const ok = isRunRecord(rawJson) // boolean check\n * const rec = parseRunRecordSafe(rawJson) // { ok, value | error }\n *\n * The validator runs in pure TS — zod is intentionally NOT a\n * dependency. Round-trip tested in `tests/run-record.test.ts`.\n */\n\nimport type { AgentProfileCell } from './agent-profile-cell'\nimport { validateAgentProfileCell } from './agent-profile-cell'\nimport { ValidationError } from './errors'\nimport type { FailureClass } from './trace/schema'\n\n/** Search/dev/holdout split tag. 'search' is the paper-grade alias for the\n * combined train+test pool that the optimizer is allowed to read. */\nexport type RunSplitTag = 'search' | 'dev' | 'holdout'\n\nexport interface RunTokenUsage {\n input: number\n output: number\n cached?: number\n}\n\nexport interface RunJudgeMetadata {\n model: string\n promptVersion: string\n /** [0,1] confidence the judge declared. Constant judge confidence\n * across many runs is a fallback signal (see `canary.ts`). */\n confidence: number\n /** True if the judge degraded to a fallback path (rules-only,\n * prior-call cache, etc.). The canary uses this to alert. */\n fallback: boolean\n}\n\n/**\n * Per-judge / per-dimension breakdown for runs scored by an ensemble of\n * judges over a multi-dimensional rubric.\n *\n * The collapsed `outcome.searchScore` / `holdoutScore` carries the\n * composite the gate uses. The full breakdown belongs here so consumers\n * can answer \"which judge disagreed?\", \"which dimension dragged the\n * composite down?\", and \"did half the panel fail?\" without re-running.\n *\n * `perJudge[judgeId][dim]` is the canonical source; `perDimMean` and\n * `composite` are convenience projections — derivable but precomputed so\n * downstream IRR primitives (`interRaterReliability`,\n * `corpusInterRaterAgreement`) and reporters don't pay the same\n * aggregation twice.\n *\n * Fail-loud discipline: judges that errored out land in `failedJudges`\n * by id. A missing key in `perJudge` is ambiguous (silent zero vs not\n * run); the explicit list makes a partial-failure recorded as such.\n */\nexport interface JudgeScoresRecord {\n /** Per-judge per-dimension scores. `{ \"kimi-k2.6\": { helpfulness: 0.8, clarity: 0.7 }, ... }`. */\n perJudge: Record<string, Record<string, number>>\n /** Per-dim mean across judges. Convenience — derivable from `perJudge`. */\n perDimMean: Record<string, number>\n /** Composite mean across all dims and judges. Mirrors the score\n * the gate sees on `outcome.searchScore` / `holdoutScore`. */\n composite: number\n /** Judges that errored or returned an unparseable verdict. Recorded\n * by id (e.g. `['glm-5.1']`) so a partial-failure case is explicit,\n * not inferred from missing keys in `perJudge`. */\n failedJudges?: string[]\n /** Free-form notes the judges emitted (joined across judges or\n * first-judge only — consumer's choice). */\n notes?: string\n}\n\nexport interface RunOutcome {\n /** Score on the search/optimization split. Optional because a\n * holdout-only evaluation only fills `holdoutScore`. */\n searchScore?: number\n /** Score on the held-out split. Optional because a search-only run\n * only fills `searchScore`. At least one must be present. */\n holdoutScore?: number\n /** Bag of any other metric the run produced — judge dimensions,\n * pass/fail counters, latency stats, etc. Numeric only — keeps\n * reporters honest. */\n raw: Record<string, number>\n /** Per-judge / per-dim breakdown. Consumers writing ensemble\n * judgements populate this; substrate primitives like\n * `interRaterReliability` and `corpusInterRaterAgreement` accept\n * these records as input. Optional — single-judge or scalar-only\n * runs leave it unset. */\n judgeScores?: JudgeScoresRecord\n}\n\n/**\n * Mandatory paper-grade fields for a single evaluation run. Optional\n * fields are extension points; mandatory fields throw if missing.\n *\n * Hash discipline:\n * - `promptHash` is the sha256 of the EFFECTIVE prompt sent to the\n * model (after any steering bundle merge).\n * - `configHash` is the sha256 of the effective run config (model,\n * temperature, tools, judges, splits). The pair (promptHash,\n * configHash) uniquely identifies an experimental cell.\n *\n * Model snapshot discipline:\n * - `model` MUST encode a snapshot version. Bare aliases like\n * `claude-sonnet-4` or `gpt-4o` are banned — they remap silently.\n * Use `claude-sonnet-4-6@2025-04-15` or `gpt-4o-2024-11-20`.\n */\nexport interface RunRecord {\n /** UUID for the run. */\n runId: string\n /** Logical experiment grouping (a treatment vs a baseline within\n * the same sweep should share `experimentId`). */\n experimentId: string\n /** Stable identifier for the candidate (variant) being run. The\n * promotion gate compares two `candidateId`s on matched items. */\n candidateId: string\n /** RNG seed for the run. Always recorded — silent re-seeding is\n * the most common cause of non-reproducible numbers. */\n seed: number\n /** Model identifier WITH snapshot version. */\n model: string\n /** sha256 of the effective prompt (post-steering). */\n promptHash: string\n /** sha256 of the effective config. */\n configHash: string\n /** Git SHA the harness was run from. */\n commitSha: string\n /** End-to-end wall-clock duration in milliseconds. */\n wallMs: number\n /** Time spent queued before execution started, if known. */\n queueMs?: number\n /** Total USD cost. Mandatory — runs without a cost number are\n * unbounded by definition and must not be admitted into the gate. */\n costUsd: number\n /** Token usage breakdown. */\n tokenUsage: RunTokenUsage\n /** Judge-side metadata, if a judge was used. */\n judgeMetadata?: RunJudgeMetadata\n /** Per-split scores + raw bag. */\n outcome: RunOutcome\n /** Canonical, cross-agent failure class drawn from the shared\n * `FAILURE_CLASSES` taxonomy. This is the aggregation key that makes\n * \"which failure dominates across the whole fleet\" answerable in ONE\n * vocabulary — every agent classifies against the same enum. Producers\n * set it via the substrate classifier; leave unset only when the failure\n * genuinely can't be classified. */\n failureClass?: FailureClass\n /** Free-form domain-specific failure detail, scoped UNDER `failureClass`\n * (e.g. failureClass='tool_recovery_failure', failureMode='forge_build_unsatisfied').\n * The within-agent drill-down; `failureClass` is the cross-agent key. */\n failureMode?: string\n /** Which split this run was drawn from. */\n splitTag: RunSplitTag\n /**\n * Stable scenario identifier the run was scored against. Optional for\n * backwards compatibility, but **strongly recommended**: every primitive\n * that pairs runs by scenario (preferences, paired stats, BT tournament)\n * keys on this. The campaign artifact populates it canonically; legacy\n * runs without it fall back to inference from `outcome.raw.scenario_id`\n * or `experimentId`.\n */\n scenarioId?: string\n /**\n * Canonical identity for the agent profile cell that produced this row:\n * profile artifact hash plus optional harness/model/prompt/reporting\n * dimensions. Use `agentProfile.cellId` to group persona sweeps and\n * longitudinal reports by the complete source profile, not by a loose\n * candidate label or opaque config hash.\n */\n agentProfile?: AgentProfileCell\n}\n\n// ── Validation ───────────────────────────────────────────────────────\n\nconst MANDATORY_TOP_LEVEL = [\n 'runId',\n 'experimentId',\n 'candidateId',\n 'seed',\n 'model',\n 'promptHash',\n 'configHash',\n 'commitSha',\n 'wallMs',\n 'costUsd',\n 'tokenUsage',\n 'outcome',\n 'splitTag',\n] as const\n\nconst SPLIT_TAGS: ReadonlyArray<RunSplitTag> = ['search', 'dev', 'holdout']\n\nexport class RunRecordValidationError extends ValidationError {\n readonly path: string\n constructor(message: string, path = '') {\n super(path ? `${message} (at ${path})` : message)\n this.path = path\n }\n}\n\n/**\n * Strict validator. Throws `RunRecordValidationError` on the first\n * missing or wrongly-typed field. Returns the input cast to\n * `RunRecord` on success — the validator does not coerce.\n */\nexport function validateRunRecord(input: unknown): RunRecord {\n if (input === null || typeof input !== 'object') {\n throw new RunRecordValidationError('expected object')\n }\n const obj = input as Record<string, unknown>\n\n for (const key of MANDATORY_TOP_LEVEL) {\n if (!(key in obj)) {\n throw new RunRecordValidationError(`missing mandatory field \"${key}\"`)\n }\n }\n\n expectString(obj.runId, 'runId')\n expectString(obj.experimentId, 'experimentId')\n expectString(obj.candidateId, 'candidateId')\n expectFiniteNumber(obj.seed, 'seed')\n expectString(obj.model, 'model')\n expectString(obj.promptHash, 'promptHash')\n expectString(obj.configHash, 'configHash')\n expectString(obj.commitSha, 'commitSha')\n expectFiniteNumber(obj.wallMs, 'wallMs')\n if (obj.queueMs !== undefined) expectFiniteNumber(obj.queueMs, 'queueMs')\n expectFiniteNumber(obj.costUsd, 'costUsd')\n\n // Snapshot discipline: bare model aliases are not paper-grade.\n if (!modelHasSnapshot(obj.model as string)) {\n throw new RunRecordValidationError(\n `model \"${obj.model}\" lacks a snapshot version (use 'name@YYYY-MM-DD' or 'name-YYYYMMDD')`,\n 'model',\n )\n }\n\n // Token usage.\n const tu = obj.tokenUsage\n if (tu === null || typeof tu !== 'object') {\n throw new RunRecordValidationError('tokenUsage must be an object', 'tokenUsage')\n }\n const tuRec = tu as Record<string, unknown>\n expectFiniteNumber(tuRec.input, 'tokenUsage.input')\n expectFiniteNumber(tuRec.output, 'tokenUsage.output')\n if (tuRec.cached !== undefined) expectFiniteNumber(tuRec.cached, 'tokenUsage.cached')\n\n // Judge metadata, optional.\n if (obj.judgeMetadata !== undefined) {\n const jm = obj.judgeMetadata\n if (jm === null || typeof jm !== 'object') {\n throw new RunRecordValidationError('judgeMetadata must be an object', 'judgeMetadata')\n }\n const jmRec = jm as Record<string, unknown>\n expectString(jmRec.model, 'judgeMetadata.model')\n expectString(jmRec.promptVersion, 'judgeMetadata.promptVersion')\n expectFiniteNumber(jmRec.confidence, 'judgeMetadata.confidence')\n if (typeof jmRec.fallback !== 'boolean') {\n throw new RunRecordValidationError(\n 'judgeMetadata.fallback must be boolean',\n 'judgeMetadata.fallback',\n )\n }\n }\n\n // Outcome.\n const out = obj.outcome\n if (out === null || typeof out !== 'object') {\n throw new RunRecordValidationError('outcome must be an object', 'outcome')\n }\n const outRec = out as Record<string, unknown>\n if (outRec.searchScore !== undefined)\n expectFiniteNumber(outRec.searchScore, 'outcome.searchScore')\n if (outRec.holdoutScore !== undefined)\n expectFiniteNumber(outRec.holdoutScore, 'outcome.holdoutScore')\n if (outRec.searchScore === undefined && outRec.holdoutScore === undefined) {\n throw new RunRecordValidationError(\n 'outcome must define searchScore or holdoutScore (or both)',\n 'outcome',\n )\n }\n const raw = outRec.raw\n if (raw === null || typeof raw !== 'object') {\n throw new RunRecordValidationError('outcome.raw must be an object', 'outcome.raw')\n }\n for (const [k, v] of Object.entries(raw as Record<string, unknown>)) {\n expectFiniteNumber(v, `outcome.raw.${k}`)\n }\n\n // Per-judge / per-dim breakdown, optional.\n if (outRec.judgeScores !== undefined) {\n validateJudgeScores(outRec.judgeScores, 'outcome.judgeScores')\n }\n\n // Failure mode optional.\n if (obj.failureMode !== undefined) expectString(obj.failureMode, 'failureMode')\n\n if (obj.agentProfile !== undefined) {\n try {\n const profile = validateAgentProfileCell(obj.agentProfile)\n if (profile.model !== undefined && profile.model !== obj.model) {\n throw new RunRecordValidationError(\n `agentProfile.model \"${profile.model}\" does not match model \"${obj.model}\"`,\n 'agentProfile.model',\n )\n }\n if (profile.promptHash !== undefined && profile.promptHash !== obj.promptHash) {\n throw new RunRecordValidationError(\n `agentProfile.promptHash \"${profile.promptHash}\" does not match promptHash \"${obj.promptHash}\"`,\n 'agentProfile.promptHash',\n )\n }\n } catch (error) {\n if (error instanceof RunRecordValidationError) throw error\n if (error instanceof Error) {\n throw new RunRecordValidationError(error.message, 'agentProfile')\n }\n throw error\n }\n }\n\n // Split tag.\n if (typeof obj.splitTag !== 'string' || !SPLIT_TAGS.includes(obj.splitTag as RunSplitTag)) {\n throw new RunRecordValidationError(\n `splitTag must be one of ${SPLIT_TAGS.join(', ')}, got ${String(obj.splitTag)}`,\n 'splitTag',\n )\n }\n\n return input as RunRecord\n}\n\n/** Boolean validator — convenience for filtering arrays. */\nexport function isRunRecord(input: unknown): input is RunRecord {\n try {\n validateRunRecord(input)\n return true\n } catch {\n return false\n }\n}\n\n/** Non-throwing validator — returns a discriminated union. */\nexport function parseRunRecordSafe(\n input: unknown,\n): { ok: true; value: RunRecord } | { ok: false; error: RunRecordValidationError } {\n try {\n return { ok: true, value: validateRunRecord(input) }\n } catch (e) {\n if (e instanceof RunRecordValidationError) return { ok: false, error: e }\n throw e\n }\n}\n\n/** Round-trip helper — `JSON.parse(JSON.stringify(record))` then validate. */\nexport function roundTripRunRecord(record: RunRecord): RunRecord {\n const json = JSON.stringify(record)\n return validateRunRecord(JSON.parse(json))\n}\n\n// ── Internals ────────────────────────────────────────────────────────\n\nfunction expectString(value: unknown, path: string): void {\n if (typeof value !== 'string' || value.length === 0) {\n throw new RunRecordValidationError(`expected non-empty string`, path)\n }\n}\n\nfunction expectFiniteNumber(value: unknown, path: string): void {\n if (typeof value !== 'number' || !Number.isFinite(value)) {\n throw new RunRecordValidationError(`expected finite number`, path)\n }\n}\n\nfunction validateJudgeScores(value: unknown, path: string): void {\n if (value === null || typeof value !== 'object') {\n throw new RunRecordValidationError('judgeScores must be an object', path)\n }\n const rec = value as Record<string, unknown>\n\n const perJudge = rec.perJudge\n if (perJudge === null || typeof perJudge !== 'object') {\n throw new RunRecordValidationError('perJudge must be an object', `${path}.perJudge`)\n }\n for (const [judgeId, dims] of Object.entries(perJudge as Record<string, unknown>)) {\n if (dims === null || typeof dims !== 'object') {\n throw new RunRecordValidationError(\n 'per-judge entry must be an object of dimension scores',\n `${path}.perJudge.${judgeId}`,\n )\n }\n for (const [dim, score] of Object.entries(dims as Record<string, unknown>)) {\n expectFiniteNumber(score, `${path}.perJudge.${judgeId}.${dim}`)\n }\n }\n\n const perDimMean = rec.perDimMean\n if (perDimMean === null || typeof perDimMean !== 'object') {\n throw new RunRecordValidationError('perDimMean must be an object', `${path}.perDimMean`)\n }\n for (const [dim, mean] of Object.entries(perDimMean as Record<string, unknown>)) {\n expectFiniteNumber(mean, `${path}.perDimMean.${dim}`)\n }\n\n expectFiniteNumber(rec.composite, `${path}.composite`)\n\n if (rec.failedJudges !== undefined) {\n if (!Array.isArray(rec.failedJudges)) {\n throw new RunRecordValidationError(\n 'failedJudges must be an array of strings',\n `${path}.failedJudges`,\n )\n }\n for (let i = 0; i < rec.failedJudges.length; i++) {\n const id = rec.failedJudges[i]\n if (typeof id !== 'string' || id.length === 0) {\n throw new RunRecordValidationError(\n 'failedJudges entry must be a non-empty string',\n `${path}.failedJudges[${i}]`,\n )\n }\n }\n }\n\n if (rec.notes !== undefined && typeof rec.notes !== 'string') {\n throw new RunRecordValidationError('notes must be a string', `${path}.notes`)\n }\n}\n\n/**\n * Heuristic snapshot check. Accepts:\n * - `name@YYYY-MM-DD` (Anthropic style: `claude-sonnet-4-6@2025-04-15`)\n * - `name-YYYYMMDD` (OpenAI style: `gpt-4o-2024-11-20`)\n * - `name@<arbitrary-token>` (allow opaque snapshots like `@v3`)\n * - explicit `:date-...` Vertex-style tags\n *\n * Rejects bare aliases like `claude-sonnet-4` or `gpt-4o` that remap\n * silently as providers ship new snapshots.\n */\nfunction modelHasSnapshot(model: string): boolean {\n if (model.includes('@')) return true\n if (/-\\d{8}$/.test(model)) return true\n if (/-\\d{4}-\\d{2}-\\d{2}$/.test(model)) return true\n if (/:date-/.test(model)) return true\n return false\n}\n"],"mappings":";;;;;;;;AAwDO,IAAM,kCAAN,cAA8C,gBAAgB;AAAA,EAC1D;AAAA,EACT,YAAY,SAAiB,OAAO,IAAI;AACtC,UAAM,OAAO,GAAG,OAAO,QAAQ,IAAI,MAAM,OAAO;AAChD,SAAK,OAAO;AAAA,EACd;AACF;AAEA,IAAM,aAAa;AACnB,IAAM,UAAU;AAEhB,eAAsB,sBACpB,OAC2B;AAC3B,QAAM,WAAW,MAAM,+BAA+B,KAAK;AAC3D,QAAM,SAAS,6BAA6B,MAAM,SAAS,QAAQ,CAAC;AACpE,SAAO,EAAE,GAAG,UAAU,OAAO;AAC/B;AAEO,SAAS,6BACd,MACkC;AAClC,QAAM,EAAE,QAAQ,SAAS,GAAG,SAAS,IAAI;AACzC,OAAK;AACL,SAAO,0BAA0B,QAAQ;AAC3C;AAEA,eAAsB,uBAAuB,MAA0C;AACrF,2BAAyB,IAAI;AAC7B,SACE,KAAK,WACL,6BAA6B,MAAM,SAAS,6BAA6B,IAAI,CAAC,CAAC;AAEnF;AAEO,SAAS,yBAAyB,OAAkC;AACzE,MAAI,UAAU,QAAQ,OAAO,UAAU,UAAU;AAC/C,UAAM,IAAI,gCAAgC,iBAAiB;AAAA,EAC7D;AACA,QAAM,MAAM;AACZ,gBAAc,IAAI,eAAe,yBAAyB,eAAe;AACzE,MAAI,OAAO,IAAI,WAAW,YAAY,CAAC,QAAQ,KAAK,IAAI,MAAM,GAAG;AAC/D,UAAM,IAAI;AAAA,MACR;AAAA,MACA;AAAA,IACF;AAAA,EACF;AACA,eAAa,IAAI,WAAW,WAAW;AACvC,iBAAe,IAAI,eAAe,eAAe;AACjD,MAAI,IAAI,YAAY,OAAW,iBAAgB,IAAI,SAAS,SAAS;AACrE,MAAI,IAAI,UAAU,OAAW,cAAa,IAAI,OAAO,OAAO;AAC5D,MAAI,IAAI,eAAe,OAAW,cAAa,IAAI,YAAY,YAAY;AAC3E,MAAI,IAAI,eAAe,OAAW,oBAAmB,IAAI,YAAY,YAAY;AACjF,SAAO;AACT;AAEO,SAAS,wBAAwB,QAGnB;AACnB,MAAI,CAAC,OAAO,cAAc;AACxB,UAAM,IAAI;AAAA,MACR,QAAQ,OAAO,KAAK;AAAA,MACpB;AAAA,IACF;AAAA,EACF;AACA,SAAO,yBAAyB,OAAO,YAAY;AACrD;AAEO,SAAS,oBAAoB,QAGzB;AACT,SAAO,wBAAwB,MAAM,EAAE;AACzC;AAEA,eAAsB,0BAA0B,QAKlB;AAC5B,QAAM,UAAU,wBAAwB,MAAM;AAC9C,MAAI,CAAE,MAAM,uBAAuB,OAAO,GAAI;AAC5C,UAAM,IAAI;AAAA,MACR,QAAQ,OAAO,KAAK;AAAA,MACpB;AAAA,IACF;AAAA,EACF;AACA,MAAI,QAAQ,UAAU,UAAa,QAAQ,UAAU,OAAO,OAAO;AACjE,UAAM,IAAI;AAAA,MACR,QAAQ,OAAO,KAAK,yBAAyB,QAAQ,KAAK,2BAA2B,OAAO,KAAK;AAAA,MACjG;AAAA,IACF;AAAA,EACF;AACA,MAAI,QAAQ,eAAe,UAAa,QAAQ,eAAe,OAAO,YAAY;AAChF,UAAM,IAAI;AAAA,MACR,QAAQ,OAAO,KAAK,8BAA8B,QAAQ,UAAU,gCAAgC,OAAO,UAAU;AAAA,MACrH;AAAA,IACF;AAAA,EACF;AACA,SAAO;AACT;AAEO,SAAS,4BAEd,SAAyC;AACzC,QAAM,SAAS,oBAAI,IAAiB;AACpC,aAAW,UAAU,SAAS;AAC5B,UAAM,MAAM,oBAAoB,MAAM;AACtC,UAAM,SAAS,OAAO,IAAI,GAAG;AAC7B,QAAI,OAAQ,QAAO,KAAK,MAAM;AAAA,QACzB,QAAO,IAAI,KAAK,CAAC,MAAM,CAAC;AAAA,EAC/B;AACA,SAAO;AACT;AAEA,eAAe,+BACb,OAC2C;AAC3C,SAAO,0BAA0B;AAAA,IAC/B,eAAe;AAAA,IACf,WAAW,MAAM;AAAA,IACjB,eAAe,MAAM,qBAAqB,MAAM,aAAa;AAAA,IAC7D,SAAS,MAAM;AAAA,IACf,OAAO,MAAM;AAAA,IACb,YAAY,MAAM;AAAA,IAClB,YAAY,MAAM;AAAA,EACpB,CAAC;AACH;AAEA,SAAS,0BACP,OACkC;AAClC,SAAO,cAAc;AAAA,IACnB,eAAe;AAAA,IACf,WAAW,gBAAgB,MAAM,WAAW,WAAW;AAAA,IACvD,eAAe,gBAAgB,MAAM,aAAa;AAAA,IAClD,SAAS,MAAM,UAAU,iBAAiB,MAAM,SAAS,SAAS,IAAI;AAAA,IACtE,OAAO,iBAAiB,MAAM,OAAO,OAAO;AAAA,IAC5C,YAAY,iBAAiB,MAAM,YAAY,YAAY;AAAA,IAC3D,YAAY,MAAM,aACd,eAAe,oBAAoB,MAAM,UAAU,CAAC,IACpD;AAAA,EACN,CAAC;AACH;AAEA,eAAe,qBAAqB,OAA6D;AAC/F,QAAM,OAAO,gBAAgB,MAAM,MAAM,oBAAoB;AAC7D,MAAI,MAAM,SAAS,UAAa,MAAM,YAAY,QAAW;AAC3D,UAAM,IAAI;AAAA,MACR;AAAA,MACA;AAAA,IACF;AAAA,EACF;AACA,MAAI,MAAM,SAAS,QAAW;AAC5B,WAAO,EAAE,MAAM,MAAM,iBAAiB,MAAM,MAAM,oBAAoB,EAAE;AAAA,EAC1E;AACA,MAAI,MAAM,YAAY,QAAW;AAC/B,UAAM,IAAI;AAAA,MACR;AAAA,MACA;AAAA,IACF;AAAA,EACF;AACA,aAAW,MAAM,SAAS,uBAAuB;AACjD,SAAO,EAAE,MAAM,MAAM,MAAM,SAAS,MAAM,OAAO,EAAE;AACrD;AAEA,SAAS,gBAAgB,OAA+C;AACtE,SAAO;AAAA,IACL,MAAM,gBAAgB,MAAM,MAAM,oBAAoB;AAAA,IACtD,MAAM,iBAAiB,MAAM,MAAM,oBAAoB;AAAA,EACzD;AACF;AAEA,SAAS,iBAAiB,OAA4B,MAAmC;AACvF,SAAO,cAAc;AAAA,IACnB,IAAI,gBAAgB,MAAM,IAAI,GAAG,IAAI,KAAK;AAAA,IAC1C,SAAS,iBAAiB,MAAM,SAAS,GAAG,IAAI,UAAU;AAAA,IAC1D,MAAM,iBAAiB,MAAM,MAAM,GAAG,IAAI,OAAO;AAAA,EACnD,CAAC;AACH;AAEA,SAAS,oBACP,OAC4C;AAC5C,QAAM,MAAkD,CAAC;AACzD,aAAW,OAAO,OAAO,KAAK,KAAK,EAAE,KAAK,GAAG;AAC3C,UAAM,QAAQ,MAAM,GAAG;AACvB,oBAAgB,KAAK,kBAAkB;AACvC,QACE,UAAU,QACV,OAAO,UAAU,YACjB,OAAO,UAAU,YACjB,OAAO,UAAU,WACjB;AACA,YAAM,IAAI;AAAA,QACR;AAAA,QACA,cAAc,GAAG;AAAA,MACnB;AAAA,IACF;AACA,QAAI,OAAO,UAAU,YAAY,CAAC,OAAO,SAAS,KAAK,GAAG;AACxD,YAAM,IAAI,gCAAgC,0BAA0B,cAAc,GAAG,EAAE;AAAA,IACzF;AACA,QAAI,GAAG,IAAI;AAAA,EACb;AACA,SAAO;AACT;AAEA,SAAS,cAAiD,OAAa;AACrE,QAAM,MAA+B,CAAC;AACtC,aAAW,CAAC,KAAK,KAAK,KAAK,OAAO,QAAQ,KAAK,GAAG;AAChD,QAAI,UAAU,OAAW,KAAI,GAAG,IAAI;AAAA,EACtC;AACA,SAAO;AACT;AAEA,SAAS,eAAkD,OAAyB;AAClF,SAAO,OAAO,KAAK,KAAK,EAAE,SAAS,IAAI,QAAQ;AACjD;AAEA,SAAS,eAAe,OAAgB,MAAoB;AAC1D,MAAI,UAAU,QAAQ,OAAO,UAAU,YAAY,MAAM,QAAQ,KAAK,GAAG;AACvE,UAAM,IAAI,gCAAgC,mBAAmB,IAAI;AAAA,EACnE;AACA,QAAM,MAAM;AACZ,eAAa,IAAI,MAAM,GAAG,IAAI,OAAO;AACrC,mBAAiB,IAAI,MAAM,GAAG,IAAI,OAAO;AAC3C;AAEA,SAAS,gBAAgB,OAAgB,MAAoB;AAC3D,MAAI,UAAU,QAAQ,OAAO,UAAU,YAAY,MAAM,QAAQ,KAAK,GAAG;AACvE,UAAM,IAAI,gCAAgC,mBAAmB,IAAI;AAAA,EACnE;AACA,QAAM,MAAM;AACZ,eAAa,IAAI,IAAI,GAAG,IAAI,KAAK;AACjC,MAAI,IAAI,YAAY,OAAW,cAAa,IAAI,SAAS,GAAG,IAAI,UAAU;AAC1E,MAAI,IAAI,SAAS,OAAW,cAAa,IAAI,MAAM,GAAG,IAAI,OAAO;AACnE;AAEA,SAAS,mBAAmB,OAAgB,MAAoB;AAC9D,MAAI,UAAU,QAAQ,OAAO,UAAU,YAAY,MAAM,QAAQ,KAAK,GAAG;AACvE,UAAM,IAAI,gCAAgC,mBAAmB,IAAI;AAAA,EACnE;AACA,sBAAoB,KAAmD;AACzE;AAEA,SAAS,WAAW,OAAyB,MAAoB;AAC/D,MAAI,UAAU,KAAM;AACpB,QAAM,OAAO,OAAO;AACpB,MAAI,SAAS,YAAY,SAAS,UAAW;AAC7C,MAAI,SAAS,UAAU;AACrB,QAAI,CAAC,OAAO,SAAS,KAAK,GAAG;AAC3B,YAAM,IAAI,gCAAgC,0BAA0B,IAAI;AAAA,IAC1E;AACA;AAAA,EACF;AACA,MAAI,MAAM,QAAQ,KAAK,GAAG;AACxB,UAAM,QAAQ,CAAC,MAAM,UAAU;AAC7B,iBAAW,MAAM,GAAG,IAAI,IAAI,KAAK,GAAG;AAAA,IACtC,CAAC;AACD;AAAA,EACF;AACA,MAAI,SAAS,UAAU;AACrB,eAAW,CAAC,KAAK,MAAM,KAAK,OAAO,QAAQ,KAAK,GAAG;AACjD,sBAAgB,KAAK,GAAG,IAAI,QAAQ;AACpC,iBAAW,QAAQ,GAAG,IAAI,IAAI,GAAG,EAAE;AAAA,IACrC;AACA;AAAA,EACF;AACA,QAAM,IAAI,gCAAgC,kCAAkC,IAAI;AAClF;AAEA,SAAS,cAAc,OAAgB,UAAkB,MAAoB;AAC3E,MAAI,UAAU,UAAU;AACtB,UAAM,IAAI,gCAAgC,YAAY,QAAQ,IAAI,IAAI;AAAA,EACxE;AACF;AAEA,SAAS,aAAa,OAAgB,MAAoB;AACxD,MAAI,OAAO,UAAU,YAAY,MAAM,WAAW,GAAG;AACnD,UAAM,IAAI,gCAAgC,6BAA6B,IAAI;AAAA,EAC7E;AACF;AAEA,SAAS,gBAAgB,OAAe,MAAsB;AAC5D,MAAI,OAAO,UAAU,YAAY,MAAM,WAAW,GAAG;AACnD,UAAM,IAAI,gCAAgC,6BAA6B,IAAI;AAAA,EAC7E;AACA,SAAO;AACT;AAEA,SAAS,iBAAiB,OAA2B,MAAkC;AACrF,MAAI,UAAU,OAAW,QAAO;AAChC,SAAO,gBAAgB,OAAO,IAAI;AACpC;AAEA,SAAS,iBAAiB,OAAgB,MAAsB;AAC9D,MAAI,OAAO,UAAU,YAAY,CAAC,WAAW,KAAK,KAAK,GAAG;AACxD,UAAM,IAAI,gCAAgC,0CAA0C,IAAI;AAAA,EAC1F;AACA,SAAO;AACT;AAuBO,IAAM,sBAAsB;AAAA;AAAA;AAAA;AAAA,EAIjC,uBAAuB;AACzB;AAQO,SAAS,mBAAmB,OAAkC;AACnE,MAAI;AACJ,MAAI;AACF,iBAAa,KAAK,UAAU,KAAK;AAAA,EACnC,SAAS,KAAK;AACZ,UAAM,IAAI;AAAA,MACR,4CAA4C,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,MAC5F;AAAA,IACF;AAAA,EACF;AACA,MAAI,eAAe,QAAW;AAC5B,UAAM,IAAI;AAAA,MACR;AAAA,MACA;AAAA,IACF;AAAA,EACF;AACA,SAAO,KAAK,MAAM,UAAU;AAC9B;AAsBA,eAAsB,6BACpB,SACA,OAC2B;AAC3B,MAAI,CAAC,WAAW,OAAO,YAAY,UAAU;AAC3C,UAAM,IAAI,gCAAgC,0CAA0C,SAAS;AAAA,EAC/F;AACA,MAAI,OAAO,QAAQ,SAAS,YAAY,QAAQ,KAAK,WAAW,GAAG;AACjE,UAAM,IAAI;AAAA,MACR;AAAA,MACA;AAAA,IACF;AAAA,EACF;AACA,MAAI,OAAO,QAAQ,YAAY,YAAY,QAAQ,QAAQ,WAAW,GAAG;AACvE,UAAM,IAAI;AAAA,MACR;AAAA,MACA;AAAA,IACF;AAAA,EACF;AACA,SAAO,sBAAsB;AAAA,IAC3B,GAAG;AAAA,IACH,WAAW,GAAG,QAAQ,IAAI,IAAI,QAAQ,OAAO;AAAA,IAC7C,eAAe;AAAA,MACb,MAAM,oBAAoB;AAAA,MAC1B,SAAS,mBAAmB,OAAO;AAAA,IACrC;AAAA,EACF,CAAC;AACH;;;AC/QA,IAAM,sBAAsB;AAAA,EAC1B;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAEA,IAAM,aAAyC,CAAC,UAAU,OAAO,SAAS;AAEnE,IAAM,2BAAN,cAAuC,gBAAgB;AAAA,EACnD;AAAA,EACT,YAAY,SAAiB,OAAO,IAAI;AACtC,UAAM,OAAO,GAAG,OAAO,QAAQ,IAAI,MAAM,OAAO;AAChD,SAAK,OAAO;AAAA,EACd;AACF;AAOO,SAAS,kBAAkB,OAA2B;AAC3D,MAAI,UAAU,QAAQ,OAAO,UAAU,UAAU;AAC/C,UAAM,IAAI,yBAAyB,iBAAiB;AAAA,EACtD;AACA,QAAM,MAAM;AAEZ,aAAW,OAAO,qBAAqB;AACrC,QAAI,EAAE,OAAO,MAAM;AACjB,YAAM,IAAI,yBAAyB,4BAA4B,GAAG,GAAG;AAAA,IACvE;AAAA,EACF;AAEA,EAAAA,cAAa,IAAI,OAAO,OAAO;AAC/B,EAAAA,cAAa,IAAI,cAAc,cAAc;AAC7C,EAAAA,cAAa,IAAI,aAAa,aAAa;AAC3C,qBAAmB,IAAI,MAAM,MAAM;AACnC,EAAAA,cAAa,IAAI,OAAO,OAAO;AAC/B,EAAAA,cAAa,IAAI,YAAY,YAAY;AACzC,EAAAA,cAAa,IAAI,YAAY,YAAY;AACzC,EAAAA,cAAa,IAAI,WAAW,WAAW;AACvC,qBAAmB,IAAI,QAAQ,QAAQ;AACvC,MAAI,IAAI,YAAY,OAAW,oBAAmB,IAAI,SAAS,SAAS;AACxE,qBAAmB,IAAI,SAAS,SAAS;AAGzC,MAAI,CAAC,iBAAiB,IAAI,KAAe,GAAG;AAC1C,UAAM,IAAI;AAAA,MACR,UAAU,IAAI,KAAK;AAAA,MACnB;AAAA,IACF;AAAA,EACF;AAGA,QAAM,KAAK,IAAI;AACf,MAAI,OAAO,QAAQ,OAAO,OAAO,UAAU;AACzC,UAAM,IAAI,yBAAyB,gCAAgC,YAAY;AAAA,EACjF;AACA,QAAM,QAAQ;AACd,qBAAmB,MAAM,OAAO,kBAAkB;AAClD,qBAAmB,MAAM,QAAQ,mBAAmB;AACpD,MAAI,MAAM,WAAW,OAAW,oBAAmB,MAAM,QAAQ,mBAAmB;AAGpF,MAAI,IAAI,kBAAkB,QAAW;AACnC,UAAM,KAAK,IAAI;AACf,QAAI,OAAO,QAAQ,OAAO,OAAO,UAAU;AACzC,YAAM,IAAI,yBAAyB,mCAAmC,eAAe;AAAA,IACvF;AACA,UAAM,QAAQ;AACd,IAAAA,cAAa,MAAM,OAAO,qBAAqB;AAC/C,IAAAA,cAAa,MAAM,eAAe,6BAA6B;AAC/D,uBAAmB,MAAM,YAAY,0BAA0B;AAC/D,QAAI,OAAO,MAAM,aAAa,WAAW;AACvC,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAGA,QAAM,MAAM,IAAI;AAChB,MAAI,QAAQ,QAAQ,OAAO,QAAQ,UAAU;AAC3C,UAAM,IAAI,yBAAyB,6BAA6B,SAAS;AAAA,EAC3E;AACA,QAAM,SAAS;AACf,MAAI,OAAO,gBAAgB;AACzB,uBAAmB,OAAO,aAAa,qBAAqB;AAC9D,MAAI,OAAO,iBAAiB;AAC1B,uBAAmB,OAAO,cAAc,sBAAsB;AAChE,MAAI,OAAO,gBAAgB,UAAa,OAAO,iBAAiB,QAAW;AACzE,UAAM,IAAI;AAAA,MACR;AAAA,MACA;AAAA,IACF;AAAA,EACF;AACA,QAAM,MAAM,OAAO;AACnB,MAAI,QAAQ,QAAQ,OAAO,QAAQ,UAAU;AAC3C,UAAM,IAAI,yBAAyB,iCAAiC,aAAa;AAAA,EACnF;AACA,aAAW,CAAC,GAAG,CAAC,KAAK,OAAO,QAAQ,GAA8B,GAAG;AACnE,uBAAmB,GAAG,eAAe,CAAC,EAAE;AAAA,EAC1C;AAGA,MAAI,OAAO,gBAAgB,QAAW;AACpC,wBAAoB,OAAO,aAAa,qBAAqB;AAAA,EAC/D;AAGA,MAAI,IAAI,gBAAgB,OAAW,CAAAA,cAAa,IAAI,aAAa,aAAa;AAE9E,MAAI,IAAI,iBAAiB,QAAW;AAClC,QAAI;AACF,YAAM,UAAU,yBAAyB,IAAI,YAAY;AACzD,UAAI,QAAQ,UAAU,UAAa,QAAQ,UAAU,IAAI,OAAO;AAC9D,cAAM,IAAI;AAAA,UACR,uBAAuB,QAAQ,KAAK,2BAA2B,IAAI,KAAK;AAAA,UACxE;AAAA,QACF;AAAA,MACF;AACA,UAAI,QAAQ,eAAe,UAAa,QAAQ,eAAe,IAAI,YAAY;AAC7E,cAAM,IAAI;AAAA,UACR,4BAA4B,QAAQ,UAAU,gCAAgC,IAAI,UAAU;AAAA,UAC5F;AAAA,QACF;AAAA,MACF;AAAA,IACF,SAAS,OAAO;AACd,UAAI,iBAAiB,yBAA0B,OAAM;AACrD,UAAI,iBAAiB,OAAO;AAC1B,cAAM,IAAI,yBAAyB,MAAM,SAAS,cAAc;AAAA,MAClE;AACA,YAAM;AAAA,IACR;AAAA,EACF;AAGA,MAAI,OAAO,IAAI,aAAa,YAAY,CAAC,WAAW,SAAS,IAAI,QAAuB,GAAG;AACzF,UAAM,IAAI;AAAA,MACR,2BAA2B,WAAW,KAAK,IAAI,CAAC,SAAS,OAAO,IAAI,QAAQ,CAAC;AAAA,MAC7E;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AACT;AAGO,SAAS,YAAY,OAAoC;AAC9D,MAAI;AACF,sBAAkB,KAAK;AACvB,WAAO;AAAA,EACT,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAGO,SAAS,mBACd,OACiF;AACjF,MAAI;AACF,WAAO,EAAE,IAAI,MAAM,OAAO,kBAAkB,KAAK,EAAE;AAAA,EACrD,SAAS,GAAG;AACV,QAAI,aAAa,yBAA0B,QAAO,EAAE,IAAI,OAAO,OAAO,EAAE;AACxE,UAAM;AAAA,EACR;AACF;AAGO,SAAS,mBAAmB,QAA8B;AAC/D,QAAM,OAAO,KAAK,UAAU,MAAM;AAClC,SAAO,kBAAkB,KAAK,MAAM,IAAI,CAAC;AAC3C;AAIA,SAASA,cAAa,OAAgB,MAAoB;AACxD,MAAI,OAAO,UAAU,YAAY,MAAM,WAAW,GAAG;AACnD,UAAM,IAAI,yBAAyB,6BAA6B,IAAI;AAAA,EACtE;AACF;AAEA,SAAS,mBAAmB,OAAgB,MAAoB;AAC9D,MAAI,OAAO,UAAU,YAAY,CAAC,OAAO,SAAS,KAAK,GAAG;AACxD,UAAM,IAAI,yBAAyB,0BAA0B,IAAI;AAAA,EACnE;AACF;AAEA,SAAS,oBAAoB,OAAgB,MAAoB;AAC/D,MAAI,UAAU,QAAQ,OAAO,UAAU,UAAU;AAC/C,UAAM,IAAI,yBAAyB,iCAAiC,IAAI;AAAA,EAC1E;AACA,QAAM,MAAM;AAEZ,QAAM,WAAW,IAAI;AACrB,MAAI,aAAa,QAAQ,OAAO,aAAa,UAAU;AACrD,UAAM,IAAI,yBAAyB,8BAA8B,GAAG,IAAI,WAAW;AAAA,EACrF;AACA,aAAW,CAAC,SAAS,IAAI,KAAK,OAAO,QAAQ,QAAmC,GAAG;AACjF,QAAI,SAAS,QAAQ,OAAO,SAAS,UAAU;AAC7C,YAAM,IAAI;AAAA,QACR;AAAA,QACA,GAAG,IAAI,aAAa,OAAO;AAAA,MAC7B;AAAA,IACF;AACA,eAAW,CAAC,KAAK,KAAK,KAAK,OAAO,QAAQ,IAA+B,GAAG;AAC1E,yBAAmB,OAAO,GAAG,IAAI,aAAa,OAAO,IAAI,GAAG,EAAE;AAAA,IAChE;AAAA,EACF;AAEA,QAAM,aAAa,IAAI;AACvB,MAAI,eAAe,QAAQ,OAAO,eAAe,UAAU;AACzD,UAAM,IAAI,yBAAyB,gCAAgC,GAAG,IAAI,aAAa;AAAA,EACzF;AACA,aAAW,CAAC,KAAK,IAAI,KAAK,OAAO,QAAQ,UAAqC,GAAG;AAC/E,uBAAmB,MAAM,GAAG,IAAI,eAAe,GAAG,EAAE;AAAA,EACtD;AAEA,qBAAmB,IAAI,WAAW,GAAG,IAAI,YAAY;AAErD,MAAI,IAAI,iBAAiB,QAAW;AAClC,QAAI,CAAC,MAAM,QAAQ,IAAI,YAAY,GAAG;AACpC,YAAM,IAAI;AAAA,QACR;AAAA,QACA,GAAG,IAAI;AAAA,MACT;AAAA,IACF;AACA,aAAS,IAAI,GAAG,IAAI,IAAI,aAAa,QAAQ,KAAK;AAChD,YAAM,KAAK,IAAI,aAAa,CAAC;AAC7B,UAAI,OAAO,OAAO,YAAY,GAAG,WAAW,GAAG;AAC7C,cAAM,IAAI;AAAA,UACR;AAAA,UACA,GAAG,IAAI,iBAAiB,CAAC;AAAA,QAC3B;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,MAAI,IAAI,UAAU,UAAa,OAAO,IAAI,UAAU,UAAU;AAC5D,UAAM,IAAI,yBAAyB,0BAA0B,GAAG,IAAI,QAAQ;AAAA,EAC9E;AACF;AAYA,SAAS,iBAAiB,OAAwB;AAChD,MAAI,MAAM,SAAS,GAAG,EAAG,QAAO;AAChC,MAAI,UAAU,KAAK,KAAK,EAAG,QAAO;AAClC,MAAI,sBAAsB,KAAK,KAAK,EAAG,QAAO;AAC9C,MAAI,SAAS,KAAK,KAAK,EAAG,QAAO;AACjC,SAAO;AACT;","names":["expectString"]}
|
|
@@ -2,7 +2,7 @@ import {
|
|
|
2
2
|
buildAgentProfileCell,
|
|
3
3
|
validateRunRecord,
|
|
4
4
|
verifyAgentProfileCell
|
|
5
|
-
} from "./chunk-
|
|
5
|
+
} from "./chunk-NCK5QLGT.js";
|
|
6
6
|
import {
|
|
7
7
|
researchReport
|
|
8
8
|
} from "./chunk-EGIPWXHL.js";
|
|
@@ -328,4 +328,4 @@ function defaultRunId(params) {
|
|
|
328
328
|
export {
|
|
329
329
|
runEvalCampaign
|
|
330
330
|
};
|
|
331
|
-
//# sourceMappingURL=chunk-
|
|
331
|
+
//# sourceMappingURL=chunk-YXTT6GSZ.js.map
|
package/dist/contract/index.d.ts
CHANGED
|
@@ -1,20 +1,21 @@
|
|
|
1
|
-
import { S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, I as ImprovementDriver, G as Gate } from '../types-
|
|
2
|
-
export {
|
|
3
|
-
import { C as CampaignStorage, e as RunImprovementLoopResult } from '../run-improvement-loop-
|
|
4
|
-
export { D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, a as GepaDriverOptions, H as HeldOutGateOptions, R as RunCampaignOptions, c as RunEvalOptions, d as RunImprovementLoopOptions, h as composeGate, j as defaultProductionGate, k as evolutionaryDriver, m as fsCampaignStorage, n as gepaDriver, o as heldOutGate, p as inMemoryCampaignStorage, r as runCampaign, s as runEval, t as runImprovementLoop } from '../run-improvement-loop-
|
|
1
|
+
import { S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, I as ImprovementDriver, G as Gate } from '../types-BgrxOJSf.js';
|
|
2
|
+
export { g as CampaignAggregates, h as CampaignArtifactWriter, i as CampaignCellResult, j as CampaignCostMeter, k as CampaignResult, l as CampaignTraceWriter, C as CodeSurface, D as Dispatch, m as GateContext, n as GateDecision, o as GateResult, p as GenerationCandidate, q as GenerationRecord, s as JudgeDimension, J as JudgeScore, u as Mutator, O as OptimizerConfig, w as SessionScript } from '../types-BgrxOJSf.js';
|
|
3
|
+
import { C as CampaignStorage, e as RunImprovementLoopResult } from '../run-improvement-loop-BhfdjrMY.js';
|
|
4
|
+
export { D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, a as GepaDriverOptions, H as HeldOutGateOptions, R as RunCampaignOptions, c as RunEvalOptions, d as RunImprovementLoopOptions, h as composeGate, j as defaultProductionGate, k as evolutionaryDriver, m as fsCampaignStorage, n as gepaDriver, o as heldOutGate, p as inMemoryCampaignStorage, r as runCampaign, s as runEval, t as runImprovementLoop } from '../run-improvement-loop-BhfdjrMY.js';
|
|
5
5
|
export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore, O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
|
|
6
|
-
import { a as HostedTenant, I as InsightReport, T as TraceSpanEvent } from '../index-
|
|
7
|
-
export { F as FailureClusterInsight, b as InterRaterInsight, J as JudgeInsight, L as LiftInsight, O as OutcomeCorrelationInsight, R as Recommendation, c as ReleaseSummary, S as ScalarDistribution } from '../index-
|
|
8
|
-
import { A as AnalystRegistry } from '../registry-
|
|
6
|
+
import { a as HostedTenant, I as InsightReport, T as TraceSpanEvent } from '../index-D2nT6_KT.js';
|
|
7
|
+
export { F as FailureClusterInsight, b as InterRaterInsight, J as JudgeInsight, L as LiftInsight, O as OutcomeCorrelationInsight, R as Recommendation, c as ReleaseSummary, S as ScalarDistribution } from '../index-D2nT6_KT.js';
|
|
8
|
+
import { A as AnalystRegistry } from '../registry-BSWy0rvH.js';
|
|
9
9
|
import { a as DatasetScenario } from '../dataset-BlwAtYYf.js';
|
|
10
|
-
import { R as RunRecord, a as RunSplitTag } from '../run-record-
|
|
10
|
+
import { R as RunRecord, a as RunSplitTag } from '../run-record-etiCMsUq.js';
|
|
11
11
|
import '../llm-client-BXVRUZyX.js';
|
|
12
12
|
import '../errors-mje_cKOs.js';
|
|
13
13
|
import '../raw-provider-sink-C46HDghv.js';
|
|
14
|
-
import '../red-team-
|
|
15
|
-
import '../store-
|
|
16
|
-
import '../
|
|
17
|
-
import '../
|
|
14
|
+
import '../red-team-CrC5MZYd.js';
|
|
15
|
+
import '../store-CKUAgsJz.js';
|
|
16
|
+
import '../schema-m0gsnbt3.js';
|
|
17
|
+
import '../summary-report-DLxh4yWk.js';
|
|
18
|
+
import '../failure-cluster-CL7IVgkJ.js';
|
|
18
19
|
import '../judge-calibration-DilmB3Ml.js';
|
|
19
20
|
import '../store-CJbzDxZ2.js';
|
|
20
21
|
import '../types-DhqpAi_z.js';
|
package/dist/contract/index.js
CHANGED
|
@@ -75,6 +75,7 @@ async function analyzeRuns(opts) {
|
|
|
75
75
|
const interRater = opts.raterScores ? computeInterRater(opts.raterScores) : void 0;
|
|
76
76
|
const lift = computeLift(runs, opts.baselineCandidateId, opts.candidateCandidateId, split);
|
|
77
77
|
const failureClusters = opts.analyst ? await computeFailureClusters(runs, opts.analyst, split) : void 0;
|
|
78
|
+
const failureModes = computeFailureModes(runs);
|
|
78
79
|
const contamination = opts.canaryScenarios ? computeContamination(runs, opts.canaryScenarios) : void 0;
|
|
79
80
|
const outcomeCorrelation = opts.outcomeSignal ? computeOutcomeCorrelation(runs, opts.outcomeSignal, split) : void 0;
|
|
80
81
|
const release = buildReleaseScorecard(composite, lift, contamination);
|
|
@@ -85,6 +86,7 @@ async function analyzeRuns(opts) {
|
|
|
85
86
|
interRater,
|
|
86
87
|
lift,
|
|
87
88
|
failureClusters,
|
|
89
|
+
failureModes,
|
|
88
90
|
contamination,
|
|
89
91
|
outcomeCorrelation,
|
|
90
92
|
priorPeriodComparison,
|
|
@@ -102,10 +104,21 @@ async function analyzeRuns(opts) {
|
|
|
102
104
|
contamination,
|
|
103
105
|
outcomeCorrelation,
|
|
104
106
|
release,
|
|
107
|
+
...failureModes ? { failureModes } : {},
|
|
105
108
|
...priorPeriodComparison ? { priorPeriodComparison } : {},
|
|
106
109
|
recommendations
|
|
107
110
|
};
|
|
108
111
|
}
|
|
112
|
+
function computeFailureModes(runs) {
|
|
113
|
+
const counts = /* @__PURE__ */ new Map();
|
|
114
|
+
for (const r of runs) {
|
|
115
|
+
const key = r.failureClass ?? r.failureMode;
|
|
116
|
+
if (key) counts.set(key, (counts.get(key) ?? 0) + 1);
|
|
117
|
+
}
|
|
118
|
+
if (counts.size === 0) return void 0;
|
|
119
|
+
const n = runs.length;
|
|
120
|
+
return [...counts.entries()].map(([mode, count]) => ({ mode, count, share: n > 0 ? count / n : 0 })).sort((a, b) => b.count - a.count || a.mode.localeCompare(b.mode));
|
|
121
|
+
}
|
|
109
122
|
function computePriorPeriodComparison(current, baseline, split, windowLabel) {
|
|
110
123
|
if (current.length === 0 || baseline.length === 0) return void 0;
|
|
111
124
|
const metrics = {};
|
|
@@ -661,6 +674,18 @@ function buildRecommendations(ctx) {
|
|
|
661
674
|
});
|
|
662
675
|
}
|
|
663
676
|
}
|
|
677
|
+
if (ctx.failureModes && ctx.failureModes.length > 0) {
|
|
678
|
+
const top = ctx.failureModes[0];
|
|
679
|
+
if (top.count >= 3 && top.share >= 0.15) {
|
|
680
|
+
out.push({
|
|
681
|
+
priority: top.share >= 0.25 ? "high" : "medium",
|
|
682
|
+
kind: "investigate",
|
|
683
|
+
title: `'${top.mode}' is the dominant failure mode \u2014 ${top.count} runs (${(top.share * 100).toFixed(0)}% of the corpus)`,
|
|
684
|
+
detail: `The mean composite can look acceptable while one named failure dominates the lower tail. ${top.count} of ${ctx.composite.n} runs failed with '${top.mode}'${ctx.failureModes.length > 1 ? ` (next: '${ctx.failureModes[1].mode}' \xD7${ctx.failureModes[1].count})` : ""}. Fix this cause first.`,
|
|
685
|
+
evidencePath: "failureModes"
|
|
686
|
+
});
|
|
687
|
+
}
|
|
688
|
+
}
|
|
664
689
|
if (Object.keys(ctx.judges).length === 0 && ctx.composite.n > 0) {
|
|
665
690
|
out.push({
|
|
666
691
|
priority: "medium",
|