@tangle-network/agent-eval 0.59.1 → 0.61.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +21 -0
- package/dist/adapters/http.d.ts +1 -1
- package/dist/adapters/http.js +1 -1
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/langchain.js +1 -1
- package/dist/adapters/otel.d.ts +5 -5
- package/dist/adapters/otel.js +1 -1
- package/dist/agent-profile-9J9hxdm2.d.ts +114 -0
- package/dist/benchmarks/index.d.ts +3 -3
- package/dist/benchmarks/index.js +2 -2
- package/dist/builder-eval/index.js +3 -3
- package/dist/campaign/index.d.ts +153 -9
- package/dist/campaign/index.js +229 -23
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-QDOSODID.js → chunk-3B7Y5AUR.js} +2 -2
- package/dist/{chunk-QYJT52YW.js → chunk-3BFEG2F6.js} +1 -1
- package/dist/chunk-3BFEG2F6.js.map +1 -0
- package/dist/{chunk-J4DIMSRK.js → chunk-6EKXFFGQ.js} +2 -2
- package/dist/{chunk-MHQPVHXU.js → chunk-6QDKWHLS.js} +2 -2
- package/dist/{chunk-63EPZQUZ.js → chunk-6REHLN5J.js} +2 -2
- package/dist/{chunk-GM476SZU.js → chunk-AIWHLG7J.js} +5 -5
- package/dist/{chunk-AIXHUIHG.js → chunk-B26KI423.js} +3 -3
- package/dist/{chunk-NCK5QLGT.js → chunk-F3SRAAZO.js} +2 -2
- package/dist/{chunk-N4SBKEPJ.js → chunk-GMXHLSLL.js} +107 -2
- package/dist/chunk-GMXHLSLL.js.map +1 -0
- package/dist/{chunk-VXNVVBZO.js → chunk-IHDHUN2X.js} +2 -2
- package/dist/{chunk-S3SDD56V.js → chunk-ITBRCT73.js} +2 -2
- package/dist/{chunk-OLIBRKRD.js → chunk-KX6F6NCG.js} +2 -2
- package/dist/{chunk-74Y2EMNH.js → chunk-OLULBECP.js} +18 -6
- package/dist/chunk-OLULBECP.js.map +1 -0
- package/dist/chunk-PQV2TKC3.js +27 -0
- package/dist/chunk-PQV2TKC3.js.map +1 -0
- package/dist/chunk-PZ5AY32C.js +10 -0
- package/dist/{chunk-UBPIXOC4.js → chunk-SBCB6VZY.js} +2 -2
- package/dist/chunk-SHTXZ4O2.js +113 -0
- package/dist/chunk-SHTXZ4O2.js.map +1 -0
- package/dist/{chunk-JB4UWIM6.js → chunk-SUGME4OT.js} +266 -15
- package/dist/chunk-SUGME4OT.js.map +1 -0
- package/dist/{chunk-YTMXBHFM.js → chunk-T375SUOZ.js} +2 -2
- package/dist/{chunk-PIEAE33T.js → chunk-Z4ZCBC7M.js} +2 -2
- package/dist/cli.js +4 -4
- package/dist/contract/index.d.ts +48 -16
- package/dist/contract/index.js +59 -19
- package/dist/contract/index.js.map +1 -1
- package/dist/{control-DjEgwWNo.d.ts → control-Bf8owbuG.d.ts} +2 -2
- package/dist/control.d.ts +5 -5
- package/dist/control.js +4 -4
- package/dist/{dataset-BlwAtYYf.d.ts → dataset-B2kL-fSM.d.ts} +1 -1
- package/dist/{errors-mje_cKOs.d.ts → errors-Dwqw-T_m.d.ts} +1 -1
- package/dist/{feedback-trajectory-DpUmE90J.d.ts → feedback-trajectory-8hKC5EOb.d.ts} +1 -1
- package/dist/governance/index.d.ts +3 -3
- package/dist/governance/index.js +1 -1
- package/dist/hosted/index.d.ts +5 -5
- package/dist/hosted/index.js +1 -1
- package/dist/{index-wlaiph9Y.d.ts → index-Bvk35ils.d.ts} +1 -1
- package/dist/{index-D2nT6_KT.d.ts → index-D9dwa00f.d.ts} +2 -2
- package/dist/index.d.ts +24 -132
- package/dist/index.js +23 -36
- package/dist/index.js.map +1 -1
- package/dist/{integrity-CfXjSqEv.d.ts → integrity-CJzrpUua.d.ts} +1 -1
- package/dist/knowledge/index.js +1 -1
- package/dist/{llm-client-BXVRUZyX.d.ts → llm-client-DbjLfz-K.d.ts} +1 -1
- package/dist/matrix/index.js +1 -1
- package/dist/meta-eval/index.d.ts +3 -3
- package/dist/meta-eval/index.js +1 -1
- package/dist/multishot/index.js +1 -1
- package/dist/openapi.json +1 -1
- package/dist/pipelines/index.js +4 -4
- package/dist/prm/index.js +1 -1
- package/dist/{run-improvement-loop-BhfdjrMY.d.ts → provenance-D0WeCXt1.d.ts} +208 -6
- package/dist/{red-team-CrC5MZYd.d.ts → red-team-DW9Ca_tj.d.ts} +1 -1
- package/dist/{registry-DK9kqXvb.d.ts → registry-qmbYT3Eo.d.ts} +2 -2
- package/dist/{release-report-DmPjIce3.d.ts → release-report-DszkgvJ3.d.ts} +3 -3
- package/dist/reporting.d.ts +6 -6
- package/dist/reporting.js +5 -5
- package/dist/{researcher-JP8EvnLv.d.ts → researcher-BaVsy0sW.d.ts} +4 -4
- package/dist/rl.d.ts +9 -9
- package/dist/rl.js +8 -8
- package/dist/{rubric-predictive-validity-B3qNa4aY.d.ts → rubric-predictive-validity-DgBHWsh7.d.ts} +1 -1
- package/dist/run-campaign-HXPJAUZ3.js +10 -0
- package/dist/{run-record-etiCMsUq.d.ts → run-record-DgUVo5pw.d.ts} +1 -1
- package/dist/{summary-report-DLxh4yWk.d.ts → summary-report-BQvXpvaR.d.ts} +1 -1
- package/dist/telemetry/file.js +1 -1
- package/dist/telemetry/index.js +1 -1
- package/dist/traces.d.ts +2 -2
- package/dist/traces.js +4 -4
- package/dist/{types-BgrxOJSf.d.ts → types-Beb6KPqZ.d.ts} +52 -4
- package/dist/wire/index.d.ts +3 -3
- package/dist/wire/index.js +4 -4
- package/package.json +1 -1
- package/dist/chunk-74Y2EMNH.js.map +0 -1
- package/dist/chunk-JB4UWIM6.js.map +0 -1
- package/dist/chunk-N4SBKEPJ.js.map +0 -1
- package/dist/chunk-NSBPE2FW.js +0 -17
- package/dist/chunk-QYJT52YW.js.map +0 -1
- package/dist/chunk-ZWEQJIM6.js +0 -220
- package/dist/chunk-ZWEQJIM6.js.map +0 -1
- package/dist/run-campaign-ZURVWMMI.js +0 -10
- /package/dist/{chunk-QDOSODID.js.map → chunk-3B7Y5AUR.js.map} +0 -0
- /package/dist/{chunk-J4DIMSRK.js.map → chunk-6EKXFFGQ.js.map} +0 -0
- /package/dist/{chunk-MHQPVHXU.js.map → chunk-6QDKWHLS.js.map} +0 -0
- /package/dist/{chunk-63EPZQUZ.js.map → chunk-6REHLN5J.js.map} +0 -0
- /package/dist/{chunk-GM476SZU.js.map → chunk-AIWHLG7J.js.map} +0 -0
- /package/dist/{chunk-AIXHUIHG.js.map → chunk-B26KI423.js.map} +0 -0
- /package/dist/{chunk-NCK5QLGT.js.map → chunk-F3SRAAZO.js.map} +0 -0
- /package/dist/{chunk-VXNVVBZO.js.map → chunk-IHDHUN2X.js.map} +0 -0
- /package/dist/{chunk-S3SDD56V.js.map → chunk-ITBRCT73.js.map} +0 -0
- /package/dist/{chunk-OLIBRKRD.js.map → chunk-KX6F6NCG.js.map} +0 -0
- /package/dist/{chunk-NSBPE2FW.js.map → chunk-PZ5AY32C.js.map} +0 -0
- /package/dist/{chunk-UBPIXOC4.js.map → chunk-SBCB6VZY.js.map} +0 -0
- /package/dist/{chunk-YTMXBHFM.js.map → chunk-T375SUOZ.js.map} +0 -0
- /package/dist/{chunk-PIEAE33T.js.map → chunk-Z4ZCBC7M.js.map} +0 -0
- /package/dist/{run-campaign-ZURVWMMI.js.map → run-campaign-HXPJAUZ3.js.map} +0 -0
package/dist/campaign/index.js
CHANGED
|
@@ -1,47 +1,56 @@
|
|
|
1
1
|
import {
|
|
2
|
+
buildLoopProvenanceRecord,
|
|
2
3
|
composeGate,
|
|
3
4
|
countSentenceEdits,
|
|
4
5
|
defaultProductionGate,
|
|
6
|
+
defaultRenderDiff,
|
|
7
|
+
emitLoopProvenance,
|
|
5
8
|
evolutionaryDriver,
|
|
6
9
|
extractH2Sections,
|
|
7
10
|
gepaDriver,
|
|
8
11
|
heldOutGate,
|
|
12
|
+
isProposedCandidate,
|
|
13
|
+
labelTrustRank,
|
|
14
|
+
loopProvenanceSpans,
|
|
9
15
|
openAutoPr,
|
|
16
|
+
provenanceRecordPath,
|
|
17
|
+
provenanceSpansPath,
|
|
10
18
|
runEval,
|
|
11
19
|
runImprovementLoop,
|
|
12
20
|
runOptimization,
|
|
21
|
+
surfaceContentHash,
|
|
13
22
|
surfaceHash
|
|
14
|
-
} from "../chunk-
|
|
23
|
+
} from "../chunk-SUGME4OT.js";
|
|
15
24
|
import {
|
|
16
25
|
fsCampaignStorage,
|
|
17
26
|
inMemoryCampaignStorage,
|
|
18
27
|
runCampaign
|
|
19
|
-
} from "../chunk-
|
|
20
|
-
import
|
|
28
|
+
} from "../chunk-OLULBECP.js";
|
|
29
|
+
import {
|
|
30
|
+
agentProfileHash
|
|
31
|
+
} from "../chunk-PQV2TKC3.js";
|
|
32
|
+
import {
|
|
33
|
+
assertRealBackend,
|
|
34
|
+
summarizeBackendIntegrity
|
|
35
|
+
} from "../chunk-GMXHLSLL.js";
|
|
21
36
|
import "../chunk-YV7J7X5N.js";
|
|
22
|
-
import
|
|
37
|
+
import {
|
|
38
|
+
validateRunRecord
|
|
39
|
+
} from "../chunk-F3SRAAZO.js";
|
|
40
|
+
import "../chunk-ITBRCT73.js";
|
|
23
41
|
import "../chunk-GGE4NNQT.js";
|
|
24
|
-
import "../chunk-
|
|
42
|
+
import "../chunk-VSMTAMNK.js";
|
|
43
|
+
import "../chunk-IHDHUN2X.js";
|
|
25
44
|
import "../chunk-PC4UYEBM.js";
|
|
26
|
-
import
|
|
27
|
-
|
|
45
|
+
import {
|
|
46
|
+
AgentEvalError
|
|
47
|
+
} from "../chunk-3BFEG2F6.js";
|
|
48
|
+
import "../chunk-PZ5AY32C.js";
|
|
28
49
|
|
|
29
50
|
// src/campaign/labeled-store/fs-adapter.ts
|
|
30
51
|
import { createHash } from "crypto";
|
|
31
52
|
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
32
53
|
import { join } from "path";
|
|
33
|
-
|
|
34
|
-
// src/campaign/types.ts
|
|
35
|
-
var LABEL_TRUST_RANK = {
|
|
36
|
-
unverified: 0,
|
|
37
|
-
"verified-signal": 1,
|
|
38
|
-
"human-rated": 2
|
|
39
|
-
};
|
|
40
|
-
function labelTrustRank(trust) {
|
|
41
|
-
return LABEL_TRUST_RANK[trust ?? "unverified"];
|
|
42
|
-
}
|
|
43
|
-
|
|
44
|
-
// src/campaign/labeled-store/fs-adapter.ts
|
|
45
54
|
var LabeledScenarioStoreError = class extends Error {
|
|
46
55
|
constructor(code, message) {
|
|
47
56
|
super(message);
|
|
@@ -249,10 +258,197 @@ function appendLine(path, line) {
|
|
|
249
258
|
}
|
|
250
259
|
}
|
|
251
260
|
|
|
261
|
+
// src/campaign/presets/run-profile-matrix.ts
|
|
262
|
+
import { createHash as createHash2 } from "crypto";
|
|
263
|
+
import { join as join2 } from "path";
|
|
264
|
+
var ProfileMatrixError = class extends AgentEvalError {
|
|
265
|
+
constructor(message) {
|
|
266
|
+
super("profile_matrix", message);
|
|
267
|
+
}
|
|
268
|
+
};
|
|
269
|
+
function sanitize(id) {
|
|
270
|
+
return id.replace(/[^a-zA-Z0-9_-]/g, "_");
|
|
271
|
+
}
|
|
272
|
+
function sha(input) {
|
|
273
|
+
return createHash2("sha256").update(JSON.stringify(input)).digest("hex");
|
|
274
|
+
}
|
|
275
|
+
function mean(xs) {
|
|
276
|
+
return xs.length === 0 ? 0 : xs.reduce((a, b) => a + b, 0) / xs.length;
|
|
277
|
+
}
|
|
278
|
+
function cellComposite(cell) {
|
|
279
|
+
const composites = Object.values(cell.judgeScores).map((s) => s.composite);
|
|
280
|
+
return composites.length === 0 ? 0 : mean(composites);
|
|
281
|
+
}
|
|
282
|
+
function buildRunRecord(args) {
|
|
283
|
+
const { cell, profile, profileHash, configHash, experimentId, splitTag, commitSha, matrixId } = args;
|
|
284
|
+
const composite = cellComposite(cell);
|
|
285
|
+
const raw = { composite };
|
|
286
|
+
const perJudge = {};
|
|
287
|
+
const dimAccum = {};
|
|
288
|
+
const notes = [];
|
|
289
|
+
for (const [judgeName, js] of Object.entries(cell.judgeScores)) {
|
|
290
|
+
perJudge[judgeName] = { ...js.dimensions };
|
|
291
|
+
for (const [dim, value] of Object.entries(js.dimensions)) {
|
|
292
|
+
raw[`${judgeName}.${dim}`] = value;
|
|
293
|
+
(dimAccum[dim] ??= []).push(value);
|
|
294
|
+
}
|
|
295
|
+
if (js.notes) notes.push(`${judgeName}: ${js.notes}`);
|
|
296
|
+
}
|
|
297
|
+
const perDimMean = {};
|
|
298
|
+
for (const [dim, values] of Object.entries(dimAccum)) perDimMean[dim] = mean(values);
|
|
299
|
+
const outcome = splitTag === "holdout" ? { holdoutScore: composite, raw } : { searchScore: composite, raw };
|
|
300
|
+
if (Object.keys(perJudge).length > 0) {
|
|
301
|
+
outcome.judgeScores = {
|
|
302
|
+
perJudge,
|
|
303
|
+
perDimMean,
|
|
304
|
+
composite,
|
|
305
|
+
...notes.length > 0 ? { notes: notes.join(" | ") } : {}
|
|
306
|
+
};
|
|
307
|
+
}
|
|
308
|
+
return {
|
|
309
|
+
runId: `${matrixId}:${profile.id}:${cell.cellId}`,
|
|
310
|
+
experimentId,
|
|
311
|
+
candidateId: profile.id,
|
|
312
|
+
seed: cell.seed,
|
|
313
|
+
model: profile.model,
|
|
314
|
+
promptHash: profileHash,
|
|
315
|
+
configHash,
|
|
316
|
+
commitSha,
|
|
317
|
+
wallMs: cell.durationMs,
|
|
318
|
+
costUsd: cell.costUsd,
|
|
319
|
+
tokenUsage: cell.tokenUsage,
|
|
320
|
+
outcome,
|
|
321
|
+
splitTag,
|
|
322
|
+
scenarioId: cell.scenarioId,
|
|
323
|
+
...cell.error ? { failureMode: cell.error } : {}
|
|
324
|
+
};
|
|
325
|
+
}
|
|
326
|
+
async function runProfileMatrix(opts) {
|
|
327
|
+
if (opts.profiles.length === 0) throw new ProfileMatrixError("profiles must not be empty");
|
|
328
|
+
if (opts.scenarios.length === 0) throw new ProfileMatrixError("scenarios must not be empty");
|
|
329
|
+
const splitTag = opts.splitTag ?? "search";
|
|
330
|
+
const seed = opts.seed ?? 42;
|
|
331
|
+
const validate = opts.validate ?? true;
|
|
332
|
+
const integrityMode = opts.integrity ?? "assert";
|
|
333
|
+
const profileIds = opts.profiles.map((p) => p.id);
|
|
334
|
+
const experimentId = opts.experimentId ?? `pm_${sha({ profileIds, scenarios: opts.scenarios.map((s) => s.id) }).slice(0, 16)}`;
|
|
335
|
+
const matrixId = `mtx_${sha({ experimentId, profileIds, seed, splitTag }).slice(0, 16)}`;
|
|
336
|
+
for (const profile of opts.profiles) {
|
|
337
|
+
const profileHash = agentProfileHash(profile);
|
|
338
|
+
try {
|
|
339
|
+
validateRunRecord({
|
|
340
|
+
runId: `${matrixId}:${profile.id}:probe`,
|
|
341
|
+
experimentId,
|
|
342
|
+
candidateId: profile.id,
|
|
343
|
+
seed,
|
|
344
|
+
model: profile.model,
|
|
345
|
+
promptHash: profileHash,
|
|
346
|
+
configHash: profileHash,
|
|
347
|
+
commitSha: opts.commitSha,
|
|
348
|
+
wallMs: 0,
|
|
349
|
+
costUsd: 0,
|
|
350
|
+
tokenUsage: { input: 0, output: 0 },
|
|
351
|
+
outcome: splitTag === "holdout" ? { holdoutScore: 0, raw: {} } : { searchScore: 0, raw: {} },
|
|
352
|
+
splitTag
|
|
353
|
+
});
|
|
354
|
+
} catch (err) {
|
|
355
|
+
throw new ProfileMatrixError(
|
|
356
|
+
`profile '${profile.id}' is not recordable: ${err instanceof Error ? err.message : String(err)}`
|
|
357
|
+
);
|
|
358
|
+
}
|
|
359
|
+
}
|
|
360
|
+
const records = [];
|
|
361
|
+
const campaigns = {};
|
|
362
|
+
const byProfile = {};
|
|
363
|
+
for (const profile of opts.profiles) {
|
|
364
|
+
const profileHash = agentProfileHash(profile);
|
|
365
|
+
const configHash = sha({
|
|
366
|
+
profile: profileHash,
|
|
367
|
+
judges: (opts.judges ?? []).map((j) => j.name),
|
|
368
|
+
seed,
|
|
369
|
+
splitTag
|
|
370
|
+
});
|
|
371
|
+
const dispatch = (scenario, ctx) => opts.dispatch(profile, scenario, ctx);
|
|
372
|
+
Object.defineProperty(dispatch, "name", { value: `profile_${sanitize(profile.id)}` });
|
|
373
|
+
const campaign = await runCampaign({
|
|
374
|
+
scenarios: opts.scenarios,
|
|
375
|
+
dispatch,
|
|
376
|
+
judges: opts.judges,
|
|
377
|
+
seed,
|
|
378
|
+
reps: opts.reps,
|
|
379
|
+
maxConcurrency: opts.maxConcurrency,
|
|
380
|
+
costCeiling: opts.costCeiling,
|
|
381
|
+
labeledStore: opts.labeledStore,
|
|
382
|
+
captureSource: opts.captureSource,
|
|
383
|
+
storage: opts.storage,
|
|
384
|
+
now: opts.now,
|
|
385
|
+
runDir: join2(opts.runDir, sanitize(profile.id))
|
|
386
|
+
});
|
|
387
|
+
campaigns[profile.id] = campaign;
|
|
388
|
+
const profileRecords = [];
|
|
389
|
+
for (const cell of campaign.cells) {
|
|
390
|
+
const record = buildRunRecord({
|
|
391
|
+
cell,
|
|
392
|
+
profile,
|
|
393
|
+
profileHash,
|
|
394
|
+
configHash,
|
|
395
|
+
experimentId,
|
|
396
|
+
splitTag,
|
|
397
|
+
commitSha: opts.commitSha,
|
|
398
|
+
matrixId
|
|
399
|
+
});
|
|
400
|
+
if (validate) validateRunRecord(record);
|
|
401
|
+
profileRecords.push(record);
|
|
402
|
+
records.push(record);
|
|
403
|
+
}
|
|
404
|
+
byProfile[profile.id] = {
|
|
405
|
+
profileId: profile.id,
|
|
406
|
+
profileHash,
|
|
407
|
+
model: profile.model,
|
|
408
|
+
records: profileRecords.length,
|
|
409
|
+
meanComposite: mean(profileRecords.map(compositeOf)),
|
|
410
|
+
totalCostUsd: profileRecords.reduce((a, r) => a + r.costUsd, 0),
|
|
411
|
+
integrity: summarizeBackendIntegrity(profileRecords)
|
|
412
|
+
};
|
|
413
|
+
}
|
|
414
|
+
const integrity = summarizeBackendIntegrity(records);
|
|
415
|
+
if (integrityMode === "assert") {
|
|
416
|
+
assertRealBackend(records, { allowMixed: opts.allowMixed ?? true });
|
|
417
|
+
} else if (integrityMode === "warn" && integrity.verdict !== "real") {
|
|
418
|
+
console.warn(
|
|
419
|
+
`[runProfileMatrix] backend integrity: ${integrity.verdict} \u2014 ${integrity.diagnosis}`
|
|
420
|
+
);
|
|
421
|
+
}
|
|
422
|
+
const byScenario = rollup(records, (r) => r.scenarioId);
|
|
423
|
+
const byPersona = opts.personaOf ? rollupByPersona(records, opts.scenarios, opts.personaOf) : void 0;
|
|
424
|
+
return { matrixId, experimentId, records, byProfile, byScenario, byPersona, integrity, campaigns };
|
|
425
|
+
}
|
|
426
|
+
function compositeOf(r) {
|
|
427
|
+
return r.outcome.holdoutScore ?? r.outcome.searchScore ?? 0;
|
|
428
|
+
}
|
|
429
|
+
function rollup(records, keyOf) {
|
|
430
|
+
const groups = /* @__PURE__ */ new Map();
|
|
431
|
+
for (const r of records) {
|
|
432
|
+
const key = keyOf(r);
|
|
433
|
+
if (key === void 0) continue;
|
|
434
|
+
const arr = groups.get(key) ?? [];
|
|
435
|
+
arr.push(compositeOf(r));
|
|
436
|
+
groups.set(key, arr);
|
|
437
|
+
}
|
|
438
|
+
const out = {};
|
|
439
|
+
for (const [key, xs] of groups) out[key] = { meanComposite: mean(xs), n: xs.length };
|
|
440
|
+
return out;
|
|
441
|
+
}
|
|
442
|
+
function rollupByPersona(records, scenarios, personaOf) {
|
|
443
|
+
const personaByScenarioId = /* @__PURE__ */ new Map();
|
|
444
|
+
for (const s of scenarios) personaByScenarioId.set(s.id, personaOf(s));
|
|
445
|
+
return rollup(records, (r) => r.scenarioId ? personaByScenarioId.get(r.scenarioId) : void 0);
|
|
446
|
+
}
|
|
447
|
+
|
|
252
448
|
// src/campaign/worktree/index.ts
|
|
253
449
|
import { execFileSync } from "child_process";
|
|
254
450
|
import { existsSync as existsSync2 } from "fs";
|
|
255
|
-
import { basename, isAbsolute, join as
|
|
451
|
+
import { basename, isAbsolute, join as join3 } from "path";
|
|
256
452
|
var WorktreeAdapterError = class extends Error {
|
|
257
453
|
constructor(message, cause) {
|
|
258
454
|
super(message);
|
|
@@ -274,13 +470,13 @@ function slug(label) {
|
|
|
274
470
|
}
|
|
275
471
|
function gitWorktreeAdapter(opts) {
|
|
276
472
|
const git = opts.git ?? defaultGit;
|
|
277
|
-
const worktreeDir = opts.worktreeDir ??
|
|
473
|
+
const worktreeDir = opts.worktreeDir ?? join3(opts.repoRoot, ".worktrees");
|
|
278
474
|
const branchPrefix = opts.branchPrefix ?? "improve";
|
|
279
475
|
return {
|
|
280
476
|
async create({ baseRef, label }) {
|
|
281
477
|
const id = `${slug(label)}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 6)}`;
|
|
282
478
|
const branch = `${branchPrefix}/${id}`;
|
|
283
|
-
const path =
|
|
479
|
+
const path = join3(worktreeDir, id);
|
|
284
480
|
git(["worktree", "add", "-b", branch, path, baseRef], opts.repoRoot);
|
|
285
481
|
return { path, branch, baseRef };
|
|
286
482
|
},
|
|
@@ -305,16 +501,20 @@ function gitWorktreeAdapter(opts) {
|
|
|
305
501
|
}
|
|
306
502
|
function resolveWorktreePath(surface, worktreeDir) {
|
|
307
503
|
if (isAbsolute(surface.worktreeRef) && existsSync2(surface.worktreeRef)) return surface.worktreeRef;
|
|
308
|
-
if (worktreeDir) return
|
|
504
|
+
if (worktreeDir) return join3(worktreeDir, basename(surface.worktreeRef));
|
|
309
505
|
return surface.worktreeRef;
|
|
310
506
|
}
|
|
311
507
|
export {
|
|
312
508
|
FsLabeledScenarioStore,
|
|
313
509
|
LabeledScenarioStoreError,
|
|
510
|
+
ProfileMatrixError,
|
|
314
511
|
WorktreeAdapterError,
|
|
512
|
+
buildLoopProvenanceRecord,
|
|
315
513
|
composeGate,
|
|
316
514
|
countSentenceEdits,
|
|
317
515
|
defaultProductionGate,
|
|
516
|
+
defaultRenderDiff,
|
|
517
|
+
emitLoopProvenance,
|
|
318
518
|
evolutionaryDriver,
|
|
319
519
|
extractH2Sections,
|
|
320
520
|
fsCampaignStorage,
|
|
@@ -322,13 +522,19 @@ export {
|
|
|
322
522
|
gitWorktreeAdapter,
|
|
323
523
|
heldOutGate,
|
|
324
524
|
inMemoryCampaignStorage,
|
|
525
|
+
isProposedCandidate,
|
|
325
526
|
labelTrustRank,
|
|
527
|
+
loopProvenanceSpans,
|
|
326
528
|
openAutoPr,
|
|
529
|
+
provenanceRecordPath,
|
|
530
|
+
provenanceSpansPath,
|
|
327
531
|
resolveWorktreePath,
|
|
328
532
|
runCampaign,
|
|
329
533
|
runEval,
|
|
330
534
|
runImprovementLoop,
|
|
331
535
|
runOptimization,
|
|
536
|
+
runProfileMatrix,
|
|
537
|
+
surfaceContentHash,
|
|
332
538
|
surfaceHash
|
|
333
539
|
};
|
|
334
540
|
//# sourceMappingURL=index.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/campaign/labeled-store/fs-adapter.ts","../../src/campaign/types.ts","../../src/campaign/worktree/index.ts"],"sourcesContent":["/**\n * @experimental\n *\n * Filesystem `LabeledScenarioStore` adapter. The default capture sink for\n * traces + eval artifacts. Production deployments typically swap for a\n * Turso/SQLite adapter (same interface).\n *\n * Records land as one JSONL file per source under `<root>/<source>.jsonl`.\n * Each line is a `LabeledScenarioRecord`. Append-only — no in-place edits.\n *\n * Safety properties enforced at write-time:\n *\n * - **Provenance required**: writes without `source`, `sourceVersionHash`,\n * `capturedAt`, `redactionStatus` are rejected. Closes the alignment\n * reviewer's data-poisoning gap.\n * - **Per-source rate limits**: optional `rateLimitBucket` + `maxWritesPerMinute`\n * stops a single tenant/source from flooding the store.\n *\n * Safety properties enforced at sample-time:\n *\n * - **Required split + capturedBefore**: substrate refuses to sample without\n * an explicit `split` ('train' | 'test') AND a temporal cutoff. Eliminates\n * accidental train/test contamination.\n * - **Default training-source filter**: when the store is sampled with\n * `split: 'train'`, production-trace records are EXCLUDED unless the\n * caller passes `filter.source: 'production-trace'` explicitly. Closes\n * the contamination-by-default gap flagged by the senior eval engineer.\n */\n\nimport { createHash } from 'node:crypto'\nimport { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs'\nimport { join } from 'node:path'\nimport type {\n LabeledScenarioRecord,\n LabeledScenarioSampleArgs,\n LabeledScenarioSource,\n LabeledScenarioStore,\n LabeledScenarioWrite,\n LabelTrust,\n} from '../types'\nimport { labelTrustRank } from '../types'\n\nexport interface FsLabeledScenarioStoreOptions {\n /** Root directory for JSONL files. Created if missing. */\n root: string\n /** Per-source rate limit. When set, writes exceeding the cap are rejected\n * with a typed error. Default: no limit. */\n maxWritesPerMinutePerBucket?: number\n /** Test seam — override `Date.now()` for deterministic tests. */\n now?: () => number\n}\n\nexport class LabeledScenarioStoreError extends Error {\n constructor(\n public readonly code: string,\n message: string,\n ) {\n super(message)\n this.name = 'LabeledScenarioStoreError'\n }\n}\n\ninterface RateLimitState {\n bucket: string\n windowStartMs: number\n count: number\n}\n\nexport class FsLabeledScenarioStore implements LabeledScenarioStore {\n private readonly now: () => number\n private readonly rateLimits = new Map<string, RateLimitState>()\n\n constructor(private readonly options: FsLabeledScenarioStoreOptions) {\n if (!existsSync(options.root)) mkdirSync(options.root, { recursive: true })\n this.now = options.now ?? Date.now\n }\n\n async observe(write: LabeledScenarioWrite): Promise<void> {\n this.assertProvenance(write)\n this.assertRateLimit(write)\n const record = this.toRecord(write)\n const path = this.pathForSource(write.source)\n const line = `${JSON.stringify(record)}\\n`\n // Append atomically. For high-throughput a writev-friendly buffered\n // implementation lands in the Turso adapter; FS adapter is for tests +\n // local dev + small workloads.\n appendLine(path, line)\n }\n\n async sample(args: LabeledScenarioSampleArgs): Promise<LabeledScenarioRecord[]> {\n if (!args.split) {\n throw new LabeledScenarioStoreError(\n 'split_required',\n 'sample() requires an explicit `split` (train | test) — substrate refuses ambiguous reads',\n )\n }\n if (!args.capturedBefore) {\n throw new LabeledScenarioStoreError(\n 'capturedBefore_required',\n 'sample() requires an explicit `capturedBefore` timestamp for temporal-split discipline',\n )\n }\n\n const all: LabeledScenarioRecord[] = []\n for (const source of ALL_SOURCES) {\n // Default training-source filter: when sampling train, EXCLUDE\n // production-trace records unless the caller asks for them.\n if (args.split === 'train' && source === 'production-trace') {\n const explicit = sourceFilterContains(args.filter?.source, 'production-trace')\n if (!explicit) continue\n }\n const path = this.pathForSource(source)\n if (!existsSync(path)) continue\n const lines = readFileSync(path, 'utf8').split('\\n').filter(Boolean)\n for (const line of lines) {\n let record: LabeledScenarioRecord\n try {\n record = JSON.parse(line) as LabeledScenarioRecord\n } catch {\n continue\n }\n if (!matchesFilter(record, args, source)) continue\n all.push(record)\n }\n }\n\n // Deterministic order: by capturedAt ascending, then recordHash.\n all.sort((a, b) => {\n if (a.capturedAt !== b.capturedAt) return a.capturedAt.localeCompare(b.capturedAt)\n return a.recordHash.localeCompare(b.recordHash)\n })\n\n return all.slice(0, args.count)\n }\n\n async size(): Promise<{\n train: number\n test: number\n bySource: Record<string, number>\n byTrust: Record<LabelTrust, number>\n }> {\n const bySource: Record<string, number> = {}\n const byTrust: Record<LabelTrust, number> = {\n unverified: 0,\n 'verified-signal': 0,\n 'human-rated': 0,\n }\n let total = 0\n for (const source of ALL_SOURCES) {\n const path = this.pathForSource(source)\n if (!existsSync(path)) {\n bySource[source] = 0\n continue\n }\n const lines = readFileSync(path, 'utf8').split('\\n').filter(Boolean)\n bySource[source] = lines.length\n total += lines.length\n for (const line of lines) {\n let trust: LabelTrust = 'unverified'\n try {\n trust = (JSON.parse(line) as LabeledScenarioRecord).labelTrust ?? 'unverified'\n } catch {\n // A malformed line counts as unverified — never silently gold.\n }\n byTrust[trust] += 1\n }\n }\n // FS adapter doesn't track split assignments per-record (split is\n // computed at sample-time based on `capturedBefore`). For size(), we\n // report `train`+`test` as the same total — split is a sampling concept.\n return { train: total, test: total, bySource, byTrust }\n }\n\n private assertProvenance(write: LabeledScenarioWrite): void {\n if (!write.source) {\n throw new LabeledScenarioStoreError(\n 'missing_source',\n 'LabeledScenarioWrite requires `source`',\n )\n }\n if (!write.sourceVersionHash || write.sourceVersionHash.length === 0) {\n throw new LabeledScenarioStoreError(\n 'missing_source_version',\n 'LabeledScenarioWrite requires `sourceVersionHash` (git sha or substrate version)',\n )\n }\n if (!write.capturedAt) {\n throw new LabeledScenarioStoreError(\n 'missing_captured_at',\n 'LabeledScenarioWrite requires `capturedAt` ISO timestamp',\n )\n }\n if (!write.redactionStatus) {\n throw new LabeledScenarioStoreError(\n 'missing_redaction_status',\n 'LabeledScenarioWrite requires explicit `redactionStatus` — raw / redacted-pii / redacted-secrets / fully-redacted',\n )\n }\n if (!ALL_SOURCES.includes(write.source)) {\n throw new LabeledScenarioStoreError(\n 'unknown_source',\n `LabeledScenarioWrite.source must be one of: ${ALL_SOURCES.join(', ')}`,\n )\n }\n }\n\n private assertRateLimit(write: LabeledScenarioWrite): void {\n const cap = this.options.maxWritesPerMinutePerBucket\n if (!cap || !write.rateLimitBucket) return\n const now = this.now()\n const windowMs = 60_000\n let state = this.rateLimits.get(write.rateLimitBucket)\n if (!state || now - state.windowStartMs >= windowMs) {\n state = { bucket: write.rateLimitBucket, windowStartMs: now, count: 0 }\n this.rateLimits.set(write.rateLimitBucket, state)\n }\n if (state.count >= cap) {\n throw new LabeledScenarioStoreError(\n 'rate_limit_exceeded',\n `LabeledScenarioStore: bucket ${write.rateLimitBucket} exceeded ${cap} writes/min`,\n )\n }\n state.count += 1\n }\n\n private toRecord(write: LabeledScenarioWrite): LabeledScenarioRecord {\n const recordHash = sha256(\n JSON.stringify({\n id: write.scenario.id,\n src: write.source,\n at: write.capturedAt,\n ver: write.sourceVersionHash,\n }),\n )\n // FS adapter assigns split at sample-time, but we cache a hint here\n // based on capturedAt vs the world's \"now\" — sampler overrides this.\n return {\n ...write,\n recordHash,\n split: 'train',\n }\n }\n\n private pathForSource(source: string): string {\n return join(this.options.root, `${source}.jsonl`)\n }\n}\n\nconst ALL_SOURCES: LabeledScenarioWrite['source'][] = [\n 'production-trace',\n 'eval-run',\n 'manual',\n 'red-team',\n 'synthetic',\n]\n\nfunction sourceFilterContains(\n filter: LabeledScenarioSource | LabeledScenarioSource[] | undefined,\n needle: LabeledScenarioSource,\n): boolean {\n if (!filter) return false\n if (Array.isArray(filter)) return filter.includes(needle)\n return filter === needle\n}\n\nfunction matchesFilter(\n record: LabeledScenarioRecord,\n args: LabeledScenarioSampleArgs,\n source: string,\n): boolean {\n // Temporal cutoff — train must be capturedAt < capturedBefore.\n if (args.split === 'train' && record.capturedAt >= args.capturedBefore) return false\n if (args.split === 'test' && record.capturedAt < args.capturedBefore) return false\n\n const f = args.filter\n if (!f) return true\n if (f.kind && record.scenario.kind !== f.kind) return false\n if (f.source) {\n const sources = Array.isArray(f.source) ? f.source : [f.source]\n if (!sources.includes(source as never)) return false\n }\n if (f.minComposite !== undefined || f.maxComposite !== undefined) {\n const composites = Object.values(record.judgeScores).map((s) => s.composite)\n const max = composites.length === 0 ? 0 : Math.max(...composites)\n if (f.minComposite !== undefined && max < f.minComposite) return false\n if (f.maxComposite !== undefined && max > f.maxComposite) return false\n }\n if (f.minTrust !== undefined && labelTrustRank(record.labelTrust) < labelTrustRank(f.minTrust)) {\n return false\n }\n return true\n}\n\nfunction sha256(input: string): string {\n return createHash('sha256').update(input).digest('hex').slice(0, 16)\n}\n\nfunction appendLine(path: string, line: string): void {\n if (existsSync(path)) {\n const existing = readFileSync(path, 'utf8')\n writeFileSync(path, existing + line)\n } else {\n writeFileSync(path, line)\n }\n}\n","/**\n * @experimental\n *\n * Pass A substrate types — `runCampaign` is the one primitive every\n * eval flow composes from. Three contracts in this file:\n *\n * - `Scenario` input set\n * - `DispatchFn` how to run one scenario → artifact\n * - `CampaignResult` defined output schema (the contract downstream tools depend on)\n *\n * Three more lifted from earlier substrate work (re-exported):\n *\n * - `JudgeConfig` pluggable dimensional scorer (0.38)\n * - `Mutator` optimization-loop surface mutator\n * - `Gate` promotion gate (`HeldOutGate` and friends adapt to this)\n *\n * No new architecture vs 0.38 — Pass A formalizes the shapes so consumers\n * can build dashboards / CI gates / regression diffs against a stable schema.\n */\n\n/** @experimental Stable identifier + kind tag for any scenario. Consumers\n * extend with their per-domain payload (persona, task, requirement, ...). */\nexport interface Scenario {\n id: string\n kind: string\n tags?: string[]\n}\n\n/** @experimental Context handed to every dispatch invocation. Scoped — every\n * trace/span carries the cellId, every artifact write lands under the cell's\n * artifact root, the cost meter accumulates per cell. */\nexport interface DispatchContext {\n cellId: string\n rep: number\n generation?: number\n seed: number\n signal: AbortSignal\n trace: CampaignTraceWriter\n artifacts: CampaignArtifactWriter\n cost: CampaignCostMeter\n /** Populated when this run is part of a multi-cycle improvement loop. */\n cycleId?: string\n /** Populated when the substrate resumed from a prior cache hit. */\n resumedFrom?: string\n /**\n * Opaque placement key supplied by `RunCampaignOptions.cellPlacement`.\n * The substrate forwards it through unchanged; placement-aware Dispatch\n * implementations (e.g. `httpDispatch` from `/adapters/http`) read it to\n * route the cell to the right worker / region / sandbox. `undefined`\n * when no placement strategy is configured.\n */\n placement?: string\n}\n\n/** @experimental One function: scenario + ctx → artifact. Dispatcher chooses\n * whether to call `runMultishot`, `runLoop`, raw `streamPrompt`, anything. */\nexport type DispatchFn<TScenario extends Scenario, TArtifact> = (\n scenario: TScenario,\n ctx: DispatchContext,\n) => Promise<TArtifact>\n\n// ── Sessions ──────────────────────────────────────────────────────────\n\n/** @experimental One session within a multi-session journey. Dispatch is\n * invoked once per session in order; state from prior session's artifact\n * is exposed via `ctx.priorSessionArtifact`. */\nexport interface SessionScript<TScenario, TArtifact> {\n id: string\n intent: string\n maxTurns?: number\n /** When true, knowledge accumulated this session persists to next. */\n affectsKnowledge?: boolean\n /** Optional per-session persona evolution — called after the session\n * resolves. Returns the persona shape used by the NEXT session. */\n evolveAfterSession?: (artifact: TArtifact, sessionIndex: number, scenario: TScenario) => TScenario\n}\n\n// ── Judges (re-export 0.38 shape) ─────────────────────────────────────\n\nexport interface JudgeDimension {\n /** JSON field name + score key. */\n key: string\n /** Description shown in the judge's user prompt. */\n description: string\n}\n\n/** @experimental Pluggable dimensional scorer. `score` is the contract:\n * given an artifact + scenario, return a `JudgeScore`. This is deliberately a\n * function, not a fixed LLM-prompt shape — real consumers judge with\n * ensembles, deterministic checks, or a single LLM call, and the substrate\n * must not constrain that. The `llmJudge()` helper builds a `score` that does\n * one LLM call for the common case. `appliesTo` lets a judge run only on\n * scenarios that match (e.g. a legal-citation judge only on legal scenarios). */\nexport interface JudgeConfig<TArtifact, TScenario extends Scenario = Scenario> {\n name: string\n dimensions: JudgeDimension[]\n /** Score one artifact. Throw on failure — a thrown judge is recorded as a\n * failed cell, never silently folded into a zero. */\n score(input: {\n artifact: TArtifact\n scenario: TScenario\n signal: AbortSignal\n }): JudgeScore | Promise<JudgeScore>\n appliesTo?: (scenario: TScenario) => boolean\n}\n\nexport interface JudgeScore {\n dimensions: Record<string, number>\n composite: number\n notes: string\n}\n\n// ── Optimization (population + generations + mutator) ─────────────────\n\n/** @experimental A tier-4 code surface — a candidate change to the agent's\n * IMPLEMENTATION, not its prompt. Produced by autoresearch (reads codebase +\n * trace findings → opens a worktree). Measured by checking out `worktreeRef`\n * and running the worker against the changed code. See the improvement-tier\n * table in `docs/design/loop-taxonomy.md`. */\nexport interface CodeSurface {\n kind: 'code'\n /** Worktree path or git ref holding the candidate code change. The\n * consumer's `dispatchWithSurface` checks this out before running. */\n worktreeRef: string\n /** Base ref the change is measured against. Default: the repo's main. */\n baseRef?: string\n /** Human summary of what changed — rendered into the auto-PR body. */\n summary?: string\n}\n\n/** @experimental The mutable surface a driver proposes. Tiers (see\n * `docs/design/loop-taxonomy.md`):\n * - `string` — tiers 1-2: system-prompt addendum / serialized tool\n * config. Cheap, reversible, text-diffable.\n * - `CodeSurface` — tier 4: an implementation change behind a worktree ref.\n * Tier 3 (knowledge) is owned by agent-knowledge and rides its own adapter,\n * not this type. */\nexport type MutableSurface = string | CodeSurface\n\n/** @experimental Stateless surface mutation — given findings + current\n * surface, return N candidate surfaces. Pure transform, no generation\n * awareness. Reflective-mutation, `runMultiShotOptimization`, `AxGEPA`\n * conform. Wrapped by `evolutionaryDriver` to become an `ImprovementDriver`. */\nexport interface Mutator<TFindings = unknown> {\n kind: string\n mutate(args: {\n findings: TFindings[]\n currentSurface: MutableSurface\n populationSize: number\n signal: AbortSignal\n }): Promise<MutableSurface[]>\n}\n\n/** @experimental Everything a driver's `propose()` may read to plan the next\n * batch of candidates. The first six fields are always present; the rest are\n * optional context the loop supplies when available, so cheap drivers\n * (`evolutionaryDriver`) can ignore them while a code-tier agentic generator\n * consumes the research report + dataset to drive a coding harness.\n * See `docs/design/self-improvement-engine.md`. */\nexport interface ProposeContext<TFindings = unknown> {\n currentSurface: MutableSurface\n history: GenerationRecord[]\n findings: TFindings[]\n /** BREADTH: how many candidate surfaces to return this generation. */\n populationSize: number\n generation: number\n signal: AbortSignal\n /** The Phase-2 research report (analyst findings + diff), produced AFTER the\n * trace analysts run. Opaque to the substrate — the driver that consumes it\n * types it. See the phase diagram in self-improvement-engine.md. */\n report?: unknown\n /** Handle to all captured data — the driver samples traces / artifacts /\n * rewards here to ground its proposals. */\n dataset?: LabeledScenarioStore\n /** DEPTH: max iterations the agentic generator may take per candidate.\n * 1 = single-shot; >1 = it may iterate on its own change before handing it\n * back to be measured. */\n maxImprovementShots?: number\n}\n\n/** @experimental A surface-improvement strategy — the DRIVER of the\n * improvement loop. Given the current best surface, the history of what's\n * been tried + scored, and any external findings, propose the next batch of\n * candidate surfaces to measure. Optionally decide to stop early.\n *\n * The evolutionary mutator (`evolutionaryDriver`, here) and agent-runtime's\n * `improvementDriver` (with reflective / agentic generators) both conform —\n * drivers of the SAME loop, not separate loops. The loop body\n * (`runOptimization`) and the gated promotion shell (`runImprovementLoop`)\n * are driver-agnostic. */\nexport interface ImprovementDriver<TFindings = unknown> {\n kind: string\n /** Plan: propose N candidate surfaces for the next generation. */\n propose(ctx: ProposeContext<TFindings>): Promise<MutableSurface[]>\n /** Decide: stop early when the driver judges the search converged or\n * exhausted. Default (omitted) runs all `maxGenerations`. */\n decide?(args: { history: GenerationRecord[] }): { stop: boolean; reason?: string }\n}\n\nexport interface OptimizerConfig {\n driver: ImprovementDriver\n populationSize: number\n maxGenerations: number\n surfaceExtractor: (profile: unknown) => MutableSurface\n}\n\n// ── Gates ─────────────────────────────────────────────────────────────\n\n/** @experimental Five-valued verdict taxonomy (MOSS-paper alignment). */\nexport type GateDecision = 'ship' | 'hold' | 'need_more_work' | 'model_ceiling' | 'arch_ceiling'\n\nexport interface GateContext<TArtifact, TScenario extends Scenario> {\n candidateArtifacts: Map<string, TArtifact>\n baselineArtifacts?: Map<string, TArtifact>\n /** Candidate (winner) judge scores, keyed by cellId. */\n judgeScores: Map<string, Record<string, JudgeScore>>\n /** Baseline judge scores, keyed by cellId. SEPARATE from `judgeScores` —\n * baseline + candidate share cellIds (same scenarios), so a single map\n * cannot represent both. A gate computing a holdout delta MUST read\n * candidate from `judgeScores` and baseline from here. */\n baselineJudgeScores?: Map<string, Record<string, JudgeScore>>\n scenarios: TScenario[]\n cost: { candidate: number; baseline: number }\n signal: AbortSignal\n}\n\nexport interface GateResult {\n decision: GateDecision\n reasons: string[]\n contributingGates: Array<{ name: string; passed: boolean; detail: unknown }>\n delta?: number\n}\n\n/** @experimental Composable promotion gate. */\nexport interface Gate<TArtifact = unknown, TScenario extends Scenario = Scenario> {\n name: string\n decide(ctx: GateContext<TArtifact, TScenario>): Promise<GateResult>\n}\n\n// ── Tracing / artifacts / cost ────────────────────────────────────────\n\n/** @experimental Scoped trace writer handed to each dispatch — every span\n * auto-tagged with the cellId so traces filter cleanly. */\nexport interface CampaignTraceWriter {\n span(name: string, attributes?: Record<string, unknown>): TraceSpan\n flush(): Promise<void>\n}\n\nexport interface TraceSpan {\n end(attributes?: Record<string, unknown>): void\n setAttribute(key: string, value: unknown): void\n}\n\n/** @experimental Scoped artifact writer — `write(path, content)` lands under\n * `<runDir>/<cellId>/<path>`. */\nexport interface CampaignArtifactWriter {\n write(path: string, content: string | Uint8Array): Promise<string>\n writeJson(path: string, value: unknown): Promise<string>\n}\n\n/** @experimental Cell-scoped cost meter. Substrate auto-tracks LLM costs\n * via the cost-ledger backend hooks; consumers can record additional\n * spend (sandbox time, tool costs) via `observe`. */\nexport interface CampaignCostMeter {\n observe(amountUsd: number, source: string): void\n current(): number\n}\n\n// ── LabeledScenarioStore ──────────────────────────────────────────────\n\n/** @experimental Source tag — required on every store write. Used by the\n * default training-source filter (production-trace samples NOT used as\n * training scenarios unless explicitly opted in). */\nexport type LabeledScenarioSource =\n | 'production-trace'\n | 'eval-run'\n | 'manual'\n | 'red-team'\n | 'synthetic'\n\nexport type RedactionStatus = 'raw' | 'redacted-pii' | 'redacted-secrets' | 'fully-redacted'\n\n/** How much a label can be trusted to evaluate against — the gold-admission\n * gate. Strictly ordered: a record qualifies for a `minTrust` filter when its\n * trust rank is >= the requested rank.\n *\n * - `unverified` — label is a heuristic (e.g. raw outcome success/fail).\n * Fine as corpus; MUST NOT enter a gold set that lift\n * numbers are computed against.\n * - `verified-signal` — an external signal confirmed the outcome (PR merged,\n * tests green, user did not retry, downstream check).\n * - `human-rated` — a human explicitly rated or corrected the artifact.\n *\n * Absent on a write ⇒ treated as `unverified` (fail-closed: a writer must\n * explicitly assert trust to make a record gold-eligible — it never happens\n * by accident). */\nexport type LabelTrust = 'unverified' | 'verified-signal' | 'human-rated'\n\nconst LABEL_TRUST_RANK: Record<LabelTrust, number> = {\n unverified: 0,\n 'verified-signal': 1,\n 'human-rated': 2,\n}\n\n/** Ordinal rank for a label-trust tier; absent ⇒ `unverified` (rank 0). */\nexport function labelTrustRank(trust: LabelTrust | undefined): number {\n return LABEL_TRUST_RANK[trust ?? 'unverified']\n}\n\n/** @experimental Required-provenance write. The store rejects writes that\n * lack provenance — a default-on flywheel without provenance is the\n * data-poisoning vector flagged in the alignment review. */\nexport interface LabeledScenarioWrite<TScenario extends Scenario = Scenario, TArtifact = unknown> {\n scenario: TScenario\n artifact: TArtifact\n judgeScores: Record<string, JudgeScore>\n source: LabeledScenarioSource\n sourceVersionHash: string\n capturedAt: string\n redactionStatus: RedactionStatus\n /** Gold-admission trust tier. Absent ⇒ `unverified` (fail-closed): the\n * record is corpus, never gold. A writer must explicitly assert\n * `verified-signal` or `human-rated` to make it eligible for a gold\n * sample. See {@link LabelTrust}. */\n labelTrust?: LabelTrust\n /** Optional per-source rate-limit bucket key (e.g., the tenant id). */\n rateLimitBucket?: string\n}\n\nexport interface LabeledScenarioRecord<TScenario extends Scenario = Scenario, TArtifact = unknown>\n extends LabeledScenarioWrite<TScenario, TArtifact> {\n /** Stable hash of (scenario.id, source, capturedAt, sourceVersionHash). */\n recordHash: string\n /** Substrate-assigned split — train if captured before the campaign's\n * `temporalCutoff`, test if after. Explicit override allowed via filter. */\n split: 'train' | 'test'\n}\n\nexport interface LabeledScenarioSampleArgs {\n count: number\n /** REQUIRED — substrate refuses to sample without an explicit split. */\n split: 'train' | 'test'\n /** REQUIRED — only records captured before this timestamp are returned.\n * Enforces temporal split discipline (test scenarios captured AFTER train\n * cannot enter the training pool). */\n capturedBefore: string\n filter?: {\n kind?: string\n source?: LabeledScenarioSource | LabeledScenarioSource[]\n minComposite?: number\n maxComposite?: number\n /** Gold gate: only records whose trust rank is >= this tier are\n * returned. `sample({ split: 'test', minTrust: 'verified-signal' })` is\n * the canonical \"give me the gold set\" call. Absent ⇒ no trust gate\n * (corpus-level read). */\n minTrust?: LabelTrust\n }\n}\n\nexport interface LabeledScenarioStore {\n observe(write: LabeledScenarioWrite): Promise<void>\n sample(args: LabeledScenarioSampleArgs): Promise<LabeledScenarioRecord[]>\n size(): Promise<{\n train: number\n test: number\n bySource: Record<string, number>\n /** Count by trust tier — tells the flywheel how much gold it has\n * accumulated vs. raw corpus. */\n byTrust: Record<LabelTrust, number>\n }>\n}\n\n// ── The CampaignResult schema (the downstream-tools contract) ─────────\n\nexport interface CampaignCellResult<TArtifact> {\n cellId: string\n scenarioId: string\n rep: number\n generation?: number\n artifact: TArtifact\n judgeScores: Record<string, JudgeScore>\n costUsd: number\n durationMs: number\n seed: number\n cached: boolean\n error?: string\n}\n\nexport interface JudgeAggregate {\n mean: number\n stdev: number\n ci95: [number, number]\n n: number\n}\n\nexport interface ScenarioAggregate {\n meanComposite: number\n ci95: [number, number]\n n: number\n}\n\nexport interface GenerationRecord {\n generationIndex: number\n candidates: GenerationCandidate[]\n promoted: string[]\n}\n\n/** One scored candidate surface in a generation. `dimensions` + `scenarios`\n * let a reflective `ImprovementDriver` ground its next proposal on WHICH\n * dimensions the candidate is weakest on and WHICH scenarios it best/worst\n * handled — the evidence a blind `Mutator` cannot see. */\nexport interface GenerationCandidate {\n surfaceHash: string\n composite: number\n ci95: [number, number]\n /** Mean score per judge dimension across all cells (scenarios × reps ×\n * judges that reported the dimension). */\n dimensions: Record<string, number>\n /** Per-scenario composite (mean over reps + judges). */\n scenarios: Array<{ scenarioId: string; composite: number }>\n}\n\nexport interface CampaignAggregates {\n byJudge: Record<string, JudgeAggregate>\n byScenario: Record<string, ScenarioAggregate>\n totalCostUsd: number\n cellsExecuted: number\n cellsSkipped: number\n cellsCached: number\n cellsFailed: number\n}\n\nexport interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scenario> {\n /** sha256(scenarios, judges, dispatch source ref, optimizer config, seed). Stable identity for reruns. */\n manifestHash: string\n seed: number\n startedAt: string\n endedAt: string\n durationMs: number\n cells: Array<CampaignCellResult<TArtifact>>\n aggregates: CampaignAggregates\n optimization?: {\n generations: GenerationRecord[]\n winnerSurfaceHash?: string\n }\n gate?: GateResult\n prUrl?: string\n runDir: string\n artifactsByPath: Record<string, string>\n /** Substrate strips the input scenarios to id+kind for the result manifest;\n * consumers needing full payload look it up via the original input. The\n * type parameter `TScenario` is propagated for downstream consumers that\n * want narrowed types when extending `CampaignResult`. */\n scenarios: Array<Pick<TScenario, 'id' | 'kind'>>\n}\n","/**\n * @experimental\n *\n * VCS-pluggable worktree adapter. One improvement = one worktree, PR-like\n * (multiple commits allowed). A code-tier driver's `propose()` creates a\n * worktree, an agent commits the change into it, and `finalize()` returns a\n * `CodeSurface{ worktreeRef }` the measurement checks out to run the worker\n * against the changed code. On promotion the worktree becomes the PR branch.\n *\n * The interface is VCS-agnostic so a future `jj` ([jj-vcs](https://github.com/jj-vcs/jj))\n * adapter can slot in without touching driver code. Only the git adapter\n * ships today. See `docs/design/self-improvement-engine.md`.\n */\n\nimport { execFileSync } from 'node:child_process'\nimport { existsSync } from 'node:fs'\nimport { basename, isAbsolute, join } from 'node:path'\nimport type { CodeSurface } from '../types'\n\nexport interface Worktree {\n /** Absolute path to the checked-out worktree directory. */\n path: string\n /** The branch the worktree is on (becomes the PR branch on promotion). */\n branch: string\n /** The ref the worktree was forked from. */\n baseRef: string\n}\n\nexport interface WorktreeAdapter {\n /** Create an isolated worktree on a fresh branch off `baseRef`. */\n create(opts: { baseRef: string; label: string }): Promise<Worktree>\n /** Commit any pending changes in the worktree, then return a CodeSurface\n * pointing at it. The agent has already written its change into\n * `worktree.path` by the time this is called. */\n finalize(worktree: Worktree, summary: string): Promise<CodeSurface>\n /** Remove the worktree (and its branch) — called for losing candidates. */\n discard(worktree: Worktree): Promise<void>\n}\n\nexport class WorktreeAdapterError extends Error {\n constructor(\n message: string,\n readonly cause?: unknown,\n ) {\n super(message)\n this.name = 'WorktreeAdapterError'\n }\n}\n\nexport interface GitWorktreeAdapterOptions {\n /** Repo root the worktrees fork from. */\n repoRoot: string\n /** Directory worktrees are created under. Default: `<repoRoot>/.worktrees`. */\n worktreeDir?: string\n /** Branch-name prefix. Default: `improve`. */\n branchPrefix?: string\n /** Test seam — defaults to a real `git` runner. */\n git?: (args: string[], cwd: string) => string\n}\n\nfunction defaultGit(args: string[], cwd: string): string {\n try {\n return execFileSync('git', args, { cwd, encoding: 'utf8' }).trim()\n } catch (err) {\n const stderr =\n err && typeof err === 'object' && 'stderr' in err\n ? String((err as { stderr: unknown }).stderr)\n : ''\n throw new WorktreeAdapterError(`git ${args.join(' ')} failed: ${stderr || String(err)}`, err)\n }\n}\n\n/** Slugify a label into a branch-safe segment. */\nfunction slug(label: string): string {\n return (\n label\n .toLowerCase()\n .replace(/[^a-z0-9]+/g, '-')\n .replace(/^-+|-+$/g, '')\n .slice(0, 48) || 'candidate'\n )\n}\n\nexport function gitWorktreeAdapter(opts: GitWorktreeAdapterOptions): WorktreeAdapter {\n const git = opts.git ?? defaultGit\n const worktreeDir = opts.worktreeDir ?? join(opts.repoRoot, '.worktrees')\n const branchPrefix = opts.branchPrefix ?? 'improve'\n\n return {\n async create({ baseRef, label }) {\n const id = `${slug(label)}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 6)}`\n const branch = `${branchPrefix}/${id}`\n const path = join(worktreeDir, id)\n git(['worktree', 'add', '-b', branch, path, baseRef], opts.repoRoot)\n return { path, branch, baseRef }\n },\n\n async finalize(worktree, summary) {\n // Stage + commit any pending changes the agent left in the worktree.\n // A no-op commit is refused by git, so only commit when the tree is dirty.\n const status = git(['status', '--porcelain'], worktree.path)\n if (status.length > 0) {\n git(['add', '-A'], worktree.path)\n git(['commit', '-m', summary], worktree.path)\n }\n return {\n kind: 'code',\n worktreeRef: worktree.path,\n baseRef: worktree.baseRef,\n summary,\n }\n },\n\n async discard(worktree) {\n // Remove the worktree, then delete its branch. Force-remove because the\n // worktree may hold uncommitted experiment state we're discarding.\n git(['worktree', 'remove', '--force', worktree.path], opts.repoRoot)\n git(['branch', '-D', worktree.branch], opts.repoRoot)\n },\n }\n}\n\n/** Resolve a `CodeSurface`'s worktreeRef to a directory the measurement can\n * run the worker in. A path ref is returned as-is; anything else is treated\n * as a ref under the adapter's worktree dir. */\nexport function resolveWorktreePath(surface: CodeSurface, worktreeDir?: string): string {\n if (isAbsolute(surface.worktreeRef) && existsSync(surface.worktreeRef)) return surface.worktreeRef\n if (worktreeDir) return join(worktreeDir, basename(surface.worktreeRef))\n return surface.worktreeRef\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;AA6BA,SAAS,kBAAkB;AAC3B,SAAS,YAAY,WAAW,cAAc,qBAAqB;AACnE,SAAS,YAAY;;;AC2QrB,IAAM,mBAA+C;AAAA,EACnD,YAAY;AAAA,EACZ,mBAAmB;AAAA,EACnB,eAAe;AACjB;AAGO,SAAS,eAAe,OAAuC;AACpE,SAAO,iBAAiB,SAAS,YAAY;AAC/C;;;AD/PO,IAAM,4BAAN,cAAwC,MAAM;AAAA,EACnD,YACkB,MAChB,SACA;AACA,UAAM,OAAO;AAHG;AAIhB,SAAK,OAAO;AAAA,EACd;AAAA,EALkB;AAMpB;AAQO,IAAM,yBAAN,MAA6D;AAAA,EAIlE,YAA6B,SAAwC;AAAxC;AAC3B,QAAI,CAAC,WAAW,QAAQ,IAAI,EAAG,WAAU,QAAQ,MAAM,EAAE,WAAW,KAAK,CAAC;AAC1E,SAAK,MAAM,QAAQ,OAAO,KAAK;AAAA,EACjC;AAAA,EAH6B;AAAA,EAHZ;AAAA,EACA,aAAa,oBAAI,IAA4B;AAAA,EAO9D,MAAM,QAAQ,OAA4C;AACxD,SAAK,iBAAiB,KAAK;AAC3B,SAAK,gBAAgB,KAAK;AAC1B,UAAM,SAAS,KAAK,SAAS,KAAK;AAClC,UAAM,OAAO,KAAK,cAAc,MAAM,MAAM;AAC5C,UAAM,OAAO,GAAG,KAAK,UAAU,MAAM,CAAC;AAAA;AAItC,eAAW,MAAM,IAAI;AAAA,EACvB;AAAA,EAEA,MAAM,OAAO,MAAmE;AAC9E,QAAI,CAAC,KAAK,OAAO;AACf,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,KAAK,gBAAgB;AACxB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AAEA,UAAM,MAA+B,CAAC;AACtC,eAAW,UAAU,aAAa;AAGhC,UAAI,KAAK,UAAU,WAAW,WAAW,oBAAoB;AAC3D,cAAM,WAAW,qBAAqB,KAAK,QAAQ,QAAQ,kBAAkB;AAC7E,YAAI,CAAC,SAAU;AAAA,MACjB;AACA,YAAM,OAAO,KAAK,cAAc,MAAM;AACtC,UAAI,CAAC,WAAW,IAAI,EAAG;AACvB,YAAM,QAAQ,aAAa,MAAM,MAAM,EAAE,MAAM,IAAI,EAAE,OAAO,OAAO;AACnE,iBAAW,QAAQ,OAAO;AACxB,YAAI;AACJ,YAAI;AACF,mBAAS,KAAK,MAAM,IAAI;AAAA,QAC1B,QAAQ;AACN;AAAA,QACF;AACA,YAAI,CAAC,cAAc,QAAQ,MAAM,MAAM,EAAG;AAC1C,YAAI,KAAK,MAAM;AAAA,MACjB;AAAA,IACF;AAGA,QAAI,KAAK,CAAC,GAAG,MAAM;AACjB,UAAI,EAAE,eAAe,EAAE,WAAY,QAAO,EAAE,WAAW,cAAc,EAAE,UAAU;AACjF,aAAO,EAAE,WAAW,cAAc,EAAE,UAAU;AAAA,IAChD,CAAC;AAED,WAAO,IAAI,MAAM,GAAG,KAAK,KAAK;AAAA,EAChC;AAAA,EAEA,MAAM,OAKH;AACD,UAAM,WAAmC,CAAC;AAC1C,UAAM,UAAsC;AAAA,MAC1C,YAAY;AAAA,MACZ,mBAAmB;AAAA,MACnB,eAAe;AAAA,IACjB;AACA,QAAI,QAAQ;AACZ,eAAW,UAAU,aAAa;AAChC,YAAM,OAAO,KAAK,cAAc,MAAM;AACtC,UAAI,CAAC,WAAW,IAAI,GAAG;AACrB,iBAAS,MAAM,IAAI;AACnB;AAAA,MACF;AACA,YAAM,QAAQ,aAAa,MAAM,MAAM,EAAE,MAAM,IAAI,EAAE,OAAO,OAAO;AACnE,eAAS,MAAM,IAAI,MAAM;AACzB,eAAS,MAAM;AACf,iBAAW,QAAQ,OAAO;AACxB,YAAI,QAAoB;AACxB,YAAI;AACF,kBAAS,KAAK,MAAM,IAAI,EAA4B,cAAc;AAAA,QACpE,QAAQ;AAAA,QAER;AACA,gBAAQ,KAAK,KAAK;AAAA,MACpB;AAAA,IACF;AAIA,WAAO,EAAE,OAAO,OAAO,MAAM,OAAO,UAAU,QAAQ;AAAA,EACxD;AAAA,EAEQ,iBAAiB,OAAmC;AAC1D,QAAI,CAAC,MAAM,QAAQ;AACjB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,qBAAqB,MAAM,kBAAkB,WAAW,GAAG;AACpE,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,YAAY;AACrB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,iBAAiB;AAC1B,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,YAAY,SAAS,MAAM,MAAM,GAAG;AACvC,YAAM,IAAI;AAAA,QACR;AAAA,QACA,+CAA+C,YAAY,KAAK,IAAI,CAAC;AAAA,MACvE;AAAA,IACF;AAAA,EACF;AAAA,EAEQ,gBAAgB,OAAmC;AACzD,UAAM,MAAM,KAAK,QAAQ;AACzB,QAAI,CAAC,OAAO,CAAC,MAAM,gBAAiB;AACpC,UAAM,MAAM,KAAK,IAAI;AACrB,UAAM,WAAW;AACjB,QAAI,QAAQ,KAAK,WAAW,IAAI,MAAM,eAAe;AACrD,QAAI,CAAC,SAAS,MAAM,MAAM,iBAAiB,UAAU;AACnD,cAAQ,EAAE,QAAQ,MAAM,iBAAiB,eAAe,KAAK,OAAO,EAAE;AACtE,WAAK,WAAW,IAAI,MAAM,iBAAiB,KAAK;AAAA,IAClD;AACA,QAAI,MAAM,SAAS,KAAK;AACtB,YAAM,IAAI;AAAA,QACR;AAAA,QACA,gCAAgC,MAAM,eAAe,aAAa,GAAG;AAAA,MACvE;AAAA,IACF;AACA,UAAM,SAAS;AAAA,EACjB;AAAA,EAEQ,SAAS,OAAoD;AACnE,UAAM,aAAa;AAAA,MACjB,KAAK,UAAU;AAAA,QACb,IAAI,MAAM,SAAS;AAAA,QACnB,KAAK,MAAM;AAAA,QACX,IAAI,MAAM;AAAA,QACV,KAAK,MAAM;AAAA,MACb,CAAC;AAAA,IACH;AAGA,WAAO;AAAA,MACL,GAAG;AAAA,MACH;AAAA,MACA,OAAO;AAAA,IACT;AAAA,EACF;AAAA,EAEQ,cAAc,QAAwB;AAC5C,WAAO,KAAK,KAAK,QAAQ,MAAM,GAAG,MAAM,QAAQ;AAAA,EAClD;AACF;AAEA,IAAM,cAAgD;AAAA,EACpD;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAEA,SAAS,qBACP,QACA,QACS;AACT,MAAI,CAAC,OAAQ,QAAO;AACpB,MAAI,MAAM,QAAQ,MAAM,EAAG,QAAO,OAAO,SAAS,MAAM;AACxD,SAAO,WAAW;AACpB;AAEA,SAAS,cACP,QACA,MACA,QACS;AAET,MAAI,KAAK,UAAU,WAAW,OAAO,cAAc,KAAK,eAAgB,QAAO;AAC/E,MAAI,KAAK,UAAU,UAAU,OAAO,aAAa,KAAK,eAAgB,QAAO;AAE7E,QAAM,IAAI,KAAK;AACf,MAAI,CAAC,EAAG,QAAO;AACf,MAAI,EAAE,QAAQ,OAAO,SAAS,SAAS,EAAE,KAAM,QAAO;AACtD,MAAI,EAAE,QAAQ;AACZ,UAAM,UAAU,MAAM,QAAQ,EAAE,MAAM,IAAI,EAAE,SAAS,CAAC,EAAE,MAAM;AAC9D,QAAI,CAAC,QAAQ,SAAS,MAAe,EAAG,QAAO;AAAA,EACjD;AACA,MAAI,EAAE,iBAAiB,UAAa,EAAE,iBAAiB,QAAW;AAChE,UAAM,aAAa,OAAO,OAAO,OAAO,WAAW,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS;AAC3E,UAAM,MAAM,WAAW,WAAW,IAAI,IAAI,KAAK,IAAI,GAAG,UAAU;AAChE,QAAI,EAAE,iBAAiB,UAAa,MAAM,EAAE,aAAc,QAAO;AACjE,QAAI,EAAE,iBAAiB,UAAa,MAAM,EAAE,aAAc,QAAO;AAAA,EACnE;AACA,MAAI,EAAE,aAAa,UAAa,eAAe,OAAO,UAAU,IAAI,eAAe,EAAE,QAAQ,GAAG;AAC9F,WAAO;AAAA,EACT;AACA,SAAO;AACT;AAEA,SAAS,OAAO,OAAuB;AACrC,SAAO,WAAW,QAAQ,EAAE,OAAO,KAAK,EAAE,OAAO,KAAK,EAAE,MAAM,GAAG,EAAE;AACrE;AAEA,SAAS,WAAW,MAAc,MAAoB;AACpD,MAAI,WAAW,IAAI,GAAG;AACpB,UAAM,WAAW,aAAa,MAAM,MAAM;AAC1C,kBAAc,MAAM,WAAW,IAAI;AAAA,EACrC,OAAO;AACL,kBAAc,MAAM,IAAI;AAAA,EAC1B;AACF;;;AElSA,SAAS,oBAAoB;AAC7B,SAAS,cAAAA,mBAAkB;AAC3B,SAAS,UAAU,YAAY,QAAAC,aAAY;AAuBpC,IAAM,uBAAN,cAAmC,MAAM;AAAA,EAC9C,YACE,SACS,OACT;AACA,UAAM,OAAO;AAFJ;AAGT,SAAK,OAAO;AAAA,EACd;AAAA,EAJW;AAKb;AAaA,SAAS,WAAW,MAAgB,KAAqB;AACvD,MAAI;AACF,WAAO,aAAa,OAAO,MAAM,EAAE,KAAK,UAAU,OAAO,CAAC,EAAE,KAAK;AAAA,EACnE,SAAS,KAAK;AACZ,UAAM,SACJ,OAAO,OAAO,QAAQ,YAAY,YAAY,MAC1C,OAAQ,IAA4B,MAAM,IAC1C;AACN,UAAM,IAAI,qBAAqB,OAAO,KAAK,KAAK,GAAG,CAAC,YAAY,UAAU,OAAO,GAAG,CAAC,IAAI,GAAG;AAAA,EAC9F;AACF;AAGA,SAAS,KAAK,OAAuB;AACnC,SACE,MACG,YAAY,EACZ,QAAQ,eAAe,GAAG,EAC1B,QAAQ,YAAY,EAAE,EACtB,MAAM,GAAG,EAAE,KAAK;AAEvB;AAEO,SAAS,mBAAmB,MAAkD;AACnF,QAAM,MAAM,KAAK,OAAO;AACxB,QAAM,cAAc,KAAK,eAAeA,MAAK,KAAK,UAAU,YAAY;AACxE,QAAM,eAAe,KAAK,gBAAgB;AAE1C,SAAO;AAAA,IACL,MAAM,OAAO,EAAE,SAAS,MAAM,GAAG;AAC/B,YAAM,KAAK,GAAG,KAAK,KAAK,CAAC,IAAI,KAAK,IAAI,EAAE,SAAS,EAAE,CAAC,IAAI,KAAK,OAAO,EAAE,SAAS,EAAE,EAAE,MAAM,GAAG,CAAC,CAAC;AAC9F,YAAM,SAAS,GAAG,YAAY,IAAI,EAAE;AACpC,YAAM,OAAOA,MAAK,aAAa,EAAE;AACjC,UAAI,CAAC,YAAY,OAAO,MAAM,QAAQ,MAAM,OAAO,GAAG,KAAK,QAAQ;AACnE,aAAO,EAAE,MAAM,QAAQ,QAAQ;AAAA,IACjC;AAAA,IAEA,MAAM,SAAS,UAAU,SAAS;AAGhC,YAAM,SAAS,IAAI,CAAC,UAAU,aAAa,GAAG,SAAS,IAAI;AAC3D,UAAI,OAAO,SAAS,GAAG;AACrB,YAAI,CAAC,OAAO,IAAI,GAAG,SAAS,IAAI;AAChC,YAAI,CAAC,UAAU,MAAM,OAAO,GAAG,SAAS,IAAI;AAAA,MAC9C;AACA,aAAO;AAAA,QACL,MAAM;AAAA,QACN,aAAa,SAAS;AAAA,QACtB,SAAS,SAAS;AAAA,QAClB;AAAA,MACF;AAAA,IACF;AAAA,IAEA,MAAM,QAAQ,UAAU;AAGtB,UAAI,CAAC,YAAY,UAAU,WAAW,SAAS,IAAI,GAAG,KAAK,QAAQ;AACnE,UAAI,CAAC,UAAU,MAAM,SAAS,MAAM,GAAG,KAAK,QAAQ;AAAA,IACtD;AAAA,EACF;AACF;AAKO,SAAS,oBAAoB,SAAsB,aAA8B;AACtF,MAAI,WAAW,QAAQ,WAAW,KAAKD,YAAW,QAAQ,WAAW,EAAG,QAAO,QAAQ;AACvF,MAAI,YAAa,QAAOC,MAAK,aAAa,SAAS,QAAQ,WAAW,CAAC;AACvE,SAAO,QAAQ;AACjB;","names":["existsSync","join"]}
|
|
1
|
+
{"version":3,"sources":["../../src/campaign/labeled-store/fs-adapter.ts","../../src/campaign/presets/run-profile-matrix.ts","../../src/campaign/worktree/index.ts"],"sourcesContent":["/**\n * @experimental\n *\n * Filesystem `LabeledScenarioStore` adapter. The default capture sink for\n * traces + eval artifacts. Production deployments typically swap for a\n * Turso/SQLite adapter (same interface).\n *\n * Records land as one JSONL file per source under `<root>/<source>.jsonl`.\n * Each line is a `LabeledScenarioRecord`. Append-only — no in-place edits.\n *\n * Safety properties enforced at write-time:\n *\n * - **Provenance required**: writes without `source`, `sourceVersionHash`,\n * `capturedAt`, `redactionStatus` are rejected. Closes the alignment\n * reviewer's data-poisoning gap.\n * - **Per-source rate limits**: optional `rateLimitBucket` + `maxWritesPerMinute`\n * stops a single tenant/source from flooding the store.\n *\n * Safety properties enforced at sample-time:\n *\n * - **Required split + capturedBefore**: substrate refuses to sample without\n * an explicit `split` ('train' | 'test') AND a temporal cutoff. Eliminates\n * accidental train/test contamination.\n * - **Default training-source filter**: when the store is sampled with\n * `split: 'train'`, production-trace records are EXCLUDED unless the\n * caller passes `filter.source: 'production-trace'` explicitly. Closes\n * the contamination-by-default gap flagged by the senior eval engineer.\n */\n\nimport { createHash } from 'node:crypto'\nimport { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs'\nimport { join } from 'node:path'\nimport type {\n LabeledScenarioRecord,\n LabeledScenarioSampleArgs,\n LabeledScenarioSource,\n LabeledScenarioStore,\n LabeledScenarioWrite,\n LabelTrust,\n} from '../types'\nimport { labelTrustRank } from '../types'\n\nexport interface FsLabeledScenarioStoreOptions {\n /** Root directory for JSONL files. Created if missing. */\n root: string\n /** Per-source rate limit. When set, writes exceeding the cap are rejected\n * with a typed error. Default: no limit. */\n maxWritesPerMinutePerBucket?: number\n /** Test seam — override `Date.now()` for deterministic tests. */\n now?: () => number\n}\n\nexport class LabeledScenarioStoreError extends Error {\n constructor(\n public readonly code: string,\n message: string,\n ) {\n super(message)\n this.name = 'LabeledScenarioStoreError'\n }\n}\n\ninterface RateLimitState {\n bucket: string\n windowStartMs: number\n count: number\n}\n\nexport class FsLabeledScenarioStore implements LabeledScenarioStore {\n private readonly now: () => number\n private readonly rateLimits = new Map<string, RateLimitState>()\n\n constructor(private readonly options: FsLabeledScenarioStoreOptions) {\n if (!existsSync(options.root)) mkdirSync(options.root, { recursive: true })\n this.now = options.now ?? Date.now\n }\n\n async observe(write: LabeledScenarioWrite): Promise<void> {\n this.assertProvenance(write)\n this.assertRateLimit(write)\n const record = this.toRecord(write)\n const path = this.pathForSource(write.source)\n const line = `${JSON.stringify(record)}\\n`\n // Append atomically. For high-throughput a writev-friendly buffered\n // implementation lands in the Turso adapter; FS adapter is for tests +\n // local dev + small workloads.\n appendLine(path, line)\n }\n\n async sample(args: LabeledScenarioSampleArgs): Promise<LabeledScenarioRecord[]> {\n if (!args.split) {\n throw new LabeledScenarioStoreError(\n 'split_required',\n 'sample() requires an explicit `split` (train | test) — substrate refuses ambiguous reads',\n )\n }\n if (!args.capturedBefore) {\n throw new LabeledScenarioStoreError(\n 'capturedBefore_required',\n 'sample() requires an explicit `capturedBefore` timestamp for temporal-split discipline',\n )\n }\n\n const all: LabeledScenarioRecord[] = []\n for (const source of ALL_SOURCES) {\n // Default training-source filter: when sampling train, EXCLUDE\n // production-trace records unless the caller asks for them.\n if (args.split === 'train' && source === 'production-trace') {\n const explicit = sourceFilterContains(args.filter?.source, 'production-trace')\n if (!explicit) continue\n }\n const path = this.pathForSource(source)\n if (!existsSync(path)) continue\n const lines = readFileSync(path, 'utf8').split('\\n').filter(Boolean)\n for (const line of lines) {\n let record: LabeledScenarioRecord\n try {\n record = JSON.parse(line) as LabeledScenarioRecord\n } catch {\n continue\n }\n if (!matchesFilter(record, args, source)) continue\n all.push(record)\n }\n }\n\n // Deterministic order: by capturedAt ascending, then recordHash.\n all.sort((a, b) => {\n if (a.capturedAt !== b.capturedAt) return a.capturedAt.localeCompare(b.capturedAt)\n return a.recordHash.localeCompare(b.recordHash)\n })\n\n return all.slice(0, args.count)\n }\n\n async size(): Promise<{\n train: number\n test: number\n bySource: Record<string, number>\n byTrust: Record<LabelTrust, number>\n }> {\n const bySource: Record<string, number> = {}\n const byTrust: Record<LabelTrust, number> = {\n unverified: 0,\n 'verified-signal': 0,\n 'human-rated': 0,\n }\n let total = 0\n for (const source of ALL_SOURCES) {\n const path = this.pathForSource(source)\n if (!existsSync(path)) {\n bySource[source] = 0\n continue\n }\n const lines = readFileSync(path, 'utf8').split('\\n').filter(Boolean)\n bySource[source] = lines.length\n total += lines.length\n for (const line of lines) {\n let trust: LabelTrust = 'unverified'\n try {\n trust = (JSON.parse(line) as LabeledScenarioRecord).labelTrust ?? 'unverified'\n } catch {\n // A malformed line counts as unverified — never silently gold.\n }\n byTrust[trust] += 1\n }\n }\n // FS adapter doesn't track split assignments per-record (split is\n // computed at sample-time based on `capturedBefore`). For size(), we\n // report `train`+`test` as the same total — split is a sampling concept.\n return { train: total, test: total, bySource, byTrust }\n }\n\n private assertProvenance(write: LabeledScenarioWrite): void {\n if (!write.source) {\n throw new LabeledScenarioStoreError(\n 'missing_source',\n 'LabeledScenarioWrite requires `source`',\n )\n }\n if (!write.sourceVersionHash || write.sourceVersionHash.length === 0) {\n throw new LabeledScenarioStoreError(\n 'missing_source_version',\n 'LabeledScenarioWrite requires `sourceVersionHash` (git sha or substrate version)',\n )\n }\n if (!write.capturedAt) {\n throw new LabeledScenarioStoreError(\n 'missing_captured_at',\n 'LabeledScenarioWrite requires `capturedAt` ISO timestamp',\n )\n }\n if (!write.redactionStatus) {\n throw new LabeledScenarioStoreError(\n 'missing_redaction_status',\n 'LabeledScenarioWrite requires explicit `redactionStatus` — raw / redacted-pii / redacted-secrets / fully-redacted',\n )\n }\n if (!ALL_SOURCES.includes(write.source)) {\n throw new LabeledScenarioStoreError(\n 'unknown_source',\n `LabeledScenarioWrite.source must be one of: ${ALL_SOURCES.join(', ')}`,\n )\n }\n }\n\n private assertRateLimit(write: LabeledScenarioWrite): void {\n const cap = this.options.maxWritesPerMinutePerBucket\n if (!cap || !write.rateLimitBucket) return\n const now = this.now()\n const windowMs = 60_000\n let state = this.rateLimits.get(write.rateLimitBucket)\n if (!state || now - state.windowStartMs >= windowMs) {\n state = { bucket: write.rateLimitBucket, windowStartMs: now, count: 0 }\n this.rateLimits.set(write.rateLimitBucket, state)\n }\n if (state.count >= cap) {\n throw new LabeledScenarioStoreError(\n 'rate_limit_exceeded',\n `LabeledScenarioStore: bucket ${write.rateLimitBucket} exceeded ${cap} writes/min`,\n )\n }\n state.count += 1\n }\n\n private toRecord(write: LabeledScenarioWrite): LabeledScenarioRecord {\n const recordHash = sha256(\n JSON.stringify({\n id: write.scenario.id,\n src: write.source,\n at: write.capturedAt,\n ver: write.sourceVersionHash,\n }),\n )\n // FS adapter assigns split at sample-time, but we cache a hint here\n // based on capturedAt vs the world's \"now\" — sampler overrides this.\n return {\n ...write,\n recordHash,\n split: 'train',\n }\n }\n\n private pathForSource(source: string): string {\n return join(this.options.root, `${source}.jsonl`)\n }\n}\n\nconst ALL_SOURCES: LabeledScenarioWrite['source'][] = [\n 'production-trace',\n 'eval-run',\n 'manual',\n 'red-team',\n 'synthetic',\n]\n\nfunction sourceFilterContains(\n filter: LabeledScenarioSource | LabeledScenarioSource[] | undefined,\n needle: LabeledScenarioSource,\n): boolean {\n if (!filter) return false\n if (Array.isArray(filter)) return filter.includes(needle)\n return filter === needle\n}\n\nfunction matchesFilter(\n record: LabeledScenarioRecord,\n args: LabeledScenarioSampleArgs,\n source: string,\n): boolean {\n // Temporal cutoff — train must be capturedAt < capturedBefore.\n if (args.split === 'train' && record.capturedAt >= args.capturedBefore) return false\n if (args.split === 'test' && record.capturedAt < args.capturedBefore) return false\n\n const f = args.filter\n if (!f) return true\n if (f.kind && record.scenario.kind !== f.kind) return false\n if (f.source) {\n const sources = Array.isArray(f.source) ? f.source : [f.source]\n if (!sources.includes(source as never)) return false\n }\n if (f.minComposite !== undefined || f.maxComposite !== undefined) {\n const composites = Object.values(record.judgeScores).map((s) => s.composite)\n const max = composites.length === 0 ? 0 : Math.max(...composites)\n if (f.minComposite !== undefined && max < f.minComposite) return false\n if (f.maxComposite !== undefined && max > f.maxComposite) return false\n }\n if (f.minTrust !== undefined && labelTrustRank(record.labelTrust) < labelTrustRank(f.minTrust)) {\n return false\n }\n return true\n}\n\nfunction sha256(input: string): string {\n return createHash('sha256').update(input).digest('hex').slice(0, 16)\n}\n\nfunction appendLine(path: string, line: string): void {\n if (existsSync(path)) {\n const existing = readFileSync(path, 'utf8')\n writeFileSync(path, existing + line)\n } else {\n writeFileSync(path, line)\n }\n}\n","/**\n * @experimental\n *\n * `runProfileMatrix` — the missing keystone between `runAgentMatrix` and the\n * backend-integrity guard.\n *\n * The gap it closes: `runAgentMatrix` is a topology-opaque scheduler whose\n * cells return a bare `{ output, verdict, costUsd }` — no `tokenUsage`, not a\n * `RunRecord`. `assertRealBackend` / `summarizeBackendIntegrity` key on\n * `RunRecord.tokenUsage`, so they cannot run on a raw matrix result. Every\n * consumer therefore hand-writes the same bridge: fan a profile × scenario\n * cartesian, call dispatch, fabricate a `RunRecord` with token usage, thread it\n * back, run the integrity guard. That hand-rolled bridge is exactly the pile of\n * bespoke `eval:*` scripts the adoption skills keep trying (and failing) to\n * forbid.\n *\n * `runProfileMatrix` IS that bridge, once:\n *\n * - axis 3 (PROFILE) = `profiles: AgentProfile[]`\n * - axis 1 (PERSONA/SCENARIO) = `scenarios: Scenario[]` (each scenario carries\n * its persona; `personaOf` groups them for the `byPersona` pivot)\n * - the scoring axis = `judges`\n *\n * It runs `runCampaign` once per profile (reusing its seeds, reps, bootstrap\n * CIs, resumability, and the `LabeledScenarioStore` capture flywheel), maps\n * every cell to a validated `RunRecord` carrying the real `tokenUsage` the\n * dispatch reported via `ctx.cost.observeTokens`, and runs `assertRealBackend`\n * BY CONSTRUCTION before returning — so a stub-backend run fails loudly instead\n * of reporting a clean 0/N leaderboard.\n *\n * Dispatch contract: a dispatch that calls an LLM MUST report usage via\n * `ctx.cost.observeTokens({ input, output })` (and cost via `ctx.cost.observe`).\n * A dispatch that reports zero tokens is indistinguishable from a stub and the\n * integrity guard treats it as one.\n */\n\nimport { createHash } from 'node:crypto'\nimport { join } from 'node:path'\nimport { type AgentProfile, agentProfileHash } from '../../agent-profile'\nimport { AgentEvalError } from '../../errors'\nimport {\n assertRealBackend,\n type BackendIntegrityReport,\n summarizeBackendIntegrity,\n} from '../../integrity/backend-integrity'\nimport {\n type RunOutcome,\n type RunRecord,\n type RunSplitTag,\n validateRunRecord,\n} from '../../run-record'\nimport { runCampaign } from '../run-campaign'\nimport type { CampaignStorage } from '../storage'\nimport type {\n CampaignCellResult,\n CampaignResult,\n DispatchContext,\n JudgeConfig,\n LabeledScenarioSource,\n LabeledScenarioStore,\n Scenario,\n} from '../types'\n\n/** Thrown when the matrix is misconfigured (no profiles, a profile whose model\n * lacks a snapshot version, etc.). Distinct from `BackendIntegrityError`,\n * which signals a stub backend at run time. */\nexport class ProfileMatrixError extends AgentEvalError {\n constructor(message: string) {\n super('profile_matrix', message)\n }\n}\n\n/** Dispatch for one cell: render `profile` against `scenario`, returning the\n * artifact the judges score. Report LLM usage via `ctx.cost.observeTokens`\n * and `ctx.cost.observe` — the integrity guard depends on it. */\nexport type ProfileDispatchFn<TScenario extends Scenario, TArtifact> = (\n profile: AgentProfile,\n scenario: TScenario,\n ctx: DispatchContext,\n) => Promise<TArtifact>\n\nexport interface RunProfileMatrixOptions<TScenario extends Scenario, TArtifact> {\n /** Axis 3 — the agent-under-test configurations. Each is one column. */\n profiles: AgentProfile[]\n /** Axis 1 — the persona/scenario corpus, run against every profile. */\n scenarios: TScenario[]\n /** Renders one (profile, scenario) cell. */\n dispatch: ProfileDispatchFn<TScenario, TArtifact>\n /** The scoring axis. */\n judges?: JudgeConfig<TArtifact, TScenario>[]\n /** Where each profile's campaign writes artifacts/traces. One subdir per\n * profile. */\n runDir: string\n /** Git SHA the harness ran from — stamped onto every RunRecord (mandatory\n * for paper-grade records). */\n commitSha: string\n /** Logical experiment id shared across the whole matrix so the promotion\n * gate can pair profiles on matched scenarios. Default: a hash of the\n * profile + scenario ids. */\n experimentId?: string\n /** Which split these runs belong to. Default `'search'`. */\n splitTag?: RunSplitTag\n /** Replicates per (profile, scenario) cell for CI bands. Default 1. */\n reps?: number\n /** Campaign seed (per profile). Default 42. */\n seed?: number\n /**\n * Backend-integrity posture, enforced AFTER the matrix completes:\n * - `'assert'` (default) — throw `BackendIntegrityError` if the run was a\n * stub (and, with `allowMixed:false`, if it was mixed).\n * - `'warn'` — log the verdict but never throw.\n * - `'off'` — skip the guard entirely (only for offline/replay analysis).\n */\n integrity?: 'assert' | 'warn' | 'off'\n /** Forwarded to `assertRealBackend`. Default true (tolerate partial 429\n * cascades); set false for strict CI gates. */\n allowMixed?: boolean\n /** Max concurrent cells WITHIN each profile's campaign. Default 2.\n * Profiles run sequentially so the cost ceiling is honored deterministically. */\n maxConcurrency?: number\n /** Cumulative USD cap per profile campaign. */\n costCeiling?: number\n /** Capture flywheel — forwarded to each campaign. */\n labeledStore?: LabeledScenarioStore | 'off'\n captureSource?: LabeledScenarioSource\n /** Storage backend. Default `fsCampaignStorage`. Pass\n * `inMemoryCampaignStorage()` for edge/CF-Worker/test runs. */\n storage?: CampaignStorage\n /** Test seam — override the wall clock. */\n now?: () => Date\n /** Optional persona key per scenario — drives the `byPersona` pivot. When\n * unset, `byPersona` is omitted. */\n personaOf?: (scenario: TScenario) => string\n /** Validate every produced RunRecord with `validateRunRecord` (fail-loud).\n * Default true — catches bad model snapshots and non-finite judge dims at\n * the boundary instead of letting them poison downstream analysis. */\n validate?: boolean\n}\n\nexport interface ProfileSummary {\n profileId: string\n profileHash: string\n model: string\n /** RunRecords produced for this profile (= scenarios × reps). */\n records: number\n /** Mean composite across this profile's records. */\n meanComposite: number\n totalCostUsd: number\n /** Per-profile integrity verdict — surfaces a single profile that ran stub\n * even when the matrix as a whole looks real. */\n integrity: BackendIntegrityReport\n}\n\nexport interface ScenarioRollup {\n meanComposite: number\n n: number\n}\n\nexport interface RunProfileMatrixResult<TArtifact, TScenario extends Scenario> {\n matrixId: string\n experimentId: string\n /** One RunRecord per (profile, scenario, rep) cell — the integrity-checked,\n * paper-grade output. Feed straight into `analyzeRuns`, `HeldOutGate`,\n * scorecards, the hosted wire format. */\n records: RunRecord[]\n byProfile: Record<string, ProfileSummary>\n byScenario: Record<string, ScenarioRollup>\n /** Present only when `personaOf` was supplied. */\n byPersona?: Record<string, ScenarioRollup>\n /** Whole-matrix integrity report (the one `integrity:'assert'` enforces). */\n integrity: BackendIntegrityReport\n /** The raw per-profile campaign results, keyed by profile id. */\n campaigns: Record<string, CampaignResult<TArtifact, TScenario>>\n}\n\nfunction sanitize(id: string): string {\n return id.replace(/[^a-zA-Z0-9_-]/g, '_')\n}\n\nfunction sha(input: unknown): string {\n return createHash('sha256').update(JSON.stringify(input)).digest('hex')\n}\n\nfunction mean(xs: number[]): number {\n return xs.length === 0 ? 0 : xs.reduce((a, b) => a + b, 0) / xs.length\n}\n\nfunction cellComposite(cell: CampaignCellResult<unknown>): number {\n const composites = Object.values(cell.judgeScores).map((s) => s.composite)\n return composites.length === 0 ? 0 : mean(composites)\n}\n\ninterface BuildRecordArgs<TArtifact> {\n cell: CampaignCellResult<TArtifact>\n profile: AgentProfile\n profileHash: string\n configHash: string\n experimentId: string\n splitTag: RunSplitTag\n commitSha: string\n matrixId: string\n}\n\nfunction buildRunRecord<TArtifact>(args: BuildRecordArgs<TArtifact>): RunRecord {\n const { cell, profile, profileHash, configHash, experimentId, splitTag, commitSha, matrixId } =\n args\n const composite = cellComposite(cell)\n\n // Flatten judge dimensions (judge-prefixed to avoid collisions) into raw.\n const raw: Record<string, number> = { composite }\n const perJudge: Record<string, Record<string, number>> = {}\n const dimAccum: Record<string, number[]> = {}\n const notes: string[] = []\n for (const [judgeName, js] of Object.entries(cell.judgeScores)) {\n perJudge[judgeName] = { ...js.dimensions }\n for (const [dim, value] of Object.entries(js.dimensions)) {\n raw[`${judgeName}.${dim}`] = value\n ;(dimAccum[dim] ??= []).push(value)\n }\n if (js.notes) notes.push(`${judgeName}: ${js.notes}`)\n }\n const perDimMean: Record<string, number> = {}\n for (const [dim, values] of Object.entries(dimAccum)) perDimMean[dim] = mean(values)\n\n const outcome: RunOutcome =\n splitTag === 'holdout' ? { holdoutScore: composite, raw } : { searchScore: composite, raw }\n if (Object.keys(perJudge).length > 0) {\n outcome.judgeScores = {\n perJudge,\n perDimMean,\n composite,\n ...(notes.length > 0 ? { notes: notes.join(' | ') } : {}),\n }\n }\n\n return {\n runId: `${matrixId}:${profile.id}:${cell.cellId}`,\n experimentId,\n candidateId: profile.id,\n seed: cell.seed,\n model: profile.model,\n promptHash: profileHash,\n configHash,\n commitSha,\n wallMs: cell.durationMs,\n costUsd: cell.costUsd,\n tokenUsage: cell.tokenUsage,\n outcome,\n splitTag,\n scenarioId: cell.scenarioId,\n ...(cell.error ? { failureMode: cell.error } : {}),\n }\n}\n\nexport async function runProfileMatrix<TScenario extends Scenario, TArtifact>(\n opts: RunProfileMatrixOptions<TScenario, TArtifact>,\n): Promise<RunProfileMatrixResult<TArtifact, TScenario>> {\n if (opts.profiles.length === 0) throw new ProfileMatrixError('profiles must not be empty')\n if (opts.scenarios.length === 0) throw new ProfileMatrixError('scenarios must not be empty')\n\n const splitTag = opts.splitTag ?? 'search'\n const seed = opts.seed ?? 42\n const validate = opts.validate ?? true\n const integrityMode = opts.integrity ?? 'assert'\n const profileIds = opts.profiles.map((p) => p.id)\n const experimentId =\n opts.experimentId ??\n `pm_${sha({ profileIds, scenarios: opts.scenarios.map((s) => s.id) }).slice(0, 16)}`\n const matrixId = `mtx_${sha({ experimentId, profileIds, seed, splitTag }).slice(0, 16)}`\n\n // Preflight: every profile must hash (non-empty model) AND its model must\n // carry a snapshot version, BEFORE any LLM spend. A probe record run through\n // validateRunRecord catches both in the exact place they'd otherwise surface\n // far downstream.\n for (const profile of opts.profiles) {\n const profileHash = agentProfileHash(profile)\n try {\n validateRunRecord({\n runId: `${matrixId}:${profile.id}:probe`,\n experimentId,\n candidateId: profile.id,\n seed,\n model: profile.model,\n promptHash: profileHash,\n configHash: profileHash,\n commitSha: opts.commitSha,\n wallMs: 0,\n costUsd: 0,\n tokenUsage: { input: 0, output: 0 },\n outcome:\n splitTag === 'holdout' ? { holdoutScore: 0, raw: {} } : { searchScore: 0, raw: {} },\n splitTag,\n })\n } catch (err) {\n throw new ProfileMatrixError(\n `profile '${profile.id}' is not recordable: ${err instanceof Error ? err.message : String(err)}`,\n )\n }\n }\n\n const records: RunRecord[] = []\n const campaigns: Record<string, CampaignResult<TArtifact, TScenario>> = {}\n const byProfile: Record<string, ProfileSummary> = {}\n\n for (const profile of opts.profiles) {\n const profileHash = agentProfileHash(profile)\n const configHash = sha({\n profile: profileHash,\n judges: (opts.judges ?? []).map((j) => j.name),\n seed,\n splitTag,\n })\n\n // Bind the profile into a campaign dispatch. Name it so the campaign's\n // manifest hash is stable + distinct per profile.\n const dispatch = (scenario: TScenario, ctx: DispatchContext): Promise<TArtifact> =>\n opts.dispatch(profile, scenario, ctx)\n Object.defineProperty(dispatch, 'name', { value: `profile_${sanitize(profile.id)}` })\n\n const campaign = await runCampaign<TScenario, TArtifact>({\n scenarios: opts.scenarios,\n dispatch,\n judges: opts.judges,\n seed,\n reps: opts.reps,\n maxConcurrency: opts.maxConcurrency,\n costCeiling: opts.costCeiling,\n labeledStore: opts.labeledStore,\n captureSource: opts.captureSource,\n storage: opts.storage,\n now: opts.now,\n runDir: join(opts.runDir, sanitize(profile.id)),\n })\n campaigns[profile.id] = campaign\n\n const profileRecords: RunRecord[] = []\n for (const cell of campaign.cells) {\n const record = buildRunRecord({\n cell,\n profile,\n profileHash,\n configHash,\n experimentId,\n splitTag,\n commitSha: opts.commitSha,\n matrixId,\n })\n if (validate) validateRunRecord(record)\n profileRecords.push(record)\n records.push(record)\n }\n\n byProfile[profile.id] = {\n profileId: profile.id,\n profileHash,\n model: profile.model,\n records: profileRecords.length,\n meanComposite: mean(profileRecords.map(compositeOf)),\n totalCostUsd: profileRecords.reduce((a, r) => a + r.costUsd, 0),\n integrity: summarizeBackendIntegrity(profileRecords),\n }\n }\n\n // Integrity by construction — the whole point of the primitive.\n const integrity = summarizeBackendIntegrity(records)\n if (integrityMode === 'assert') {\n assertRealBackend(records, { allowMixed: opts.allowMixed ?? true })\n } else if (integrityMode === 'warn' && integrity.verdict !== 'real') {\n // eslint-disable-next-line no-console\n console.warn(\n `[runProfileMatrix] backend integrity: ${integrity.verdict} — ${integrity.diagnosis}`,\n )\n }\n\n // Pivots.\n const byScenario = rollup(records, (r) => r.scenarioId)\n const byPersona = opts.personaOf\n ? rollupByPersona(records, opts.scenarios, opts.personaOf)\n : undefined\n\n return { matrixId, experimentId, records, byProfile, byScenario, byPersona, integrity, campaigns }\n}\n\n/** Composite for a produced RunRecord (the split score it carries). */\nfunction compositeOf(r: RunRecord): number {\n return r.outcome.holdoutScore ?? r.outcome.searchScore ?? 0\n}\n\nfunction rollup(\n records: RunRecord[],\n keyOf: (r: RunRecord) => string | undefined,\n): Record<string, ScenarioRollup> {\n const groups = new Map<string, number[]>()\n for (const r of records) {\n const key = keyOf(r)\n if (key === undefined) continue\n const arr = groups.get(key) ?? []\n arr.push(compositeOf(r))\n groups.set(key, arr)\n }\n const out: Record<string, ScenarioRollup> = {}\n for (const [key, xs] of groups) out[key] = { meanComposite: mean(xs), n: xs.length }\n return out\n}\n\nfunction rollupByPersona<TScenario extends Scenario>(\n records: RunRecord[],\n scenarios: TScenario[],\n personaOf: (s: TScenario) => string,\n): Record<string, ScenarioRollup> {\n const personaByScenarioId = new Map<string, string>()\n for (const s of scenarios) personaByScenarioId.set(s.id, personaOf(s))\n return rollup(records, (r) => (r.scenarioId ? personaByScenarioId.get(r.scenarioId) : undefined))\n}\n","/**\n * @experimental\n *\n * VCS-pluggable worktree adapter. One improvement = one worktree, PR-like\n * (multiple commits allowed). A code-tier driver's `propose()` creates a\n * worktree, an agent commits the change into it, and `finalize()` returns a\n * `CodeSurface{ worktreeRef }` the measurement checks out to run the worker\n * against the changed code. On promotion the worktree becomes the PR branch.\n *\n * The interface is VCS-agnostic so a future `jj` ([jj-vcs](https://github.com/jj-vcs/jj))\n * adapter can slot in without touching driver code. Only the git adapter\n * ships today. See `docs/design/self-improvement-engine.md`.\n */\n\nimport { execFileSync } from 'node:child_process'\nimport { existsSync } from 'node:fs'\nimport { basename, isAbsolute, join } from 'node:path'\nimport type { CodeSurface } from '../types'\n\nexport interface Worktree {\n /** Absolute path to the checked-out worktree directory. */\n path: string\n /** The branch the worktree is on (becomes the PR branch on promotion). */\n branch: string\n /** The ref the worktree was forked from. */\n baseRef: string\n}\n\nexport interface WorktreeAdapter {\n /** Create an isolated worktree on a fresh branch off `baseRef`. */\n create(opts: { baseRef: string; label: string }): Promise<Worktree>\n /** Commit any pending changes in the worktree, then return a CodeSurface\n * pointing at it. The agent has already written its change into\n * `worktree.path` by the time this is called. */\n finalize(worktree: Worktree, summary: string): Promise<CodeSurface>\n /** Remove the worktree (and its branch) — called for losing candidates. */\n discard(worktree: Worktree): Promise<void>\n}\n\nexport class WorktreeAdapterError extends Error {\n constructor(\n message: string,\n readonly cause?: unknown,\n ) {\n super(message)\n this.name = 'WorktreeAdapterError'\n }\n}\n\nexport interface GitWorktreeAdapterOptions {\n /** Repo root the worktrees fork from. */\n repoRoot: string\n /** Directory worktrees are created under. Default: `<repoRoot>/.worktrees`. */\n worktreeDir?: string\n /** Branch-name prefix. Default: `improve`. */\n branchPrefix?: string\n /** Test seam — defaults to a real `git` runner. */\n git?: (args: string[], cwd: string) => string\n}\n\nfunction defaultGit(args: string[], cwd: string): string {\n try {\n return execFileSync('git', args, { cwd, encoding: 'utf8' }).trim()\n } catch (err) {\n const stderr =\n err && typeof err === 'object' && 'stderr' in err\n ? String((err as { stderr: unknown }).stderr)\n : ''\n throw new WorktreeAdapterError(`git ${args.join(' ')} failed: ${stderr || String(err)}`, err)\n }\n}\n\n/** Slugify a label into a branch-safe segment. */\nfunction slug(label: string): string {\n return (\n label\n .toLowerCase()\n .replace(/[^a-z0-9]+/g, '-')\n .replace(/^-+|-+$/g, '')\n .slice(0, 48) || 'candidate'\n )\n}\n\nexport function gitWorktreeAdapter(opts: GitWorktreeAdapterOptions): WorktreeAdapter {\n const git = opts.git ?? defaultGit\n const worktreeDir = opts.worktreeDir ?? join(opts.repoRoot, '.worktrees')\n const branchPrefix = opts.branchPrefix ?? 'improve'\n\n return {\n async create({ baseRef, label }) {\n const id = `${slug(label)}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 6)}`\n const branch = `${branchPrefix}/${id}`\n const path = join(worktreeDir, id)\n git(['worktree', 'add', '-b', branch, path, baseRef], opts.repoRoot)\n return { path, branch, baseRef }\n },\n\n async finalize(worktree, summary) {\n // Stage + commit any pending changes the agent left in the worktree.\n // A no-op commit is refused by git, so only commit when the tree is dirty.\n const status = git(['status', '--porcelain'], worktree.path)\n if (status.length > 0) {\n git(['add', '-A'], worktree.path)\n git(['commit', '-m', summary], worktree.path)\n }\n return {\n kind: 'code',\n worktreeRef: worktree.path,\n baseRef: worktree.baseRef,\n summary,\n }\n },\n\n async discard(worktree) {\n // Remove the worktree, then delete its branch. Force-remove because the\n // worktree may hold uncommitted experiment state we're discarding.\n git(['worktree', 'remove', '--force', worktree.path], opts.repoRoot)\n git(['branch', '-D', worktree.branch], opts.repoRoot)\n },\n }\n}\n\n/** Resolve a `CodeSurface`'s worktreeRef to a directory the measurement can\n * run the worker in. A path ref is returned as-is; anything else is treated\n * as a ref under the adapter's worktree dir. */\nexport function resolveWorktreePath(surface: CodeSurface, worktreeDir?: string): string {\n if (isAbsolute(surface.worktreeRef) && existsSync(surface.worktreeRef)) return surface.worktreeRef\n if (worktreeDir) return join(worktreeDir, basename(surface.worktreeRef))\n return surface.worktreeRef\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AA6BA,SAAS,kBAAkB;AAC3B,SAAS,YAAY,WAAW,cAAc,qBAAqB;AACnE,SAAS,YAAY;AAqBd,IAAM,4BAAN,cAAwC,MAAM;AAAA,EACnD,YACkB,MAChB,SACA;AACA,UAAM,OAAO;AAHG;AAIhB,SAAK,OAAO;AAAA,EACd;AAAA,EALkB;AAMpB;AAQO,IAAM,yBAAN,MAA6D;AAAA,EAIlE,YAA6B,SAAwC;AAAxC;AAC3B,QAAI,CAAC,WAAW,QAAQ,IAAI,EAAG,WAAU,QAAQ,MAAM,EAAE,WAAW,KAAK,CAAC;AAC1E,SAAK,MAAM,QAAQ,OAAO,KAAK;AAAA,EACjC;AAAA,EAH6B;AAAA,EAHZ;AAAA,EACA,aAAa,oBAAI,IAA4B;AAAA,EAO9D,MAAM,QAAQ,OAA4C;AACxD,SAAK,iBAAiB,KAAK;AAC3B,SAAK,gBAAgB,KAAK;AAC1B,UAAM,SAAS,KAAK,SAAS,KAAK;AAClC,UAAM,OAAO,KAAK,cAAc,MAAM,MAAM;AAC5C,UAAM,OAAO,GAAG,KAAK,UAAU,MAAM,CAAC;AAAA;AAItC,eAAW,MAAM,IAAI;AAAA,EACvB;AAAA,EAEA,MAAM,OAAO,MAAmE;AAC9E,QAAI,CAAC,KAAK,OAAO;AACf,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,KAAK,gBAAgB;AACxB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AAEA,UAAM,MAA+B,CAAC;AACtC,eAAW,UAAU,aAAa;AAGhC,UAAI,KAAK,UAAU,WAAW,WAAW,oBAAoB;AAC3D,cAAM,WAAW,qBAAqB,KAAK,QAAQ,QAAQ,kBAAkB;AAC7E,YAAI,CAAC,SAAU;AAAA,MACjB;AACA,YAAM,OAAO,KAAK,cAAc,MAAM;AACtC,UAAI,CAAC,WAAW,IAAI,EAAG;AACvB,YAAM,QAAQ,aAAa,MAAM,MAAM,EAAE,MAAM,IAAI,EAAE,OAAO,OAAO;AACnE,iBAAW,QAAQ,OAAO;AACxB,YAAI;AACJ,YAAI;AACF,mBAAS,KAAK,MAAM,IAAI;AAAA,QAC1B,QAAQ;AACN;AAAA,QACF;AACA,YAAI,CAAC,cAAc,QAAQ,MAAM,MAAM,EAAG;AAC1C,YAAI,KAAK,MAAM;AAAA,MACjB;AAAA,IACF;AAGA,QAAI,KAAK,CAAC,GAAG,MAAM;AACjB,UAAI,EAAE,eAAe,EAAE,WAAY,QAAO,EAAE,WAAW,cAAc,EAAE,UAAU;AACjF,aAAO,EAAE,WAAW,cAAc,EAAE,UAAU;AAAA,IAChD,CAAC;AAED,WAAO,IAAI,MAAM,GAAG,KAAK,KAAK;AAAA,EAChC;AAAA,EAEA,MAAM,OAKH;AACD,UAAM,WAAmC,CAAC;AAC1C,UAAM,UAAsC;AAAA,MAC1C,YAAY;AAAA,MACZ,mBAAmB;AAAA,MACnB,eAAe;AAAA,IACjB;AACA,QAAI,QAAQ;AACZ,eAAW,UAAU,aAAa;AAChC,YAAM,OAAO,KAAK,cAAc,MAAM;AACtC,UAAI,CAAC,WAAW,IAAI,GAAG;AACrB,iBAAS,MAAM,IAAI;AACnB;AAAA,MACF;AACA,YAAM,QAAQ,aAAa,MAAM,MAAM,EAAE,MAAM,IAAI,EAAE,OAAO,OAAO;AACnE,eAAS,MAAM,IAAI,MAAM;AACzB,eAAS,MAAM;AACf,iBAAW,QAAQ,OAAO;AACxB,YAAI,QAAoB;AACxB,YAAI;AACF,kBAAS,KAAK,MAAM,IAAI,EAA4B,cAAc;AAAA,QACpE,QAAQ;AAAA,QAER;AACA,gBAAQ,KAAK,KAAK;AAAA,MACpB;AAAA,IACF;AAIA,WAAO,EAAE,OAAO,OAAO,MAAM,OAAO,UAAU,QAAQ;AAAA,EACxD;AAAA,EAEQ,iBAAiB,OAAmC;AAC1D,QAAI,CAAC,MAAM,QAAQ;AACjB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,qBAAqB,MAAM,kBAAkB,WAAW,GAAG;AACpE,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,YAAY;AACrB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,iBAAiB;AAC1B,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,YAAY,SAAS,MAAM,MAAM,GAAG;AACvC,YAAM,IAAI;AAAA,QACR;AAAA,QACA,+CAA+C,YAAY,KAAK,IAAI,CAAC;AAAA,MACvE;AAAA,IACF;AAAA,EACF;AAAA,EAEQ,gBAAgB,OAAmC;AACzD,UAAM,MAAM,KAAK,QAAQ;AACzB,QAAI,CAAC,OAAO,CAAC,MAAM,gBAAiB;AACpC,UAAM,MAAM,KAAK,IAAI;AACrB,UAAM,WAAW;AACjB,QAAI,QAAQ,KAAK,WAAW,IAAI,MAAM,eAAe;AACrD,QAAI,CAAC,SAAS,MAAM,MAAM,iBAAiB,UAAU;AACnD,cAAQ,EAAE,QAAQ,MAAM,iBAAiB,eAAe,KAAK,OAAO,EAAE;AACtE,WAAK,WAAW,IAAI,MAAM,iBAAiB,KAAK;AAAA,IAClD;AACA,QAAI,MAAM,SAAS,KAAK;AACtB,YAAM,IAAI;AAAA,QACR;AAAA,QACA,gCAAgC,MAAM,eAAe,aAAa,GAAG;AAAA,MACvE;AAAA,IACF;AACA,UAAM,SAAS;AAAA,EACjB;AAAA,EAEQ,SAAS,OAAoD;AACnE,UAAM,aAAa;AAAA,MACjB,KAAK,UAAU;AAAA,QACb,IAAI,MAAM,SAAS;AAAA,QACnB,KAAK,MAAM;AAAA,QACX,IAAI,MAAM;AAAA,QACV,KAAK,MAAM;AAAA,MACb,CAAC;AAAA,IACH;AAGA,WAAO;AAAA,MACL,GAAG;AAAA,MACH;AAAA,MACA,OAAO;AAAA,IACT;AAAA,EACF;AAAA,EAEQ,cAAc,QAAwB;AAC5C,WAAO,KAAK,KAAK,QAAQ,MAAM,GAAG,MAAM,QAAQ;AAAA,EAClD;AACF;AAEA,IAAM,cAAgD;AAAA,EACpD;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAEA,SAAS,qBACP,QACA,QACS;AACT,MAAI,CAAC,OAAQ,QAAO;AACpB,MAAI,MAAM,QAAQ,MAAM,EAAG,QAAO,OAAO,SAAS,MAAM;AACxD,SAAO,WAAW;AACpB;AAEA,SAAS,cACP,QACA,MACA,QACS;AAET,MAAI,KAAK,UAAU,WAAW,OAAO,cAAc,KAAK,eAAgB,QAAO;AAC/E,MAAI,KAAK,UAAU,UAAU,OAAO,aAAa,KAAK,eAAgB,QAAO;AAE7E,QAAM,IAAI,KAAK;AACf,MAAI,CAAC,EAAG,QAAO;AACf,MAAI,EAAE,QAAQ,OAAO,SAAS,SAAS,EAAE,KAAM,QAAO;AACtD,MAAI,EAAE,QAAQ;AACZ,UAAM,UAAU,MAAM,QAAQ,EAAE,MAAM,IAAI,EAAE,SAAS,CAAC,EAAE,MAAM;AAC9D,QAAI,CAAC,QAAQ,SAAS,MAAe,EAAG,QAAO;AAAA,EACjD;AACA,MAAI,EAAE,iBAAiB,UAAa,EAAE,iBAAiB,QAAW;AAChE,UAAM,aAAa,OAAO,OAAO,OAAO,WAAW,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS;AAC3E,UAAM,MAAM,WAAW,WAAW,IAAI,IAAI,KAAK,IAAI,GAAG,UAAU;AAChE,QAAI,EAAE,iBAAiB,UAAa,MAAM,EAAE,aAAc,QAAO;AACjE,QAAI,EAAE,iBAAiB,UAAa,MAAM,EAAE,aAAc,QAAO;AAAA,EACnE;AACA,MAAI,EAAE,aAAa,UAAa,eAAe,OAAO,UAAU,IAAI,eAAe,EAAE,QAAQ,GAAG;AAC9F,WAAO;AAAA,EACT;AACA,SAAO;AACT;AAEA,SAAS,OAAO,OAAuB;AACrC,SAAO,WAAW,QAAQ,EAAE,OAAO,KAAK,EAAE,OAAO,KAAK,EAAE,MAAM,GAAG,EAAE;AACrE;AAEA,SAAS,WAAW,MAAc,MAAoB;AACpD,MAAI,WAAW,IAAI,GAAG;AACpB,UAAM,WAAW,aAAa,MAAM,MAAM;AAC1C,kBAAc,MAAM,WAAW,IAAI;AAAA,EACrC,OAAO;AACL,kBAAc,MAAM,IAAI;AAAA,EAC1B;AACF;;;AC5QA,SAAS,cAAAA,mBAAkB;AAC3B,SAAS,QAAAC,aAAY;AA6Bd,IAAM,qBAAN,cAAiC,eAAe;AAAA,EACrD,YAAY,SAAiB;AAC3B,UAAM,kBAAkB,OAAO;AAAA,EACjC;AACF;AAyGA,SAAS,SAAS,IAAoB;AACpC,SAAO,GAAG,QAAQ,mBAAmB,GAAG;AAC1C;AAEA,SAAS,IAAI,OAAwB;AACnC,SAAOC,YAAW,QAAQ,EAAE,OAAO,KAAK,UAAU,KAAK,CAAC,EAAE,OAAO,KAAK;AACxE;AAEA,SAAS,KAAK,IAAsB;AAClC,SAAO,GAAG,WAAW,IAAI,IAAI,GAAG,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,GAAG;AAClE;AAEA,SAAS,cAAc,MAA2C;AAChE,QAAM,aAAa,OAAO,OAAO,KAAK,WAAW,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS;AACzE,SAAO,WAAW,WAAW,IAAI,IAAI,KAAK,UAAU;AACtD;AAaA,SAAS,eAA0B,MAA6C;AAC9E,QAAM,EAAE,MAAM,SAAS,aAAa,YAAY,cAAc,UAAU,WAAW,SAAS,IAC1F;AACF,QAAM,YAAY,cAAc,IAAI;AAGpC,QAAM,MAA8B,EAAE,UAAU;AAChD,QAAM,WAAmD,CAAC;AAC1D,QAAM,WAAqC,CAAC;AAC5C,QAAM,QAAkB,CAAC;AACzB,aAAW,CAAC,WAAW,EAAE,KAAK,OAAO,QAAQ,KAAK,WAAW,GAAG;AAC9D,aAAS,SAAS,IAAI,EAAE,GAAG,GAAG,WAAW;AACzC,eAAW,CAAC,KAAK,KAAK,KAAK,OAAO,QAAQ,GAAG,UAAU,GAAG;AACxD,UAAI,GAAG,SAAS,IAAI,GAAG,EAAE,IAAI;AAC5B,OAAC,SAAS,GAAG,MAAM,CAAC,GAAG,KAAK,KAAK;AAAA,IACpC;AACA,QAAI,GAAG,MAAO,OAAM,KAAK,GAAG,SAAS,KAAK,GAAG,KAAK,EAAE;AAAA,EACtD;AACA,QAAM,aAAqC,CAAC;AAC5C,aAAW,CAAC,KAAK,MAAM,KAAK,OAAO,QAAQ,QAAQ,EAAG,YAAW,GAAG,IAAI,KAAK,MAAM;AAEnF,QAAM,UACJ,aAAa,YAAY,EAAE,cAAc,WAAW,IAAI,IAAI,EAAE,aAAa,WAAW,IAAI;AAC5F,MAAI,OAAO,KAAK,QAAQ,EAAE,SAAS,GAAG;AACpC,YAAQ,cAAc;AAAA,MACpB;AAAA,MACA;AAAA,MACA;AAAA,MACA,GAAI,MAAM,SAAS,IAAI,EAAE,OAAO,MAAM,KAAK,KAAK,EAAE,IAAI,CAAC;AAAA,IACzD;AAAA,EACF;AAEA,SAAO;AAAA,IACL,OAAO,GAAG,QAAQ,IAAI,QAAQ,EAAE,IAAI,KAAK,MAAM;AAAA,IAC/C;AAAA,IACA,aAAa,QAAQ;AAAA,IACrB,MAAM,KAAK;AAAA,IACX,OAAO,QAAQ;AAAA,IACf,YAAY;AAAA,IACZ;AAAA,IACA;AAAA,IACA,QAAQ,KAAK;AAAA,IACb,SAAS,KAAK;AAAA,IACd,YAAY,KAAK;AAAA,IACjB;AAAA,IACA;AAAA,IACA,YAAY,KAAK;AAAA,IACjB,GAAI,KAAK,QAAQ,EAAE,aAAa,KAAK,MAAM,IAAI,CAAC;AAAA,EAClD;AACF;AAEA,eAAsB,iBACpB,MACuD;AACvD,MAAI,KAAK,SAAS,WAAW,EAAG,OAAM,IAAI,mBAAmB,4BAA4B;AACzF,MAAI,KAAK,UAAU,WAAW,EAAG,OAAM,IAAI,mBAAmB,6BAA6B;AAE3F,QAAM,WAAW,KAAK,YAAY;AAClC,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,WAAW,KAAK,YAAY;AAClC,QAAM,gBAAgB,KAAK,aAAa;AACxC,QAAM,aAAa,KAAK,SAAS,IAAI,CAAC,MAAM,EAAE,EAAE;AAChD,QAAM,eACJ,KAAK,gBACL,MAAM,IAAI,EAAE,YAAY,WAAW,KAAK,UAAU,IAAI,CAAC,MAAM,EAAE,EAAE,EAAE,CAAC,EAAE,MAAM,GAAG,EAAE,CAAC;AACpF,QAAM,WAAW,OAAO,IAAI,EAAE,cAAc,YAAY,MAAM,SAAS,CAAC,EAAE,MAAM,GAAG,EAAE,CAAC;AAMtF,aAAW,WAAW,KAAK,UAAU;AACnC,UAAM,cAAc,iBAAiB,OAAO;AAC5C,QAAI;AACF,wBAAkB;AAAA,QAChB,OAAO,GAAG,QAAQ,IAAI,QAAQ,EAAE;AAAA,QAChC;AAAA,QACA,aAAa,QAAQ;AAAA,QACrB;AAAA,QACA,OAAO,QAAQ;AAAA,QACf,YAAY;AAAA,QACZ,YAAY;AAAA,QACZ,WAAW,KAAK;AAAA,QAChB,QAAQ;AAAA,QACR,SAAS;AAAA,QACT,YAAY,EAAE,OAAO,GAAG,QAAQ,EAAE;AAAA,QAClC,SACE,aAAa,YAAY,EAAE,cAAc,GAAG,KAAK,CAAC,EAAE,IAAI,EAAE,aAAa,GAAG,KAAK,CAAC,EAAE;AAAA,QACpF;AAAA,MACF,CAAC;AAAA,IACH,SAAS,KAAK;AACZ,YAAM,IAAI;AAAA,QACR,YAAY,QAAQ,EAAE,wBAAwB,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,MAChG;AAAA,IACF;AAAA,EACF;AAEA,QAAM,UAAuB,CAAC;AAC9B,QAAM,YAAkE,CAAC;AACzE,QAAM,YAA4C,CAAC;AAEnD,aAAW,WAAW,KAAK,UAAU;AACnC,UAAM,cAAc,iBAAiB,OAAO;AAC5C,UAAM,aAAa,IAAI;AAAA,MACrB,SAAS;AAAA,MACT,SAAS,KAAK,UAAU,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,IAAI;AAAA,MAC7C;AAAA,MACA;AAAA,IACF,CAAC;AAID,UAAM,WAAW,CAAC,UAAqB,QACrC,KAAK,SAAS,SAAS,UAAU,GAAG;AACtC,WAAO,eAAe,UAAU,QAAQ,EAAE,OAAO,WAAW,SAAS,QAAQ,EAAE,CAAC,GAAG,CAAC;AAEpF,UAAM,WAAW,MAAM,YAAkC;AAAA,MACvD,WAAW,KAAK;AAAA,MAChB;AAAA,MACA,QAAQ,KAAK;AAAA,MACb;AAAA,MACA,MAAM,KAAK;AAAA,MACX,gBAAgB,KAAK;AAAA,MACrB,aAAa,KAAK;AAAA,MAClB,cAAc,KAAK;AAAA,MACnB,eAAe,KAAK;AAAA,MACpB,SAAS,KAAK;AAAA,MACd,KAAK,KAAK;AAAA,MACV,QAAQC,MAAK,KAAK,QAAQ,SAAS,QAAQ,EAAE,CAAC;AAAA,IAChD,CAAC;AACD,cAAU,QAAQ,EAAE,IAAI;AAExB,UAAM,iBAA8B,CAAC;AACrC,eAAW,QAAQ,SAAS,OAAO;AACjC,YAAM,SAAS,eAAe;AAAA,QAC5B;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA,WAAW,KAAK;AAAA,QAChB;AAAA,MACF,CAAC;AACD,UAAI,SAAU,mBAAkB,MAAM;AACtC,qBAAe,KAAK,MAAM;AAC1B,cAAQ,KAAK,MAAM;AAAA,IACrB;AAEA,cAAU,QAAQ,EAAE,IAAI;AAAA,MACtB,WAAW,QAAQ;AAAA,MACnB;AAAA,MACA,OAAO,QAAQ;AAAA,MACf,SAAS,eAAe;AAAA,MACxB,eAAe,KAAK,eAAe,IAAI,WAAW,CAAC;AAAA,MACnD,cAAc,eAAe,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,SAAS,CAAC;AAAA,MAC9D,WAAW,0BAA0B,cAAc;AAAA,IACrD;AAAA,EACF;AAGA,QAAM,YAAY,0BAA0B,OAAO;AACnD,MAAI,kBAAkB,UAAU;AAC9B,sBAAkB,SAAS,EAAE,YAAY,KAAK,cAAc,KAAK,CAAC;AAAA,EACpE,WAAW,kBAAkB,UAAU,UAAU,YAAY,QAAQ;AAEnE,YAAQ;AAAA,MACN,yCAAyC,UAAU,OAAO,WAAM,UAAU,SAAS;AAAA,IACrF;AAAA,EACF;AAGA,QAAM,aAAa,OAAO,SAAS,CAAC,MAAM,EAAE,UAAU;AACtD,QAAM,YAAY,KAAK,YACnB,gBAAgB,SAAS,KAAK,WAAW,KAAK,SAAS,IACvD;AAEJ,SAAO,EAAE,UAAU,cAAc,SAAS,WAAW,YAAY,WAAW,WAAW,UAAU;AACnG;AAGA,SAAS,YAAY,GAAsB;AACzC,SAAO,EAAE,QAAQ,gBAAgB,EAAE,QAAQ,eAAe;AAC5D;AAEA,SAAS,OACP,SACA,OACgC;AAChC,QAAM,SAAS,oBAAI,IAAsB;AACzC,aAAW,KAAK,SAAS;AACvB,UAAM,MAAM,MAAM,CAAC;AACnB,QAAI,QAAQ,OAAW;AACvB,UAAM,MAAM,OAAO,IAAI,GAAG,KAAK,CAAC;AAChC,QAAI,KAAK,YAAY,CAAC,CAAC;AACvB,WAAO,IAAI,KAAK,GAAG;AAAA,EACrB;AACA,QAAM,MAAsC,CAAC;AAC7C,aAAW,CAAC,KAAK,EAAE,KAAK,OAAQ,KAAI,GAAG,IAAI,EAAE,eAAe,KAAK,EAAE,GAAG,GAAG,GAAG,OAAO;AACnF,SAAO;AACT;AAEA,SAAS,gBACP,SACA,WACA,WACgC;AAChC,QAAM,sBAAsB,oBAAI,IAAoB;AACpD,aAAW,KAAK,UAAW,qBAAoB,IAAI,EAAE,IAAI,UAAU,CAAC,CAAC;AACrE,SAAO,OAAO,SAAS,CAAC,MAAO,EAAE,aAAa,oBAAoB,IAAI,EAAE,UAAU,IAAI,MAAU;AAClG;;;AC/YA,SAAS,oBAAoB;AAC7B,SAAS,cAAAC,mBAAkB;AAC3B,SAAS,UAAU,YAAY,QAAAC,aAAY;AAuBpC,IAAM,uBAAN,cAAmC,MAAM;AAAA,EAC9C,YACE,SACS,OACT;AACA,UAAM,OAAO;AAFJ;AAGT,SAAK,OAAO;AAAA,EACd;AAAA,EAJW;AAKb;AAaA,SAAS,WAAW,MAAgB,KAAqB;AACvD,MAAI;AACF,WAAO,aAAa,OAAO,MAAM,EAAE,KAAK,UAAU,OAAO,CAAC,EAAE,KAAK;AAAA,EACnE,SAAS,KAAK;AACZ,UAAM,SACJ,OAAO,OAAO,QAAQ,YAAY,YAAY,MAC1C,OAAQ,IAA4B,MAAM,IAC1C;AACN,UAAM,IAAI,qBAAqB,OAAO,KAAK,KAAK,GAAG,CAAC,YAAY,UAAU,OAAO,GAAG,CAAC,IAAI,GAAG;AAAA,EAC9F;AACF;AAGA,SAAS,KAAK,OAAuB;AACnC,SACE,MACG,YAAY,EACZ,QAAQ,eAAe,GAAG,EAC1B,QAAQ,YAAY,EAAE,EACtB,MAAM,GAAG,EAAE,KAAK;AAEvB;AAEO,SAAS,mBAAmB,MAAkD;AACnF,QAAM,MAAM,KAAK,OAAO;AACxB,QAAM,cAAc,KAAK,eAAeA,MAAK,KAAK,UAAU,YAAY;AACxE,QAAM,eAAe,KAAK,gBAAgB;AAE1C,SAAO;AAAA,IACL,MAAM,OAAO,EAAE,SAAS,MAAM,GAAG;AAC/B,YAAM,KAAK,GAAG,KAAK,KAAK,CAAC,IAAI,KAAK,IAAI,EAAE,SAAS,EAAE,CAAC,IAAI,KAAK,OAAO,EAAE,SAAS,EAAE,EAAE,MAAM,GAAG,CAAC,CAAC;AAC9F,YAAM,SAAS,GAAG,YAAY,IAAI,EAAE;AACpC,YAAM,OAAOA,MAAK,aAAa,EAAE;AACjC,UAAI,CAAC,YAAY,OAAO,MAAM,QAAQ,MAAM,OAAO,GAAG,KAAK,QAAQ;AACnE,aAAO,EAAE,MAAM,QAAQ,QAAQ;AAAA,IACjC;AAAA,IAEA,MAAM,SAAS,UAAU,SAAS;AAGhC,YAAM,SAAS,IAAI,CAAC,UAAU,aAAa,GAAG,SAAS,IAAI;AAC3D,UAAI,OAAO,SAAS,GAAG;AACrB,YAAI,CAAC,OAAO,IAAI,GAAG,SAAS,IAAI;AAChC,YAAI,CAAC,UAAU,MAAM,OAAO,GAAG,SAAS,IAAI;AAAA,MAC9C;AACA,aAAO;AAAA,QACL,MAAM;AAAA,QACN,aAAa,SAAS;AAAA,QACtB,SAAS,SAAS;AAAA,QAClB;AAAA,MACF;AAAA,IACF;AAAA,IAEA,MAAM,QAAQ,UAAU;AAGtB,UAAI,CAAC,YAAY,UAAU,WAAW,SAAS,IAAI,GAAG,KAAK,QAAQ;AACnE,UAAI,CAAC,UAAU,MAAM,SAAS,MAAM,GAAG,KAAK,QAAQ;AAAA,IACtD;AAAA,EACF;AACF;AAKO,SAAS,oBAAoB,SAAsB,aAA8B;AACtF,MAAI,WAAW,QAAQ,WAAW,KAAKD,YAAW,QAAQ,WAAW,EAAG,QAAO,QAAQ;AACvF,MAAI,YAAa,QAAOC,MAAK,aAAa,SAAS,QAAQ,WAAW,CAAC;AACvE,SAAO,QAAQ;AACjB;","names":["createHash","join","createHash","join","existsSync","join"]}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
2
|
cohensD
|
|
3
|
-
} from "./chunk-
|
|
3
|
+
} from "./chunk-ITBRCT73.js";
|
|
4
4
|
import {
|
|
5
5
|
argHash,
|
|
6
6
|
groupBy,
|
|
@@ -551,4 +551,4 @@ export {
|
|
|
551
551
|
iqr,
|
|
552
552
|
welchsTTest
|
|
553
553
|
};
|
|
554
|
-
//# sourceMappingURL=chunk-
|
|
554
|
+
//# sourceMappingURL=chunk-3B7Y5AUR.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/errors.ts"],"sourcesContent":["/**\n * Error taxonomy for `@tangle-network/agent-eval`.\n *\n * Every error this package throws as part of its *public contract* extends\n * `AgentEvalError`. Consumers can pattern-match by `instanceof <Subclass>` or\n * by the stable string `code` carried on the base class.\n *\n * The codes are stable across minor versions; new codes can be added, but\n * existing codes never change meaning. New subclasses are non-breaking.\n *\n * Internal invariant guards (`throw new Error('this should never happen')`)\n * remain plain `Error`s on purpose — they're programmer-mistake assertions,\n * not consumer-catchable contract failures.\n */\n\nexport type AgentEvalErrorCode =\n | 'validation'\n | 'not_found'\n | 'config'\n | 'capture_integrity'\n | 'judge'\n | 'verification'\n | 'replay'\n | 'backend_integrity'\n | 'profile_matrix'\n\nexport class AgentEvalError extends Error {\n /** Stable string code. Survives minification; safe to switch on. */\n readonly code: AgentEvalErrorCode\n\n constructor(code: AgentEvalErrorCode, message: string, options?: { cause?: unknown }) {\n super(message, options)\n this.name = this.constructor.name\n this.code = code\n }\n}\n\n/** Caller passed invalid arguments (out of range, mutually-exclusive options, bad shape). */\nexport class ValidationError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('validation', message, options)\n }\n}\n\n/** A named resource (run, span, rubric, scenario, dataset row, route) does not exist. */\nexport class NotFoundError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('not_found', message, options)\n }\n}\n\n/** Configuration missing or malformed (`HOME` unset, required image not supplied, env var absent). */\nexport class ConfigError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('config', message, options)\n }\n}\n\n/**\n * A run is missing the artifacts a launch-grade check requires:\n * raw HTTP capture absent, no LLM spans, route assertion failed, run-end\n * assertion tripped. Block ship on this; do not catch and move on.\n */\nexport class CaptureIntegrityError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('capture_integrity', message, options)\n }\n}\n\n/** A judge call failed in a way that's not retryable: schema parse failure, bad rubric, conflicting dimensions. */\nexport class JudgeError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('judge', message, options)\n }\n}\n\n/** A verifier signalled a hard failure (compile, test, schema) — distinct from a low judge score. */\nexport class VerificationError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('verification', message, options)\n }\n}\n\n/** Replay cache cannot satisfy a request: miss with no fallback, sink lacks list(), unsupported URL. */\nexport class ReplayError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('replay', message, options)\n }\n}\n"],"mappings":";AA0BO,IAAM,iBAAN,cAA6B,MAAM;AAAA;AAAA,EAE/B;AAAA,EAET,YAAY,MAA0B,SAAiB,SAA+B;AACpF,UAAM,SAAS,OAAO;AACtB,SAAK,OAAO,KAAK,YAAY;AAC7B,SAAK,OAAO;AAAA,EACd;AACF;AAGO,IAAM,kBAAN,cAA8B,eAAe;AAAA,EAClD,YAAY,SAAiB,SAA+B;AAC1D,UAAM,cAAc,SAAS,OAAO;AAAA,EACtC;AACF;AAGO,IAAM,gBAAN,cAA4B,eAAe;AAAA,EAChD,YAAY,SAAiB,SAA+B;AAC1D,UAAM,aAAa,SAAS,OAAO;AAAA,EACrC;AACF;AAGO,IAAM,cAAN,cAA0B,eAAe;AAAA,EAC9C,YAAY,SAAiB,SAA+B;AAC1D,UAAM,UAAU,SAAS,OAAO;AAAA,EAClC;AACF;AAOO,IAAM,wBAAN,cAAoC,eAAe;AAAA,EACxD,YAAY,SAAiB,SAA+B;AAC1D,UAAM,qBAAqB,SAAS,OAAO;AAAA,EAC7C;AACF;AAGO,IAAM,aAAN,cAAyB,eAAe;AAAA,EAC7C,YAAY,SAAiB,SAA+B;AAC1D,UAAM,SAAS,SAAS,OAAO;AAAA,EACjC;AACF;AAGO,IAAM,oBAAN,cAAgC,eAAe;AAAA,EACpD,YAAY,SAAiB,SAA+B;AAC1D,UAAM,gBAAgB,SAAS,OAAO;AAAA,EACxC;AACF;AAGO,IAAM,cAAN,cAA0B,eAAe;AAAA,EAC9C,YAAY,SAAiB,SAA+B;AAC1D,UAAM,UAAU,SAAS,OAAO;AAAA,EAClC;AACF;","names":[]}
|
|
@@ -4,7 +4,7 @@ import {
|
|
|
4
4
|
} from "./chunk-NCRFYPS3.js";
|
|
5
5
|
import {
|
|
6
6
|
validateRunRecord
|
|
7
|
-
} from "./chunk-
|
|
7
|
+
} from "./chunk-F3SRAAZO.js";
|
|
8
8
|
import {
|
|
9
9
|
TraceEmitter
|
|
10
10
|
} from "./chunk-TVVP3ZZQ.js";
|
|
@@ -610,4 +610,4 @@ export {
|
|
|
610
610
|
runProposeReviewAsControlLoop,
|
|
611
611
|
controlFailureClassFromVerification
|
|
612
612
|
};
|
|
613
|
-
//# sourceMappingURL=chunk-
|
|
613
|
+
//# sourceMappingURL=chunk-6EKXFFGQ.js.map
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
2
|
__export
|
|
3
|
-
} from "./chunk-
|
|
3
|
+
} from "./chunk-PZ5AY32C.js";
|
|
4
4
|
|
|
5
5
|
// src/benchmarks/index.ts
|
|
6
6
|
var benchmarks_exports = {};
|
|
@@ -220,4 +220,4 @@ export {
|
|
|
220
220
|
routing_exports,
|
|
221
221
|
benchmarks_exports
|
|
222
222
|
};
|
|
223
|
-
//# sourceMappingURL=chunk-
|
|
223
|
+
//# sourceMappingURL=chunk-6QDKWHLS.js.map
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
2
|
callLlmJson
|
|
3
|
-
} from "./chunk-
|
|
3
|
+
} from "./chunk-IHDHUN2X.js";
|
|
4
4
|
|
|
5
5
|
// src/wire/schemas.ts
|
|
6
6
|
import { extendZodWithOpenApi } from "@asteasolutions/zod-to-openapi";
|
|
@@ -1002,4 +1002,4 @@ export {
|
|
|
1002
1002
|
startServer,
|
|
1003
1003
|
startServerAsync
|
|
1004
1004
|
};
|
|
1005
|
-
//# sourceMappingURL=chunk-
|
|
1005
|
+
//# sourceMappingURL=chunk-6REHLN5J.js.map
|
|
@@ -2,14 +2,14 @@ import {
|
|
|
2
2
|
buildAgentProfileCell,
|
|
3
3
|
validateRunRecord,
|
|
4
4
|
verifyAgentProfileCell
|
|
5
|
-
} from "./chunk-
|
|
5
|
+
} from "./chunk-F3SRAAZO.js";
|
|
6
6
|
import {
|
|
7
7
|
researchReport
|
|
8
|
-
} from "./chunk-
|
|
8
|
+
} from "./chunk-KX6F6NCG.js";
|
|
9
9
|
import {
|
|
10
10
|
RunIntegrityError,
|
|
11
11
|
assertRunCaptured
|
|
12
|
-
} from "./chunk-
|
|
12
|
+
} from "./chunk-SBCB6VZY.js";
|
|
13
13
|
import {
|
|
14
14
|
TraceEmitter
|
|
15
15
|
} from "./chunk-TVVP3ZZQ.js";
|
|
@@ -19,7 +19,7 @@ import {
|
|
|
19
19
|
} from "./chunk-VSMTAMNK.js";
|
|
20
20
|
import {
|
|
21
21
|
assertLlmRoute
|
|
22
|
-
} from "./chunk-
|
|
22
|
+
} from "./chunk-IHDHUN2X.js";
|
|
23
23
|
import {
|
|
24
24
|
FileSystemRawProviderSink
|
|
25
25
|
} from "./chunk-PC4UYEBM.js";
|
|
@@ -329,4 +329,4 @@ function defaultRunId(params) {
|
|
|
329
329
|
export {
|
|
330
330
|
runEvalCampaign
|
|
331
331
|
};
|
|
332
|
-
//# sourceMappingURL=chunk-
|
|
332
|
+
//# sourceMappingURL=chunk-AIWHLG7J.js.map
|