@tangle-network/agent-eval 0.60.0 → 0.62.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +27 -0
- package/dist/adapters/http.d.ts +4 -1
- package/dist/adapters/langchain.d.ts +4 -1
- package/dist/adapters/otel.d.ts +5 -5
- package/dist/agent-profile-DzcPHR1Z.d.ts +114 -0
- package/dist/benchmarks/index.d.ts +3 -3
- package/dist/builder-eval/index.js +2 -2
- package/dist/campaign/index.d.ts +151 -11
- package/dist/campaign/index.js +212 -10
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-QDOSODID.js → chunk-3B7Y5AUR.js} +2 -2
- package/dist/{chunk-QYJT52YW.js → chunk-3BFEG2F6.js} +1 -1
- package/dist/chunk-3BFEG2F6.js.map +1 -0
- package/dist/{chunk-J4DIMSRK.js → chunk-6EKXFFGQ.js} +2 -2
- package/dist/{chunk-63EPZQUZ.js → chunk-6REHLN5J.js} +2 -2
- package/dist/{chunk-NOPYCRNG.js → chunk-7TPYV2ER.js} +39 -2
- package/dist/chunk-7TPYV2ER.js.map +1 -0
- package/dist/{chunk-GM476SZU.js → chunk-AIWHLG7J.js} +5 -5
- package/dist/{chunk-AIXHUIHG.js → chunk-B26KI423.js} +3 -3
- package/dist/{chunk-LBSXXH56.js → chunk-CV2BS2OV.js} +8 -6
- package/dist/chunk-CV2BS2OV.js.map +1 -0
- package/dist/chunk-E22YUOAL.js +111 -0
- package/dist/chunk-E22YUOAL.js.map +1 -0
- package/dist/{chunk-NCK5QLGT.js → chunk-F3SRAAZO.js} +2 -2
- package/dist/{chunk-VXNVVBZO.js → chunk-IHDHUN2X.js} +2 -2
- package/dist/{chunk-S3SDD56V.js → chunk-ITBRCT73.js} +2 -2
- package/dist/{chunk-OLIBRKRD.js → chunk-KX6F6NCG.js} +2 -2
- package/dist/chunk-PQV2TKC3.js +27 -0
- package/dist/chunk-PQV2TKC3.js.map +1 -0
- package/dist/{chunk-UBPIXOC4.js → chunk-SBCB6VZY.js} +2 -2
- package/dist/{chunk-GBHRUAOF.js → chunk-SS2SOBBT.js} +2 -107
- package/dist/chunk-SS2SOBBT.js.map +1 -0
- package/dist/{chunk-YTMXBHFM.js → chunk-T375SUOZ.js} +2 -2
- package/dist/{chunk-PIEAE33T.js → chunk-Z4ZCBC7M.js} +2 -2
- package/dist/cli.js +3 -3
- package/dist/contract/index.d.ts +13 -13
- package/dist/contract/index.js +8 -7
- package/dist/contract/index.js.map +1 -1
- package/dist/{control-DjEgwWNo.d.ts → control-DxvZeV5X.d.ts} +2 -2
- package/dist/control.d.ts +5 -5
- package/dist/control.js +3 -3
- package/dist/{dataset-BlwAtYYf.d.ts → dataset-B2kL-fSM.d.ts} +1 -1
- package/dist/{errors-mje_cKOs.d.ts → errors-Dwqw-T_m.d.ts} +1 -1
- package/dist/{feedback-trajectory-DpUmE90J.d.ts → feedback-trajectory-8hKC5EOb.d.ts} +1 -1
- package/dist/governance/index.d.ts +3 -3
- package/dist/hosted/index.d.ts +5 -5
- package/dist/{index-wlaiph9Y.d.ts → index-DsnOpCO6.d.ts} +1 -1
- package/dist/{index-BIkvdkSU.d.ts → index-DxfmYUjC.d.ts} +2 -2
- package/dist/index.d.ts +108 -132
- package/dist/index.js +339 -73
- package/dist/index.js.map +1 -1
- package/dist/{integrity-CfXjSqEv.d.ts → integrity-CJzrpUua.d.ts} +1 -1
- package/dist/{llm-client-BXVRUZyX.d.ts → llm-client-DbjLfz-K.d.ts} +1 -1
- package/dist/meta-eval/index.d.ts +3 -3
- package/dist/multishot/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/dist/pipelines/index.js +3 -3
- package/dist/{provenance-BM8vmMBa.d.ts → provenance-CYBV9Ox6.d.ts} +16 -5
- package/dist/{red-team-CrC5MZYd.d.ts → red-team-DW9Ca_tj.d.ts} +1 -1
- package/dist/{registry-DK9kqXvb.d.ts → registry-DPly4_hZ.d.ts} +2 -2
- package/dist/{release-report-DmPjIce3.d.ts → release-report-DGoeObZT.d.ts} +3 -3
- package/dist/reporting.d.ts +6 -6
- package/dist/reporting.js +4 -4
- package/dist/{researcher-JP8EvnLv.d.ts → researcher-WJvIpX3L.d.ts} +4 -4
- package/dist/rl.d.ts +9 -9
- package/dist/rl.js +7 -7
- package/dist/{rubric-predictive-validity-B3qNa4aY.d.ts → rubric-predictive-validity-D_4BSXGV.d.ts} +1 -1
- package/dist/run-campaign-5J3ED2UJ.js +11 -0
- package/dist/{run-record-etiCMsUq.d.ts → run-record-BgTFzO2r.d.ts} +2 -2
- package/dist/{summary-report-DLxh4yWk.d.ts → summary-report-ByiOUrHj.d.ts} +1 -1
- package/dist/traces.d.ts +2 -2
- package/dist/traces.js +3 -3
- package/dist/{types-VCIXx_yo.d.ts → types-DH22o8hM.d.ts} +28 -4
- package/dist/wire/index.d.ts +3 -3
- package/dist/wire/index.js +3 -3
- package/package.json +12 -25
- package/dist/chunk-GBHRUAOF.js.map +0 -1
- package/dist/chunk-LBSXXH56.js.map +0 -1
- package/dist/chunk-NOPYCRNG.js.map +0 -1
- package/dist/chunk-QYJT52YW.js.map +0 -1
- package/dist/run-campaign-5XENUKRF.js +0 -10
- /package/dist/{chunk-QDOSODID.js.map → chunk-3B7Y5AUR.js.map} +0 -0
- /package/dist/{chunk-J4DIMSRK.js.map → chunk-6EKXFFGQ.js.map} +0 -0
- /package/dist/{chunk-63EPZQUZ.js.map → chunk-6REHLN5J.js.map} +0 -0
- /package/dist/{chunk-GM476SZU.js.map → chunk-AIWHLG7J.js.map} +0 -0
- /package/dist/{chunk-AIXHUIHG.js.map → chunk-B26KI423.js.map} +0 -0
- /package/dist/{chunk-NCK5QLGT.js.map → chunk-F3SRAAZO.js.map} +0 -0
- /package/dist/{chunk-VXNVVBZO.js.map → chunk-IHDHUN2X.js.map} +0 -0
- /package/dist/{chunk-S3SDD56V.js.map → chunk-ITBRCT73.js.map} +0 -0
- /package/dist/{chunk-OLIBRKRD.js.map → chunk-KX6F6NCG.js.map} +0 -0
- /package/dist/{chunk-UBPIXOC4.js.map → chunk-SBCB6VZY.js.map} +0 -0
- /package/dist/{chunk-YTMXBHFM.js.map → chunk-T375SUOZ.js.map} +0 -0
- /package/dist/{chunk-PIEAE33T.js.map → chunk-Z4ZCBC7M.js.map} +0 -0
- /package/dist/{run-campaign-5XENUKRF.js.map → run-campaign-5J3ED2UJ.js.map} +0 -0
package/dist/campaign/index.js
CHANGED
|
@@ -20,19 +20,32 @@ import {
|
|
|
20
20
|
runOptimization,
|
|
21
21
|
surfaceContentHash,
|
|
22
22
|
surfaceHash
|
|
23
|
-
} from "../chunk-
|
|
23
|
+
} from "../chunk-CV2BS2OV.js";
|
|
24
24
|
import {
|
|
25
25
|
fsCampaignStorage,
|
|
26
26
|
inMemoryCampaignStorage,
|
|
27
27
|
runCampaign
|
|
28
|
-
} from "../chunk-
|
|
29
|
-
import
|
|
28
|
+
} from "../chunk-7TPYV2ER.js";
|
|
29
|
+
import {
|
|
30
|
+
agentProfileHash
|
|
31
|
+
} from "../chunk-PQV2TKC3.js";
|
|
32
|
+
import "../chunk-SS2SOBBT.js";
|
|
33
|
+
import {
|
|
34
|
+
assertRealBackend,
|
|
35
|
+
summarizeBackendIntegrity
|
|
36
|
+
} from "../chunk-E22YUOAL.js";
|
|
30
37
|
import "../chunk-YV7J7X5N.js";
|
|
31
|
-
import
|
|
38
|
+
import {
|
|
39
|
+
validateRunRecord
|
|
40
|
+
} from "../chunk-F3SRAAZO.js";
|
|
41
|
+
import "../chunk-ITBRCT73.js";
|
|
32
42
|
import "../chunk-GGE4NNQT.js";
|
|
33
|
-
import "../chunk-
|
|
43
|
+
import "../chunk-VSMTAMNK.js";
|
|
44
|
+
import "../chunk-IHDHUN2X.js";
|
|
34
45
|
import "../chunk-PC4UYEBM.js";
|
|
35
|
-
import
|
|
46
|
+
import {
|
|
47
|
+
AgentEvalError
|
|
48
|
+
} from "../chunk-3BFEG2F6.js";
|
|
36
49
|
import "../chunk-PZ5AY32C.js";
|
|
37
50
|
|
|
38
51
|
// src/campaign/labeled-store/fs-adapter.ts
|
|
@@ -246,10 +259,197 @@ function appendLine(path, line) {
|
|
|
246
259
|
}
|
|
247
260
|
}
|
|
248
261
|
|
|
262
|
+
// src/campaign/presets/run-profile-matrix.ts
|
|
263
|
+
import { createHash as createHash2 } from "crypto";
|
|
264
|
+
import { join as join2 } from "path";
|
|
265
|
+
var ProfileMatrixError = class extends AgentEvalError {
|
|
266
|
+
constructor(message) {
|
|
267
|
+
super("profile_matrix", message);
|
|
268
|
+
}
|
|
269
|
+
};
|
|
270
|
+
function sanitize(id) {
|
|
271
|
+
return id.replace(/[^a-zA-Z0-9_-]/g, "_");
|
|
272
|
+
}
|
|
273
|
+
function sha(input) {
|
|
274
|
+
return createHash2("sha256").update(JSON.stringify(input)).digest("hex");
|
|
275
|
+
}
|
|
276
|
+
function mean(xs) {
|
|
277
|
+
return xs.length === 0 ? 0 : xs.reduce((a, b) => a + b, 0) / xs.length;
|
|
278
|
+
}
|
|
279
|
+
function cellComposite(cell) {
|
|
280
|
+
const composites = Object.values(cell.judgeScores).map((s) => s.composite);
|
|
281
|
+
return composites.length === 0 ? 0 : mean(composites);
|
|
282
|
+
}
|
|
283
|
+
function buildRunRecord(args) {
|
|
284
|
+
const { cell, profile, profileHash, configHash, experimentId, splitTag, commitSha, matrixId } = args;
|
|
285
|
+
const composite = cellComposite(cell);
|
|
286
|
+
const raw = { composite };
|
|
287
|
+
const perJudge = {};
|
|
288
|
+
const dimAccum = {};
|
|
289
|
+
const notes = [];
|
|
290
|
+
for (const [judgeName, js] of Object.entries(cell.judgeScores)) {
|
|
291
|
+
perJudge[judgeName] = { ...js.dimensions };
|
|
292
|
+
for (const [dim, value] of Object.entries(js.dimensions)) {
|
|
293
|
+
raw[`${judgeName}.${dim}`] = value;
|
|
294
|
+
(dimAccum[dim] ??= []).push(value);
|
|
295
|
+
}
|
|
296
|
+
if (js.notes) notes.push(`${judgeName}: ${js.notes}`);
|
|
297
|
+
}
|
|
298
|
+
const perDimMean = {};
|
|
299
|
+
for (const [dim, values] of Object.entries(dimAccum)) perDimMean[dim] = mean(values);
|
|
300
|
+
const outcome = splitTag === "holdout" ? { holdoutScore: composite, raw } : { searchScore: composite, raw };
|
|
301
|
+
if (Object.keys(perJudge).length > 0) {
|
|
302
|
+
outcome.judgeScores = {
|
|
303
|
+
perJudge,
|
|
304
|
+
perDimMean,
|
|
305
|
+
composite,
|
|
306
|
+
...notes.length > 0 ? { notes: notes.join(" | ") } : {}
|
|
307
|
+
};
|
|
308
|
+
}
|
|
309
|
+
return {
|
|
310
|
+
runId: `${matrixId}:${profile.id}:${cell.cellId}`,
|
|
311
|
+
experimentId,
|
|
312
|
+
candidateId: profile.id,
|
|
313
|
+
seed: cell.seed,
|
|
314
|
+
model: profile.model,
|
|
315
|
+
promptHash: profileHash,
|
|
316
|
+
configHash,
|
|
317
|
+
commitSha,
|
|
318
|
+
wallMs: cell.durationMs,
|
|
319
|
+
costUsd: cell.costUsd,
|
|
320
|
+
tokenUsage: cell.tokenUsage,
|
|
321
|
+
outcome,
|
|
322
|
+
splitTag,
|
|
323
|
+
scenarioId: cell.scenarioId,
|
|
324
|
+
...cell.error ? { failureMode: cell.error } : {}
|
|
325
|
+
};
|
|
326
|
+
}
|
|
327
|
+
async function runProfileMatrix(opts) {
|
|
328
|
+
if (opts.profiles.length === 0) throw new ProfileMatrixError("profiles must not be empty");
|
|
329
|
+
if (opts.scenarios.length === 0) throw new ProfileMatrixError("scenarios must not be empty");
|
|
330
|
+
const splitTag = opts.splitTag ?? "search";
|
|
331
|
+
const seed = opts.seed ?? 42;
|
|
332
|
+
const validate = opts.validate ?? true;
|
|
333
|
+
const integrityMode = opts.integrity ?? "assert";
|
|
334
|
+
const profileIds = opts.profiles.map((p) => p.id);
|
|
335
|
+
const experimentId = opts.experimentId ?? `pm_${sha({ profileIds, scenarios: opts.scenarios.map((s) => s.id) }).slice(0, 16)}`;
|
|
336
|
+
const matrixId = `mtx_${sha({ experimentId, profileIds, seed, splitTag }).slice(0, 16)}`;
|
|
337
|
+
for (const profile of opts.profiles) {
|
|
338
|
+
const profileHash = agentProfileHash(profile);
|
|
339
|
+
try {
|
|
340
|
+
validateRunRecord({
|
|
341
|
+
runId: `${matrixId}:${profile.id}:probe`,
|
|
342
|
+
experimentId,
|
|
343
|
+
candidateId: profile.id,
|
|
344
|
+
seed,
|
|
345
|
+
model: profile.model,
|
|
346
|
+
promptHash: profileHash,
|
|
347
|
+
configHash: profileHash,
|
|
348
|
+
commitSha: opts.commitSha,
|
|
349
|
+
wallMs: 0,
|
|
350
|
+
costUsd: 0,
|
|
351
|
+
tokenUsage: { input: 0, output: 0 },
|
|
352
|
+
outcome: splitTag === "holdout" ? { holdoutScore: 0, raw: {} } : { searchScore: 0, raw: {} },
|
|
353
|
+
splitTag
|
|
354
|
+
});
|
|
355
|
+
} catch (err) {
|
|
356
|
+
throw new ProfileMatrixError(
|
|
357
|
+
`profile '${profile.id}' is not recordable: ${err instanceof Error ? err.message : String(err)}`
|
|
358
|
+
);
|
|
359
|
+
}
|
|
360
|
+
}
|
|
361
|
+
const records = [];
|
|
362
|
+
const campaigns = {};
|
|
363
|
+
const byProfile = {};
|
|
364
|
+
for (const profile of opts.profiles) {
|
|
365
|
+
const profileHash = agentProfileHash(profile);
|
|
366
|
+
const configHash = sha({
|
|
367
|
+
profile: profileHash,
|
|
368
|
+
judges: (opts.judges ?? []).map((j) => j.name),
|
|
369
|
+
seed,
|
|
370
|
+
splitTag
|
|
371
|
+
});
|
|
372
|
+
const dispatch = (scenario, ctx) => opts.dispatch(profile, scenario, ctx);
|
|
373
|
+
Object.defineProperty(dispatch, "name", { value: `profile_${sanitize(profile.id)}` });
|
|
374
|
+
const campaign = await runCampaign({
|
|
375
|
+
scenarios: opts.scenarios,
|
|
376
|
+
dispatch,
|
|
377
|
+
judges: opts.judges,
|
|
378
|
+
seed,
|
|
379
|
+
reps: opts.reps,
|
|
380
|
+
maxConcurrency: opts.maxConcurrency,
|
|
381
|
+
costCeiling: opts.costCeiling,
|
|
382
|
+
labeledStore: opts.labeledStore,
|
|
383
|
+
captureSource: opts.captureSource,
|
|
384
|
+
storage: opts.storage,
|
|
385
|
+
now: opts.now,
|
|
386
|
+
runDir: join2(opts.runDir, sanitize(profile.id))
|
|
387
|
+
});
|
|
388
|
+
campaigns[profile.id] = campaign;
|
|
389
|
+
const profileRecords = [];
|
|
390
|
+
for (const cell of campaign.cells) {
|
|
391
|
+
const record = buildRunRecord({
|
|
392
|
+
cell,
|
|
393
|
+
profile,
|
|
394
|
+
profileHash,
|
|
395
|
+
configHash,
|
|
396
|
+
experimentId,
|
|
397
|
+
splitTag,
|
|
398
|
+
commitSha: opts.commitSha,
|
|
399
|
+
matrixId
|
|
400
|
+
});
|
|
401
|
+
if (validate) validateRunRecord(record);
|
|
402
|
+
profileRecords.push(record);
|
|
403
|
+
records.push(record);
|
|
404
|
+
}
|
|
405
|
+
byProfile[profile.id] = {
|
|
406
|
+
profileId: profile.id,
|
|
407
|
+
profileHash,
|
|
408
|
+
model: profile.model,
|
|
409
|
+
records: profileRecords.length,
|
|
410
|
+
meanComposite: mean(profileRecords.map(compositeOf)),
|
|
411
|
+
totalCostUsd: profileRecords.reduce((a, r) => a + r.costUsd, 0),
|
|
412
|
+
integrity: summarizeBackendIntegrity(profileRecords)
|
|
413
|
+
};
|
|
414
|
+
}
|
|
415
|
+
const integrity = summarizeBackendIntegrity(records);
|
|
416
|
+
if (integrityMode === "assert") {
|
|
417
|
+
assertRealBackend(records, { allowMixed: opts.allowMixed ?? true });
|
|
418
|
+
} else if (integrityMode === "warn" && integrity.verdict !== "real") {
|
|
419
|
+
console.warn(
|
|
420
|
+
`[runProfileMatrix] backend integrity: ${integrity.verdict} \u2014 ${integrity.diagnosis}`
|
|
421
|
+
);
|
|
422
|
+
}
|
|
423
|
+
const byScenario = rollup(records, (r) => r.scenarioId);
|
|
424
|
+
const byPersona = opts.personaOf ? rollupByPersona(records, opts.scenarios, opts.personaOf) : void 0;
|
|
425
|
+
return { matrixId, experimentId, records, byProfile, byScenario, byPersona, integrity, campaigns };
|
|
426
|
+
}
|
|
427
|
+
function compositeOf(r) {
|
|
428
|
+
return r.outcome.holdoutScore ?? r.outcome.searchScore ?? 0;
|
|
429
|
+
}
|
|
430
|
+
function rollup(records, keyOf) {
|
|
431
|
+
const groups = /* @__PURE__ */ new Map();
|
|
432
|
+
for (const r of records) {
|
|
433
|
+
const key = keyOf(r);
|
|
434
|
+
if (key === void 0) continue;
|
|
435
|
+
const arr = groups.get(key) ?? [];
|
|
436
|
+
arr.push(compositeOf(r));
|
|
437
|
+
groups.set(key, arr);
|
|
438
|
+
}
|
|
439
|
+
const out = {};
|
|
440
|
+
for (const [key, xs] of groups) out[key] = { meanComposite: mean(xs), n: xs.length };
|
|
441
|
+
return out;
|
|
442
|
+
}
|
|
443
|
+
function rollupByPersona(records, scenarios, personaOf) {
|
|
444
|
+
const personaByScenarioId = /* @__PURE__ */ new Map();
|
|
445
|
+
for (const s of scenarios) personaByScenarioId.set(s.id, personaOf(s));
|
|
446
|
+
return rollup(records, (r) => r.scenarioId ? personaByScenarioId.get(r.scenarioId) : void 0);
|
|
447
|
+
}
|
|
448
|
+
|
|
249
449
|
// src/campaign/worktree/index.ts
|
|
250
450
|
import { execFileSync } from "child_process";
|
|
251
451
|
import { existsSync as existsSync2 } from "fs";
|
|
252
|
-
import { basename, isAbsolute, join as
|
|
452
|
+
import { basename, isAbsolute, join as join3 } from "path";
|
|
253
453
|
var WorktreeAdapterError = class extends Error {
|
|
254
454
|
constructor(message, cause) {
|
|
255
455
|
super(message);
|
|
@@ -271,13 +471,13 @@ function slug(label) {
|
|
|
271
471
|
}
|
|
272
472
|
function gitWorktreeAdapter(opts) {
|
|
273
473
|
const git = opts.git ?? defaultGit;
|
|
274
|
-
const worktreeDir = opts.worktreeDir ??
|
|
474
|
+
const worktreeDir = opts.worktreeDir ?? join3(opts.repoRoot, ".worktrees");
|
|
275
475
|
const branchPrefix = opts.branchPrefix ?? "improve";
|
|
276
476
|
return {
|
|
277
477
|
async create({ baseRef, label }) {
|
|
278
478
|
const id = `${slug(label)}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 6)}`;
|
|
279
479
|
const branch = `${branchPrefix}/${id}`;
|
|
280
|
-
const path =
|
|
480
|
+
const path = join3(worktreeDir, id);
|
|
281
481
|
git(["worktree", "add", "-b", branch, path, baseRef], opts.repoRoot);
|
|
282
482
|
return { path, branch, baseRef };
|
|
283
483
|
},
|
|
@@ -302,12 +502,13 @@ function gitWorktreeAdapter(opts) {
|
|
|
302
502
|
}
|
|
303
503
|
function resolveWorktreePath(surface, worktreeDir) {
|
|
304
504
|
if (isAbsolute(surface.worktreeRef) && existsSync2(surface.worktreeRef)) return surface.worktreeRef;
|
|
305
|
-
if (worktreeDir) return
|
|
505
|
+
if (worktreeDir) return join3(worktreeDir, basename(surface.worktreeRef));
|
|
306
506
|
return surface.worktreeRef;
|
|
307
507
|
}
|
|
308
508
|
export {
|
|
309
509
|
FsLabeledScenarioStore,
|
|
310
510
|
LabeledScenarioStoreError,
|
|
511
|
+
ProfileMatrixError,
|
|
311
512
|
WorktreeAdapterError,
|
|
312
513
|
buildLoopProvenanceRecord,
|
|
313
514
|
composeGate,
|
|
@@ -333,6 +534,7 @@ export {
|
|
|
333
534
|
runEval,
|
|
334
535
|
runImprovementLoop,
|
|
335
536
|
runOptimization,
|
|
537
|
+
runProfileMatrix,
|
|
336
538
|
surfaceContentHash,
|
|
337
539
|
surfaceHash
|
|
338
540
|
};
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/campaign/labeled-store/fs-adapter.ts","../../src/campaign/worktree/index.ts"],"sourcesContent":["/**\n * @experimental\n *\n * Filesystem `LabeledScenarioStore` adapter. The default capture sink for\n * traces + eval artifacts. Production deployments typically swap for a\n * Turso/SQLite adapter (same interface).\n *\n * Records land as one JSONL file per source under `<root>/<source>.jsonl`.\n * Each line is a `LabeledScenarioRecord`. Append-only — no in-place edits.\n *\n * Safety properties enforced at write-time:\n *\n * - **Provenance required**: writes without `source`, `sourceVersionHash`,\n * `capturedAt`, `redactionStatus` are rejected. Closes the alignment\n * reviewer's data-poisoning gap.\n * - **Per-source rate limits**: optional `rateLimitBucket` + `maxWritesPerMinute`\n * stops a single tenant/source from flooding the store.\n *\n * Safety properties enforced at sample-time:\n *\n * - **Required split + capturedBefore**: substrate refuses to sample without\n * an explicit `split` ('train' | 'test') AND a temporal cutoff. Eliminates\n * accidental train/test contamination.\n * - **Default training-source filter**: when the store is sampled with\n * `split: 'train'`, production-trace records are EXCLUDED unless the\n * caller passes `filter.source: 'production-trace'` explicitly. Closes\n * the contamination-by-default gap flagged by the senior eval engineer.\n */\n\nimport { createHash } from 'node:crypto'\nimport { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs'\nimport { join } from 'node:path'\nimport type {\n LabeledScenarioRecord,\n LabeledScenarioSampleArgs,\n LabeledScenarioSource,\n LabeledScenarioStore,\n LabeledScenarioWrite,\n LabelTrust,\n} from '../types'\nimport { labelTrustRank } from '../types'\n\nexport interface FsLabeledScenarioStoreOptions {\n /** Root directory for JSONL files. Created if missing. */\n root: string\n /** Per-source rate limit. 
When set, writes exceeding the cap are rejected\n * with a typed error. Default: no limit. */\n maxWritesPerMinutePerBucket?: number\n /** Test seam — override `Date.now()` for deterministic tests. */\n now?: () => number\n}\n\nexport class LabeledScenarioStoreError extends Error {\n constructor(\n public readonly code: string,\n message: string,\n ) {\n super(message)\n this.name = 'LabeledScenarioStoreError'\n }\n}\n\ninterface RateLimitState {\n bucket: string\n windowStartMs: number\n count: number\n}\n\nexport class FsLabeledScenarioStore implements LabeledScenarioStore {\n private readonly now: () => number\n private readonly rateLimits = new Map<string, RateLimitState>()\n\n constructor(private readonly options: FsLabeledScenarioStoreOptions) {\n if (!existsSync(options.root)) mkdirSync(options.root, { recursive: true })\n this.now = options.now ?? Date.now\n }\n\n async observe(write: LabeledScenarioWrite): Promise<void> {\n this.assertProvenance(write)\n this.assertRateLimit(write)\n const record = this.toRecord(write)\n const path = this.pathForSource(write.source)\n const line = `${JSON.stringify(record)}\\n`\n // Append atomically. 
For high-throughput a writev-friendly buffered\n // implementation lands in the Turso adapter; FS adapter is for tests +\n // local dev + small workloads.\n appendLine(path, line)\n }\n\n async sample(args: LabeledScenarioSampleArgs): Promise<LabeledScenarioRecord[]> {\n if (!args.split) {\n throw new LabeledScenarioStoreError(\n 'split_required',\n 'sample() requires an explicit `split` (train | test) — substrate refuses ambiguous reads',\n )\n }\n if (!args.capturedBefore) {\n throw new LabeledScenarioStoreError(\n 'capturedBefore_required',\n 'sample() requires an explicit `capturedBefore` timestamp for temporal-split discipline',\n )\n }\n\n const all: LabeledScenarioRecord[] = []\n for (const source of ALL_SOURCES) {\n // Default training-source filter: when sampling train, EXCLUDE\n // production-trace records unless the caller asks for them.\n if (args.split === 'train' && source === 'production-trace') {\n const explicit = sourceFilterContains(args.filter?.source, 'production-trace')\n if (!explicit) continue\n }\n const path = this.pathForSource(source)\n if (!existsSync(path)) continue\n const lines = readFileSync(path, 'utf8').split('\\n').filter(Boolean)\n for (const line of lines) {\n let record: LabeledScenarioRecord\n try {\n record = JSON.parse(line) as LabeledScenarioRecord\n } catch {\n continue\n }\n if (!matchesFilter(record, args, source)) continue\n all.push(record)\n }\n }\n\n // Deterministic order: by capturedAt ascending, then recordHash.\n all.sort((a, b) => {\n if (a.capturedAt !== b.capturedAt) return a.capturedAt.localeCompare(b.capturedAt)\n return a.recordHash.localeCompare(b.recordHash)\n })\n\n return all.slice(0, args.count)\n }\n\n async size(): Promise<{\n train: number\n test: number\n bySource: Record<string, number>\n byTrust: Record<LabelTrust, number>\n }> {\n const bySource: Record<string, number> = {}\n const byTrust: Record<LabelTrust, number> = {\n unverified: 0,\n 'verified-signal': 0,\n 'human-rated': 0,\n }\n let 
total = 0\n for (const source of ALL_SOURCES) {\n const path = this.pathForSource(source)\n if (!existsSync(path)) {\n bySource[source] = 0\n continue\n }\n const lines = readFileSync(path, 'utf8').split('\\n').filter(Boolean)\n bySource[source] = lines.length\n total += lines.length\n for (const line of lines) {\n let trust: LabelTrust = 'unverified'\n try {\n trust = (JSON.parse(line) as LabeledScenarioRecord).labelTrust ?? 'unverified'\n } catch {\n // A malformed line counts as unverified — never silently gold.\n }\n byTrust[trust] += 1\n }\n }\n // FS adapter doesn't track split assignments per-record (split is\n // computed at sample-time based on `capturedBefore`). For size(), we\n // report `train`+`test` as the same total — split is a sampling concept.\n return { train: total, test: total, bySource, byTrust }\n }\n\n private assertProvenance(write: LabeledScenarioWrite): void {\n if (!write.source) {\n throw new LabeledScenarioStoreError(\n 'missing_source',\n 'LabeledScenarioWrite requires `source`',\n )\n }\n if (!write.sourceVersionHash || write.sourceVersionHash.length === 0) {\n throw new LabeledScenarioStoreError(\n 'missing_source_version',\n 'LabeledScenarioWrite requires `sourceVersionHash` (git sha or substrate version)',\n )\n }\n if (!write.capturedAt) {\n throw new LabeledScenarioStoreError(\n 'missing_captured_at',\n 'LabeledScenarioWrite requires `capturedAt` ISO timestamp',\n )\n }\n if (!write.redactionStatus) {\n throw new LabeledScenarioStoreError(\n 'missing_redaction_status',\n 'LabeledScenarioWrite requires explicit `redactionStatus` — raw / redacted-pii / redacted-secrets / fully-redacted',\n )\n }\n if (!ALL_SOURCES.includes(write.source)) {\n throw new LabeledScenarioStoreError(\n 'unknown_source',\n `LabeledScenarioWrite.source must be one of: ${ALL_SOURCES.join(', ')}`,\n )\n }\n }\n\n private assertRateLimit(write: LabeledScenarioWrite): void {\n const cap = this.options.maxWritesPerMinutePerBucket\n if (!cap || 
!write.rateLimitBucket) return\n const now = this.now()\n const windowMs = 60_000\n let state = this.rateLimits.get(write.rateLimitBucket)\n if (!state || now - state.windowStartMs >= windowMs) {\n state = { bucket: write.rateLimitBucket, windowStartMs: now, count: 0 }\n this.rateLimits.set(write.rateLimitBucket, state)\n }\n if (state.count >= cap) {\n throw new LabeledScenarioStoreError(\n 'rate_limit_exceeded',\n `LabeledScenarioStore: bucket ${write.rateLimitBucket} exceeded ${cap} writes/min`,\n )\n }\n state.count += 1\n }\n\n private toRecord(write: LabeledScenarioWrite): LabeledScenarioRecord {\n const recordHash = sha256(\n JSON.stringify({\n id: write.scenario.id,\n src: write.source,\n at: write.capturedAt,\n ver: write.sourceVersionHash,\n }),\n )\n // FS adapter assigns split at sample-time, but we cache a hint here\n // based on capturedAt vs the world's \"now\" — sampler overrides this.\n return {\n ...write,\n recordHash,\n split: 'train',\n }\n }\n\n private pathForSource(source: string): string {\n return join(this.options.root, `${source}.jsonl`)\n }\n}\n\nconst ALL_SOURCES: LabeledScenarioWrite['source'][] = [\n 'production-trace',\n 'eval-run',\n 'manual',\n 'red-team',\n 'synthetic',\n]\n\nfunction sourceFilterContains(\n filter: LabeledScenarioSource | LabeledScenarioSource[] | undefined,\n needle: LabeledScenarioSource,\n): boolean {\n if (!filter) return false\n if (Array.isArray(filter)) return filter.includes(needle)\n return filter === needle\n}\n\nfunction matchesFilter(\n record: LabeledScenarioRecord,\n args: LabeledScenarioSampleArgs,\n source: string,\n): boolean {\n // Temporal cutoff — train must be capturedAt < capturedBefore.\n if (args.split === 'train' && record.capturedAt >= args.capturedBefore) return false\n if (args.split === 'test' && record.capturedAt < args.capturedBefore) return false\n\n const f = args.filter\n if (!f) return true\n if (f.kind && record.scenario.kind !== f.kind) return false\n if (f.source) {\n const 
sources = Array.isArray(f.source) ? f.source : [f.source]\n if (!sources.includes(source as never)) return false\n }\n if (f.minComposite !== undefined || f.maxComposite !== undefined) {\n const composites = Object.values(record.judgeScores).map((s) => s.composite)\n const max = composites.length === 0 ? 0 : Math.max(...composites)\n if (f.minComposite !== undefined && max < f.minComposite) return false\n if (f.maxComposite !== undefined && max > f.maxComposite) return false\n }\n if (f.minTrust !== undefined && labelTrustRank(record.labelTrust) < labelTrustRank(f.minTrust)) {\n return false\n }\n return true\n}\n\nfunction sha256(input: string): string {\n return createHash('sha256').update(input).digest('hex').slice(0, 16)\n}\n\nfunction appendLine(path: string, line: string): void {\n if (existsSync(path)) {\n const existing = readFileSync(path, 'utf8')\n writeFileSync(path, existing + line)\n } else {\n writeFileSync(path, line)\n }\n}\n","/**\n * @experimental\n *\n * VCS-pluggable worktree adapter. One improvement = one worktree, PR-like\n * (multiple commits allowed). A code-tier driver's `propose()` creates a\n * worktree, an agent commits the change into it, and `finalize()` returns a\n * `CodeSurface{ worktreeRef }` the measurement checks out to run the worker\n * against the changed code. On promotion the worktree becomes the PR branch.\n *\n * The interface is VCS-agnostic so a future `jj` ([jj-vcs](https://github.com/jj-vcs/jj))\n * adapter can slot in without touching driver code. Only the git adapter\n * ships today. See `docs/design/self-improvement-engine.md`.\n */\n\nimport { execFileSync } from 'node:child_process'\nimport { existsSync } from 'node:fs'\nimport { basename, isAbsolute, join } from 'node:path'\nimport type { CodeSurface } from '../types'\n\nexport interface Worktree {\n /** Absolute path to the checked-out worktree directory. */\n path: string\n /** The branch the worktree is on (becomes the PR branch on promotion). 
*/\n branch: string\n /** The ref the worktree was forked from. */\n baseRef: string\n}\n\nexport interface WorktreeAdapter {\n /** Create an isolated worktree on a fresh branch off `baseRef`. */\n create(opts: { baseRef: string; label: string }): Promise<Worktree>\n /** Commit any pending changes in the worktree, then return a CodeSurface\n * pointing at it. The agent has already written its change into\n * `worktree.path` by the time this is called. */\n finalize(worktree: Worktree, summary: string): Promise<CodeSurface>\n /** Remove the worktree (and its branch) — called for losing candidates. */\n discard(worktree: Worktree): Promise<void>\n}\n\nexport class WorktreeAdapterError extends Error {\n constructor(\n message: string,\n readonly cause?: unknown,\n ) {\n super(message)\n this.name = 'WorktreeAdapterError'\n }\n}\n\nexport interface GitWorktreeAdapterOptions {\n /** Repo root the worktrees fork from. */\n repoRoot: string\n /** Directory worktrees are created under. Default: `<repoRoot>/.worktrees`. */\n worktreeDir?: string\n /** Branch-name prefix. Default: `improve`. */\n branchPrefix?: string\n /** Test seam — defaults to a real `git` runner. */\n git?: (args: string[], cwd: string) => string\n}\n\nfunction defaultGit(args: string[], cwd: string): string {\n try {\n return execFileSync('git', args, { cwd, encoding: 'utf8' }).trim()\n } catch (err) {\n const stderr =\n err && typeof err === 'object' && 'stderr' in err\n ? String((err as { stderr: unknown }).stderr)\n : ''\n throw new WorktreeAdapterError(`git ${args.join(' ')} failed: ${stderr || String(err)}`, err)\n }\n}\n\n/** Slugify a label into a branch-safe segment. */\nfunction slug(label: string): string {\n return (\n label\n .toLowerCase()\n .replace(/[^a-z0-9]+/g, '-')\n .replace(/^-+|-+$/g, '')\n .slice(0, 48) || 'candidate'\n )\n}\n\nexport function gitWorktreeAdapter(opts: GitWorktreeAdapterOptions): WorktreeAdapter {\n const git = opts.git ?? 
defaultGit\n const worktreeDir = opts.worktreeDir ?? join(opts.repoRoot, '.worktrees')\n const branchPrefix = opts.branchPrefix ?? 'improve'\n\n return {\n async create({ baseRef, label }) {\n const id = `${slug(label)}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 6)}`\n const branch = `${branchPrefix}/${id}`\n const path = join(worktreeDir, id)\n git(['worktree', 'add', '-b', branch, path, baseRef], opts.repoRoot)\n return { path, branch, baseRef }\n },\n\n async finalize(worktree, summary) {\n // Stage + commit any pending changes the agent left in the worktree.\n // A no-op commit is refused by git, so only commit when the tree is dirty.\n const status = git(['status', '--porcelain'], worktree.path)\n if (status.length > 0) {\n git(['add', '-A'], worktree.path)\n git(['commit', '-m', summary], worktree.path)\n }\n return {\n kind: 'code',\n worktreeRef: worktree.path,\n baseRef: worktree.baseRef,\n summary,\n }\n },\n\n async discard(worktree) {\n // Remove the worktree, then delete its branch. Force-remove because the\n // worktree may hold uncommitted experiment state we're discarding.\n git(['worktree', 'remove', '--force', worktree.path], opts.repoRoot)\n git(['branch', '-D', worktree.branch], opts.repoRoot)\n },\n }\n}\n\n/** Resolve a `CodeSurface`'s worktreeRef to a directory the measurement can\n * run the worker in. A path ref is returned as-is; anything else is treated\n * as a ref under the adapter's worktree dir. 
*/\nexport function resolveWorktreePath(surface: CodeSurface, worktreeDir?: string): string {\n if (isAbsolute(surface.worktreeRef) && existsSync(surface.worktreeRef)) return surface.worktreeRef\n if (worktreeDir) return join(worktreeDir, basename(surface.worktreeRef))\n return surface.worktreeRef\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AA6BA,SAAS,kBAAkB;AAC3B,SAAS,YAAY,WAAW,cAAc,qBAAqB;AACnE,SAAS,YAAY;AAqBd,IAAM,4BAAN,cAAwC,MAAM;AAAA,EACnD,YACkB,MAChB,SACA;AACA,UAAM,OAAO;AAHG;AAIhB,SAAK,OAAO;AAAA,EACd;AAAA,EALkB;AAMpB;AAQO,IAAM,yBAAN,MAA6D;AAAA,EAIlE,YAA6B,SAAwC;AAAxC;AAC3B,QAAI,CAAC,WAAW,QAAQ,IAAI,EAAG,WAAU,QAAQ,MAAM,EAAE,WAAW,KAAK,CAAC;AAC1E,SAAK,MAAM,QAAQ,OAAO,KAAK;AAAA,EACjC;AAAA,EAH6B;AAAA,EAHZ;AAAA,EACA,aAAa,oBAAI,IAA4B;AAAA,EAO9D,MAAM,QAAQ,OAA4C;AACxD,SAAK,iBAAiB,KAAK;AAC3B,SAAK,gBAAgB,KAAK;AAC1B,UAAM,SAAS,KAAK,SAAS,KAAK;AAClC,UAAM,OAAO,KAAK,cAAc,MAAM,MAAM;AAC5C,UAAM,OAAO,GAAG,KAAK,UAAU,MAAM,CAAC;AAAA;AAItC,eAAW,MAAM,IAAI;AAAA,EACvB;AAAA,EAEA,MAAM,OAAO,MAAmE;AAC9E,QAAI,CAAC,KAAK,OAAO;AACf,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,KAAK,gBAAgB;AACxB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AAEA,UAAM,MAA+B,CAAC;AACtC,eAAW,UAAU,aAAa;AAGhC,UAAI,KAAK,UAAU,WAAW,WAAW,oBAAoB;AAC3D,cAAM,WAAW,qBAAqB,KAAK,QAAQ,QAAQ,kBAAkB;AAC7E,YAAI,CAAC,SAAU;AAAA,MACjB;AACA,YAAM,OAAO,KAAK,cAAc,MAAM;AACtC,UAAI,CAAC,WAAW,IAAI,EAAG;AACvB,YAAM,QAAQ,aAAa,MAAM,MAAM,EAAE,MAAM,IAAI,EAAE,OAAO,OAAO;AACnE,iBAAW,QAAQ,OAAO;AACxB,YAAI;AACJ,YAAI;AACF,mBAAS,KAAK,MAAM,IAAI;AAAA,QAC1B,QAAQ;AACN;AAAA,QACF;AACA,YAAI,CAAC,cAAc,QAAQ,MAAM,MAAM,EAAG;AAC1C,YAAI,KAAK,MAAM;AAAA,MACjB;AAAA,IACF;AAGA,QAAI,KAAK,CAAC,GAAG,MAAM;AACjB,UAAI,EAAE,eAAe,EAAE,WAAY,QAAO,EAAE,WAAW,cAAc,EAAE,UAAU;AACjF,aAAO,EAAE,WAAW,cAAc,EAAE,UAAU;AAAA,IAChD,CAAC;AAED,WAAO,IAAI,MAAM,GAAG,KAAK,KAAK;AAAA,EAChC;AAAA,EAEA,MAAM,OAKH;AACD,UAAM,WAAmC,CAAC;AAC1C,UAAM,UAAsC;AAAA,MAC1C,YAAY;AAAA,MACZ,mBAAmB;AAAA,MACnB,eAAe;AAAA,IACjB;AACA,QAAI,QAAQ;AACZ,eAAW,UAAU,aAAa;AAChC,YAAM,OAAO,KAAK,cAAc,MAAM;AACt
C,UAAI,CAAC,WAAW,IAAI,GAAG;AACrB,iBAAS,MAAM,IAAI;AACnB;AAAA,MACF;AACA,YAAM,QAAQ,aAAa,MAAM,MAAM,EAAE,MAAM,IAAI,EAAE,OAAO,OAAO;AACnE,eAAS,MAAM,IAAI,MAAM;AACzB,eAAS,MAAM;AACf,iBAAW,QAAQ,OAAO;AACxB,YAAI,QAAoB;AACxB,YAAI;AACF,kBAAS,KAAK,MAAM,IAAI,EAA4B,cAAc;AAAA,QACpE,QAAQ;AAAA,QAER;AACA,gBAAQ,KAAK,KAAK;AAAA,MACpB;AAAA,IACF;AAIA,WAAO,EAAE,OAAO,OAAO,MAAM,OAAO,UAAU,QAAQ;AAAA,EACxD;AAAA,EAEQ,iBAAiB,OAAmC;AAC1D,QAAI,CAAC,MAAM,QAAQ;AACjB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,qBAAqB,MAAM,kBAAkB,WAAW,GAAG;AACpE,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,YAAY;AACrB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,iBAAiB;AAC1B,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,YAAY,SAAS,MAAM,MAAM,GAAG;AACvC,YAAM,IAAI;AAAA,QACR;AAAA,QACA,+CAA+C,YAAY,KAAK,IAAI,CAAC;AAAA,MACvE;AAAA,IACF;AAAA,EACF;AAAA,EAEQ,gBAAgB,OAAmC;AACzD,UAAM,MAAM,KAAK,QAAQ;AACzB,QAAI,CAAC,OAAO,CAAC,MAAM,gBAAiB;AACpC,UAAM,MAAM,KAAK,IAAI;AACrB,UAAM,WAAW;AACjB,QAAI,QAAQ,KAAK,WAAW,IAAI,MAAM,eAAe;AACrD,QAAI,CAAC,SAAS,MAAM,MAAM,iBAAiB,UAAU;AACnD,cAAQ,EAAE,QAAQ,MAAM,iBAAiB,eAAe,KAAK,OAAO,EAAE;AACtE,WAAK,WAAW,IAAI,MAAM,iBAAiB,KAAK;AAAA,IAClD;AACA,QAAI,MAAM,SAAS,KAAK;AACtB,YAAM,IAAI;AAAA,QACR;AAAA,QACA,gCAAgC,MAAM,eAAe,aAAa,GAAG;AAAA,MACvE;AAAA,IACF;AACA,UAAM,SAAS;AAAA,EACjB;AAAA,EAEQ,SAAS,OAAoD;AACnE,UAAM,aAAa;AAAA,MACjB,KAAK,UAAU;AAAA,QACb,IAAI,MAAM,SAAS;AAAA,QACnB,KAAK,MAAM;AAAA,QACX,IAAI,MAAM;AAAA,QACV,KAAK,MAAM;AAAA,MACb,CAAC;AAAA,IACH;AAGA,WAAO;AAAA,MACL,GAAG;AAAA,MACH;AAAA,MACA,OAAO;AAAA,IACT;AAAA,EACF;AAAA,EAEQ,cAAc,QAAwB;AAC5C,WAAO,KAAK,KAAK,QAAQ,MAAM,GAAG,MAAM,QAAQ;AAAA,EAClD;AACF;AAEA,IAAM,cAAgD;AAAA,EACpD;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAEA,SAAS,qBACP,QACA,QACS;AACT,MAAI,CAAC,OAAQ,QAAO;AACpB,MAAI,MAAM,QAAQ,MAAM,EAAG,QAAO,OAAO,SAAS,MAAM;AACxD,SAAO,WAAW;AACpB;AAEA,SAAS,cACP,QACA,MACA,QACS;AAET,MAAI,KAAK,UAAU,WAAW,OAAO,cAAc,KAAK,eAAgB,QAAO;AAC/E,MAAI,KAAK,UAAU,UAAU,OAAO,aAAa,KAAK,eAAgB,QAAO;AAE7E,QAAM,IAA
I,KAAK;AACf,MAAI,CAAC,EAAG,QAAO;AACf,MAAI,EAAE,QAAQ,OAAO,SAAS,SAAS,EAAE,KAAM,QAAO;AACtD,MAAI,EAAE,QAAQ;AACZ,UAAM,UAAU,MAAM,QAAQ,EAAE,MAAM,IAAI,EAAE,SAAS,CAAC,EAAE,MAAM;AAC9D,QAAI,CAAC,QAAQ,SAAS,MAAe,EAAG,QAAO;AAAA,EACjD;AACA,MAAI,EAAE,iBAAiB,UAAa,EAAE,iBAAiB,QAAW;AAChE,UAAM,aAAa,OAAO,OAAO,OAAO,WAAW,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS;AAC3E,UAAM,MAAM,WAAW,WAAW,IAAI,IAAI,KAAK,IAAI,GAAG,UAAU;AAChE,QAAI,EAAE,iBAAiB,UAAa,MAAM,EAAE,aAAc,QAAO;AACjE,QAAI,EAAE,iBAAiB,UAAa,MAAM,EAAE,aAAc,QAAO;AAAA,EACnE;AACA,MAAI,EAAE,aAAa,UAAa,eAAe,OAAO,UAAU,IAAI,eAAe,EAAE,QAAQ,GAAG;AAC9F,WAAO;AAAA,EACT;AACA,SAAO;AACT;AAEA,SAAS,OAAO,OAAuB;AACrC,SAAO,WAAW,QAAQ,EAAE,OAAO,KAAK,EAAE,OAAO,KAAK,EAAE,MAAM,GAAG,EAAE;AACrE;AAEA,SAAS,WAAW,MAAc,MAAoB;AACpD,MAAI,WAAW,IAAI,GAAG;AACpB,UAAM,WAAW,aAAa,MAAM,MAAM;AAC1C,kBAAc,MAAM,WAAW,IAAI;AAAA,EACrC,OAAO;AACL,kBAAc,MAAM,IAAI;AAAA,EAC1B;AACF;;;AClSA,SAAS,oBAAoB;AAC7B,SAAS,cAAAA,mBAAkB;AAC3B,SAAS,UAAU,YAAY,QAAAC,aAAY;AAuBpC,IAAM,uBAAN,cAAmC,MAAM;AAAA,EAC9C,YACE,SACS,OACT;AACA,UAAM,OAAO;AAFJ;AAGT,SAAK,OAAO;AAAA,EACd;AAAA,EAJW;AAKb;AAaA,SAAS,WAAW,MAAgB,KAAqB;AACvD,MAAI;AACF,WAAO,aAAa,OAAO,MAAM,EAAE,KAAK,UAAU,OAAO,CAAC,EAAE,KAAK;AAAA,EACnE,SAAS,KAAK;AACZ,UAAM,SACJ,OAAO,OAAO,QAAQ,YAAY,YAAY,MAC1C,OAAQ,IAA4B,MAAM,IAC1C;AACN,UAAM,IAAI,qBAAqB,OAAO,KAAK,KAAK,GAAG,CAAC,YAAY,UAAU,OAAO,GAAG,CAAC,IAAI,GAAG;AAAA,EAC9F;AACF;AAGA,SAAS,KAAK,OAAuB;AACnC,SACE,MACG,YAAY,EACZ,QAAQ,eAAe,GAAG,EAC1B,QAAQ,YAAY,EAAE,EACtB,MAAM,GAAG,EAAE,KAAK;AAEvB;AAEO,SAAS,mBAAmB,MAAkD;AACnF,QAAM,MAAM,KAAK,OAAO;AACxB,QAAM,cAAc,KAAK,eAAeA,MAAK,KAAK,UAAU,YAAY;AACxE,QAAM,eAAe,KAAK,gBAAgB;AAE1C,SAAO;AAAA,IACL,MAAM,OAAO,EAAE,SAAS,MAAM,GAAG;AAC/B,YAAM,KAAK,GAAG,KAAK,KAAK,CAAC,IAAI,KAAK,IAAI,EAAE,SAAS,EAAE,CAAC,IAAI,KAAK,OAAO,EAAE,SAAS,EAAE,EAAE,MAAM,GAAG,CAAC,CAAC;AAC9F,YAAM,SAAS,GAAG,YAAY,IAAI,EAAE;AACpC,YAAM,OAAOA,MAAK,aAAa,EAAE;AACjC,UAAI,CAAC,YAAY,OAAO,MAAM,QAAQ,MAAM,OAAO,GAAG,KAAK,QAAQ;AACnE,aAAO,EAAE,MAAM,QAAQ,QAAQ;AAAA,IACjC;AAAA,IAEA,MAAM,SAAS,UAAU,SAAS;AAGhC,YAAM,SAAS,IAAI,CAAC,UAAU,aAAa,GAAG,SAAS,IAAI;A
AC3D,UAAI,OAAO,SAAS,GAAG;AACrB,YAAI,CAAC,OAAO,IAAI,GAAG,SAAS,IAAI;AAChC,YAAI,CAAC,UAAU,MAAM,OAAO,GAAG,SAAS,IAAI;AAAA,MAC9C;AACA,aAAO;AAAA,QACL,MAAM;AAAA,QACN,aAAa,SAAS;AAAA,QACtB,SAAS,SAAS;AAAA,QAClB;AAAA,MACF;AAAA,IACF;AAAA,IAEA,MAAM,QAAQ,UAAU;AAGtB,UAAI,CAAC,YAAY,UAAU,WAAW,SAAS,IAAI,GAAG,KAAK,QAAQ;AACnE,UAAI,CAAC,UAAU,MAAM,SAAS,MAAM,GAAG,KAAK,QAAQ;AAAA,IACtD;AAAA,EACF;AACF;AAKO,SAAS,oBAAoB,SAAsB,aAA8B;AACtF,MAAI,WAAW,QAAQ,WAAW,KAAKD,YAAW,QAAQ,WAAW,EAAG,QAAO,QAAQ;AACvF,MAAI,YAAa,QAAOC,MAAK,aAAa,SAAS,QAAQ,WAAW,CAAC;AACvE,SAAO,QAAQ;AACjB;","names":["existsSync","join"]}
|
|
1
|
+
{"version":3,"sources":["../../src/campaign/labeled-store/fs-adapter.ts","../../src/campaign/presets/run-profile-matrix.ts","../../src/campaign/worktree/index.ts"],"sourcesContent":["/**\n * @experimental\n *\n * Filesystem `LabeledScenarioStore` adapter. The default capture sink for\n * traces + eval artifacts. Production deployments typically swap for a\n * Turso/SQLite adapter (same interface).\n *\n * Records land as one JSONL file per source under `<root>/<source>.jsonl`.\n * Each line is a `LabeledScenarioRecord`. Append-only — no in-place edits.\n *\n * Safety properties enforced at write-time:\n *\n * - **Provenance required**: writes without `source`, `sourceVersionHash`,\n * `capturedAt`, `redactionStatus` are rejected. Closes the alignment\n * reviewer's data-poisoning gap.\n * - **Per-source rate limits**: optional `rateLimitBucket` + `maxWritesPerMinute`\n * stops a single tenant/source from flooding the store.\n *\n * Safety properties enforced at sample-time:\n *\n * - **Required split + capturedBefore**: substrate refuses to sample without\n * an explicit `split` ('train' | 'test') AND a temporal cutoff. Eliminates\n * accidental train/test contamination.\n * - **Default training-source filter**: when the store is sampled with\n * `split: 'train'`, production-trace records are EXCLUDED unless the\n * caller passes `filter.source: 'production-trace'` explicitly. Closes\n * the contamination-by-default gap flagged by the senior eval engineer.\n */\n\nimport { createHash } from 'node:crypto'\nimport { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs'\nimport { join } from 'node:path'\nimport type {\n LabeledScenarioRecord,\n LabeledScenarioSampleArgs,\n LabeledScenarioSource,\n LabeledScenarioStore,\n LabeledScenarioWrite,\n LabelTrust,\n} from '../types'\nimport { labelTrustRank } from '../types'\n\nexport interface FsLabeledScenarioStoreOptions {\n /** Root directory for JSONL files. Created if missing. 
*/\n root: string\n /** Per-source rate limit. When set, writes exceeding the cap are rejected\n * with a typed error. Default: no limit. */\n maxWritesPerMinutePerBucket?: number\n /** Test seam — override `Date.now()` for deterministic tests. */\n now?: () => number\n}\n\nexport class LabeledScenarioStoreError extends Error {\n constructor(\n public readonly code: string,\n message: string,\n ) {\n super(message)\n this.name = 'LabeledScenarioStoreError'\n }\n}\n\ninterface RateLimitState {\n bucket: string\n windowStartMs: number\n count: number\n}\n\nexport class FsLabeledScenarioStore implements LabeledScenarioStore {\n private readonly now: () => number\n private readonly rateLimits = new Map<string, RateLimitState>()\n\n constructor(private readonly options: FsLabeledScenarioStoreOptions) {\n if (!existsSync(options.root)) mkdirSync(options.root, { recursive: true })\n this.now = options.now ?? Date.now\n }\n\n async observe(write: LabeledScenarioWrite): Promise<void> {\n this.assertProvenance(write)\n this.assertRateLimit(write)\n const record = this.toRecord(write)\n const path = this.pathForSource(write.source)\n const line = `${JSON.stringify(record)}\\n`\n // Append atomically. 
For high-throughput a writev-friendly buffered\n // implementation lands in the Turso adapter; FS adapter is for tests +\n // local dev + small workloads.\n appendLine(path, line)\n }\n\n async sample(args: LabeledScenarioSampleArgs): Promise<LabeledScenarioRecord[]> {\n if (!args.split) {\n throw new LabeledScenarioStoreError(\n 'split_required',\n 'sample() requires an explicit `split` (train | test) — substrate refuses ambiguous reads',\n )\n }\n if (!args.capturedBefore) {\n throw new LabeledScenarioStoreError(\n 'capturedBefore_required',\n 'sample() requires an explicit `capturedBefore` timestamp for temporal-split discipline',\n )\n }\n\n const all: LabeledScenarioRecord[] = []\n for (const source of ALL_SOURCES) {\n // Default training-source filter: when sampling train, EXCLUDE\n // production-trace records unless the caller asks for them.\n if (args.split === 'train' && source === 'production-trace') {\n const explicit = sourceFilterContains(args.filter?.source, 'production-trace')\n if (!explicit) continue\n }\n const path = this.pathForSource(source)\n if (!existsSync(path)) continue\n const lines = readFileSync(path, 'utf8').split('\\n').filter(Boolean)\n for (const line of lines) {\n let record: LabeledScenarioRecord\n try {\n record = JSON.parse(line) as LabeledScenarioRecord\n } catch {\n continue\n }\n if (!matchesFilter(record, args, source)) continue\n all.push(record)\n }\n }\n\n // Deterministic order: by capturedAt ascending, then recordHash.\n all.sort((a, b) => {\n if (a.capturedAt !== b.capturedAt) return a.capturedAt.localeCompare(b.capturedAt)\n return a.recordHash.localeCompare(b.recordHash)\n })\n\n return all.slice(0, args.count)\n }\n\n async size(): Promise<{\n train: number\n test: number\n bySource: Record<string, number>\n byTrust: Record<LabelTrust, number>\n }> {\n const bySource: Record<string, number> = {}\n const byTrust: Record<LabelTrust, number> = {\n unverified: 0,\n 'verified-signal': 0,\n 'human-rated': 0,\n }\n let 
total = 0\n for (const source of ALL_SOURCES) {\n const path = this.pathForSource(source)\n if (!existsSync(path)) {\n bySource[source] = 0\n continue\n }\n const lines = readFileSync(path, 'utf8').split('\\n').filter(Boolean)\n bySource[source] = lines.length\n total += lines.length\n for (const line of lines) {\n let trust: LabelTrust = 'unverified'\n try {\n trust = (JSON.parse(line) as LabeledScenarioRecord).labelTrust ?? 'unverified'\n } catch {\n // A malformed line counts as unverified — never silently gold.\n }\n byTrust[trust] += 1\n }\n }\n // FS adapter doesn't track split assignments per-record (split is\n // computed at sample-time based on `capturedBefore`). For size(), we\n // report `train`+`test` as the same total — split is a sampling concept.\n return { train: total, test: total, bySource, byTrust }\n }\n\n private assertProvenance(write: LabeledScenarioWrite): void {\n if (!write.source) {\n throw new LabeledScenarioStoreError(\n 'missing_source',\n 'LabeledScenarioWrite requires `source`',\n )\n }\n if (!write.sourceVersionHash || write.sourceVersionHash.length === 0) {\n throw new LabeledScenarioStoreError(\n 'missing_source_version',\n 'LabeledScenarioWrite requires `sourceVersionHash` (git sha or substrate version)',\n )\n }\n if (!write.capturedAt) {\n throw new LabeledScenarioStoreError(\n 'missing_captured_at',\n 'LabeledScenarioWrite requires `capturedAt` ISO timestamp',\n )\n }\n if (!write.redactionStatus) {\n throw new LabeledScenarioStoreError(\n 'missing_redaction_status',\n 'LabeledScenarioWrite requires explicit `redactionStatus` — raw / redacted-pii / redacted-secrets / fully-redacted',\n )\n }\n if (!ALL_SOURCES.includes(write.source)) {\n throw new LabeledScenarioStoreError(\n 'unknown_source',\n `LabeledScenarioWrite.source must be one of: ${ALL_SOURCES.join(', ')}`,\n )\n }\n }\n\n private assertRateLimit(write: LabeledScenarioWrite): void {\n const cap = this.options.maxWritesPerMinutePerBucket\n if (!cap || 
!write.rateLimitBucket) return\n const now = this.now()\n const windowMs = 60_000\n let state = this.rateLimits.get(write.rateLimitBucket)\n if (!state || now - state.windowStartMs >= windowMs) {\n state = { bucket: write.rateLimitBucket, windowStartMs: now, count: 0 }\n this.rateLimits.set(write.rateLimitBucket, state)\n }\n if (state.count >= cap) {\n throw new LabeledScenarioStoreError(\n 'rate_limit_exceeded',\n `LabeledScenarioStore: bucket ${write.rateLimitBucket} exceeded ${cap} writes/min`,\n )\n }\n state.count += 1\n }\n\n private toRecord(write: LabeledScenarioWrite): LabeledScenarioRecord {\n const recordHash = sha256(\n JSON.stringify({\n id: write.scenario.id,\n src: write.source,\n at: write.capturedAt,\n ver: write.sourceVersionHash,\n }),\n )\n // FS adapter assigns split at sample-time, but we cache a hint here\n // based on capturedAt vs the world's \"now\" — sampler overrides this.\n return {\n ...write,\n recordHash,\n split: 'train',\n }\n }\n\n private pathForSource(source: string): string {\n return join(this.options.root, `${source}.jsonl`)\n }\n}\n\nconst ALL_SOURCES: LabeledScenarioWrite['source'][] = [\n 'production-trace',\n 'eval-run',\n 'manual',\n 'red-team',\n 'synthetic',\n]\n\nfunction sourceFilterContains(\n filter: LabeledScenarioSource | LabeledScenarioSource[] | undefined,\n needle: LabeledScenarioSource,\n): boolean {\n if (!filter) return false\n if (Array.isArray(filter)) return filter.includes(needle)\n return filter === needle\n}\n\nfunction matchesFilter(\n record: LabeledScenarioRecord,\n args: LabeledScenarioSampleArgs,\n source: string,\n): boolean {\n // Temporal cutoff — train must be capturedAt < capturedBefore.\n if (args.split === 'train' && record.capturedAt >= args.capturedBefore) return false\n if (args.split === 'test' && record.capturedAt < args.capturedBefore) return false\n\n const f = args.filter\n if (!f) return true\n if (f.kind && record.scenario.kind !== f.kind) return false\n if (f.source) {\n const 
sources = Array.isArray(f.source) ? f.source : [f.source]\n if (!sources.includes(source as never)) return false\n }\n if (f.minComposite !== undefined || f.maxComposite !== undefined) {\n const composites = Object.values(record.judgeScores).map((s) => s.composite)\n const max = composites.length === 0 ? 0 : Math.max(...composites)\n if (f.minComposite !== undefined && max < f.minComposite) return false\n if (f.maxComposite !== undefined && max > f.maxComposite) return false\n }\n if (f.minTrust !== undefined && labelTrustRank(record.labelTrust) < labelTrustRank(f.minTrust)) {\n return false\n }\n return true\n}\n\nfunction sha256(input: string): string {\n return createHash('sha256').update(input).digest('hex').slice(0, 16)\n}\n\nfunction appendLine(path: string, line: string): void {\n if (existsSync(path)) {\n const existing = readFileSync(path, 'utf8')\n writeFileSync(path, existing + line)\n } else {\n writeFileSync(path, line)\n }\n}\n","/**\n * @experimental\n *\n * `runProfileMatrix` — the missing keystone between `runAgentMatrix` and the\n * backend-integrity guard.\n *\n * The gap it closes: `runAgentMatrix` is a topology-opaque scheduler whose\n * cells return a bare `{ output, verdict, costUsd }` — no `tokenUsage`, not a\n * `RunRecord`. `assertRealBackend` / `summarizeBackendIntegrity` key on\n * `RunRecord.tokenUsage`, so they cannot run on a raw matrix result. Every\n * consumer therefore hand-writes the same bridge: fan a profile × scenario\n * cartesian, call dispatch, fabricate a `RunRecord` with token usage, thread it\n * back, run the integrity guard. 
That hand-rolled bridge is exactly the pile of\n * bespoke `eval:*` scripts the adoption skills keep trying (and failing) to\n * forbid.\n *\n * `runProfileMatrix` IS that bridge, once:\n *\n * - axis 3 (PROFILE) = `profiles: AgentProfile[]`\n * - axis 1 (PERSONA/SCENARIO) = `scenarios: Scenario[]` (each scenario carries\n * its persona; `personaOf` groups them for the `byPersona` pivot)\n * - the scoring axis = `judges`\n *\n * It runs `runCampaign` once per profile (reusing its seeds, reps, bootstrap\n * CIs, resumability, and the `LabeledScenarioStore` capture flywheel), maps\n * every cell to a validated `RunRecord` carrying the real `tokenUsage` the\n * dispatch reported via `ctx.cost.observeTokens`, and runs `assertRealBackend`\n * BY CONSTRUCTION before returning — so a stub-backend run fails loudly instead\n * of reporting a clean 0/N leaderboard.\n *\n * Dispatch contract: a dispatch that calls an LLM MUST report usage via\n * `ctx.cost.observeTokens({ input, output })` (and cost via `ctx.cost.observe`).\n * A dispatch that reports zero tokens is indistinguishable from a stub and the\n * integrity guard treats it as one.\n */\n\nimport { createHash } from 'node:crypto'\nimport { join } from 'node:path'\nimport { type AgentProfile, agentProfileHash } from '../../agent-profile'\nimport { AgentEvalError } from '../../errors'\nimport {\n assertRealBackend,\n type BackendIntegrityReport,\n summarizeBackendIntegrity,\n} from '../../integrity/backend-integrity'\nimport {\n type RunOutcome,\n type RunRecord,\n type RunSplitTag,\n validateRunRecord,\n} from '../../run-record'\nimport { runCampaign } from '../run-campaign'\nimport type { CampaignStorage } from '../storage'\nimport type {\n CampaignCellResult,\n CampaignResult,\n DispatchContext,\n JudgeConfig,\n LabeledScenarioSource,\n LabeledScenarioStore,\n Scenario,\n} from '../types'\n\n/** Thrown when the matrix is misconfigured (no profiles, a profile whose model\n * lacks a snapshot version, etc.). 
Distinct from `BackendIntegrityError`,\n * which signals a stub backend at run time. */\nexport class ProfileMatrixError extends AgentEvalError {\n constructor(message: string) {\n super('profile_matrix', message)\n }\n}\n\n/** Dispatch for one cell: render `profile` against `scenario`, returning the\n * artifact the judges score. Report LLM usage via `ctx.cost.observeTokens`\n * and `ctx.cost.observe` — the integrity guard depends on it. */\nexport type ProfileDispatchFn<TScenario extends Scenario, TArtifact> = (\n profile: AgentProfile,\n scenario: TScenario,\n ctx: DispatchContext,\n) => Promise<TArtifact>\n\nexport interface RunProfileMatrixOptions<TScenario extends Scenario, TArtifact> {\n /** Axis 3 — the agent-under-test configurations. Each is one column. */\n profiles: AgentProfile[]\n /** Axis 1 — the persona/scenario corpus, run against every profile. */\n scenarios: TScenario[]\n /** Renders one (profile, scenario) cell. */\n dispatch: ProfileDispatchFn<TScenario, TArtifact>\n /** The scoring axis. */\n judges?: JudgeConfig<TArtifact, TScenario>[]\n /** Where each profile's campaign writes artifacts/traces. One subdir per\n * profile. */\n runDir: string\n /** Git SHA the harness ran from — stamped onto every RunRecord (mandatory\n * for paper-grade records). */\n commitSha: string\n /** Logical experiment id shared across the whole matrix so the promotion\n * gate can pair profiles on matched scenarios. Default: a hash of the\n * profile + scenario ids. */\n experimentId?: string\n /** Which split these runs belong to. Default `'search'`. */\n splitTag?: RunSplitTag\n /** Replicates per (profile, scenario) cell for CI bands. Default 1. */\n reps?: number\n /** Campaign seed (per profile). Default 42. 
*/\n seed?: number\n /**\n * Backend-integrity posture, enforced AFTER the matrix completes:\n * - `'assert'` (default) — throw `BackendIntegrityError` if the run was a\n * stub (and, with `allowMixed:false`, if it was mixed).\n * - `'warn'` — log the verdict but never throw.\n * - `'off'` — skip the guard entirely (only for offline/replay analysis).\n */\n integrity?: 'assert' | 'warn' | 'off'\n /** Forwarded to `assertRealBackend`. Default true (tolerate partial 429\n * cascades); set false for strict CI gates. */\n allowMixed?: boolean\n /** Max concurrent cells WITHIN each profile's campaign. Default 2.\n * Profiles run sequentially so the cost ceiling is honored deterministically. */\n maxConcurrency?: number\n /** Cumulative USD cap per profile campaign. */\n costCeiling?: number\n /** Capture flywheel — forwarded to each campaign. */\n labeledStore?: LabeledScenarioStore | 'off'\n captureSource?: LabeledScenarioSource\n /** Storage backend. Default `fsCampaignStorage`. Pass\n * `inMemoryCampaignStorage()` for edge/CF-Worker/test runs. */\n storage?: CampaignStorage\n /** Test seam — override the wall clock. */\n now?: () => Date\n /** Optional persona key per scenario — drives the `byPersona` pivot. When\n * unset, `byPersona` is omitted. */\n personaOf?: (scenario: TScenario) => string\n /** Validate every produced RunRecord with `validateRunRecord` (fail-loud).\n * Default true — catches bad model snapshots and non-finite judge dims at\n * the boundary instead of letting them poison downstream analysis. */\n validate?: boolean\n}\n\nexport interface ProfileSummary {\n profileId: string\n profileHash: string\n model: string\n /** RunRecords produced for this profile (= scenarios × reps). */\n records: number\n /** Mean composite across this profile's records. */\n meanComposite: number\n totalCostUsd: number\n /** Per-profile integrity verdict — surfaces a single profile that ran stub\n * even when the matrix as a whole looks real. 
*/\n integrity: BackendIntegrityReport\n}\n\nexport interface ScenarioRollup {\n meanComposite: number\n n: number\n}\n\nexport interface RunProfileMatrixResult<TArtifact, TScenario extends Scenario> {\n matrixId: string\n experimentId: string\n /** One RunRecord per (profile, scenario, rep) cell — the integrity-checked,\n * paper-grade output. Feed straight into `analyzeRuns`, `HeldOutGate`,\n * scorecards, the hosted wire format. */\n records: RunRecord[]\n byProfile: Record<string, ProfileSummary>\n byScenario: Record<string, ScenarioRollup>\n /** Present only when `personaOf` was supplied. */\n byPersona?: Record<string, ScenarioRollup>\n /** Whole-matrix integrity report (the one `integrity:'assert'` enforces). */\n integrity: BackendIntegrityReport\n /** The raw per-profile campaign results, keyed by profile id. */\n campaigns: Record<string, CampaignResult<TArtifact, TScenario>>\n}\n\nfunction sanitize(id: string): string {\n return id.replace(/[^a-zA-Z0-9_-]/g, '_')\n}\n\nfunction sha(input: unknown): string {\n return createHash('sha256').update(JSON.stringify(input)).digest('hex')\n}\n\nfunction mean(xs: number[]): number {\n return xs.length === 0 ? 0 : xs.reduce((a, b) => a + b, 0) / xs.length\n}\n\nfunction cellComposite(cell: CampaignCellResult<unknown>): number {\n const composites = Object.values(cell.judgeScores).map((s) => s.composite)\n return composites.length === 0 ? 
0 : mean(composites)\n}\n\ninterface BuildRecordArgs<TArtifact> {\n cell: CampaignCellResult<TArtifact>\n profile: AgentProfile\n profileHash: string\n configHash: string\n experimentId: string\n splitTag: RunSplitTag\n commitSha: string\n matrixId: string\n}\n\nfunction buildRunRecord<TArtifact>(args: BuildRecordArgs<TArtifact>): RunRecord {\n const { cell, profile, profileHash, configHash, experimentId, splitTag, commitSha, matrixId } =\n args\n const composite = cellComposite(cell)\n\n // Flatten judge dimensions (judge-prefixed to avoid collisions) into raw.\n const raw: Record<string, number> = { composite }\n const perJudge: Record<string, Record<string, number>> = {}\n const dimAccum: Record<string, number[]> = {}\n const notes: string[] = []\n for (const [judgeName, js] of Object.entries(cell.judgeScores)) {\n perJudge[judgeName] = { ...js.dimensions }\n for (const [dim, value] of Object.entries(js.dimensions)) {\n raw[`${judgeName}.${dim}`] = value\n ;(dimAccum[dim] ??= []).push(value)\n }\n if (js.notes) notes.push(`${judgeName}: ${js.notes}`)\n }\n const perDimMean: Record<string, number> = {}\n for (const [dim, values] of Object.entries(dimAccum)) perDimMean[dim] = mean(values)\n\n const outcome: RunOutcome =\n splitTag === 'holdout' ? { holdoutScore: composite, raw } : { searchScore: composite, raw }\n if (Object.keys(perJudge).length > 0) {\n outcome.judgeScores = {\n perJudge,\n perDimMean,\n composite,\n ...(notes.length > 0 ? { notes: notes.join(' | ') } : {}),\n }\n }\n\n return {\n runId: `${matrixId}:${profile.id}:${cell.cellId}`,\n experimentId,\n candidateId: profile.id,\n seed: cell.seed,\n model: profile.model,\n promptHash: profileHash,\n configHash,\n commitSha,\n wallMs: cell.durationMs,\n costUsd: cell.costUsd,\n tokenUsage: cell.tokenUsage,\n outcome,\n splitTag,\n scenarioId: cell.scenarioId,\n ...(cell.error ? 
{ failureMode: cell.error } : {}),\n }\n}\n\nexport async function runProfileMatrix<TScenario extends Scenario, TArtifact>(\n opts: RunProfileMatrixOptions<TScenario, TArtifact>,\n): Promise<RunProfileMatrixResult<TArtifact, TScenario>> {\n if (opts.profiles.length === 0) throw new ProfileMatrixError('profiles must not be empty')\n if (opts.scenarios.length === 0) throw new ProfileMatrixError('scenarios must not be empty')\n\n const splitTag = opts.splitTag ?? 'search'\n const seed = opts.seed ?? 42\n const validate = opts.validate ?? true\n const integrityMode = opts.integrity ?? 'assert'\n const profileIds = opts.profiles.map((p) => p.id)\n const experimentId =\n opts.experimentId ??\n `pm_${sha({ profileIds, scenarios: opts.scenarios.map((s) => s.id) }).slice(0, 16)}`\n const matrixId = `mtx_${sha({ experimentId, profileIds, seed, splitTag }).slice(0, 16)}`\n\n // Preflight: every profile must hash (non-empty model) AND its model must\n // carry a snapshot version, BEFORE any LLM spend. A probe record run through\n // validateRunRecord catches both in the exact place they'd otherwise surface\n // far downstream.\n for (const profile of opts.profiles) {\n const profileHash = agentProfileHash(profile)\n try {\n validateRunRecord({\n runId: `${matrixId}:${profile.id}:probe`,\n experimentId,\n candidateId: profile.id,\n seed,\n model: profile.model,\n promptHash: profileHash,\n configHash: profileHash,\n commitSha: opts.commitSha,\n wallMs: 0,\n costUsd: 0,\n tokenUsage: { input: 0, output: 0 },\n outcome:\n splitTag === 'holdout' ? { holdoutScore: 0, raw: {} } : { searchScore: 0, raw: {} },\n splitTag,\n })\n } catch (err) {\n throw new ProfileMatrixError(\n `profile '${profile.id}' is not recordable: ${err instanceof Error ? 
err.message : String(err)}`,\n )\n }\n }\n\n const records: RunRecord[] = []\n const campaigns: Record<string, CampaignResult<TArtifact, TScenario>> = {}\n const byProfile: Record<string, ProfileSummary> = {}\n\n for (const profile of opts.profiles) {\n const profileHash = agentProfileHash(profile)\n const configHash = sha({\n profile: profileHash,\n judges: (opts.judges ?? []).map((j) => j.name),\n seed,\n splitTag,\n })\n\n // Bind the profile into a campaign dispatch. Name it so the campaign's\n // manifest hash is stable + distinct per profile.\n const dispatch = (scenario: TScenario, ctx: DispatchContext): Promise<TArtifact> =>\n opts.dispatch(profile, scenario, ctx)\n Object.defineProperty(dispatch, 'name', { value: `profile_${sanitize(profile.id)}` })\n\n const campaign = await runCampaign<TScenario, TArtifact>({\n scenarios: opts.scenarios,\n dispatch,\n judges: opts.judges,\n seed,\n reps: opts.reps,\n maxConcurrency: opts.maxConcurrency,\n costCeiling: opts.costCeiling,\n labeledStore: opts.labeledStore,\n captureSource: opts.captureSource,\n storage: opts.storage,\n now: opts.now,\n runDir: join(opts.runDir, sanitize(profile.id)),\n })\n campaigns[profile.id] = campaign\n\n const profileRecords: RunRecord[] = []\n for (const cell of campaign.cells) {\n const record = buildRunRecord({\n cell,\n profile,\n profileHash,\n configHash,\n experimentId,\n splitTag,\n commitSha: opts.commitSha,\n matrixId,\n })\n if (validate) validateRunRecord(record)\n profileRecords.push(record)\n records.push(record)\n }\n\n byProfile[profile.id] = {\n profileId: profile.id,\n profileHash,\n model: profile.model,\n records: profileRecords.length,\n meanComposite: mean(profileRecords.map(compositeOf)),\n totalCostUsd: profileRecords.reduce((a, r) => a + r.costUsd, 0),\n integrity: summarizeBackendIntegrity(profileRecords),\n }\n }\n\n // Integrity by construction — the whole point of the primitive.\n const integrity = summarizeBackendIntegrity(records)\n if (integrityMode === 
'assert') {\n assertRealBackend(records, { allowMixed: opts.allowMixed ?? true })\n } else if (integrityMode === 'warn' && integrity.verdict !== 'real') {\n // eslint-disable-next-line no-console\n console.warn(\n `[runProfileMatrix] backend integrity: ${integrity.verdict} — ${integrity.diagnosis}`,\n )\n }\n\n // Pivots.\n const byScenario = rollup(records, (r) => r.scenarioId)\n const byPersona = opts.personaOf\n ? rollupByPersona(records, opts.scenarios, opts.personaOf)\n : undefined\n\n return { matrixId, experimentId, records, byProfile, byScenario, byPersona, integrity, campaigns }\n}\n\n/** Composite for a produced RunRecord (the split score it carries). */\nfunction compositeOf(r: RunRecord): number {\n return r.outcome.holdoutScore ?? r.outcome.searchScore ?? 0\n}\n\nfunction rollup(\n records: RunRecord[],\n keyOf: (r: RunRecord) => string | undefined,\n): Record<string, ScenarioRollup> {\n const groups = new Map<string, number[]>()\n for (const r of records) {\n const key = keyOf(r)\n if (key === undefined) continue\n const arr = groups.get(key) ?? []\n arr.push(compositeOf(r))\n groups.set(key, arr)\n }\n const out: Record<string, ScenarioRollup> = {}\n for (const [key, xs] of groups) out[key] = { meanComposite: mean(xs), n: xs.length }\n return out\n}\n\nfunction rollupByPersona<TScenario extends Scenario>(\n records: RunRecord[],\n scenarios: TScenario[],\n personaOf: (s: TScenario) => string,\n): Record<string, ScenarioRollup> {\n const personaByScenarioId = new Map<string, string>()\n for (const s of scenarios) personaByScenarioId.set(s.id, personaOf(s))\n return rollup(records, (r) => (r.scenarioId ? personaByScenarioId.get(r.scenarioId) : undefined))\n}\n","/**\n * @experimental\n *\n * VCS-pluggable worktree adapter. One improvement = one worktree, PR-like\n * (multiple commits allowed). 
A code-tier driver's `propose()` creates a\n * worktree, an agent commits the change into it, and `finalize()` returns a\n * `CodeSurface{ worktreeRef }` the measurement checks out to run the worker\n * against the changed code. On promotion the worktree becomes the PR branch.\n *\n * The interface is VCS-agnostic so a future `jj` ([jj-vcs](https://github.com/jj-vcs/jj))\n * adapter can slot in without touching driver code. Only the git adapter\n * ships today. See `docs/design/self-improvement-engine.md`.\n */\n\nimport { execFileSync } from 'node:child_process'\nimport { existsSync } from 'node:fs'\nimport { basename, isAbsolute, join } from 'node:path'\nimport type { CodeSurface } from '../types'\n\nexport interface Worktree {\n /** Absolute path to the checked-out worktree directory. */\n path: string\n /** The branch the worktree is on (becomes the PR branch on promotion). */\n branch: string\n /** The ref the worktree was forked from. */\n baseRef: string\n}\n\nexport interface WorktreeAdapter {\n /** Create an isolated worktree on a fresh branch off `baseRef`. */\n create(opts: { baseRef: string; label: string }): Promise<Worktree>\n /** Commit any pending changes in the worktree, then return a CodeSurface\n * pointing at it. The agent has already written its change into\n * `worktree.path` by the time this is called. */\n finalize(worktree: Worktree, summary: string): Promise<CodeSurface>\n /** Remove the worktree (and its branch) — called for losing candidates. */\n discard(worktree: Worktree): Promise<void>\n}\n\nexport class WorktreeAdapterError extends Error {\n constructor(\n message: string,\n readonly cause?: unknown,\n ) {\n super(message)\n this.name = 'WorktreeAdapterError'\n }\n}\n\nexport interface GitWorktreeAdapterOptions {\n /** Repo root the worktrees fork from. */\n repoRoot: string\n /** Directory worktrees are created under. Default: `<repoRoot>/.worktrees`. */\n worktreeDir?: string\n /** Branch-name prefix. Default: `improve`. 
*/\n branchPrefix?: string\n /** Test seam — defaults to a real `git` runner. */\n git?: (args: string[], cwd: string) => string\n}\n\nfunction defaultGit(args: string[], cwd: string): string {\n try {\n return execFileSync('git', args, { cwd, encoding: 'utf8' }).trim()\n } catch (err) {\n const stderr =\n err && typeof err === 'object' && 'stderr' in err\n ? String((err as { stderr: unknown }).stderr)\n : ''\n throw new WorktreeAdapterError(`git ${args.join(' ')} failed: ${stderr || String(err)}`, err)\n }\n}\n\n/** Slugify a label into a branch-safe segment. */\nfunction slug(label: string): string {\n return (\n label\n .toLowerCase()\n .replace(/[^a-z0-9]+/g, '-')\n .replace(/^-+|-+$/g, '')\n .slice(0, 48) || 'candidate'\n )\n}\n\nexport function gitWorktreeAdapter(opts: GitWorktreeAdapterOptions): WorktreeAdapter {\n const git = opts.git ?? defaultGit\n const worktreeDir = opts.worktreeDir ?? join(opts.repoRoot, '.worktrees')\n const branchPrefix = opts.branchPrefix ?? 'improve'\n\n return {\n async create({ baseRef, label }) {\n const id = `${slug(label)}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 6)}`\n const branch = `${branchPrefix}/${id}`\n const path = join(worktreeDir, id)\n git(['worktree', 'add', '-b', branch, path, baseRef], opts.repoRoot)\n return { path, branch, baseRef }\n },\n\n async finalize(worktree, summary) {\n // Stage + commit any pending changes the agent left in the worktree.\n // A no-op commit is refused by git, so only commit when the tree is dirty.\n const status = git(['status', '--porcelain'], worktree.path)\n if (status.length > 0) {\n git(['add', '-A'], worktree.path)\n git(['commit', '-m', summary], worktree.path)\n }\n return {\n kind: 'code',\n worktreeRef: worktree.path,\n baseRef: worktree.baseRef,\n summary,\n }\n },\n\n async discard(worktree) {\n // Remove the worktree, then delete its branch. 
Force-remove because the\n // worktree may hold uncommitted experiment state we're discarding.\n git(['worktree', 'remove', '--force', worktree.path], opts.repoRoot)\n git(['branch', '-D', worktree.branch], opts.repoRoot)\n },\n }\n}\n\n/** Resolve a `CodeSurface`'s worktreeRef to a directory the measurement can\n * run the worker in. A path ref is returned as-is; anything else is treated\n * as a ref under the adapter's worktree dir. */\nexport function resolveWorktreePath(surface: CodeSurface, worktreeDir?: string): string {\n if (isAbsolute(surface.worktreeRef) && existsSync(surface.worktreeRef)) return surface.worktreeRef\n if (worktreeDir) return join(worktreeDir, basename(surface.worktreeRef))\n return surface.worktreeRef\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AA6BA,SAAS,kBAAkB;AAC3B,SAAS,YAAY,WAAW,cAAc,qBAAqB;AACnE,SAAS,YAAY;AAqBd,IAAM,4BAAN,cAAwC,MAAM;AAAA,EACnD,YACkB,MAChB,SACA;AACA,UAAM,OAAO;AAHG;AAIhB,SAAK,OAAO;AAAA,EACd;AAAA,EALkB;AAMpB;AAQO,IAAM,yBAAN,MAA6D;AAAA,EAIlE,YAA6B,SAAwC;AAAxC;AAC3B,QAAI,CAAC,WAAW,QAAQ,IAAI,EAAG,WAAU,QAAQ,MAAM,EAAE,WAAW,KAAK,CAAC;AAC1E,SAAK,MAAM,QAAQ,OAAO,KAAK;AAAA,EACjC;AAAA,EAH6B;AAAA,EAHZ;AAAA,EACA,aAAa,oBAAI,IAA4B;AAAA,EAO9D,MAAM,QAAQ,OAA4C;AACxD,SAAK,iBAAiB,KAAK;AAC3B,SAAK,gBAAgB,KAAK;AAC1B,UAAM,SAAS,KAAK,SAAS,KAAK;AAClC,UAAM,OAAO,KAAK,cAAc,MAAM,MAAM;AAC5C,UAAM,OAAO,GAAG,KAAK,UAAU,MAAM,CAAC;AAAA;AAItC,eAAW,MAAM,IAAI;AAAA,EACvB;AAAA,EAEA,MAAM,OAAO,MAAmE;AAC9E,QAAI,CAAC,KAAK,OAAO;AACf,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,KAAK,gBAAgB;AACxB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AAEA,UAAM,MAA+B,CAAC;AACtC,eAAW,UAAU,aAAa;AAGhC,UAAI,KAAK,UAAU,WAAW,WAAW,oBAAoB;AAC3D,cAAM,WAAW,qBAAqB,KAAK,QAAQ,QAAQ,kBAAkB;AAC7E,YAAI,CAAC,SAAU;AAAA,MACjB;AACA,YAAM,OAAO,KAAK,cAAc,MAAM;AACtC,UAAI,CAAC,WAAW,IAAI,EAAG;AACvB,YAAM,QAAQ,aAAa,MAAM,MAAM,EAAE,MAAM,IAAI,EAAE,OAAO,OAAO;AACnE,iBAAW,QAAQ,OAAO;AACxB,YAAI;AACJ,YAAI;AACF,mBAAS,KAAK,MAAM,IAAI;AAAA,QAC1B,QAAQ;AACN;AAAA,QACF;AACA,YA
AI,CAAC,cAAc,QAAQ,MAAM,MAAM,EAAG;AAC1C,YAAI,KAAK,MAAM;AAAA,MACjB;AAAA,IACF;AAGA,QAAI,KAAK,CAAC,GAAG,MAAM;AACjB,UAAI,EAAE,eAAe,EAAE,WAAY,QAAO,EAAE,WAAW,cAAc,EAAE,UAAU;AACjF,aAAO,EAAE,WAAW,cAAc,EAAE,UAAU;AAAA,IAChD,CAAC;AAED,WAAO,IAAI,MAAM,GAAG,KAAK,KAAK;AAAA,EAChC;AAAA,EAEA,MAAM,OAKH;AACD,UAAM,WAAmC,CAAC;AAC1C,UAAM,UAAsC;AAAA,MAC1C,YAAY;AAAA,MACZ,mBAAmB;AAAA,MACnB,eAAe;AAAA,IACjB;AACA,QAAI,QAAQ;AACZ,eAAW,UAAU,aAAa;AAChC,YAAM,OAAO,KAAK,cAAc,MAAM;AACtC,UAAI,CAAC,WAAW,IAAI,GAAG;AACrB,iBAAS,MAAM,IAAI;AACnB;AAAA,MACF;AACA,YAAM,QAAQ,aAAa,MAAM,MAAM,EAAE,MAAM,IAAI,EAAE,OAAO,OAAO;AACnE,eAAS,MAAM,IAAI,MAAM;AACzB,eAAS,MAAM;AACf,iBAAW,QAAQ,OAAO;AACxB,YAAI,QAAoB;AACxB,YAAI;AACF,kBAAS,KAAK,MAAM,IAAI,EAA4B,cAAc;AAAA,QACpE,QAAQ;AAAA,QAER;AACA,gBAAQ,KAAK,KAAK;AAAA,MACpB;AAAA,IACF;AAIA,WAAO,EAAE,OAAO,OAAO,MAAM,OAAO,UAAU,QAAQ;AAAA,EACxD;AAAA,EAEQ,iBAAiB,OAAmC;AAC1D,QAAI,CAAC,MAAM,QAAQ;AACjB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,qBAAqB,MAAM,kBAAkB,WAAW,GAAG;AACpE,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,YAAY;AACrB,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,MAAM,iBAAiB;AAC1B,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AACA,QAAI,CAAC,YAAY,SAAS,MAAM,MAAM,GAAG;AACvC,YAAM,IAAI;AAAA,QACR;AAAA,QACA,+CAA+C,YAAY,KAAK,IAAI,CAAC;AAAA,MACvE;AAAA,IACF;AAAA,EACF;AAAA,EAEQ,gBAAgB,OAAmC;AACzD,UAAM,MAAM,KAAK,QAAQ;AACzB,QAAI,CAAC,OAAO,CAAC,MAAM,gBAAiB;AACpC,UAAM,MAAM,KAAK,IAAI;AACrB,UAAM,WAAW;AACjB,QAAI,QAAQ,KAAK,WAAW,IAAI,MAAM,eAAe;AACrD,QAAI,CAAC,SAAS,MAAM,MAAM,iBAAiB,UAAU;AACnD,cAAQ,EAAE,QAAQ,MAAM,iBAAiB,eAAe,KAAK,OAAO,EAAE;AACtE,WAAK,WAAW,IAAI,MAAM,iBAAiB,KAAK;AAAA,IAClD;AACA,QAAI,MAAM,SAAS,KAAK;AACtB,YAAM,IAAI;AAAA,QACR;AAAA,QACA,gCAAgC,MAAM,eAAe,aAAa,GAAG;AAAA,MACvE;AAAA,IACF;AACA,UAAM,SAAS;AAAA,EACjB;AAAA,EAEQ,SAAS,OAAoD;AACnE,UAAM,aAAa;AAAA,MACjB,KAAK,UAAU;AAAA,QACb,IAAI,MAAM,SAAS;AAAA,QACnB,KAAK,MAAM;AAAA,QACX,IAAI,MAAM;AAAA,QACV,KAAK,MAAM;AAAA,MACb,CAAC;AAAA,IACH;AAGA,WAAO;AAAA,MACL,GAAG;AAAA,MACH;AAAA,MACA
,OAAO;AAAA,IACT;AAAA,EACF;AAAA,EAEQ,cAAc,QAAwB;AAC5C,WAAO,KAAK,KAAK,QAAQ,MAAM,GAAG,MAAM,QAAQ;AAAA,EAClD;AACF;AAEA,IAAM,cAAgD;AAAA,EACpD;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAEA,SAAS,qBACP,QACA,QACS;AACT,MAAI,CAAC,OAAQ,QAAO;AACpB,MAAI,MAAM,QAAQ,MAAM,EAAG,QAAO,OAAO,SAAS,MAAM;AACxD,SAAO,WAAW;AACpB;AAEA,SAAS,cACP,QACA,MACA,QACS;AAET,MAAI,KAAK,UAAU,WAAW,OAAO,cAAc,KAAK,eAAgB,QAAO;AAC/E,MAAI,KAAK,UAAU,UAAU,OAAO,aAAa,KAAK,eAAgB,QAAO;AAE7E,QAAM,IAAI,KAAK;AACf,MAAI,CAAC,EAAG,QAAO;AACf,MAAI,EAAE,QAAQ,OAAO,SAAS,SAAS,EAAE,KAAM,QAAO;AACtD,MAAI,EAAE,QAAQ;AACZ,UAAM,UAAU,MAAM,QAAQ,EAAE,MAAM,IAAI,EAAE,SAAS,CAAC,EAAE,MAAM;AAC9D,QAAI,CAAC,QAAQ,SAAS,MAAe,EAAG,QAAO;AAAA,EACjD;AACA,MAAI,EAAE,iBAAiB,UAAa,EAAE,iBAAiB,QAAW;AAChE,UAAM,aAAa,OAAO,OAAO,OAAO,WAAW,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS;AAC3E,UAAM,MAAM,WAAW,WAAW,IAAI,IAAI,KAAK,IAAI,GAAG,UAAU;AAChE,QAAI,EAAE,iBAAiB,UAAa,MAAM,EAAE,aAAc,QAAO;AACjE,QAAI,EAAE,iBAAiB,UAAa,MAAM,EAAE,aAAc,QAAO;AAAA,EACnE;AACA,MAAI,EAAE,aAAa,UAAa,eAAe,OAAO,UAAU,IAAI,eAAe,EAAE,QAAQ,GAAG;AAC9F,WAAO;AAAA,EACT;AACA,SAAO;AACT;AAEA,SAAS,OAAO,OAAuB;AACrC,SAAO,WAAW,QAAQ,EAAE,OAAO,KAAK,EAAE,OAAO,KAAK,EAAE,MAAM,GAAG,EAAE;AACrE;AAEA,SAAS,WAAW,MAAc,MAAoB;AACpD,MAAI,WAAW,IAAI,GAAG;AACpB,UAAM,WAAW,aAAa,MAAM,MAAM;AAC1C,kBAAc,MAAM,WAAW,IAAI;AAAA,EACrC,OAAO;AACL,kBAAc,MAAM,IAAI;AAAA,EAC1B;AACF;;;AC5QA,SAAS,cAAAA,mBAAkB;AAC3B,SAAS,QAAAC,aAAY;AA6Bd,IAAM,qBAAN,cAAiC,eAAe;AAAA,EACrD,YAAY,SAAiB;AAC3B,UAAM,kBAAkB,OAAO;AAAA,EACjC;AACF;AAyGA,SAAS,SAAS,IAAoB;AACpC,SAAO,GAAG,QAAQ,mBAAmB,GAAG;AAC1C;AAEA,SAAS,IAAI,OAAwB;AACnC,SAAOC,YAAW,QAAQ,EAAE,OAAO,KAAK,UAAU,KAAK,CAAC,EAAE,OAAO,KAAK;AACxE;AAEA,SAAS,KAAK,IAAsB;AAClC,SAAO,GAAG,WAAW,IAAI,IAAI,GAAG,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,GAAG;AAClE;AAEA,SAAS,cAAc,MAA2C;AAChE,QAAM,aAAa,OAAO,OAAO,KAAK,WAAW,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS;AACzE,SAAO,WAAW,WAAW,IAAI,IAAI,KAAK,UAAU;AACtD;AAaA,SAAS,eAA0B,MAA6C;AAC9E,QAAM,EAAE,MAAM,SAAS,aAAa,YAAY,cAAc,UAAU,WAAW,SAAS,IAC1F;AACF,QAAM,YAAY,cAAc,IAAI;AAGpC,QAAM,MAA8B,EAAE,UAAU;AAChD,QAAM,WAAmD,CAAC;A
AC1D,QAAM,WAAqC,CAAC;AAC5C,QAAM,QAAkB,CAAC;AACzB,aAAW,CAAC,WAAW,EAAE,KAAK,OAAO,QAAQ,KAAK,WAAW,GAAG;AAC9D,aAAS,SAAS,IAAI,EAAE,GAAG,GAAG,WAAW;AACzC,eAAW,CAAC,KAAK,KAAK,KAAK,OAAO,QAAQ,GAAG,UAAU,GAAG;AACxD,UAAI,GAAG,SAAS,IAAI,GAAG,EAAE,IAAI;AAC5B,OAAC,SAAS,GAAG,MAAM,CAAC,GAAG,KAAK,KAAK;AAAA,IACpC;AACA,QAAI,GAAG,MAAO,OAAM,KAAK,GAAG,SAAS,KAAK,GAAG,KAAK,EAAE;AAAA,EACtD;AACA,QAAM,aAAqC,CAAC;AAC5C,aAAW,CAAC,KAAK,MAAM,KAAK,OAAO,QAAQ,QAAQ,EAAG,YAAW,GAAG,IAAI,KAAK,MAAM;AAEnF,QAAM,UACJ,aAAa,YAAY,EAAE,cAAc,WAAW,IAAI,IAAI,EAAE,aAAa,WAAW,IAAI;AAC5F,MAAI,OAAO,KAAK,QAAQ,EAAE,SAAS,GAAG;AACpC,YAAQ,cAAc;AAAA,MACpB;AAAA,MACA;AAAA,MACA;AAAA,MACA,GAAI,MAAM,SAAS,IAAI,EAAE,OAAO,MAAM,KAAK,KAAK,EAAE,IAAI,CAAC;AAAA,IACzD;AAAA,EACF;AAEA,SAAO;AAAA,IACL,OAAO,GAAG,QAAQ,IAAI,QAAQ,EAAE,IAAI,KAAK,MAAM;AAAA,IAC/C;AAAA,IACA,aAAa,QAAQ;AAAA,IACrB,MAAM,KAAK;AAAA,IACX,OAAO,QAAQ;AAAA,IACf,YAAY;AAAA,IACZ;AAAA,IACA;AAAA,IACA,QAAQ,KAAK;AAAA,IACb,SAAS,KAAK;AAAA,IACd,YAAY,KAAK;AAAA,IACjB;AAAA,IACA;AAAA,IACA,YAAY,KAAK;AAAA,IACjB,GAAI,KAAK,QAAQ,EAAE,aAAa,KAAK,MAAM,IAAI,CAAC;AAAA,EAClD;AACF;AAEA,eAAsB,iBACpB,MACuD;AACvD,MAAI,KAAK,SAAS,WAAW,EAAG,OAAM,IAAI,mBAAmB,4BAA4B;AACzF,MAAI,KAAK,UAAU,WAAW,EAAG,OAAM,IAAI,mBAAmB,6BAA6B;AAE3F,QAAM,WAAW,KAAK,YAAY;AAClC,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,WAAW,KAAK,YAAY;AAClC,QAAM,gBAAgB,KAAK,aAAa;AACxC,QAAM,aAAa,KAAK,SAAS,IAAI,CAAC,MAAM,EAAE,EAAE;AAChD,QAAM,eACJ,KAAK,gBACL,MAAM,IAAI,EAAE,YAAY,WAAW,KAAK,UAAU,IAAI,CAAC,MAAM,EAAE,EAAE,EAAE,CAAC,EAAE,MAAM,GAAG,EAAE,CAAC;AACpF,QAAM,WAAW,OAAO,IAAI,EAAE,cAAc,YAAY,MAAM,SAAS,CAAC,EAAE,MAAM,GAAG,EAAE,CAAC;AAMtF,aAAW,WAAW,KAAK,UAAU;AACnC,UAAM,cAAc,iBAAiB,OAAO;AAC5C,QAAI;AACF,wBAAkB;AAAA,QAChB,OAAO,GAAG,QAAQ,IAAI,QAAQ,EAAE;AAAA,QAChC;AAAA,QACA,aAAa,QAAQ;AAAA,QACrB;AAAA,QACA,OAAO,QAAQ;AAAA,QACf,YAAY;AAAA,QACZ,YAAY;AAAA,QACZ,WAAW,KAAK;AAAA,QAChB,QAAQ;AAAA,QACR,SAAS;AAAA,QACT,YAAY,EAAE,OAAO,GAAG,QAAQ,EAAE;AAAA,QAClC,SACE,aAAa,YAAY,EAAE,cAAc,GAAG,KAAK,CAAC,EAAE,IAAI,EAAE,aAAa,GAAG,KAAK,CAAC,EAAE;AAAA,QACpF;AAAA,MACF,CAAC;AAAA,IACH,SAAS,KAAK;AACZ,YAAM,I
AAI;AAAA,QACR,YAAY,QAAQ,EAAE,wBAAwB,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,MAChG;AAAA,IACF;AAAA,EACF;AAEA,QAAM,UAAuB,CAAC;AAC9B,QAAM,YAAkE,CAAC;AACzE,QAAM,YAA4C,CAAC;AAEnD,aAAW,WAAW,KAAK,UAAU;AACnC,UAAM,cAAc,iBAAiB,OAAO;AAC5C,UAAM,aAAa,IAAI;AAAA,MACrB,SAAS;AAAA,MACT,SAAS,KAAK,UAAU,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,IAAI;AAAA,MAC7C;AAAA,MACA;AAAA,IACF,CAAC;AAID,UAAM,WAAW,CAAC,UAAqB,QACrC,KAAK,SAAS,SAAS,UAAU,GAAG;AACtC,WAAO,eAAe,UAAU,QAAQ,EAAE,OAAO,WAAW,SAAS,QAAQ,EAAE,CAAC,GAAG,CAAC;AAEpF,UAAM,WAAW,MAAM,YAAkC;AAAA,MACvD,WAAW,KAAK;AAAA,MAChB;AAAA,MACA,QAAQ,KAAK;AAAA,MACb;AAAA,MACA,MAAM,KAAK;AAAA,MACX,gBAAgB,KAAK;AAAA,MACrB,aAAa,KAAK;AAAA,MAClB,cAAc,KAAK;AAAA,MACnB,eAAe,KAAK;AAAA,MACpB,SAAS,KAAK;AAAA,MACd,KAAK,KAAK;AAAA,MACV,QAAQC,MAAK,KAAK,QAAQ,SAAS,QAAQ,EAAE,CAAC;AAAA,IAChD,CAAC;AACD,cAAU,QAAQ,EAAE,IAAI;AAExB,UAAM,iBAA8B,CAAC;AACrC,eAAW,QAAQ,SAAS,OAAO;AACjC,YAAM,SAAS,eAAe;AAAA,QAC5B;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA,WAAW,KAAK;AAAA,QAChB;AAAA,MACF,CAAC;AACD,UAAI,SAAU,mBAAkB,MAAM;AACtC,qBAAe,KAAK,MAAM;AAC1B,cAAQ,KAAK,MAAM;AAAA,IACrB;AAEA,cAAU,QAAQ,EAAE,IAAI;AAAA,MACtB,WAAW,QAAQ;AAAA,MACnB;AAAA,MACA,OAAO,QAAQ;AAAA,MACf,SAAS,eAAe;AAAA,MACxB,eAAe,KAAK,eAAe,IAAI,WAAW,CAAC;AAAA,MACnD,cAAc,eAAe,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,SAAS,CAAC;AAAA,MAC9D,WAAW,0BAA0B,cAAc;AAAA,IACrD;AAAA,EACF;AAGA,QAAM,YAAY,0BAA0B,OAAO;AACnD,MAAI,kBAAkB,UAAU;AAC9B,sBAAkB,SAAS,EAAE,YAAY,KAAK,cAAc,KAAK,CAAC;AAAA,EACpE,WAAW,kBAAkB,UAAU,UAAU,YAAY,QAAQ;AAEnE,YAAQ;AAAA,MACN,yCAAyC,UAAU,OAAO,WAAM,UAAU,SAAS;AAAA,IACrF;AAAA,EACF;AAGA,QAAM,aAAa,OAAO,SAAS,CAAC,MAAM,EAAE,UAAU;AACtD,QAAM,YAAY,KAAK,YACnB,gBAAgB,SAAS,KAAK,WAAW,KAAK,SAAS,IACvD;AAEJ,SAAO,EAAE,UAAU,cAAc,SAAS,WAAW,YAAY,WAAW,WAAW,UAAU;AACnG;AAGA,SAAS,YAAY,GAAsB;AACzC,SAAO,EAAE,QAAQ,gBAAgB,EAAE,QAAQ,eAAe;AAC5D;AAEA,SAAS,OACP,SACA,OACgC;AAChC,QAAM,SAAS,oBAAI,IAAsB;AACzC,aAAW,KAAK,SAAS;AACvB,UAAM,MAAM,MAAM,CAAC;AACnB,QAAI,QAAQ,OAAW;AACvB,UAAM,MAAM,OAAO,IAAI,GAAG,KAAK,CAAC;AAChC,QAAI,KAAK,YAAY,CAAC,CAAC;AACvB,WAAO,IAAI,KAAK,GAAG;AAAA,EACr
B;AACA,QAAM,MAAsC,CAAC;AAC7C,aAAW,CAAC,KAAK,EAAE,KAAK,OAAQ,KAAI,GAAG,IAAI,EAAE,eAAe,KAAK,EAAE,GAAG,GAAG,GAAG,OAAO;AACnF,SAAO;AACT;AAEA,SAAS,gBACP,SACA,WACA,WACgC;AAChC,QAAM,sBAAsB,oBAAI,IAAoB;AACpD,aAAW,KAAK,UAAW,qBAAoB,IAAI,EAAE,IAAI,UAAU,CAAC,CAAC;AACrE,SAAO,OAAO,SAAS,CAAC,MAAO,EAAE,aAAa,oBAAoB,IAAI,EAAE,UAAU,IAAI,MAAU;AAClG;;;AC/YA,SAAS,oBAAoB;AAC7B,SAAS,cAAAC,mBAAkB;AAC3B,SAAS,UAAU,YAAY,QAAAC,aAAY;AAuBpC,IAAM,uBAAN,cAAmC,MAAM;AAAA,EAC9C,YACE,SACS,OACT;AACA,UAAM,OAAO;AAFJ;AAGT,SAAK,OAAO;AAAA,EACd;AAAA,EAJW;AAKb;AAaA,SAAS,WAAW,MAAgB,KAAqB;AACvD,MAAI;AACF,WAAO,aAAa,OAAO,MAAM,EAAE,KAAK,UAAU,OAAO,CAAC,EAAE,KAAK;AAAA,EACnE,SAAS,KAAK;AACZ,UAAM,SACJ,OAAO,OAAO,QAAQ,YAAY,YAAY,MAC1C,OAAQ,IAA4B,MAAM,IAC1C;AACN,UAAM,IAAI,qBAAqB,OAAO,KAAK,KAAK,GAAG,CAAC,YAAY,UAAU,OAAO,GAAG,CAAC,IAAI,GAAG;AAAA,EAC9F;AACF;AAGA,SAAS,KAAK,OAAuB;AACnC,SACE,MACG,YAAY,EACZ,QAAQ,eAAe,GAAG,EAC1B,QAAQ,YAAY,EAAE,EACtB,MAAM,GAAG,EAAE,KAAK;AAEvB;AAEO,SAAS,mBAAmB,MAAkD;AACnF,QAAM,MAAM,KAAK,OAAO;AACxB,QAAM,cAAc,KAAK,eAAeA,MAAK,KAAK,UAAU,YAAY;AACxE,QAAM,eAAe,KAAK,gBAAgB;AAE1C,SAAO;AAAA,IACL,MAAM,OAAO,EAAE,SAAS,MAAM,GAAG;AAC/B,YAAM,KAAK,GAAG,KAAK,KAAK,CAAC,IAAI,KAAK,IAAI,EAAE,SAAS,EAAE,CAAC,IAAI,KAAK,OAAO,EAAE,SAAS,EAAE,EAAE,MAAM,GAAG,CAAC,CAAC;AAC9F,YAAM,SAAS,GAAG,YAAY,IAAI,EAAE;AACpC,YAAM,OAAOA,MAAK,aAAa,EAAE;AACjC,UAAI,CAAC,YAAY,OAAO,MAAM,QAAQ,MAAM,OAAO,GAAG,KAAK,QAAQ;AACnE,aAAO,EAAE,MAAM,QAAQ,QAAQ;AAAA,IACjC;AAAA,IAEA,MAAM,SAAS,UAAU,SAAS;AAGhC,YAAM,SAAS,IAAI,CAAC,UAAU,aAAa,GAAG,SAAS,IAAI;AAC3D,UAAI,OAAO,SAAS,GAAG;AACrB,YAAI,CAAC,OAAO,IAAI,GAAG,SAAS,IAAI;AAChC,YAAI,CAAC,UAAU,MAAM,OAAO,GAAG,SAAS,IAAI;AAAA,MAC9C;AACA,aAAO;AAAA,QACL,MAAM;AAAA,QACN,aAAa,SAAS;AAAA,QACtB,SAAS,SAAS;AAAA,QAClB;AAAA,MACF;AAAA,IACF;AAAA,IAEA,MAAM,QAAQ,UAAU;AAGtB,UAAI,CAAC,YAAY,UAAU,WAAW,SAAS,IAAI,GAAG,KAAK,QAAQ;AACnE,UAAI,CAAC,UAAU,MAAM,SAAS,MAAM,GAAG,KAAK,QAAQ;AAAA,IACtD;AAAA,EACF;AACF;AAKO,SAAS,oBAAoB,SAAsB,aAA8B;AACtF,MAAI,WAAW,QAAQ,WAAW,KAAKD,YAAW,QAAQ,WAAW,EAAG,QAAO,QAAQ;AACvF,MAAI,YAAa,QAAOC,MAAK,aAAa,SAAS,QAAQ,WAAW,CAAC;AACvE
,SAAO,QAAQ;AACjB;","names":["createHash","join","createHash","join","existsSync","join"]}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
2
|
cohensD
|
|
3
|
-
} from "./chunk-
|
|
3
|
+
} from "./chunk-ITBRCT73.js";
|
|
4
4
|
import {
|
|
5
5
|
argHash,
|
|
6
6
|
groupBy,
|
|
@@ -551,4 +551,4 @@ export {
|
|
|
551
551
|
iqr,
|
|
552
552
|
welchsTTest
|
|
553
553
|
};
|
|
554
|
-
//# sourceMappingURL=chunk-
|
|
554
|
+
//# sourceMappingURL=chunk-3B7Y5AUR.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/errors.ts"],"sourcesContent":["/**\n * Error taxonomy for `@tangle-network/agent-eval`.\n *\n * Every error this package throws as part of its *public contract* extends\n * `AgentEvalError`. Consumers can pattern-match by `instanceof <Subclass>` or\n * by the stable string `code` carried on the base class.\n *\n * The codes are stable across minor versions; new codes can be added, but\n * existing codes never change meaning. New subclasses are non-breaking.\n *\n * Internal invariant guards (`throw new Error('this should never happen')`)\n * remain plain `Error`s on purpose — they're programmer-mistake assertions,\n * not consumer-catchable contract failures.\n */\n\nexport type AgentEvalErrorCode =\n | 'validation'\n | 'not_found'\n | 'config'\n | 'capture_integrity'\n | 'judge'\n | 'verification'\n | 'replay'\n | 'backend_integrity'\n | 'profile_matrix'\n\nexport class AgentEvalError extends Error {\n /** Stable string code. Survives minification; safe to switch on. */\n readonly code: AgentEvalErrorCode\n\n constructor(code: AgentEvalErrorCode, message: string, options?: { cause?: unknown }) {\n super(message, options)\n this.name = this.constructor.name\n this.code = code\n }\n}\n\n/** Caller passed invalid arguments (out of range, mutually-exclusive options, bad shape). */\nexport class ValidationError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('validation', message, options)\n }\n}\n\n/** A named resource (run, span, rubric, scenario, dataset row, route) does not exist. */\nexport class NotFoundError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('not_found', message, options)\n }\n}\n\n/** Configuration missing or malformed (`HOME` unset, required image not supplied, env var absent). 
*/\nexport class ConfigError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('config', message, options)\n }\n}\n\n/**\n * A run is missing the artifacts a launch-grade check requires:\n * raw HTTP capture absent, no LLM spans, route assertion failed, run-end\n * assertion tripped. Block ship on this; do not catch and move on.\n */\nexport class CaptureIntegrityError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('capture_integrity', message, options)\n }\n}\n\n/** A judge call failed in a way that's not retryable: schema parse failure, bad rubric, conflicting dimensions. */\nexport class JudgeError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('judge', message, options)\n }\n}\n\n/** A verifier signalled a hard failure (compile, test, schema) — distinct from a low judge score. */\nexport class VerificationError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('verification', message, options)\n }\n}\n\n/** Replay cache cannot satisfy a request: miss with no fallback, sink lacks list(), unsupported URL. 
*/\nexport class ReplayError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('replay', message, options)\n }\n}\n"],"mappings":";AA0BO,IAAM,iBAAN,cAA6B,MAAM;AAAA;AAAA,EAE/B;AAAA,EAET,YAAY,MAA0B,SAAiB,SAA+B;AACpF,UAAM,SAAS,OAAO;AACtB,SAAK,OAAO,KAAK,YAAY;AAC7B,SAAK,OAAO;AAAA,EACd;AACF;AAGO,IAAM,kBAAN,cAA8B,eAAe;AAAA,EAClD,YAAY,SAAiB,SAA+B;AAC1D,UAAM,cAAc,SAAS,OAAO;AAAA,EACtC;AACF;AAGO,IAAM,gBAAN,cAA4B,eAAe;AAAA,EAChD,YAAY,SAAiB,SAA+B;AAC1D,UAAM,aAAa,SAAS,OAAO;AAAA,EACrC;AACF;AAGO,IAAM,cAAN,cAA0B,eAAe;AAAA,EAC9C,YAAY,SAAiB,SAA+B;AAC1D,UAAM,UAAU,SAAS,OAAO;AAAA,EAClC;AACF;AAOO,IAAM,wBAAN,cAAoC,eAAe;AAAA,EACxD,YAAY,SAAiB,SAA+B;AAC1D,UAAM,qBAAqB,SAAS,OAAO;AAAA,EAC7C;AACF;AAGO,IAAM,aAAN,cAAyB,eAAe;AAAA,EAC7C,YAAY,SAAiB,SAA+B;AAC1D,UAAM,SAAS,SAAS,OAAO;AAAA,EACjC;AACF;AAGO,IAAM,oBAAN,cAAgC,eAAe;AAAA,EACpD,YAAY,SAAiB,SAA+B;AAC1D,UAAM,gBAAgB,SAAS,OAAO;AAAA,EACxC;AACF;AAGO,IAAM,cAAN,cAA0B,eAAe;AAAA,EAC9C,YAAY,SAAiB,SAA+B;AAC1D,UAAM,UAAU,SAAS,OAAO;AAAA,EAClC;AACF;","names":[]}
|
|
@@ -4,7 +4,7 @@ import {
|
|
|
4
4
|
} from "./chunk-NCRFYPS3.js";
|
|
5
5
|
import {
|
|
6
6
|
validateRunRecord
|
|
7
|
-
} from "./chunk-NCK5QLGT.js";
|
|
7
|
+
} from "./chunk-F3SRAAZO.js";
|
|
8
8
|
import {
|
|
9
9
|
TraceEmitter
|
|
10
10
|
} from "./chunk-TVVP3ZZQ.js";
|
|
@@ -610,4 +610,4 @@ export {
|
|
|
610
610
|
runProposeReviewAsControlLoop,
|
|
611
611
|
controlFailureClassFromVerification
|
|
612
612
|
};
|
|
613
|
-
//# sourceMappingURL=chunk-J4DIMSRK.js.map
|
|
613
|
+
//# sourceMappingURL=chunk-6EKXFFGQ.js.map
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
2
|
callLlmJson
|
|
3
|
-
} from "./chunk-VXNVVBZO.js";
|
|
3
|
+
} from "./chunk-IHDHUN2X.js";
|
|
4
4
|
|
|
5
5
|
// src/wire/schemas.ts
|
|
6
6
|
import { extendZodWithOpenApi } from "@asteasolutions/zod-to-openapi";
|
|
@@ -1002,4 +1002,4 @@ export {
|
|
|
1002
1002
|
startServer,
|
|
1003
1003
|
startServerAsync
|
|
1004
1004
|
};
|
|
1005
|
-
//# sourceMappingURL=chunk-63EPZQUZ.js.map
|
|
1005
|
+
//# sourceMappingURL=chunk-6REHLN5J.js.map
|
|
@@ -1,6 +1,9 @@
|
|
|
1
|
+
import {
|
|
2
|
+
BackendIntegrityError
|
|
3
|
+
} from "./chunk-E22YUOAL.js";
|
|
1
4
|
import {
|
|
2
5
|
confidenceInterval
|
|
3
|
-
} from "./chunk-S3SDD56V.js";
|
|
6
|
+
} from "./chunk-ITBRCT73.js";
|
|
4
7
|
|
|
5
8
|
// src/campaign/run-campaign.ts
|
|
6
9
|
import { createHash } from "crypto";
|
|
@@ -111,6 +114,7 @@ async function runCampaign(opts) {
|
|
|
111
114
|
signal: abortController.signal
|
|
112
115
|
});
|
|
113
116
|
cellsRef.push(result.cell);
|
|
117
|
+
enforceCellUsage(result.cell, opts.expectUsage ?? "warn");
|
|
114
118
|
totalCostUsd += result.cell.costUsd;
|
|
115
119
|
Object.assign(artifactsByPath, result.artifactsByPath);
|
|
116
120
|
if (opts.costCeiling !== void 0 && totalCostUsd >= opts.costCeiling) {
|
|
@@ -187,13 +191,22 @@ async function executeCell(args) {
|
|
|
187
191
|
}
|
|
188
192
|
};
|
|
189
193
|
let costSoFar = 0;
|
|
194
|
+
const tokensSoFar = { input: 0, output: 0 };
|
|
190
195
|
const cost = {
|
|
191
196
|
observe(amount, source) {
|
|
192
197
|
costSoFar += amount;
|
|
193
198
|
trace.span(`cost.${source}`, { amountUsd: amount }).end();
|
|
194
199
|
},
|
|
200
|
+
observeTokens(usage) {
|
|
201
|
+
tokensSoFar.input += usage.input;
|
|
202
|
+
tokensSoFar.output += usage.output;
|
|
203
|
+
if (usage.cached) tokensSoFar.cached = (tokensSoFar.cached ?? 0) + usage.cached;
|
|
204
|
+
},
|
|
195
205
|
current() {
|
|
196
206
|
return costSoFar;
|
|
207
|
+
},
|
|
208
|
+
tokens() {
|
|
209
|
+
return { ...tokensSoFar };
|
|
197
210
|
}
|
|
198
211
|
};
|
|
199
212
|
const placement = args.opts.cellPlacement?.({
|
|
@@ -241,6 +254,7 @@ async function executeCell(args) {
|
|
|
241
254
|
artifact: artifact ?? null,
|
|
242
255
|
judgeScores,
|
|
243
256
|
costUsd: costSoFar,
|
|
257
|
+
tokenUsage: { ...tokensSoFar },
|
|
244
258
|
durationMs: Date.now() - startMs,
|
|
245
259
|
seed: args.slot.cellSeed,
|
|
246
260
|
cached: false,
|
|
@@ -251,6 +265,28 @@ async function executeCell(args) {
|
|
|
251
265
|
}
|
|
252
266
|
return { cell, artifactsByPath };
|
|
253
267
|
}
|
|
268
|
+
function enforceCellUsage(cell, mode) {
|
|
269
|
+
if (mode === "off" || cell.error) return;
|
|
270
|
+
if (cell.artifact === null || cell.artifact === void 0) return;
|
|
271
|
+
const zeroTokens = cell.tokenUsage.input === 0 && cell.tokenUsage.output === 0;
|
|
272
|
+
if (cell.costUsd !== 0 || !zeroTokens) return;
|
|
273
|
+
const msg = `cell '${cell.cellId}' produced an artifact but reported zero cost and zero tokens \u2014 the dispatch never reported LLM usage via ctx.cost.observe/observeTokens (a stub cell)`;
|
|
274
|
+
if (mode === "assert") {
|
|
275
|
+
const report = {
|
|
276
|
+
totalRecords: 1,
|
|
277
|
+
stubRecords: 1,
|
|
278
|
+
realRecords: 0,
|
|
279
|
+
uncostedRecords: 0,
|
|
280
|
+
totalInputTokens: 0,
|
|
281
|
+
totalOutputTokens: 0,
|
|
282
|
+
totalCostUsd: 0,
|
|
283
|
+
verdict: "stub",
|
|
284
|
+
diagnosis: msg
|
|
285
|
+
};
|
|
286
|
+
throw new BackendIntegrityError(`expectUsage: ${msg}`, report);
|
|
287
|
+
}
|
|
288
|
+
console.warn(`[runCampaign] expectUsage: ${msg}`);
|
|
289
|
+
}
|
|
254
290
|
async function runJudgeCell(judge, input) {
|
|
255
291
|
return judge.score(input);
|
|
256
292
|
}
|
|
@@ -287,6 +323,7 @@ function skippedCell(slot, reason) {
|
|
|
287
323
|
artifact: null,
|
|
288
324
|
judgeScores: {},
|
|
289
325
|
costUsd: 0,
|
|
326
|
+
tokenUsage: { input: 0, output: 0 },
|
|
290
327
|
durationMs: 0,
|
|
291
328
|
seed: slot.cellSeed,
|
|
292
329
|
cached: false,
|
|
@@ -363,4 +400,4 @@ export {
|
|
|
363
400
|
inMemoryCampaignStorage,
|
|
364
401
|
runCampaign
|
|
365
402
|
};
|
|
366
|
-
//# sourceMappingURL=chunk-NOPYCRNG.js.map
|
|
403
|
+
//# sourceMappingURL=chunk-7TPYV2ER.js.map
|