@tangle-network/agent-eval 0.38.0 → 0.40.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. package/dist/campaign/index.d.ts +775 -0
  2. package/dist/campaign/index.js +807 -0
  3. package/dist/campaign/index.js.map +1 -0
  4. package/dist/chunk-5U2DOJU4.js +565 -0
  5. package/dist/chunk-5U2DOJU4.js.map +1 -0
  6. package/dist/{chunk-KE7TDJUO.js → chunk-AU2JLNSZ.js} +2 -2
  7. package/dist/{chunk-TSPOEDM3.js → chunk-BWZEGTES.js} +2 -5
  8. package/dist/chunk-BWZEGTES.js.map +1 -0
  9. package/dist/{chunk-3HYQXPC2.js → chunk-DMW5VENN.js} +3 -3
  10. package/dist/{chunk-TQL7BAOY.js → chunk-EGIPWXHL.js} +2 -2
  11. package/dist/chunk-GGE4NNQT.js +65 -0
  12. package/dist/chunk-GGE4NNQT.js.map +1 -0
  13. package/dist/{chunk-7PR3WPWE.js → chunk-L7XMNXLO.js} +2 -2
  14. package/dist/{chunk-RL6TERL2.js → chunk-LCIDRYGP.js} +3 -3
  15. package/dist/{chunk-L5UNCDAJ.js → chunk-MAOZCN36.js} +2 -64
  16. package/dist/chunk-MAOZCN36.js.map +1 -0
  17. package/dist/{chunk-LGAPK7NA.js → chunk-NKLGKF2Q.js} +2 -2
  18. package/dist/chunk-TMXPFWC7.js +305 -0
  19. package/dist/chunk-TMXPFWC7.js.map +1 -0
  20. package/dist/{chunk-KHZRNY3F.js → chunk-WP7SY7AI.js} +5 -4
  21. package/dist/chunk-WP7SY7AI.js.map +1 -0
  22. package/dist/chunk-YV7J7X5N.js +313 -0
  23. package/dist/chunk-YV7J7X5N.js.map +1 -0
  24. package/dist/{control-DVrmvM_k.d.ts → control-CmLJk3IG.d.ts} +1 -1
  25. package/dist/control.d.ts +3 -3
  26. package/dist/control.js +2 -2
  27. package/dist/{dataset-ueRVTUoY.d.ts → dataset-BlwAtYYf.d.ts} +1 -1
  28. package/dist/{feedback-trajectory-iATEAHmc.d.ts → feedback-trajectory-Dvy-bt7x.d.ts} +1 -1
  29. package/dist/governance/index.d.ts +133 -5
  30. package/dist/index.d.ts +35 -34
  31. package/dist/index.js +97 -630
  32. package/dist/index.js.map +1 -1
  33. package/dist/multishot/index.d.ts +21 -21
  34. package/dist/multishot/index.js +64 -15
  35. package/dist/multishot/index.js.map +1 -1
  36. package/dist/openapi.json +1 -1
  37. package/dist/optimization.d.ts +2 -2
  38. package/dist/optimization.js +5 -5
  39. package/dist/pipelines/index.js +2 -2
  40. package/dist/red-team-30II1T4o.d.ts +63 -0
  41. package/dist/{release-report-D2ykiLSe.d.ts → release-report-Di84bXD7.d.ts} +5 -2
  42. package/dist/reporting.d.ts +2 -2
  43. package/dist/reporting.js +3 -3
  44. package/dist/rl.js +15 -315
  45. package/dist/rl.js.map +1 -1
  46. package/dist/run-campaign-JYJXYHHL.js +10 -0
  47. package/dist/run-campaign-JYJXYHHL.js.map +1 -0
  48. package/dist/traces.js +7 -5
  49. package/dist/wire/index.d.ts +2 -2
  50. package/docs/design/loop-taxonomy.md +233 -0
  51. package/docs/design/self-improvement-engine.md +130 -0
  52. package/package.json +33 -24
  53. package/dist/chunk-KHZRNY3F.js.map +0 -1
  54. package/dist/chunk-L5UNCDAJ.js.map +0 -1
  55. package/dist/chunk-TSPOEDM3.js.map +0 -1
  56. package/dist/index-CN2agEaO.d.ts +0 -191
  57. /package/dist/{chunk-KE7TDJUO.js.map → chunk-AU2JLNSZ.js.map} +0 -0
  58. /package/dist/{chunk-3HYQXPC2.js.map → chunk-DMW5VENN.js.map} +0 -0
  59. /package/dist/{chunk-TQL7BAOY.js.map → chunk-EGIPWXHL.js.map} +0 -0
  60. /package/dist/{chunk-7PR3WPWE.js.map → chunk-L7XMNXLO.js.map} +0 -0
  61. /package/dist/{chunk-RL6TERL2.js.map → chunk-LCIDRYGP.js.map} +0 -0
  62. /package/dist/{chunk-LGAPK7NA.js.map → chunk-NKLGKF2Q.js.map} +0 -0
@@ -0,0 +1,807 @@
1
+ import {
2
+ runCampaign
3
+ } from "../chunk-TMXPFWC7.js";
4
+ import {
5
+ runCanaries,
6
+ scoreRedTeamOutput
7
+ } from "../chunk-5U2DOJU4.js";
8
+ import {
9
+ detectRewardHacking
10
+ } from "../chunk-YV7J7X5N.js";
11
+ import "../chunk-WP7SY7AI.js";
12
+ import "../chunk-GGE4NNQT.js";
13
+ import "../chunk-QYJT52YW.js";
14
+ import "../chunk-PZ5AY32C.js";
15
+
16
+ // src/campaign/auto-pr.ts
17
+ import { execSync } from "child_process";
18
+ import { writeFileSync } from "fs";
19
+ import { tmpdir } from "os";
20
+ import { join } from "path";
21
/**
 * Open a GitHub pull request for a campaign result that the gate approved.
 *
 * Refuses unless `options.gate.decision` is "ship". The rendered PR body is
 * always written to a temp file; in dry-run mode (explicit `options.dryRun`,
 * or by default when `GH_AUTO_PR_TOKEN` is unset) the function stops there and
 * reports where the body was written.
 *
 * @param {object} options - Reads `result`, `gate`, `promotedDiff`, `ghOwner`,
 *   `ghRepo`, and the optional `dryRun`, `branch`, `title`, `ghExec` overrides.
 * @returns {{opened: boolean, dryRun: boolean, reason: string, prUrl?: string}}
 */
function openAutoPr(options) {
  const verdict = options.gate.decision;
  if (verdict !== "ship") {
    return {
      opened: false,
      dryRun: false,
      reason: `gate verdict was "${verdict}" \u2014 refusing to open PR`
    };
  }

  const dryRun = options.dryRun ?? !process.env.GH_AUTO_PR_TOKEN;
  const branch = options.branch ?? `auto/${options.result.manifestHash.slice(0, 12)}`;
  const title = options.title ?? `auto: campaign ${options.result.manifestHash.slice(0, 8)} promoted by gate`;

  // The body goes through a file so it can be handed to `gh` via --body-file.
  const markdown = renderPrBody(options.result, options.gate, options.promotedDiff);
  const bodyPath = join(tmpdir(), `auto-pr-body-${Date.now()}.md`);
  writeFileSync(bodyPath, markdown);

  if (dryRun) {
    return {
      opened: false,
      dryRun: true,
      reason: `dry-run (GH_AUTO_PR_TOKEN not set). Would create PR on ${options.ghOwner}/${options.ghRepo} branch ${branch}. Body at ${bodyPath}.`
    };
  }

  const runGh = options.ghExec ?? defaultGhExec;
  const ghArgs = [
    "pr",
    "create",
    "--repo",
    `${options.ghOwner}/${options.ghRepo}`,
    "--head",
    branch,
    "--title",
    title,
    "--body-file",
    bodyPath
  ];
  const outcome = runGh(ghArgs);
  if (outcome.status !== 0) {
    return {
      opened: false,
      dryRun: false,
      reason: `gh pr create failed (exit ${outcome.status}): ${outcome.stderr.slice(0, 400)}`
    };
  }
  // `gh pr create` output is used verbatim (trimmed) as the PR URL.
  return { opened: true, prUrl: outcome.stdout.trim(), dryRun: false, reason: "PR opened" };
}
65
/**
 * Render the markdown body of an automated-promotion pull request.
 *
 * Sections, in order: run summary, gate verdict with reasons, contributing
 * gates table, the promoted diff (first 8000 characters, fenced as `diff`),
 * and the per-judge aggregate table.
 *
 * @param {object} result - Campaign result; reads `manifestHash`, `seed`,
 *   `durationMs` and `aggregates`.
 * @param {object} gate - Gate verdict; reads `decision`, `reasons`, `delta`
 *   and `contributingGates`.
 * @param {string} diff - Text of the promoted change.
 * @returns {string} Newline-joined markdown.
 */
function renderPrBody(result, gate, diff) {
  const summary = [
    `## Automated promotion by \`runImprovementLoop\``,
    "",
    `**Manifest**: \`${result.manifestHash}\``,
    `**Seed**: ${result.seed}`,
    `**Duration**: ${Math.round(result.durationMs / 1e3)}s`,
    `**Cells**: executed ${result.aggregates.cellsExecuted}, cached ${result.aggregates.cellsCached}, skipped ${result.aggregates.cellsSkipped}, failed ${result.aggregates.cellsFailed}`,
    `**Total spend**: $${result.aggregates.totalCostUsd.toFixed(2)}`,
    ""
  ];

  const verdict = [
    `### Gate verdict: \`${gate.decision}\``,
    "",
    ...Array.from(gate.reasons, (reason) => `- ${reason}`)
  ];
  if (gate.delta !== undefined) {
    verdict.push(`- delta: ${gate.delta.toFixed(3)}`);
  }
  verdict.push("");

  const gateRows = Array.from(gate.contributingGates, (entry) => {
    // Objects are shown as JSON, everything else via String(); both are capped at 80 chars.
    const rendered = typeof entry.detail === "object" ? JSON.stringify(entry.detail) : String(entry.detail);
    return `| ${entry.name} | ${entry.passed ? "\u2713" : "\u2717"} | ${rendered.slice(0, 80)} |`;
  });
  const gatesTable = [
    "### Contributing gates",
    "",
    "| gate | passed | detail |",
    "|---|---|---|",
    ...gateRows,
    ""
  ];

  const surface = ["### Promoted surface", "", "```diff", diff.slice(0, 8e3), "```", ""];

  const judgeRows = Object.entries(result.aggregates.byJudge).map(
    ([judge, stats]) => `| ${judge} | ${stats.mean.toFixed(3)} | [${stats.ci95[0].toFixed(3)}, ${stats.ci95[1].toFixed(3)}] | ${stats.n} |`
  );
  const judgesTable = [
    "### By-judge aggregates",
    "",
    "| judge | mean | ci95 | n |",
    "|---|---|---|---|",
    ...judgeRows
  ];

  return [...summary, ...verdict, ...gatesTable, ...surface, ...judgesTable].join("\n");
}
108
/**
 * Default `gh` runner: builds a shell command line from `args` and runs it
 * synchronously.
 *
 * `GH_TOKEN` for the child is taken from `GH_AUTO_PR_TOKEN`, then the existing
 * `GH_TOKEN`, then the empty string. Never throws: any failure (including a
 * failure while quoting the arguments) is reported through the returned object.
 *
 * @param {string[]} args - Arguments to pass to `gh`.
 * @returns {{stdout: string, stderr: string, status: number}} On success
 *   `stderr` is always "" and `status` is 0; on failure the fields come from
 *   the thrown error, with `status` defaulting to 1.
 */
function defaultGhExec(args) {
  try {
    const commandLine = `gh ${args.map(quoteArg).join(" ")}`;
    const childEnv = {
      ...process.env,
      GH_TOKEN: process.env.GH_AUTO_PR_TOKEN ?? process.env.GH_TOKEN ?? ""
    };
    const captured = execSync(commandLine, {
      env: childEnv,
      stdio: ["ignore", "pipe", "pipe"]
    });
    return { stdout: captured.toString("utf8"), stderr: "", status: 0 };
  } catch (err) {
    const failure = err;
    const stdout = failure.stdout?.toString("utf8") ?? "";
    const stderr = failure.stderr?.toString("utf8") ?? "";
    return { stdout, stderr, status: failure.status ?? 1 };
  }
}
124
/**
 * Quote one argument for inclusion in a shell command line.
 *
 * Arguments made only of `[a-zA-Z0-9_/\-:.@]` are returned untouched. Anything
 * else is wrapped in double quotes, with every character that stays special
 * inside a POSIX double-quoted word (`"`, `\`, `$`, and the backtick)
 * backslash-escaped, so the value cannot trigger command substitution or
 * variable expansion.
 *
 * @param {string} arg - Raw argument value.
 * @returns {string} The argument, safe to join into a shell command string.
 */
function quoteArg(arg) {
  if (/^[a-zA-Z0-9_/\-:.@]+$/.test(arg)) return arg;
  // "$&" re-inserts the matched character after the escaping backslash.
  const escaped = arg.replace(/["\\$`]/g, "\\$&");
  return `"${escaped}"`;
}
128
+
129
+ // src/campaign/drivers/evolutionary.ts
130
/**
 * Wrap a mutator as a campaign driver.
 *
 * The driver's `kind` is `evolutionary:<mutator kind>`. `propose` forwards to
 * `opts.mutator.mutate`, passing the per-call findings when there are any and
 * otherwise falling back to `opts.findings` (or an empty list).
 *
 * @param {object} opts - Reads `mutator` (`kind`, `mutate`) and optional `findings`.
 * @returns {{kind: string, propose: Function}}
 */
function evolutionaryDriver(opts) {
  const propose = async ({ currentSurface, findings, populationSize, signal }) => {
    const effectiveFindings = findings.length > 0 ? findings : opts.findings ?? [];
    return opts.mutator.mutate({
      findings: effectiveFindings,
      currentSurface,
      populationSize,
      signal
    });
  };
  return { kind: `evolutionary:${opts.mutator.kind}`, propose };
}
143
+
144
+ // src/campaign/gates/compose.ts
145
/**
 * Combine several gates into one.
 *
 * Gates are evaluated one after another, in argument order. The combined
 * decision is "ship" only when every gate ships; otherwise the first of
 * "arch_ceiling", "model_ceiling", "hold" present among the decisions wins,
 * and "need_more_work" is the fallback. Reasons are prefixed with the gate
 * name, and `delta` is taken from the first gate's result.
 *
 * @param {...object} gates - Gates exposing `name` and `decide(ctx)`.
 * @returns {{name: string, decide: Function}}
 * @throws {Error} When called with no gates.
 */
function composeGate(...gates) {
  if (gates.length === 0) {
    throw new Error("composeGate requires at least one gate");
  }

  // Non-ship verdicts, highest priority first.
  const blockingPriority = ["arch_ceiling", "model_ceiling", "hold"];

  return {
    name: `composed(${gates.map((g) => g.name).join(",")})`,
    async decide(ctx) {
      const outcomes = [];
      for (const gate of gates) {
        outcomes.push({ gate, res: await gate.decide(ctx) });
      }

      const verdicts = outcomes.map(({ res }) => res.decision);
      let decision;
      if (verdicts.every((v) => v === "ship")) {
        decision = "ship";
      } else {
        decision = blockingPriority.find((v) => verdicts.includes(v)) ?? "need_more_work";
      }

      const contributingGates = [];
      const reasons = [];
      for (const { gate, res } of outcomes) {
        if (res.contributingGates.length > 0) {
          contributingGates.push(...res.contributingGates);
        } else {
          // A gate that reports no sub-gates is represented by its own verdict.
          contributingGates.push({ name: gate.name, passed: res.decision === "ship", detail: res });
        }
      }
      for (const { gate, res } of outcomes) {
        for (const reason of res.reasons) reasons.push(`[${gate.name}] ${reason}`);
      }

      return { decision, reasons, contributingGates, delta: outcomes[0]?.res.delta };
    }
  };
}
174
+
175
+ // src/campaign/gates/default-production-gate.ts
176
/**
 * Build the default production gate.
 *
 * `decide(ctx)` evaluates five checks in a fixed order — held-out delta,
 * budget, red-team, reward-hacking, canary — and ships only when all of them
 * pass; otherwise the decision is "hold". Every check is reported in
 * `contributingGates`, and each failing check adds one entry to `reasons`.
 *
 * @param {object} options - Reads `holdoutScenarios`, and the optional
 *   `deltaThreshold` (default 0.5), `blockOnRewardHackingGaming` (default
 *   true), `budgetUsd`, `redTeamBattery`, `recentRuns`.
 * @returns {{name: string, decide: Function}}
 */
function defaultProductionGate(options) {
  const deltaThreshold = options.deltaThreshold ?? 0.5;
  const blockOnGaming = options.blockOnRewardHackingGaming ?? true;

  async function decide(ctx) {
    const reasons = [];
    const contributingGates = [];
    // Record one check; the failure message is built lazily, only when needed.
    const record = (name, passed, detail, explainFailure) => {
      contributingGates.push({ name, passed, detail });
      if (!passed) reasons.push(explainFailure());
    };

    // 1. Held-out delta between candidate and baseline composites.
    const baselineComposite = meanComposite(ctx.baselineArtifacts, ctx.judgeScores, options.holdoutScenarios);
    const candidateComposite = meanComposite(ctx.candidateArtifacts, ctx.judgeScores, options.holdoutScenarios);
    const delta = candidateComposite - baselineComposite;
    record(
      "heldout-delta",
      delta >= deltaThreshold,
      { baselineComposite, candidateComposite, delta, deltaThreshold },
      () => `heldout delta ${delta.toFixed(3)} < threshold ${deltaThreshold}`
    );

    // 2. Combined spend against the optional budget (no budget means pass).
    const withinBudget = options.budgetUsd === undefined || ctx.cost.candidate + ctx.cost.baseline <= options.budgetUsd;
    record(
      "budget",
      withinBudget,
      {
        candidateUsd: ctx.cost.candidate,
        baselineUsd: ctx.cost.baseline,
        budgetUsd: options.budgetUsd
      },
      () => `spend ${(ctx.cost.candidate + ctx.cost.baseline).toFixed(2)} > budget ${options.budgetUsd}`
    );

    // 3. Red-team probe over candidate artifacts, only when a battery is configured.
    const redTeam = options.redTeamBattery
      ? probeRedTeam(ctx.candidateArtifacts, options.redTeamBattery)
      : { passed: true, findings: [] };
    record(
      "red-team",
      redTeam.passed,
      { failures: redTeam.findings.length, sample: redTeam.findings.slice(0, 3) },
      () => `red-team probe failed (${redTeam.findings.length} findings)`
    );

    // 4. Reward-hacking detection; runs only with at least 10 recent runs.
    let rewardHackingReport = null;
    if (options.recentRuns && options.recentRuns.length >= 10) {
      rewardHackingReport = detectRewardHacking({ runs: options.recentRuns });
    }
    const gamingThreshold = 0.6;
    const gamingFindings = (rewardHackingReport?.findings ?? []).filter((finding) => finding.severity >= gamingThreshold);
    const flaggedAsGaming = gamingFindings.length > 0 || rewardHackingReport?.verdict === "gaming";
    const rewardHackingPass = !rewardHackingReport || !blockOnGaming || !flaggedAsGaming;
    record(
      "reward-hacking",
      rewardHackingPass,
      { report: rewardHackingReport, gamingFindingCount: gamingFindings.length },
      () => `reward-hacking detector flagged ${gamingFindings.length} gaming-severity findings (verdict=${rewardHackingReport.verdict})`
    );

    // 5. Canaries over the same recent runs; only "error" alerts block.
    let canaryReport = null;
    if (options.recentRuns && options.recentRuns.length >= 10) {
      canaryReport = runCanaries(options.recentRuns, {});
    }
    const errorAlerts = (canaryReport?.alerts ?? []).filter((alert) => alert.severity === "error");
    record(
      "canary",
      errorAlerts.length === 0,
      { totalAlerts: canaryReport?.alerts.length ?? 0, errorAlerts: errorAlerts.length },
      () => `canary error alerts: ${errorAlerts.length}`
    );

    const everythingPassed = contributingGates.every((check) => check.passed);
    return {
      decision: everythingPassed ? "ship" : "hold",
      reasons: reasons.length > 0 ? reasons : ["all gates passed"],
      contributingGates,
      delta
    };
  }

  return { name: "defaultProductionGate", decide };
}
275
/**
 * Mean of per-cell composite scores, restricted to the given scenarios.
 *
 * A cell's scenario id is the part of its cell id before the first ":". Each
 * matching cell contributes the average `composite` across its judges; cells
 * with no judge scores are ignored. Returns 0 when `artifacts` is missing or
 * empty, or when no cell matches.
 *
 * NOTE(review): `artifacts` is only used for the emptiness check — the scores
 * always come from `judgeScoresByCell`, so two calls sharing one score map
 * return the same value whenever both artifact maps are non-empty. Confirm
 * this is intended.
 *
 * @param {Map} artifacts - Artifact map; only its `size` is read.
 * @param {Iterable} judgeScoresByCell - `[cellId, scoresByJudge]` pairs.
 * @param {Array<{id: string}>} scenarios - Scenarios to include.
 * @returns {number}
 */
function meanComposite(artifacts, judgeScoresByCell, scenarios) {
  if (!artifacts || artifacts.size === 0) return 0;

  const wanted = new Set(scenarios.map((scenario) => scenario.id));
  let runningSum = 0;
  let cellCount = 0;
  for (const [cellId, scores] of judgeScoresByCell) {
    const [scenarioId = ""] = cellId.split(":");
    if (!wanted.has(scenarioId)) continue;

    const perJudge = Object.values(scores).map((score) => score.composite);
    if (perJudge.length === 0) continue;

    let cellSum = 0;
    for (const value of perJudge) cellSum += value;
    runningSum += cellSum / perJudge.length;
    cellCount += 1;
  }
  return cellCount === 0 ? 0 : runningSum / cellCount;
}
289
/**
 * Score every artifact's text against every red-team case.
 *
 * Artifacts with no extractable text are skipped. Each failing case yields
 * one finding carrying the case id and the scorer's reason (or a generic
 * message when the scorer gives none).
 *
 * @param {Iterable} artifacts - `[cellId, artifact]` pairs.
 * @param {Iterable} battery - Red-team cases; each has an `id`.
 * @returns {{passed: boolean, findings: Array<{scenarioId: string, reason: string}>}}
 */
function probeRedTeam(artifacts, battery) {
  const findings = [];
  for (const [, artifact] of artifacts) {
    const text = extractText(artifact);
    if (text === undefined) continue;

    for (const rtCase of battery) {
      const verdict = scoreRedTeamOutput(text, [], rtCase);
      if (verdict.passed) continue;
      findings.push({
        scenarioId: rtCase.id,
        reason: verdict.reason ?? "red-team probe failed"
      });
    }
  }
  const passed = findings.length === 0;
  return { passed, findings };
}
303
/**
 * Pull a text payload out of an artifact.
 *
 * A string artifact is returned as-is. For an object, the first of `text`,
 * `output`, `content` that holds a string is returned.
 *
 * @param {*} artifact
 * @returns {string|undefined} `undefined` when no text can be found.
 */
function extractText(artifact) {
  if (typeof artifact === "string") return artifact;
  if (!artifact || typeof artifact !== "object") return undefined;

  for (const key of ["text", "output", "content"]) {
    const candidate = artifact[key];
    if (typeof candidate === "string") return candidate;
  }
  return undefined;
}
313
+
314
+ // src/campaign/gates/heldout-gate.ts
315
/**
 * Gate that ships when the candidate beats the baseline on held-out scenarios
 * by at least `deltaThreshold` (default 0.5) and holds otherwise.
 *
 * @param {object} options - Reads `scenarios` (each with an `id`) and the
 *   optional `deltaThreshold`.
 * @returns {{name: string, decide: Function}}
 */
function heldOutGate(options) {
  const deltaThreshold = options.deltaThreshold ?? 0.5;

  async function decide(ctx) {
    const scenarioIds = new Set(options.scenarios.map((scenario) => scenario.id));
    const baseline = meanForScenarios(ctx.baselineArtifacts, ctx.judgeScores, scenarioIds);
    const candidate = meanForScenarios(ctx.candidateArtifacts, ctx.judgeScores, scenarioIds);
    const delta = candidate - baseline;
    const passed = delta >= deltaThreshold;

    let reason;
    if (passed) {
      reason = `held-out delta ${delta.toFixed(3)} \u2265 ${deltaThreshold}`;
    } else {
      reason = `held-out delta ${delta.toFixed(3)} < ${deltaThreshold}`;
    }

    return {
      decision: passed ? "ship" : "hold",
      reasons: [reason],
      contributingGates: [
        { name: "heldOutGate", passed, detail: { baseline, candidate, delta, deltaThreshold } }
      ],
      delta
    };
  }

  return { name: "heldOutGate", decide };
}
336
/**
 * Mean of per-cell composite scores over cells whose scenario id (the cell id
 * up to the first ":") is in `scenarioIds`.
 *
 * Each matching cell contributes the average `composite` across its judges;
 * cells without judge scores are ignored. Returns 0 when `artifacts` is
 * missing or empty, or when nothing matches.
 *
 * NOTE(review): `artifacts` is only checked for emptiness — scores are read
 * from `judgeScoresByCell` regardless of which artifact set is passed.
 * Confirm this is intended.
 *
 * @param {Map} artifacts - Artifact map; only its `size` is read.
 * @param {Iterable} judgeScoresByCell - `[cellId, scoresByJudge]` pairs.
 * @param {Set<string>} scenarioIds - Scenario ids to include.
 * @returns {number}
 */
function meanForScenarios(artifacts, judgeScoresByCell, scenarioIds) {
  if (!artifacts || artifacts.size === 0) return 0;

  let total = 0;
  let counted = 0;
  for (const [cellId, scores] of judgeScoresByCell) {
    const [scenarioId = ""] = cellId.split(":");
    if (!scenarioIds.has(scenarioId)) continue;

    const perJudge = Object.values(scores).map((score) => score.composite);
    if (perJudge.length === 0) continue;

    let cellSum = 0;
    for (const value of perJudge) cellSum += value;
    total += cellSum / perJudge.length;
    counted += 1;
  }
  if (counted === 0) return 0;
  return total / counted;
}
347
+
348
+ // src/campaign/labeled-store/fs-adapter.ts
349
+ import { createHash } from "crypto";
350
+ import { existsSync, mkdirSync, readFileSync, writeFileSync as writeFileSync2 } from "fs";
351
+ import { join as join2 } from "path";
352
/**
 * Error raised by the labeled-scenario store. `code` is a short
 * machine-readable identifier supplied by the thrower; `message` is the
 * human-readable text.
 */
var LabeledScenarioStoreError = class extends Error {
  code;

  /**
   * @param {string} code - Machine-readable error code.
   * @param {string} message - Human-readable description.
   */
  constructor(code, message) {
    super(message);
    this.code = code;
    this.name = "LabeledScenarioStoreError";
  }
};
360
/**
 * Filesystem-backed labeled-scenario store.
 *
 * Records are kept as JSON lines, one `<source>.jsonl` file per source under
 * `options.root` (created on construction if missing). Writes are validated
 * for provenance and optionally rate-limited per bucket; reads require an
 * explicit split and a `capturedBefore` cut-off.
 */
var FsLabeledScenarioStore = class {
  options;
  now;
  // bucket name -> { bucket, windowStartMs, count }
  rateLimits = /* @__PURE__ */ new Map();

  /**
   * @param {object} options - Reads `root`, and the optional `now` (clock
   *   returning milliseconds, default `Date.now`) and
   *   `maxWritesPerMinutePerBucket`.
   */
  constructor(options) {
    this.options = options;
    if (!existsSync(options.root)) mkdirSync(options.root, { recursive: true });
    this.now = options.now ?? Date.now;
  }

  /**
   * Validate and append one labeled scenario to its source file.
   * @throws {LabeledScenarioStoreError} On missing provenance or rate-limit breach.
   */
  async observe(write) {
    this.assertProvenance(write);
    this.assertRateLimit(write);
    const record = this.toRecord(write);
    const target = this.pathForSource(write.source);
    appendLine(target, `${JSON.stringify(record)}\n`);
  }

  /**
   * Read up to `args.count` matching records, ordered by `capturedAt` and then
   * `recordHash`.
   *
   * For the "train" split, "production-trace" records are only read when the
   * filter names that source explicitly. Lines that fail to parse as JSON are
   * skipped.
   *
   * @throws {LabeledScenarioStoreError} When `split` or `capturedBefore` is missing.
   */
  async sample(args) {
    if (!args.split) {
      throw new LabeledScenarioStoreError(
        "split_required",
        "sample() requires an explicit `split` (train | test) \u2014 substrate refuses ambiguous reads"
      );
    }
    if (!args.capturedBefore) {
      throw new LabeledScenarioStoreError(
        "capturedBefore_required",
        "sample() requires an explicit `capturedBefore` timestamp for temporal-split discipline"
      );
    }

    const matched = [];
    for (const source of ALL_SOURCES) {
      const guardedSource = args.split === "train" && source === "production-trace";
      if (guardedSource && !sourceFilterContains(args.filter?.source, "production-trace")) {
        continue;
      }

      const file = this.pathForSource(source);
      if (!existsSync(file)) continue;

      const rawLines = readFileSync(file, "utf8").split("\n").filter(Boolean);
      for (const raw of rawLines) {
        let record;
        try {
          record = JSON.parse(raw);
        } catch {
          continue;
        }
        if (matchesFilter(record, args, source)) matched.push(record);
      }
    }

    matched.sort((left, right) => {
      if (left.capturedAt === right.capturedAt) {
        return left.recordHash.localeCompare(right.recordHash);
      }
      return left.capturedAt.localeCompare(right.capturedAt);
    });
    return matched.slice(0, args.count);
  }

  /**
   * Count stored lines per source. `train` and `test` both report the overall
   * total.
   */
  async size() {
    const bySource = {};
    let total = 0;
    for (const source of ALL_SOURCES) {
      const file = this.pathForSource(source);
      let lineCount = 0;
      if (existsSync(file)) {
        lineCount = readFileSync(file, "utf8").split("\n").filter(Boolean).length;
      }
      bySource[source] = lineCount;
      total += lineCount;
    }
    return { train: total, test: total, bySource };
  }

  /**
   * Reject writes lacking provenance fields or naming an unknown source.
   * Checks run in order; the first failure throws.
   */
  assertProvenance(write) {
    const checks = [
      {
        failed: () => !write.source,
        code: "missing_source",
        message: "LabeledScenarioWrite requires `source`"
      },
      {
        failed: () => !write.sourceVersionHash || write.sourceVersionHash.length === 0,
        code: "missing_source_version",
        message: "LabeledScenarioWrite requires `sourceVersionHash` (git sha or substrate version)"
      },
      {
        failed: () => !write.capturedAt,
        code: "missing_captured_at",
        message: "LabeledScenarioWrite requires `capturedAt` ISO timestamp"
      },
      {
        failed: () => !write.redactionStatus,
        code: "missing_redaction_status",
        message: "LabeledScenarioWrite requires explicit `redactionStatus` \u2014 raw / redacted-pii / redacted-secrets / fully-redacted"
      },
      {
        failed: () => !ALL_SOURCES.includes(write.source),
        code: "unknown_source",
        message: `LabeledScenarioWrite.source must be one of: ${ALL_SOURCES.join(", ")}`
      }
    ];
    for (const check of checks) {
      if (check.failed()) throw new LabeledScenarioStoreError(check.code, check.message);
    }
  }

  /**
   * Enforce the per-bucket write cap over a 60-second window. No-op when no
   * cap is configured or the write carries no bucket.
   */
  assertRateLimit(write) {
    const cap = this.options.maxWritesPerMinutePerBucket;
    const bucket = write.rateLimitBucket;
    if (!cap || !bucket) return;

    const windowMs = 6e4;
    const nowMs = this.now();
    let state = this.rateLimits.get(bucket);
    const windowExpired = !state || nowMs - state.windowStartMs >= windowMs;
    if (windowExpired) {
      // Start a fresh window for this bucket.
      state = { bucket, windowStartMs: nowMs, count: 0 };
      this.rateLimits.set(bucket, state);
    }
    if (state.count >= cap) {
      throw new LabeledScenarioStoreError(
        "rate_limit_exceeded",
        `LabeledScenarioStore: bucket ${bucket} exceeded ${cap} writes/min`
      );
    }
    state.count += 1;
  }

  /**
   * Turn a write into a stored record: the write's own fields plus a
   * `recordHash` derived from scenario id, source, capture time and source
   * version, and `split` set to "train".
   */
  toRecord(write) {
    const identity = {
      id: write.scenario.id,
      src: write.source,
      at: write.capturedAt,
      ver: write.sourceVersionHash
    };
    const recordHash = sha256(JSON.stringify(identity));
    return { ...write, recordHash, split: "train" };
  }

  /** Path of the JSONL file that holds records for `source`. */
  pathForSource(source) {
    return join2(this.options.root, `${source}.jsonl`);
  }
};
501
// Every source a labeled scenario may come from; each one maps to its own
// `<source>.jsonl` file in the filesystem store.
var ALL_SOURCES = ["production-trace", "eval-run", "manual", "red-team", "synthetic"];
508
/**
 * Tell whether a source filter names `needle`.
 *
 * @param {string|string[]|undefined} filter - A single source, a list of
 *   sources, or nothing.
 * @param {string} needle - Source to look for.
 * @returns {boolean} `false` when no filter is given.
 */
function sourceFilterContains(filter, needle) {
  if (!filter) return false;
  return Array.isArray(filter) ? filter.includes(needle) : filter === needle;
}
513
/**
 * Decide whether a stored record belongs in a `sample()` result.
 *
 * Temporal split: "train" keeps records captured strictly before
 * `args.capturedBefore`, "test" keeps those captured at or after it. The
 * optional `args.filter` can further restrict by scenario `kind`, by `source`
 * (single value or list), and by the record's highest judge composite
 * (`minComposite` / `maxComposite`).
 *
 * Records are read back from disk, so missing fields are tolerated: a record
 * without `judgeScores` is treated as having no scores (highest composite 0),
 * and a record without `scenario` never matches a `kind` filter.
 *
 * @param {object} record - Parsed JSONL record.
 * @param {object} args - `sample()` arguments (`split`, `capturedBefore`, `filter`).
 * @param {string} source - Source whose file the record was read from.
 * @returns {boolean}
 */
function matchesFilter(record, args, source) {
  if (args.split === "train" && record.capturedAt >= args.capturedBefore) return false;
  if (args.split === "test" && record.capturedAt < args.capturedBefore) return false;

  const f = args.filter;
  if (!f) return true;

  if (f.kind && record.scenario?.kind !== f.kind) return false;

  if (f.source) {
    const sources = Array.isArray(f.source) ? f.source : [f.source];
    if (!sources.includes(source)) return false;
  }

  if (f.minComposite !== undefined || f.maxComposite !== undefined) {
    const composites = Object.values(record.judgeScores ?? {}).map((s) => s.composite);
    // No scores at all counts as a highest composite of 0.
    const max = composites.length === 0 ? 0 : Math.max(...composites);
    if (f.minComposite !== undefined && max < f.minComposite) return false;
    if (f.maxComposite !== undefined && max > f.maxComposite) return false;
  }
  return true;
}
531
/**
 * Short content hash: the first 16 hex characters of the SHA-256 digest of
 * `input`.
 *
 * @param {string} input
 * @returns {string} 16 lowercase hex characters.
 */
function sha256(input) {
  const hasher = createHash("sha256");
  hasher.update(input);
  const fullHex = hasher.digest("hex");
  return fullHex.slice(0, 16);
}
534
/**
 * Append `line` to the file at `path`, creating the file if it does not exist.
 *
 * The write uses the "a" (append) flag, so existing content is never read
 * back and rewritten: cost does not grow with file size, and content written
 * by someone else between a read and a write cannot be overwritten.
 *
 * @param {string} path - Target file.
 * @param {string} line - Text to append (callers include the trailing newline).
 */
function appendLine(path, line) {
  writeFileSync2(path, line, { flag: "a" });
}
542
+
543
+ // src/campaign/presets/run-eval.ts
544
/**
 * Evaluation preset: runs a campaign with the given options and resolves
 * with its result. Adds nothing on top of `runCampaign`.
 *
 * @param {object} opts - Passed through to `runCampaign` unchanged.
 * @returns {Promise<object>} The campaign result.
 */
async function runEval(opts) {
  const campaignResult = await runCampaign(opts);
  return campaignResult;
}
547
+
548
+ // src/campaign/presets/run-optimization.ts
549
+ import { createHash as createHash2 } from "crypto";
550
/**
 * Optimization preset: score the baseline surface, then for up to
 * `opts.maxGenerations` generations ask the driver for candidate surfaces,
 * run a campaign per candidate (one at a time), and keep track of the
 * best-scoring surface seen so far.
 *
 * Per generation the candidates are ranked by mean composite; the top
 * `promoteTopK` (default 2) are promoted and the best of them seeds the next
 * generation's `currentSurface`. The overall winner only changes when a
 * candidate scores strictly higher than the current winner. The loop ends
 * early when `opts.driver.decide` exists and reports `stop`.
 *
 * @param {object} opts - Campaign options plus `baselineSurface`,
 *   `dispatchWithSurface`, `driver`, `maxGenerations`, `populationSize`,
 *   `runDir`, and the optional `promoteTopK`, `report`, `labeledStore`,
 *   `maxImprovementShots`.
 * @returns {Promise<{generations: Array, winnerSurface: *, winnerSurfaceHash: string, baselineCampaign: object}>}
 */
async function runOptimization(opts) {
  const promoteTopK = opts.promoteTopK ?? 2;

  // Run one campaign with every dispatch routed through the given surface.
  const evaluateSurface = (surface, runDir) => runCampaign({
    ...opts,
    dispatch: (scenario, ctx) => opts.dispatchWithSurface(surface, scenario, ctx),
    runDir
  });

  const baselineCampaign = await evaluateSurface(opts.baselineSurface, `${opts.runDir}/baseline`);

  const generations = [];
  const history = [];
  let currentSurfaces = [opts.baselineSurface];
  const winner = {
    surface: opts.baselineSurface,
    hash: surfaceHash(opts.baselineSurface),
    composite: meanComposite2(baselineCampaign)
  };

  for (let gen = 0; gen < opts.maxGenerations; gen++) {
    if (opts.driver.decide?.({ history }).stop) break;

    const candidates = await opts.driver.propose({
      currentSurface: currentSurfaces[0] ?? opts.baselineSurface,
      history,
      findings: [],
      populationSize: opts.populationSize,
      generation: gen,
      signal: new AbortController().signal,
      report: opts.report,
      dataset: opts.labeledStore && opts.labeledStore !== "off" ? opts.labeledStore : undefined,
      maxImprovementShots: opts.maxImprovementShots
    });

    const scored = [];
    for (let index = 0; index < candidates.length; index++) {
      const surface = candidates[index];
      const hash = surfaceHash(surface);
      const campaign = await evaluateSurface(surface, `${opts.runDir}/gen-${gen}/candidate-${index}`);
      scored.push({ surfaceHash: hash, surface, campaign, composite: meanComposite2(campaign) });
    }

    // Best composite first.
    scored.sort((left, right) => right.composite - left.composite);
    const promoted = scored.slice(0, promoteTopK);
    currentSurfaces = promoted.map((entry) => entry.surface);

    const [best] = scored;
    if (best && best.composite > winner.composite) {
      winner.surface = best.surface;
      winner.hash = best.surfaceHash;
      winner.composite = best.composite;
    }

    const record = {
      generationIndex: gen,
      candidates: scored.map((entry) => ({
        surfaceHash: entry.surfaceHash,
        composite: entry.composite,
        // Both bounds are the point estimate; no interval is computed here.
        ci95: [entry.composite, entry.composite]
      })),
      promoted: promoted.map((entry) => entry.surfaceHash)
    };
    history.push(record);
    generations.push({
      record,
      surfaces: scored.map((entry) => ({
        surfaceHash: entry.surfaceHash,
        surface: entry.surface,
        campaign: entry.campaign
      }))
    });
  }

  return {
    generations,
    winnerSurface: winner.surface,
    winnerSurfaceHash: winner.hash,
    baselineCampaign
  };
}
623
/**
 * Short stable identifier for a surface: the first 16 hex characters of a
 * SHA-256 digest.
 *
 * A string surface is hashed directly. Any other surface is hashed through
 * the JSON of its `kind`, `worktreeRef` and `baseRef` (a missing `baseRef`
 * is hashed as `null`).
 *
 * @param {string|object} surface
 * @returns {string} 16 lowercase hex characters.
 */
function surfaceHash(surface) {
  let material;
  if (typeof surface === "string") {
    material = surface;
  } else {
    const { kind, worktreeRef } = surface;
    material = JSON.stringify({ kind, worktreeRef, baseRef: surface.baseRef ?? null });
  }
  const hasher = createHash2("sha256");
  hasher.update(material);
  return hasher.digest("hex").slice(0, 16);
}
631
/**
 * Mean composite score of a campaign.
 *
 * Each cell contributes the average `composite` across its judges; cells
 * without judge scores are ignored. Returns 0 when no cell has scores.
 *
 * @param {{cells: Iterable<{judgeScores: object}>}} campaign
 * @returns {number}
 */
function meanComposite2(campaign) {
  let total = 0;
  let scoredCells = 0;
  for (const cell of campaign.cells) {
    const perJudge = Object.values(cell.judgeScores).map((score) => score.composite);
    if (perJudge.length === 0) continue;

    let cellSum = 0;
    for (const value of perJudge) cellSum += value;
    total += cellSum / perJudge.length;
    scoredCells += 1;
  }
  if (scoredCells === 0) return 0;
  return total / scoredCells;
}
641
+
642
+ // src/campaign/presets/run-improvement-loop.ts
643
/**
 * Improvement-loop preset: optimize, re-score baseline and winner on the
 * held-out scenarios, ask the gate for a verdict, and optionally open a PR.
 *
 * Steps, in order:
 *   1. Validate options (see throws).
 *   2. Run `runOptimization(opts)`.
 *   3. Run the baseline surface, then the winning surface, on
 *      `opts.holdoutScenarios`.
 *   4. Hand both artifact maps, the merged judge scores and both costs to
 *      `opts.gate.decide`.
 *   5. When `autoOnPromote` is "pr" and the gate ships, open a PR via
 *      `openAutoPr`.
 *
 * @param {object} opts - Optimization options plus `holdoutScenarios`, `gate`,
 *   and the optional `autoOnPromote`, `tracing`, `ghOwner`, `ghRepo`,
 *   `renderPromotedDiff`.
 * @returns {Promise<object>} The optimization result extended with
 *   `baselineOnHoldout`, `winnerOnHoldout`, `gateResult` and `prResult`
 *   (undefined when no PR was attempted).
 * @throws {Error} When `autoOnPromote` is "config", when tracing is "off"
 *   while a driver is set, or when `autoOnPromote` is "pr" without both
 *   `ghOwner` and `ghRepo`.
 */
async function runImprovementLoop(opts) {
  const promoteMode = opts.autoOnPromote;
  if (promoteMode === "config") {
    throw new Error(
      "runImprovementLoop: autoOnPromote='config' is deferred to Pass B (requires shadow deploy + rollback + ensemble judges). Use 'pr' or 'none' in v0.40."
    );
  }
  if (opts.tracing === "off" && opts.driver) {
    throw new Error(
      "runImprovementLoop: tracing='off' is forbidden when a driver is wired. The improvement loop without traces is unattributable; candidate surfaces cannot be cited back to spans and the optimization dataset goes unfed."
    );
  }
  if (promoteMode === "pr" && !(opts.ghOwner && opts.ghRepo)) {
    throw new Error("runImprovementLoop: autoOnPromote='pr' requires ghOwner + ghRepo.");
  }

  const optimization = await runOptimization(opts);
  const { runCampaign: runCampaign2 } = await import("../run-campaign-JYJXYHHL.js");

  // Run the held-out scenarios with every dispatch routed through one surface.
  const runOnHoldout = (surface, dirName) => runCampaign2({
    ...opts,
    scenarios: opts.holdoutScenarios,
    dispatch: (scenario, ctx) => opts.dispatchWithSurface(surface, scenario, ctx),
    runDir: `${opts.runDir}/${dirName}`
  });
  const baselineOnHoldout = await runOnHoldout(opts.baselineSurface, "holdout-baseline");
  const winnerOnHoldout = await runOnHoldout(optimization.winnerSurface, "holdout-winner");

  const candidateArtifacts = /* @__PURE__ */ new Map();
  const baselineArtifacts = /* @__PURE__ */ new Map();
  const judgeScores = /* @__PURE__ */ new Map();
  for (const { cellId, artifact, judgeScores: cellScores } of winnerOnHoldout.cells) {
    candidateArtifacts.set(cellId, artifact);
    judgeScores.set(cellId, cellScores);
  }
  for (const { cellId, artifact, judgeScores: cellScores } of baselineOnHoldout.cells) {
    baselineArtifacts.set(cellId, artifact);
    // Scores are merged per cell id; on a shared cell id and judge name the
    // baseline entry replaces the winner's.
    const existing = judgeScores.get(cellId) ?? {};
    judgeScores.set(cellId, { ...existing, ...cellScores });
  }

  const gateResult = await opts.gate.decide({
    candidateArtifacts,
    baselineArtifacts,
    judgeScores,
    scenarios: opts.holdoutScenarios,
    cost: {
      candidate: winnerOnHoldout.aggregates.totalCostUsd,
      baseline: baselineOnHoldout.aggregates.totalCostUsd
    },
    signal: new AbortController().signal
  });

  let prResult;
  const shouldOpenPr = promoteMode === "pr" && gateResult.decision === "ship";
  if (shouldOpenPr) {
    const renderDiff = opts.renderPromotedDiff ?? defaultRenderDiff;
    prResult = openAutoPr({
      result: winnerOnHoldout,
      gate: gateResult,
      promotedDiff: renderDiff(optimization.winnerSurface, opts.baselineSurface),
      ghOwner: opts.ghOwner,
      ghRepo: opts.ghRepo
    });
  }

  return { ...optimization, baselineOnHoldout, winnerOnHoldout, gateResult, prResult };
}
714
/**
 * Default rendering of the promoted change for the PR body.
 *
 * Two string surfaces are shown as a whole-text replacement: every baseline
 * line prefixed with "- ", then every winner line prefixed with "+ ". When
 * either surface is not a string, each side is summarised instead — a string
 * side as "(prompt surface)", an object side by its `worktreeRef`, optional
 * `baseRef` and optional `summary`.
 *
 * @param {string|object} winnerSurface
 * @param {string|object} baselineSurface
 * @returns {string}
 */
function defaultRenderDiff(winnerSurface, baselineSurface) {
  const bothAreText = typeof winnerSurface === "string" && typeof baselineSurface === "string";

  if (bothAreText) {
    const removed = baselineSurface.split("\n").map((line) => `- ${line}`);
    const added = winnerSurface.split("\n").map((line) => `+ ${line}`);
    return ["--- baseline", "+++ winner", ...removed, ...added].join("\n");
  }

  const describe = (surface) => {
    if (typeof surface === "string") return "(prompt surface)";
    const basePart = surface.baseRef ? ` base=${surface.baseRef}` : "";
    const summaryPart = surface.summary ? `\n${surface.summary}` : "";
    return `worktree=${surface.worktreeRef}${basePart}${summaryPart}`;
  };
  return `--- baseline\n${describe(baselineSurface)}\n+++ winner\n${describe(winnerSurface)}`;
}
730
+
731
+ // src/campaign/worktree/index.ts
732
+ import { execFileSync } from "child_process";
733
+ import { existsSync as existsSync2 } from "fs";
734
+ import { basename, isAbsolute, join as join3 } from "path";
735
/**
 * Error raised by the git worktree adapter. `cause` holds the underlying
 * error (if any) that triggered the failure.
 */
var WorktreeAdapterError = class extends Error {
  cause;

  /**
   * @param {string} message - Human-readable description.
   * @param {*} [cause] - Underlying error.
   */
  constructor(message, cause) {
    super(message);
    this.cause = cause;
    this.name = "WorktreeAdapterError";
  }
};
743
/**
 * Default git runner: executes `git <args>` in `cwd` without a shell and
 * returns its trimmed stdout.
 *
 * @param {string[]} args - Arguments passed to git.
 * @param {string} cwd - Working directory for the command.
 * @returns {string} Trimmed standard output.
 * @throws {WorktreeAdapterError} On any failure; the message carries the
 *   command's stderr when the thrown error has one, otherwise the error's
 *   string form. The original error is attached as `cause`.
 */
function defaultGit(args, cwd) {
  try {
    const output = execFileSync("git", args, { cwd, encoding: "utf8" });
    return output.trim();
  } catch (err) {
    let stderr = "";
    if (err && typeof err === "object" && "stderr" in err) {
      stderr = String(err.stderr);
    }
    const detail = stderr || String(err);
    throw new WorktreeAdapterError(`git ${args.join(" ")} failed: ${detail}`, err);
  }
}
751
/**
 * Turn a free-form label into a short identifier fragment.
 *
 * Lower-cases the label, collapses every run of characters outside `a-z0-9`
 * into a single "-", strips leading and trailing dashes, and keeps at most 48
 * characters. Falls back to "candidate" when nothing is left.
 *
 * @param {string} label
 * @returns {string}
 */
function slug(label) {
  const dashed = label.toLowerCase().replace(/[^a-z0-9]+/g, "-");
  const trimmed = dashed.replace(/^-+|-+$/g, "");
  const clipped = trimmed.slice(0, 48);
  return clipped === "" ? "candidate" : clipped;
}
754
/**
 * Build an adapter that manages candidate code surfaces as git worktrees.
 *
 * - `create` adds a worktree on a new branch `<branchPrefix>/<id>` under the
 *   worktree directory, where `<id>` combines the slugged label, the current
 *   time and a short random suffix.
 * - `finalize` commits all pending changes (when `git status --porcelain`
 *   reports any) with `summary` as the message and returns a "code" surface.
 * - `discard` force-removes the worktree, then deletes its branch.
 *
 * @param {object} opts - Reads `repoRoot`, and the optional `git` runner
 *   `(args, cwd) => stdout`, `worktreeDir` (default `<repoRoot>/.worktrees`)
 *   and `branchPrefix` (default "improve").
 * @returns {{create: Function, finalize: Function, discard: Function}}
 */
function gitWorktreeAdapter(opts) {
  const runGit = opts.git ?? defaultGit;
  const baseDir = opts.worktreeDir ?? join3(opts.repoRoot, ".worktrees");
  const prefix = opts.branchPrefix ?? "improve";

  const create = async ({ baseRef, label }) => {
    const id = `${slug(label)}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 6)}`;
    const branch = `${prefix}/${id}`;
    const path = join3(baseDir, id);
    runGit(["worktree", "add", "-b", branch, path, baseRef], opts.repoRoot);
    return { path, branch, baseRef };
  };

  const finalize = async (worktree, summary) => {
    const pending = runGit(["status", "--porcelain"], worktree.path);
    const hasChanges = pending.length > 0;
    if (hasChanges) {
      runGit(["add", "-A"], worktree.path);
      runGit(["commit", "-m", summary], worktree.path);
    }
    return {
      kind: "code",
      worktreeRef: worktree.path,
      baseRef: worktree.baseRef,
      summary
    };
  };

  const discard = async (worktree) => {
    runGit(["worktree", "remove", "--force", worktree.path], opts.repoRoot);
    runGit(["branch", "-D", worktree.branch], opts.repoRoot);
  };

  return { create, finalize, discard };
}
785
/**
 * Resolve the directory of a code surface's worktree.
 *
 * An absolute `worktreeRef` that exists on disk is used as-is. Otherwise,
 * when `worktreeDir` is given, the ref's base name is looked up inside it.
 * With neither, the ref is returned unchanged.
 *
 * @param {{worktreeRef: string}} surface
 * @param {string} [worktreeDir]
 * @returns {string}
 */
function resolveWorktreePath(surface, worktreeDir) {
  const ref = surface.worktreeRef;
  const usableAsIs = isAbsolute(ref) && existsSync2(ref);
  if (usableAsIs) return ref;
  return worktreeDir ? join3(worktreeDir, basename(ref)) : ref;
}
790
+ export {
791
+ FsLabeledScenarioStore,
792
+ LabeledScenarioStoreError,
793
+ WorktreeAdapterError,
794
+ composeGate,
795
+ defaultProductionGate,
796
+ evolutionaryDriver,
797
+ gitWorktreeAdapter,
798
+ heldOutGate,
799
+ openAutoPr,
800
+ resolveWorktreePath,
801
+ runCampaign,
802
+ runEval,
803
+ runImprovementLoop,
804
+ runOptimization,
805
+ surfaceHash
806
+ };
807
+ //# sourceMappingURL=index.js.map