@tangle-network/agent-eval 0.62.0 → 0.63.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,7 @@
1
1
  import {
2
2
  buildLoopProvenanceRecord,
3
+ campaignBreakdown,
4
+ campaignMeanComposite,
3
5
  composeGate,
4
6
  countSentenceEdits,
5
7
  defaultProductionGate,
@@ -20,7 +22,7 @@ import {
20
22
  runOptimization,
21
23
  surfaceContentHash,
22
24
  surfaceHash
23
- } from "../chunk-CV2BS2OV.js";
25
+ } from "../chunk-Z7ZU7IYZ.js";
24
26
  import {
25
27
  fsCampaignStorage,
26
28
  inMemoryCampaignStorage,
@@ -29,7 +31,7 @@ import {
29
31
  import {
30
32
  agentProfileHash
31
33
  } from "../chunk-PQV2TKC3.js";
32
- import "../chunk-SS2SOBBT.js";
34
+ import "../chunk-4ODZXQV2.js";
33
35
  import {
34
36
  assertRealBackend,
35
37
  summarizeBackendIntegrity
@@ -38,16 +40,253 @@ import "../chunk-YV7J7X5N.js";
38
40
  import {
39
41
  validateRunRecord
40
42
  } from "../chunk-F3SRAAZO.js";
41
- import "../chunk-ITBRCT73.js";
43
+ import {
44
+ pairedBootstrap
45
+ } from "../chunk-ITBRCT73.js";
42
46
  import "../chunk-GGE4NNQT.js";
43
47
  import "../chunk-VSMTAMNK.js";
44
- import "../chunk-IHDHUN2X.js";
48
+ import {
49
+ callLlm
50
+ } from "../chunk-IHDHUN2X.js";
45
51
  import "../chunk-PC4UYEBM.js";
46
52
  import {
47
53
  AgentEvalError
48
54
  } from "../chunk-3BFEG2F6.js";
49
55
  import "../chunk-PZ5AY32C.js";
50
56
 
57
+ // src/campaign/skill-patch.ts
58
/**
 * Apply a bundle of anchored, line-level edits to a skill document.
 *
 * The document is split on "\n" and the ops are applied in order, each one
 * against the result of the previous ops. An anchor selects the FIRST line
 * that contains it as a substring.
 *
 * Ops that cannot be applied are collected in `rejected` instead of throwing:
 * empty add text, a missing/empty/non-string anchor, an anchor that matches no
 * line, a replace without text, or an unrecognized op kind.
 *
 * @param {string} surface - Current skill document text.
 * @param {{ops: Array<object>}} patch - Ops of shape
 *   `{op:"add", text, after?}`, `{op:"delete", anchor}` or
 *   `{op:"replace", anchor, text}`.
 * @returns {{surface: string, applied: number, rejected: Array<{op: *, reason: string}>}}
 *   The patched text, the number of ops applied, and the ops that were skipped.
 */
function applySkillPatch(surface, patch) {
  let lines = surface.split("\n");
  let applied = 0;
  const rejected = [];
  const findLine = (anchor) => lines.findIndex((l) => l.includes(anchor));
  // An empty string is a substring of every line, so it would silently target
  // line 0; a non-string would be coerced by `includes`. Neither is a real anchor.
  const anchorProblem = (anchor) => {
    if (typeof anchor !== "string") return "missing";
    return anchor === "" ? "empty" : null;
  };
  for (const op of patch.ops) {
    if (typeof op !== "object" || op === null) {
      rejected.push({ op, reason: "malformed op" });
      continue;
    }
    if (op.op === "add") {
      if (typeof op.text !== "string" || op.text.trim() === "") {
        rejected.push({ op, reason: "empty add text" });
        continue;
      }
      const insert = op.text.split("\n");
      // No anchor (omitted, null or "") means "append to the end".
      if (op.after == null || op.after === "") {
        lines = [...lines, ...insert];
        applied++;
        continue;
      }
      if (typeof op.after !== "string") {
        rejected.push({ op, reason: "add anchor must be a string" });
        continue;
      }
      const idx = findLine(op.after);
      if (idx === -1) {
        rejected.push({ op, reason: `add anchor not found: ${truncate(op.after)}` });
        continue;
      }
      lines = [...lines.slice(0, idx + 1), ...insert, ...lines.slice(idx + 1)];
      applied++;
    } else if (op.op === "delete" || op.op === "replace") {
      const problem = anchorProblem(op.anchor);
      if (problem !== null) {
        rejected.push({ op, reason: `${op.op} anchor ${problem}` });
        continue;
      }
      const idx = findLine(op.anchor);
      if (idx === -1) {
        rejected.push({ op, reason: `${op.op} anchor not found: ${truncate(op.anchor)}` });
        continue;
      }
      if (op.op === "delete") {
        lines = [...lines.slice(0, idx), ...lines.slice(idx + 1)];
        applied++;
        continue;
      }
      if (typeof op.text !== "string") {
        rejected.push({ op, reason: "replace text missing" });
        continue;
      }
      lines = [...lines.slice(0, idx), ...op.text.split("\n"), ...lines.slice(idx + 1)];
      applied++;
    } else {
      // Previously any unrecognized kind fell through to the replace branch.
      rejected.push({ op, reason: `unknown op: ${String(op.op)}` });
    }
  }
  return { surface: lines.join("\n"), applied, rejected };
}
106
/**
 * Count the edits in a patch.
 *
 * @param {{ops: Array<object>}} patch
 * @returns {number} Number of ops the patch carries.
 */
function patchEditCount(patch) {
  const { ops } = patch;
  return ops.length;
}
109
/**
 * Shorten a string for use in a rejection reason.
 *
 * @param {string} s - Text to shorten.
 * @param {number} [max=48] - Maximum number of UTF-16 code units kept.
 * @returns {string} `s` unchanged when it fits, otherwise its first `max`
 *   code units followed by an ellipsis character.
 */
function truncate(s, max = 48) {
  if (s.length > max) {
    return `${s.slice(0, max)}\u2026`;
  }
  return s;
}
112
+
113
+ // src/campaign/drivers/skill-opt.ts
114
// System prompt sent as the `system` message by skillOptDriver's proposePatches.
// It restricts the model to a JSON object `{"patches":[...]}` whose ops are the
// three kinds (add / delete / replace) that normalizeOp accepts.
+ var SKILLOPT_SYSTEM = 'You are a SkillOpt optimizer. You improve ONE skill document by proposing BOUNDED, anchored edits \u2014 never a full rewrite. Output ONLY a JSON object of shape {"patches":[{"label":string,"rationale":string,"ops":[op,...]}]} where each op is one of: {"op":"add","after":<exact substring of an existing line, or omit to append>,"text":<new line(s)>}, {"op":"delete","anchor":<exact substring of the line to remove>}, {"op":"replace","anchor":<exact substring of the line to replace>,"text":<replacement line(s)>}. Anchors MUST be verbatim substrings of lines that exist in the document. No prose outside JSON.';
115
/**
 * Create the "skill-opt" driver, which asks an LLM for small anchored patches
 * to a text skill document.
 *
 * @param {object} opts - Reads `target`, `model`, `llm`, and the optional
 *   `evidenceK` (default 3), `editBudget` (default 3), `temperature`
 *   (default 0.6) and `maxTokens` (default 4000).
 * @returns {{kind: "skill-opt", proposePatches: Function, propose: Function}}
 *   `proposePatches(args)` resolves to parsed patches; `propose(ctx)` resolves
 *   to distinct candidate surfaces built from those patches.
 */
function skillOptDriver(opts) {
  const evidenceK = opts.evidenceK ?? 3;
  const defaultBudget = opts.editBudget ?? 3;

  // One LLM round-trip: build the user prompt, call the model in JSON mode,
  // then parse and clamp the response to `count` patches of `editBudget` ops.
  const proposePatches = async (args) => {
    const { surface, evidence, editBudget, rejectedBuffer, metaNote, count } = args;
    const userPrompt = buildPatchPrompt({
      target: opts.target,
      surface,
      evidence,
      editBudget,
      rejectedBuffer,
      metaNote,
      count
    });
    const request = {
      model: opts.model,
      messages: [
        { role: "system", content: SKILLOPT_SYSTEM },
        { role: "user", content: userPrompt }
      ],
      jsonMode: true,
      temperature: opts.temperature ?? 0.6,
      maxTokens: opts.maxTokens ?? 4e3
    };
    const result = await callLlm(request, opts.llm);
    return parseSkillPatchResponse(result.content, args.count, args.editBudget);
  };

  return {
    kind: "skill-opt",
    proposePatches,
    async propose(ctx) {
      const surface = ctx.currentSurface;
      if (typeof surface !== "string") {
        throw new Error(
          "skillOptDriver: surface must be a string skill document (got a CodeSurface). SkillOpt patches text."
        );
      }
      const patches = await proposePatches({
        surface,
        evidence: evidenceFromHistory(ctx, evidenceK),
        editBudget: defaultBudget,
        rejectedBuffer: [],
        count: ctx.populationSize,
        signal: ctx.signal
      });
      const candidates = [];
      const seenSurfaces = /* @__PURE__ */ new Set();
      for (let i = 0; i < patches.length; i++) {
        const patch = patches[i];
        const patched = applySkillPatch(surface, patch);
        // Keep only patches that applied, changed the text, and are not duplicates.
        const isNovel = patched.applied !== 0 && patched.surface !== surface && !seenSurfaces.has(patched.surface);
        if (!isNovel) continue;
        seenSurfaces.add(patched.surface);
        candidates.push({ surface: patched.surface, label: patch.label, rationale: patch.rationale });
        if (candidates.length >= ctx.populationSize) break;
      }
      return candidates;
    }
  };
}
174
/**
 * Derive proposal evidence from the most recent history entry.
 *
 * Takes the highest-composite candidate of the last entry and returns its `k`
 * lowest-scoring scenarios and `k` lowest-scoring dimensions.
 *
 * @param {{history: Array<{candidates: Array<object>}>}} ctx
 * @param {number} k - Maximum number of weak scenarios / dimensions to return.
 * @returns {{weakScenarios: Array<object>, weakDimensions: Array<{dimension: string, score: number}>}}
 *   Both lists are empty when there is no history or no candidates.
 */
function evidenceFromHistory(ctx, k) {
  const noEvidence = () => ({ weakScenarios: [], weakDimensions: [] });
  const latest = ctx.history.at(-1);
  if (!latest || latest.candidates.length === 0) {
    return noEvidence();
  }
  const ranked = [...latest.candidates].sort((a, b) => b.composite - a.composite);
  const best = ranked[0];
  if (!best) {
    return noEvidence();
  }
  const scenariosAscending = [...best.scenarios].sort((a, b) => a.composite - b.composite);
  const dimensionsAscending = Object.entries(best.dimensions).sort((a, b) => a[1] - b[1]);
  return {
    weakScenarios: scenariosAscending.slice(0, k),
    weakDimensions: dimensionsAscending.slice(0, k).map(([dimension, score]) => ({ dimension, score }))
  };
}
183
/**
 * Render the user prompt for one patch-proposal request.
 *
 * Always emits the target, the fenced current document and the patch
 * instructions; then appends the weak-scenario, weak-dimension,
 * rejected-edit and strategy-note sections only when they have content.
 *
 * @param {object} args - Reads `target`, `surface`, `count`, `editBudget`,
 *   `evidence.weakScenarios`, `evidence.weakDimensions`, `rejectedBuffer`
 *   and the optional `metaNote`.
 * @returns {string} Prompt text joined with "\n".
 */
function buildPatchPrompt(args) {
  const { evidence, rejectedBuffer } = args;
  const out = [];
  out.push(
    `Skill document governs: ${args.target}.`,
    "",
    "Current skill document:",
    "```",
    args.surface,
    "```",
    ""
  );
  out.push(
    `Propose ${args.count} candidate patch(es). Each patch is a SMALL bundle of`,
    `at most ${args.editBudget} op(s). Anchors must be verbatim substrings of`,
    "existing lines. Prefer adding a specific missing rule or sharpening a vague",
    "one over deleting; never rewrite the whole document."
  );
  // Optional sections share one shape: blank line, heading, one bullet per item.
  const appendSection = (heading, items, toBullet) => {
    if (items.length === 0) return;
    out.push("", heading, ...items.map(toBullet));
  };
  appendSection(
    "Weakest scenarios (patch to fix these):",
    evidence.weakScenarios,
    (s) => `- ${s.scenarioId} (${s.composite.toFixed(2)})`
  );
  appendSection(
    "Weakest dimensions (what to improve):",
    evidence.weakDimensions,
    (d) => `- ${d.dimension} (${d.score.toFixed(2)})`
  );
  appendSection(
    "Already tried and REJECTED (do not repeat or restate these edits):",
    rejectedBuffer,
    (e) => `- ${e.label}: ${e.rationale} \u2014 ${e.reason}`
  );
  if (args.metaNote) {
    out.push("", `Strategy note from prior epochs: ${args.metaNote}`);
  }
  return out.join("\n");
}
223
/**
 * Error thrown by parseSkillPatchResponse when the model response contains no
 * JSON object or the extracted text fails JSON.parse.
 */
class SkillPatchParseError extends Error {
  name = "SkillPatchParseError";

  /** @param {string} message - Human-readable description of the parse failure. */
  constructor(message) {
    super(message);
  }
}
229
/**
 * Parse an LLM response into a bounded list of skill patches.
 *
 * Strips a leading/trailing Markdown code fence, takes the text between the
 * first "{" and the last "}", parses it as JSON and reads its `patches` array.
 * Patches with no valid ops are dropped; each kept patch has its ops clamped
 * to `editBudget`, and collection stops once `maxPatches` patches are kept.
 *
 * @param {string} raw - Raw model output.
 * @param {number} maxPatches - Stop after this many patches have been kept.
 * @param {number} editBudget - Maximum ops retained per patch.
 * @returns {Array<{label: string, rationale: string, ops: Array<object>}>}
 * @throws {SkillPatchParseError} When no JSON object is found or parsing fails.
 */
function parseSkillPatchResponse(raw, maxPatches, editBudget) {
  let text = raw.trim();
  if (text.startsWith("```")) {
    const withoutOpeningFence = text.replace(/^```(?:json)?\n?/, "");
    text = withoutOpeningFence.replace(/\n?```$/, "");
  }
  const start = text.indexOf("{");
  const end = text.lastIndexOf("}");
  const hasObject = start >= 0 && end > start;
  if (!hasObject) {
    throw new SkillPatchParseError(
      `parseSkillPatchResponse: response was not valid JSON (no object found): ${snippet(raw)}`
    );
  }
  const jsonText = text.slice(start, end + 1);
  let parsed;
  try {
    parsed = JSON.parse(jsonText);
  } catch (err) {
    throw new SkillPatchParseError(
      `parseSkillPatchResponse: response was not valid JSON (${err instanceof Error ? err.message : String(err)}): ${snippet(raw)}`
    );
  }
  const patches = [];
  const entries = Array.isArray(parsed.patches) ? parsed.patches : [];
  for (const entry of entries) {
    if (entry === null || typeof entry !== "object") continue;
    const ops = [];
    if (Array.isArray(entry.ops)) {
      for (const rawOp of entry.ops) {
        const op = normalizeOp(rawOp);
        if (isOp(op)) ops.push(op);
      }
    }
    if (ops.length === 0) continue;
    const label = typeof entry.label === "string" ? entry.label : "patch";
    const rationale = typeof entry.rationale === "string" ? entry.rationale : "";
    patches.push({ label, rationale, ops: ops.slice(0, editBudget) });
    if (patches.length >= maxPatches) break;
  }
  return patches;
}
263
/**
 * Validate one raw op from the model and copy only its recognized fields.
 *
 * @param {*} raw - Candidate op from parsed JSON.
 * @returns {object|null} `{op:"add", text, after?}`, `{op:"delete", anchor}`
 *   or `{op:"replace", anchor, text}`; `null` when `raw` is not an object, the
 *   op kind is unknown, or a required string field is missing.
 */
function normalizeOp(raw) {
  if (raw === null || typeof raw !== "object") {
    return null;
  }
  const isString = (value) => typeof value === "string";
  switch (raw.op) {
    case "add": {
      if (!isString(raw.text)) return null;
      // `after` is optional; a non-string value is dropped rather than rejected.
      return isString(raw.after)
        ? { op: "add", text: raw.text, after: raw.after }
        : { op: "add", text: raw.text };
    }
    case "delete":
      return isString(raw.anchor) ? { op: "delete", anchor: raw.anchor } : null;
    case "replace":
      return isString(raw.anchor) && isString(raw.text)
        ? { op: "replace", anchor: raw.anchor, text: raw.text }
        : null;
    default:
      return null;
  }
}
282
/**
 * Filter predicate: true for every value except `null` (the "rejected" marker
 * returned by normalizeOp).
 *
 * @param {*} op
 * @returns {boolean}
 */
function isOp(op) {
  const wasRejected = op === null;
  return !wasRejected;
}
285
/**
 * Produce a short single-line preview of a string for error messages.
 *
 * @param {string} s - Text to preview.
 * @param {number} [max=120] - Maximum number of UTF-16 code units kept.
 * @returns {string} `s` trimmed with whitespace runs collapsed to one space,
 *   cut to `max` code units plus an ellipsis character when longer.
 */
function snippet(s, max = 120) {
  const flattened = s.trim().replace(/\s+/g, " ");
  if (flattened.length > max) {
    return `${flattened.slice(0, max)}\u2026`;
  }
  return flattened;
}
289
+
51
290
  // src/campaign/labeled-store/fs-adapter.ts
52
291
  import { createHash } from "crypto";
53
292
  import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
@@ -259,6 +498,339 @@ function appendLine(path, line) {
259
498
  }
260
499
  }
261
500
 
501
+ // src/campaign/presets/run-skill-opt.ts
502
/**
 * Run the SkillOpt loop: repeatedly ask a driver for small patches to a skill
 * document and keep a patch only when it improves the held-out score.
 *
 * Flow, as implemented below:
 *  1. Validate inputs (non-empty train/holdout sets, at least one judge,
 *     disjoint scenario ids, non-negative `minImprovement`).
 *  2. Score the baseline on the train set (proposal evidence) and on the
 *     holdout set (acceptance axis).
 *  3. For each epoch, request `patchesPerEpoch` patches, apply each one to the
 *     current surface and score it on holdout. The FIRST candidate whose
 *     holdout score exceeds `current + minImprovement` is accepted and train
 *     evidence is refreshed; remaining candidates of that epoch are not scored.
 *  4. Stop after `maxEpochs`, or once `patience` consecutive epochs accepted
 *     nothing.
 *
 * @param {object} opts - Reads `baselineSurface`, `trainScenarios`,
 *   `holdoutScenarios`, `judges`, `driver`, `dispatchWithSurface`, `runDir`,
 *   `maxEpochs`, and the optional `patchesPerEpoch` (2), `editBudget` (3),
 *   `minImprovement` (0), `patience` (`maxEpochs`), `budgetAnneal` (true),
 *   `rejectedBufferSize` (12), `slowMetaEvery` (2), `evidenceK` (3), `signal`.
 * @returns {Promise<object>} `winnerSurface`, `baselineHoldoutComposite`,
 *   `winnerHoldoutComposite`, `lift`, `acceptedEdits`, `rejectedEdits`,
 *   `epochsRun`, `history`, `totalCostUsd`.
 * @throws {Error} On any of the validation failures listed in step 1.
 */
async function runSkillOpt(opts) {
  if (opts.trainScenarios.length === 0) throw new Error("runSkillOpt: trainScenarios is empty");
  if (opts.holdoutScenarios.length === 0) throw new Error("runSkillOpt: holdoutScenarios is empty");
  if (!opts.judges || opts.judges.length === 0) {
    throw new Error(
      "runSkillOpt: at least one judge is required \u2014 scoring (and therefore acceptance) is meaningless without one, and would report a silent zero lift."
    );
  }
  const holdoutIds = new Set(opts.holdoutScenarios.map((s) => s.id));
  const overlap = opts.trainScenarios.filter((s) => holdoutIds.has(s.id)).map((s) => s.id);
  if (overlap.length > 0) {
    throw new Error(
      `runSkillOpt: trainScenarios and holdoutScenarios must be disjoint (overlap: [${overlap.join(
        ", "
      )}]) \u2014 a shared scenario leaks the held-out acceptance axis into the proposal evidence.`
    );
  }
  const patchesPerEpoch = opts.patchesPerEpoch ?? 2;
  const initialBudget = opts.editBudget ?? 3;
  const minImprovement = opts.minImprovement ?? 0;
  if (minImprovement < 0) {
    throw new Error(
      "runSkillOpt: minImprovement must be >= 0 \u2014 a negative threshold would accept held-out regressions, breaking the monotonic-lift contract."
    );
  }
  const patience = opts.patience ?? opts.maxEpochs;
  const budgetAnneal = opts.budgetAnneal ?? true;
  const rejectedBufferSize = opts.rejectedBufferSize ?? 12;
  const slowMetaEvery = opts.slowMetaEvery ?? 2;
  // Every scoring campaign (train or holdout) adds its cost here.
  let totalCostUsd = 0;
  const scoreHoldout = async (surface, tag) => {
    const campaign = await runScoringCampaign(opts, opts.holdoutScenarios, surface, tag);
    totalCostUsd += campaign.aggregates.totalCostUsd;
    return campaignMeanComposite(campaign);
  };
  const evidenceK = opts.evidenceK ?? 3;
  const trainEvidence = async (surface, tag) => {
    const campaign = await runScoringCampaign(opts, opts.trainScenarios, surface, tag);
    totalCostUsd += campaign.aggregates.totalCostUsd;
    return toEvidence(campaign, evidenceK);
  };
  // The strict `>` acceptance test means a candidate can beat the current
  // score yet still be rejected when minImprovement > 0. Say so explicitly:
  // this text is fed back to the proposer via the rejected buffer.
  const belowBarReason = (candidateScore, currentScore) => minImprovement > 0
    ? `held-out ${candidateScore.toFixed(3)} did not beat current ${currentScore.toFixed(3)} by more than ${minImprovement}`
    : `held-out ${candidateScore.toFixed(3)} \u2264 current ${currentScore.toFixed(3)}`;
  let current = opts.baselineSurface;
  let currentEvidence = await trainEvidence(current, "baseline-train");
  const baselineHoldout = await scoreHoldout(current, "baseline-holdout");
  let currentHoldout = baselineHoldout;
  const buffer = []; // sliding window of recent rejections shown to the driver
  const acceptedEdits = [];
  const rejectedAll = [];
  const history = [];
  let budget = initialBudget;
  let sinceAccept = 0;
  let metaNote;
  let epochsRun = 0;
  for (let epoch = 0; epoch < opts.maxEpochs; epoch++) {
    epochsRun++;
    const patches = await opts.driver.proposePatches({
      surface: current,
      evidence: currentEvidence,
      editBudget: budget,
      rejectedBuffer: buffer,
      metaNote,
      count: patchesPerEpoch,
      signal: opts.signal ?? new AbortController().signal
    });
    let accepted = null;
    const rejectedThisEpoch = [];
    for (let i = 0; i < patches.length; i++) {
      const patch = patches[i];
      const { surface: candidate, applied } = applySkillPatch(current, patch);
      if (applied === 0 || candidate === current) {
        // Nothing changed, so skip the (costly) holdout scoring.
        rejectedThisEpoch.push({
          label: patch.label,
          rationale: patch.rationale,
          reason: "no-op (unanchored or zero-change)"
        });
        continue;
      }
      const candidateHoldout = await scoreHoldout(candidate, `epoch-${epoch}-cand-${i}-holdout`);
      if (candidateHoldout > currentHoldout + minImprovement) {
        accepted = {
          epoch,
          label: patch.label,
          rationale: patch.rationale,
          holdoutDelta: candidateHoldout - currentHoldout
        };
        current = candidate;
        currentHoldout = candidateHoldout;
        currentEvidence = await trainEvidence(current, `epoch-${epoch}-train`);
        break;
      }
      rejectedThisEpoch.push({
        label: patch.label,
        rationale: patch.rationale,
        reason: belowBarReason(candidateHoldout, currentHoldout)
      });
    }
    if (accepted) {
      acceptedEdits.push(accepted);
      sinceAccept = 0;
    } else {
      sinceAccept++;
      // After two consecutive dry epochs, shrink the per-patch edit budget (floor 1).
      if (budgetAnneal && sinceAccept >= 2 && budget > 1) budget--;
    }
    for (const r of rejectedThisEpoch) {
      buffer.push(r);
      rejectedAll.push(r);
    }
    while (buffer.length > rejectedBufferSize) buffer.shift();
    if (slowMetaEvery > 0 && (epoch + 1) % slowMetaEvery === 0) {
      metaNote = buildMetaNote(acceptedEdits, buffer);
    }
    history.push({
      epoch,
      // NOTE(review): this is the budget AFTER any annealing above, i.e. the
      // value the next epoch will use, not necessarily the one this epoch's
      // proposals were made under. Confirm which one consumers expect.
      editBudget: budget,
      proposed: patches.length,
      accepted,
      rejected: rejectedThisEpoch,
      holdoutComposite: currentHoldout
    });
    if (sinceAccept >= patience) break;
  }
  return {
    winnerSurface: current,
    baselineHoldoutComposite: baselineHoldout,
    winnerHoldoutComposite: currentHoldout,
    lift: currentHoldout - baselineHoldout,
    acceptedEdits,
    rejectedEdits: rejectedAll,
    epochsRun,
    history,
    totalCostUsd
  };
}
635
/**
 * Start a campaign that scores one surface over a set of scenarios.
 *
 * Forwards every option in `opts` to `runCampaign`, overriding `scenarios`,
 * `dispatch` (bound to the given surface) and `runDir` (a `tag` subdirectory
 * of `opts.runDir`).
 *
 * @param {object} opts - Base campaign options; `dispatchWithSurface` and
 *   `runDir` are read here.
 * @param {Array<object>} scenarios - Scenarios to score.
 * @param {*} surface - Surface passed as first argument to `dispatchWithSurface`.
 * @param {string} tag - Subdirectory name for this campaign's run directory.
 * @returns {*} Whatever `runCampaign` returns (awaited by callers).
 */
function runScoringCampaign(opts, scenarios, surface, tag) {
  const dispatch = (scenario, ctx) => opts.dispatchWithSurface(surface, scenario, ctx);
  const runDir = `${opts.runDir}/${tag}`;
  return runCampaign({ ...opts, scenarios, dispatch, runDir });
}
643
/**
 * Reduce a scored campaign to the evidence shown to the patch proposer.
 *
 * @param {object} campaign - Campaign result accepted by `campaignBreakdown`.
 * @param {number} k - Maximum number of weak scenarios / dimensions to keep.
 * @returns {{weakScenarios: Array<object>, weakDimensions: Array<{dimension: string, score: number}>}}
 *   The `k` lowest-composite scenarios and `k` lowest-scoring dimensions,
 *   each in ascending score order.
 */
function toEvidence(campaign, k) {
  const breakdown = campaignBreakdown(campaign);
  const byCompositeAscending = (a, b) => a.composite - b.composite;
  const weakScenarios = [...breakdown.scenarios].sort(byCompositeAscending).slice(0, k);
  const dimensionsAscending = Object.entries(breakdown.dimensions).sort((a, b) => a[1] - b[1]);
  const weakDimensions = dimensionsAscending
    .slice(0, k)
    .map(([dimension, score]) => ({ dimension, score }));
  return { weakScenarios, weakDimensions };
}
649
/**
 * Summarize progress so far as a short strategy note for the proposer.
 *
 * @param {Array<{label: string, holdoutDelta: number}>} accepted - Accepted edits.
 * @param {Array<{label: string}>} rejected - Recently rejected edits.
 * @returns {string} Up to three sentences joined by a space: accepted edits
 *   with their held-out gain, up to five distinct rejected labels, and a
 *   closing reminder that is always present.
 */
function buildMetaNote(accepted, rejected) {
  const sentences = [];
  if (accepted.length > 0) {
    const wins = accepted
      .map((edit) => `"${edit.label}" (+${edit.holdoutDelta.toFixed(3)})`)
      .join("; ");
    sentences.push(`Edits that improved held-out so far: ${wins}. Build on these.`);
  }
  if (rejected.length > 0) {
    // De-duplicate by label, keeping first-seen order, and cap the list at five.
    const distinctLabels = Array.from(new Set(rejected.map((edit) => edit.label)));
    const shown = distinctLabels.slice(0, 5).join(", ");
    sentences.push(`Dead ends to avoid: ${shown}. Try a different anchor or rule.`);
  }
  sentences.push("Keep edits small and anchored to existing lines.");
  return sentences.join(" ");
}
663
+
664
+ // src/campaign/presets/compare-drivers.ts
665
/**
 * Run several optimization drivers and compare their winners on one shared
 * held-out scenario set.
 *
 * The baseline surface and each driver's winner are scored per scenario on the
 * holdout set. Each winner's lift over baseline is estimated with a paired
 * bootstrap, winners are ranked by lift (ties broken by lower cost), and every
 * other winner is then compared pairwise against the top-ranked one.
 *
 * @param {object} opts - Reads `drivers` (each `{name, optimize()}`),
 *   `holdoutScenarios`, `baselineSurface`, `dispatchWithSurface`, `runDir`,
 *   and the optional `seed` (42), `resamples` (2000), `confidence` (0.95).
 *   All of `opts` is also forwarded to `runCampaign`.
 * @returns {Promise<{scores: Array<object>, best: object, pairwise: Array<object>, holdoutScenarioIds: string[]}>}
 * @throws {Error} When there are no drivers, driver names are not unique,
 *   the holdout set is empty, or any surface is missing a held-out score for
 *   a scenario.
 */
async function compareDrivers(opts) {
  if (opts.drivers.length === 0) throw new Error("compareDrivers: no drivers to compare");
  // Winners are looked up by name for the pairwise comparison and run
  // directories are derived from the name, so a duplicate would silently pair
  // the wrong score arrays. Fail before spending anything on optimization.
  const seenNames = new Set();
  const duplicateNames = new Set();
  for (const d of opts.drivers) {
    if (seenNames.has(d.name)) duplicateNames.add(d.name);
    seenNames.add(d.name);
  }
  if (duplicateNames.size > 0) {
    throw new Error(
      `compareDrivers: driver names must be unique (duplicate: [${[...duplicateNames].join(
        ", "
      )}]) \u2014 scores and pairwise comparisons are keyed by name.`
    );
  }
  const seed = opts.seed ?? 42;
  const resamples = opts.resamples ?? 2e3;
  const confidence = opts.confidence ?? 0.95;
  // Score one surface on the holdout set; returns scenarioId -> composite.
  const scoreOnHoldout = async (surface, tag) => {
    const campaign = await runCampaign({
      ...opts,
      scenarios: opts.holdoutScenarios,
      dispatch: (scenario, ctx) => opts.dispatchWithSurface(surface, scenario, ctx),
      runDir: `${opts.runDir}/${tag}`
    });
    const byScenario = {};
    for (const { scenarioId, composite } of campaignBreakdown(campaign).scenarios) {
      byScenario[scenarioId] = composite;
    }
    return byScenario;
  };
  // Sorted, de-duplicated ids fix the element order of every score array so
  // the paired bootstrap compares like with like.
  const scenarioIds = [...new Set(opts.holdoutScenarios.map((s) => s.id))].sort();
  if (scenarioIds.length === 0) throw new Error("compareDrivers: holdoutScenarios is empty");
  const align = (byScenario, label) => {
    const missing = scenarioIds.filter((id) => !(id in byScenario));
    if (missing.length > 0) {
      throw new Error(
        `compareDrivers: ${label} produced no held-out score for scenario(s) [${missing.join(
          ", "
        )}] \u2014 a cell errored or its judges returned nothing. Refusing to fabricate a 0 (it would corrupt the lift comparison). Fix the dispatch/judge or drop the scenario.`
      );
    }
    return scenarioIds.map((id) => byScenario[id]);
  };
  const baselineArr = align(
    await scoreOnHoldout(opts.baselineSurface, "compare-baseline"),
    "baseline"
  );
  const winners = [];
  // Drivers run one after another, each followed by its own holdout scoring.
  for (const d of opts.drivers) {
    const out = await d.optimize();
    const byScenario = await scoreOnHoldout(out.winnerSurface, `compare-${slug(d.name)}`);
    winners.push({
      name: d.name,
      winnerSurface: out.winnerSurface,
      costUsd: out.costUsd,
      durationMs: out.durationMs,
      arr: align(byScenario, `driver "${d.name}"`)
    });
  }
  const scores = winners.map((w) => {
    const boot = pairedBootstrap(baselineArr, w.arr, {
      seed,
      resamples,
      confidence,
      statistic: "mean"
    });
    const score = {
      name: w.name,
      baselineComposite: mean(baselineArr),
      winnerComposite: mean(w.arr),
      lift: boot.mean,
      liftCi: { low: boot.low, high: boot.high },
      costUsd: w.costUsd,
      winnerSurface: w.winnerSurface,
      rank: 0
    };
    if (w.durationMs !== void 0) score.durationMs = w.durationMs;
    return score;
  });
  // Highest lift first; equal lift falls back to the cheaper driver.
  scores.sort((a, b) => b.lift - a.lift || a.costUsd - b.costUsd);
  scores.forEach((s, i) => {
    s.rank = i + 1;
  });
  const best = scores[0];
  const byName = new Map(winners.map((w) => [w.name, w]));
  const bestArr = byName.get(best.name).arr;
  const pairwise = scores.slice(1).map((other) => {
    const otherArr = byName.get(other.name).arr;
    const boot = pairedBootstrap(otherArr, bestArr, {
      seed,
      resamples,
      confidence,
      statistic: "mean"
    });
    // An interval that straddles zero is reported as a tie.
    const favored = boot.low > 0 ? best.name : boot.high < 0 ? other.name : "tie";
    return {
      a: best.name,
      b: other.name,
      deltaMean: boot.mean,
      low: boot.low,
      high: boot.high,
      favored
    };
  });
  return { scores, best, pairwise, holdoutScenarioIds: scenarioIds };
}
759
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} xs
 * @returns {number} The mean, or 0 for an empty list.
 */
function mean(xs) {
  const count = xs.length;
  if (count === 0) {
    return 0;
  }
  const total = xs.reduce((sum, x) => sum + x, 0);
  return total / count;
}
762
/**
 * Turn a driver name into a lowercase path-friendly token.
 *
 * @param {string} name
 * @returns {string} `name` with every run of characters outside a-z, A-Z and
 *   0-9 replaced by a single "-", then lowercased. Leading/trailing dashes
 *   are kept.
 */
function slug(name) {
  const dashed = name.replace(/[^a-z0-9]+/gi, "-");
  return dashed.toLowerCase();
}
765
/**
 * Driver entry for GEPA with parent combination turned off.
 *
 * @param {object} config - Passed through to `gepaEntry`.
 * @param {string} [name="gepa-reflection"] - Entry name.
 * @returns {{name: string, optimize: Function}}
 */
function gepaReflectionEntry(config, name = "gepa-reflection") {
  const combineParents = false;
  return gepaEntry(config, combineParents, name);
}
768
/**
 * Driver entry for GEPA with parent combination turned on.
 *
 * @param {object} config - Passed through to `gepaEntry`.
 * @param {string} [name="gepa-pareto"] - Entry name.
 * @returns {{name: string, optimize: Function}}
 */
function gepaParetoEntry(config, name = "gepa-pareto") {
  const combineParents = true;
  return gepaEntry(config, combineParents, name);
}
771
/**
 * Build a named driver entry that runs the GEPA improvement loop.
 *
 * @param {object} config - Reads `llm`, `model`, `target`, `trainScenarios`,
 *   `holdoutScenarios`, `baselineSurface`, `dispatchWithSurface`, `judges`,
 *   `runDir`, and the optional `mutationPrimitives`, `populationSize` (2),
 *   `maxGenerations` (3), `seed`.
 * @param {boolean} combineParents - Forwarded to `gepaDriver`.
 * @param {string} name - Entry name; its slug also names the loop's run directory.
 * @returns {{name: string, optimize: function(): Promise<{winnerSurface: *, costUsd: number, durationMs: number}>}}
 */
function gepaEntry(config, combineParents, name) {
  const optimize = async () => {
    const started = Date.now();

    const driverOptions = {
      llm: config.llm,
      model: config.model,
      target: config.target,
      combineParents
    };
    if (config.mutationPrimitives) {
      driverOptions.mutationPrimitives = config.mutationPrimitives;
    }
    const driver = gepaDriver(driverOptions);

    const loopOptions = {
      scenarios: config.trainScenarios,
      holdoutScenarios: config.holdoutScenarios,
      baselineSurface: config.baselineSurface,
      dispatchWithSurface: config.dispatchWithSurface,
      judges: config.judges,
      driver,
      populationSize: config.populationSize ?? 2,
      maxGenerations: config.maxGenerations ?? 3,
      gate: defaultProductionGate({
        holdoutScenarios: config.holdoutScenarios,
        deltaThreshold: 0
      }),
      autoOnPromote: "none",
      runDir: `${config.runDir}/${slug(name)}-loop`
    };
    if (config.seed !== void 0) {
      loopOptions.seed = config.seed;
    }
    const result = await runImprovementLoop(loopOptions);

    // Total cost = baseline campaign + every surface campaign of every generation.
    let generationsCost = 0;
    for (const generation of result.generations) {
      let generationCost = 0;
      for (const sf of generation.surfaces) {
        generationCost += sf.campaign.aggregates.totalCostUsd;
      }
      generationsCost += generationCost;
    }
    const costUsd = result.baselineCampaign.aggregates.totalCostUsd + generationsCost;

    return { winnerSurface: result.winnerSurface, costUsd, durationMs: Date.now() - started };
  };
  return { name, optimize };
}
808
/**
 * Build a named driver entry that runs the SkillOpt loop.
 *
 * @param {object} config - Reads `llm`, `model`, `target`, `baselineSurface`,
 *   `dispatchWithSurface`, `judges`, `trainScenarios`, `holdoutScenarios`,
 *   `runDir`, and the optional `maxEpochs` (6) and `seed`.
 * @param {string} [name="skill-opt"] - Entry name; its slug also names the
 *   loop's run directory.
 * @returns {{name: string, optimize: function(): Promise<{winnerSurface: *, costUsd: number, durationMs: number}>}}
 */
function skillOptEntry(config, name = "skill-opt") {
  const optimize = async () => {
    const started = Date.now();
    const { llm, model, target } = config;
    const driver = skillOptDriver({ llm, model, target });

    const runOptions = {
      baselineSurface: config.baselineSurface,
      dispatchWithSurface: config.dispatchWithSurface,
      judges: config.judges,
      driver,
      trainScenarios: config.trainScenarios,
      holdoutScenarios: config.holdoutScenarios,
      maxEpochs: config.maxEpochs ?? 6,
      runDir: `${config.runDir}/${slug(name)}-loop`
    };
    // Only forward a seed when the caller supplied one.
    if (config.seed !== void 0) {
      runOptions.seed = config.seed;
    }
    const result = await runSkillOpt(runOptions);

    const durationMs = Date.now() - started;
    return { winnerSurface: result.winnerSurface, costUsd: result.totalCostUsd, durationMs };
  };
  return { name, optimize };
}
833
+
262
834
  // src/campaign/presets/run-profile-matrix.ts
263
835
  import { createHash as createHash2 } from "crypto";
264
836
  import { join as join2 } from "path";
@@ -273,12 +845,12 @@ function sanitize(id) {
273
845
  function sha(input) {
274
846
  return createHash2("sha256").update(JSON.stringify(input)).digest("hex");
275
847
  }
276
- function mean(xs) {
848
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} xs
 * @returns {number} The mean, or 0 for an empty list.
 */
function mean2(xs) {
  const count = xs.length;
  if (count === 0) {
    return 0;
  }
  const total = xs.reduce((sum, x) => sum + x, 0);
  return total / count;
}
279
851
  function cellComposite(cell) {
280
852
  const composites = Object.values(cell.judgeScores).map((s) => s.composite);
281
- return composites.length === 0 ? 0 : mean(composites);
853
+ return composites.length === 0 ? 0 : mean2(composites);
282
854
  }
283
855
  function buildRunRecord(args) {
284
856
  const { cell, profile, profileHash, configHash, experimentId, splitTag, commitSha, matrixId } = args;
@@ -296,7 +868,7 @@ function buildRunRecord(args) {
296
868
  if (js.notes) notes.push(`${judgeName}: ${js.notes}`);
297
869
  }
298
870
  const perDimMean = {};
299
- for (const [dim, values] of Object.entries(dimAccum)) perDimMean[dim] = mean(values);
871
+ for (const [dim, values] of Object.entries(dimAccum)) perDimMean[dim] = mean2(values);
300
872
  const outcome = splitTag === "holdout" ? { holdoutScore: composite, raw } : { searchScore: composite, raw };
301
873
  if (Object.keys(perJudge).length > 0) {
302
874
  outcome.judgeScores = {
@@ -407,7 +979,7 @@ async function runProfileMatrix(opts) {
407
979
  profileHash,
408
980
  model: profile.model,
409
981
  records: profileRecords.length,
410
- meanComposite: mean(profileRecords.map(compositeOf)),
982
+ meanComposite: mean2(profileRecords.map(compositeOf)),
411
983
  totalCostUsd: profileRecords.reduce((a, r) => a + r.costUsd, 0),
412
984
  integrity: summarizeBackendIntegrity(profileRecords)
413
985
  };
@@ -437,7 +1009,7 @@ function rollup(records, keyOf) {
437
1009
  groups.set(key, arr);
438
1010
  }
439
1011
  const out = {};
440
- for (const [key, xs] of groups) out[key] = { meanComposite: mean(xs), n: xs.length };
1012
+ for (const [key, xs] of groups) out[key] = { meanComposite: mean2(xs), n: xs.length };
441
1013
  return out;
442
1014
  }
443
1015
  function rollupByPersona(records, scenarios, personaOf) {
@@ -466,7 +1038,7 @@ function defaultGit(args, cwd) {
466
1038
  throw new WorktreeAdapterError(`git ${args.join(" ")} failed: ${stderr || String(err)}`, err);
467
1039
  }
468
1040
  }
469
- function slug(label) {
1041
/**
 * Turn a candidate label into a short identifier fragment.
 *
 * @param {string} label
 * @returns {string} Lowercased label with runs of characters outside a-z and
 *   0-9 collapsed to "-", leading/trailing dashes removed and the result cut
 *   to 48 characters; "candidate" when nothing is left.
 */
function slug2(label) {
  const dashed = label.toLowerCase().replace(/[^a-z0-9]+/g, "-");
  const trimmed = dashed.replace(/^-+|-+$/g, "");
  const clipped = trimmed.slice(0, 48);
  return clipped === "" ? "candidate" : clipped;
}
472
1044
  function gitWorktreeAdapter(opts) {
@@ -475,7 +1047,7 @@ function gitWorktreeAdapter(opts) {
475
1047
  const branchPrefix = opts.branchPrefix ?? "improve";
476
1048
  return {
477
1049
  async create({ baseRef, label }) {
478
- const id = `${slug(label)}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 6)}`;
1050
+ const id = `${slug2(label)}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 6)}`;
479
1051
  const branch = `${branchPrefix}/${id}`;
480
1052
  const path = join3(worktreeDir, id);
481
1053
  git(["worktree", "add", "-b", branch, path, baseRef], opts.repoRoot);
@@ -509,8 +1081,13 @@ export {
509
1081
  FsLabeledScenarioStore,
510
1082
  LabeledScenarioStoreError,
511
1083
  ProfileMatrixError,
1084
+ SkillPatchParseError,
512
1085
  WorktreeAdapterError,
1086
+ applySkillPatch,
513
1087
  buildLoopProvenanceRecord,
1088
+ campaignBreakdown,
1089
+ campaignMeanComposite,
1090
+ compareDrivers,
514
1091
  composeGate,
515
1092
  countSentenceEdits,
516
1093
  defaultProductionGate,
@@ -520,6 +1097,8 @@ export {
520
1097
  extractH2Sections,
521
1098
  fsCampaignStorage,
522
1099
  gepaDriver,
1100
+ gepaParetoEntry,
1101
+ gepaReflectionEntry,
523
1102
  gitWorktreeAdapter,
524
1103
  heldOutGate,
525
1104
  inMemoryCampaignStorage,
@@ -527,6 +1106,8 @@ export {
527
1106
  labelTrustRank,
528
1107
  loopProvenanceSpans,
529
1108
  openAutoPr,
1109
+ parseSkillPatchResponse,
1110
+ patchEditCount,
530
1111
  provenanceRecordPath,
531
1112
  provenanceSpansPath,
532
1113
  resolveWorktreePath,
@@ -535,6 +1116,9 @@ export {
535
1116
  runImprovementLoop,
536
1117
  runOptimization,
537
1118
  runProfileMatrix,
1119
+ runSkillOpt,
1120
+ skillOptDriver,
1121
+ skillOptEntry,
538
1122
  surfaceContentHash,
539
1123
  surfaceHash
540
1124
  };