@tangle-network/agent-eval 0.61.0 → 0.63.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. package/CHANGELOG.md +48 -8
  2. package/dist/adapters/http.d.ts +4 -1
  3. package/dist/adapters/langchain.d.ts +4 -1
  4. package/dist/adapters/otel.d.ts +4 -4
  5. package/dist/{agent-profile-9J9hxdm2.d.ts → agent-profile-DzcPHR1Z.d.ts} +1 -1
  6. package/dist/benchmarks/index.d.ts +2 -2
  7. package/dist/campaign/index.d.ts +388 -11
  8. package/dist/campaign/index.js +597 -12
  9. package/dist/campaign/index.js.map +1 -1
  10. package/dist/{chunk-GMXHLSLL.js → chunk-4ODZXQV2.js} +81 -98
  11. package/dist/chunk-4ODZXQV2.js.map +1 -0
  12. package/dist/{chunk-OLULBECP.js → chunk-7TPYV2ER.js} +27 -1
  13. package/dist/chunk-7TPYV2ER.js.map +1 -0
  14. package/dist/chunk-E22YUOAL.js +111 -0
  15. package/dist/chunk-E22YUOAL.js.map +1 -0
  16. package/dist/{chunk-SUGME4OT.js → chunk-Z7ZU7IYZ.js} +209 -85
  17. package/dist/chunk-Z7ZU7IYZ.js.map +1 -0
  18. package/dist/contract/index.d.ts +9 -9
  19. package/dist/contract/index.js +4 -3
  20. package/dist/contract/index.js.map +1 -1
  21. package/dist/{control-Bf8owbuG.d.ts → control-DxvZeV5X.d.ts} +1 -1
  22. package/dist/control.d.ts +2 -2
  23. package/dist/hosted/index.d.ts +4 -4
  24. package/dist/{index-Bvk35ils.d.ts → index-DsnOpCO6.d.ts} +1 -1
  25. package/dist/{index-D9dwa00f.d.ts → index-GISRh500.d.ts} +2 -2
  26. package/dist/index.d.ts +98 -14
  27. package/dist/index.js +331 -128
  28. package/dist/index.js.map +1 -1
  29. package/dist/meta-eval/index.d.ts +2 -2
  30. package/dist/multishot/index.js.map +1 -1
  31. package/dist/openapi.json +1 -1
  32. package/dist/{provenance-D0WeCXt1.d.ts → provenance-cUnovpWV.d.ts} +42 -11
  33. package/dist/{registry-qmbYT3Eo.d.ts → registry-DPly4_hZ.d.ts} +1 -1
  34. package/dist/{release-report-DszkgvJ3.d.ts → release-report-DGoeObZT.d.ts} +2 -2
  35. package/dist/reporting.d.ts +4 -4
  36. package/dist/{researcher-BaVsy0sW.d.ts → researcher-WJvIpX3L.d.ts} +2 -2
  37. package/dist/rl.d.ts +6 -6
  38. package/dist/{rubric-predictive-validity-DgBHWsh7.d.ts → rubric-predictive-validity-D_4BSXGV.d.ts} +1 -1
  39. package/dist/{run-campaign-HXPJAUZ3.js → run-campaign-5J3ED2UJ.js} +3 -2
  40. package/dist/{run-record-DgUVo5pw.d.ts → run-record-BgTFzO2r.d.ts} +1 -1
  41. package/dist/{summary-report-BQvXpvaR.d.ts → summary-report-ByiOUrHj.d.ts} +1 -1
  42. package/dist/{types-Beb6KPqZ.d.ts → types-c2R2kfmv.d.ts} +45 -12
  43. package/package.json +1 -1
  44. package/dist/chunk-GMXHLSLL.js.map +0 -1
  45. package/dist/chunk-OLULBECP.js.map +0 -1
  46. package/dist/chunk-SUGME4OT.js.map +0 -1
  47. package/dist/{run-campaign-HXPJAUZ3.js.map → run-campaign-5J3ED2UJ.js.map} +0 -0
@@ -1,5 +1,7 @@
1
1
  import {
2
2
  buildLoopProvenanceRecord,
3
+ campaignBreakdown,
4
+ campaignMeanComposite,
3
5
  composeGate,
4
6
  countSentenceEdits,
5
7
  defaultProductionGate,
@@ -20,33 +22,271 @@ import {
20
22
  runOptimization,
21
23
  surfaceContentHash,
22
24
  surfaceHash
23
- } from "../chunk-SUGME4OT.js";
25
+ } from "../chunk-Z7ZU7IYZ.js";
24
26
  import {
25
27
  fsCampaignStorage,
26
28
  inMemoryCampaignStorage,
27
29
  runCampaign
28
- } from "../chunk-OLULBECP.js";
30
+ } from "../chunk-7TPYV2ER.js";
29
31
  import {
30
32
  agentProfileHash
31
33
  } from "../chunk-PQV2TKC3.js";
34
+ import "../chunk-4ODZXQV2.js";
32
35
  import {
33
36
  assertRealBackend,
34
37
  summarizeBackendIntegrity
35
- } from "../chunk-GMXHLSLL.js";
38
+ } from "../chunk-E22YUOAL.js";
36
39
  import "../chunk-YV7J7X5N.js";
37
40
  import {
38
41
  validateRunRecord
39
42
  } from "../chunk-F3SRAAZO.js";
40
- import "../chunk-ITBRCT73.js";
43
+ import {
44
+ pairedBootstrap
45
+ } from "../chunk-ITBRCT73.js";
41
46
  import "../chunk-GGE4NNQT.js";
42
47
  import "../chunk-VSMTAMNK.js";
43
- import "../chunk-IHDHUN2X.js";
48
+ import {
49
+ callLlm
50
+ } from "../chunk-IHDHUN2X.js";
44
51
  import "../chunk-PC4UYEBM.js";
45
52
  import {
46
53
  AgentEvalError
47
54
  } from "../chunk-3BFEG2F6.js";
48
55
  import "../chunk-PZ5AY32C.js";
49
56
 
57
+ // src/campaign/skill-patch.ts
58
/**
 * Apply a bundle of anchored, line-level edits to a skill document.
 *
 * Ops are applied in order against the progressively edited document, so a
 * later op sees the result of earlier ones. An anchor matches the FIRST line
 * that contains it as a substring.
 *
 * Supported ops:
 *  - `{ op: "add", text, after? }`  insert `text` after the anchored line, or
 *    append to the end when `after` is omitted or `""`.
 *  - `{ op: "delete", anchor }`     remove the anchored line.
 *  - `{ op: "replace", anchor, text }` swap the anchored line for `text`.
 *
 * Invalid ops are never applied; they are collected in `rejected` with a
 * human-readable reason. In particular an empty or non-string anchor is
 * rejected: `"".includes` semantics would otherwise make it match the first
 * line of the document and silently edit the wrong place.
 *
 * @param {string} surface - The current document text.
 * @param {{ ops: Array<object> }} patch - The ops to apply.
 * @returns {{ surface: string, applied: number, rejected: Array<{ op: object, reason: string }> }}
 */
function applySkillPatch(surface, patch) {
  let lines = surface.split("\n");
  let applied = 0;
  const rejected = [];
  const findLine = (anchor) => lines.findIndex((line) => line.includes(anchor));
  const isUsableAnchor = (anchor) => typeof anchor === "string" && anchor !== "";

  for (const op of patch.ops) {
    if (typeof op !== "object" || op === null) {
      rejected.push({ op, reason: "op is not an object" });
      continue;
    }

    if (op.op === "add") {
      if (typeof op.text !== "string" || op.text.trim() === "") {
        rejected.push({ op, reason: "empty add text" });
        continue;
      }
      const insert = op.text.split("\n");
      // No anchor (or an explicitly empty one) means "append to the end".
      if (op.after === void 0 || op.after === "") {
        lines = [...lines, ...insert];
        applied++;
        continue;
      }
      if (typeof op.after !== "string") {
        rejected.push({ op, reason: "add anchor is not a string" });
        continue;
      }
      const idx = findLine(op.after);
      if (idx === -1) {
        rejected.push({ op, reason: `add anchor not found: ${truncate(op.after)}` });
        continue;
      }
      lines = [...lines.slice(0, idx + 1), ...insert, ...lines.slice(idx + 1)];
      applied++;
      continue;
    }

    if (op.op !== "delete" && op.op !== "replace") {
      // Previously any unknown kind fell through and was executed as a replace.
      rejected.push({ op, reason: `unknown op kind: ${String(op.op)}` });
      continue;
    }

    if (!isUsableAnchor(op.anchor)) {
      rejected.push({ op, reason: `${op.op} anchor is empty or not a string` });
      continue;
    }
    const idx = findLine(op.anchor);
    if (idx === -1) {
      rejected.push({ op, reason: `${op.op} anchor not found: ${truncate(op.anchor)}` });
      continue;
    }

    if (op.op === "delete") {
      lines = [...lines.slice(0, idx), ...lines.slice(idx + 1)];
      applied++;
      continue;
    }

    if (typeof op.text !== "string") {
      rejected.push({ op, reason: "replace text missing" });
      continue;
    }
    lines = [...lines.slice(0, idx), ...op.text.split("\n"), ...lines.slice(idx + 1)];
    applied++;
  }

  return { surface: lines.join("\n"), applied, rejected };
}
106
/**
 * Number of ops carried by a patch (its edit count).
 *
 * @param {{ ops: Array<object> }} patch
 * @returns {number}
 */
function patchEditCount(patch) {
  const { ops } = patch;
  return ops.length;
}
109
/**
 * Shorten a string for use in diagnostics.
 *
 * @param {string} s - Text to shorten.
 * @param {number} [max=48] - Maximum number of UTF-16 units kept before the ellipsis.
 * @returns {string} `s` unchanged when it fits, otherwise its first `max` units plus "…".
 */
function truncate(s, max = 48) {
  if (s.length > max) {
    return `${s.slice(0, max)}\u2026`;
  }
  return s;
}
112
+
113
+ // src/campaign/drivers/skill-opt.ts
114
// System prompt sent with every SkillOpt proposal request; it pins the JSON
// patch schema that parseSkillPatchResponse expects back.
var SKILLOPT_SYSTEM = 'You are a SkillOpt optimizer. You improve ONE skill document by proposing BOUNDED, anchored edits \u2014 never a full rewrite. Output ONLY a JSON object of shape {"patches":[{"label":string,"rationale":string,"ops":[op,...]}]} where each op is one of: {"op":"add","after":<exact substring of an existing line, or omit to append>,"text":<new line(s)>}, {"op":"delete","anchor":<exact substring of the line to remove>}, {"op":"replace","anchor":<exact substring of the line to replace>,"text":<replacement line(s)>}. Anchors MUST be verbatim substrings of lines that exist in the document. No prose outside JSON.';

/**
 * Build a "skill-opt" driver that asks an LLM for bounded, anchored patches
 * to a text skill document.
 *
 * The returned object exposes:
 *  - `kind`: always "skill-opt".
 *  - `proposePatches(args)`: one LLM call, parsed into at most `args.count`
 *    patches of at most `args.editBudget` ops each.
 *  - `propose(ctx)`: proposes patches for `ctx.currentSurface`, applies each
 *    one, and returns the distinct, actually-changed candidate surfaces.
 *
 * @param {object} opts - Driver options (`llm`, `model`, `target`, and the
 *   optional `evidenceK`, `editBudget`, `temperature`, `maxTokens`).
 * @returns {{ kind: string, proposePatches: Function, propose: Function }}
 */
function skillOptDriver(opts) {
  const evidenceK = opts.evidenceK ?? 3;
  const defaultBudget = opts.editBudget ?? 3;

  const proposePatches = async (args) => {
    const { surface, evidence, editBudget, rejectedBuffer, metaNote, count } = args;
    const userPrompt = buildPatchPrompt({
      target: opts.target,
      surface,
      evidence,
      editBudget,
      rejectedBuffer,
      metaNote,
      count
    });
    const request = {
      model: opts.model,
      messages: [
        { role: "system", content: SKILLOPT_SYSTEM },
        { role: "user", content: userPrompt }
      ],
      jsonMode: true,
      temperature: opts.temperature ?? 0.6,
      maxTokens: opts.maxTokens ?? 4e3
    };
    const result = await callLlm(request, opts.llm);
    return parseSkillPatchResponse(result.content, count, editBudget);
  };

  return {
    kind: "skill-opt",
    proposePatches,
    async propose(ctx) {
      const surface = ctx.currentSurface;
      if (typeof surface !== "string") {
        throw new Error(
          "skillOptDriver: surface must be a string skill document (got a CodeSurface). SkillOpt patches text."
        );
      }
      const patches = await proposePatches({
        surface,
        evidence: evidenceFromHistory(ctx, evidenceK),
        editBudget: defaultBudget,
        rejectedBuffer: [],
        count: ctx.populationSize,
        signal: ctx.signal
      });
      const candidates = [];
      const alreadyEmitted = /* @__PURE__ */ new Set();
      for (const patch of patches) {
        const outcome = applySkillPatch(surface, patch);
        const changed = outcome.applied !== 0 && outcome.surface !== surface;
        // Skip patches that did nothing or that reproduce an earlier candidate.
        if (!changed || alreadyEmitted.has(outcome.surface)) continue;
        alreadyEmitted.add(outcome.surface);
        candidates.push({
          surface: outcome.surface,
          label: patch.label,
          rationale: patch.rationale
        });
        if (candidates.length >= ctx.populationSize) break;
      }
      return candidates;
    }
  };
}
174
/**
 * Derive proposal evidence from the most recent generation in `ctx.history`.
 *
 * Takes the highest-composite candidate of the last history entry and reports
 * its `k` lowest-scoring scenarios and `k` lowest-scoring dimensions. Returns
 * empty lists when there is no history or the last entry has no candidates.
 *
 * @param {{ history: Array<{ candidates: Array<object> }> }} ctx
 * @param {number} k - Maximum number of weak scenarios / dimensions to return.
 * @returns {{ weakScenarios: Array<object>, weakDimensions: Array<{ dimension: string, score: number }> }}
 */
function evidenceFromHistory(ctx, k) {
  const nothing = () => ({ weakScenarios: [], weakDimensions: [] });

  const latest = ctx.history.at(-1);
  if (!latest || latest.candidates.length === 0) return nothing();

  const ranked = latest.candidates.slice().sort((x, y) => y.composite - x.composite);
  const best = ranked[0];
  if (!best) return nothing();

  const scenariosAscending = best.scenarios.slice().sort((x, y) => x.composite - y.composite);
  const dimensionsAscending = Object.entries(best.dimensions).sort((x, y) => x[1] - y[1]);

  return {
    weakScenarios: scenariosAscending.slice(0, k),
    weakDimensions: dimensionsAscending
      .slice(0, k)
      .map(([dimension, score]) => ({ dimension, score }))
  };
}
183
/**
 * Render the user prompt for one SkillOpt proposal request.
 *
 * The prompt always contains the target, the fenced current document and the
 * patch-count / edit-budget instructions. Sections for weak scenarios, weak
 * dimensions, previously rejected edits and the strategy note are appended
 * only when they have content.
 *
 * @param {object} args - `target`, `surface`, `evidence`, `editBudget`,
 *   `rejectedBuffer`, `count`, and optional `metaNote`.
 * @returns {string} Newline-joined prompt text.
 */
function buildPatchPrompt(args) {
  const { evidence, rejectedBuffer, metaNote } = args;
  const fence = "```";

  const out = [
    `Skill document governs: ${args.target}.`,
    "",
    "Current skill document:",
    fence,
    args.surface,
    fence,
    "",
    `Propose ${args.count} candidate patch(es). Each patch is a SMALL bundle of`,
    `at most ${args.editBudget} op(s). Anchors must be verbatim substrings of`,
    "existing lines. Prefer adding a specific missing rule or sharpening a vague",
    "one over deleting; never rewrite the whole document."
  ];

  // Append a blank line, a heading and one bullet per item; skipped when empty.
  const appendSection = (heading, items, render) => {
    if (items.length === 0) return;
    out.push("", heading);
    for (const item of items) out.push(render(item));
  };

  appendSection(
    "Weakest scenarios (patch to fix these):",
    evidence.weakScenarios,
    (s) => `- ${s.scenarioId} (${s.composite.toFixed(2)})`
  );
  appendSection(
    "Weakest dimensions (what to improve):",
    evidence.weakDimensions,
    (d) => `- ${d.dimension} (${d.score.toFixed(2)})`
  );
  appendSection(
    "Already tried and REJECTED (do not repeat or restate these edits):",
    rejectedBuffer,
    (e) => `- ${e.label}: ${e.rationale} \u2014 ${e.reason}`
  );

  if (metaNote) {
    out.push("", `Strategy note from prior epochs: ${metaNote}`);
  }
  return out.join("\n");
}
223
/**
 * Raised when an LLM response cannot be parsed into a skill-patch payload.
 * Only the message is forwarded to `Error`; `name` is set on the instance so
 * the error prints as "SkillPatchParseError: <message>".
 */
var SkillPatchParseError = class extends Error {
  name = "SkillPatchParseError";

  constructor(message) {
    super(message);
  }
};
229
/**
 * Parse an LLM response into a list of skill patches.
 *
 * Accepts raw JSON, JSON wrapped in a Markdown code fence, or JSON surrounded
 * by stray text: the outermost `{ … }` span is extracted and parsed. Entries
 * of `patches` that are not objects, or whose `ops` contain no valid op, are
 * dropped. Each kept patch is truncated to `editBudget` ops, and at most
 * `maxPatches` patches are returned.
 *
 * @param {string} raw - The model's response text.
 * @param {number} maxPatches - Upper bound on returned patches; `0` yields `[]`.
 *   When omitted, all valid patches are returned.
 * @param {number} editBudget - Upper bound on ops per patch.
 * @returns {Array<{ label: string, rationale: string, ops: Array<object> }>}
 * @throws {SkillPatchParseError} When `raw` is not a string, contains no JSON
 *   object, or the extracted span is not valid JSON.
 */
function parseSkillPatchResponse(raw, maxPatches, editBudget) {
  // A missing or non-text completion is a parse failure, not a TypeError.
  if (typeof raw !== "string") {
    throw new SkillPatchParseError(
      `parseSkillPatchResponse: response was not valid JSON (expected a string, got ${raw === null ? "null" : typeof raw})`
    );
  }
  let text = raw.trim();
  if (text.startsWith("```")) text = text.replace(/^```(?:json)?\n?/, "").replace(/\n?```$/, "");
  const start = text.indexOf("{");
  const end = text.lastIndexOf("}");
  if (start < 0 || end <= start) {
    throw new SkillPatchParseError(
      `parseSkillPatchResponse: response was not valid JSON (no object found): ${snippet(raw)}`
    );
  }
  let parsed;
  try {
    parsed = JSON.parse(text.slice(start, end + 1));
  } catch (err) {
    throw new SkillPatchParseError(
      `parseSkillPatchResponse: response was not valid JSON (${err instanceof Error ? err.message : String(err)}): ${snippet(raw)}`
    );
  }
  const rawPatches = Array.isArray(parsed.patches) ? parsed.patches : [];
  const out = [];
  for (const rp of rawPatches) {
    // Checked before doing any work so a limit of 0 really returns nothing.
    // With `maxPatches` undefined the comparison is false and nothing is capped.
    if (out.length >= maxPatches) break;
    if (typeof rp !== "object" || rp === null) continue;
    const ops = Array.isArray(rp.ops) ? rp.ops.map(normalizeOp).filter(isOp) : [];
    if (ops.length === 0) continue;
    out.push({
      label: typeof rp.label === "string" ? rp.label : "patch",
      rationale: typeof rp.rationale === "string" ? rp.rationale : "",
      ops: ops.slice(0, editBudget)
    });
  }
  return out;
}
263
/**
 * Validate one raw op from a parsed LLM payload and copy only its known fields.
 *
 * @param {unknown} raw - Candidate op.
 * @returns {object|null} A clean `add` / `delete` / `replace` op, or `null`
 *   when `raw` is not an object, has an unknown `op`, or lacks a required
 *   string field. For `add`, `after` is copied only when it is a string.
 */
function normalizeOp(raw) {
  if (raw === null || typeof raw !== "object") return null;
  const isText = (value) => typeof value === "string";

  switch (raw.op) {
    case "add": {
      if (!isText(raw.text)) return null;
      const added = { op: "add", text: raw.text };
      if (isText(raw.after)) added.after = raw.after;
      return added;
    }
    case "delete":
      return isText(raw.anchor) ? { op: "delete", anchor: raw.anchor } : null;
    case "replace":
      return isText(raw.anchor) && isText(raw.text)
        ? { op: "replace", anchor: raw.anchor, text: raw.text }
        : null;
    default:
      return null;
  }
}
282
/**
 * Filter predicate that keeps everything except the `null` results of
 * normalizeOp.
 *
 * @param {object|null} op
 * @returns {boolean} `false` only for `null`.
 */
function isOp(op) {
  const wasRejected = op === null;
  return !wasRejected;
}
285
/**
 * Produce a single-line excerpt of `s` for error messages.
 *
 * @param {string} s - Source text.
 * @param {number} [max=120] - Maximum length before the ellipsis is added.
 * @returns {string} Trimmed text with every whitespace run collapsed to one
 *   space, cut to `max` characters plus "…" when longer.
 */
function snippet(s, max = 120) {
  const collapsed = s.trim().replace(/\s+/g, " ");
  if (collapsed.length > max) {
    return `${collapsed.slice(0, max)}\u2026`;
  }
  return collapsed;
}
289
+
50
290
  // src/campaign/labeled-store/fs-adapter.ts
51
291
  import { createHash } from "crypto";
52
292
  import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
@@ -258,6 +498,339 @@ function appendLine(path, line) {
258
498
  }
259
499
  }
260
500
 
501
+ // src/campaign/presets/run-skill-opt.ts
502
/**
 * Run the epoch-based SkillOpt loop over a text skill document.
 *
 * Flow, as implemented below:
 *  1. Validate inputs (non-empty train and held-out sets, at least one judge,
 *     disjoint scenario ids, non-negative `minImprovement`).
 *  2. Score the baseline on the train set (for proposal evidence) and on the
 *     held-out set (the acceptance axis).
 *  3. Each epoch, ask `opts.driver.proposePatches` for `patchesPerEpoch`
 *     patches, apply them one by one to the current surface, and accept the
 *     FIRST candidate whose held-out composite exceeds
 *     `current + minImprovement`. Train evidence is refreshed after an accept.
 *  4. Rejected patches feed a bounded buffer (`rejectedBufferSize`) passed back
 *     to the driver. Every `slowMetaEvery` epochs a strategy note is rebuilt.
 *  5. With `budgetAnneal`, the per-patch edit budget drops by one (never below
 *     1) once two or more consecutive epochs pass without an accept.
 *  6. The loop ends after `maxEpochs`, or earlier once `patience` consecutive
 *     epochs passed without an accept.
 *
 * Defaults: `patchesPerEpoch` 2, `editBudget` 3, `minImprovement` 0,
 * `patience` = `maxEpochs`, `budgetAnneal` true, `rejectedBufferSize` 12,
 * `slowMetaEvery` 2, `evidenceK` 3.
 *
 * @param {object} opts - Loop options; also spread into every scoring campaign.
 * @returns {Promise<object>} `winnerSurface`, baseline and winner held-out
 *   composites, `lift`, `acceptedEdits`, `rejectedEdits`, `epochsRun`,
 *   per-epoch `history`, and the summed `totalCostUsd` of all scoring campaigns.
 * @throws {Error} On any of the validation failures listed in step 1.
 */
async function runSkillOpt(opts) {
  if (opts.trainScenarios.length === 0) throw new Error("runSkillOpt: trainScenarios is empty");
  if (opts.holdoutScenarios.length === 0) throw new Error("runSkillOpt: holdoutScenarios is empty");
  if (!opts.judges || opts.judges.length === 0) {
    throw new Error(
      "runSkillOpt: at least one judge is required \u2014 scoring (and therefore acceptance) is meaningless without one, and would report a silent zero lift."
    );
  }
  const holdoutIds = new Set(opts.holdoutScenarios.map((s) => s.id));
  const overlap = opts.trainScenarios.filter((s) => holdoutIds.has(s.id)).map((s) => s.id);
  if (overlap.length > 0) {
    throw new Error(
      `runSkillOpt: trainScenarios and holdoutScenarios must be disjoint (overlap: [${overlap.join(
        ", "
      )}]) \u2014 a shared scenario leaks the held-out acceptance axis into the proposal evidence.`
    );
  }
  const patchesPerEpoch = opts.patchesPerEpoch ?? 2;
  const initialBudget = opts.editBudget ?? 3;
  const minImprovement = opts.minImprovement ?? 0;
  if (minImprovement < 0) {
    throw new Error(
      "runSkillOpt: minImprovement must be >= 0 \u2014 a negative threshold would accept held-out regressions, breaking the monotonic-lift contract."
    );
  }
  const patience = opts.patience ?? opts.maxEpochs;
  const budgetAnneal = opts.budgetAnneal ?? true;
  const rejectedBufferSize = opts.rejectedBufferSize ?? 12;
  const slowMetaEvery = opts.slowMetaEvery ?? 2;
  const evidenceK = opts.evidenceK ?? 3;

  // Every scoring campaign (train or held-out) adds to the reported cost.
  let totalCostUsd = 0;
  const scoreHoldout = async (surface, tag) => {
    const campaign = await runScoringCampaign(opts, opts.holdoutScenarios, surface, tag);
    totalCostUsd += campaign.aggregates.totalCostUsd;
    return campaignMeanComposite(campaign);
  };
  const trainEvidence = async (surface, tag) => {
    const campaign = await runScoringCampaign(opts, opts.trainScenarios, surface, tag);
    totalCostUsd += campaign.aggregates.totalCostUsd;
    return toEvidence(campaign, evidenceK);
  };

  // Explain a held-out rejection truthfully. With a positive margin a candidate
  // can beat the current score and still be rejected, so "candidate <= current"
  // would be a false statement fed back to the proposer.
  const describeShortfall = (candidateScore, currentScore) =>
    minImprovement > 0
      ? `held-out ${candidateScore.toFixed(3)} did not exceed current ${currentScore.toFixed(3)} + minImprovement ${minImprovement}`
      : `held-out ${candidateScore.toFixed(3)} \u2264 current ${currentScore.toFixed(3)}`;

  let current = opts.baselineSurface;
  let currentEvidence = await trainEvidence(current, "baseline-train");
  const baselineHoldout = await scoreHoldout(current, "baseline-holdout");
  let currentHoldout = baselineHoldout;
  const buffer = [];
  const acceptedEdits = [];
  const rejectedAll = [];
  const history = [];
  let budget = initialBudget;
  let sinceAccept = 0;
  let metaNote;
  let epochsRun = 0;

  for (let epoch = 0; epoch < opts.maxEpochs; epoch++) {
    epochsRun++;
    const patches = await opts.driver.proposePatches({
      surface: current,
      evidence: currentEvidence,
      editBudget: budget,
      rejectedBuffer: buffer,
      metaNote,
      count: patchesPerEpoch,
      signal: opts.signal ?? new AbortController().signal
    });
    let accepted = null;
    const rejectedThisEpoch = [];
    for (let i = 0; i < patches.length; i++) {
      const patch = patches[i];
      const { surface: candidate, applied } = applySkillPatch(current, patch);
      if (applied === 0 || candidate === current) {
        rejectedThisEpoch.push({
          label: patch.label,
          rationale: patch.rationale,
          reason: "no-op (unanchored or zero-change)"
        });
        continue;
      }
      const candidateHoldout = await scoreHoldout(candidate, `epoch-${epoch}-cand-${i}-holdout`);
      if (candidateHoldout > currentHoldout + minImprovement) {
        accepted = {
          epoch,
          label: patch.label,
          rationale: patch.rationale,
          holdoutDelta: candidateHoldout - currentHoldout
        };
        current = candidate;
        currentHoldout = candidateHoldout;
        currentEvidence = await trainEvidence(current, `epoch-${epoch}-train`);
        // First improvement wins; remaining patches of this epoch are not scored.
        break;
      }
      rejectedThisEpoch.push({
        label: patch.label,
        rationale: patch.rationale,
        reason: describeShortfall(candidateHoldout, currentHoldout)
      });
    }
    if (accepted) {
      acceptedEdits.push(accepted);
      sinceAccept = 0;
    } else {
      sinceAccept++;
      if (budgetAnneal && sinceAccept >= 2 && budget > 1) budget--;
    }
    for (const r of rejectedThisEpoch) {
      buffer.push(r);
      rejectedAll.push(r);
    }
    // Keep only the most recent rejections in the buffer handed to the driver.
    while (buffer.length > rejectedBufferSize) buffer.shift();
    if (slowMetaEvery > 0 && (epoch + 1) % slowMetaEvery === 0) {
      metaNote = buildMetaNote(acceptedEdits, buffer);
    }
    // NOTE(review): `editBudget` is recorded AFTER annealing, so on an epoch
    // that triggers a decrement it shows the next epoch's budget rather than
    // the one used for this epoch's proposals. Confirm this is intended.
    history.push({
      epoch,
      editBudget: budget,
      proposed: patches.length,
      accepted,
      rejected: rejectedThisEpoch,
      holdoutComposite: currentHoldout
    });
    if (sinceAccept >= patience) break;
  }

  return {
    winnerSurface: current,
    baselineHoldoutComposite: baselineHoldout,
    winnerHoldoutComposite: currentHoldout,
    lift: currentHoldout - baselineHoldout,
    acceptedEdits,
    rejectedEdits: rejectedAll,
    epochsRun,
    history,
    totalCostUsd
  };
}
635
/**
 * Run one scoring campaign for a given surface over a given scenario set.
 *
 * All of `opts` is forwarded to `runCampaign`, with `scenarios`, `dispatch`
 * and `runDir` overridden: dispatch is bound to `surface`, and results go to
 * the `tag` sub-directory of `opts.runDir`.
 *
 * @param {object} opts - Caller options providing `dispatchWithSurface` and `runDir`.
 * @param {Array<object>} scenarios - Scenarios to score.
 * @param {*} surface - Surface passed as the first argument of `dispatchWithSurface`.
 * @param {string} tag - Sub-directory name for this campaign.
 * @returns {*} Whatever `runCampaign` returns.
 */
function runScoringCampaign(opts, scenarios, surface, tag) {
  const dispatch = (scenario, ctx) => opts.dispatchWithSurface(surface, scenario, ctx);
  const runDir = `${opts.runDir}/${tag}`;
  return runCampaign({ ...opts, scenarios, dispatch, runDir });
}
643
/**
 * Reduce a scored campaign to proposal evidence: its `k` lowest-composite
 * scenarios and `k` lowest-scoring dimensions, both in ascending order.
 *
 * @param {object} campaign - Campaign result accepted by `campaignBreakdown`.
 * @param {number} k - Maximum entries per list.
 * @returns {{ weakScenarios: Array<object>, weakDimensions: Array<{ dimension: string, score: number }> }}
 */
function toEvidence(campaign, k) {
  const breakdown = campaignBreakdown(campaign);
  const scenariosAscending = breakdown.scenarios.slice().sort((x, y) => x.composite - y.composite);
  const dimensionsAscending = Object.entries(breakdown.dimensions).sort((x, y) => x[1] - y[1]);
  return {
    weakScenarios: scenariosAscending.slice(0, k),
    weakDimensions: dimensionsAscending
      .slice(0, k)
      .map(([dimension, score]) => ({ dimension, score }))
  };
}
649
/**
 * Compose the one-line strategy note handed to the proposer.
 *
 * @param {Array<{ label: string, holdoutDelta: number }>} accepted - Edits accepted so far.
 * @param {Array<{ label: string }>} rejected - Rejected edits (duplicates by label allowed).
 * @returns {string} Space-joined sentences: accepted edits with their held-out
 *   deltas (if any), up to five distinct rejected labels (if any), and a fixed
 *   closing reminder.
 */
function buildMetaNote(accepted, rejected) {
  const sentences = [];

  if (accepted.length > 0) {
    const wins = accepted
      .map((edit) => `"${edit.label}" (+${edit.holdoutDelta.toFixed(3)})`)
      .join("; ");
    sentences.push(`Edits that improved held-out so far: ${wins}. Build on these.`);
  }

  if (rejected.length > 0) {
    const distinctLabels = Array.from(new Set(rejected.map((edit) => edit.label)));
    const deadEnds = distinctLabels.slice(0, 5).join(", ");
    sentences.push(`Dead ends to avoid: ${deadEnds}. Try a different anchor or rule.`);
  }

  sentences.push("Keep edits small and anchored to existing lines.");
  return sentences.join(" ");
}
663
+
664
+ // src/campaign/presets/compare-drivers.ts
665
/**
 * Run several optimization drivers and rank their winners on one shared
 * held-out scenario set.
 *
 * The baseline surface and each driver's `winnerSurface` are scored with
 * `runCampaign` over `opts.holdoutScenarios`. Per-scenario composites are
 * aligned on the sorted, de-duplicated scenario ids and compared with
 * `pairedBootstrap` (statistic "mean"). Drivers are ranked by lift descending,
 * ties broken by lower cost. The best driver is then bootstrapped pairwise
 * against every other one; `favored` is a driver name only when the interval
 * excludes zero, otherwise "tie".
 *
 * Defaults: `seed` 42, `resamples` 2000, `confidence` 0.95.
 *
 * @param {object} opts - `drivers` (each `{ name, optimize() }`),
 *   `holdoutScenarios`, `baselineSurface`, `dispatchWithSurface`, `runDir`,
 *   plus anything else `runCampaign` needs (all of `opts` is forwarded).
 * @returns {Promise<{ scores: Array<object>, best: object, pairwise: Array<object>, holdoutScenarioIds: string[] }>}
 * @throws {Error} When there are no drivers, two drivers share a name, the
 *   held-out set is empty, or a surface produced no score for some scenario.
 */
async function compareDrivers(opts) {
  if (opts.drivers.length === 0) throw new Error("compareDrivers: no drivers to compare");

  // Results are keyed by driver name (score lookup, pairwise labels, run
  // directory), so a repeated name would silently pair one driver's score
  // with another driver's per-scenario array.
  const seenNames = new Set();
  for (const d of opts.drivers) {
    if (seenNames.has(d.name)) {
      throw new Error(
        `compareDrivers: duplicate driver name "${d.name}" \u2014 driver names must be unique.`
      );
    }
    seenNames.add(d.name);
  }

  const seed = opts.seed ?? 42;
  const resamples = opts.resamples ?? 2e3;
  const confidence = opts.confidence ?? 0.95;
  const bootstrapOptions = { seed, resamples, confidence, statistic: "mean" };

  // Score one surface on the held-out set; returns scenarioId -> composite.
  // A Map keeps ids such as "constructor" from being mistaken for present.
  const scoreOnHoldout = async (surface, tag) => {
    const campaign = await runCampaign({
      ...opts,
      scenarios: opts.holdoutScenarios,
      dispatch: (scenario, ctx) => opts.dispatchWithSurface(surface, scenario, ctx),
      runDir: `${opts.runDir}/${tag}`
    });
    const byScenario = new Map();
    for (const { scenarioId, composite } of campaignBreakdown(campaign).scenarios) {
      byScenario.set(scenarioId, composite);
    }
    return byScenario;
  };

  const scenarioIds = [...new Set(opts.holdoutScenarios.map((s) => s.id))].sort();
  if (scenarioIds.length === 0) throw new Error("compareDrivers: holdoutScenarios is empty");

  // Turn a score map into an array ordered like `scenarioIds`, refusing gaps.
  const align = (byScenario, label) => {
    const missing = scenarioIds.filter((id) => !byScenario.has(id));
    if (missing.length > 0) {
      throw new Error(
        `compareDrivers: ${label} produced no held-out score for scenario(s) [${missing.join(
          ", "
        )}] \u2014 a cell errored or its judges returned nothing. Refusing to fabricate a 0 (it would corrupt the lift comparison). Fix the dispatch/judge or drop the scenario.`
      );
    }
    return scenarioIds.map((id) => byScenario.get(id));
  };

  const baselineArr = align(
    await scoreOnHoldout(opts.baselineSurface, "compare-baseline"),
    "baseline"
  );

  // Drivers are optimized and scored one after another, in the order given.
  const winners = [];
  for (const d of opts.drivers) {
    const out = await d.optimize();
    const byScenario = await scoreOnHoldout(out.winnerSurface, `compare-${slug(d.name)}`);
    winners.push({
      name: d.name,
      winnerSurface: out.winnerSurface,
      costUsd: out.costUsd,
      durationMs: out.durationMs,
      arr: align(byScenario, `driver "${d.name}"`)
    });
  }

  const scores = winners.map((w) => {
    const boot = pairedBootstrap(baselineArr, w.arr, bootstrapOptions);
    const score = {
      name: w.name,
      baselineComposite: mean(baselineArr),
      winnerComposite: mean(w.arr),
      lift: boot.mean,
      liftCi: { low: boot.low, high: boot.high },
      costUsd: w.costUsd,
      winnerSurface: w.winnerSurface,
      rank: 0
    };
    if (w.durationMs !== void 0) score.durationMs = w.durationMs;
    return score;
  });
  scores.sort((a, b) => b.lift - a.lift || a.costUsd - b.costUsd);
  scores.forEach((s, i) => {
    s.rank = i + 1;
  });

  const best = scores[0];
  const byName = new Map(winners.map((w) => [w.name, w]));
  const bestArr = byName.get(best.name).arr;
  const pairwise = scores.slice(1).map((other) => {
    const otherArr = byName.get(other.name).arr;
    const boot = pairedBootstrap(otherArr, bestArr, bootstrapOptions);
    const favored = boot.low > 0 ? best.name : boot.high < 0 ? other.name : "tie";
    return {
      a: best.name,
      b: other.name,
      deltaMean: boot.mean,
      low: boot.low,
      high: boot.high,
      favored
    };
  });

  return { scores, best, pairwise, holdoutScenarioIds: scenarioIds };
}
759
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} xs
 * @returns {number} The mean, or 0 for an empty list.
 */
function mean(xs) {
  if (xs.length === 0) {
    return 0;
  }
  const total = xs.reduce((acc, x) => acc + x, 0);
  return total / xs.length;
}
762
/**
 * Turn a driver name into a lowercase, path-friendly token.
 *
 * @param {string} name
 * @returns {string} `name` with every run of characters outside A-Z, a-z and
 *   0-9 replaced by a single "-", then lowercased. Leading and trailing
 *   dashes are kept.
 */
function slug(name) {
  const dashed = name.replace(/[^a-z0-9]+/gi, "-");
  return dashed.toLowerCase();
}
765
/**
 * Comparison entry for the GEPA driver with parent combination turned off.
 *
 * @param {object} config - Shared optimizer configuration, forwarded to gepaEntry.
 * @param {string} [name="gepa-reflection"] - Entry name.
 * @returns {{ name: string, optimize: Function }}
 */
function gepaReflectionEntry(config, name = "gepa-reflection") {
  const combineParents = false;
  return gepaEntry(config, combineParents, name);
}
768
/**
 * Comparison entry for the GEPA driver with parent combination turned on.
 *
 * @param {object} config - Shared optimizer configuration, forwarded to gepaEntry.
 * @param {string} [name="gepa-pareto"] - Entry name.
 * @returns {{ name: string, optimize: Function }}
 */
function gepaParetoEntry(config, name = "gepa-pareto") {
  const combineParents = true;
  return gepaEntry(config, combineParents, name);
}
771
/**
 * Build a comparison entry that optimizes with `gepaDriver` through
 * `runImprovementLoop`.
 *
 * `optimize()` builds the driver, runs the loop (population 2 and 3
 * generations unless overridden, production gate with `deltaThreshold: 0`,
 * `autoOnPromote: "none"`), and reports the winner plus the summed cost of
 * the baseline campaign and every generation's surface campaigns.
 *
 * @param {object} config - Shared optimizer configuration.
 * @param {boolean} combineParents - Forwarded to `gepaDriver`.
 * @param {string} name - Entry name; its slug also names the loop's run directory.
 * @returns {{ name: string, optimize: () => Promise<{ winnerSurface: *, costUsd: number, durationMs: number }> }}
 */
function gepaEntry(config, combineParents, name) {
  const optimize = async () => {
    const started = Date.now();

    const driverOptions = {
      llm: config.llm,
      model: config.model,
      target: config.target,
      combineParents,
      ...config.mutationPrimitives ? { mutationPrimitives: config.mutationPrimitives } : {}
    };
    const driver = gepaDriver(driverOptions);

    const gate = defaultProductionGate({
      holdoutScenarios: config.holdoutScenarios,
      deltaThreshold: 0
    });
    const result = await runImprovementLoop({
      scenarios: config.trainScenarios,
      holdoutScenarios: config.holdoutScenarios,
      baselineSurface: config.baselineSurface,
      dispatchWithSurface: config.dispatchWithSurface,
      judges: config.judges,
      driver,
      populationSize: config.populationSize ?? 2,
      maxGenerations: config.maxGenerations ?? 3,
      gate,
      autoOnPromote: "none",
      runDir: `${config.runDir}/${slug(name)}-loop`,
      ...config.seed !== void 0 ? { seed: config.seed } : {}
    });

    // Sum per generation first, then across generations, then add the baseline.
    const baselineCost = result.baselineCampaign.aggregates.totalCostUsd;
    let generationsCost = 0;
    for (const generation of result.generations) {
      let generationCost = 0;
      for (const evaluated of generation.surfaces) {
        generationCost += evaluated.campaign.aggregates.totalCostUsd;
      }
      generationsCost += generationCost;
    }
    const costUsd = baselineCost + generationsCost;

    return { winnerSurface: result.winnerSurface, costUsd, durationMs: Date.now() - started };
  };

  return { name, optimize };
}
808
/**
 * Build a comparison entry that optimizes with `skillOptDriver` through
 * `runSkillOpt`.
 *
 * `optimize()` runs up to `config.maxEpochs` epochs (6 when unset) and
 * reports the winning surface, the loop's total cost and the elapsed
 * wall-clock time.
 *
 * @param {object} config - Shared optimizer configuration.
 * @param {string} [name="skill-opt"] - Entry name; its slug also names the run directory.
 * @returns {{ name: string, optimize: () => Promise<{ winnerSurface: *, costUsd: number, durationMs: number }> }}
 */
function skillOptEntry(config, name = "skill-opt") {
  const optimize = async () => {
    const started = Date.now();
    const { llm, model, target } = config;
    const driver = skillOptDriver({ llm, model, target });
    const result = await runSkillOpt({
      baselineSurface: config.baselineSurface,
      dispatchWithSurface: config.dispatchWithSurface,
      judges: config.judges,
      driver,
      trainScenarios: config.trainScenarios,
      holdoutScenarios: config.holdoutScenarios,
      maxEpochs: config.maxEpochs ?? 6,
      runDir: `${config.runDir}/${slug(name)}-loop`,
      ...config.seed !== void 0 ? { seed: config.seed } : {}
    });
    const durationMs = Date.now() - started;
    return {
      winnerSurface: result.winnerSurface,
      costUsd: result.totalCostUsd,
      durationMs
    };
  };

  return { name, optimize };
}
833
+
261
834
  // src/campaign/presets/run-profile-matrix.ts
262
835
  import { createHash as createHash2 } from "crypto";
263
836
  import { join as join2 } from "path";
@@ -272,12 +845,12 @@ function sanitize(id) {
272
845
  function sha(input) {
273
846
  return createHash2("sha256").update(JSON.stringify(input)).digest("hex");
274
847
  }
275
- function mean(xs) {
848
+ function mean2(xs) {
276
849
  return xs.length === 0 ? 0 : xs.reduce((a, b) => a + b, 0) / xs.length;
277
850
  }
278
851
  function cellComposite(cell) {
279
852
  const composites = Object.values(cell.judgeScores).map((s) => s.composite);
280
- return composites.length === 0 ? 0 : mean(composites);
853
+ return composites.length === 0 ? 0 : mean2(composites);
281
854
  }
282
855
  function buildRunRecord(args) {
283
856
  const { cell, profile, profileHash, configHash, experimentId, splitTag, commitSha, matrixId } = args;
@@ -295,7 +868,7 @@ function buildRunRecord(args) {
295
868
  if (js.notes) notes.push(`${judgeName}: ${js.notes}`);
296
869
  }
297
870
  const perDimMean = {};
298
- for (const [dim, values] of Object.entries(dimAccum)) perDimMean[dim] = mean(values);
871
+ for (const [dim, values] of Object.entries(dimAccum)) perDimMean[dim] = mean2(values);
299
872
  const outcome = splitTag === "holdout" ? { holdoutScore: composite, raw } : { searchScore: composite, raw };
300
873
  if (Object.keys(perJudge).length > 0) {
301
874
  outcome.judgeScores = {
@@ -406,7 +979,7 @@ async function runProfileMatrix(opts) {
406
979
  profileHash,
407
980
  model: profile.model,
408
981
  records: profileRecords.length,
409
- meanComposite: mean(profileRecords.map(compositeOf)),
982
+ meanComposite: mean2(profileRecords.map(compositeOf)),
410
983
  totalCostUsd: profileRecords.reduce((a, r) => a + r.costUsd, 0),
411
984
  integrity: summarizeBackendIntegrity(profileRecords)
412
985
  };
@@ -436,7 +1009,7 @@ function rollup(records, keyOf) {
436
1009
  groups.set(key, arr);
437
1010
  }
438
1011
  const out = {};
439
- for (const [key, xs] of groups) out[key] = { meanComposite: mean(xs), n: xs.length };
1012
+ for (const [key, xs] of groups) out[key] = { meanComposite: mean2(xs), n: xs.length };
440
1013
  return out;
441
1014
  }
442
1015
  function rollupByPersona(records, scenarios, personaOf) {
@@ -465,7 +1038,7 @@ function defaultGit(args, cwd) {
465
1038
  throw new WorktreeAdapterError(`git ${args.join(" ")} failed: ${stderr || String(err)}`, err);
466
1039
  }
467
1040
  }
468
- function slug(label) {
1041
+ function slug2(label) {
469
1042
  return label.toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, "").slice(0, 48) || "candidate";
470
1043
  }
471
1044
  function gitWorktreeAdapter(opts) {
@@ -474,7 +1047,7 @@ function gitWorktreeAdapter(opts) {
474
1047
  const branchPrefix = opts.branchPrefix ?? "improve";
475
1048
  return {
476
1049
  async create({ baseRef, label }) {
477
- const id = `${slug(label)}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 6)}`;
1050
+ const id = `${slug2(label)}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 6)}`;
478
1051
  const branch = `${branchPrefix}/${id}`;
479
1052
  const path = join3(worktreeDir, id);
480
1053
  git(["worktree", "add", "-b", branch, path, baseRef], opts.repoRoot);
@@ -508,8 +1081,13 @@ export {
508
1081
  FsLabeledScenarioStore,
509
1082
  LabeledScenarioStoreError,
510
1083
  ProfileMatrixError,
1084
+ SkillPatchParseError,
511
1085
  WorktreeAdapterError,
1086
+ applySkillPatch,
512
1087
  buildLoopProvenanceRecord,
1088
+ campaignBreakdown,
1089
+ campaignMeanComposite,
1090
+ compareDrivers,
513
1091
  composeGate,
514
1092
  countSentenceEdits,
515
1093
  defaultProductionGate,
@@ -519,6 +1097,8 @@ export {
519
1097
  extractH2Sections,
520
1098
  fsCampaignStorage,
521
1099
  gepaDriver,
1100
+ gepaParetoEntry,
1101
+ gepaReflectionEntry,
522
1102
  gitWorktreeAdapter,
523
1103
  heldOutGate,
524
1104
  inMemoryCampaignStorage,
@@ -526,6 +1106,8 @@ export {
526
1106
  labelTrustRank,
527
1107
  loopProvenanceSpans,
528
1108
  openAutoPr,
1109
+ parseSkillPatchResponse,
1110
+ patchEditCount,
529
1111
  provenanceRecordPath,
530
1112
  provenanceSpansPath,
531
1113
  resolveWorktreePath,
@@ -534,6 +1116,9 @@ export {
534
1116
  runImprovementLoop,
535
1117
  runOptimization,
536
1118
  runProfileMatrix,
1119
+ runSkillOpt,
1120
+ skillOptDriver,
1121
+ skillOptEntry,
537
1122
  surfaceContentHash,
538
1123
  surfaceHash
539
1124
  };