@tangle-network/agent-eval 0.62.0 → 0.63.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +34 -0
- package/dist/adapters/http.d.ts +1 -1
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/otel.d.ts +2 -2
- package/dist/campaign/index.d.ts +385 -8
- package/dist/campaign/index.js +595 -11
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-SS2SOBBT.js → chunk-4ODZXQV2.js} +89 -1
- package/dist/chunk-4ODZXQV2.js.map +1 -0
- package/dist/{chunk-CV2BS2OV.js → chunk-Z7ZU7IYZ.js} +204 -82
- package/dist/chunk-Z7ZU7IYZ.js.map +1 -0
- package/dist/contract/index.d.ts +6 -6
- package/dist/contract/index.js +2 -2
- package/dist/hosted/index.d.ts +2 -2
- package/dist/{index-DxfmYUjC.d.ts → index-GISRh500.d.ts} +1 -1
- package/dist/index.js +10 -86
- package/dist/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/dist/{provenance-CYBV9Ox6.d.ts → provenance-cUnovpWV.d.ts} +30 -10
- package/dist/rl.d.ts +1 -1
- package/dist/{types-DH22o8hM.d.ts → types-c2R2kfmv.d.ts} +30 -1
- package/package.json +1 -1
- package/dist/chunk-CV2BS2OV.js.map +0 -1
- package/dist/chunk-SS2SOBBT.js.map +0 -1
package/dist/campaign/index.js
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
import {
|
|
2
2
|
buildLoopProvenanceRecord,
|
|
3
|
+
campaignBreakdown,
|
|
4
|
+
campaignMeanComposite,
|
|
3
5
|
composeGate,
|
|
4
6
|
countSentenceEdits,
|
|
5
7
|
defaultProductionGate,
|
|
@@ -20,7 +22,7 @@ import {
|
|
|
20
22
|
runOptimization,
|
|
21
23
|
surfaceContentHash,
|
|
22
24
|
surfaceHash
|
|
23
|
-
} from "../chunk-
|
|
25
|
+
} from "../chunk-Z7ZU7IYZ.js";
|
|
24
26
|
import {
|
|
25
27
|
fsCampaignStorage,
|
|
26
28
|
inMemoryCampaignStorage,
|
|
@@ -29,7 +31,7 @@ import {
|
|
|
29
31
|
import {
|
|
30
32
|
agentProfileHash
|
|
31
33
|
} from "../chunk-PQV2TKC3.js";
|
|
32
|
-
import "../chunk-
|
|
34
|
+
import "../chunk-4ODZXQV2.js";
|
|
33
35
|
import {
|
|
34
36
|
assertRealBackend,
|
|
35
37
|
summarizeBackendIntegrity
|
|
@@ -38,16 +40,253 @@ import "../chunk-YV7J7X5N.js";
|
|
|
38
40
|
import {
|
|
39
41
|
validateRunRecord
|
|
40
42
|
} from "../chunk-F3SRAAZO.js";
|
|
41
|
-
import
|
|
43
|
+
import {
|
|
44
|
+
pairedBootstrap
|
|
45
|
+
} from "../chunk-ITBRCT73.js";
|
|
42
46
|
import "../chunk-GGE4NNQT.js";
|
|
43
47
|
import "../chunk-VSMTAMNK.js";
|
|
44
|
-
import
|
|
48
|
+
import {
|
|
49
|
+
callLlm
|
|
50
|
+
} from "../chunk-IHDHUN2X.js";
|
|
45
51
|
import "../chunk-PC4UYEBM.js";
|
|
46
52
|
import {
|
|
47
53
|
AgentEvalError
|
|
48
54
|
} from "../chunk-3BFEG2F6.js";
|
|
49
55
|
import "../chunk-PZ5AY32C.js";
|
|
50
56
|
|
|
57
|
+
// src/campaign/skill-patch.ts
|
|
58
|
+
/**
 * Apply a bundle of anchored, line-level edits to a skill document.
 *
 * The document is split on "\n" and the ops are applied in order, each one
 * against the result of the previous ops. An anchor selects the FIRST line
 * that contains the anchor text as a substring.
 *
 * Ops:
 *  - `add`: insert `text` after the line matching `after`; append to the end
 *    of the document when `after` is omitted, null, or "".
 *  - `delete`: remove the line matching `anchor`.
 *  - any other `op` value is handled as `replace`: swap the line matching
 *    `anchor` for `text` (which may span several lines).
 *
 * Ops that cannot be applied are never thrown on; they are collected in
 * `rejected` with a human-readable reason.
 *
 * @param {string} surface - Current document text.
 * @param {{ ops: Array<object> }} patch - Ops to apply, in order.
 * @returns {{ surface: string, applied: number, rejected: Array<{ op: object, reason: string }> }}
 *   The patched text, the number of ops that were applied, and the ops that were not.
 */
function applySkillPatch(surface, patch) {
  let lines = surface.split("\n");
  let applied = 0;
  const rejected = [];
  const findLine = (anchor) => lines.findIndex((line) => line.includes(anchor));
  // "" is a substring of every line, so an empty anchor would silently hit
  // line 1; a non-string anchor would be coerced by includes() and then crash
  // truncate(). Neither can identify a line, so both are rejected up front.
  const isUsableAnchor = (anchor) => typeof anchor === "string" && anchor !== "";
  const reject = (op, reason) => {
    rejected.push({ op, reason });
  };

  for (const op of patch.ops) {
    if (op.op === "add") {
      if (typeof op.text !== "string" || op.text.trim() === "") {
        reject(op, "empty add text");
        continue;
      }
      const insert = op.text.split("\n");
      // No anchor means "append to the end of the document".
      if (op.after == null || op.after === "") {
        lines = [...lines, ...insert];
        applied++;
        continue;
      }
      if (typeof op.after !== "string") {
        reject(op, "add anchor must be a string");
        continue;
      }
      const idx = findLine(op.after);
      if (idx === -1) {
        reject(op, `add anchor not found: ${truncate(op.after)}`);
        continue;
      }
      lines = [...lines.slice(0, idx + 1), ...insert, ...lines.slice(idx + 1)];
      applied++;
      continue;
    }

    // Everything that is not "add" or "delete" falls through to replace.
    const kind = op.op === "delete" ? "delete" : "replace";
    if (!isUsableAnchor(op.anchor)) {
      reject(op, `${kind} anchor missing or empty`);
      continue;
    }
    const idx = findLine(op.anchor);
    if (idx === -1) {
      reject(op, `${kind} anchor not found: ${truncate(op.anchor)}`);
      continue;
    }
    if (kind === "delete") {
      lines = [...lines.slice(0, idx), ...lines.slice(idx + 1)];
      applied++;
      continue;
    }
    if (typeof op.text !== "string") {
      reject(op, "replace text missing");
      continue;
    }
    lines = [...lines.slice(0, idx), ...op.text.split("\n"), ...lines.slice(idx + 1)];
    applied++;
  }
  return { surface: lines.join("\n"), applied, rejected };
}
|
|
106
|
+
/**
 * Number of edit operations carried by a patch.
 *
 * @param {{ ops: Array<object> }} patch
 * @returns {number} Length of `patch.ops`.
 */
function patchEditCount(patch) {
  const { ops } = patch;
  return ops.length;
}
|
|
109
|
+
/**
 * Shorten a string for display in rejection reasons.
 *
 * @param {string} s - Text to shorten.
 * @param {number} [max=48] - Maximum number of UTF-16 code units kept.
 * @returns {string} `s` unchanged when it fits, otherwise its first `max`
 *   code units followed by an ellipsis character.
 */
function truncate(s, max = 48) {
  if (s.length <= max) {
    return s;
  }
  const head = s.slice(0, max);
  return `${head}\u2026`;
}
|
|
112
|
+
|
|
113
|
+
// src/campaign/drivers/skill-opt.ts
|
|
114
|
+
// System prompt sent as the "system" message by skillOptDriver. It instructs the
// model to answer with a JSON object `{ patches: [{ label, rationale, ops }] }`
// whose ops are anchored add/delete/replace edits, and nothing outside the JSON.
var SKILLOPT_SYSTEM = 'You are a SkillOpt optimizer. You improve ONE skill document by proposing BOUNDED, anchored edits \u2014 never a full rewrite. Output ONLY a JSON object of shape {"patches":[{"label":string,"rationale":string,"ops":[op,...]}]} where each op is one of: {"op":"add","after":<exact substring of an existing line, or omit to append>,"text":<new line(s)>}, {"op":"delete","anchor":<exact substring of the line to remove>}, {"op":"replace","anchor":<exact substring of the line to replace>,"text":<replacement line(s)>}. Anchors MUST be verbatim substrings of lines that exist in the document. No prose outside JSON.';
|
|
115
|
+
/**
 * Build a "skill-opt" driver: an object that asks an LLM for small anchored
 * patches to a text skill document.
 *
 * @param {object} opts
 * @param {*} opts.llm - Passed through as the second argument of `callLlm`.
 * @param {*} opts.model - Model identifier forwarded in the LLM request.
 * @param {*} opts.target - Description interpolated into the prompt.
 * @param {number} [opts.evidenceK=3] - How many weakest scenarios/dimensions
 *   `propose` extracts from history.
 * @param {number} [opts.editBudget=3] - Max ops per patch used by `propose`.
 * @param {number} [opts.temperature=0.6]
 * @param {number} [opts.maxTokens=4000]
 * @returns {{ kind: "skill-opt", proposePatches: Function, propose: Function }}
 */
function skillOptDriver(opts) {
  const evidenceK = opts.evidenceK ?? 3;
  const defaultBudget = opts.editBudget ?? 3;

  // Ask the LLM for up to `args.count` patches of at most `args.editBudget` ops each.
  const proposePatches = async (args) => {
    const { surface, evidence, editBudget, rejectedBuffer, metaNote, count } = args;
    const userPrompt = buildPatchPrompt({
      target: opts.target,
      surface,
      evidence,
      editBudget,
      rejectedBuffer,
      metaNote,
      count
    });
    const request = {
      model: opts.model,
      messages: [
        { role: "system", content: SKILLOPT_SYSTEM },
        { role: "user", content: userPrompt }
      ],
      jsonMode: true,
      temperature: opts.temperature ?? 0.6,
      maxTokens: opts.maxTokens ?? 4e3
    };
    const { content } = await callLlm(request, opts.llm);
    return parseSkillPatchResponse(content, count, editBudget);
  };

  return {
    kind: "skill-opt",
    proposePatches,
    // Turn proposed patches into distinct candidate surfaces for a generic loop.
    async propose(ctx) {
      const surface = ctx.currentSurface;
      if (typeof surface !== "string") {
        throw new Error(
          "skillOptDriver: surface must be a string skill document (got a CodeSurface). SkillOpt patches text."
        );
      }
      const evidence = evidenceFromHistory(ctx, evidenceK);
      const patches = await proposePatches({
        surface,
        evidence,
        editBudget: defaultBudget,
        rejectedBuffer: [],
        count: ctx.populationSize,
        signal: ctx.signal
      });

      const candidates = [];
      const alreadyEmitted = /* @__PURE__ */ new Set();
      for (const patch of patches) {
        const outcome = applySkillPatch(surface, patch);
        const changedNothing = outcome.applied === 0 || outcome.surface === surface;
        // Skip no-ops and patches that yield a surface we already emitted.
        if (changedNothing || alreadyEmitted.has(outcome.surface)) continue;
        alreadyEmitted.add(outcome.surface);
        candidates.push({
          surface: outcome.surface,
          label: patch.label,
          rationale: patch.rationale
        });
        if (candidates.length >= ctx.populationSize) break;
      }
      return candidates;
    }
  };
}
|
|
174
|
+
/**
 * Derive proposal evidence from the most recent history entry.
 *
 * Picks the highest-composite candidate of the last entry in `ctx.history`
 * and reports its `k` lowest-scoring scenarios and dimensions.
 *
 * @param {{ history: Array<{ candidates: Array<object> }> }} ctx
 * @param {number} k - Maximum number of scenarios / dimensions to report.
 * @returns {{ weakScenarios: Array<object>, weakDimensions: Array<{ dimension: string, score: number }> }}
 *   Both lists are empty when there is no history or the last entry has no candidates.
 */
function evidenceFromHistory(ctx, k) {
  const nothing = () => ({ weakScenarios: [], weakDimensions: [] });

  const latest = ctx.history.at(-1);
  if (!latest || latest.candidates.length === 0) return nothing();

  // Sort a copy, best first, so the history entry itself is left untouched.
  const [top] = Array.from(latest.candidates).sort((x, y) => y.composite - x.composite);
  if (!top) return nothing();

  const scenariosWorstFirst = Array.from(top.scenarios).sort((x, y) => x.composite - y.composite);
  const dimensionsWorstFirst = Object.entries(top.dimensions).sort((x, y) => x[1] - y[1]);

  const weakDimensions = [];
  for (const [dimension, score] of dimensionsWorstFirst.slice(0, k)) {
    weakDimensions.push({ dimension, score });
  }
  return { weakScenarios: scenariosWorstFirst.slice(0, k), weakDimensions };
}
|
|
183
|
+
/**
 * Render the user prompt that asks the model for candidate patches.
 *
 * The prompt always contains the target description, the fenced current
 * document and the patch-count / edit-budget instructions. Sections for the
 * weakest scenarios, weakest dimensions, previously rejected edits and the
 * strategy note are appended only when they have content.
 *
 * @param {object} args
 * @param {*} args.target
 * @param {string} args.surface
 * @param {number} args.count
 * @param {number} args.editBudget
 * @param {{ weakScenarios: Array<{ scenarioId: string, composite: number }>, weakDimensions: Array<{ dimension: string, score: number }> }} args.evidence
 * @param {Array<{ label: string, rationale: string, reason: string }>} args.rejectedBuffer
 * @param {string} [args.metaNote]
 * @returns {string} Prompt lines joined with "\n".
 */
function buildPatchPrompt(args) {
  const prompt = [
    `Skill document governs: ${args.target}.`,
    "",
    "Current skill document:",
    "```",
    args.surface,
    "```",
    "",
    `Propose ${args.count} candidate patch(es). Each patch is a SMALL bundle of`,
    `at most ${args.editBudget} op(s). Anchors must be verbatim substrings of`,
    "existing lines. Prefer adding a specific missing rule or sharpening a vague",
    "one over deleting; never rewrite the whole document."
  ];

  // Append a blank line, a heading and its bullets — but only for non-empty sections.
  const appendSection = (heading, bullets) => {
    if (bullets.length === 0) return;
    prompt.push("", heading, ...bullets);
  };

  appendSection(
    "Weakest scenarios (patch to fix these):",
    args.evidence.weakScenarios.map((s) => `- ${s.scenarioId} (${s.composite.toFixed(2)})`)
  );
  appendSection(
    "Weakest dimensions (what to improve):",
    args.evidence.weakDimensions.map((d) => `- ${d.dimension} (${d.score.toFixed(2)})`)
  );
  appendSection(
    "Already tried and REJECTED (do not repeat or restate these edits):",
    args.rejectedBuffer.map((e) => `- ${e.label}: ${e.rationale} \u2014 ${e.reason}`)
  );

  if (args.metaNote) {
    prompt.push("", `Strategy note from prior epochs: ${args.metaNote}`);
  }
  return prompt.join("\n");
}
|
|
223
|
+
/**
 * Error thrown by parseSkillPatchResponse when the model response contains
 * no parseable JSON object. Instances report `name === "SkillPatchParseError"`.
 */
var SkillPatchParseError = class extends Error {
  // Instance-level name so logs and `err.name` checks identify this error type.
  name = "SkillPatchParseError";

  /** @param {string} message - Human-readable description of the parse failure. */
  constructor(message) {
    super(message);
  }
};
|
|
229
|
+
/**
 * Parse an LLM response into a bounded list of skill patches.
 *
 * A leading/trailing Markdown code fence is stripped, then the text between
 * the first "{" and the last "}" is parsed as JSON. Entries of `patches` that
 * are not objects, or that have no valid ops after normalization, are dropped.
 *
 * @param {string} raw - Raw model output.
 * @param {number} maxPatches - Stop after collecting this many patches.
 * @param {number} editBudget - Keep at most this many ops per patch.
 * @returns {Array<{ label: string, rationale: string, ops: Array<object> }>}
 * @throws {SkillPatchParseError} When no JSON object is found or it fails to parse.
 */
function parseSkillPatchResponse(raw, maxPatches, editBudget) {
  let body = raw.trim();
  if (body.startsWith("```")) {
    body = body.replace(/^```(?:json)?\n?/, "").replace(/\n?```$/, "");
  }

  // Tolerate prose around the object by slicing from the first "{" to the last "}".
  const open = body.indexOf("{");
  const close = body.lastIndexOf("}");
  if (open < 0 || close <= open) {
    throw new SkillPatchParseError(
      `parseSkillPatchResponse: response was not valid JSON (no object found): ${snippet(raw)}`
    );
  }

  let parsed;
  try {
    parsed = JSON.parse(body.slice(open, close + 1));
  } catch (err) {
    const detail = err instanceof Error ? err.message : String(err);
    throw new SkillPatchParseError(
      `parseSkillPatchResponse: response was not valid JSON (${detail}): ${snippet(raw)}`
    );
  }

  const candidates = Array.isArray(parsed.patches) ? parsed.patches : [];
  const patches = [];
  for (const entry of candidates) {
    if (entry === null || typeof entry !== "object") continue;

    const validOps = Array.isArray(entry.ops) ? entry.ops.map(normalizeOp).filter(isOp) : [];
    if (validOps.length === 0) continue;

    const label = typeof entry.label === "string" ? entry.label : "patch";
    const rationale = typeof entry.rationale === "string" ? entry.rationale : "";
    patches.push({ label, rationale, ops: validOps.slice(0, editBudget) });

    if (patches.length >= maxPatches) break;
  }
  return patches;
}
|
|
263
|
+
/**
 * Validate one raw op from model output and reduce it to its known fields.
 *
 * @param {*} raw - Untrusted value taken from the parsed JSON.
 * @returns {object|null} A clean `add` / `delete` / `replace` op, or `null`
 *   when `raw` is not an object, has an unknown `op`, or lacks a required
 *   string field.
 */
function normalizeOp(raw) {
  if (raw === null || typeof raw !== "object") return null;

  const { op: kind, text, anchor, after } = raw;
  const isText = (value) => typeof value === "string";

  switch (kind) {
    case "add":
      if (!isText(text)) return null;
      // `after` is optional; it is copied only when it is a string.
      return isText(after) ? { op: "add", text, after } : { op: "add", text };
    case "delete":
      return isText(anchor) ? { op: "delete", anchor } : null;
    case "replace":
      return isText(anchor) && isText(text) ? { op: "replace", anchor, text } : null;
    default:
      return null;
  }
}
|
|
282
|
+
/**
 * Filter predicate that drops the `null` results of normalizeOp.
 *
 * @param {*} op
 * @returns {boolean} `false` only for `null`.
 */
function isOp(op) {
  return !(op === null);
}
|
|
285
|
+
/**
 * Produce a short single-line excerpt of a string for error messages.
 *
 * @param {string} s - Text to excerpt.
 * @param {number} [max=120] - Maximum number of UTF-16 code units kept.
 * @returns {string} `s` trimmed with every whitespace run collapsed to one
 *   space, cut to `max` code units plus an ellipsis when longer.
 */
function snippet(s, max = 120) {
  const flattened = s.trim().replace(/\s+/g, " ");
  if (flattened.length <= max) {
    return flattened;
  }
  return `${flattened.slice(0, max)}\u2026`;
}
|
|
289
|
+
|
|
51
290
|
// src/campaign/labeled-store/fs-adapter.ts
|
|
52
291
|
import { createHash } from "crypto";
|
|
53
292
|
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
@@ -259,6 +498,339 @@ function appendLine(path, line) {
|
|
|
259
498
|
}
|
|
260
499
|
}
|
|
261
500
|
|
|
501
|
+
// src/campaign/presets/run-skill-opt.ts
|
|
502
|
+
/**
 * Run the SkillOpt loop: repeatedly ask the driver for small patches to a
 * skill document and keep a patch only when it raises the held-out score.
 *
 * Per epoch the driver proposes patches from train-split evidence; each
 * applicable patch is scored on the held-out scenarios, and the first one
 * whose score exceeds `current + minImprovement` is accepted (the rest of
 * that epoch's proposals are not evaluated). Rejections are kept in a bounded
 * buffer that is fed back to the driver.
 *
 * @param {object} opts - Also spread into every scoring campaign via runScoringCampaign.
 * @param {Array<{ id: * }>} opts.trainScenarios - Non-empty; used only for evidence.
 * @param {Array<{ id: * }>} opts.holdoutScenarios - Non-empty; disjoint from train by `id`.
 * @param {Array} opts.judges - At least one judge.
 * @param {string} opts.baselineSurface - Starting document.
 * @param {{ proposePatches: Function }} opts.driver
 * @param {number} opts.maxEpochs - Upper bound on epochs.
 * @param {number} [opts.patchesPerEpoch=2]
 * @param {number} [opts.editBudget=3] - Initial max ops per patch.
 * @param {number} [opts.minImprovement=0] - Required held-out margin; must be >= 0.
 * @param {number} [opts.patience=opts.maxEpochs] - Stop after this many epochs in a row without an accept.
 * @param {boolean} [opts.budgetAnneal=true] - Shrink the edit budget after two or more epochs without an accept.
 * @param {number} [opts.rejectedBufferSize=12]
 * @param {number} [opts.slowMetaEvery=2] - Rebuild the strategy note every N epochs; 0 disables it.
 * @param {number} [opts.evidenceK=3]
 * @param {AbortSignal} [opts.signal] - Forwarded to the driver.
 * @returns {Promise<object>} Winner surface, baseline/winner held-out composites, lift,
 *   accepted and rejected edits, epochs run, per-epoch history and accumulated cost.
 * @throws {Error} On empty splits, missing judges, overlapping splits, or negative `minImprovement`.
 */
async function runSkillOpt(opts) {
  if (opts.trainScenarios.length === 0) throw new Error("runSkillOpt: trainScenarios is empty");
  if (opts.holdoutScenarios.length === 0) throw new Error("runSkillOpt: holdoutScenarios is empty");
  if (!opts.judges || opts.judges.length === 0) {
    throw new Error(
      "runSkillOpt: at least one judge is required \u2014 scoring (and therefore acceptance) is meaningless without one, and would report a silent zero lift."
    );
  }
  const holdoutIds = new Set(opts.holdoutScenarios.map((s) => s.id));
  const overlap = opts.trainScenarios.filter((s) => holdoutIds.has(s.id)).map((s) => s.id);
  if (overlap.length > 0) {
    throw new Error(
      `runSkillOpt: trainScenarios and holdoutScenarios must be disjoint (overlap: [${overlap.join(
        ", "
      )}]) \u2014 a shared scenario leaks the held-out acceptance axis into the proposal evidence.`
    );
  }

  const patchesPerEpoch = opts.patchesPerEpoch ?? 2;
  const initialBudget = opts.editBudget ?? 3;
  const minImprovement = opts.minImprovement ?? 0;
  if (minImprovement < 0) {
    throw new Error(
      "runSkillOpt: minImprovement must be >= 0 \u2014 a negative threshold would accept held-out regressions, breaking the monotonic-lift contract."
    );
  }
  const patience = opts.patience ?? opts.maxEpochs;
  const budgetAnneal = opts.budgetAnneal ?? true;
  const rejectedBufferSize = opts.rejectedBufferSize ?? 12;
  const slowMetaEvery = opts.slowMetaEvery ?? 2;
  const evidenceK = opts.evidenceK ?? 3;

  // Every scoring campaign, train or held-out, adds to the reported cost.
  let totalCostUsd = 0;
  const scoreHoldout = async (surface, tag) => {
    const campaign = await runScoringCampaign(opts, opts.holdoutScenarios, surface, tag);
    totalCostUsd += campaign.aggregates.totalCostUsd;
    return campaignMeanComposite(campaign);
  };
  const trainEvidence = async (surface, tag) => {
    const campaign = await runScoringCampaign(opts, opts.trainScenarios, surface, tag);
    totalCostUsd += campaign.aggregates.totalCostUsd;
    return toEvidence(campaign, evidenceK);
  };

  // With a positive margin a candidate can beat `current` and still be
  // rejected, so "candidate ≤ current" would be a false statement in the
  // rejected buffer that is fed back to the optimizer. The message is
  // unchanged when the margin is 0.
  const rejectionReason = (candidateScore, currentScore) =>
    minImprovement > 0
      ? `held-out ${candidateScore.toFixed(3)} did not exceed current ${currentScore.toFixed(3)} by more than ${minImprovement}`
      : `held-out ${candidateScore.toFixed(3)} \u2264 current ${currentScore.toFixed(3)}`;

  let current = opts.baselineSurface;
  let currentEvidence = await trainEvidence(current, "baseline-train");
  const baselineHoldout = await scoreHoldout(current, "baseline-holdout");
  let currentHoldout = baselineHoldout;

  const buffer = [];
  const acceptedEdits = [];
  const rejectedAll = [];
  const history = [];
  let budget = initialBudget;
  let sinceAccept = 0;
  let metaNote;
  let epochsRun = 0;

  for (let epoch = 0; epoch < opts.maxEpochs; epoch++) {
    epochsRun++;
    const patches = await opts.driver.proposePatches({
      surface: current,
      evidence: currentEvidence,
      editBudget: budget,
      rejectedBuffer: buffer,
      metaNote,
      count: patchesPerEpoch,
      signal: opts.signal ?? new AbortController().signal
    });

    let accepted = null;
    const rejectedThisEpoch = [];
    for (let i = 0; i < patches.length; i++) {
      const patch = patches[i];
      const { surface: candidate, applied } = applySkillPatch(current, patch);
      if (applied === 0 || candidate === current) {
        // Nothing changed, so there is nothing worth paying to score.
        rejectedThisEpoch.push({
          label: patch.label,
          rationale: patch.rationale,
          reason: "no-op (unanchored or zero-change)"
        });
        continue;
      }
      const candidateHoldout = await scoreHoldout(candidate, `epoch-${epoch}-cand-${i}-holdout`);
      if (candidateHoldout > currentHoldout + minImprovement) {
        accepted = {
          epoch,
          label: patch.label,
          rationale: patch.rationale,
          holdoutDelta: candidateHoldout - currentHoldout
        };
        current = candidate;
        currentHoldout = candidateHoldout;
        // Evidence must describe the new surface before the next proposal round.
        currentEvidence = await trainEvidence(current, `epoch-${epoch}-train`);
        break;
      }
      rejectedThisEpoch.push({
        label: patch.label,
        rationale: patch.rationale,
        reason: rejectionReason(candidateHoldout, currentHoldout)
      });
    }

    if (accepted) {
      acceptedEdits.push(accepted);
      sinceAccept = 0;
    } else {
      sinceAccept++;
      if (budgetAnneal && sinceAccept >= 2 && budget > 1) budget--;
    }

    for (const r of rejectedThisEpoch) {
      buffer.push(r);
      rejectedAll.push(r);
    }
    // Keep only the most recent rejections in the feedback buffer.
    while (buffer.length > rejectedBufferSize) buffer.shift();

    if (slowMetaEvery > 0 && (epoch + 1) % slowMetaEvery === 0) {
      metaNote = buildMetaNote(acceptedEdits, buffer);
    }

    // NOTE(review): `budget` is read after annealing, so on an epoch that
    // shrank it this records the budget for the NEXT epoch — confirm intended.
    history.push({
      epoch,
      editBudget: budget,
      proposed: patches.length,
      accepted,
      rejected: rejectedThisEpoch,
      holdoutComposite: currentHoldout
    });
    if (sinceAccept >= patience) break;
  }

  return {
    winnerSurface: current,
    baselineHoldoutComposite: baselineHoldout,
    winnerHoldoutComposite: currentHoldout,
    lift: currentHoldout - baselineHoldout,
    acceptedEdits,
    rejectedEdits: rejectedAll,
    epochsRun,
    history,
    totalCostUsd
  };
}
|
|
635
|
+
/**
 * Start a campaign that scores one fixed surface on the given scenarios.
 *
 * All of `opts` is forwarded to `runCampaign`, with `scenarios`, `dispatch`
 * and `runDir` overridden: dispatch is bound to `surface`, and the run
 * directory becomes `<opts.runDir>/<tag>`.
 *
 * @param {object} opts - Must provide `dispatchWithSurface` and `runDir`.
 * @param {Array} scenarios
 * @param {*} surface - Surface handed to `opts.dispatchWithSurface` on every dispatch.
 * @param {string} tag - Sub-directory name for this campaign.
 * @returns {*} Whatever `runCampaign` returns.
 */
function runScoringCampaign(opts, scenarios, surface, tag) {
  const dispatch = (scenario, ctx) => opts.dispatchWithSurface(surface, scenario, ctx);
  const runDir = `${opts.runDir}/${tag}`;
  return runCampaign({ ...opts, scenarios, dispatch, runDir });
}
|
|
643
|
+
/**
 * Reduce a scored campaign to the evidence handed to the patch driver.
 *
 * @param {*} campaign - Passed to `campaignBreakdown`, which must yield
 *   `{ dimensions, scenarios }`.
 * @param {number} k - Maximum number of scenarios / dimensions to keep.
 * @returns {{ weakScenarios: Array<object>, weakDimensions: Array<{ dimension: string, score: number }> }}
 *   The `k` lowest-composite scenarios and the `k` lowest-scoring dimensions, worst first.
 */
function toEvidence(campaign, k) {
  const breakdown = campaignBreakdown(campaign);

  // Sort copies so the breakdown itself is left untouched.
  const scenariosWorstFirst = Array.from(breakdown.scenarios).sort((x, y) => x.composite - y.composite);
  const dimensionsWorstFirst = Object.entries(breakdown.dimensions).sort((x, y) => x[1] - y[1]);

  const weakDimensions = [];
  for (const [dimension, score] of dimensionsWorstFirst.slice(0, k)) {
    weakDimensions.push({ dimension, score });
  }
  return { weakScenarios: scenariosWorstFirst.slice(0, k), weakDimensions };
}
|
|
649
|
+
/**
 * Compose the strategy note passed to the driver as `metaNote`.
 *
 * @param {Array<{ label: string, holdoutDelta: number }>} accepted - Edits accepted so far.
 * @param {Array<{ label: string }>} rejected - Current rejected buffer.
 * @returns {string} One to three sentences joined by spaces: what worked,
 *   up to five distinct rejected labels to avoid, and a standing reminder.
 */
function buildMetaNote(accepted, rejected) {
  const sentences = [];

  if (accepted.length > 0) {
    const wins = accepted
      .map((edit) => `"${edit.label}" (+${edit.holdoutDelta.toFixed(3)})`)
      .join("; ");
    sentences.push(`Edits that improved held-out so far: ${wins}. Build on these.`);
  }

  if (rejected.length > 0) {
    // De-duplicate labels, keeping first-seen order, and cap the list at five.
    const distinctLabels = Array.from(new Set(rejected.map((edit) => edit.label)));
    const deadEnds = distinctLabels.slice(0, 5).join(", ");
    sentences.push(`Dead ends to avoid: ${deadEnds}. Try a different anchor or rule.`);
  }

  sentences.push("Keep edits small and anchored to existing lines.");
  return sentences.join(" ");
}
|
|
663
|
+
|
|
664
|
+
// src/campaign/presets/compare-drivers.ts
|
|
665
|
+
/**
 * Run several optimizer drivers and rank their winners on a shared held-out set.
 *
 * The baseline surface and each driver's winner surface are scored on
 * `opts.holdoutScenarios`; per-scenario composites are aligned by sorted
 * scenario id and handed to `pairedBootstrap` to estimate each driver's lift
 * over the baseline. Drivers are ranked by lift (ties broken by lower cost),
 * and every other driver is compared pairwise against the best one.
 *
 * @param {object} opts - Also spread into every scoring campaign.
 * @param {Array<{ name: string, optimize: Function }>} opts.drivers - Non-empty, unique names.
 * @param {Array<{ id: * }>} opts.holdoutScenarios - Non-empty.
 * @param {*} opts.baselineSurface
 * @param {Function} opts.dispatchWithSurface
 * @param {string} opts.runDir
 * @param {number} [opts.seed=42]
 * @param {number} [opts.resamples=2000]
 * @param {number} [opts.confidence=0.95]
 * @returns {Promise<{ scores: Array<object>, best: object, pairwise: Array<object>, holdoutScenarioIds: Array }>}
 * @throws {Error} When there are no drivers, driver names repeat, the held-out
 *   set is empty, or any surface yields no score for a held-out scenario.
 */
async function compareDrivers(opts) {
  if (opts.drivers.length === 0) throw new Error("compareDrivers: no drivers to compare");

  // Names key both the per-driver run directory and the pairwise lookup
  // below, so a repeated name would silently pair the wrong score arrays.
  const seenNames = new Set();
  const duplicateNames = new Set();
  for (const { name } of opts.drivers) {
    (seenNames.has(name) ? duplicateNames : seenNames).add(name);
  }
  if (duplicateNames.size > 0) {
    throw new Error(
      `compareDrivers: driver names must be unique (duplicate: [${[...duplicateNames].join(
        ", "
      )}]) \u2014 a repeated name would share a run directory and corrupt the pairwise comparison.`
    );
  }

  const seed = opts.seed ?? 42;
  const resamples = opts.resamples ?? 2e3;
  const confidence = opts.confidence ?? 0.95;
  const bootstrapOptions = { seed, resamples, confidence, statistic: "mean" };

  // Score one surface on the held-out scenarios; returns scenarioId -> composite.
  const scoreOnHoldout = async (surface, tag) => {
    const campaign = await runCampaign({
      ...opts,
      scenarios: opts.holdoutScenarios,
      dispatch: (scenario, ctx) => opts.dispatchWithSurface(surface, scenario, ctx),
      runDir: `${opts.runDir}/${tag}`
    });
    const byScenario = {};
    for (const { scenarioId, composite } of campaignBreakdown(campaign).scenarios) {
      byScenario[scenarioId] = composite;
    }
    return byScenario;
  };

  const scenarioIds = [...new Set(opts.holdoutScenarios.map((s) => s.id))].sort();
  if (scenarioIds.length === 0) throw new Error("compareDrivers: holdoutScenarios is empty");

  // Turn a score map into an array in `scenarioIds` order so arrays are pairable.
  const align = (byScenario, label) => {
    const missing = scenarioIds.filter((id) => !(id in byScenario));
    if (missing.length > 0) {
      throw new Error(
        `compareDrivers: ${label} produced no held-out score for scenario(s) [${missing.join(
          ", "
        )}] \u2014 a cell errored or its judges returned nothing. Refusing to fabricate a 0 (it would corrupt the lift comparison). Fix the dispatch/judge or drop the scenario.`
      );
    }
    return scenarioIds.map((id) => byScenario[id]);
  };

  const baselineArr = align(
    await scoreOnHoldout(opts.baselineSurface, "compare-baseline"),
    "baseline"
  );

  // Drivers are optimized and scored one at a time, in the order given.
  const winners = [];
  for (const d of opts.drivers) {
    const out = await d.optimize();
    const byScenario = await scoreOnHoldout(out.winnerSurface, `compare-${slug(d.name)}`);
    winners.push({
      name: d.name,
      winnerSurface: out.winnerSurface,
      costUsd: out.costUsd,
      durationMs: out.durationMs,
      arr: align(byScenario, `driver "${d.name}"`)
    });
  }

  const scores = winners.map((w) => {
    // presumably the bootstrap delta is winner minus baseline — confirm against pairedBootstrap
    const boot = pairedBootstrap(baselineArr, w.arr, bootstrapOptions);
    const score = {
      name: w.name,
      baselineComposite: mean(baselineArr),
      winnerComposite: mean(w.arr),
      lift: boot.mean,
      liftCi: { low: boot.low, high: boot.high },
      costUsd: w.costUsd,
      winnerSurface: w.winnerSurface,
      rank: 0
    };
    if (w.durationMs !== void 0) score.durationMs = w.durationMs;
    return score;
  });

  // Highest lift first; equal lift falls back to the cheaper driver.
  scores.sort((a, b) => b.lift - a.lift || a.costUsd - b.costUsd);
  scores.forEach((s, i) => {
    s.rank = i + 1;
  });

  const best = scores[0];
  const byName = new Map(winners.map((w) => [w.name, w]));
  const bestArr = byName.get(best.name).arr;
  const pairwise = scores.slice(1).map((other) => {
    const otherArr = byName.get(other.name).arr;
    const boot = pairedBootstrap(otherArr, bestArr, bootstrapOptions);
    // Only an interval that excludes 0 names a winner; otherwise it is a tie.
    const favored = boot.low > 0 ? best.name : boot.high < 0 ? other.name : "tie";
    return {
      a: best.name,
      b: other.name,
      deltaMean: boot.mean,
      low: boot.low,
      high: boot.high,
      favored
    };
  });

  return { scores, best, pairwise, holdoutScenarioIds: scenarioIds };
}
|
|
759
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} xs
 * @returns {number} The mean, or 0 for an empty list.
 */
function mean(xs) {
  if (xs.length === 0) {
    return 0;
  }
  const total = xs.reduce((running, x) => running + x, 0);
  return total / xs.length;
}
|
|
762
|
+
/**
 * Make a driver name safe to embed in a run-directory path.
 *
 * @param {string} name
 * @returns {string} `name` with every run of characters outside a-z, A-Z and
 *   0-9 replaced by a single "-", then lower-cased.
 */
function slug(name) {
  const dashed = name.replace(/[^a-z0-9]+/gi, "-");
  return dashed.toLowerCase();
}
|
|
765
|
+
/**
 * Comparison entry for the GEPA driver with `combineParents` turned off.
 *
 * @param {object} config - Forwarded unchanged to gepaEntry.
 * @param {string} [name="gepa-reflection"] - Entry name.
 * @returns {*} The entry built by gepaEntry.
 */
function gepaReflectionEntry(config, name = "gepa-reflection") {
  const combineParents = false;
  return gepaEntry(config, combineParents, name);
}
|
|
768
|
+
/**
 * Comparison entry for the GEPA driver with `combineParents` turned on.
 *
 * @param {object} config - Forwarded unchanged to gepaEntry.
 * @param {string} [name="gepa-pareto"] - Entry name.
 * @returns {*} The entry built by gepaEntry.
 */
function gepaParetoEntry(config, name = "gepa-pareto") {
  const combineParents = true;
  return gepaEntry(config, combineParents, name);
}
|
|
771
|
+
/**
 * Build a named comparison entry that optimizes with the GEPA driver.
 *
 * `optimize()` creates a gepaDriver, runs `runImprovementLoop` with a
 * zero-threshold production gate and no auto-promotion, and reports the
 * winner surface together with the summed campaign cost and wall-clock time.
 *
 * @param {object} config - Shared setup (llm, model, target, scenarios,
 *   baselineSurface, dispatchWithSurface, judges, runDir, and optional
 *   mutationPrimitives, populationSize, maxGenerations, seed).
 * @param {boolean} combineParents - Forwarded to gepaDriver.
 * @param {string} name - Entry name; its slug also names the loop's run directory.
 * @returns {{ name: string, optimize: () => Promise<{ winnerSurface: *, costUsd: number, durationMs: number }> }}
 */
function gepaEntry(config, combineParents, name) {
  return {
    name,
    async optimize() {
      const started = Date.now();

      const driverOptions = {
        llm: config.llm,
        model: config.model,
        target: config.target,
        combineParents
      };
      // Only forward mutation primitives when the caller supplied some.
      if (config.mutationPrimitives) {
        driverOptions.mutationPrimitives = config.mutationPrimitives;
      }
      const driver = gepaDriver(driverOptions);

      const loopOptions = {
        scenarios: config.trainScenarios,
        holdoutScenarios: config.holdoutScenarios,
        baselineSurface: config.baselineSurface,
        dispatchWithSurface: config.dispatchWithSurface,
        judges: config.judges,
        driver,
        populationSize: config.populationSize ?? 2,
        maxGenerations: config.maxGenerations ?? 3,
        gate: defaultProductionGate({
          holdoutScenarios: config.holdoutScenarios,
          deltaThreshold: 0
        }),
        autoOnPromote: "none",
        runDir: `${config.runDir}/${slug(name)}-loop`
      };
      if (config.seed !== void 0) {
        loopOptions.seed = config.seed;
      }
      const result = await runImprovementLoop(loopOptions);

      // Cost = baseline campaign + every candidate campaign of every generation.
      let generationsCost = 0;
      for (const generation of result.generations) {
        let generationCost = 0;
        for (const candidate of generation.surfaces) {
          generationCost += candidate.campaign.aggregates.totalCostUsd;
        }
        generationsCost += generationCost;
      }
      const costUsd = result.baselineCampaign.aggregates.totalCostUsd + generationsCost;

      return { winnerSurface: result.winnerSurface, costUsd, durationMs: Date.now() - started };
    }
  };
}
|
|
808
|
+
/**
 * Build a named comparison entry that optimizes with the SkillOpt loop.
 *
 * `optimize()` creates a skillOptDriver from the config's llm/model/target,
 * runs `runSkillOpt`, and reports the winner surface with the loop's total
 * cost and the wall-clock time.
 *
 * @param {object} config - Shared setup (llm, model, target, scenarios,
 *   baselineSurface, dispatchWithSurface, judges, runDir, and optional
 *   maxEpochs, seed).
 * @param {string} [name="skill-opt"] - Entry name; its slug also names the run directory.
 * @returns {{ name: string, optimize: () => Promise<{ winnerSurface: *, costUsd: number, durationMs: number }> }}
 */
function skillOptEntry(config, name = "skill-opt") {
  return {
    name,
    async optimize() {
      const started = Date.now();
      const { llm, model, target } = config;
      const driver = skillOptDriver({ llm, model, target });

      const loopOptions = {
        baselineSurface: config.baselineSurface,
        dispatchWithSurface: config.dispatchWithSurface,
        judges: config.judges,
        driver,
        trainScenarios: config.trainScenarios,
        holdoutScenarios: config.holdoutScenarios,
        maxEpochs: config.maxEpochs ?? 6,
        runDir: `${config.runDir}/${slug(name)}-loop`
      };
      // The seed key is added only when the caller supplied one.
      if (config.seed !== void 0) {
        loopOptions.seed = config.seed;
      }
      const { winnerSurface, totalCostUsd } = await runSkillOpt(loopOptions);

      return {
        winnerSurface,
        costUsd: totalCostUsd,
        durationMs: Date.now() - started
      };
    }
  };
}
|
|
833
|
+
|
|
262
834
|
// src/campaign/presets/run-profile-matrix.ts
|
|
263
835
|
import { createHash as createHash2 } from "crypto";
|
|
264
836
|
import { join as join2 } from "path";
|
|
@@ -273,12 +845,12 @@ function sanitize(id) {
|
|
|
273
845
|
/**
 * SHA-256 fingerprint of a JSON-serializable value.
 *
 * @param {*} input - Serialized with `JSON.stringify` before hashing.
 * @returns {string} Lower-case hex digest.
 */
function sha(input) {
  const serialized = JSON.stringify(input);
  const hasher = createHash2("sha256");
  hasher.update(serialized);
  return hasher.digest("hex");
}
|
|
276
|
-
function
|
|
848
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} xs
 * @returns {number} The mean, or 0 for an empty list.
 */
function mean2(xs) {
  const count = xs.length;
  if (count === 0) return 0;
  const total = xs.reduce((running, x) => running + x, 0);
  return total / count;
}
|
|
279
851
|
/**
 * Composite score of one matrix cell.
 *
 * @param {{ judgeScores: Object<string, { composite: number }> }} cell
 * @returns {number} Mean of the judges' `composite` values, or 0 when the
 *   cell has no judge scores.
 */
function cellComposite(cell) {
  const perJudge = Object.values(cell.judgeScores).map((judgeScore) => judgeScore.composite);
  if (perJudge.length === 0) {
    return 0;
  }
  return mean2(perJudge);
}
|
|
283
855
|
function buildRunRecord(args) {
|
|
284
856
|
const { cell, profile, profileHash, configHash, experimentId, splitTag, commitSha, matrixId } = args;
|
|
@@ -296,7 +868,7 @@ function buildRunRecord(args) {
|
|
|
296
868
|
if (js.notes) notes.push(`${judgeName}: ${js.notes}`);
|
|
297
869
|
}
|
|
298
870
|
const perDimMean = {};
|
|
299
|
-
for (const [dim, values] of Object.entries(dimAccum)) perDimMean[dim] =
|
|
871
|
+
for (const [dim, values] of Object.entries(dimAccum)) perDimMean[dim] = mean2(values);
|
|
300
872
|
const outcome = splitTag === "holdout" ? { holdoutScore: composite, raw } : { searchScore: composite, raw };
|
|
301
873
|
if (Object.keys(perJudge).length > 0) {
|
|
302
874
|
outcome.judgeScores = {
|
|
@@ -407,7 +979,7 @@ async function runProfileMatrix(opts) {
|
|
|
407
979
|
profileHash,
|
|
408
980
|
model: profile.model,
|
|
409
981
|
records: profileRecords.length,
|
|
410
|
-
meanComposite:
|
|
982
|
+
meanComposite: mean2(profileRecords.map(compositeOf)),
|
|
411
983
|
totalCostUsd: profileRecords.reduce((a, r) => a + r.costUsd, 0),
|
|
412
984
|
integrity: summarizeBackendIntegrity(profileRecords)
|
|
413
985
|
};
|
|
@@ -437,7 +1009,7 @@ function rollup(records, keyOf) {
|
|
|
437
1009
|
groups.set(key, arr);
|
|
438
1010
|
}
|
|
439
1011
|
const out = {};
|
|
440
|
-
for (const [key, xs] of groups) out[key] = { meanComposite:
|
|
1012
|
+
for (const [key, xs] of groups) out[key] = { meanComposite: mean2(xs), n: xs.length };
|
|
441
1013
|
return out;
|
|
442
1014
|
}
|
|
443
1015
|
function rollupByPersona(records, scenarios, personaOf) {
|
|
@@ -466,7 +1038,7 @@ function defaultGit(args, cwd) {
|
|
|
466
1038
|
throw new WorktreeAdapterError(`git ${args.join(" ")} failed: ${stderr || String(err)}`, err);
|
|
467
1039
|
}
|
|
468
1040
|
}
|
|
469
|
-
function
|
|
1041
|
+
/**
 * Turn a candidate label into a short identifier fragment.
 *
 * @param {string} label
 * @returns {string} Lower-cased label with runs of characters outside a-z and
 *   0-9 collapsed to "-", leading/trailing dashes removed, cut to 48
 *   characters; "candidate" when nothing is left.
 */
function slug2(label) {
  const dashed = label.toLowerCase().replace(/[^a-z0-9]+/g, "-");
  const trimmed = dashed.replace(/^-+|-+$/g, "");
  const shortened = trimmed.slice(0, 48);
  return shortened || "candidate";
}
|
|
472
1044
|
function gitWorktreeAdapter(opts) {
|
|
@@ -475,7 +1047,7 @@ function gitWorktreeAdapter(opts) {
|
|
|
475
1047
|
const branchPrefix = opts.branchPrefix ?? "improve";
|
|
476
1048
|
return {
|
|
477
1049
|
async create({ baseRef, label }) {
|
|
478
|
-
const id = `${
|
|
1050
|
+
const id = `${slug2(label)}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 6)}`;
|
|
479
1051
|
const branch = `${branchPrefix}/${id}`;
|
|
480
1052
|
const path = join3(worktreeDir, id);
|
|
481
1053
|
git(["worktree", "add", "-b", branch, path, baseRef], opts.repoRoot);
|
|
@@ -509,8 +1081,13 @@ export {
|
|
|
509
1081
|
FsLabeledScenarioStore,
|
|
510
1082
|
LabeledScenarioStoreError,
|
|
511
1083
|
ProfileMatrixError,
|
|
1084
|
+
SkillPatchParseError,
|
|
512
1085
|
WorktreeAdapterError,
|
|
1086
|
+
applySkillPatch,
|
|
513
1087
|
buildLoopProvenanceRecord,
|
|
1088
|
+
campaignBreakdown,
|
|
1089
|
+
campaignMeanComposite,
|
|
1090
|
+
compareDrivers,
|
|
514
1091
|
composeGate,
|
|
515
1092
|
countSentenceEdits,
|
|
516
1093
|
defaultProductionGate,
|
|
@@ -520,6 +1097,8 @@ export {
|
|
|
520
1097
|
extractH2Sections,
|
|
521
1098
|
fsCampaignStorage,
|
|
522
1099
|
gepaDriver,
|
|
1100
|
+
gepaParetoEntry,
|
|
1101
|
+
gepaReflectionEntry,
|
|
523
1102
|
gitWorktreeAdapter,
|
|
524
1103
|
heldOutGate,
|
|
525
1104
|
inMemoryCampaignStorage,
|
|
@@ -527,6 +1106,8 @@ export {
|
|
|
527
1106
|
labelTrustRank,
|
|
528
1107
|
loopProvenanceSpans,
|
|
529
1108
|
openAutoPr,
|
|
1109
|
+
parseSkillPatchResponse,
|
|
1110
|
+
patchEditCount,
|
|
530
1111
|
provenanceRecordPath,
|
|
531
1112
|
provenanceSpansPath,
|
|
532
1113
|
resolveWorktreePath,
|
|
@@ -535,6 +1116,9 @@ export {
|
|
|
535
1116
|
runImprovementLoop,
|
|
536
1117
|
runOptimization,
|
|
537
1118
|
runProfileMatrix,
|
|
1119
|
+
runSkillOpt,
|
|
1120
|
+
skillOptDriver,
|
|
1121
|
+
skillOptEntry,
|
|
538
1122
|
surfaceContentHash,
|
|
539
1123
|
surfaceHash
|
|
540
1124
|
};
|