@tangle-network/agent-eval 0.61.0 → 0.63.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +48 -8
- package/dist/adapters/http.d.ts +4 -1
- package/dist/adapters/langchain.d.ts +4 -1
- package/dist/adapters/otel.d.ts +4 -4
- package/dist/{agent-profile-9J9hxdm2.d.ts → agent-profile-DzcPHR1Z.d.ts} +1 -1
- package/dist/benchmarks/index.d.ts +2 -2
- package/dist/campaign/index.d.ts +388 -11
- package/dist/campaign/index.js +597 -12
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-GMXHLSLL.js → chunk-4ODZXQV2.js} +81 -98
- package/dist/chunk-4ODZXQV2.js.map +1 -0
- package/dist/{chunk-OLULBECP.js → chunk-7TPYV2ER.js} +27 -1
- package/dist/chunk-7TPYV2ER.js.map +1 -0
- package/dist/chunk-E22YUOAL.js +111 -0
- package/dist/chunk-E22YUOAL.js.map +1 -0
- package/dist/{chunk-SUGME4OT.js → chunk-Z7ZU7IYZ.js} +209 -85
- package/dist/chunk-Z7ZU7IYZ.js.map +1 -0
- package/dist/contract/index.d.ts +9 -9
- package/dist/contract/index.js +4 -3
- package/dist/contract/index.js.map +1 -1
- package/dist/{control-Bf8owbuG.d.ts → control-DxvZeV5X.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/hosted/index.d.ts +4 -4
- package/dist/{index-Bvk35ils.d.ts → index-DsnOpCO6.d.ts} +1 -1
- package/dist/{index-D9dwa00f.d.ts → index-GISRh500.d.ts} +2 -2
- package/dist/index.d.ts +98 -14
- package/dist/index.js +331 -128
- package/dist/index.js.map +1 -1
- package/dist/meta-eval/index.d.ts +2 -2
- package/dist/multishot/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/dist/{provenance-D0WeCXt1.d.ts → provenance-cUnovpWV.d.ts} +42 -11
- package/dist/{registry-qmbYT3Eo.d.ts → registry-DPly4_hZ.d.ts} +1 -1
- package/dist/{release-report-DszkgvJ3.d.ts → release-report-DGoeObZT.d.ts} +2 -2
- package/dist/reporting.d.ts +4 -4
- package/dist/{researcher-BaVsy0sW.d.ts → researcher-WJvIpX3L.d.ts} +2 -2
- package/dist/rl.d.ts +6 -6
- package/dist/{rubric-predictive-validity-DgBHWsh7.d.ts → rubric-predictive-validity-D_4BSXGV.d.ts} +1 -1
- package/dist/{run-campaign-HXPJAUZ3.js → run-campaign-5J3ED2UJ.js} +3 -2
- package/dist/{run-record-DgUVo5pw.d.ts → run-record-BgTFzO2r.d.ts} +1 -1
- package/dist/{summary-report-BQvXpvaR.d.ts → summary-report-ByiOUrHj.d.ts} +1 -1
- package/dist/{types-Beb6KPqZ.d.ts → types-c2R2kfmv.d.ts} +45 -12
- package/package.json +1 -1
- package/dist/chunk-GMXHLSLL.js.map +0 -1
- package/dist/chunk-OLULBECP.js.map +0 -1
- package/dist/chunk-SUGME4OT.js.map +0 -1
- /package/dist/{run-campaign-HXPJAUZ3.js.map → run-campaign-5J3ED2UJ.js.map} +0 -0
package/dist/campaign/index.js
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
import {
|
|
2
2
|
buildLoopProvenanceRecord,
|
|
3
|
+
campaignBreakdown,
|
|
4
|
+
campaignMeanComposite,
|
|
3
5
|
composeGate,
|
|
4
6
|
countSentenceEdits,
|
|
5
7
|
defaultProductionGate,
|
|
@@ -20,33 +22,271 @@ import {
|
|
|
20
22
|
runOptimization,
|
|
21
23
|
surfaceContentHash,
|
|
22
24
|
surfaceHash
|
|
23
|
-
} from "../chunk-
|
|
25
|
+
} from "../chunk-Z7ZU7IYZ.js";
|
|
24
26
|
import {
|
|
25
27
|
fsCampaignStorage,
|
|
26
28
|
inMemoryCampaignStorage,
|
|
27
29
|
runCampaign
|
|
28
|
-
} from "../chunk-
|
|
30
|
+
} from "../chunk-7TPYV2ER.js";
|
|
29
31
|
import {
|
|
30
32
|
agentProfileHash
|
|
31
33
|
} from "../chunk-PQV2TKC3.js";
|
|
34
|
+
import "../chunk-4ODZXQV2.js";
|
|
32
35
|
import {
|
|
33
36
|
assertRealBackend,
|
|
34
37
|
summarizeBackendIntegrity
|
|
35
|
-
} from "../chunk-
|
|
38
|
+
} from "../chunk-E22YUOAL.js";
|
|
36
39
|
import "../chunk-YV7J7X5N.js";
|
|
37
40
|
import {
|
|
38
41
|
validateRunRecord
|
|
39
42
|
} from "../chunk-F3SRAAZO.js";
|
|
40
|
-
import
|
|
43
|
+
import {
|
|
44
|
+
pairedBootstrap
|
|
45
|
+
} from "../chunk-ITBRCT73.js";
|
|
41
46
|
import "../chunk-GGE4NNQT.js";
|
|
42
47
|
import "../chunk-VSMTAMNK.js";
|
|
43
|
-
import
|
|
48
|
+
import {
|
|
49
|
+
callLlm
|
|
50
|
+
} from "../chunk-IHDHUN2X.js";
|
|
44
51
|
import "../chunk-PC4UYEBM.js";
|
|
45
52
|
import {
|
|
46
53
|
AgentEvalError
|
|
47
54
|
} from "../chunk-3BFEG2F6.js";
|
|
48
55
|
import "../chunk-PZ5AY32C.js";
|
|
49
56
|
|
|
57
|
+
// src/campaign/skill-patch.ts
|
|
58
|
+
/**
 * Apply a bundle of anchored, line-level edits to a skill document.
 *
 * Ops are applied in order against the progressively edited document, so a
 * later op can anchor on text inserted by an earlier one. An anchor matches
 * the FIRST line that contains it as a substring.
 *
 * Supported ops:
 *  - `{ op: "add", text, after? }`   insert `text` after the anchored line, or
 *                                     append when `after` is absent/empty.
 *  - `{ op: "delete", anchor }`      remove the anchored line.
 *  - `{ op: "replace", anchor, text }` replace the anchored line with `text`.
 *
 * Ops that cannot be applied are never thrown on; they are collected in
 * `rejected` with a human-readable reason.
 *
 * @param {string} surface - Current document text.
 * @param {{ ops: Array<object> }} patch - Ordered list of edit ops.
 * @returns {{ surface: string, applied: number, rejected: Array<{ op: object, reason: string }> }}
 */
function applySkillPatch(surface, patch) {
  const lines = surface.split("\n");
  const rejected = [];
  let applied = 0;

  const findLine = (anchor) => lines.findIndex((line) => line.includes(anchor));
  const reject = (op, reason) => rejected.push({ op, reason });

  for (const op of patch.ops) {
    if (op.op === "add") {
      if (typeof op.text !== "string" || op.text.trim() === "") {
        reject(op, "empty add text");
        continue;
      }
      const insert = op.text.split("\n");
      // No anchor (or an empty one) means "append to the end of the document".
      if (op.after === undefined || op.after === null || op.after === "") {
        lines.push(...insert);
        applied++;
        continue;
      }
      if (typeof op.after !== "string") {
        // `includes` would coerce a non-string and search for e.g. "42".
        reject(op, "add anchor must be a string");
        continue;
      }
      const idx = findLine(op.after);
      if (idx === -1) {
        reject(op, `add anchor not found: ${truncate(op.after)}`);
        continue;
      }
      lines.splice(idx + 1, 0, ...insert);
      applied++;
      continue;
    }

    if (op.op !== "delete" && op.op !== "replace") {
      // Previously any unrecognised op was silently treated as a replace.
      reject(op, `unknown op: ${String(op.op)}`);
      continue;
    }

    // Every string includes "", so an empty anchor would always match the
    // first line and silently delete/replace the wrong content.
    if (typeof op.anchor !== "string" || op.anchor === "") {
      reject(op, `${op.op} anchor missing or empty`);
      continue;
    }

    const idx = findLine(op.anchor);
    if (idx === -1) {
      reject(op, `${op.op} anchor not found: ${truncate(op.anchor)}`);
      continue;
    }

    if (op.op === "delete") {
      lines.splice(idx, 1);
      applied++;
      continue;
    }

    if (typeof op.text !== "string") {
      reject(op, "replace text missing");
      continue;
    }
    lines.splice(idx, 1, ...op.text.split("\n"));
    applied++;
  }

  return { surface: lines.join("\n"), applied, rejected };
}
|
|
106
|
+
/**
 * Number of edit ops carried by a patch.
 *
 * @param {{ ops: Array<object> }} patch
 * @returns {number}
 */
function patchEditCount(patch) {
  const { ops } = patch;
  return ops.length;
}
|
|
109
|
+
/**
 * Shorten `s` to at most `max` UTF-16 units, appending an ellipsis when cut.
 *
 * @param {string} s
 * @param {number} [max=48]
 * @returns {string}
 */
function truncate(s, max = 48) {
  if (s.length > max) {
    const head = s.slice(0, max);
    return `${head}\u2026`;
  }
  return s;
}
|
|
112
|
+
|
|
113
|
+
// src/campaign/drivers/skill-opt.ts
|
|
114
|
+
// System prompt sent as the "system" message by skillOptDriver's proposePatches.
// It spells out the JSON wire format ({"patches":[{label, rationale, ops}]}) and
// the three op shapes (add / delete / replace) that parseSkillPatchResponse and
// normalizeOp accept, so any change here must be mirrored in those parsers.
var SKILLOPT_SYSTEM = 'You are a SkillOpt optimizer. You improve ONE skill document by proposing BOUNDED, anchored edits \u2014 never a full rewrite. Output ONLY a JSON object of shape {"patches":[{"label":string,"rationale":string,"ops":[op,...]}]} where each op is one of: {"op":"add","after":<exact substring of an existing line, or omit to append>,"text":<new line(s)>}, {"op":"delete","anchor":<exact substring of the line to remove>}, {"op":"replace","anchor":<exact substring of the line to replace>,"text":<replacement line(s)>}. Anchors MUST be verbatim substrings of lines that exist in the document. No prose outside JSON.';
|
|
115
|
+
/**
 * Build a "skill-opt" driver: an optimizer that asks an LLM for small anchored
 * patches to a text skill document instead of full rewrites.
 *
 * The returned object exposes:
 *  - `kind`            always "skill-opt".
 *  - `proposePatches`  raw patch proposals for a surface plus evidence.
 *  - `propose`         applies the proposals to `ctx.currentSurface` and
 *                      returns the distinct, actually-changed candidate
 *                      surfaces (at most `ctx.populationSize`).
 *
 * @param {object} opts - Reads `llm`, `model`, `target`, and optional
 *   `evidenceK` (default 3), `editBudget` (default 3), `temperature`
 *   (default 0.6), `maxTokens` (default 4000).
 */
function skillOptDriver(opts) {
  const historyDepth = opts.evidenceK ?? 3;
  const fallbackBudget = opts.editBudget ?? 3;

  // Ask the LLM for up to `args.count` patches of at most `args.editBudget` ops.
  const proposePatches = async (args) => {
    const prompt = buildPatchPrompt({
      target: opts.target,
      surface: args.surface,
      evidence: args.evidence,
      editBudget: args.editBudget,
      rejectedBuffer: args.rejectedBuffer,
      metaNote: args.metaNote,
      count: args.count,
    });
    const request = {
      model: opts.model,
      messages: [
        { role: "system", content: SKILLOPT_SYSTEM },
        { role: "user", content: prompt },
      ],
      jsonMode: true,
      temperature: opts.temperature ?? 0.6,
      maxTokens: opts.maxTokens ?? 4e3,
    };
    const reply = await callLlm(request, opts.llm);
    return parseSkillPatchResponse(reply.content, args.count, args.editBudget);
  };

  // Turn proposals into candidate surfaces, dropping no-ops and duplicates.
  const propose = async (ctx) => {
    const surface = ctx.currentSurface;
    if (typeof surface !== "string") {
      throw new Error(
        "skillOptDriver: surface must be a string skill document (got a CodeSurface). SkillOpt patches text."
      );
    }
    const patches = await proposePatches({
      surface,
      evidence: evidenceFromHistory(ctx, historyDepth),
      editBudget: fallbackBudget,
      rejectedBuffer: [],
      count: ctx.populationSize,
      signal: ctx.signal,
    });

    // Keyed by candidate text; Map preserves first-seen order.
    const proposals = new Map();
    for (const patch of patches) {
      const outcome = applySkillPatch(surface, patch);
      const changed = outcome.applied !== 0 && outcome.surface !== surface;
      if (!changed || proposals.has(outcome.surface)) continue;
      proposals.set(outcome.surface, {
        surface: outcome.surface,
        label: patch.label,
        rationale: patch.rationale,
      });
      if (proposals.size >= ctx.populationSize) break;
    }
    return [...proposals.values()];
  };

  return { kind: "skill-opt", proposePatches, propose };
}
|
|
174
|
+
/**
 * Derive proposal evidence from the most recent history entry: take its
 * highest-composite candidate and report that candidate's `k` lowest-scoring
 * scenarios and `k` lowest-scoring dimensions.
 *
 * @param {{ history: Array<{ candidates: Array<object> }> }} ctx
 * @param {number} k - Maximum entries per evidence list.
 * @returns {{ weakScenarios: Array<object>, weakDimensions: Array<{ dimension: string, score: number }> }}
 */
function evidenceFromHistory(ctx, k) {
  const nothing = () => ({ weakScenarios: [], weakDimensions: [] });

  const latest = ctx.history.at(-1);
  if (!latest || latest.candidates.length === 0) return nothing();

  const ranked = Array.from(latest.candidates);
  ranked.sort((a, b) => b.composite - a.composite);
  const top = ranked[0];
  if (!top) return nothing();

  const scenariosAscending = Array.from(top.scenarios);
  scenariosAscending.sort((a, b) => a.composite - b.composite);
  const weakScenarios = scenariosAscending.slice(0, k);

  const dimensionsAscending = Object.entries(top.dimensions);
  dimensionsAscending.sort((a, b) => a[1] - b[1]);
  const weakDimensions = [];
  for (const [dimension, score] of dimensionsAscending.slice(0, k)) {
    weakDimensions.push({ dimension, score });
  }

  return { weakScenarios, weakDimensions };
}
|
|
183
|
+
/**
 * Render the user prompt for a patch-proposal request.
 *
 * Always contains the target description, the fenced current document and the
 * patch-count / edit-budget instructions. Optional sections (weak scenarios,
 * weak dimensions, previously rejected edits, strategy note) are appended only
 * when they have content.
 *
 * @param {object} args - `target`, `surface`, `count`, `editBudget`,
 *   `evidence` ({ weakScenarios, weakDimensions }), `rejectedBuffer`, and an
 *   optional `metaNote`.
 * @returns {string} Newline-joined prompt text.
 */
function buildPatchPrompt(args) {
  const out = [
    `Skill document governs: ${args.target}.`,
    "",
    "Current skill document:",
    "```",
    args.surface,
    "```",
    "",
    `Propose ${args.count} candidate patch(es). Each patch is a SMALL bundle of`,
    `at most ${args.editBudget} op(s). Anchors must be verbatim substrings of`,
    "existing lines. Prefer adding a specific missing rule or sharpening a vague",
    "one over deleting; never rewrite the whole document.",
  ];

  // Append a blank line, a heading and one bullet per item.
  const section = (heading, bullets) => {
    out.push("", heading, ...bullets);
  };

  const { weakScenarios, weakDimensions } = args.evidence;
  if (weakScenarios.length > 0) {
    section(
      "Weakest scenarios (patch to fix these):",
      weakScenarios.map((s) => `- ${s.scenarioId} (${s.composite.toFixed(2)})`)
    );
  }
  if (weakDimensions.length > 0) {
    section(
      "Weakest dimensions (what to improve):",
      weakDimensions.map((d) => `- ${d.dimension} (${d.score.toFixed(2)})`)
    );
  }
  if (args.rejectedBuffer.length > 0) {
    section(
      "Already tried and REJECTED (do not repeat or restate these edits):",
      args.rejectedBuffer.map((e) => `- ${e.label}: ${e.rationale} \u2014 ${e.reason}`)
    );
  }
  if (args.metaNote) {
    out.push("", `Strategy note from prior epochs: ${args.metaNote}`);
  }

  return out.join("\n");
}
|
|
223
|
+
/**
 * Thrown by parseSkillPatchResponse when the model reply contains no JSON
 * object or the extracted object text fails to parse.
 */
class SkillPatchParseError extends Error {
  /** @param {string} message - Description of the parse failure. */
  constructor(message) {
    super(message);
    // Distinguish this error from a plain Error by its name.
    this.name = "SkillPatchParseError";
  }
}
|
|
229
|
+
/**
 * Parse an LLM reply into a list of skill patches.
 *
 * Strips a surrounding Markdown code fence if present, extracts the text
 * between the first "{" and the last "}", and JSON-parses it. Entries of
 * `patches` that are not objects, or that have no valid ops after
 * normalisation, are dropped. Each kept patch is capped at `editBudget` ops
 * and collection stops once `maxPatches` patches have been gathered.
 *
 * @param {string} raw - Raw model output.
 * @param {number} maxPatches - Stop after this many patches.
 * @param {number} editBudget - Maximum ops kept per patch.
 * @returns {Array<{ label: string, rationale: string, ops: Array<object> }>}
 * @throws {SkillPatchParseError} When no JSON object is found or it fails to parse.
 */
function parseSkillPatchResponse(raw, maxPatches, editBudget) {
  let body = raw.trim();
  if (body.startsWith("```")) {
    const withoutOpeningFence = body.replace(/^```(?:json)?\n?/, "");
    body = withoutOpeningFence.replace(/\n?```$/, "");
  }

  const open = body.indexOf("{");
  const close = body.lastIndexOf("}");
  if (open < 0 || close <= open) {
    throw new SkillPatchParseError(
      `parseSkillPatchResponse: response was not valid JSON (no object found): ${snippet(raw)}`
    );
  }

  let parsed;
  try {
    parsed = JSON.parse(body.slice(open, close + 1));
  } catch (err) {
    throw new SkillPatchParseError(
      `parseSkillPatchResponse: response was not valid JSON (${err instanceof Error ? err.message : String(err)}): ${snippet(raw)}`
    );
  }

  const entries = Array.isArray(parsed.patches) ? parsed.patches : [];
  const patches = [];
  for (const entry of entries) {
    if (entry === null || typeof entry !== "object") continue;

    const validOps = [];
    if (Array.isArray(entry.ops)) {
      for (const rawOp of entry.ops) {
        const op = normalizeOp(rawOp);
        if (isOp(op)) validOps.push(op);
      }
    }
    if (validOps.length === 0) continue;

    patches.push({
      label: typeof entry.label === "string" ? entry.label : "patch",
      rationale: typeof entry.rationale === "string" ? entry.rationale : "",
      ops: validOps.slice(0, editBudget),
    });
    // Checked after the push, so at least one valid patch is always kept.
    if (patches.length >= maxPatches) break;
  }
  return patches;
}
|
|
263
|
+
/**
 * Validate one raw op from the model and copy only its recognised fields.
 *
 * @param {unknown} raw - Candidate op as parsed from JSON.
 * @returns {object|null} A clean `add` / `delete` / `replace` op, or `null`
 *   when `raw` is not an object, has an unknown `op`, or lacks a required
 *   string field.
 */
function normalizeOp(raw) {
  if (raw === null || typeof raw !== "object") return null;

  const isText = (value) => typeof value === "string";

  switch (raw.op) {
    case "add": {
      if (!isText(raw.text)) return null;
      const add = { op: "add", text: raw.text };
      // `after` is optional; a missing anchor means "append".
      if (isText(raw.after)) add.after = raw.after;
      return add;
    }
    case "delete":
      return isText(raw.anchor) ? { op: "delete", anchor: raw.anchor } : null;
    case "replace":
      return isText(raw.anchor) && isText(raw.text)
        ? { op: "replace", anchor: raw.anchor, text: raw.text }
        : null;
    default:
      return null;
  }
}
|
|
282
|
+
/**
 * Filter predicate: true for anything except `null` (the value normalizeOp
 * returns for an invalid op).
 *
 * @param {unknown} op
 * @returns {boolean}
 */
function isOp(op) {
  const wasRejected = op === null;
  return !wasRejected;
}
|
|
285
|
+
/**
 * One-line preview of `s` for error messages: trims, collapses every run of
 * whitespace to a single space, and cuts to `max` UTF-16 units with an
 * ellipsis when longer.
 *
 * @param {string} s
 * @param {number} [max=120]
 * @returns {string}
 */
function snippet(s, max = 120) {
  const collapsed = s.trim().replace(/\s+/g, " ");
  if (collapsed.length > max) {
    return `${collapsed.slice(0, max)}\u2026`;
  }
  return collapsed;
}
|
|
289
|
+
|
|
50
290
|
// src/campaign/labeled-store/fs-adapter.ts
|
|
51
291
|
import { createHash } from "crypto";
|
|
52
292
|
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
@@ -258,6 +498,339 @@ function appendLine(path, line) {
|
|
|
258
498
|
}
|
|
259
499
|
}
|
|
260
500
|
|
|
501
|
+
// src/campaign/presets/run-skill-opt.ts
|
|
502
|
+
/**
 * Run the SkillOpt loop: repeatedly ask the driver for small patches to a
 * skill document and accept a patch only when it raises the held-out score.
 *
 * Per epoch:
 *  1. The driver proposes up to `patchesPerEpoch` patches from train-set
 *     evidence, the rejected-edit buffer and an optional strategy note.
 *  2. Each patch is applied; no-ops are rejected without scoring.
 *  3. Candidates are scored on the held-out set in order; the first one that
 *     beats `currentHoldout + minImprovement` is accepted and the rest of the
 *     epoch's patches are skipped.
 *  4. Without an acceptance the edit budget may be annealed down, and the loop
 *     stops once `patience` consecutive epochs have produced no acceptance.
 *
 * @param {object} opts - Reads `trainScenarios`, `holdoutScenarios`, `judges`,
 *   `driver`, `baselineSurface`, `maxEpochs`, `runDir`, `dispatchWithSurface`
 *   and optional `patchesPerEpoch` (2), `editBudget` (3), `minImprovement` (0),
 *   `patience` (maxEpochs), `budgetAnneal` (true), `rejectedBufferSize` (12),
 *   `slowMetaEvery` (2), `evidenceK` (3), `signal`.
 * @returns {Promise<object>} Winner surface, baseline/winner held-out
 *   composites, lift, accepted and rejected edits, per-epoch history, epochs
 *   run and accumulated cost.
 * @throws {Error} On empty train/holdout sets, missing judges, overlapping
 *   train/holdout ids, or a negative `minImprovement`.
 */
async function runSkillOpt(opts) {
  if (opts.trainScenarios.length === 0) throw new Error("runSkillOpt: trainScenarios is empty");
  if (opts.holdoutScenarios.length === 0) throw new Error("runSkillOpt: holdoutScenarios is empty");
  if (!opts.judges || opts.judges.length === 0) {
    throw new Error(
      "runSkillOpt: at least one judge is required \u2014 scoring (and therefore acceptance) is meaningless without one, and would report a silent zero lift."
    );
  }
  const holdoutIds = new Set(opts.holdoutScenarios.map((s) => s.id));
  const overlap = opts.trainScenarios.filter((s) => holdoutIds.has(s.id)).map((s) => s.id);
  if (overlap.length > 0) {
    throw new Error(
      `runSkillOpt: trainScenarios and holdoutScenarios must be disjoint (overlap: [${overlap.join(
        ", "
      )}]) \u2014 a shared scenario leaks the held-out acceptance axis into the proposal evidence.`
    );
  }
  const patchesPerEpoch = opts.patchesPerEpoch ?? 2;
  const initialBudget = opts.editBudget ?? 3;
  const minImprovement = opts.minImprovement ?? 0;
  if (minImprovement < 0) {
    throw new Error(
      "runSkillOpt: minImprovement must be >= 0 \u2014 a negative threshold would accept held-out regressions, breaking the monotonic-lift contract."
    );
  }
  const patience = opts.patience ?? opts.maxEpochs;
  const budgetAnneal = opts.budgetAnneal ?? true;
  const rejectedBufferSize = opts.rejectedBufferSize ?? 12;
  const slowMetaEvery = opts.slowMetaEvery ?? 2;
  const evidenceK = opts.evidenceK ?? 3;
  // A single never-aborted fallback is enough; no need for one per epoch.
  const signal = opts.signal ?? new AbortController().signal;
  // With a non-zero threshold a candidate can beat the current score and still
  // be rejected, so the reason must mention the threshold to stay truthful
  // (it is fed back to the proposer via the rejected buffer).
  const thresholdSuffix = minImprovement > 0 ? ` + minImprovement ${minImprovement}` : "";

  let totalCostUsd = 0;
  // Score a surface on the held-out set; returns the mean composite.
  const scoreHoldout = async (surface, tag) => {
    const campaign = await runScoringCampaign(opts, opts.holdoutScenarios, surface, tag);
    totalCostUsd += campaign.aggregates.totalCostUsd;
    return campaignMeanComposite(campaign);
  };
  // Score a surface on the train set; returns weakest scenarios/dimensions.
  const trainEvidence = async (surface, tag) => {
    const campaign = await runScoringCampaign(opts, opts.trainScenarios, surface, tag);
    totalCostUsd += campaign.aggregates.totalCostUsd;
    return toEvidence(campaign, evidenceK);
  };

  let current = opts.baselineSurface;
  let currentEvidence = await trainEvidence(current, "baseline-train");
  const baselineHoldout = await scoreHoldout(current, "baseline-holdout");
  let currentHoldout = baselineHoldout;

  const buffer = []; // bounded window of recent rejections shown to the proposer
  const acceptedEdits = [];
  const rejectedAll = [];
  const history = [];
  let budget = initialBudget;
  let sinceAccept = 0;
  let metaNote;
  let epochsRun = 0;

  for (let epoch = 0; epoch < opts.maxEpochs; epoch++) {
    epochsRun++;
    const patches = await opts.driver.proposePatches({
      surface: current,
      evidence: currentEvidence,
      editBudget: budget,
      rejectedBuffer: buffer,
      metaNote,
      count: patchesPerEpoch,
      signal,
    });

    let accepted = null;
    const rejectedThisEpoch = [];
    for (let i = 0; i < patches.length; i++) {
      const patch = patches[i];
      const { surface: candidate, applied } = applySkillPatch(current, patch);
      if (applied === 0 || candidate === current) {
        // Nothing changed: reject without spending a held-out scoring run.
        rejectedThisEpoch.push({
          label: patch.label,
          rationale: patch.rationale,
          reason: "no-op (unanchored or zero-change)",
        });
        continue;
      }
      const candidateHoldout = await scoreHoldout(candidate, `epoch-${epoch}-cand-${i}-holdout`);
      if (candidateHoldout > currentHoldout + minImprovement) {
        accepted = {
          epoch,
          label: patch.label,
          rationale: patch.rationale,
          holdoutDelta: candidateHoldout - currentHoldout,
        };
        current = candidate;
        currentHoldout = candidateHoldout;
        // Refresh evidence against the new surface before the next proposal.
        currentEvidence = await trainEvidence(current, `epoch-${epoch}-train`);
        break; // first improving patch wins the epoch
      }
      rejectedThisEpoch.push({
        label: patch.label,
        rationale: patch.rationale,
        reason: `held-out ${candidateHoldout.toFixed(3)} \u2264 current ${currentHoldout.toFixed(3)}${thresholdSuffix}`,
      });
    }

    if (accepted) {
      acceptedEdits.push(accepted);
      sinceAccept = 0;
    } else {
      sinceAccept++;
      // After two dry epochs, shrink the per-patch op budget (never below 1).
      if (budgetAnneal && sinceAccept >= 2 && budget > 1) budget--;
    }

    for (const r of rejectedThisEpoch) {
      buffer.push(r);
      rejectedAll.push(r);
    }
    while (buffer.length > rejectedBufferSize) buffer.shift();

    if (slowMetaEvery > 0 && (epoch + 1) % slowMetaEvery === 0) {
      metaNote = buildMetaNote(acceptedEdits, buffer);
    }

    // NOTE(review): `editBudget` here is the budget AFTER any anneal this
    // epoch, not necessarily the one the proposals above were made with.
    history.push({
      epoch,
      editBudget: budget,
      proposed: patches.length,
      accepted,
      rejected: rejectedThisEpoch,
      holdoutComposite: currentHoldout,
    });

    if (sinceAccept >= patience) break;
  }

  return {
    winnerSurface: current,
    baselineHoldoutComposite: baselineHoldout,
    winnerHoldoutComposite: currentHoldout,
    lift: currentHoldout - baselineHoldout,
    acceptedEdits,
    rejectedEdits: rejectedAll,
    epochsRun,
    history,
    totalCostUsd,
  };
}
|
|
635
|
+
/**
 * Run one scoring campaign for `surface` over `scenarios`.
 *
 * All of `opts` is forwarded to runCampaign, with `scenarios`, `dispatch`
 * (bound to the given surface) and `runDir` (suffixed with `tag`) overridden.
 *
 * @param {object} opts - Campaign options; reads `dispatchWithSurface`, `runDir`.
 * @param {Array<object>} scenarios - Scenarios to score.
 * @param {*} surface - Surface handed to `opts.dispatchWithSurface`.
 * @param {string} tag - Sub-directory name under `opts.runDir`.
 * @returns Whatever runCampaign returns.
 */
function runScoringCampaign(opts, scenarios, surface, tag) {
  const dispatch = (scenario, ctx) => opts.dispatchWithSurface(surface, scenario, ctx);
  const runDir = `${opts.runDir}/${tag}`;
  return runCampaign({ ...opts, scenarios, dispatch, runDir });
}
|
|
643
|
+
/**
 * Reduce a scored campaign to proposal evidence: its `k` lowest-composite
 * scenarios and `k` lowest-scoring dimensions, as reported by
 * campaignBreakdown.
 *
 * @param {object} campaign - Campaign result passed to campaignBreakdown.
 * @param {number} k - Maximum entries per evidence list.
 * @returns {{ weakScenarios: Array<object>, weakDimensions: Array<{ dimension: string, score: number }> }}
 */
function toEvidence(campaign, k) {
  const breakdown = campaignBreakdown(campaign);

  const scenariosAscending = Array.from(breakdown.scenarios);
  scenariosAscending.sort((a, b) => a.composite - b.composite);
  const weakScenarios = scenariosAscending.slice(0, k);

  const dimensionsAscending = Object.entries(breakdown.dimensions);
  dimensionsAscending.sort((a, b) => a[1] - b[1]);
  const weakDimensions = [];
  for (const [dimension, score] of dimensionsAscending.slice(0, k)) {
    weakDimensions.push({ dimension, score });
  }

  return { weakScenarios, weakDimensions };
}
|
|
649
|
+
/**
 * Compose the strategy note handed to the proposer: which accepted edits
 * helped, up to five distinct rejected labels to avoid, and a fixed reminder
 * to keep edits small.
 *
 * @param {Array<{ label: string, holdoutDelta: number }>} accepted
 * @param {Array<{ label: string }>} rejected
 * @returns {string} Space-joined sentences.
 */
function buildMetaNote(accepted, rejected) {
  const sentences = [];

  if (accepted.length > 0) {
    const wins = accepted.map(
      ({ label, holdoutDelta }) => `"${label}" (+${holdoutDelta.toFixed(3)})`
    );
    sentences.push(`Edits that improved held-out so far: ${wins.join("; ")}. Build on these.`);
  }

  if (rejected.length > 0) {
    const distinctLabels = Array.from(new Set(rejected.map(({ label }) => label)));
    const shown = distinctLabels.slice(0, 5);
    sentences.push(`Dead ends to avoid: ${shown.join(", ")}. Try a different anchor or rule.`);
  }

  sentences.push("Keep edits small and anchored to existing lines.");
  return sentences.join(" ");
}
|
|
663
|
+
|
|
664
|
+
// src/campaign/presets/compare-drivers.ts
|
|
665
|
+
/**
 * Run several optimizers and compare their winners on one held-out set.
 *
 * The baseline surface and every driver's winner are scored on the same
 * held-out scenarios. Each driver's lift over baseline is estimated with a
 * paired bootstrap, drivers are ranked by lift (ties broken by lower cost),
 * and the best driver is compared pairwise against every other.
 *
 * @param {object} opts - Reads `drivers` ({ name, optimize() }),
 *   `holdoutScenarios`, `baselineSurface`, `dispatchWithSurface`, `runDir`
 *   and optional `seed` (42), `resamples` (2000), `confidence` (0.95). All of
 *   `opts` is also forwarded to runCampaign.
 * @returns {Promise<{ scores: Array<object>, best: object, pairwise: Array<object>, holdoutScenarioIds: Array }>}
 * @throws {Error} When there are no drivers, no held-out scenarios, or a
 *   scoring run yields no score for some held-out scenario.
 */
async function compareDrivers(opts) {
  if (opts.drivers.length === 0) throw new Error("compareDrivers: no drivers to compare");
  const seed = opts.seed ?? 42;
  const resamples = opts.resamples ?? 2e3;
  const confidence = opts.confidence ?? 0.95;
  const bootstrapOptions = { seed, resamples, confidence, statistic: "mean" };

  // Score a surface on the held-out set. A Map keyed by the stringified id is
  // used instead of a plain object so ids such as "constructor" or
  // "__proto__" can neither appear present by inheritance nor be lost.
  const scoreOnHoldout = async (surface, tag) => {
    const campaign = await runCampaign({
      ...opts,
      scenarios: opts.holdoutScenarios,
      dispatch: (scenario, ctx) => opts.dispatchWithSurface(surface, scenario, ctx),
      runDir: `${opts.runDir}/${tag}`,
    });
    const byScenario = new Map();
    for (const { scenarioId, composite } of campaignBreakdown(campaign).scenarios) {
      byScenario.set(String(scenarioId), composite);
    }
    return byScenario;
  };

  const scenarioIds = [...new Set(opts.holdoutScenarios.map((s) => s.id))].sort();
  if (scenarioIds.length === 0) throw new Error("compareDrivers: holdoutScenarios is empty");

  // Project scores onto the fixed scenario order so arrays are pairable.
  const align = (byScenario, label) => {
    const missing = scenarioIds.filter((id) => !byScenario.has(String(id)));
    if (missing.length > 0) {
      throw new Error(
        `compareDrivers: ${label} produced no held-out score for scenario(s) [${missing.join(
          ", "
        )}] \u2014 a cell errored or its judges returned nothing. Refusing to fabricate a 0 (it would corrupt the lift comparison). Fix the dispatch/judge or drop the scenario.`
      );
    }
    return scenarioIds.map((id) => byScenario.get(String(id)));
  };

  const baselineArr = align(
    await scoreOnHoldout(opts.baselineSurface, "compare-baseline"),
    "baseline"
  );

  // Drivers run sequentially, each followed by a held-out scoring run.
  const winners = [];
  for (const d of opts.drivers) {
    const out = await d.optimize();
    const byScenario = await scoreOnHoldout(out.winnerSurface, `compare-${slug(d.name)}`);
    winners.push({
      name: d.name,
      winnerSurface: out.winnerSurface,
      costUsd: out.costUsd,
      durationMs: out.durationMs,
      arr: align(byScenario, `driver "${d.name}"`),
    });
  }

  // Keep each score next to its per-scenario array; looking arrays up by
  // driver name would pick the wrong one when two drivers share a name.
  const ranked = winners.map((w) => {
    const boot = pairedBootstrap(baselineArr, w.arr, bootstrapOptions);
    const score = {
      name: w.name,
      baselineComposite: mean(baselineArr),
      winnerComposite: mean(w.arr),
      lift: boot.mean,
      liftCi: { low: boot.low, high: boot.high },
      costUsd: w.costUsd,
      winnerSurface: w.winnerSurface,
      rank: 0,
    };
    if (w.durationMs !== void 0) score.durationMs = w.durationMs;
    return { score, arr: w.arr };
  });
  ranked.sort((a, b) => b.score.lift - a.score.lift || a.score.costUsd - b.score.costUsd);
  ranked.forEach((entry, i) => {
    entry.score.rank = i + 1;
  });

  const scores = ranked.map((entry) => entry.score);
  const best = scores[0];
  const bestArr = ranked[0].arr;

  const pairwise = ranked.slice(1).map(({ score: other, arr: otherArr }) => {
    const boot = pairedBootstrap(otherArr, bestArr, bootstrapOptions);
    // Only call a winner when the interval excludes zero.
    const favored = boot.low > 0 ? best.name : boot.high < 0 ? other.name : "tie";
    return {
      a: best.name,
      b: other.name,
      deltaMean: boot.mean,
      low: boot.low,
      high: boot.high,
      favored,
    };
  });

  return { scores, best, pairwise, holdoutScenarioIds: scenarioIds };
}
|
|
759
|
+
/**
 * Arithmetic mean of `xs`; an empty array yields 0.
 *
 * @param {number[]} xs
 * @returns {number}
 */
function mean(xs) {
  const count = xs.length;
  if (count === 0) return 0;
  let total = 0;
  xs.forEach((x) => {
    total += x;
  });
  return total / count;
}
|
|
762
|
+
/**
 * Make a name safe for use in a run-directory path: every run of characters
 * outside a-z / 0-9 (case-insensitive) becomes a single "-", then the result
 * is lower-cased.
 *
 * @param {string} name
 * @returns {string}
 */
function slug(name) {
  const dashed = name.replace(/[^a-z0-9]+/gi, "-");
  return dashed.toLowerCase();
}
|
|
765
|
+
/**
 * Comparison entry for GEPA with `combineParents` disabled.
 *
 * @param {object} config - Shared comparison config passed to gepaEntry.
 * @param {string} [name="gepa-reflection"] - Entry name.
 * @returns {{ name: string, optimize: Function }}
 */
function gepaReflectionEntry(config, name = "gepa-reflection") {
  const combineParents = false;
  return gepaEntry(config, combineParents, name);
}
|
|
768
|
+
/**
 * Comparison entry for GEPA with `combineParents` enabled.
 *
 * @param {object} config - Shared comparison config passed to gepaEntry.
 * @param {string} [name="gepa-pareto"] - Entry name.
 * @returns {{ name: string, optimize: Function }}
 */
function gepaParetoEntry(config, name = "gepa-pareto") {
  const combineParents = true;
  return gepaEntry(config, combineParents, name);
}
|
|
771
|
+
/**
 * Build a named comparison entry whose `optimize()` runs a GEPA improvement
 * loop and reports the winner surface, total cost and wall-clock duration.
 *
 * Cost is the baseline campaign's cost plus the cost of every surface
 * campaign in every generation.
 *
 * @param {object} config - Reads `llm`, `model`, `target`, `trainScenarios`,
 *   `holdoutScenarios`, `baselineSurface`, `dispatchWithSurface`, `judges`,
 *   `runDir` and optional `mutationPrimitives`, `populationSize` (2),
 *   `maxGenerations` (3), `seed`.
 * @param {boolean} combineParents - Forwarded to gepaDriver.
 * @param {string} name - Entry name; also used (slugged) in the run directory.
 * @returns {{ name: string, optimize: () => Promise<{ winnerSurface: *, costUsd: number, durationMs: number }> }}
 */
function gepaEntry(config, combineParents, name) {
  return {
    name,
    async optimize() {
      const started = Date.now();

      const driverOptions = {
        llm: config.llm,
        model: config.model,
        target: config.target,
        combineParents,
      };
      // Only forward mutationPrimitives when the caller supplied some.
      if (config.mutationPrimitives) {
        driverOptions.mutationPrimitives = config.mutationPrimitives;
      }
      const driver = gepaDriver(driverOptions);

      const loopOptions = {
        scenarios: config.trainScenarios,
        holdoutScenarios: config.holdoutScenarios,
        baselineSurface: config.baselineSurface,
        dispatchWithSurface: config.dispatchWithSurface,
        judges: config.judges,
        driver,
        populationSize: config.populationSize ?? 2,
        maxGenerations: config.maxGenerations ?? 3,
        gate: defaultProductionGate({
          holdoutScenarios: config.holdoutScenarios,
          deltaThreshold: 0,
        }),
        autoOnPromote: "none",
        runDir: `${config.runDir}/${slug(name)}-loop`,
      };
      if (config.seed !== void 0) loopOptions.seed = config.seed;
      const result = await runImprovementLoop(loopOptions);

      const baselineCost = result.baselineCampaign.aggregates.totalCostUsd;
      let generationsCost = 0;
      for (const generation of result.generations) {
        let generationCost = 0;
        for (const sf of generation.surfaces) {
          generationCost += sf.campaign.aggregates.totalCostUsd;
        }
        generationsCost += generationCost;
      }
      const costUsd = baselineCost + generationsCost;

      return { winnerSurface: result.winnerSurface, costUsd, durationMs: Date.now() - started };
    },
  };
}
|
|
808
|
+
/**
 * Build a named comparison entry whose `optimize()` runs the SkillOpt loop
 * and reports the winner surface, total cost and wall-clock duration.
 *
 * @param {object} config - Reads `llm`, `model`, `target`, `baselineSurface`,
 *   `dispatchWithSurface`, `judges`, `trainScenarios`, `holdoutScenarios`,
 *   `runDir` and optional `maxEpochs` (6), `seed`.
 * @param {string} [name="skill-opt"] - Entry name; also used (slugged) in the
 *   run directory.
 * @returns {{ name: string, optimize: () => Promise<{ winnerSurface: *, costUsd: number, durationMs: number }> }}
 */
function skillOptEntry(config, name = "skill-opt") {
  const optimize = async () => {
    const started = Date.now();
    const driver = skillOptDriver({
      llm: config.llm,
      model: config.model,
      target: config.target,
    });

    const runOptions = {
      baselineSurface: config.baselineSurface,
      dispatchWithSurface: config.dispatchWithSurface,
      judges: config.judges,
      driver,
      trainScenarios: config.trainScenarios,
      holdoutScenarios: config.holdoutScenarios,
      maxEpochs: config.maxEpochs ?? 6,
      runDir: `${config.runDir}/${slug(name)}-loop`,
    };
    // Only forward a seed when the caller supplied one.
    if (config.seed !== void 0) runOptions.seed = config.seed;

    const result = await runSkillOpt(runOptions);
    const durationMs = Date.now() - started;
    return {
      winnerSurface: result.winnerSurface,
      costUsd: result.totalCostUsd,
      durationMs,
    };
  };

  return { name, optimize };
}
|
|
833
|
+
|
|
261
834
|
// src/campaign/presets/run-profile-matrix.ts
|
|
262
835
|
import { createHash as createHash2 } from "crypto";
|
|
263
836
|
import { join as join2 } from "path";
|
|
@@ -272,12 +845,12 @@ function sanitize(id) {
|
|
|
272
845
|
function sha(input) {
|
|
273
846
|
return createHash2("sha256").update(JSON.stringify(input)).digest("hex");
|
|
274
847
|
}
|
|
275
|
-
function
|
|
848
|
+
/**
 * Arithmetic mean of `xs`; an empty array yields 0.
 *
 * @param {number[]} xs
 * @returns {number}
 */
function mean2(xs) {
  if (xs.length === 0) {
    return 0;
  }
  let sum = 0;
  xs.forEach((value) => {
    sum += value;
  });
  return sum / xs.length;
}
|
|
278
851
|
function cellComposite(cell) {
|
|
279
852
|
const composites = Object.values(cell.judgeScores).map((s) => s.composite);
|
|
280
|
-
return composites.length === 0 ? 0 :
|
|
853
|
+
return composites.length === 0 ? 0 : mean2(composites);
|
|
281
854
|
}
|
|
282
855
|
function buildRunRecord(args) {
|
|
283
856
|
const { cell, profile, profileHash, configHash, experimentId, splitTag, commitSha, matrixId } = args;
|
|
@@ -295,7 +868,7 @@ function buildRunRecord(args) {
|
|
|
295
868
|
if (js.notes) notes.push(`${judgeName}: ${js.notes}`);
|
|
296
869
|
}
|
|
297
870
|
const perDimMean = {};
|
|
298
|
-
for (const [dim, values] of Object.entries(dimAccum)) perDimMean[dim] =
|
|
871
|
+
for (const [dim, values] of Object.entries(dimAccum)) perDimMean[dim] = mean2(values);
|
|
299
872
|
const outcome = splitTag === "holdout" ? { holdoutScore: composite, raw } : { searchScore: composite, raw };
|
|
300
873
|
if (Object.keys(perJudge).length > 0) {
|
|
301
874
|
outcome.judgeScores = {
|
|
@@ -406,7 +979,7 @@ async function runProfileMatrix(opts) {
|
|
|
406
979
|
profileHash,
|
|
407
980
|
model: profile.model,
|
|
408
981
|
records: profileRecords.length,
|
|
409
|
-
meanComposite:
|
|
982
|
+
meanComposite: mean2(profileRecords.map(compositeOf)),
|
|
410
983
|
totalCostUsd: profileRecords.reduce((a, r) => a + r.costUsd, 0),
|
|
411
984
|
integrity: summarizeBackendIntegrity(profileRecords)
|
|
412
985
|
};
|
|
@@ -436,7 +1009,7 @@ function rollup(records, keyOf) {
|
|
|
436
1009
|
groups.set(key, arr);
|
|
437
1010
|
}
|
|
438
1011
|
const out = {};
|
|
439
|
-
for (const [key, xs] of groups) out[key] = { meanComposite:
|
|
1012
|
+
for (const [key, xs] of groups) out[key] = { meanComposite: mean2(xs), n: xs.length };
|
|
440
1013
|
return out;
|
|
441
1014
|
}
|
|
442
1015
|
function rollupByPersona(records, scenarios, personaOf) {
|
|
@@ -465,7 +1038,7 @@ function defaultGit(args, cwd) {
|
|
|
465
1038
|
throw new WorktreeAdapterError(`git ${args.join(" ")} failed: ${stderr || String(err)}`, err);
|
|
466
1039
|
}
|
|
467
1040
|
}
|
|
468
|
-
function
|
|
1041
|
+
/**
 * Turn a label into a short identifier fragment: lower-case it, collapse
 * every run of characters outside a-z / 0-9 to "-", strip leading and
 * trailing dashes, and cap the result at 48 characters. Falls back to
 * "candidate" when nothing is left.
 *
 * @param {string} label
 * @returns {string}
 */
function slug2(label) {
  const dashed = label.toLowerCase().replace(/[^a-z0-9]+/g, "-");
  const trimmed = dashed.replace(/^-+|-+$/g, "");
  const capped = trimmed.slice(0, 48);
  return capped || "candidate";
}
|
|
471
1044
|
function gitWorktreeAdapter(opts) {
|
|
@@ -474,7 +1047,7 @@ function gitWorktreeAdapter(opts) {
|
|
|
474
1047
|
const branchPrefix = opts.branchPrefix ?? "improve";
|
|
475
1048
|
return {
|
|
476
1049
|
async create({ baseRef, label }) {
|
|
477
|
-
const id = `${
|
|
1050
|
+
const id = `${slug2(label)}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 6)}`;
|
|
478
1051
|
const branch = `${branchPrefix}/${id}`;
|
|
479
1052
|
const path = join3(worktreeDir, id);
|
|
480
1053
|
git(["worktree", "add", "-b", branch, path, baseRef], opts.repoRoot);
|
|
@@ -508,8 +1081,13 @@ export {
|
|
|
508
1081
|
FsLabeledScenarioStore,
|
|
509
1082
|
LabeledScenarioStoreError,
|
|
510
1083
|
ProfileMatrixError,
|
|
1084
|
+
SkillPatchParseError,
|
|
511
1085
|
WorktreeAdapterError,
|
|
1086
|
+
applySkillPatch,
|
|
512
1087
|
buildLoopProvenanceRecord,
|
|
1088
|
+
campaignBreakdown,
|
|
1089
|
+
campaignMeanComposite,
|
|
1090
|
+
compareDrivers,
|
|
513
1091
|
composeGate,
|
|
514
1092
|
countSentenceEdits,
|
|
515
1093
|
defaultProductionGate,
|
|
@@ -519,6 +1097,8 @@ export {
|
|
|
519
1097
|
extractH2Sections,
|
|
520
1098
|
fsCampaignStorage,
|
|
521
1099
|
gepaDriver,
|
|
1100
|
+
gepaParetoEntry,
|
|
1101
|
+
gepaReflectionEntry,
|
|
522
1102
|
gitWorktreeAdapter,
|
|
523
1103
|
heldOutGate,
|
|
524
1104
|
inMemoryCampaignStorage,
|
|
@@ -526,6 +1106,8 @@ export {
|
|
|
526
1106
|
labelTrustRank,
|
|
527
1107
|
loopProvenanceSpans,
|
|
528
1108
|
openAutoPr,
|
|
1109
|
+
parseSkillPatchResponse,
|
|
1110
|
+
patchEditCount,
|
|
529
1111
|
provenanceRecordPath,
|
|
530
1112
|
provenanceSpansPath,
|
|
531
1113
|
resolveWorktreePath,
|
|
@@ -534,6 +1116,9 @@ export {
|
|
|
534
1116
|
runImprovementLoop,
|
|
535
1117
|
runOptimization,
|
|
536
1118
|
runProfileMatrix,
|
|
1119
|
+
runSkillOpt,
|
|
1120
|
+
skillOptDriver,
|
|
1121
|
+
skillOptEntry,
|
|
537
1122
|
surfaceContentHash,
|
|
538
1123
|
surfaceHash
|
|
539
1124
|
};
|