claude-overnight 1.55.1 → 1.57.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/evolve.js +148 -2
- package/dist/core/_version.d.ts +1 -1
- package/dist/core/_version.js +1 -1
- package/dist/prompt-evolution/evaluator-judge.d.ts +20 -0
- package/dist/prompt-evolution/evaluator-judge.js +119 -0
- package/dist/prompt-evolution/evaluator-utils.d.ts +7 -0
- package/dist/prompt-evolution/evaluator-utils.js +17 -0
- package/dist/prompt-evolution/evaluator.d.ts +20 -0
- package/dist/prompt-evolution/evaluator.js +212 -74
- package/dist/prompt-evolution/fixtures/generate.d.ts +38 -0
- package/dist/prompt-evolution/fixtures/generate.js +168 -0
- package/dist/prompt-evolution/index.d.ts +18 -0
- package/dist/prompt-evolution/index.js +66 -9
- package/dist/prompt-evolution/llm-judge.d.ts +2 -0
- package/dist/prompt-evolution/llm-judge.js +2 -2
- package/dist/prompt-evolution/persistence.d.ts +20 -0
- package/dist/prompt-evolution/persistence.js +39 -0
- package/dist/prompt-evolution/report.d.ts +1 -1
- package/dist/prompt-evolution/report.js +134 -7
- package/dist/prompt-evolution/scorer.d.ts +34 -0
- package/dist/prompt-evolution/scorer.js +94 -0
- package/dist/prompt-evolution/transport-batch.d.ts +54 -0
- package/dist/prompt-evolution/transport-batch.js +213 -0
- package/dist/prompt-evolution/types.d.ts +10 -0
- package/package.json +1 -1
- package/plugins/claude-overnight/.claude-plugin/plugin.json +1 -1
package/dist/bin/evolve.js
CHANGED
|
@@ -18,6 +18,7 @@
|
|
|
18
18
|
import { evolvePrompt } from "../prompt-evolution/index.js";
|
|
19
19
|
import { PLAN_CASES } from "../prompt-evolution/fixtures/plan-cases.js";
|
|
20
20
|
import { harvestRealCases } from "../prompt-evolution/fixtures/harvest.js";
|
|
21
|
+
import { generateCases } from "../prompt-evolution/fixtures/generate.js";
|
|
21
22
|
import { scenariosToCases, PLANNING_SCENARIOS, REVIEW_SCENARIOS, SUPERVISION_SCENARIOS, STUCK_SCENARIOS, hydrateCases, extractPrompt, } from "../prompt-evolution/adapters/mcp-browser.js";
|
|
22
23
|
function help() {
|
|
23
24
|
process.stdout.write(`Usage: claude-overnight-evolve [options]
|
|
@@ -34,13 +35,28 @@ Options:
|
|
|
34
35
|
--population <n> Max population size (default: 8)
|
|
35
36
|
--plateau <n> Stop early if no improvement for N generations (default: 3)
|
|
36
37
|
--reps <n> Repetitions per (variant, case, model) for noise floor (default: 1)
|
|
38
|
+
--concurrency <n> Max in-flight eval calls (default: 8; bump for slow endpoints)
|
|
39
|
+
--batch Use provider batch API (50% cheaper, slower wall-clock)
|
|
40
|
+
--adaptive-cap <n> Adaptive sampling: extend reps up to N when σ > threshold (default: off)
|
|
41
|
+
--adaptive-threshold <x> σ threshold that triggers an extra rep (default: 0.1)
|
|
37
42
|
--judge Use llm-judge for content scoring (costs extra API calls)
|
|
38
43
|
--judge-model <model> Model to use for the judge (default: same as eval-model)
|
|
39
44
|
--judge-top-n <n> Judge only the top-N variants per generation (default: 4)
|
|
40
45
|
--cases <suite> Benchmark suite: plan | mcp-planning | mcp-review |
|
|
41
46
|
mcp-supervision | mcp-stuck (default: plan)
|
|
42
47
|
--harvest Append cases harvested from <cwd>/.claude-overnight/runs/*
|
|
48
|
+
--harvest-only Use ONLY harvested real objectives (fails if none found)
|
|
43
49
|
--harvest-limit <n> Max harvested cases (default: 10)
|
|
50
|
+
--prompts <list> Comma-separated prompt paths to evolve in sequence
|
|
51
|
+
--test-split <f> Hold out fraction f of cases for a selection-bias-free
|
|
52
|
+
final eval (default: 0 = no split). Use 0.3 for rigor.
|
|
53
|
+
--case-pool <n> Target total case count; generates synthetic cases via
|
|
54
|
+
LLM to top up if the current pool is smaller.
|
|
55
|
+
--gen-model <model> Model used by the case generator (default: eval-model)
|
|
56
|
+
|
|
57
|
+
Subcommands:
|
|
58
|
+
claude-overnight-evolve diff <runIdA> <runIdB>
|
|
59
|
+
Print a per-variant diff of two persisted runs
|
|
44
60
|
--base-url <url> API base URL override
|
|
45
61
|
--auth-token <token> Auth token override
|
|
46
62
|
--run-id <id> Preset run id (default: auto-generated)
|
|
@@ -61,11 +77,14 @@ function parseArgs() {
|
|
|
61
77
|
population: 8,
|
|
62
78
|
plateau: 3,
|
|
63
79
|
reps: 1,
|
|
80
|
+
batch: false,
|
|
64
81
|
useJudge: false,
|
|
65
82
|
judgeTopN: 4,
|
|
66
83
|
cases: "",
|
|
67
84
|
harvest: false,
|
|
85
|
+
harvestOnly: false,
|
|
68
86
|
harvestLimit: 10,
|
|
87
|
+
testSplit: 0,
|
|
69
88
|
baseUrl: process.env.ANTHROPIC_BASE_URL,
|
|
70
89
|
authToken: process.env.ANTHROPIC_AUTH_TOKEN ?? process.env.ANTHROPIC_API_KEY,
|
|
71
90
|
};
|
|
@@ -112,6 +131,21 @@ function parseArgs() {
|
|
|
112
131
|
opts.reps = parseInt(v, 10);
|
|
113
132
|
i++;
|
|
114
133
|
break;
|
|
134
|
+
case "--concurrency":
|
|
135
|
+
opts.concurrency = parseInt(v, 10);
|
|
136
|
+
i++;
|
|
137
|
+
break;
|
|
138
|
+
case "--batch":
|
|
139
|
+
opts.batch = true;
|
|
140
|
+
break;
|
|
141
|
+
case "--adaptive-cap":
|
|
142
|
+
opts.adaptiveCap = parseInt(v, 10);
|
|
143
|
+
i++;
|
|
144
|
+
break;
|
|
145
|
+
case "--adaptive-threshold":
|
|
146
|
+
opts.adaptiveThreshold = parseFloat(v);
|
|
147
|
+
i++;
|
|
148
|
+
break;
|
|
115
149
|
case "--judge":
|
|
116
150
|
opts.useJudge = true;
|
|
117
151
|
break;
|
|
@@ -130,10 +164,30 @@ function parseArgs() {
|
|
|
130
164
|
case "--harvest":
|
|
131
165
|
opts.harvest = true;
|
|
132
166
|
break;
|
|
167
|
+
case "--harvest-only":
|
|
168
|
+
opts.harvest = true;
|
|
169
|
+
opts.harvestOnly = true;
|
|
170
|
+
break;
|
|
133
171
|
case "--harvest-limit":
|
|
134
172
|
opts.harvestLimit = parseInt(v, 10);
|
|
135
173
|
i++;
|
|
136
174
|
break;
|
|
175
|
+
case "--prompts":
|
|
176
|
+
opts.prompts = v.split(",").map((s) => s.trim()).filter(Boolean);
|
|
177
|
+
i++;
|
|
178
|
+
break;
|
|
179
|
+
case "--test-split":
|
|
180
|
+
opts.testSplit = parseFloat(v);
|
|
181
|
+
i++;
|
|
182
|
+
break;
|
|
183
|
+
case "--case-pool":
|
|
184
|
+
opts.casePool = parseInt(v, 10);
|
|
185
|
+
i++;
|
|
186
|
+
break;
|
|
187
|
+
case "--gen-model":
|
|
188
|
+
opts.genModel = v;
|
|
189
|
+
i++;
|
|
190
|
+
break;
|
|
137
191
|
case "--base-url":
|
|
138
192
|
opts.baseUrl = v;
|
|
139
193
|
i++;
|
|
@@ -156,7 +210,31 @@ function parseArgs() {
|
|
|
156
210
|
return opts;
|
|
157
211
|
}
|
|
158
212
|
/**
 * CLI entry point.
 *
 * Dispatches to the `diff` subcommand when it is the first CLI argument;
 * otherwise parses options and runs one evolution per requested prompt
 * (or a single evolution when no `--prompts` list was given).
 */
async function main() {
    const [, , subcommand, firstRunId, secondRunId] = process.argv;
    // Subcommand: diff two persisted runs.
    if (subcommand === "diff") {
        await runDiff(firstRunId, secondRunId);
        return;
    }
    const opts = parseArgs();
    const promptList = opts.prompts;
    // Single-prompt mode: nothing to summarise, just run once.
    if (!promptList || promptList.length === 0) {
        await evolveOne(opts);
        return;
    }
    // Multi-prompt mode: evolve the prompts one after another (sequentially),
    // collecting one summary row per prompt so the user sees every result
    // side by side once the whole batch is done.
    const rows = [];
    for (const promptPath of promptList) {
        console.log(`\n========== Evolving ${promptPath} ==========\n`);
        const outcome = await evolveOne({ ...opts, prompt: promptPath });
        rows.push({
            prompt: promptPath,
            runId: outcome.runId,
            gmean: outcome.bestVariant.gmean,
            reportPath: outcome.reportPath,
        });
    }
    console.log("\n========== Multi-prompt summary ==========");
    rows.forEach((row) => {
        console.log(` ${row.prompt.padEnd(40)} gmean=${(row.gmean * 100).toFixed(1)}% runId=${row.runId}`);
    });
}
|
|
237
|
+
async function evolveOne(opts) {
|
|
160
238
|
let cases;
|
|
161
239
|
let promptPath = opts.prompt;
|
|
162
240
|
let seedText;
|
|
@@ -177,7 +255,7 @@ async function main() {
|
|
|
177
255
|
}
|
|
178
256
|
else {
|
|
179
257
|
if (opts.cases === "plan")
|
|
180
|
-
cases = [...PLAN_CASES];
|
|
258
|
+
cases = opts.harvestOnly ? [] : [...PLAN_CASES];
|
|
181
259
|
else
|
|
182
260
|
throw new Error(`Unknown case suite: ${opts.cases}`);
|
|
183
261
|
if (opts.harvest) {
|
|
@@ -187,13 +265,37 @@ async function main() {
|
|
|
187
265
|
limit: opts.harvestLimit,
|
|
188
266
|
});
|
|
189
267
|
if (harvested.length === 0) {
|
|
268
|
+
if (opts.harvestOnly) {
|
|
269
|
+
throw new Error("--harvest-only set but no runs found under <cwd>/.claude-overnight/runs");
|
|
270
|
+
}
|
|
190
271
|
console.log(` (harvest: no runs found under <cwd>/.claude-overnight/runs)`);
|
|
191
272
|
}
|
|
192
273
|
else {
|
|
193
|
-
console.log(` (harvest: +${harvested.length} real objectives)`);
|
|
274
|
+
console.log(` (harvest: ${opts.harvestOnly ? "" : "+"}${harvested.length} real objectives)`);
|
|
194
275
|
cases = cases.concat(harvested);
|
|
195
276
|
}
|
|
196
277
|
}
|
|
278
|
+
// Top up to --case-pool with LLM-generated synthetic cases. The generator
|
|
279
|
+
// caches its output so successive runs share the pool — real cost is
|
|
280
|
+
// paid once, amortised across every subsequent round.
|
|
281
|
+
if (opts.casePool && cases.length < opts.casePool) {
|
|
282
|
+
console.log(` (generating cases to reach pool size ${opts.casePool}…)`);
|
|
283
|
+
try {
|
|
284
|
+
const generated = await generateCases({
|
|
285
|
+
targetCount: opts.casePool - cases.length,
|
|
286
|
+
model: opts.genModel ?? opts.evalModel,
|
|
287
|
+
baseUrl: opts.baseUrl,
|
|
288
|
+
authToken: opts.authToken,
|
|
289
|
+
promptPath,
|
|
290
|
+
existing: cases,
|
|
291
|
+
});
|
|
292
|
+
console.log(` (generated: +${generated.length} synthetic cases)`);
|
|
293
|
+
cases = cases.concat(generated);
|
|
294
|
+
}
|
|
295
|
+
catch (err) {
|
|
296
|
+
console.log(` (case generation failed: ${err.message})`);
|
|
297
|
+
}
|
|
298
|
+
}
|
|
197
299
|
}
|
|
198
300
|
console.log(`Evolution config:`);
|
|
199
301
|
console.log(` target: ${opts.target}`);
|
|
@@ -215,6 +317,12 @@ async function main() {
|
|
|
215
317
|
populationCap: opts.population,
|
|
216
318
|
plateauGenerations: opts.plateau,
|
|
217
319
|
repetitions: opts.reps > 1 ? opts.reps : undefined,
|
|
320
|
+
concurrency: opts.concurrency,
|
|
321
|
+
batch: opts.batch || undefined,
|
|
322
|
+
adaptiveReps: opts.adaptiveCap
|
|
323
|
+
? { cap: opts.adaptiveCap, threshold: opts.adaptiveThreshold }
|
|
324
|
+
: undefined,
|
|
325
|
+
testFraction: opts.testSplit > 0 ? opts.testSplit : undefined,
|
|
218
326
|
judge: opts.useJudge
|
|
219
327
|
? {
|
|
220
328
|
model: opts.judgeModel ?? opts.evalModel,
|
|
@@ -241,6 +349,44 @@ async function main() {
|
|
|
241
349
|
console.log(`speed: ${(result.bestVariant.aggregate.speed * 100).toFixed(1)}%`);
|
|
242
350
|
console.log("\n--- Prompt text ---");
|
|
243
351
|
console.log(result.bestVariant.text);
|
|
352
|
+
return result;
|
|
353
|
+
}
|
|
354
|
+
/**
 * `diff` subcommand: print a Markdown-style table comparing the gmean of
 * every variant found in two persisted runs.
 *
 * For each run only the highest-generation row per variantId is kept, so the
 * table compares final state. Exits with status 2 when either run id is
 * missing from the command line.
 *
 * @param runIdA Id of the baseline run (column "A").
 * @param runIdB Id of the run compared against it (column "B").
 */
async function runDiff(runIdA, runIdB) {
    if (!(runIdA && runIdB)) {
        console.error("usage: claude-overnight-evolve diff <runIdA> <runIdB>");
        process.exit(2);
    }
    const { loadRun } = await import("../prompt-evolution/persistence.js");
    const a = loadRun(runIdA);
    const b = loadRun(runIdB);
    // variantId -> row from the latest generation in which that variant appears.
    const latestByVariant = (run) => {
        const latest = new Map();
        for (const { variantId, generation, gmean } of run.matrix) {
            const seen = latest.get(variantId);
            if (seen && !(generation > seen.generation))
                continue;
            latest.set(variantId, { generation, variantId, gmean });
        }
        return latest;
    };
    const rowsA = latestByVariant(a);
    const rowsB = latestByVariant(b);
    const ids = new Set([...rowsA.keys(), ...rowsB.keys()]);
    const asPercent = (fraction) => (fraction * 100).toFixed(1);
    console.log(`# Diff: ${runIdA} → ${runIdB}`);
    console.log("");
    console.log(`| Variant | A gmean | B gmean | Δ | note |`);
    console.log(`|-----------|-----------|-----------|-------|--------|`);
    for (const id of [...ids].sort()) {
        const ra = rowsA.get(id);
        const rb = rowsB.get(id);
        const ga = ra ? asPercent(ra.gmean) : "—";
        const gb = rb ? asPercent(rb.gmean) : "—";
        // Delta and direction only make sense when the variant exists in both runs.
        let delta = "—";
        let note;
        if (!ra) {
            note = "new in B";
        }
        else if (!rb) {
            note = "missing in B";
        }
        else {
            delta = asPercent(rb.gmean - ra.gmean);
            if (ra.gmean < rb.gmean)
                note = "↑";
            else if (ra.gmean > rb.gmean)
                note = "↓";
            else
                note = "=";
        }
        console.log(`| ${id.padEnd(10)}| ${ga.padStart(9)} | ${gb.padStart(9)} | ${delta.padStart(5)} | ${note} |`);
    }
}
|
|
245
391
|
main().catch((err) => {
|
|
246
392
|
console.error(err);
|
package/dist/core/_version.d.ts
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
export declare const VERSION = "1.55.1";
|
|
1
|
+
export declare const VERSION = "1.57.0";
|
package/dist/core/_version.js
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
// Auto-generated by build — do not edit manually.
|
|
2
|
-
export const VERSION = "1.55.1";
|
|
2
|
+
export const VERSION = "1.57.0";
|
|
package/dist/prompt-evolution/evaluator-judge.d.ts
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM-judge pass over a built evaluation matrix.
|
|
3
|
+
*
|
|
4
|
+
* Split out of evaluator.ts to keep each file under the 500-line cap and
|
|
5
|
+
* because the judge has its own concerns (top-N eligibility, batch vs
|
|
6
|
+
* online path, crash-resumable state).
|
|
7
|
+
*
|
|
8
|
+
* The judge REPLACES the heuristic content score with a semantic grade.
|
|
9
|
+
* We only judge top-N variants per generation to cap cost — a judge call
|
|
10
|
+
* per (variant, case, model) on a large population explodes fast.
|
|
11
|
+
*/
|
|
12
|
+
import { type JudgeOpts } from "./llm-judge.js";
|
|
13
|
+
import type { BenchmarkCase, EvaluationResult } from "./types.js";
|
|
14
|
+
import type { EvalOpts } from "./evaluator.js";
|
|
15
|
+
export declare function runJudge(variants: Array<{
|
|
16
|
+
id: string;
|
|
17
|
+
text: string;
|
|
18
|
+
}>, cases: BenchmarkCase[], models: string[], aggregated: Map<string, EvaluationResult>, judge: JudgeOpts & {
|
|
19
|
+
topN?: number;
|
|
20
|
+
}, opts: EvalOpts): Promise<void>;
|
|
package/dist/prompt-evolution/evaluator-judge.js
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM-judge pass over a built evaluation matrix.
|
|
3
|
+
*
|
|
4
|
+
* Split out of evaluator.ts to keep each file under the 500-line cap and
|
|
5
|
+
* because the judge has its own concerns (top-N eligibility, batch vs
|
|
6
|
+
* online path, crash-resumable state).
|
|
7
|
+
*
|
|
8
|
+
* The judge REPLACES the heuristic content score with a semantic grade.
|
|
9
|
+
* We only judge top-N variants per generation to cap cost — a judge call
|
|
10
|
+
* per (variant, case, model) on a large population explodes fast.
|
|
11
|
+
*/
|
|
12
|
+
import { judgeOutput, buildJudgePrompt, parseJudgeOutput } from "./llm-judge.js";
|
|
13
|
+
import { batchCallModel } from "./transport-batch.js";
|
|
14
|
+
import { saveBatchState, loadBatchState, markBatchFinished } from "./persistence.js";
|
|
15
|
+
import { gmean } from "./scorer.js";
|
|
16
|
+
import { averageDimensions } from "./evaluator-utils.js";
|
|
17
|
+
/**
 * Run the LLM judge over the best-scoring variants of an evaluation matrix.
 *
 * Variants are ranked by the gmean of their averaged score dimensions; only
 * the top `judge.topN` (default 4) are judged. For every judged cell the
 * entry stored in `aggregated` is updated in place: `scores.content` is
 * replaced by the judge score and `judgeJustification` is set. Cells with a
 * parse score below 0.5 are skipped, and a failed judge call leaves the
 * existing content score untouched.
 *
 * @param variants   Variants in the current population (`id` is used for keys).
 * @param cases      Benchmark cases (`hash` is used for keys).
 * @param models     Generator models the matrix was built for.
 * @param aggregated Map keyed by `${variantId}:${caseHash}:${model}`.
 * @param judge      Judge options, plus optional `topN`.
 * @param opts       Evaluation options; `opts.batch` selects the batch path.
 */
export async function runJudge(variants, cases, models, aggregated, judge, opts) {
    const topN = judge.topN ?? 4;
    const cellKey = (variantId, benchCase, model) => `${variantId}:${benchCase.hash}:${model}`;
    // Rank variants by the gmean of their per-dimension averages (0 when a
    // variant has no results at all).
    const ranking = variants.map((variant) => {
        const collected = [];
        for (const benchCase of cases) {
            for (const model of models) {
                const hit = aggregated.get(cellKey(variant.id, benchCase, model));
                if (hit)
                    collected.push(hit.scores);
            }
        }
        const g = collected.length > 0 ? gmean(averageDimensions(collected)) : 0;
        return { id: variant.id, g };
    });
    ranking.sort((left, right) => right.g - left.g);
    const eligible = new Set(ranking.slice(0, topN).map((entry) => entry.id));
    // Collect the cells worth judging, in variant -> case -> model order.
    const cells = [];
    for (const variant of variants) {
        if (!eligible.has(variant.id))
            continue;
        for (const benchCase of cases) {
            for (const model of models) {
                const key = cellKey(variant.id, benchCase, model);
                const r = aggregated.get(key);
                // Unparseable output isn't worth judging.
                const judgeable = r && !(r.scores.parse < 0.5);
                if (judgeable)
                    cells.push({ key, c: benchCase, r });
            }
        }
    }
    if (cells.length === 0)
        return;
    if (opts.batch) {
        await runJudgeBatch(cells, judge, opts);
        return;
    }
    // Online path: judge one cell, swallowing failures on purpose.
    const judgeCell = async (cell) => {
        try {
            const verdict = await judgeOutput(cell.r.rawOutput, cell.c, judge);
            cell.r.scores = { ...cell.r.scores, content: verdict.score };
            cell.r.judgeJustification = verdict.justification;
        }
        catch {
            // Judge failure is non-fatal — keep heuristic content.
        }
    };
    // Small fixed-size worker pool; each worker claims the next unclaimed
    // cell index from a shared cursor until the list is exhausted.
    const judgeConcurrency = 3;
    let cursor = 0;
    const worker = async () => {
        for (let i = cursor++; i < cells.length; i = cursor++) {
            await judgeCell(cells[i]);
        }
    };
    const workerCount = Math.min(judgeConcurrency, cells.length);
    await Promise.all(Array.from({ length: workerCount }, worker));
}
|
|
74
|
+
/**
 * Batch-API variant of the judge pass.
 *
 * Submits one judge request per cell through the batch transport, then
 * writes each parsed verdict back onto its cell in place
 * (`scores.content` and `judgeJustification`). Cells whose result is
 * missing, empty, or unparseable keep their existing content score.
 *
 * When both `opts.runId` and `opts.generation` are set, previously saved
 * batch state for the "judge" phase is looked up and, if found, its batch id
 * is passed to the transport as `resumeBatchId`; otherwise the newly
 * submitted batch id is saved.
 *
 * @param cells Cells to judge: `{ key, c, r }` as built by runJudge.
 * @param judge Judge options (model, optional baseUrl/authToken/maxTokens).
 * @param opts  Evaluation options (transport override, run id, callbacks).
 */
async function runJudgeBatch(cells, judge, opts) {
    // batchJobs[i] is always the job for cells[i]; results are matched back
    // by this position, never by searching the customId text.
    const batchJobs = cells.map((cell, i) => ({
        customId: `j:${i}|k:${cell.key}`,
        userText: buildJudgePrompt(cell.r.rawOutput, cell.c),
        model: judge.model,
    }));
    const existing = opts.runId != null && opts.generation != null
        ? loadBatchState(opts.runId, opts.generation, "judge")
        : null;
    const transport = opts.batchCallModel ?? batchCallModel;
    const results = await transport(batchJobs, {
        baseUrl: judge.baseUrl ?? opts.baseUrl,
        authToken: judge.authToken ?? opts.authToken,
        maxTokens: judge.maxTokens ?? 2048,
        resumeBatchId: existing?.batchId,
        onSubmitted: (batchId, p) => {
            // Only persist state for a fresh submission; a resumed batch is
            // already on disk.
            if (opts.runId != null && opts.generation != null && !existing) {
                saveBatchState(opts.runId, {
                    generation: opts.generation,
                    phase: "judge",
                    batchId,
                    provider: p,
                    submittedAt: new Date().toISOString(),
                });
            }
            opts.onBatchProgress?.(`judge batch submitted: ${batchId} (${p})`);
        },
        onProgress: (p) => opts.onBatchProgress?.(`judge batch ${p.batchId} ${p.phase}${p.succeeded != null ? `: ${p.succeeded}/${p.total ?? batchJobs.length}` : ""}`),
    });
    // NOTE(review): state saved by a fresh submission above is not marked
    // finished here, only resumed batches are — confirm this is intended.
    if (opts.runId != null && existing)
        markBatchFinished(opts.runId, existing.batchId);
    cells.forEach((cell, i) => {
        // Positional lookup. A substring search for `|k:${cell.key}` would
        // also match a job whose key merely starts with this one (model
        // "gpt-4" vs "gpt-4o") and hand this cell another cell's verdict.
        const got = results.get(batchJobs[i].customId);
        if (!got || !got.raw)
            return;
        try {
            const jr = parseJudgeOutput(got.raw);
            cell.r.scores = { ...cell.r.scores, content: jr.score };
            cell.r.judgeJustification = jr.justification;
        }
        catch {
            // Judge parse failure is non-fatal — keep heuristic content.
        }
    });
}
|
|
package/dist/prompt-evolution/evaluator-utils.d.ts
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Small shared helpers used by both evaluator.ts and evaluator-judge.ts.
|
|
3
|
+
* Extracted to break the import cycle that would otherwise form between
|
|
4
|
+
* the two (both call averageDimensions, judge also needs gmean aggregates).
|
|
5
|
+
*/
|
|
6
|
+
import type { ScoreDimensions } from "./types.js";
|
|
7
|
+
export declare function averageDimensions(scores: ScoreDimensions[]): ScoreDimensions;
|
|
package/dist/prompt-evolution/evaluator-utils.js
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Small shared helpers used by both evaluator.ts and evaluator-judge.ts.
|
|
3
|
+
* Extracted to break the import cycle that would otherwise form between
|
|
4
|
+
* the two (both call averageDimensions, judge also needs gmean aggregates).
|
|
5
|
+
*/
|
|
6
|
+
/**
 * Arithmetic mean of each score dimension across a list of score records.
 *
 * @param scores Records with numeric `parse`, `schema`, `content`,
 *               `costEfficiency` and `speed` fields.
 * @returns A new record holding the per-dimension means; all zeros when
 *          `scores` is empty.
 */
export function averageDimensions(scores) {
    const totals = { parse: 0, schema: 0, content: 0, costEfficiency: 0, speed: 0 };
    const n = scores.length;
    if (n === 0)
        return totals;
    // Accumulate every dimension in one pass, in input order.
    for (const entry of scores) {
        totals.parse += entry.parse;
        totals.schema += entry.schema;
        totals.content += entry.content;
        totals.costEfficiency += entry.costEfficiency;
        totals.speed += entry.speed;
    }
    return {
        parse: totals.parse / n,
        schema: totals.schema / n,
        content: totals.content / n,
        costEfficiency: totals.costEfficiency / n,
        speed: totals.speed / n,
    };
}
|
|
package/dist/prompt-evolution/evaluator.d.ts
CHANGED
|
@@ -17,6 +17,7 @@
|
|
|
17
17
|
*/
|
|
18
18
|
import { type JudgeOpts } from "./llm-judge.js";
|
|
19
19
|
import { type CallModel } from "./transport.js";
|
|
20
|
+
import { batchCallModel } from "./transport-batch.js";
|
|
20
21
|
import type { BenchmarkCase, VariantRow, PromptVars } from "./types.js";
|
|
21
22
|
export interface EvalOpts {
|
|
22
23
|
/** Primary generator model (retained for single-model compat). */
|
|
@@ -35,14 +36,33 @@ export interface EvalOpts {
|
|
|
35
36
|
timeoutMs?: number;
|
|
36
37
|
/** Repetitions per (variant, case, model). Default 1 — opt-in to 3+ for noise floor. */
|
|
37
38
|
repetitions?: number;
|
|
39
|
+
/**
|
|
40
|
+
* Adaptive sampling: after initial `repetitions`, keep adding one rep per cell
|
|
41
|
+
* where any score-dim σ exceeds `threshold`, up to `cap` total reps. Prevents
|
|
42
|
+
* wasted reps on already-stable cells while driving noisy ones down.
|
|
43
|
+
*/
|
|
44
|
+
adaptiveReps?: {
|
|
45
|
+
cap: number;
|
|
46
|
+
threshold?: number;
|
|
47
|
+
};
|
|
38
48
|
/** Inject an llm-judge call per case; content dimension is replaced by judge score. */
|
|
39
49
|
judge?: JudgeOpts & {
|
|
40
50
|
topN?: number;
|
|
41
51
|
};
|
|
42
52
|
/** Transport override for tests. */
|
|
43
53
|
callModel?: CallModel;
|
|
54
|
+
/** Use provider batch API instead of online calls (50% cheaper, slower wall-clock). */
|
|
55
|
+
batch?: boolean;
|
|
56
|
+
/** Run id — required when batch=true so state is crash-resumable. */
|
|
57
|
+
runId?: string;
|
|
58
|
+
/** Current generation number — used to key batch state. */
|
|
59
|
+
generation?: number;
|
|
60
|
+
/** Batch-transport override for tests. Same return shape as transport-batch.batchCallModel. */
|
|
61
|
+
batchCallModel?: typeof batchCallModel;
|
|
44
62
|
/** Optional callback for progress */
|
|
45
63
|
onProgress?: (done: number, total: number, caseName: string, variantId: string) => void;
|
|
64
|
+
/** Progress callback specific to batch-phase transitions. */
|
|
65
|
+
onBatchProgress?: (msg: string) => void;
|
|
46
66
|
}
|
|
47
67
|
export declare function buildMatrix(variants: Array<{
|
|
48
68
|
id: string;
|