claude-overnight 1.55.2 → 1.57.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/evolve.js +142 -2
- package/dist/core/_version.d.ts +1 -1
- package/dist/core/_version.js +1 -1
- package/dist/prompt-evolution/evaluator-judge.d.ts +20 -0
- package/dist/prompt-evolution/evaluator-judge.js +119 -0
- package/dist/prompt-evolution/evaluator-utils.d.ts +7 -0
- package/dist/prompt-evolution/evaluator-utils.js +17 -0
- package/dist/prompt-evolution/evaluator.d.ts +20 -0
- package/dist/prompt-evolution/evaluator.js +212 -89
- package/dist/prompt-evolution/fixtures/generate.d.ts +38 -0
- package/dist/prompt-evolution/fixtures/generate.js +168 -0
- package/dist/prompt-evolution/index.d.ts +16 -0
- package/dist/prompt-evolution/index.js +64 -7
- package/dist/prompt-evolution/llm-judge.d.ts +2 -0
- package/dist/prompt-evolution/llm-judge.js +2 -2
- package/dist/prompt-evolution/persistence.d.ts +20 -0
- package/dist/prompt-evolution/persistence.js +39 -0
- package/dist/prompt-evolution/report.d.ts +1 -1
- package/dist/prompt-evolution/report.js +134 -7
- package/dist/prompt-evolution/scorer.d.ts +34 -0
- package/dist/prompt-evolution/scorer.js +94 -0
- package/dist/prompt-evolution/transport-batch.d.ts +54 -0
- package/dist/prompt-evolution/transport-batch.js +213 -0
- package/dist/prompt-evolution/types.d.ts +10 -0
- package/package.json +1 -1
- package/plugins/claude-overnight/.claude-plugin/plugin.json +1 -1
package/dist/bin/evolve.js
CHANGED
|
@@ -18,6 +18,7 @@
|
|
|
18
18
|
import { evolvePrompt } from "../prompt-evolution/index.js";
|
|
19
19
|
import { PLAN_CASES } from "../prompt-evolution/fixtures/plan-cases.js";
|
|
20
20
|
import { harvestRealCases } from "../prompt-evolution/fixtures/harvest.js";
|
|
21
|
+
import { generateCases } from "../prompt-evolution/fixtures/generate.js";
|
|
21
22
|
import { scenariosToCases, PLANNING_SCENARIOS, REVIEW_SCENARIOS, SUPERVISION_SCENARIOS, STUCK_SCENARIOS, hydrateCases, extractPrompt, } from "../prompt-evolution/adapters/mcp-browser.js";
|
|
22
23
|
function help() {
|
|
23
24
|
process.stdout.write(`Usage: claude-overnight-evolve [options]
|
|
@@ -35,13 +36,27 @@ Options:
|
|
|
35
36
|
--plateau <n> Stop early if no improvement for N generations (default: 3)
|
|
36
37
|
--reps <n> Repetitions per (variant, case, model) for noise floor (default: 1)
|
|
37
38
|
--concurrency <n> Max in-flight eval calls (default: 8; bump for slow endpoints)
|
|
39
|
+
--batch Use provider batch API (50% cheaper, slower wall-clock)
|
|
40
|
+
--adaptive-cap <n> Adaptive sampling: extend reps up to N when σ > threshold (default: off)
|
|
41
|
+
--adaptive-threshold <x> σ threshold that triggers an extra rep (default: 0.1)
|
|
38
42
|
--judge Use llm-judge for content scoring (costs extra API calls)
|
|
39
43
|
--judge-model <model> Model to use for the judge (default: same as eval-model)
|
|
40
44
|
--judge-top-n <n> Judge only the top-N variants per generation (default: 4)
|
|
41
45
|
--cases <suite> Benchmark suite: plan | mcp-planning | mcp-review |
|
|
42
46
|
mcp-supervision | mcp-stuck (default: plan)
|
|
43
47
|
--harvest Append cases harvested from <cwd>/.claude-overnight/runs/*
|
|
48
|
+
--harvest-only Use ONLY harvested real objectives (fails if none found)
|
|
44
49
|
--harvest-limit <n> Max harvested cases (default: 10)
|
|
50
|
+
--prompts <list> Comma-separated prompt paths to evolve in sequence
|
|
51
|
+
--test-split <f> Hold out fraction f of cases for a selection-bias-free
|
|
52
|
+
final eval (default: 0 = no split). Use 0.3 for rigor.
|
|
53
|
+
--case-pool <n> Target total case count; generates synthetic cases via
|
|
54
|
+
LLM to top up if the current pool is smaller.
|
|
55
|
+
--gen-model <model> Model used by the case generator (default: eval-model)
|
|
56
|
+
|
|
57
|
+
Subcommands:
|
|
58
|
+
claude-overnight-evolve diff <runIdA> <runIdB>
|
|
59
|
+
Print a per-variant diff of two persisted runs
|
|
45
60
|
--base-url <url> API base URL override
|
|
46
61
|
--auth-token <token> Auth token override
|
|
47
62
|
--run-id <id> Preset run id (default: auto-generated)
|
|
@@ -62,11 +77,14 @@ function parseArgs() {
|
|
|
62
77
|
population: 8,
|
|
63
78
|
plateau: 3,
|
|
64
79
|
reps: 1,
|
|
80
|
+
batch: false,
|
|
65
81
|
useJudge: false,
|
|
66
82
|
judgeTopN: 4,
|
|
67
83
|
cases: "",
|
|
68
84
|
harvest: false,
|
|
85
|
+
harvestOnly: false,
|
|
69
86
|
harvestLimit: 10,
|
|
87
|
+
testSplit: 0,
|
|
70
88
|
baseUrl: process.env.ANTHROPIC_BASE_URL,
|
|
71
89
|
authToken: process.env.ANTHROPIC_AUTH_TOKEN ?? process.env.ANTHROPIC_API_KEY,
|
|
72
90
|
};
|
|
@@ -117,6 +135,17 @@ function parseArgs() {
|
|
|
117
135
|
opts.concurrency = parseInt(v, 10);
|
|
118
136
|
i++;
|
|
119
137
|
break;
|
|
138
|
+
case "--batch":
|
|
139
|
+
opts.batch = true;
|
|
140
|
+
break;
|
|
141
|
+
case "--adaptive-cap":
|
|
142
|
+
opts.adaptiveCap = parseInt(v, 10);
|
|
143
|
+
i++;
|
|
144
|
+
break;
|
|
145
|
+
case "--adaptive-threshold":
|
|
146
|
+
opts.adaptiveThreshold = parseFloat(v);
|
|
147
|
+
i++;
|
|
148
|
+
break;
|
|
120
149
|
case "--judge":
|
|
121
150
|
opts.useJudge = true;
|
|
122
151
|
break;
|
|
@@ -135,10 +164,30 @@ function parseArgs() {
|
|
|
135
164
|
case "--harvest":
|
|
136
165
|
opts.harvest = true;
|
|
137
166
|
break;
|
|
167
|
+
case "--harvest-only":
|
|
168
|
+
opts.harvest = true;
|
|
169
|
+
opts.harvestOnly = true;
|
|
170
|
+
break;
|
|
138
171
|
case "--harvest-limit":
|
|
139
172
|
opts.harvestLimit = parseInt(v, 10);
|
|
140
173
|
i++;
|
|
141
174
|
break;
|
|
175
|
+
case "--prompts":
|
|
176
|
+
opts.prompts = v.split(",").map((s) => s.trim()).filter(Boolean);
|
|
177
|
+
i++;
|
|
178
|
+
break;
|
|
179
|
+
case "--test-split":
|
|
180
|
+
opts.testSplit = parseFloat(v);
|
|
181
|
+
i++;
|
|
182
|
+
break;
|
|
183
|
+
case "--case-pool":
|
|
184
|
+
opts.casePool = parseInt(v, 10);
|
|
185
|
+
i++;
|
|
186
|
+
break;
|
|
187
|
+
case "--gen-model":
|
|
188
|
+
opts.genModel = v;
|
|
189
|
+
i++;
|
|
190
|
+
break;
|
|
142
191
|
case "--base-url":
|
|
143
192
|
opts.baseUrl = v;
|
|
144
193
|
i++;
|
|
@@ -161,7 +210,31 @@ function parseArgs() {
|
|
|
161
210
|
return opts;
|
|
162
211
|
}
|
|
163
212
|
async function main() {
|
|
213
|
+
// Subcommand: diff two persisted runs.
|
|
214
|
+
if (process.argv[2] === "diff") {
|
|
215
|
+
await runDiff(process.argv[3], process.argv[4]);
|
|
216
|
+
return;
|
|
217
|
+
}
|
|
164
218
|
const opts = parseArgs();
|
|
219
|
+
// Multi-prompt mode: loop evolvePrompt once per prompt in opts.prompts.
|
|
220
|
+
// Each iteration gets its own runId and report. Post a combined summary
|
|
221
|
+
// at the end so the user sees best-of-batch across all prompts.
|
|
222
|
+
if (opts.prompts && opts.prompts.length > 0) {
|
|
223
|
+
const summary = [];
|
|
224
|
+
for (const p of opts.prompts) {
|
|
225
|
+
console.log(`\n========== Evolving ${p} ==========\n`);
|
|
226
|
+
const result = await evolveOne({ ...opts, prompt: p });
|
|
227
|
+
summary.push({ prompt: p, runId: result.runId, gmean: result.bestVariant.gmean, reportPath: result.reportPath });
|
|
228
|
+
}
|
|
229
|
+
console.log("\n========== Multi-prompt summary ==========");
|
|
230
|
+
for (const s of summary) {
|
|
231
|
+
console.log(` ${s.prompt.padEnd(40)} gmean=${(s.gmean * 100).toFixed(1)}% runId=${s.runId}`);
|
|
232
|
+
}
|
|
233
|
+
return;
|
|
234
|
+
}
|
|
235
|
+
await evolveOne(opts);
|
|
236
|
+
}
|
|
237
|
+
async function evolveOne(opts) {
|
|
165
238
|
let cases;
|
|
166
239
|
let promptPath = opts.prompt;
|
|
167
240
|
let seedText;
|
|
@@ -182,7 +255,7 @@ async function main() {
|
|
|
182
255
|
}
|
|
183
256
|
else {
|
|
184
257
|
if (opts.cases === "plan")
|
|
185
|
-
cases = [...PLAN_CASES];
|
|
258
|
+
cases = opts.harvestOnly ? [] : [...PLAN_CASES];
|
|
186
259
|
else
|
|
187
260
|
throw new Error(`Unknown case suite: ${opts.cases}`);
|
|
188
261
|
if (opts.harvest) {
|
|
@@ -192,13 +265,37 @@ async function main() {
|
|
|
192
265
|
limit: opts.harvestLimit,
|
|
193
266
|
});
|
|
194
267
|
if (harvested.length === 0) {
|
|
268
|
+
if (opts.harvestOnly) {
|
|
269
|
+
throw new Error("--harvest-only set but no runs found under <cwd>/.claude-overnight/runs");
|
|
270
|
+
}
|
|
195
271
|
console.log(` (harvest: no runs found under <cwd>/.claude-overnight/runs)`);
|
|
196
272
|
}
|
|
197
273
|
else {
|
|
198
|
-
console.log(` (harvest:
|
|
274
|
+
console.log(` (harvest: ${opts.harvestOnly ? "" : "+"}${harvested.length} real objectives)`);
|
|
199
275
|
cases = cases.concat(harvested);
|
|
200
276
|
}
|
|
201
277
|
}
|
|
278
|
+
// Top up to --case-pool with LLM-generated synthetic cases. The generator
|
|
279
|
+
// caches its output so successive runs share the pool — real cost is
|
|
280
|
+
// paid once, amortised across every subsequent round.
|
|
281
|
+
if (opts.casePool && cases.length < opts.casePool) {
|
|
282
|
+
console.log(` (generating cases to reach pool size ${opts.casePool}…)`);
|
|
283
|
+
try {
|
|
284
|
+
const generated = await generateCases({
|
|
285
|
+
targetCount: opts.casePool - cases.length,
|
|
286
|
+
model: opts.genModel ?? opts.evalModel,
|
|
287
|
+
baseUrl: opts.baseUrl,
|
|
288
|
+
authToken: opts.authToken,
|
|
289
|
+
promptPath,
|
|
290
|
+
existing: cases,
|
|
291
|
+
});
|
|
292
|
+
console.log(` (generated: +${generated.length} synthetic cases)`);
|
|
293
|
+
cases = cases.concat(generated);
|
|
294
|
+
}
|
|
295
|
+
catch (err) {
|
|
296
|
+
console.log(` (case generation failed: ${err.message})`);
|
|
297
|
+
}
|
|
298
|
+
}
|
|
202
299
|
}
|
|
203
300
|
console.log(`Evolution config:`);
|
|
204
301
|
console.log(` target: ${opts.target}`);
|
|
@@ -221,6 +318,11 @@ async function main() {
|
|
|
221
318
|
plateauGenerations: opts.plateau,
|
|
222
319
|
repetitions: opts.reps > 1 ? opts.reps : undefined,
|
|
223
320
|
concurrency: opts.concurrency,
|
|
321
|
+
batch: opts.batch || undefined,
|
|
322
|
+
adaptiveReps: opts.adaptiveCap
|
|
323
|
+
? { cap: opts.adaptiveCap, threshold: opts.adaptiveThreshold }
|
|
324
|
+
: undefined,
|
|
325
|
+
testFraction: opts.testSplit > 0 ? opts.testSplit : undefined,
|
|
224
326
|
judge: opts.useJudge
|
|
225
327
|
? {
|
|
226
328
|
model: opts.judgeModel ?? opts.evalModel,
|
|
@@ -247,6 +349,44 @@ async function main() {
|
|
|
247
349
|
console.log(`speed: ${(result.bestVariant.aggregate.speed * 100).toFixed(1)}%`);
|
|
248
350
|
console.log("\n--- Prompt text ---");
|
|
249
351
|
console.log(result.bestVariant.text);
|
|
352
|
+
return result;
|
|
353
|
+
}
|
|
354
|
+
async function runDiff(runIdA, runIdB) {
|
|
355
|
+
if (!runIdA || !runIdB) {
|
|
356
|
+
console.error("usage: claude-overnight-evolve diff <runIdA> <runIdB>");
|
|
357
|
+
process.exit(2);
|
|
358
|
+
}
|
|
359
|
+
const { loadRun } = await import("../prompt-evolution/persistence.js");
|
|
360
|
+
const a = loadRun(runIdA);
|
|
361
|
+
const b = loadRun(runIdB);
|
|
362
|
+
const collect = (run) => {
|
|
363
|
+
const out = new Map();
|
|
364
|
+
for (const rec of run.matrix) {
|
|
365
|
+
// Keep the latest-generation row per variantId so diff compares final state.
|
|
366
|
+
const existing = out.get(rec.variantId);
|
|
367
|
+
if (!existing || rec.generation > existing.generation) {
|
|
368
|
+
out.set(rec.variantId, { generation: rec.generation, variantId: rec.variantId, gmean: rec.gmean });
|
|
369
|
+
}
|
|
370
|
+
}
|
|
371
|
+
return out;
|
|
372
|
+
};
|
|
373
|
+
const rowsA = collect(a);
|
|
374
|
+
const rowsB = collect(b);
|
|
375
|
+
const ids = new Set([...rowsA.keys(), ...rowsB.keys()]);
|
|
376
|
+
console.log(`# Diff: ${runIdA} → ${runIdB}`);
|
|
377
|
+
console.log("");
|
|
378
|
+
console.log(`| Variant | A gmean | B gmean | Δ | note |`);
|
|
379
|
+
console.log(`|-----------|-----------|-----------|-------|--------|`);
|
|
380
|
+
const sorted = [...ids].sort();
|
|
381
|
+
for (const id of sorted) {
|
|
382
|
+
const ra = rowsA.get(id);
|
|
383
|
+
const rb = rowsB.get(id);
|
|
384
|
+
const ga = ra ? (ra.gmean * 100).toFixed(1) : "—";
|
|
385
|
+
const gb = rb ? (rb.gmean * 100).toFixed(1) : "—";
|
|
386
|
+
const delta = ra && rb ? ((rb.gmean - ra.gmean) * 100).toFixed(1) : "—";
|
|
387
|
+
const note = !ra ? "new in B" : !rb ? "missing in B" : ra.gmean < rb.gmean ? "↑" : ra.gmean > rb.gmean ? "↓" : "=";
|
|
388
|
+
console.log(`| ${id.padEnd(10)}| ${ga.padStart(9)} | ${gb.padStart(9)} | ${delta.padStart(5)} | ${note} |`);
|
|
389
|
+
}
|
|
250
390
|
}
|
|
251
391
|
main().catch((err) => {
|
|
252
392
|
console.error(err);
|
package/dist/core/_version.d.ts
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
export declare const VERSION = "1.55.2";
|
|
1
|
+
export declare const VERSION = "1.57.0";
|
package/dist/core/_version.js
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
// Auto-generated by build — do not edit manually.
|
|
2
|
-
export const VERSION = "1.55.2";
|
|
2
|
+
export const VERSION = "1.57.0";
|
|
package/dist/prompt-evolution/evaluator-judge.d.ts
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM-judge pass over a built evaluation matrix.
|
|
3
|
+
*
|
|
4
|
+
* Split out of evaluator.ts to keep each file under the 500-line cap and
|
|
5
|
+
* because the judge has its own concerns (top-N eligibility, batch vs
|
|
6
|
+
* online path, crash-resumable state).
|
|
7
|
+
*
|
|
8
|
+
* The judge REPLACES the heuristic content score with a semantic grade.
|
|
9
|
+
* We only judge top-N variants per generation to cap cost — a judge call
|
|
10
|
+
* per (variant, case, model) on a large population explodes fast.
|
|
11
|
+
*/
|
|
12
|
+
import { type JudgeOpts } from "./llm-judge.js";
|
|
13
|
+
import type { BenchmarkCase, EvaluationResult } from "./types.js";
|
|
14
|
+
import type { EvalOpts } from "./evaluator.js";
|
|
15
|
+
export declare function runJudge(variants: Array<{
|
|
16
|
+
id: string;
|
|
17
|
+
text: string;
|
|
18
|
+
}>, cases: BenchmarkCase[], models: string[], aggregated: Map<string, EvaluationResult>, judge: JudgeOpts & {
|
|
19
|
+
topN?: number;
|
|
20
|
+
}, opts: EvalOpts): Promise<void>;
|
|
package/dist/prompt-evolution/evaluator-judge.js
@@ -0,0 +1,119 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM-judge pass over a built evaluation matrix.
|
|
3
|
+
*
|
|
4
|
+
* Split out of evaluator.ts to keep each file under the 500-line cap and
|
|
5
|
+
* because the judge has its own concerns (top-N eligibility, batch vs
|
|
6
|
+
* online path, crash-resumable state).
|
|
7
|
+
*
|
|
8
|
+
* The judge REPLACES the heuristic content score with a semantic grade.
|
|
9
|
+
* We only judge top-N variants per generation to cap cost — a judge call
|
|
10
|
+
* per (variant, case, model) on a large population explodes fast.
|
|
11
|
+
*/
|
|
12
|
+
import { judgeOutput, buildJudgePrompt, parseJudgeOutput } from "./llm-judge.js";
|
|
13
|
+
import { batchCallModel } from "./transport-batch.js";
|
|
14
|
+
import { saveBatchState, loadBatchState, markBatchFinished } from "./persistence.js";
|
|
15
|
+
import { gmean } from "./scorer.js";
|
|
16
|
+
import { averageDimensions } from "./evaluator-utils.js";
|
|
17
|
+
export async function runJudge(variants, cases, models, aggregated, judge, opts) {
|
|
18
|
+
const topN = judge.topN ?? 4;
|
|
19
|
+
const variantGmeans = variants.map((v) => {
|
|
20
|
+
const scores = [];
|
|
21
|
+
for (const c of cases) {
|
|
22
|
+
for (const model of models) {
|
|
23
|
+
const r = aggregated.get(`${v.id}:${c.hash}:${model}`);
|
|
24
|
+
if (r)
|
|
25
|
+
scores.push(r.scores);
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
return { id: v.id, g: scores.length > 0 ? gmean(averageDimensions(scores)) : 0 };
|
|
29
|
+
});
|
|
30
|
+
variantGmeans.sort((a, b) => b.g - a.g);
|
|
31
|
+
const eligible = new Set(variantGmeans.slice(0, topN).map((x) => x.id));
|
|
32
|
+
const cells = [];
|
|
33
|
+
for (const v of variants) {
|
|
34
|
+
if (!eligible.has(v.id))
|
|
35
|
+
continue;
|
|
36
|
+
for (const c of cases) {
|
|
37
|
+
for (const model of models) {
|
|
38
|
+
const key = `${v.id}:${c.hash}:${model}`;
|
|
39
|
+
const r = aggregated.get(key);
|
|
40
|
+
if (!r || r.scores.parse < 0.5)
|
|
41
|
+
continue; // unparseable output isn't worth judging
|
|
42
|
+
cells.push({ key, c, r });
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
}
|
|
46
|
+
if (cells.length === 0)
|
|
47
|
+
return;
|
|
48
|
+
if (opts.batch) {
|
|
49
|
+
await runJudgeBatch(cells, judge, opts);
|
|
50
|
+
return;
|
|
51
|
+
}
|
|
52
|
+
const jobs = cells.map((cell) => async () => {
|
|
53
|
+
try {
|
|
54
|
+
const jr = await judgeOutput(cell.r.rawOutput, cell.c, judge);
|
|
55
|
+
cell.r.scores = { ...cell.r.scores, content: jr.score };
|
|
56
|
+
cell.r.judgeJustification = jr.justification;
|
|
57
|
+
}
|
|
58
|
+
catch {
|
|
59
|
+
// Judge failure is non-fatal — keep heuristic content.
|
|
60
|
+
}
|
|
61
|
+
});
|
|
62
|
+
const judgeConcurrency = 3;
|
|
63
|
+
let nextJob = 0;
|
|
64
|
+
const judgeWorker = async () => {
|
|
65
|
+
while (true) {
|
|
66
|
+
const i = nextJob++;
|
|
67
|
+
if (i >= jobs.length)
|
|
68
|
+
return;
|
|
69
|
+
await jobs[i]();
|
|
70
|
+
}
|
|
71
|
+
};
|
|
72
|
+
await Promise.all(Array.from({ length: Math.min(judgeConcurrency, jobs.length) }, judgeWorker));
|
|
73
|
+
}
|
|
74
|
+
async function runJudgeBatch(cells, judge, opts) {
|
|
75
|
+
const batchJobs = cells.map((cell, i) => ({
|
|
76
|
+
customId: `j:${i}|k:${cell.key}`,
|
|
77
|
+
userText: buildJudgePrompt(cell.r.rawOutput, cell.c),
|
|
78
|
+
model: judge.model,
|
|
79
|
+
}));
|
|
80
|
+
const existing = opts.runId != null && opts.generation != null
|
|
81
|
+
? loadBatchState(opts.runId, opts.generation, "judge")
|
|
82
|
+
: null;
|
|
83
|
+
const transport = opts.batchCallModel ?? batchCallModel;
|
|
84
|
+
const results = await transport(batchJobs, {
|
|
85
|
+
baseUrl: judge.baseUrl ?? opts.baseUrl,
|
|
86
|
+
authToken: judge.authToken ?? opts.authToken,
|
|
87
|
+
maxTokens: judge.maxTokens ?? 2048,
|
|
88
|
+
resumeBatchId: existing?.batchId,
|
|
89
|
+
onSubmitted: (batchId, p) => {
|
|
90
|
+
if (opts.runId != null && opts.generation != null && !existing) {
|
|
91
|
+
saveBatchState(opts.runId, {
|
|
92
|
+
generation: opts.generation,
|
|
93
|
+
phase: "judge",
|
|
94
|
+
batchId,
|
|
95
|
+
provider: p,
|
|
96
|
+
submittedAt: new Date().toISOString(),
|
|
97
|
+
});
|
|
98
|
+
}
|
|
99
|
+
opts.onBatchProgress?.(`judge batch submitted: ${batchId} (${p})`);
|
|
100
|
+
},
|
|
101
|
+
onProgress: (p) => opts.onBatchProgress?.(`judge batch ${p.batchId} ${p.phase}${p.succeeded != null ? `: ${p.succeeded}/${p.total ?? batchJobs.length}` : ""}`),
|
|
102
|
+
});
|
|
103
|
+
if (opts.runId != null && existing)
|
|
104
|
+
markBatchFinished(opts.runId, existing.batchId);
|
|
105
|
+
for (const cell of cells) {
|
|
106
|
+
const customId = batchJobs.find((b) => b.customId.includes(`|k:${cell.key}`))?.customId;
|
|
107
|
+
const got = customId ? results.get(customId) : undefined;
|
|
108
|
+
if (!got || !got.raw)
|
|
109
|
+
continue;
|
|
110
|
+
try {
|
|
111
|
+
const jr = parseJudgeOutput(got.raw);
|
|
112
|
+
cell.r.scores = { ...cell.r.scores, content: jr.score };
|
|
113
|
+
cell.r.judgeJustification = jr.justification;
|
|
114
|
+
}
|
|
115
|
+
catch {
|
|
116
|
+
// Judge parse failure is non-fatal — keep heuristic content.
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
}
|
|
package/dist/prompt-evolution/evaluator-utils.d.ts
@@ -0,0 +1,7 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Small shared helpers used by both evaluator.ts and evaluator-judge.ts.
|
|
3
|
+
* Extracted to break the import cycle that would otherwise form between
|
|
4
|
+
* the two (both call averageDimensions, judge also needs gmean aggregates).
|
|
5
|
+
*/
|
|
6
|
+
import type { ScoreDimensions } from "./types.js";
|
|
7
|
+
export declare function averageDimensions(scores: ScoreDimensions[]): ScoreDimensions;
|
|
package/dist/prompt-evolution/evaluator-utils.js
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Small shared helpers used by both evaluator.ts and evaluator-judge.ts.
|
|
3
|
+
* Extracted to break the import cycle that would otherwise form between
|
|
4
|
+
* the two (both call averageDimensions, judge also needs gmean aggregates).
|
|
5
|
+
*/
|
|
6
|
+
export function averageDimensions(scores) {
|
|
7
|
+
if (scores.length === 0)
|
|
8
|
+
return { parse: 0, schema: 0, content: 0, costEfficiency: 0, speed: 0 };
|
|
9
|
+
const n = scores.length;
|
|
10
|
+
return {
|
|
11
|
+
parse: scores.reduce((a, b) => a + b.parse, 0) / n,
|
|
12
|
+
schema: scores.reduce((a, b) => a + b.schema, 0) / n,
|
|
13
|
+
content: scores.reduce((a, b) => a + b.content, 0) / n,
|
|
14
|
+
costEfficiency: scores.reduce((a, b) => a + b.costEfficiency, 0) / n,
|
|
15
|
+
speed: scores.reduce((a, b) => a + b.speed, 0) / n,
|
|
16
|
+
};
|
|
17
|
+
}
|
|
package/dist/prompt-evolution/evaluator.d.ts
@@ -17,6 +17,7 @@
|
|
|
17
17
|
*/
|
|
18
18
|
import { type JudgeOpts } from "./llm-judge.js";
|
|
19
19
|
import { type CallModel } from "./transport.js";
|
|
20
|
+
import { batchCallModel } from "./transport-batch.js";
|
|
20
21
|
import type { BenchmarkCase, VariantRow, PromptVars } from "./types.js";
|
|
21
22
|
export interface EvalOpts {
|
|
22
23
|
/** Primary generator model (retained for single-model compat). */
|
|
@@ -35,14 +36,33 @@ export interface EvalOpts {
|
|
|
35
36
|
timeoutMs?: number;
|
|
36
37
|
/** Repetitions per (variant, case, model). Default 1 — opt-in to 3+ for noise floor. */
|
|
37
38
|
repetitions?: number;
|
|
39
|
+
/**
|
|
40
|
+
* Adaptive sampling: after initial `repetitions`, keep adding one rep per cell
|
|
41
|
+
* where any score-dim σ exceeds `threshold`, up to `cap` total reps. Prevents
|
|
42
|
+
* wasted reps on already-stable cells while driving noisy ones down.
|
|
43
|
+
*/
|
|
44
|
+
adaptiveReps?: {
|
|
45
|
+
cap: number;
|
|
46
|
+
threshold?: number;
|
|
47
|
+
};
|
|
38
48
|
/** Inject an llm-judge call per case; content dimension is replaced by judge score. */
|
|
39
49
|
judge?: JudgeOpts & {
|
|
40
50
|
topN?: number;
|
|
41
51
|
};
|
|
42
52
|
/** Transport override for tests. */
|
|
43
53
|
callModel?: CallModel;
|
|
54
|
+
/** Use provider batch API instead of online calls (50% cheaper, slower wall-clock). */
|
|
55
|
+
batch?: boolean;
|
|
56
|
+
/** Run id — required when batch=true so state is crash-resumable. */
|
|
57
|
+
runId?: string;
|
|
58
|
+
/** Current generation number — used to key batch state. */
|
|
59
|
+
generation?: number;
|
|
60
|
+
/** Batch-transport override for tests. Same return shape as transport-batch.batchCallModel. */
|
|
61
|
+
batchCallModel?: typeof batchCallModel;
|
|
44
62
|
/** Optional callback for progress */
|
|
45
63
|
onProgress?: (done: number, total: number, caseName: string, variantId: string) => void;
|
|
64
|
+
/** Progress callback specific to batch-phase transitions. */
|
|
65
|
+
onBatchProgress?: (msg: string) => void;
|
|
46
66
|
}
|
|
47
67
|
export declare function buildMatrix(variants: Array<{
|
|
48
68
|
id: string;
|