claude-overnight 1.55.1 → 1.57.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -18,6 +18,7 @@
18
18
  import { evolvePrompt } from "../prompt-evolution/index.js";
19
19
  import { PLAN_CASES } from "../prompt-evolution/fixtures/plan-cases.js";
20
20
  import { harvestRealCases } from "../prompt-evolution/fixtures/harvest.js";
21
+ import { generateCases } from "../prompt-evolution/fixtures/generate.js";
21
22
  import { scenariosToCases, PLANNING_SCENARIOS, REVIEW_SCENARIOS, SUPERVISION_SCENARIOS, STUCK_SCENARIOS, hydrateCases, extractPrompt, } from "../prompt-evolution/adapters/mcp-browser.js";
22
23
  function help() {
23
24
  process.stdout.write(`Usage: claude-overnight-evolve [options]
@@ -34,13 +35,28 @@ Options:
34
35
  --population <n> Max population size (default: 8)
35
36
  --plateau <n> Stop early if no improvement for N generations (default: 3)
36
37
  --reps <n> Repetitions per (variant, case, model) for noise floor (default: 1)
38
+ --concurrency <n> Max in-flight eval calls (default: 8; bump for slow endpoints)
39
+ --batch Use provider batch API (50% cheaper, slower wall-clock)
40
+ --adaptive-cap <n> Adaptive sampling: extend reps up to N when σ > threshold (default: off)
41
+ --adaptive-threshold <x> σ threshold that triggers an extra rep (default: 0.1)
37
42
  --judge Use llm-judge for content scoring (costs extra API calls)
38
43
  --judge-model <model> Model to use for the judge (default: same as eval-model)
39
44
  --judge-top-n <n> Judge only the top-N variants per generation (default: 4)
40
45
  --cases <suite> Benchmark suite: plan | mcp-planning | mcp-review |
41
46
  mcp-supervision | mcp-stuck (default: plan)
42
47
  --harvest Append cases harvested from <cwd>/.claude-overnight/runs/*
48
+ --harvest-only Use ONLY harvested real objectives (fails if none found)
43
49
  --harvest-limit <n> Max harvested cases (default: 10)
50
+ --prompts <list> Comma-separated prompt paths to evolve in sequence
51
+ --test-split <f> Hold out fraction f of cases for a selection-bias-free
52
+ final eval (default: 0 = no split). Use 0.3 for rigor.
53
+ --case-pool <n> Target total case count; generates synthetic cases via
54
+ LLM to top up if the current pool is smaller.
55
+ --gen-model <model> Model used by the case generator (default: eval-model)
56
+
57
+ Subcommands:
58
+ claude-overnight-evolve diff <runIdA> <runIdB>
59
+ Print a per-variant diff of two persisted runs
44
60
  --base-url <url> API base URL override
45
61
  --auth-token <token> Auth token override
46
62
  --run-id <id> Preset run id (default: auto-generated)
@@ -61,11 +77,14 @@ function parseArgs() {
61
77
  population: 8,
62
78
  plateau: 3,
63
79
  reps: 1,
80
+ batch: false,
64
81
  useJudge: false,
65
82
  judgeTopN: 4,
66
83
  cases: "",
67
84
  harvest: false,
85
+ harvestOnly: false,
68
86
  harvestLimit: 10,
87
+ testSplit: 0,
69
88
  baseUrl: process.env.ANTHROPIC_BASE_URL,
70
89
  authToken: process.env.ANTHROPIC_AUTH_TOKEN ?? process.env.ANTHROPIC_API_KEY,
71
90
  };
@@ -112,6 +131,21 @@ function parseArgs() {
112
131
  opts.reps = parseInt(v, 10);
113
132
  i++;
114
133
  break;
134
+ case "--concurrency":
135
+ opts.concurrency = parseInt(v, 10);
136
+ i++;
137
+ break;
138
+ case "--batch":
139
+ opts.batch = true;
140
+ break;
141
+ case "--adaptive-cap":
142
+ opts.adaptiveCap = parseInt(v, 10);
143
+ i++;
144
+ break;
145
+ case "--adaptive-threshold":
146
+ opts.adaptiveThreshold = parseFloat(v);
147
+ i++;
148
+ break;
115
149
  case "--judge":
116
150
  opts.useJudge = true;
117
151
  break;
@@ -130,10 +164,30 @@ function parseArgs() {
130
164
  case "--harvest":
131
165
  opts.harvest = true;
132
166
  break;
167
+ case "--harvest-only":
168
+ opts.harvest = true;
169
+ opts.harvestOnly = true;
170
+ break;
133
171
  case "--harvest-limit":
134
172
  opts.harvestLimit = parseInt(v, 10);
135
173
  i++;
136
174
  break;
175
+ case "--prompts":
176
+ opts.prompts = v.split(",").map((s) => s.trim()).filter(Boolean);
177
+ i++;
178
+ break;
179
+ case "--test-split":
180
+ opts.testSplit = parseFloat(v);
181
+ i++;
182
+ break;
183
+ case "--case-pool":
184
+ opts.casePool = parseInt(v, 10);
185
+ i++;
186
+ break;
187
+ case "--gen-model":
188
+ opts.genModel = v;
189
+ i++;
190
+ break;
137
191
  case "--base-url":
138
192
  opts.baseUrl = v;
139
193
  i++;
@@ -156,7 +210,31 @@ function parseArgs() {
156
210
  return opts;
157
211
  }
158
212
/**
 * CLI entry point.
 *
 * Dispatches in this order:
 *   1. `diff <runIdA> <runIdB>` subcommand — handed to runDiff.
 *   2. Multi-prompt mode (non-empty `opts.prompts`) — evolveOne is awaited once
 *      per prompt, sequentially, then a combined summary is printed.
 *   3. Otherwise a single evolveOne run with the parsed options.
 */
async function main() {
    const [, , subcommand, firstRunId, secondRunId] = process.argv;
    if (subcommand === "diff") {
        await runDiff(firstRunId, secondRunId);
        return;
    }
    const opts = parseArgs();
    const promptList = opts.prompts ?? [];
    if (promptList.length === 0) {
        // Single-prompt path.
        await evolveOne(opts);
        return;
    }
    // Multi-prompt path: each prompt gets its own evolveOne call; the rows are
    // collected so a best-of-batch overview can be printed afterwards.
    const rows = [];
    for (const promptPath of promptList) {
        console.log(`\n========== Evolving ${promptPath} ==========\n`);
        const outcome = await evolveOne({ ...opts, prompt: promptPath });
        rows.push({
            prompt: promptPath,
            runId: outcome.runId,
            gmean: outcome.bestVariant.gmean,
            reportPath: outcome.reportPath,
        });
    }
    console.log("\n========== Multi-prompt summary ==========");
    rows.forEach((row) => {
        const percent = (row.gmean * 100).toFixed(1);
        console.log(` ${row.prompt.padEnd(40)} gmean=${percent}% runId=${row.runId}`);
    });
}
237
+ async function evolveOne(opts) {
160
238
  let cases;
161
239
  let promptPath = opts.prompt;
162
240
  let seedText;
@@ -177,7 +255,7 @@ async function main() {
177
255
  }
178
256
  else {
179
257
  if (opts.cases === "plan")
180
- cases = [...PLAN_CASES];
258
+ cases = opts.harvestOnly ? [] : [...PLAN_CASES];
181
259
  else
182
260
  throw new Error(`Unknown case suite: ${opts.cases}`);
183
261
  if (opts.harvest) {
@@ -187,13 +265,37 @@ async function main() {
187
265
  limit: opts.harvestLimit,
188
266
  });
189
267
  if (harvested.length === 0) {
268
+ if (opts.harvestOnly) {
269
+ throw new Error("--harvest-only set but no runs found under <cwd>/.claude-overnight/runs");
270
+ }
190
271
  console.log(` (harvest: no runs found under <cwd>/.claude-overnight/runs)`);
191
272
  }
192
273
  else {
193
- console.log(` (harvest: +${harvested.length} real objectives)`);
274
+ console.log(` (harvest: ${opts.harvestOnly ? "" : "+"}${harvested.length} real objectives)`);
194
275
  cases = cases.concat(harvested);
195
276
  }
196
277
  }
278
+ // Top up to --case-pool with LLM-generated synthetic cases. The generator
279
+ // caches its output so successive runs share the pool — real cost is
280
+ // paid once, amortised across every subsequent round.
281
+ if (opts.casePool && cases.length < opts.casePool) {
282
+ console.log(` (generating cases to reach pool size ${opts.casePool}…)`);
283
+ try {
284
+ const generated = await generateCases({
285
+ targetCount: opts.casePool - cases.length,
286
+ model: opts.genModel ?? opts.evalModel,
287
+ baseUrl: opts.baseUrl,
288
+ authToken: opts.authToken,
289
+ promptPath,
290
+ existing: cases,
291
+ });
292
+ console.log(` (generated: +${generated.length} synthetic cases)`);
293
+ cases = cases.concat(generated);
294
+ }
295
+ catch (err) {
296
+ console.log(` (case generation failed: ${err.message})`);
297
+ }
298
+ }
197
299
  }
198
300
  console.log(`Evolution config:`);
199
301
  console.log(` target: ${opts.target}`);
@@ -215,6 +317,12 @@ async function main() {
215
317
  populationCap: opts.population,
216
318
  plateauGenerations: opts.plateau,
217
319
  repetitions: opts.reps > 1 ? opts.reps : undefined,
320
+ concurrency: opts.concurrency,
321
+ batch: opts.batch || undefined,
322
+ adaptiveReps: opts.adaptiveCap
323
+ ? { cap: opts.adaptiveCap, threshold: opts.adaptiveThreshold }
324
+ : undefined,
325
+ testFraction: opts.testSplit > 0 ? opts.testSplit : undefined,
218
326
  judge: opts.useJudge
219
327
  ? {
220
328
  model: opts.judgeModel ?? opts.evalModel,
@@ -241,6 +349,44 @@ async function main() {
241
349
  console.log(`speed: ${(result.bestVariant.aggregate.speed * 100).toFixed(1)}%`);
242
350
  console.log("\n--- Prompt text ---");
243
351
  console.log(result.bestVariant.text);
352
+ return result;
353
+ }
354
/**
 * `diff` subcommand: print a markdown table comparing two persisted runs.
 *
 * For each run only the highest-generation matrix row per variantId is kept,
 * so the table compares the final recorded gmean of every variant. Variants
 * present in only one run are shown with "—" in the missing column.
 *
 * Prints a usage line and exits with status 2 when either run id is missing.
 */
async function runDiff(runIdA, runIdB) {
    if (!(runIdA && runIdB)) {
        console.error("usage: claude-overnight-evolve diff <runIdA> <runIdB>");
        process.exit(2);
    }
    const { loadRun } = await import("../prompt-evolution/persistence.js");
    const a = loadRun(runIdA);
    const b = loadRun(runIdB);
    // Reduce a run's matrix to one entry per variant: the latest generation seen.
    const latestPerVariant = (run) => {
        const latest = new Map();
        for (const rec of run.matrix) {
            const seen = latest.get(rec.variantId);
            if (seen && !(rec.generation > seen.generation))
                continue;
            latest.set(rec.variantId, { generation: rec.generation, variantId: rec.variantId, gmean: rec.gmean });
        }
        return latest;
    };
    const rowsA = latestPerVariant(a);
    const rowsB = latestPerVariant(b);
    const ids = new Set([...rowsA.keys(), ...rowsB.keys()]);
    const asPercent = (value) => (value * 100).toFixed(1);
    console.log(`# Diff: ${runIdA} → ${runIdB}`);
    console.log("");
    console.log(`| Variant | A gmean | B gmean | Δ | note |`);
    console.log(`|-----------|-----------|-----------|-------|--------|`);
    for (const id of [...ids].sort()) {
        const ra = rowsA.get(id);
        const rb = rowsB.get(id);
        const ga = ra ? asPercent(ra.gmean) : "—";
        const gb = rb ? asPercent(rb.gmean) : "—";
        const delta = ra && rb ? asPercent(rb.gmean - ra.gmean) : "—";
        let note;
        if (!ra)
            note = "new in B";
        else if (!rb)
            note = "missing in B";
        else if (ra.gmean < rb.gmean)
            note = "↑";
        else if (ra.gmean > rb.gmean)
            note = "↓";
        else
            note = "=";
        console.log(`| ${id.padEnd(10)}| ${ga.padStart(9)} | ${gb.padStart(9)} | ${delta.padStart(5)} | ${note} |`);
    }
}
245
391
  main().catch((err) => {
246
392
  console.error(err);
@@ -1 +1 @@
1
- export declare const VERSION = "1.55.1";
1
+ export declare const VERSION = "1.57.0";
@@ -1,2 +1,2 @@
1
1
  // Auto-generated by build — do not edit manually.
2
- export const VERSION = "1.55.1";
2
+ export const VERSION = "1.57.0";
@@ -0,0 +1,20 @@
1
+ /**
2
+ * LLM-judge pass over a built evaluation matrix.
3
+ *
4
+ * Split out of evaluator.ts to keep each file under the 500-line cap and
5
+ * because the judge has its own concerns (top-N eligibility, batch vs
6
+ * online path, crash-resumable state).
7
+ *
8
+ * The judge REPLACES the heuristic content score with a semantic grade.
9
+ * We only judge top-N variants per generation to cap cost — a judge call
10
+ * per (variant, case, model) on a large population explodes fast.
11
+ */
12
+ import { type JudgeOpts } from "./llm-judge.js";
13
+ import type { BenchmarkCase, EvaluationResult } from "./types.js";
14
+ import type { EvalOpts } from "./evaluator.js";
15
+ export declare function runJudge(variants: Array<{
16
+ id: string;
17
+ text: string;
18
+ }>, cases: BenchmarkCase[], models: string[], aggregated: Map<string, EvaluationResult>, judge: JudgeOpts & {
19
+ topN?: number;
20
+ }, opts: EvalOpts): Promise<void>;
@@ -0,0 +1,119 @@
1
+ /**
2
+ * LLM-judge pass over a built evaluation matrix.
3
+ *
4
+ * Split out of evaluator.ts to keep each file under the 500-line cap and
5
+ * because the judge has its own concerns (top-N eligibility, batch vs
6
+ * online path, crash-resumable state).
7
+ *
8
+ * The judge REPLACES the heuristic content score with a semantic grade.
9
+ * We only judge top-N variants per generation to cap cost — a judge call
10
+ * per (variant, case, model) on a large population explodes fast.
11
+ */
12
+ import { judgeOutput, buildJudgePrompt, parseJudgeOutput } from "./llm-judge.js";
13
+ import { batchCallModel } from "./transport-batch.js";
14
+ import { saveBatchState, loadBatchState, markBatchFinished } from "./persistence.js";
15
+ import { gmean } from "./scorer.js";
16
+ import { averageDimensions } from "./evaluator-utils.js";
17
/**
 * Replace the heuristic `content` score with an LLM-judge grade for the
 * top-N variants of a built evaluation matrix.
 *
 * Variants are ranked by the gmean of their averaged score dimensions across
 * every (case, model) cell found in `aggregated`; only the first `judge.topN`
 * (default 4) are judged. Cells whose `parse` score is below 0.5 are skipped.
 * Matching entries of `aggregated` are mutated in place (`scores.content`,
 * `judgeJustification`); nothing is returned.
 *
 * When `opts.batch` is set the work is delegated to runJudgeBatch; otherwise
 * cells are judged online with at most three calls in flight.
 */
export async function runJudge(variants, cases, models, aggregated, judge, opts) {
    const judgedLimit = judge.topN ?? 4;
    const cellKey = (variantId, benchCase, model) => `${variantId}:${benchCase.hash}:${model}`;
    // Rank every variant by the gmean of its averaged dimensions.
    const ranking = variants.map((variant) => {
        const collected = [];
        cases.forEach((benchCase) => {
            models.forEach((model) => {
                const hit = aggregated.get(cellKey(variant.id, benchCase, model));
                if (hit)
                    collected.push(hit.scores);
            });
        });
        const g = collected.length > 0 ? gmean(averageDimensions(collected)) : 0;
        return { id: variant.id, g };
    });
    ranking.sort((lhs, rhs) => rhs.g - lhs.g);
    const eligible = new Set(ranking.slice(0, judgedLimit).map((entry) => entry.id));
    // Gather the cells worth judging, in variant → case → model order.
    const cells = [];
    for (const variant of variants.filter((candidate) => eligible.has(candidate.id))) {
        for (const benchCase of cases) {
            for (const model of models) {
                const key = cellKey(variant.id, benchCase, model);
                const r = aggregated.get(key);
                // Unparseable output isn't worth judging.
                if (r && !(r.scores.parse < 0.5))
                    cells.push({ key, c: benchCase, r });
            }
        }
    }
    if (cells.length === 0)
        return;
    if (opts.batch) {
        await runJudgeBatch(cells, judge, opts);
        return;
    }
    const gradeCell = async (cell) => {
        try {
            const verdict = await judgeOutput(cell.r.rawOutput, cell.c, judge);
            cell.r.scores = { ...cell.r.scores, content: verdict.score };
            cell.r.judgeJustification = verdict.justification;
        }
        catch {
            // Judge failure is non-fatal — keep heuristic content.
        }
    };
    // Every lane pulls from one shared iterator, so each cell is judged exactly
    // once and no more than `laneCount` judge calls are in flight at a time.
    const pending = cells[Symbol.iterator]();
    const drain = async () => {
        for (const cell of pending)
            await gradeCell(cell);
    };
    const laneCount = Math.min(3, cells.length);
    const lanes = [];
    for (let lane = 0; lane < laneCount; lane++)
        lanes.push(drain());
    await Promise.all(lanes);
}
74
+ async function runJudgeBatch(cells, judge, opts) {
75
+ const batchJobs = cells.map((cell, i) => ({
76
+ customId: `j:${i}|k:${cell.key}`,
77
+ userText: buildJudgePrompt(cell.r.rawOutput, cell.c),
78
+ model: judge.model,
79
+ }));
80
+ const existing = opts.runId != null && opts.generation != null
81
+ ? loadBatchState(opts.runId, opts.generation, "judge")
82
+ : null;
83
+ const transport = opts.batchCallModel ?? batchCallModel;
84
+ const results = await transport(batchJobs, {
85
+ baseUrl: judge.baseUrl ?? opts.baseUrl,
86
+ authToken: judge.authToken ?? opts.authToken,
87
+ maxTokens: judge.maxTokens ?? 2048,
88
+ resumeBatchId: existing?.batchId,
89
+ onSubmitted: (batchId, p) => {
90
+ if (opts.runId != null && opts.generation != null && !existing) {
91
+ saveBatchState(opts.runId, {
92
+ generation: opts.generation,
93
+ phase: "judge",
94
+ batchId,
95
+ provider: p,
96
+ submittedAt: new Date().toISOString(),
97
+ });
98
+ }
99
+ opts.onBatchProgress?.(`judge batch submitted: ${batchId} (${p})`);
100
+ },
101
+ onProgress: (p) => opts.onBatchProgress?.(`judge batch ${p.batchId} ${p.phase}${p.succeeded != null ? `: ${p.succeeded}/${p.total ?? batchJobs.length}` : ""}`),
102
+ });
103
+ if (opts.runId != null && existing)
104
+ markBatchFinished(opts.runId, existing.batchId);
105
+ for (const cell of cells) {
106
+ const customId = batchJobs.find((b) => b.customId.includes(`|k:${cell.key}`))?.customId;
107
+ const got = customId ? results.get(customId) : undefined;
108
+ if (!got || !got.raw)
109
+ continue;
110
+ try {
111
+ const jr = parseJudgeOutput(got.raw);
112
+ cell.r.scores = { ...cell.r.scores, content: jr.score };
113
+ cell.r.judgeJustification = jr.justification;
114
+ }
115
+ catch {
116
+ // Judge parse failure is non-fatal — keep heuristic content.
117
+ }
118
+ }
119
+ }
@@ -0,0 +1,7 @@
1
+ /**
2
+ * Small shared helpers used by both evaluator.ts and evaluator-judge.ts.
3
+ * Extracted to break the import cycle that would otherwise form between
4
+ * the two (both call averageDimensions, judge also needs gmean aggregates).
5
+ */
6
+ import type { ScoreDimensions } from "./types.js";
7
+ export declare function averageDimensions(scores: ScoreDimensions[]): ScoreDimensions;
@@ -0,0 +1,17 @@
1
+ /**
2
+ * Small shared helpers used by both evaluator.ts and evaluator-judge.ts.
3
+ * Extracted to break the import cycle that would otherwise form between
4
+ * the two (both call averageDimensions, judge also needs gmean aggregates).
5
+ */
6
/**
 * Arithmetic mean of each score dimension across `scores`.
 *
 * Returns all-zero dimensions for an empty input so callers never divide by
 * zero.
 */
export function averageDimensions(scores) {
    const totals = { parse: 0, schema: 0, content: 0, costEfficiency: 0, speed: 0 };
    const count = scores.length;
    if (count === 0)
        return totals;
    // Accumulate every dimension in a single pass over the input.
    for (const entry of scores) {
        totals.parse += entry.parse;
        totals.schema += entry.schema;
        totals.content += entry.content;
        totals.costEfficiency += entry.costEfficiency;
        totals.speed += entry.speed;
    }
    return {
        parse: totals.parse / count,
        schema: totals.schema / count,
        content: totals.content / count,
        costEfficiency: totals.costEfficiency / count,
        speed: totals.speed / count,
    };
}
@@ -17,6 +17,7 @@
17
17
  */
18
18
  import { type JudgeOpts } from "./llm-judge.js";
19
19
  import { type CallModel } from "./transport.js";
20
+ import { batchCallModel } from "./transport-batch.js";
20
21
  import type { BenchmarkCase, VariantRow, PromptVars } from "./types.js";
21
22
  export interface EvalOpts {
22
23
  /** Primary generator model (retained for single-model compat). */
@@ -35,14 +36,33 @@ export interface EvalOpts {
35
36
  timeoutMs?: number;
36
37
  /** Repetitions per (variant, case, model). Default 1 — opt-in to 3+ for noise floor. */
37
38
  repetitions?: number;
39
+ /**
40
+ * Adaptive sampling: after initial `repetitions`, keep adding one rep per cell
41
+ * where any score-dim σ exceeds `threshold`, up to `cap` total reps. Prevents
42
+ * wasted reps on already-stable cells while driving noisy ones down.
43
+ */
44
+ adaptiveReps?: {
45
+ cap: number;
46
+ threshold?: number;
47
+ };
38
48
  /** Inject an llm-judge call per case; content dimension is replaced by judge score. */
39
49
  judge?: JudgeOpts & {
40
50
  topN?: number;
41
51
  };
42
52
  /** Transport override for tests. */
43
53
  callModel?: CallModel;
54
+ /** Use provider batch API instead of online calls (50% cheaper, slower wall-clock). */
55
+ batch?: boolean;
56
+ /** Run id — required when batch=true so state is crash-resumable. */
57
+ runId?: string;
58
+ /** Current generation number — used to key batch state. */
59
+ generation?: number;
60
+ /** Batch-transport override for tests. Same return shape as transport-batch.batchCallModel. */
61
+ batchCallModel?: typeof batchCallModel;
44
62
  /** Optional callback for progress */
45
63
  onProgress?: (done: number, total: number, caseName: string, variantId: string) => void;
64
+ /** Progress callback specific to batch-phase transitions. */
65
+ onBatchProgress?: (msg: string) => void;
46
66
  }
47
67
  export declare function buildMatrix(variants: Array<{
48
68
  id: string;