claude-overnight 1.55.2 → 1.57.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -18,6 +18,7 @@
18
18
  import { evolvePrompt } from "../prompt-evolution/index.js";
19
19
  import { PLAN_CASES } from "../prompt-evolution/fixtures/plan-cases.js";
20
20
  import { harvestRealCases } from "../prompt-evolution/fixtures/harvest.js";
21
+ import { generateCases } from "../prompt-evolution/fixtures/generate.js";
21
22
  import { scenariosToCases, PLANNING_SCENARIOS, REVIEW_SCENARIOS, SUPERVISION_SCENARIOS, STUCK_SCENARIOS, hydrateCases, extractPrompt, } from "../prompt-evolution/adapters/mcp-browser.js";
22
23
  function help() {
23
24
  process.stdout.write(`Usage: claude-overnight-evolve [options]
@@ -35,13 +36,27 @@ Options:
35
36
  --plateau <n> Stop early if no improvement for N generations (default: 3)
36
37
  --reps <n> Repetitions per (variant, case, model) for noise floor (default: 1)
37
38
  --concurrency <n> Max in-flight eval calls (default: 8; bump for slow endpoints)
39
+ --batch Use provider batch API (50% cheaper, slower wall-clock)
40
+ --adaptive-cap <n> Adaptive sampling: extend reps up to N when σ > threshold (default: off)
41
+ --adaptive-threshold <x> σ threshold that triggers an extra rep (default: 0.1)
38
42
  --judge Use llm-judge for content scoring (costs extra API calls)
39
43
  --judge-model <model> Model to use for the judge (default: same as eval-model)
40
44
  --judge-top-n <n> Judge only the top-N variants per generation (default: 4)
41
45
  --cases <suite> Benchmark suite: plan | mcp-planning | mcp-review |
42
46
  mcp-supervision | mcp-stuck (default: plan)
43
47
  --harvest Append cases harvested from <cwd>/.claude-overnight/runs/*
48
+ --harvest-only Use ONLY harvested real objectives (fails if none found)
44
49
  --harvest-limit <n> Max harvested cases (default: 10)
50
+ --prompts <list> Comma-separated prompt paths to evolve in sequence
51
+ --test-split <f> Hold out fraction f of cases for a selection-bias-free
52
+ final eval (default: 0 = no split). Use 0.3 for rigor.
53
+ --case-pool <n> Target total case count; generates synthetic cases via
54
+ LLM to top up if the current pool is smaller.
55
+ --gen-model <model> Model used by the case generator (default: eval-model)
56
+
57
+ Subcommands:
58
+ claude-overnight-evolve diff <runIdA> <runIdB>
59
+ Print a per-variant diff of two persisted runs
45
60
  --base-url <url> API base URL override
46
61
  --auth-token <token> Auth token override
47
62
  --run-id <id> Preset run id (default: auto-generated)
@@ -62,11 +77,14 @@ function parseArgs() {
62
77
  population: 8,
63
78
  plateau: 3,
64
79
  reps: 1,
80
+ batch: false,
65
81
  useJudge: false,
66
82
  judgeTopN: 4,
67
83
  cases: "",
68
84
  harvest: false,
85
+ harvestOnly: false,
69
86
  harvestLimit: 10,
87
+ testSplit: 0,
70
88
  baseUrl: process.env.ANTHROPIC_BASE_URL,
71
89
  authToken: process.env.ANTHROPIC_AUTH_TOKEN ?? process.env.ANTHROPIC_API_KEY,
72
90
  };
@@ -117,6 +135,17 @@ function parseArgs() {
117
135
  opts.concurrency = parseInt(v, 10);
118
136
  i++;
119
137
  break;
138
+ case "--batch":
139
+ opts.batch = true;
140
+ break;
141
+ case "--adaptive-cap":
142
+ opts.adaptiveCap = parseInt(v, 10);
143
+ i++;
144
+ break;
145
+ case "--adaptive-threshold":
146
+ opts.adaptiveThreshold = parseFloat(v);
147
+ i++;
148
+ break;
120
149
  case "--judge":
121
150
  opts.useJudge = true;
122
151
  break;
@@ -135,10 +164,30 @@ function parseArgs() {
135
164
  case "--harvest":
136
165
  opts.harvest = true;
137
166
  break;
167
+ case "--harvest-only":
168
+ opts.harvest = true;
169
+ opts.harvestOnly = true;
170
+ break;
138
171
  case "--harvest-limit":
139
172
  opts.harvestLimit = parseInt(v, 10);
140
173
  i++;
141
174
  break;
175
+ case "--prompts":
176
+ opts.prompts = v.split(",").map((s) => s.trim()).filter(Boolean);
177
+ i++;
178
+ break;
179
+ case "--test-split":
180
+ opts.testSplit = parseFloat(v);
181
+ i++;
182
+ break;
183
+ case "--case-pool":
184
+ opts.casePool = parseInt(v, 10);
185
+ i++;
186
+ break;
187
+ case "--gen-model":
188
+ opts.genModel = v;
189
+ i++;
190
+ break;
142
191
  case "--base-url":
143
192
  opts.baseUrl = v;
144
193
  i++;
@@ -161,7 +210,31 @@ function parseArgs() {
161
210
  return opts;
162
211
  }
163
212
  async function main() {
213
+ // Subcommand: diff two persisted runs.
214
+ if (process.argv[2] === "diff") {
215
+ await runDiff(process.argv[3], process.argv[4]);
216
+ return;
217
+ }
164
218
  const opts = parseArgs();
219
+ // Multi-prompt mode: loop evolvePrompt once per prompt in opts.prompts.
220
+ // Each iteration gets its own runId and report. Post a combined summary
221
+ // at the end so the user sees best-of-batch across all prompts.
222
+ if (opts.prompts && opts.prompts.length > 0) {
223
+ const summary = [];
224
+ for (const p of opts.prompts) {
225
+ console.log(`\n========== Evolving ${p} ==========\n`);
226
+ const result = await evolveOne({ ...opts, prompt: p });
227
+ summary.push({ prompt: p, runId: result.runId, gmean: result.bestVariant.gmean, reportPath: result.reportPath });
228
+ }
229
+ console.log("\n========== Multi-prompt summary ==========");
230
+ for (const s of summary) {
231
+ console.log(` ${s.prompt.padEnd(40)} gmean=${(s.gmean * 100).toFixed(1)}% runId=${s.runId}`);
232
+ }
233
+ return;
234
+ }
235
+ await evolveOne(opts);
236
+ }
237
+ async function evolveOne(opts) {
165
238
  let cases;
166
239
  let promptPath = opts.prompt;
167
240
  let seedText;
@@ -182,7 +255,7 @@ async function main() {
182
255
  }
183
256
  else {
184
257
  if (opts.cases === "plan")
185
- cases = [...PLAN_CASES];
258
+ cases = opts.harvestOnly ? [] : [...PLAN_CASES];
186
259
  else
187
260
  throw new Error(`Unknown case suite: ${opts.cases}`);
188
261
  if (opts.harvest) {
@@ -192,13 +265,39 @@ async function main() {
192
265
  limit: opts.harvestLimit,
193
266
  });
194
267
  if (harvested.length === 0) {
268
+ if (opts.harvestOnly) {
269
+ throw new Error("--harvest-only set but no runs found under <cwd>/.claude-overnight/runs");
270
+ }
195
271
  console.log(` (harvest: no runs found under <cwd>/.claude-overnight/runs)`);
196
272
  }
197
273
  else {
198
- console.log(` (harvest: +${harvested.length} real objectives)`);
274
+ console.log(` (harvest: ${opts.harvestOnly ? "" : "+"}${harvested.length} real objectives)`);
199
275
  cases = cases.concat(harvested);
200
276
  }
201
277
  }
278
+ // Top up to --case-pool with LLM-generated synthetic cases. The generator
279
+ // caches its output so successive runs share the pool — real cost is
280
+ // paid once, amortised across every subsequent round.
281
+ if (opts.casePool && cases.length < opts.casePool) {
282
+ console.log(` (generating cases to reach pool size ${opts.casePool}…)`);
283
+ try {
284
+ const generated = await generateCases({
285
+ targetCount: opts.casePool - cases.length,
286
+ model: opts.genModel ?? opts.evalModel,
287
+ baseUrl: opts.baseUrl,
288
+ authToken: opts.authToken,
289
+ promptPath,
290
+ existing: cases,
291
+ });
292
+ console.log(` (generated: +${generated.length} synthetic cases)`);
293
+ cases = cases.concat(generated);
294
+ }
295
+ catch (err) {
296
+ const msg = err.message ?? String(err);
297
+ console.log(`\n ⚠ case generation failed: ${msg.slice(0, 500)}`);
298
+ console.log(` Falling back to the existing ${cases.length} case(s). Try --gen-model with an Anthropic-compatible JSON-reliable model (e.g. claude-haiku-4-5) if this persists.\n`);
299
+ }
300
+ }
202
301
  }
203
302
  console.log(`Evolution config:`);
204
303
  console.log(` target: ${opts.target}`);
@@ -221,6 +320,11 @@ async function main() {
221
320
  plateauGenerations: opts.plateau,
222
321
  repetitions: opts.reps > 1 ? opts.reps : undefined,
223
322
  concurrency: opts.concurrency,
323
+ batch: opts.batch || undefined,
324
+ adaptiveReps: opts.adaptiveCap
325
+ ? { cap: opts.adaptiveCap, threshold: opts.adaptiveThreshold }
326
+ : undefined,
327
+ testFraction: opts.testSplit > 0 ? opts.testSplit : undefined,
224
328
  judge: opts.useJudge
225
329
  ? {
226
330
  model: opts.judgeModel ?? opts.evalModel,
@@ -247,6 +351,44 @@ async function main() {
247
351
  console.log(`speed: ${(result.bestVariant.aggregate.speed * 100).toFixed(1)}%`);
248
352
  console.log("\n--- Prompt text ---");
249
353
  console.log(result.bestVariant.text);
354
+ return result;
355
+ }
356
/**
 * `diff` subcommand: prints a markdown table comparing two persisted runs.
 *
 * For each run, only the highest-generation matrix row per variantId is kept,
 * so the table compares the final recorded gmean of every variant. Variants
 * present in only one run are shown with "—" in the missing column.
 *
 * Exits the process with status 2 when either run id is missing.
 */
async function runDiff(runIdA, runIdB) {
    if (!(runIdA && runIdB)) {
        console.error("usage: claude-overnight-evolve diff <runIdA> <runIdB>");
        process.exit(2);
    }
    const { loadRun } = await import("../prompt-evolution/persistence.js");
    const runA = loadRun(runIdA);
    const runB = loadRun(runIdB);
    // Reduce a run's matrix to one row per variant: the latest generation wins.
    const latestPerVariant = (run) => {
        const latest = new Map();
        for (const { variantId, generation, gmean } of run.matrix) {
            const seen = latest.get(variantId);
            if (seen && !(generation > seen.generation)) {
                continue;
            }
            latest.set(variantId, { generation, variantId, gmean });
        }
        return latest;
    };
    const finalA = latestPerVariant(runA);
    const finalB = latestPerVariant(runB);
    const asPercent = (value) => (value * 100).toFixed(1);
    const describe = (rowA, rowB) => {
        if (!rowA) {
            return "new in B";
        }
        if (!rowB) {
            return "missing in B";
        }
        if (rowA.gmean < rowB.gmean) {
            return "↑";
        }
        if (rowA.gmean > rowB.gmean) {
            return "↓";
        }
        return "=";
    };
    console.log(`# Diff: ${runIdA} → ${runIdB}`);
    console.log("");
    console.log(`| Variant | A gmean | B gmean | Δ | note |`);
    console.log(`|-----------|-----------|-----------|-------|--------|`);
    // Union of variant ids from both runs, in default (lexicographic) sort order.
    const variantIds = [...new Set([...finalA.keys(), ...finalB.keys()])].sort();
    for (const id of variantIds) {
        const rowA = finalA.get(id);
        const rowB = finalB.get(id);
        const ga = rowA ? asPercent(rowA.gmean) : "—";
        const gb = rowB ? asPercent(rowB.gmean) : "—";
        const delta = rowA && rowB ? asPercent(rowB.gmean - rowA.gmean) : "—";
        const note = describe(rowA, rowB);
        console.log(`| ${id.padEnd(10)}| ${ga.padStart(9)} | ${gb.padStart(9)} | ${delta.padStart(5)} | ${note} |`);
    }
}
251
393
  main().catch((err) => {
252
394
  console.error(err);
@@ -1 +1 @@
1
- export declare const VERSION = "1.55.2";
1
+ export declare const VERSION = "1.57.1";
@@ -1,2 +1,2 @@
1
1
  // Auto-generated by build — do not edit manually.
2
- export const VERSION = "1.55.2";
2
+ export const VERSION = "1.57.1";
@@ -0,0 +1,20 @@
1
+ /**
2
+ * LLM-judge pass over a built evaluation matrix.
3
+ *
4
+ * Split out of evaluator.ts to keep each file under the 500-line cap and
5
+ * because the judge has its own concerns (top-N eligibility, batch vs
6
+ * online path, crash-resumable state).
7
+ *
8
+ * The judge REPLACES the heuristic content score with a semantic grade.
9
+ * We only judge top-N variants per generation to cap cost — a judge call
10
+ * per (variant, case, model) on a large population explodes fast.
11
+ */
12
+ import { type JudgeOpts } from "./llm-judge.js";
13
+ import type { BenchmarkCase, EvaluationResult } from "./types.js";
14
+ import type { EvalOpts } from "./evaluator.js";
15
+ export declare function runJudge(variants: Array<{
16
+ id: string;
17
+ text: string;
18
+ }>, cases: BenchmarkCase[], models: string[], aggregated: Map<string, EvaluationResult>, judge: JudgeOpts & {
19
+ topN?: number;
20
+ }, opts: EvalOpts): Promise<void>;
@@ -0,0 +1,119 @@
1
+ /**
2
+ * LLM-judge pass over a built evaluation matrix.
3
+ *
4
+ * Split out of evaluator.ts to keep each file under the 500-line cap and
5
+ * because the judge has its own concerns (top-N eligibility, batch vs
6
+ * online path, crash-resumable state).
7
+ *
8
+ * The judge REPLACES the heuristic content score with a semantic grade.
9
+ * We only judge top-N variants per generation to cap cost — a judge call
10
+ * per (variant, case, model) on a large population explodes fast.
11
+ */
12
+ import { judgeOutput, buildJudgePrompt, parseJudgeOutput } from "./llm-judge.js";
13
+ import { batchCallModel } from "./transport-batch.js";
14
+ import { saveBatchState, loadBatchState, markBatchFinished } from "./persistence.js";
15
+ import { gmean } from "./scorer.js";
16
+ import { averageDimensions } from "./evaluator-utils.js";
17
/**
 * Runs the LLM judge over the best-scoring variants and overwrites the
 * `content` score (plus `judgeJustification`) on their aggregated results
 * in place.
 *
 * Steps:
 *  1. Rank variants by gmean of their dimension-averaged scores across all
 *     (case, model) cells found in `aggregated`; variants with no cells rank 0.
 *  2. Keep the top `judge.topN` (default 4) variants.
 *  3. Collect every cell of those variants whose parse score is >= 0.5.
 *  4. Judge the cells, via the batch path when `opts.batch` is set, otherwise
 *     online with at most 3 concurrent judge calls.
 *
 * Failures of individual online judge calls are swallowed on purpose: the
 * cell simply keeps its heuristic content score.
 */
export async function runJudge(variants, cases, models, aggregated, judge, opts) {
    const limit = judge.topN ?? 4;
    // Resolve the aggregated result for one (variant, case, model) cell.
    const lookup = (variantId, benchCase, model) => {
        const key = `${variantId}:${benchCase.hash}:${model}`;
        return { key, r: aggregated.get(key) };
    };
    const ranked = variants.map((v) => {
        const collected = [];
        for (const c of cases) {
            for (const model of models) {
                const { r } = lookup(v.id, c, model);
                if (r) {
                    collected.push(r.scores);
                }
            }
        }
        const g = collected.length > 0 ? gmean(averageDimensions(collected)) : 0;
        return { id: v.id, g };
    });
    ranked.sort((lhs, rhs) => rhs.g - lhs.g);
    const eligible = new Set(ranked.slice(0, limit).map((entry) => entry.id));
    // Cells keep the original variant order (not the ranked order).
    const cells = variants
        .filter((v) => eligible.has(v.id))
        .flatMap((v) => cases.flatMap((c) => models.flatMap((model) => {
        const { key, r } = lookup(v.id, c, model);
        // Unparseable output isn't worth judging.
        return !r || r.scores.parse < 0.5 ? [] : [{ key, c, r }];
    })));
    if (cells.length === 0) {
        return;
    }
    if (opts.batch) {
        await runJudgeBatch(cells, judge, opts);
        return;
    }
    // Online path: a small fixed-size worker pool drains one shared iterator,
    // so each cell is judged exactly once and at most 3 calls are in flight.
    const maxWorkers = 3;
    const pending = cells[Symbol.iterator]();
    const drain = async () => {
        for (const cell of pending) {
            try {
                const verdict = await judgeOutput(cell.r.rawOutput, cell.c, judge);
                cell.r.scores = { ...cell.r.scores, content: verdict.score };
                cell.r.judgeJustification = verdict.justification;
            }
            catch {
                // Judge failure is non-fatal — keep heuristic content.
            }
        }
    };
    const workerCount = Math.min(maxWorkers, cells.length);
    await Promise.all(Array.from({ length: workerCount }, () => drain()));
}
74
/**
 * Batch-API variant of the judge pass.
 *
 * Builds one batch job per cell, submits them (or resumes a previously
 * persisted batch for the same run/generation/"judge" phase), then parses
 * each returned judge output and overwrites the cell's `content` score and
 * `judgeJustification` in place.
 *
 * @param cells  Cells to judge; each has `key`, `c` (benchmark case) and `r`
 *               (aggregated result, mutated in place).
 * @param judge  Judge options (`model`, optional `baseUrl`, `authToken`, `maxTokens`).
 * @param opts   Eval options; `runId` + `generation` enable batch-state
 *               persistence, `batchCallModel` overrides the transport.
 *
 * Cells with no result, an empty `raw`, or an unparseable judge output keep
 * their heuristic content score.
 */
async function runJudgeBatch(cells, judge, opts) {
    // batchJobs[i] always corresponds to cells[i]; the customId format is kept
    // unchanged so batches submitted by earlier versions can still be resumed.
    const batchJobs = cells.map((cell, i) => ({
        customId: `j:${i}|k:${cell.key}`,
        userText: buildJudgePrompt(cell.r.rawOutput, cell.c),
        model: judge.model,
    }));
    const canPersist = opts.runId != null && opts.generation != null;
    const existing = canPersist
        ? loadBatchState(opts.runId, opts.generation, "judge")
        : null;
    const transport = opts.batchCallModel ?? batchCallModel;
    const results = await transport(batchJobs, {
        baseUrl: judge.baseUrl ?? opts.baseUrl,
        authToken: judge.authToken ?? opts.authToken,
        maxTokens: judge.maxTokens ?? 2048,
        resumeBatchId: existing?.batchId,
        onSubmitted: (batchId, p) => {
            // Persist only freshly submitted batches; a resumed one is already on disk.
            if (canPersist && !existing) {
                saveBatchState(opts.runId, {
                    generation: opts.generation,
                    phase: "judge",
                    batchId,
                    provider: p,
                    submittedAt: new Date().toISOString(),
                });
            }
            opts.onBatchProgress?.(`judge batch submitted: ${batchId} (${p})`);
        },
        onProgress: (p) => opts.onBatchProgress?.(`judge batch ${p.batchId} ${p.phase}${p.succeeded != null ? `: ${p.succeeded}/${p.total ?? batchJobs.length}` : ""}`),
    });
    // NOTE(review): only a *resumed* batch is marked finished here; a batch
    // submitted during this call is saved above but never marked — confirm
    // whether that is intentional or handled by the caller.
    if (opts.runId != null && existing)
        markBatchFinished(opts.runId, existing.batchId);
    cells.forEach((cell, i) => {
        // Look the result up by position. The previous substring search
        // (`customId.includes("|k:" + key)`) could pick another cell's job
        // whenever one key was a prefix of another (e.g. models "claude-3" and
        // "claude-3-5"), applying the wrong judge score, and was O(n²).
        const got = results.get(batchJobs[i].customId);
        if (!got || !got.raw)
            return;
        try {
            const jr = parseJudgeOutput(got.raw);
            cell.r.scores = { ...cell.r.scores, content: jr.score };
            cell.r.judgeJustification = jr.justification;
        }
        catch {
            // Judge parse failure is non-fatal — keep heuristic content.
        }
    });
}
@@ -0,0 +1,7 @@
1
+ /**
2
+ * Small shared helpers used by both evaluator.ts and evaluator-judge.ts.
3
+ * Extracted to break the import cycle that would otherwise form between
4
+ * the two (both call averageDimensions, judge also needs gmean aggregates).
5
+ */
6
+ import type { ScoreDimensions } from "./types.js";
7
+ export declare function averageDimensions(scores: ScoreDimensions[]): ScoreDimensions;
@@ -0,0 +1,17 @@
1
+ /**
2
+ * Small shared helpers used by both evaluator.ts and evaluator-judge.ts.
3
+ * Extracted to break the import cycle that would otherwise form between
4
+ * the two (both call averageDimensions, judge also needs gmean aggregates).
5
+ */
6
/**
 * Arithmetic mean of each score dimension (parse, schema, content,
 * costEfficiency, speed) over `scores`.
 *
 * Returns all-zero dimensions for an empty input.
 */
export function averageDimensions(scores) {
    const count = scores.length;
    const totals = { parse: 0, schema: 0, content: 0, costEfficiency: 0, speed: 0 };
    if (count === 0) {
        return totals;
    }
    // Single pass; each dimension is summed in input order.
    for (const entry of scores) {
        totals.parse += entry.parse;
        totals.schema += entry.schema;
        totals.content += entry.content;
        totals.costEfficiency += entry.costEfficiency;
        totals.speed += entry.speed;
    }
    return {
        parse: totals.parse / count,
        schema: totals.schema / count,
        content: totals.content / count,
        costEfficiency: totals.costEfficiency / count,
        speed: totals.speed / count,
    };
}
@@ -17,6 +17,7 @@
17
17
  */
18
18
  import { type JudgeOpts } from "./llm-judge.js";
19
19
  import { type CallModel } from "./transport.js";
20
+ import { batchCallModel } from "./transport-batch.js";
20
21
  import type { BenchmarkCase, VariantRow, PromptVars } from "./types.js";
21
22
  export interface EvalOpts {
22
23
  /** Primary generator model (retained for single-model compat). */
@@ -35,14 +36,33 @@ export interface EvalOpts {
35
36
  timeoutMs?: number;
36
37
  /** Repetitions per (variant, case, model). Default 1 — opt-in to 3+ for noise floor. */
37
38
  repetitions?: number;
39
+ /**
40
+ * Adaptive sampling: after initial `repetitions`, keep adding one rep per cell
41
+ * where any score-dim σ exceeds `threshold`, up to `cap` total reps. Prevents
42
+ * wasted reps on already-stable cells while driving noisy ones down.
43
+ */
44
+ adaptiveReps?: {
45
+ cap: number;
46
+ threshold?: number;
47
+ };
38
48
  /** Inject an llm-judge call per case; content dimension is replaced by judge score. */
39
49
  judge?: JudgeOpts & {
40
50
  topN?: number;
41
51
  };
42
52
  /** Transport override for tests. */
43
53
  callModel?: CallModel;
54
+ /** Use provider batch API instead of online calls (50% cheaper, slower wall-clock). */
55
+ batch?: boolean;
56
+ /** Run id — required when batch=true so state is crash-resumable. */
57
+ runId?: string;
58
+ /** Current generation number — used to key batch state. */
59
+ generation?: number;
60
+ /** Batch-transport override for tests. Same return shape as transport-batch.batchCallModel. */
61
+ batchCallModel?: typeof batchCallModel;
44
62
  /** Optional callback for progress */
45
63
  onProgress?: (done: number, total: number, caseName: string, variantId: string) => void;
64
+ /** Progress callback specific to batch-phase transitions. */
65
+ onBatchProgress?: (msg: string) => void;
46
66
  }
47
67
  export declare function buildMatrix(variants: Array<{
48
68
  id: string;