claude-overnight 1.58.0 → 1.59.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -39,12 +39,6 @@ Options:
39
39
  --plateau <n> Stop early if no improvement for N generations (default: 3)
40
40
  --reps <n> Repetitions per (variant, case, model) for noise floor (default: 1)
41
41
  --concurrency <n> Max in-flight eval calls (default: 8; bump for slow endpoints)
42
- --batch Use provider batch API (50% cheaper, slower wall-clock)
43
- --batch-base-url <url> Override base URL for batch only (e.g. api.moonshot.ai/v1
44
- when online uses api.kimi.com/coding)
45
- --batch-auth-token <t> Override auth token for batch only
46
- --batch-model <model> Override model for batch only (e.g. "kimi-k2.6" for
47
- Moonshot platform when online uses "kimi-for-coding")
48
42
  --adaptive-cap <n> Adaptive sampling: extend reps up to N when σ > threshold (default: off)
49
43
  --adaptive-threshold <x> σ threshold that triggers an extra rep (default: 0.1)
50
44
  --judge Use llm-judge for content scoring (costs extra API calls)
@@ -85,7 +79,6 @@ function parseArgs() {
85
79
  population: 8,
86
80
  plateau: 3,
87
81
  reps: 1,
88
- batch: false,
89
82
  useJudge: false,
90
83
  judgeTopN: 4,
91
84
  cases: "",
@@ -143,21 +136,6 @@ function parseArgs() {
143
136
  opts.concurrency = parseInt(v, 10);
144
137
  i++;
145
138
  break;
146
- case "--batch":
147
- opts.batch = true;
148
- break;
149
- case "--batch-base-url":
150
- opts.batchBaseUrl = v;
151
- i++;
152
- break;
153
- case "--batch-auth-token":
154
- opts.batchAuthToken = v;
155
- i++;
156
- break;
157
- case "--batch-model":
158
- opts.batchModel = v;
159
- i++;
160
- break;
161
139
  case "--adaptive-cap":
162
140
  opts.adaptiveCap = parseInt(v, 10);
163
141
  i++;
@@ -347,10 +325,6 @@ async function evolveOne(opts) {
347
325
  plateauGenerations: opts.plateau,
348
326
  repetitions: opts.reps > 1 ? opts.reps : undefined,
349
327
  concurrency: opts.concurrency,
350
- batch: opts.batch || undefined,
351
- batchBaseUrl: opts.batchBaseUrl,
352
- batchAuthToken: opts.batchAuthToken,
353
- batchModel: opts.batchModel,
354
328
  adaptiveReps: opts.adaptiveCap
355
329
  ? { cap: opts.adaptiveCap, threshold: opts.adaptiveThreshold }
356
330
  : undefined,
@@ -1 +1 @@
1
- export declare const VERSION = "1.58.0";
1
+ export declare const VERSION = "1.59.0";
@@ -1,2 +1,2 @@
1
1
  // Auto-generated by build — do not edit manually.
2
- export const VERSION = "1.58.0";
2
+ export const VERSION = "1.59.0";
@@ -1,20 +1,15 @@
1
1
  /**
2
2
  * LLM-judge pass over a built evaluation matrix.
3
3
  *
4
- * Split out of evaluator.ts to keep each file under the 500-line cap and
5
- * because the judge has its own concerns (top-N eligibility, batch vs
6
- * online path, crash-resumable state).
7
- *
8
4
  * The judge REPLACES the heuristic content score with a semantic grade.
9
5
  * We only judge top-N variants per generation to cap cost — a judge call
10
6
  * per (variant, case, model) on a large population explodes fast.
11
7
  */
12
8
  import { type JudgeOpts } from "./llm-judge.js";
13
9
  import type { BenchmarkCase, EvaluationResult } from "./types.js";
14
- import type { EvalOpts } from "./evaluator.js";
15
10
  export declare function runJudge(variants: Array<{
16
11
  id: string;
17
12
  text: string;
18
13
  }>, cases: BenchmarkCase[], models: string[], aggregated: Map<string, EvaluationResult>, judge: JudgeOpts & {
19
14
  topN?: number;
20
- }, opts: EvalOpts): Promise<void>;
15
+ }): Promise<void>;
@@ -1,20 +1,14 @@
1
1
  /**
2
2
  * LLM-judge pass over a built evaluation matrix.
3
3
  *
4
- * Split out of evaluator.ts to keep each file under the 500-line cap and
5
- * because the judge has its own concerns (top-N eligibility, batch vs
6
- * online path, crash-resumable state).
7
- *
8
4
  * The judge REPLACES the heuristic content score with a semantic grade.
9
5
  * We only judge top-N variants per generation to cap cost — a judge call
10
6
  * per (variant, case, model) on a large population explodes fast.
11
7
  */
12
- import { judgeOutput, buildJudgePrompt, parseJudgeOutput } from "./llm-judge.js";
13
- import { batchCallModel } from "./transport-batch.js";
14
- import { saveBatchState, loadBatchState, markBatchFinished } from "./persistence.js";
8
+ import { judgeOutput } from "./llm-judge.js";
15
9
  import { gmean } from "./scorer.js";
16
10
  import { averageDimensions } from "./evaluator-utils.js";
17
- export async function runJudge(variants, cases, models, aggregated, judge, opts) {
11
+ export async function runJudge(variants, cases, models, aggregated, judge) {
18
12
  const topN = judge.topN ?? 4;
19
13
  const variantGmeans = variants.map((v) => {
20
14
  const scores = [];
@@ -45,10 +39,6 @@ export async function runJudge(variants, cases, models, aggregated, judge, opts)
45
39
  }
46
40
  if (cells.length === 0)
47
41
  return;
48
- if (opts.batch) {
49
- await runJudgeBatch(cells, judge, opts);
50
- return;
51
- }
52
42
  const jobs = cells.map((cell) => async () => {
53
43
  try {
54
44
  const jr = await judgeOutput(cell.r.rawOutput, cell.c, judge);
@@ -71,53 +61,3 @@ export async function runJudge(variants, cases, models, aggregated, judge, opts)
71
61
  };
72
62
  await Promise.all(Array.from({ length: Math.min(judgeConcurrency, jobs.length) }, judgeWorker));
73
63
  }
74
- async function runJudgeBatch(cells, judge, opts) {
75
- const batchJobs = cells.map((cell, i) => ({
76
- customId: `j:${i}|k:${cell.key}`,
77
- userText: buildJudgePrompt(cell.r.rawOutput, cell.c),
78
- model: judge.model,
79
- }));
80
- const existing = opts.runId != null && opts.generation != null
81
- ? loadBatchState(opts.runId, opts.generation, "judge")
82
- : null;
83
- const transport = opts.batchCallModel ?? batchCallModel;
84
- const results = await transport(batchJobs, {
85
- // Judge batch follows the same override hierarchy as eval batch: if a
86
- // dedicated batch endpoint is set on EvalOpts, use it; else fall back
87
- // to the judge's own endpoint or the main one.
88
- baseUrl: opts.batchBaseUrl ?? judge.baseUrl ?? opts.baseUrl,
89
- authToken: opts.batchAuthToken ?? judge.authToken ?? opts.authToken,
90
- modelOverride: opts.batchModel,
91
- maxTokens: judge.maxTokens ?? 2048,
92
- resumeBatchId: existing?.batchId,
93
- onSubmitted: (batchId, p) => {
94
- if (opts.runId != null && opts.generation != null && !existing) {
95
- saveBatchState(opts.runId, {
96
- generation: opts.generation,
97
- phase: "judge",
98
- batchId,
99
- provider: p,
100
- submittedAt: new Date().toISOString(),
101
- });
102
- }
103
- opts.onBatchProgress?.(`judge batch submitted: ${batchId} (${p})`);
104
- },
105
- onProgress: (p) => opts.onBatchProgress?.(`judge batch ${p.batchId} ${p.phase}${p.succeeded != null ? `: ${p.succeeded}/${p.total ?? batchJobs.length}` : ""}`),
106
- });
107
- if (opts.runId != null && existing)
108
- markBatchFinished(opts.runId, existing.batchId);
109
- for (const cell of cells) {
110
- const customId = batchJobs.find((b) => b.customId.includes(`|k:${cell.key}`))?.customId;
111
- const got = customId ? results.get(customId) : undefined;
112
- if (!got || !got.raw)
113
- continue;
114
- try {
115
- const jr = parseJudgeOutput(got.raw);
116
- cell.r.scores = { ...cell.r.scores, content: jr.score };
117
- cell.r.judgeJustification = jr.justification;
118
- }
119
- catch {
120
- // Judge parse failure is non-fatal — keep heuristic content.
121
- }
122
- }
123
- }
@@ -17,7 +17,6 @@
17
17
  */
18
18
  import { type JudgeOpts } from "./llm-judge.js";
19
19
  import { type CallModel } from "./transport.js";
20
- import { batchCallModel } from "./transport-batch.js";
21
20
  import type { BenchmarkCase, VariantRow, PromptVars } from "./types.js";
22
21
  export interface EvalOpts {
23
22
  /** Primary generator model (retained for single-model compat). */
@@ -51,29 +50,8 @@ export interface EvalOpts {
51
50
  };
52
51
  /** Transport override for tests. */
53
52
  callModel?: CallModel;
54
- /** Use provider batch API instead of online calls (50% cheaper, slower wall-clock). */
55
- batch?: boolean;
56
- /**
57
- * Override base URL for batch submissions only — lets batch hit a
58
- * different endpoint than online. Key use-case: Kimi users whose online
59
- * traffic runs through api.kimi.com/coding (which has no batch) but
60
- * whose batch traffic should go to api.moonshot.ai/v1.
61
- */
62
- batchBaseUrl?: string;
63
- /** Override auth token for batch when batchBaseUrl needs a different key. */
64
- batchAuthToken?: string;
65
- /** Override model for batch submissions (e.g., kimi-k2.6 when online uses kimi-for-coding). */
66
- batchModel?: string;
67
- /** Run id — required when batch=true so state is crash-resumable. */
68
- runId?: string;
69
- /** Current generation number — used to key batch state. */
70
- generation?: number;
71
- /** Batch-transport override for tests. Same return shape as transport-batch.batchCallModel. */
72
- batchCallModel?: typeof batchCallModel;
73
53
  /** Optional callback for progress */
74
54
  onProgress?: (done: number, total: number, caseName: string, variantId: string) => void;
75
- /** Progress callback specific to batch-phase transitions. */
76
- onBatchProgress?: (msg: string) => void;
77
55
  }
78
56
  export declare function buildMatrix(variants: Array<{
79
57
  id: string;
@@ -18,8 +18,6 @@
18
18
  import { renderPrompt } from "../prompts/load.js";
19
19
  import { scoreOutput, gmean, aggregateReps, bootstrapCI, kendallTau } from "./scorer.js";
20
20
  import { defaultCallModel, attemptJsonParse, } from "./transport.js";
21
- import { batchCallModel, detectBatchProvider, } from "./transport-batch.js";
22
- import { saveBatchState, loadBatchState, markBatchFinished } from "./persistence.js";
23
21
  import { averageDimensions } from "./evaluator-utils.js";
24
22
  import { runJudge } from "./evaluator-judge.js";
25
23
  export async function buildMatrix(variants, cases, opts) {
@@ -38,53 +36,30 @@ export async function buildMatrix(variants, cases, opts) {
38
36
  }
39
37
  }
40
38
  }
41
- // Two execution paths:
42
- // batch=true — submit every job to the provider batch API, poll, score
43
- // results as they arrive. 50% cheaper, slower wall-clock.
44
- // batch=false — work-stealing pool: keep `concurrency` jobs in flight so
45
- // a slow call doesn't block the others in its slice.
39
+ // Work-stealing pool: keep `concurrency` jobs in flight so a slow call
40
+ // doesn't block the others in its slice.
46
41
  const rawByKey = new Map();
47
- const runOnlinePool = async () => {
48
- let done = 0;
49
- let next = 0;
50
- const worker = async () => {
51
- while (true) {
52
- const i = next++;
53
- if (i >= jobs.length)
54
- return;
55
- const r = await runSingle(jobs[i], opts, transport);
56
- const key = `${r.variantId}:${r.caseHash}:${r.model ?? ""}`;
57
- const arr = rawByKey.get(key) ?? [];
58
- arr.push(r);
59
- rawByKey.set(key, arr);
60
- done++;
61
- opts.onProgress?.(done, jobs.length, r.caseName, r.variantId);
62
- }
63
- };
64
- await Promise.all(Array.from({ length: Math.min(concurrency, jobs.length) }, worker));
65
- };
66
- if (opts.batch) {
67
- try {
68
- await runBatchPath(jobs, opts, rawByKey);
69
- }
70
- catch (err) {
71
- // Batch submission failed (Kimi's /v1/files doesn't match OpenAI,
72
- // OpenRouter has no batch at all, transient provider error, etc.).
73
- // Fall back to the online pool so the whole run doesn't die — losing
74
- // the 50% batch discount is better than losing the run.
75
- const msg = err instanceof Error ? err.message : String(err);
76
- opts.onBatchProgress?.(`batch path failed, falling back to online: ${msg.slice(0, 200)}`);
77
- rawByKey.clear(); // discard any partial state
78
- await runOnlinePool();
42
+ let done = 0;
43
+ let next = 0;
44
+ const worker = async () => {
45
+ while (true) {
46
+ const i = next++;
47
+ if (i >= jobs.length)
48
+ return;
49
+ const r = await runSingle(jobs[i], opts, transport);
50
+ const key = `${r.variantId}:${r.caseHash}:${r.model ?? ""}`;
51
+ const arr = rawByKey.get(key) ?? [];
52
+ arr.push(r);
53
+ rawByKey.set(key, arr);
54
+ done++;
55
+ opts.onProgress?.(done, jobs.length, r.caseName, r.variantId);
79
56
  }
80
- }
81
- else {
82
- await runOnlinePool();
83
- }
57
+ };
58
+ await Promise.all(Array.from({ length: Math.min(concurrency, jobs.length) }, worker));
84
59
  // Adaptive sampling: for cells where any score-dim σ exceeds threshold,
85
60
  // add one more rep and rerun — up to `cap` total reps. Converges on a
86
61
  // stable estimate without wasting reps on already-stable cells.
87
- if (!opts.batch && opts.adaptiveReps) {
62
+ if (opts.adaptiveReps) {
88
63
  const cap = opts.adaptiveReps.cap;
89
64
  const threshold = opts.adaptiveReps.threshold ?? 0.1;
90
65
  for (let round = 0; round < cap - reps; round++) {
@@ -131,7 +106,7 @@ export async function buildMatrix(variants, cases, opts) {
131
106
  }
132
107
  // Optional llm-judge pass on top-N variants (by current heuristic content).
133
108
  if (opts.judge)
134
- await runJudge(variants, cases, models, aggregated, opts.judge, opts);
109
+ await runJudge(variants, cases, models, aggregated, opts.judge);
135
110
  // Assemble rows: per-variant aggregate across all cases and models.
136
111
  const rows = [];
137
112
  for (const v of variants) {
@@ -244,81 +219,6 @@ function halfSplitMatrix(variants, cases, models, rawByKey, side) {
244
219
  scored.sort((a, b) => b.g - a.g);
245
220
  return scored.map((s) => s.id);
246
221
  }
247
- async function runBatchPath(jobs, opts, rawByKey) {
248
- const provider = detectBatchProvider(opts.baseUrl);
249
- if (provider === "unsupported") {
250
- throw new Error(`Batch API not supported for baseUrl=${opts.baseUrl}; rerun without --batch or point at an Anthropic / OpenAI-compatible endpoint.`);
251
- }
252
- // Build custom_ids that route results back to the right cell. Index is
253
- // included so reps of the same (variant, case, model) don't collide.
254
- const keyed = jobs.map((job, i) => ({
255
- job,
256
- index: i,
257
- customId: `v:${job.variantId}|h:${job.case.hash}|m:${job.model}|r:${job.rep}|i:${i}`,
258
- }));
259
- const batchJobs = keyed.map((k) => ({
260
- customId: k.customId,
261
- userText: k.job.text,
262
- systemText: k.job.systemText,
263
- model: k.job.model,
264
- }));
265
- const started = Date.now();
266
- const existing = opts.runId != null && opts.generation != null
267
- ? loadBatchState(opts.runId, opts.generation, "eval")
268
- : null;
269
- const transport = opts.batchCallModel ?? batchCallModel;
270
- const results = await transport(batchJobs, {
271
- baseUrl: opts.batchBaseUrl ?? opts.baseUrl,
272
- authToken: opts.batchAuthToken ?? opts.authToken,
273
- modelOverride: opts.batchModel,
274
- maxTokens: opts.maxTokens,
275
- resumeBatchId: existing?.batchId,
276
- onSubmitted: (batchId, p) => {
277
- if (opts.runId != null && opts.generation != null && !existing) {
278
- saveBatchState(opts.runId, {
279
- generation: opts.generation,
280
- phase: "eval",
281
- batchId,
282
- provider: p,
283
- submittedAt: new Date().toISOString(),
284
- });
285
- }
286
- opts.onBatchProgress?.(`batch submitted: ${batchId} (${p})`);
287
- },
288
- onProgress: (p) => {
289
- if (p.phase === "polling") {
290
- const ok = p.succeeded ?? 0;
291
- const failed = p.failed ?? 0;
292
- const total = p.total ?? batchJobs.length;
293
- opts.onBatchProgress?.(`batch ${p.batchId} polling: ${ok}/${total} done${failed ? `, ${failed} failed` : ""}`);
294
- }
295
- else {
296
- opts.onBatchProgress?.(`batch ${p.batchId} ${p.phase}`);
297
- }
298
- },
299
- });
300
- // Mark the state entry as finished so a crash after this point doesn't
301
- // cause the next run to try resuming an already-consumed batch.
302
- if (opts.runId != null && existing)
303
- markBatchFinished(opts.runId, existing.batchId);
304
- // Score each result and populate rawByKey the same way runSingle does.
305
- const durationMs = Math.round((Date.now() - started) / Math.max(1, jobs.length));
306
- let done = 0;
307
- for (const k of keyed) {
308
- const r = results.get(k.customId);
309
- const raw = r?.raw ?? "batch returned no result for this custom_id";
310
- const costUsd = r?.costUsd ?? 0;
311
- const parsed = attemptJsonParse(raw);
312
- const scored = scoreOutput(raw, parsed, costUsd, durationMs, k.job.case, { model: k.job.model });
313
- scored.variantId = k.job.variantId;
314
- const mapKey = `${scored.variantId}:${scored.caseHash}:${scored.model ?? ""}`;
315
- const arr = rawByKey.get(mapKey) ?? [];
316
- arr.push(scored);
317
- rawByKey.set(mapKey, arr);
318
- done++;
319
- opts.onProgress?.(done, jobs.length, k.job.case.name, k.job.variantId);
320
- }
321
- }
322
222
  async function runSingle(job, opts, transport) {
323
223
  const started = Date.now();
324
224
  const callOpts = {
@@ -54,14 +54,6 @@ export interface EvolveOpts {
54
54
  repetitions?: number;
55
55
  /** Max in-flight eval calls. Default 8. Raise for slow endpoints, lower for strict rate limits. */
56
56
  concurrency?: number;
57
- /** Use provider batch API instead of online calls. 50% cheaper, slower wall-clock. */
58
- batch?: boolean;
59
- /** Override base URL for batch submissions only. */
60
- batchBaseUrl?: string;
61
- /** Override auth token for batch submissions only. */
62
- batchAuthToken?: string;
63
- /** Override model for batch submissions (e.g. kimi-k2.6 when online uses kimi-for-coding). */
64
- batchModel?: string;
65
57
  /** Adaptive sampling cap (opt-in). Keeps adding reps to noisy cells up to this count. */
66
58
  adaptiveReps?: {
67
59
  cap: number;
@@ -73,17 +73,10 @@ export async function evolvePrompt(opts) {
73
73
  concurrency: opts.concurrency ?? 8,
74
74
  repetitions: opts.repetitions,
75
75
  judge: opts.judge,
76
- batch: opts.batch,
77
- batchBaseUrl: opts.batchBaseUrl,
78
- batchAuthToken: opts.batchAuthToken,
79
- batchModel: opts.batchModel,
80
76
  adaptiveReps: opts.adaptiveReps,
81
- runId,
82
- generation: gen,
83
77
  onProgress: (done, total, caseName, variantId) => {
84
78
  log(` [${done}/${total}] ${variantId.slice(0, 16)} → ${caseName}`);
85
79
  },
86
- onBatchProgress: (msg) => log(` [batch] ${msg}`),
87
80
  };
88
81
  const matrix = await buildMatrix(population, trainCases, evalOpts);
89
82
  generationMatrices.push(matrix);
@@ -198,11 +191,7 @@ export async function evolvePrompt(opts) {
198
191
  concurrency: opts.concurrency ?? 8,
199
192
  repetitions: opts.repetitions,
200
193
  judge: opts.judge,
201
- batch: opts.batch,
202
194
  adaptiveReps: opts.adaptiveReps,
203
- runId,
204
- generation: generations,
205
- onBatchProgress: (msg) => log(` [batch] ${msg}`),
206
195
  });
207
196
  generationMatrices.push(finalMatrix);
208
197
  snapshotPrompts(runId, finalMatrix);
@@ -222,14 +211,7 @@ export async function evolvePrompt(opts) {
222
211
  authToken: opts.authToken,
223
212
  concurrency: opts.concurrency ?? 8,
224
213
  repetitions: opts.repetitions,
225
- batch: opts.batch,
226
- batchBaseUrl: opts.batchBaseUrl,
227
- batchAuthToken: opts.batchAuthToken,
228
- batchModel: opts.batchModel,
229
214
  adaptiveReps: opts.adaptiveReps,
230
- runId,
231
- generation: generations + 1,
232
- onBatchProgress: (msg) => log(` [batch-test] ${msg}`),
233
215
  });
234
216
  log(formatMatrix(testMatrix, testCases.map((c) => c.name)));
235
217
  }
@@ -37,26 +37,6 @@ export declare function appendLearning(runId: string, entries: LearningEntry[]):
37
37
  export declare function snapshotPrompts(runId: string, rows: VariantRow[]): void;
38
38
  /** Finalise the run: write best.md and update meta.json. */
39
39
  export declare function finalizeRun(runId: string, result: EvolutionResult, metaPartial?: Partial<RunMeta>): void;
40
- /**
41
- * Persist batch submission state so a crashed or restarted run can resume
42
- * polling instead of resubmitting (which would duplicate the bill).
43
- *
44
- * Keyed by (generation, phase) so multi-generation runs and eval-vs-judge
45
- * submissions don't collide. Written append-only — the latest entry wins
46
- * on load.
47
- */
48
- export interface BatchStateEntry {
49
- generation: number;
50
- phase: "eval" | "judge";
51
- batchId: string;
52
- provider: "anthropic" | "openai-compatible";
53
- submittedAt: string;
54
- /** If set, we've already collected results for this entry — ignore on resume. */
55
- finishedAt?: string;
56
- }
57
- export declare function saveBatchState(runId: string, entry: BatchStateEntry): void;
58
- export declare function loadBatchState(runId: string, generation: number, phase: "eval" | "judge"): BatchStateEntry | null;
59
- export declare function markBatchFinished(runId: string, batchId: string): void;
60
40
  /** List all runs, newest first. */
61
41
  export declare function listRuns(): Array<{
62
42
  runId: string;
@@ -118,45 +118,6 @@ ${result.learningLog.map((l) => `| ${l.generation} | ${l.mutationSummary} | ${(l
118
118
  `;
119
119
  writeFileSync(join(root, "best.md"), report);
120
120
  }
121
- export function saveBatchState(runId, entry) {
122
- const path = join(runDir(runId), "batch-jobs.jsonl");
123
- writeFileSync(path, JSON.stringify(entry) + "\n", { flag: "a" });
124
- }
125
- export function loadBatchState(runId, generation, phase) {
126
- const path = join(runDir(runId), "batch-jobs.jsonl");
127
- if (!existsSync(path))
128
- return null;
129
- const lines = readFileSync(path, "utf-8").split("\n").filter(Boolean);
130
- let latest = null;
131
- for (const line of lines) {
132
- try {
133
- const e = JSON.parse(line);
134
- if (e.generation === generation && e.phase === phase)
135
- latest = e;
136
- }
137
- catch { /* skip malformed */ }
138
- }
139
- // Only return if not yet finished — otherwise caller would re-poll a consumed batch.
140
- return latest && !latest.finishedAt ? latest : null;
141
- }
142
- export function markBatchFinished(runId, batchId) {
143
- const path = join(runDir(runId), "batch-jobs.jsonl");
144
- if (!existsSync(path))
145
- return;
146
- const lines = readFileSync(path, "utf-8").split("\n").filter(Boolean);
147
- const updated = lines.map((line) => {
148
- try {
149
- const e = JSON.parse(line);
150
- if (e.batchId === batchId && !e.finishedAt) {
151
- e.finishedAt = new Date().toISOString();
152
- return JSON.stringify(e);
153
- }
154
- }
155
- catch { /* skip */ }
156
- return line;
157
- });
158
- writeFileSync(path, updated.join("\n") + "\n");
159
- }
160
121
  /** List all runs, newest first. */
161
122
  export function listRuns() {
162
123
  const root = storeRoot();
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "claude-overnight",
3
- "version": "1.58.0",
3
+ "version": "1.59.0",
4
4
  "description": "Overnight parallel coding agents in git worktrees, with a self-curating skill memory that improves while the run is going. Mix Claude Opus as planner, Kimi 2.6 or Cursor composer-2 as cheap fast worker, Gemini or Qwen for bulk implementation. Multi-wave autonomous loop that plans, executes, reviews, and steers itself until the objective is met. Crash-safe resume, rate-limit aware, usage cap preserves headroom for your interactive Claude Code.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "claude-overnight",
3
- "version": "1.58.0",
3
+ "version": "1.59.0",
4
4
  "description": "Claude Code skill for understanding, installing, and inspecting claude-overnight runs: overnight parallel coding agents in git worktrees with a self-curating skill memory, multi-wave steering, three-layer review, and crash-safe resume. Mix Opus planner with Kimi 2.6, Cursor composer-2, Gemini, Qwen, or any Anthropic-compatible worker.",
5
5
  "author": {
6
6
  "name": "Francesco Fornace"
@@ -1,62 +0,0 @@
1
- /**
2
- * Batch-API transport for prompt evolution.
3
- *
4
- * 50% cheaper than online calls on every major provider that supports
5
- * batch. Perfect fit for generations=1 benchmark rounds where interactive
6
- * progress isn't needed — we submit 120-1000 requests, poll every 30-300s,
7
- * then pull the results in one shot.
8
- *
9
- * Provider detection from baseUrl:
10
- * - api.anthropic.com → Anthropic Message Batches API (one-shot submit)
11
- * - kimi / moonshot / openai → OpenAI-compatible file-based batch
12
- * - openrouter → NO batch support; throws (caller must fall back to online)
13
- *
14
- * Custom IDs route results back to the right (variant, case, model, rep)
15
- * cell. The evaluator builds ids like `v0:h_abc:kimi-for-coding:r0`.
16
- *
17
- * Poll state is persisted via `persistBatchState` so a crashed or
18
- * restarted run can resume without resubmitting.
19
- */
20
- import type { CallModelResult } from "./transport.js";
21
- export interface BatchJob {
22
- customId: string;
23
- userText: string;
24
- systemText?: string;
25
- model: string;
26
- }
27
- export interface BatchOpts {
28
- baseUrl?: string;
29
- authToken?: string;
30
- /**
31
- * Override model for the batch submission. Moonshot's batch API only
32
- * accepts kimi-k2.5 or kimi-k2.6 — NOT the kimi-for-coding alias that the
33
- * coding endpoint uses. When batch is enabled against a Kimi stack, set
34
- * this so online eval keeps using kimi-for-coding while batch uses the
35
- * concrete version.
36
- */
37
- modelOverride?: string;
38
- maxTokens?: number;
39
- /** Poll interval starts here and doubles to `pollMaxMs`. Defaults 30s → 5min. */
40
- pollStartMs?: number;
41
- pollMaxMs?: number;
42
- /** Overall timeout for the whole batch. Default 24h — matches provider SLAs. */
43
- batchTimeoutMs?: number;
44
- /** Called with progress snapshots during polling. */
45
- onProgress?: (p: BatchProgress) => void;
46
- /** Restore a previously-submitted batch instead of resubmitting. */
47
- resumeBatchId?: string;
48
- /** Called after submit returns an id — use to persist for crash resume. */
49
- onSubmitted?: (batchId: string, provider: BatchProvider) => void;
50
- }
51
- export interface BatchProgress {
52
- provider: BatchProvider;
53
- batchId: string;
54
- phase: "submitted" | "polling" | "downloading" | "done";
55
- processing?: number;
56
- succeeded?: number;
57
- failed?: number;
58
- total?: number;
59
- }
60
- export type BatchProvider = "anthropic" | "openai-compatible" | "unsupported";
61
- export declare function detectBatchProvider(baseUrl: string | undefined): BatchProvider;
62
- export declare function batchCallModel(jobs: BatchJob[], opts: BatchOpts): Promise<Map<string, CallModelResult>>;
@@ -1,235 +0,0 @@
1
- /**
2
- * Batch-API transport for prompt evolution.
3
- *
4
- * 50% cheaper than online calls on every major provider that supports
5
- * batch. Perfect fit for generations=1 benchmark rounds where interactive
6
- * progress isn't needed — we submit 120-1000 requests, poll every 30-300s,
7
- * then pull the results in one shot.
8
- *
9
- * Provider detection from baseUrl:
10
- * - api.anthropic.com → Anthropic Message Batches API (one-shot submit)
11
- * - kimi / moonshot / openai → OpenAI-compatible file-based batch
12
- * - openrouter → NO batch support; throws (caller must fall back to online)
13
- *
14
- * Custom IDs route results back to the right (variant, case, model, rep)
15
- * cell. The evaluator builds ids like `v0:h_abc:kimi-for-coding:r0`.
16
- *
17
- * Poll state is persisted via `persistBatchState` so a crashed or
18
- * restarted run can resume without resubmitting.
19
- */
20
- import { VERSION } from "../core/_version.js";
21
// User-Agent header value sent with every batch HTTP request.
const USER_AGENT = "claude-overnight-evolve/".concat(VERSION);
22
/**
 * Classify a base URL by the batch protocol its provider speaks.
 *
 * The URL is lower-cased and tested against an ordered rule list; the first
 * matching rule wins. Anything that matches no rule is assumed to expose an
 * OpenAI-compatible batch endpoint (OpenAI, DeepSeek, DashScope in
 * OpenAI-compat mode, ...).
 *
 * @param {string | undefined} baseUrl Provider base URL; a nullish value means
 *   `https://api.anthropic.com`.
 * @returns {"anthropic" | "openai-compatible" | "unsupported"}
 */
export function detectBatchProvider(baseUrl) {
    const target = (baseUrl ?? "https://api.anthropic.com").toLowerCase();
    const rules = [
        { pattern: /(^|\/\/)(api\.)?anthropic\.com/, provider: "anthropic" },
        // OpenRouter has no batch API at all; callers fall back to online.
        { pattern: /openrouter/, provider: "unsupported" },
        // api.kimi.com/coding is Moonshot's coding-specific endpoint. It is
        // synchronous only and has no /v1/files upload flow, so it cannot
        // serve batch.
        { pattern: /(api\.)?kimi\.com\/coding/, provider: "unsupported" },
    ];
    const hit = rules.find((rule) => rule.pattern.test(target));
    return hit ? hit.provider : "openai-compatible";
}
41
/**
 * Run `jobs` through the batch API of whichever provider `opts.baseUrl`
 * points at and return the results keyed by each job's `customId`.
 *
 * An empty job list resolves to an empty map without touching the network.
 *
 * @param {Array<{customId: string, userText: string, systemText?: string, model: string}>} jobs
 * @param {object} opts Batch options (base URL, auth, polling, callbacks).
 * @returns {Promise<Map<string, object>>} Per-request results keyed by `customId`.
 * @throws {Error} When the base URL belongs to a provider without batch support.
 */
export async function batchCallModel(jobs, opts) {
    if (jobs.length === 0) {
        return new Map();
    }
    switch (detectBatchProvider(opts.baseUrl)) {
        case "anthropic":
            return runAnthropicBatch(jobs, opts);
        case "unsupported": {
            const message = [
                `Batch API not supported for baseUrl=${opts.baseUrl}. `,
                `Options: (1) omit --batch and use online transport, or (2) point `,
                `the batch call at a provider with batch support (e.g. set --batch-base-url `,
                `https://api.moonshot.ai/v1 --batch-model kimi-k2.6 for Kimi users whose `,
                `online endpoint is api.kimi.com/coding).`,
            ].join("");
            throw new Error(message);
        }
        default:
            return runOpenAIBatch(jobs, opts);
    }
}
56
- // ── Anthropic ──────────────────────────────────────────────────────────────
57
/**
 * Run `jobs` through the Anthropic Message Batches API.
 *
 * Flow: submit all requests in one POST (skipped when `opts.resumeBatchId` is
 * set), poll the batch until `processing_status` is "ended", then download
 * the JSONL results and convert each row into a result entry.
 *
 * Rows that did not succeed are still present in the returned map, with
 * `raw` set to `batch error: <reason>` and zero cost/tokens.
 *
 * @param {Array<{customId: string, userText: string, systemText?: string, model: string}>} jobs
 * @param {object} opts Batch options (base URL, auth, polling, callbacks).
 * @returns {Promise<Map<string, {raw: string, costUsd: number, inputTokens: number, outputTokens: number}>>}
 * @throws {Error} On any non-2xx HTTP response, on a submit response without
 *   a batch id, on malformed result JSON, or when polling times out.
 */
async function runAnthropicBatch(jobs, opts) {
    const baseUrl = (opts.baseUrl ?? "https://api.anthropic.com").replace(/\/$/, "");
    const authToken = opts.authToken ?? process.env.ANTHROPIC_AUTH_TOKEN ?? process.env.ANTHROPIC_API_KEY ?? "";
    const headers = {
        "Content-Type": "application/json",
        "Authorization": `Bearer ${authToken}`,
        "User-Agent": USER_AGENT,
        "anthropic-version": "2023-06-01",
        "anthropic-beta": "message-batches-2024-09-24",
    };
    let batchId = opts.resumeBatchId;
    if (!batchId) {
        const requests = jobs.map((j) => ({
            custom_id: j.customId,
            params: {
                model: opts.modelOverride ?? j.model,
                max_tokens: opts.maxTokens ?? 4096,
                messages: [{ role: "user", content: j.userText }],
                // Only send `system` when there is a non-empty system prompt.
                ...(j.systemText ? { system: j.systemText } : {}),
            },
        }));
        const submitRes = await fetch(`${baseUrl}/v1/messages/batches`, {
            method: "POST",
            headers,
            body: JSON.stringify({ requests }),
        });
        if (!submitRes.ok)
            throw new Error(`Anthropic batch submit: HTTP ${submitRes.status} ${await submitRes.text()}`);
        const data = await submitRes.json();
        // Without an id every later poll would hit ".../batches/undefined".
        if (typeof data?.id !== "string" || data.id === "")
            throw new Error("Anthropic batch submit: response contained no batch id");
        batchId = data.id;
        opts.onSubmitted?.(batchId, "anthropic");
    }
    opts.onProgress?.({ provider: "anthropic", batchId, phase: "submitted", total: jobs.length });
    const ended = await pollUntilDone(async () => {
        const res = await fetch(`${baseUrl}/v1/messages/batches/${batchId}`, { headers });
        if (!res.ok)
            throw new Error(`Anthropic batch poll: HTTP ${res.status}`);
        const d = await res.json();
        const counts = d.request_counts;
        opts.onProgress?.({
            provider: "anthropic",
            batchId: batchId,
            phase: "polling",
            processing: counts?.processing,
            succeeded: counts?.succeeded,
            failed: (counts?.errored ?? 0) + (counts?.canceled ?? 0) + (counts?.expired ?? 0),
            total: jobs.length,
        });
        return d.processing_status === "ended" ? d : null;
    }, opts);
    opts.onProgress?.({ provider: "anthropic", batchId, phase: "downloading" });
    const resultsUrl = ended.results_url ?? `${baseUrl}/v1/messages/batches/${batchId}/results`;
    const resultsRes = await fetch(resultsUrl, { headers });
    if (!resultsRes.ok)
        throw new Error(`Anthropic batch results: HTTP ${resultsRes.status}`);
    const text = await resultsRes.text();
    const out = new Map();
    // Tracked separately from out.size, which also counts failed rows.
    let succeeded = 0;
    for (const line of text.split("\n")) {
        if (!line.trim())
            continue;
        const row = JSON.parse(line);
        const result = row.result;
        if (result?.type === "succeeded") {
            // Blocks without a `text` field contribute nothing to the output.
            const raw = (result.message?.content ?? []).map((c) => c.text ?? "").join("");
            const inp = result.message?.usage?.input_tokens ?? 0;
            const outp = result.message?.usage?.output_tokens ?? 0;
            // Rates are hard-coded and not model-specific: 3e-6 USD per input
            // token and 15e-6 USD per output token, halved for batch pricing.
            out.set(row.custom_id, { raw, costUsd: (inp * 0.000003 + outp * 0.000015) * 0.5, inputTokens: inp, outputTokens: outp });
            succeeded++;
        }
        else {
            // Errored rows carry an error envelope whose message is nested
            // one level down (`error.error.message`); the flat form is
            // accepted too. Other outcomes are reported by their type name.
            const reason = result?.type === "errored"
                ? result.error?.error?.message ?? result.error?.message ?? "errored"
                : result?.type ?? "unknown";
            out.set(row.custom_id, { raw: `batch error: ${reason}`, costUsd: 0, inputTokens: 0, outputTokens: 0 });
        }
    }
    opts.onProgress?.({
        provider: "anthropic",
        batchId,
        phase: "done",
        succeeded,
        failed: out.size - succeeded,
        total: jobs.length,
    });
    return out;
}
130
- // ── OpenAI-compatible (OpenAI, Kimi/Moonshot, DeepSeek) ────────────────────
131
/**
 * Run `jobs` through an OpenAI-compatible, file-based batch API.
 *
 * Flow: upload the requests as a JSONL file, create a batch for
 * `/v1/chat/completions` (both skipped when `opts.resumeBatchId` is set),
 * poll until the batch status is "completed", then download the output file
 * and convert each row into a result entry.
 *
 * `opts.baseUrl` may be given with or without a trailing `/v1`; both forms
 * resolve to the same `/v1/...` endpoints.
 *
 * Rows that failed are still present in the returned map, with `raw` set to
 * `batch error: <reason>` and zero cost/tokens.
 *
 * @param {Array<{customId: string, userText: string, systemText?: string, model: string}>} jobs
 * @param {object} opts Batch options (base URL, auth, polling, callbacks).
 * @returns {Promise<Map<string, {raw: string, costUsd: number, inputTokens: number, outputTokens: number}>>}
 * @throws {Error} On any non-2xx HTTP response, on a missing file/batch id,
 *   when the batch ends as failed/expired/cancelled, when it completes
 *   without an output file, on malformed result JSON, or on poll timeout.
 */
async function runOpenAIBatch(jobs, opts) {
    // Every path below already starts with "/v1", so drop a trailing "/v1"
    // from the base URL. Otherwise "https://api.moonshot.ai/v1" would produce
    // ".../v1/v1/files".
    const baseUrl = (opts.baseUrl ?? "https://api.openai.com")
        .replace(/\/+$/, "")
        .replace(/\/v1$/i, "");
    const authToken = opts.authToken ?? process.env.ANTHROPIC_AUTH_TOKEN ?? process.env.ANTHROPIC_API_KEY ?? "";
    const authHeaders = {
        "Authorization": `Bearer ${authToken}`,
        "User-Agent": USER_AGENT,
    };
    let batchId = opts.resumeBatchId;
    if (!batchId) {
        // Build the JSONL payload (one request per line) and upload it as a file.
        const jsonl = jobs.map((j) => {
            const messages = [];
            if (j.systemText)
                messages.push({ role: "system", content: j.systemText });
            messages.push({ role: "user", content: j.userText });
            return JSON.stringify({
                custom_id: j.customId,
                method: "POST",
                url: "/v1/chat/completions",
                body: { model: opts.modelOverride ?? j.model, max_tokens: opts.maxTokens ?? 4096, max_completion_tokens: opts.maxTokens ?? 4096, messages },
            });
        }).join("\n");
        const form = new FormData();
        form.append("purpose", "batch");
        form.append("file", new Blob([jsonl], { type: "application/jsonl" }), "batch-input.jsonl");
        const fileRes = await fetch(`${baseUrl}/v1/files`, { method: "POST", headers: authHeaders, body: form });
        if (!fileRes.ok) {
            const body = await fileRes.text().catch(() => "");
            throw new Error(`Batch file-upload failed: HTTP ${fileRes.status} at ${baseUrl}/v1/files. ` +
                `This provider may not support OpenAI-compatible batch. Response: ${body.slice(0, 300)}`);
        }
        const fileData = await fileRes.json();
        if (!fileData?.id)
            throw new Error(`Batch file-upload at ${baseUrl}/v1/files returned no file id`);
        const createRes = await fetch(`${baseUrl}/v1/batches`, {
            method: "POST",
            headers: { ...authHeaders, "Content-Type": "application/json" },
            body: JSON.stringify({ input_file_id: fileData.id, endpoint: "/v1/chat/completions", completion_window: "24h" }),
        });
        if (!createRes.ok)
            throw new Error(`OpenAI-compat batch create: HTTP ${createRes.status} ${await createRes.text()}`);
        const createData = await createRes.json();
        // Without an id every later poll would hit ".../batches/undefined".
        if (!createData?.id)
            throw new Error("OpenAI-compat batch create: response contained no batch id");
        batchId = createData.id;
        opts.onSubmitted?.(batchId, "openai-compatible");
    }
    opts.onProgress?.({ provider: "openai-compatible", batchId, phase: "submitted", total: jobs.length });
    const ended = await pollUntilDone(async () => {
        const res = await fetch(`${baseUrl}/v1/batches/${batchId}`, { headers: authHeaders });
        if (!res.ok)
            throw new Error(`OpenAI-compat batch poll: HTTP ${res.status}`);
        const d = await res.json();
        opts.onProgress?.({
            provider: "openai-compatible",
            batchId: batchId,
            phase: "polling",
            succeeded: d.request_counts?.completed,
            failed: d.request_counts?.failed,
            total: d.request_counts?.total ?? jobs.length,
        });
        if (d.status === "completed")
            return d;
        if (d.status === "failed" || d.status === "expired" || d.status === "cancelled") {
            throw new Error(`OpenAI-compat batch ${d.status}`);
        }
        return null;
    }, opts);
    const outputFileId = ended.output_file_id;
    if (!outputFileId) {
        // Surface the error file id, when the provider returns one, so the
        // per-request failures can be inspected by hand.
        const hint = ended.error_file_id ? `; see error_file_id=${ended.error_file_id}` : "";
        throw new Error(`OpenAI-compat batch completed with no output_file_id${hint}`);
    }
    opts.onProgress?.({ provider: "openai-compatible", batchId, phase: "downloading" });
    const contentRes = await fetch(`${baseUrl}/v1/files/${outputFileId}/content`, { headers: authHeaders });
    if (!contentRes.ok)
        throw new Error(`OpenAI-compat batch download: HTTP ${contentRes.status}`);
    const text = await contentRes.text();
    const out = new Map();
    // Tracked separately from out.size, which also counts failed rows.
    let succeeded = 0;
    for (const line of text.split("\n")) {
        if (!line.trim())
            continue;
        const row = JSON.parse(line);
        const body = row.response?.body;
        const status = row.response?.status_code;
        // A row has failed when it has a row-level error, no response, an
        // error object in the response body, or an HTTP error status.
        const httpFailed = typeof status === "number" && status >= 400;
        const failure = row.error || body?.error || (httpFailed ? { message: `HTTP ${status}` } : null);
        if (failure || !row.response) {
            out.set(row.custom_id, { raw: `batch error: ${failure?.message ?? "unknown"}`, costUsd: 0, inputTokens: 0, outputTokens: 0 });
            continue;
        }
        const raw = body?.choices?.[0]?.message?.content ?? "";
        const inp = body?.usage?.prompt_tokens ?? 0;
        const outp = body?.usage?.completion_tokens ?? 0;
        // Rates are hard-coded and not model-specific: 3e-6 USD per input
        // token and 15e-6 USD per output token, halved for batch pricing.
        out.set(row.custom_id, { raw, costUsd: (inp * 0.000003 + outp * 0.000015) * 0.5, inputTokens: inp, outputTokens: outp });
        succeeded++;
    }
    opts.onProgress?.({
        provider: "openai-compatible",
        batchId,
        phase: "done",
        succeeded,
        failed: out.size - succeeded,
        total: jobs.length,
    });
    return out;
}
221
- // ── Shared poll loop ───────────────────────────────────────────────────────
222
- async function pollUntilDone(check, opts) {
223
- const start = Date.now();
224
- const deadline = start + (opts.batchTimeoutMs ?? 24 * 60 * 60 * 1000);
225
- let delay = opts.pollStartMs ?? 30_000;
226
- const maxDelay = opts.pollMaxMs ?? 5 * 60_000;
227
- while (Date.now() < deadline) {
228
- const result = await check();
229
- if (result != null)
230
- return result;
231
- await new Promise((r) => setTimeout(r, delay));
232
- delay = Math.min(maxDelay, delay * 2);
233
- }
234
- throw new Error("Batch exceeded batchTimeoutMs without completing");
235
- }