claude-overnight 1.57.4 → 1.59.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/evolve.d.ts +1 -1
- package/dist/bin/evolve.js +5 -8
- package/dist/core/_version.d.ts +1 -1
- package/dist/core/_version.js +1 -1
- package/dist/prompt-evolution/evaluator-judge.d.ts +1 -6
- package/dist/prompt-evolution/evaluator-judge.js +2 -58
- package/dist/prompt-evolution/evaluator.d.ts +0 -11
- package/dist/prompt-evolution/evaluator.js +20 -119
- package/dist/prompt-evolution/index.d.ts +0 -2
- package/dist/prompt-evolution/index.js +0 -12
- package/dist/prompt-evolution/persistence.d.ts +0 -20
- package/dist/prompt-evolution/persistence.js +0 -39
- package/dist/prompt-evolution/transport.js +15 -5
- package/docs/prompt-evolution-research.md +1 -1
- package/package.json +1 -1
- package/plugins/claude-overnight/.claude-plugin/plugin.json +1 -1
- package/dist/prompt-evolution/transport-batch.d.ts +0 -54
- package/dist/prompt-evolution/transport-batch.js +0 -216
package/dist/bin/evolve.d.ts
CHANGED
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
*
|
|
9
9
|
* Examples:
|
|
10
10
|
* claude-overnight-evolve --prompt 10_planning/10-3_plan --eval-model claude-haiku-4-5 --generations 3
|
|
11
|
-
* claude-overnight-evolve --target mcp-browser --prompt-kind plan-supervision --eval-model kimi-
|
|
11
|
+
* claude-overnight-evolve --target mcp-browser --prompt-kind plan-supervision --eval-model kimi-for-coding
|
|
12
12
|
*
|
|
13
13
|
* Requires ANTHROPIC_API_KEY (or ANTHROPIC_AUTH_TOKEN) in env. When `--target
|
|
14
14
|
* mcp-browser` is used the cwd must be the MCP-browser repo root (so
|
package/dist/bin/evolve.js
CHANGED
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
*
|
|
9
9
|
* Examples:
|
|
10
10
|
* claude-overnight-evolve --prompt 10_planning/10-3_plan --eval-model claude-haiku-4-5 --generations 3
|
|
11
|
-
* claude-overnight-evolve --target mcp-browser --prompt-kind plan-supervision --eval-model kimi-
|
|
11
|
+
* claude-overnight-evolve --target mcp-browser --prompt-kind plan-supervision --eval-model kimi-for-coding
|
|
12
12
|
*
|
|
13
13
|
* Requires ANTHROPIC_API_KEY (or ANTHROPIC_AUTH_TOKEN) in env. When `--target
|
|
14
14
|
* mcp-browser` is used the cwd must be the MCP-browser repo root (so
|
|
@@ -28,7 +28,10 @@ Options:
|
|
|
28
28
|
--prompt <path> Prompt file path (claude-overnight)
|
|
29
29
|
--prompt-kind <kind> MCP-browser prompt kind: planning | review | evolution |
|
|
30
30
|
goal-refinement | plan-supervision | simple-supervision | stuck-analysis
|
|
31
|
-
--eval-model <model> Fast model for evaluation (default: claude-haiku-4-5)
|
|
31
|
+
--eval-model <model> Fast model for evaluation (default: claude-haiku-4-5).
|
|
32
|
+
For Kimi endpoints use "kimi-for-coding" (stable alias,
|
|
33
|
+
auto-upgrades as flagship revs). For Moonshot platform
|
|
34
|
+
API use "kimi-k2.6" (dot, not dash).
|
|
32
35
|
--eval-models <list> Comma-separated list to run cross-model (overrides --eval-model)
|
|
33
36
|
--mutate-model <model> Smarter model for mutation (defaults to eval-model)
|
|
34
37
|
--generations <n> Number of evolution generations (default: 10)
|
|
@@ -36,7 +39,6 @@ Options:
|
|
|
36
39
|
--plateau <n> Stop early if no improvement for N generations (default: 3)
|
|
37
40
|
--reps <n> Repetitions per (variant, case, model) for noise floor (default: 1)
|
|
38
41
|
--concurrency <n> Max in-flight eval calls (default: 8; bump for slow endpoints)
|
|
39
|
-
--batch Use provider batch API (50% cheaper, slower wall-clock)
|
|
40
42
|
--adaptive-cap <n> Adaptive sampling: extend reps up to N when σ > threshold (default: off)
|
|
41
43
|
--adaptive-threshold <x> σ threshold that triggers an extra rep (default: 0.1)
|
|
42
44
|
--judge Use llm-judge for content scoring (costs extra API calls)
|
|
@@ -77,7 +79,6 @@ function parseArgs() {
|
|
|
77
79
|
population: 8,
|
|
78
80
|
plateau: 3,
|
|
79
81
|
reps: 1,
|
|
80
|
-
batch: false,
|
|
81
82
|
useJudge: false,
|
|
82
83
|
judgeTopN: 4,
|
|
83
84
|
cases: "",
|
|
@@ -135,9 +136,6 @@ function parseArgs() {
|
|
|
135
136
|
opts.concurrency = parseInt(v, 10);
|
|
136
137
|
i++;
|
|
137
138
|
break;
|
|
138
|
-
case "--batch":
|
|
139
|
-
opts.batch = true;
|
|
140
|
-
break;
|
|
141
139
|
case "--adaptive-cap":
|
|
142
140
|
opts.adaptiveCap = parseInt(v, 10);
|
|
143
141
|
i++;
|
|
@@ -327,7 +325,6 @@ async function evolveOne(opts) {
|
|
|
327
325
|
plateauGenerations: opts.plateau,
|
|
328
326
|
repetitions: opts.reps > 1 ? opts.reps : undefined,
|
|
329
327
|
concurrency: opts.concurrency,
|
|
330
|
-
batch: opts.batch || undefined,
|
|
331
328
|
adaptiveReps: opts.adaptiveCap
|
|
332
329
|
? { cap: opts.adaptiveCap, threshold: opts.adaptiveThreshold }
|
|
333
330
|
: undefined,
|
package/dist/core/_version.d.ts
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
export declare const VERSION = "1.57.4";
|
|
1
|
+
export declare const VERSION = "1.59.0";
|
package/dist/core/_version.js
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
// Auto-generated by build — do not edit manually.
|
|
2
|
-
export const VERSION = "1.57.4";
|
|
2
|
+
export const VERSION = "1.59.0";
|
|
@@ -1,20 +1,15 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* LLM-judge pass over a built evaluation matrix.
|
|
3
3
|
*
|
|
4
|
-
* Split out of evaluator.ts to keep each file under the 500-line cap and
|
|
5
|
-
* because the judge has its own concerns (top-N eligibility, batch vs
|
|
6
|
-
* online path, crash-resumable state).
|
|
7
|
-
*
|
|
8
4
|
* The judge REPLACES the heuristic content score with a semantic grade.
|
|
9
5
|
* We only judge top-N variants per generation to cap cost — a judge call
|
|
10
6
|
* per (variant, case, model) on a large population explodes fast.
|
|
11
7
|
*/
|
|
12
8
|
import { type JudgeOpts } from "./llm-judge.js";
|
|
13
9
|
import type { BenchmarkCase, EvaluationResult } from "./types.js";
|
|
14
|
-
import type { EvalOpts } from "./evaluator.js";
|
|
15
10
|
export declare function runJudge(variants: Array<{
|
|
16
11
|
id: string;
|
|
17
12
|
text: string;
|
|
18
13
|
}>, cases: BenchmarkCase[], models: string[], aggregated: Map<string, EvaluationResult>, judge: JudgeOpts & {
|
|
19
14
|
topN?: number;
|
|
20
|
-
}
|
|
15
|
+
}): Promise<void>;
|
|
@@ -1,20 +1,14 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* LLM-judge pass over a built evaluation matrix.
|
|
3
3
|
*
|
|
4
|
-
* Split out of evaluator.ts to keep each file under the 500-line cap and
|
|
5
|
-
* because the judge has its own concerns (top-N eligibility, batch vs
|
|
6
|
-
* online path, crash-resumable state).
|
|
7
|
-
*
|
|
8
4
|
* The judge REPLACES the heuristic content score with a semantic grade.
|
|
9
5
|
* We only judge top-N variants per generation to cap cost — a judge call
|
|
10
6
|
* per (variant, case, model) on a large population explodes fast.
|
|
11
7
|
*/
|
|
12
|
-
import { judgeOutput
|
|
13
|
-
import { batchCallModel } from "./transport-batch.js";
|
|
14
|
-
import { saveBatchState, loadBatchState, markBatchFinished } from "./persistence.js";
|
|
8
|
+
import { judgeOutput } from "./llm-judge.js";
|
|
15
9
|
import { gmean } from "./scorer.js";
|
|
16
10
|
import { averageDimensions } from "./evaluator-utils.js";
|
|
17
|
-
export async function runJudge(variants, cases, models, aggregated, judge, opts) {
|
|
11
|
+
export async function runJudge(variants, cases, models, aggregated, judge) {
|
|
18
12
|
const topN = judge.topN ?? 4;
|
|
19
13
|
const variantGmeans = variants.map((v) => {
|
|
20
14
|
const scores = [];
|
|
@@ -45,10 +39,6 @@ export async function runJudge(variants, cases, models, aggregated, judge, opts)
|
|
|
45
39
|
}
|
|
46
40
|
if (cells.length === 0)
|
|
47
41
|
return;
|
|
48
|
-
if (opts.batch) {
|
|
49
|
-
await runJudgeBatch(cells, judge, opts);
|
|
50
|
-
return;
|
|
51
|
-
}
|
|
52
42
|
const jobs = cells.map((cell) => async () => {
|
|
53
43
|
try {
|
|
54
44
|
const jr = await judgeOutput(cell.r.rawOutput, cell.c, judge);
|
|
@@ -71,49 +61,3 @@ export async function runJudge(variants, cases, models, aggregated, judge, opts)
|
|
|
71
61
|
};
|
|
72
62
|
await Promise.all(Array.from({ length: Math.min(judgeConcurrency, jobs.length) }, judgeWorker));
|
|
73
63
|
}
|
|
74
|
-
async function runJudgeBatch(cells, judge, opts) {
|
|
75
|
-
const batchJobs = cells.map((cell, i) => ({
|
|
76
|
-
customId: `j:${i}|k:${cell.key}`,
|
|
77
|
-
userText: buildJudgePrompt(cell.r.rawOutput, cell.c),
|
|
78
|
-
model: judge.model,
|
|
79
|
-
}));
|
|
80
|
-
const existing = opts.runId != null && opts.generation != null
|
|
81
|
-
? loadBatchState(opts.runId, opts.generation, "judge")
|
|
82
|
-
: null;
|
|
83
|
-
const transport = opts.batchCallModel ?? batchCallModel;
|
|
84
|
-
const results = await transport(batchJobs, {
|
|
85
|
-
baseUrl: judge.baseUrl ?? opts.baseUrl,
|
|
86
|
-
authToken: judge.authToken ?? opts.authToken,
|
|
87
|
-
maxTokens: judge.maxTokens ?? 2048,
|
|
88
|
-
resumeBatchId: existing?.batchId,
|
|
89
|
-
onSubmitted: (batchId, p) => {
|
|
90
|
-
if (opts.runId != null && opts.generation != null && !existing) {
|
|
91
|
-
saveBatchState(opts.runId, {
|
|
92
|
-
generation: opts.generation,
|
|
93
|
-
phase: "judge",
|
|
94
|
-
batchId,
|
|
95
|
-
provider: p,
|
|
96
|
-
submittedAt: new Date().toISOString(),
|
|
97
|
-
});
|
|
98
|
-
}
|
|
99
|
-
opts.onBatchProgress?.(`judge batch submitted: ${batchId} (${p})`);
|
|
100
|
-
},
|
|
101
|
-
onProgress: (p) => opts.onBatchProgress?.(`judge batch ${p.batchId} ${p.phase}${p.succeeded != null ? `: ${p.succeeded}/${p.total ?? batchJobs.length}` : ""}`),
|
|
102
|
-
});
|
|
103
|
-
if (opts.runId != null && existing)
|
|
104
|
-
markBatchFinished(opts.runId, existing.batchId);
|
|
105
|
-
for (const cell of cells) {
|
|
106
|
-
const customId = batchJobs.find((b) => b.customId.includes(`|k:${cell.key}`))?.customId;
|
|
107
|
-
const got = customId ? results.get(customId) : undefined;
|
|
108
|
-
if (!got || !got.raw)
|
|
109
|
-
continue;
|
|
110
|
-
try {
|
|
111
|
-
const jr = parseJudgeOutput(got.raw);
|
|
112
|
-
cell.r.scores = { ...cell.r.scores, content: jr.score };
|
|
113
|
-
cell.r.judgeJustification = jr.justification;
|
|
114
|
-
}
|
|
115
|
-
catch {
|
|
116
|
-
// Judge parse failure is non-fatal — keep heuristic content.
|
|
117
|
-
}
|
|
118
|
-
}
|
|
119
|
-
}
|
|
@@ -17,7 +17,6 @@
|
|
|
17
17
|
*/
|
|
18
18
|
import { type JudgeOpts } from "./llm-judge.js";
|
|
19
19
|
import { type CallModel } from "./transport.js";
|
|
20
|
-
import { batchCallModel } from "./transport-batch.js";
|
|
21
20
|
import type { BenchmarkCase, VariantRow, PromptVars } from "./types.js";
|
|
22
21
|
export interface EvalOpts {
|
|
23
22
|
/** Primary generator model (retained for single-model compat). */
|
|
@@ -51,18 +50,8 @@ export interface EvalOpts {
|
|
|
51
50
|
};
|
|
52
51
|
/** Transport override for tests. */
|
|
53
52
|
callModel?: CallModel;
|
|
54
|
-
/** Use provider batch API instead of online calls (50% cheaper, slower wall-clock). */
|
|
55
|
-
batch?: boolean;
|
|
56
|
-
/** Run id — required when batch=true so state is crash-resumable. */
|
|
57
|
-
runId?: string;
|
|
58
|
-
/** Current generation number — used to key batch state. */
|
|
59
|
-
generation?: number;
|
|
60
|
-
/** Batch-transport override for tests. Same return shape as transport-batch.batchCallModel. */
|
|
61
|
-
batchCallModel?: typeof batchCallModel;
|
|
62
53
|
/** Optional callback for progress */
|
|
63
54
|
onProgress?: (done: number, total: number, caseName: string, variantId: string) => void;
|
|
64
|
-
/** Progress callback specific to batch-phase transitions. */
|
|
65
|
-
onBatchProgress?: (msg: string) => void;
|
|
66
55
|
}
|
|
67
56
|
export declare function buildMatrix(variants: Array<{
|
|
68
57
|
id: string;
|
|
@@ -18,8 +18,6 @@
|
|
|
18
18
|
import { renderPrompt } from "../prompts/load.js";
|
|
19
19
|
import { scoreOutput, gmean, aggregateReps, bootstrapCI, kendallTau } from "./scorer.js";
|
|
20
20
|
import { defaultCallModel, attemptJsonParse, } from "./transport.js";
|
|
21
|
-
import { batchCallModel, detectBatchProvider, } from "./transport-batch.js";
|
|
22
|
-
import { saveBatchState, loadBatchState, markBatchFinished } from "./persistence.js";
|
|
23
21
|
import { averageDimensions } from "./evaluator-utils.js";
|
|
24
22
|
import { runJudge } from "./evaluator-judge.js";
|
|
25
23
|
export async function buildMatrix(variants, cases, opts) {
|
|
@@ -38,53 +36,30 @@ export async function buildMatrix(variants, cases, opts) {
|
|
|
38
36
|
}
|
|
39
37
|
}
|
|
40
38
|
}
|
|
41
|
-
//
|
|
42
|
-
//
|
|
43
|
-
// results as they arrive. 50% cheaper, slower wall-clock.
|
|
44
|
-
// batch=false — work-stealing pool: keep `concurrency` jobs in flight so
|
|
45
|
-
// a slow call doesn't block the others in its slice.
|
|
39
|
+
// Work-stealing pool: keep `concurrency` jobs in flight so a slow call
|
|
40
|
+
// doesn't block the others in its slice.
|
|
46
41
|
const rawByKey = new Map();
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
opts.onProgress?.(done, jobs.length, r.caseName, r.variantId);
|
|
62
|
-
}
|
|
63
|
-
};
|
|
64
|
-
await Promise.all(Array.from({ length: Math.min(concurrency, jobs.length) }, worker));
|
|
65
|
-
};
|
|
66
|
-
if (opts.batch) {
|
|
67
|
-
try {
|
|
68
|
-
await runBatchPath(jobs, opts, rawByKey);
|
|
69
|
-
}
|
|
70
|
-
catch (err) {
|
|
71
|
-
// Batch submission failed (Kimi's /v1/files doesn't match OpenAI,
|
|
72
|
-
// OpenRouter has no batch at all, transient provider error, etc.).
|
|
73
|
-
// Fall back to the online pool so the whole run doesn't die — losing
|
|
74
|
-
// the 50% batch discount is better than losing the run.
|
|
75
|
-
const msg = err instanceof Error ? err.message : String(err);
|
|
76
|
-
opts.onBatchProgress?.(`batch path failed, falling back to online: ${msg.slice(0, 200)}`);
|
|
77
|
-
rawByKey.clear(); // discard any partial state
|
|
78
|
-
await runOnlinePool();
|
|
42
|
+
let done = 0;
|
|
43
|
+
let next = 0;
|
|
44
|
+
const worker = async () => {
|
|
45
|
+
while (true) {
|
|
46
|
+
const i = next++;
|
|
47
|
+
if (i >= jobs.length)
|
|
48
|
+
return;
|
|
49
|
+
const r = await runSingle(jobs[i], opts, transport);
|
|
50
|
+
const key = `${r.variantId}:${r.caseHash}:${r.model ?? ""}`;
|
|
51
|
+
const arr = rawByKey.get(key) ?? [];
|
|
52
|
+
arr.push(r);
|
|
53
|
+
rawByKey.set(key, arr);
|
|
54
|
+
done++;
|
|
55
|
+
opts.onProgress?.(done, jobs.length, r.caseName, r.variantId);
|
|
79
56
|
}
|
|
80
|
-
}
|
|
81
|
-
|
|
82
|
-
await runOnlinePool();
|
|
83
|
-
}
|
|
57
|
+
};
|
|
58
|
+
await Promise.all(Array.from({ length: Math.min(concurrency, jobs.length) }, worker));
|
|
84
59
|
// Adaptive sampling: for cells where any score-dim σ exceeds threshold,
|
|
85
60
|
// add one more rep and rerun — up to `cap` total reps. Converges on a
|
|
86
61
|
// stable estimate without wasting reps on already-stable cells.
|
|
87
|
-
if (
|
|
62
|
+
if (opts.adaptiveReps) {
|
|
88
63
|
const cap = opts.adaptiveReps.cap;
|
|
89
64
|
const threshold = opts.adaptiveReps.threshold ?? 0.1;
|
|
90
65
|
for (let round = 0; round < cap - reps; round++) {
|
|
@@ -131,7 +106,7 @@ export async function buildMatrix(variants, cases, opts) {
|
|
|
131
106
|
}
|
|
132
107
|
// Optional llm-judge pass on top-N variants (by current heuristic content).
|
|
133
108
|
if (opts.judge)
|
|
134
|
-
await runJudge(variants, cases, models, aggregated, opts.judge, opts);
|
|
109
|
+
await runJudge(variants, cases, models, aggregated, opts.judge);
|
|
135
110
|
// Assemble rows: per-variant aggregate across all cases and models.
|
|
136
111
|
const rows = [];
|
|
137
112
|
for (const v of variants) {
|
|
@@ -244,80 +219,6 @@ function halfSplitMatrix(variants, cases, models, rawByKey, side) {
|
|
|
244
219
|
scored.sort((a, b) => b.g - a.g);
|
|
245
220
|
return scored.map((s) => s.id);
|
|
246
221
|
}
|
|
247
|
-
async function runBatchPath(jobs, opts, rawByKey) {
|
|
248
|
-
const provider = detectBatchProvider(opts.baseUrl);
|
|
249
|
-
if (provider === "unsupported") {
|
|
250
|
-
throw new Error(`Batch API not supported for baseUrl=${opts.baseUrl}; rerun without --batch or point at an Anthropic / OpenAI-compatible endpoint.`);
|
|
251
|
-
}
|
|
252
|
-
// Build custom_ids that route results back to the right cell. Index is
|
|
253
|
-
// included so reps of the same (variant, case, model) don't collide.
|
|
254
|
-
const keyed = jobs.map((job, i) => ({
|
|
255
|
-
job,
|
|
256
|
-
index: i,
|
|
257
|
-
customId: `v:${job.variantId}|h:${job.case.hash}|m:${job.model}|r:${job.rep}|i:${i}`,
|
|
258
|
-
}));
|
|
259
|
-
const batchJobs = keyed.map((k) => ({
|
|
260
|
-
customId: k.customId,
|
|
261
|
-
userText: k.job.text,
|
|
262
|
-
systemText: k.job.systemText,
|
|
263
|
-
model: k.job.model,
|
|
264
|
-
}));
|
|
265
|
-
const started = Date.now();
|
|
266
|
-
const existing = opts.runId != null && opts.generation != null
|
|
267
|
-
? loadBatchState(opts.runId, opts.generation, "eval")
|
|
268
|
-
: null;
|
|
269
|
-
const transport = opts.batchCallModel ?? batchCallModel;
|
|
270
|
-
const results = await transport(batchJobs, {
|
|
271
|
-
baseUrl: opts.baseUrl,
|
|
272
|
-
authToken: opts.authToken,
|
|
273
|
-
maxTokens: opts.maxTokens,
|
|
274
|
-
resumeBatchId: existing?.batchId,
|
|
275
|
-
onSubmitted: (batchId, p) => {
|
|
276
|
-
if (opts.runId != null && opts.generation != null && !existing) {
|
|
277
|
-
saveBatchState(opts.runId, {
|
|
278
|
-
generation: opts.generation,
|
|
279
|
-
phase: "eval",
|
|
280
|
-
batchId,
|
|
281
|
-
provider: p,
|
|
282
|
-
submittedAt: new Date().toISOString(),
|
|
283
|
-
});
|
|
284
|
-
}
|
|
285
|
-
opts.onBatchProgress?.(`batch submitted: ${batchId} (${p})`);
|
|
286
|
-
},
|
|
287
|
-
onProgress: (p) => {
|
|
288
|
-
if (p.phase === "polling") {
|
|
289
|
-
const ok = p.succeeded ?? 0;
|
|
290
|
-
const failed = p.failed ?? 0;
|
|
291
|
-
const total = p.total ?? batchJobs.length;
|
|
292
|
-
opts.onBatchProgress?.(`batch ${p.batchId} polling: ${ok}/${total} done${failed ? `, ${failed} failed` : ""}`);
|
|
293
|
-
}
|
|
294
|
-
else {
|
|
295
|
-
opts.onBatchProgress?.(`batch ${p.batchId} ${p.phase}`);
|
|
296
|
-
}
|
|
297
|
-
},
|
|
298
|
-
});
|
|
299
|
-
// Mark the state entry as finished so a crash after this point doesn't
|
|
300
|
-
// cause the next run to try resuming an already-consumed batch.
|
|
301
|
-
if (opts.runId != null && existing)
|
|
302
|
-
markBatchFinished(opts.runId, existing.batchId);
|
|
303
|
-
// Score each result and populate rawByKey the same way runSingle does.
|
|
304
|
-
const durationMs = Math.round((Date.now() - started) / Math.max(1, jobs.length));
|
|
305
|
-
let done = 0;
|
|
306
|
-
for (const k of keyed) {
|
|
307
|
-
const r = results.get(k.customId);
|
|
308
|
-
const raw = r?.raw ?? "batch returned no result for this custom_id";
|
|
309
|
-
const costUsd = r?.costUsd ?? 0;
|
|
310
|
-
const parsed = attemptJsonParse(raw);
|
|
311
|
-
const scored = scoreOutput(raw, parsed, costUsd, durationMs, k.job.case, { model: k.job.model });
|
|
312
|
-
scored.variantId = k.job.variantId;
|
|
313
|
-
const mapKey = `${scored.variantId}:${scored.caseHash}:${scored.model ?? ""}`;
|
|
314
|
-
const arr = rawByKey.get(mapKey) ?? [];
|
|
315
|
-
arr.push(scored);
|
|
316
|
-
rawByKey.set(mapKey, arr);
|
|
317
|
-
done++;
|
|
318
|
-
opts.onProgress?.(done, jobs.length, k.job.case.name, k.job.variantId);
|
|
319
|
-
}
|
|
320
|
-
}
|
|
321
222
|
async function runSingle(job, opts, transport) {
|
|
322
223
|
const started = Date.now();
|
|
323
224
|
const callOpts = {
|
|
@@ -54,8 +54,6 @@ export interface EvolveOpts {
|
|
|
54
54
|
repetitions?: number;
|
|
55
55
|
/** Max in-flight eval calls. Default 8. Raise for slow endpoints, lower for strict rate limits. */
|
|
56
56
|
concurrency?: number;
|
|
57
|
-
/** Use provider batch API instead of online calls. 50% cheaper, slower wall-clock. */
|
|
58
|
-
batch?: boolean;
|
|
59
57
|
/** Adaptive sampling cap (opt-in). Keeps adding reps to noisy cells up to this count. */
|
|
60
58
|
adaptiveReps?: {
|
|
61
59
|
cap: number;
|
|
@@ -73,14 +73,10 @@ export async function evolvePrompt(opts) {
|
|
|
73
73
|
concurrency: opts.concurrency ?? 8,
|
|
74
74
|
repetitions: opts.repetitions,
|
|
75
75
|
judge: opts.judge,
|
|
76
|
-
batch: opts.batch,
|
|
77
76
|
adaptiveReps: opts.adaptiveReps,
|
|
78
|
-
runId,
|
|
79
|
-
generation: gen,
|
|
80
77
|
onProgress: (done, total, caseName, variantId) => {
|
|
81
78
|
log(` [${done}/${total}] ${variantId.slice(0, 16)} → ${caseName}`);
|
|
82
79
|
},
|
|
83
|
-
onBatchProgress: (msg) => log(` [batch] ${msg}`),
|
|
84
80
|
};
|
|
85
81
|
const matrix = await buildMatrix(population, trainCases, evalOpts);
|
|
86
82
|
generationMatrices.push(matrix);
|
|
@@ -195,11 +191,7 @@ export async function evolvePrompt(opts) {
|
|
|
195
191
|
concurrency: opts.concurrency ?? 8,
|
|
196
192
|
repetitions: opts.repetitions,
|
|
197
193
|
judge: opts.judge,
|
|
198
|
-
batch: opts.batch,
|
|
199
194
|
adaptiveReps: opts.adaptiveReps,
|
|
200
|
-
runId,
|
|
201
|
-
generation: generations,
|
|
202
|
-
onBatchProgress: (msg) => log(` [batch] ${msg}`),
|
|
203
195
|
});
|
|
204
196
|
generationMatrices.push(finalMatrix);
|
|
205
197
|
snapshotPrompts(runId, finalMatrix);
|
|
@@ -219,11 +211,7 @@ export async function evolvePrompt(opts) {
|
|
|
219
211
|
authToken: opts.authToken,
|
|
220
212
|
concurrency: opts.concurrency ?? 8,
|
|
221
213
|
repetitions: opts.repetitions,
|
|
222
|
-
batch: opts.batch,
|
|
223
214
|
adaptiveReps: opts.adaptiveReps,
|
|
224
|
-
runId,
|
|
225
|
-
generation: generations + 1,
|
|
226
|
-
onBatchProgress: (msg) => log(` [batch-test] ${msg}`),
|
|
227
215
|
});
|
|
228
216
|
log(formatMatrix(testMatrix, testCases.map((c) => c.name)));
|
|
229
217
|
}
|
|
@@ -37,26 +37,6 @@ export declare function appendLearning(runId: string, entries: LearningEntry[]):
|
|
|
37
37
|
export declare function snapshotPrompts(runId: string, rows: VariantRow[]): void;
|
|
38
38
|
/** Finalise the run: write best.md and update meta.json. */
|
|
39
39
|
export declare function finalizeRun(runId: string, result: EvolutionResult, metaPartial?: Partial<RunMeta>): void;
|
|
40
|
-
/**
|
|
41
|
-
* Persist batch submission state so a crashed or restarted run can resume
|
|
42
|
-
* polling instead of resubmitting (which would duplicate the bill).
|
|
43
|
-
*
|
|
44
|
-
* Keyed by (generation, phase) so multi-generation runs and eval-vs-judge
|
|
45
|
-
* submissions don't collide. Written append-only — the latest entry wins
|
|
46
|
-
* on load.
|
|
47
|
-
*/
|
|
48
|
-
export interface BatchStateEntry {
|
|
49
|
-
generation: number;
|
|
50
|
-
phase: "eval" | "judge";
|
|
51
|
-
batchId: string;
|
|
52
|
-
provider: "anthropic" | "openai-compatible";
|
|
53
|
-
submittedAt: string;
|
|
54
|
-
/** If set, we've already collected results for this entry — ignore on resume. */
|
|
55
|
-
finishedAt?: string;
|
|
56
|
-
}
|
|
57
|
-
export declare function saveBatchState(runId: string, entry: BatchStateEntry): void;
|
|
58
|
-
export declare function loadBatchState(runId: string, generation: number, phase: "eval" | "judge"): BatchStateEntry | null;
|
|
59
|
-
export declare function markBatchFinished(runId: string, batchId: string): void;
|
|
60
40
|
/** List all runs, newest first. */
|
|
61
41
|
export declare function listRuns(): Array<{
|
|
62
42
|
runId: string;
|
|
@@ -118,45 +118,6 @@ ${result.learningLog.map((l) => `| ${l.generation} | ${l.mutationSummary} | ${(l
|
|
|
118
118
|
`;
|
|
119
119
|
writeFileSync(join(root, "best.md"), report);
|
|
120
120
|
}
|
|
121
|
-
export function saveBatchState(runId, entry) {
|
|
122
|
-
const path = join(runDir(runId), "batch-jobs.jsonl");
|
|
123
|
-
writeFileSync(path, JSON.stringify(entry) + "\n", { flag: "a" });
|
|
124
|
-
}
|
|
125
|
-
export function loadBatchState(runId, generation, phase) {
|
|
126
|
-
const path = join(runDir(runId), "batch-jobs.jsonl");
|
|
127
|
-
if (!existsSync(path))
|
|
128
|
-
return null;
|
|
129
|
-
const lines = readFileSync(path, "utf-8").split("\n").filter(Boolean);
|
|
130
|
-
let latest = null;
|
|
131
|
-
for (const line of lines) {
|
|
132
|
-
try {
|
|
133
|
-
const e = JSON.parse(line);
|
|
134
|
-
if (e.generation === generation && e.phase === phase)
|
|
135
|
-
latest = e;
|
|
136
|
-
}
|
|
137
|
-
catch { /* skip malformed */ }
|
|
138
|
-
}
|
|
139
|
-
// Only return if not yet finished — otherwise caller would re-poll a consumed batch.
|
|
140
|
-
return latest && !latest.finishedAt ? latest : null;
|
|
141
|
-
}
|
|
142
|
-
export function markBatchFinished(runId, batchId) {
|
|
143
|
-
const path = join(runDir(runId), "batch-jobs.jsonl");
|
|
144
|
-
if (!existsSync(path))
|
|
145
|
-
return;
|
|
146
|
-
const lines = readFileSync(path, "utf-8").split("\n").filter(Boolean);
|
|
147
|
-
const updated = lines.map((line) => {
|
|
148
|
-
try {
|
|
149
|
-
const e = JSON.parse(line);
|
|
150
|
-
if (e.batchId === batchId && !e.finishedAt) {
|
|
151
|
-
e.finishedAt = new Date().toISOString();
|
|
152
|
-
return JSON.stringify(e);
|
|
153
|
-
}
|
|
154
|
-
}
|
|
155
|
-
catch { /* skip */ }
|
|
156
|
-
return line;
|
|
157
|
-
});
|
|
158
|
-
writeFileSync(path, updated.join("\n") + "\n");
|
|
159
|
-
}
|
|
160
121
|
/** List all runs, newest first. */
|
|
161
122
|
export function listRuns() {
|
|
162
123
|
const root = storeRoot();
|
|
@@ -8,17 +8,22 @@
|
|
|
8
8
|
* Supports both Anthropic-native and OpenAI-compatible endpoints so we can
|
|
9
9
|
* run the same eval against Haiku, Kimi, and OpenRouter without a rewrite.
|
|
10
10
|
*/
|
|
11
|
+
import { VERSION } from "../core/_version.js";
|
|
12
|
+
const USER_AGENT = `claude-overnight-evolve/${VERSION}`;
|
|
11
13
|
export async function defaultCallModel(userText, systemText, opts) {
|
|
12
14
|
const baseUrl = (opts.baseUrl ?? process.env.ANTHROPIC_BASE_URL ?? "https://api.anthropic.com").replace(/\/$/, "");
|
|
13
15
|
const authToken = opts.authToken ?? process.env.ANTHROPIC_AUTH_TOKEN ?? process.env.ANTHROPIC_API_KEY ?? "";
|
|
14
16
|
const isAnthropic = /^https?:\/\/(api\.)?anthropic\.com/i.test(baseUrl);
|
|
15
|
-
|
|
17
|
+
// Identify ourselves honestly. Kimi's coding-endpoint docs explicitly say
|
|
18
|
+
// "Tampering with the client identifier (User-Agent) is considered a
|
|
19
|
+
// violation." The previous "Kilo-Code/1.0" was impersonating a third-party
|
|
20
|
+
// tool; we now send our real binary name + version.
|
|
16
21
|
const headers = {
|
|
17
22
|
"Content-Type": "application/json",
|
|
18
23
|
"Authorization": `Bearer ${authToken}`,
|
|
24
|
+
"User-Agent": USER_AGENT,
|
|
19
25
|
};
|
|
20
|
-
|
|
21
|
-
headers["User-Agent"] = "Kilo-Code/1.0";
|
|
26
|
+
const maxOut = opts.maxTokens ?? 4096;
|
|
22
27
|
let endpoint;
|
|
23
28
|
let body;
|
|
24
29
|
if (isAnthropic) {
|
|
@@ -26,7 +31,7 @@ export async function defaultCallModel(userText, systemText, opts) {
|
|
|
26
31
|
headers["anthropic-version"] = "2023-06-01";
|
|
27
32
|
const payload = {
|
|
28
33
|
model: opts.model,
|
|
29
|
-
max_tokens:
|
|
34
|
+
max_tokens: maxOut, // Anthropic uses max_tokens, not max_completion_tokens.
|
|
30
35
|
messages: [{ role: "user", content: userText }],
|
|
31
36
|
};
|
|
32
37
|
if (systemText)
|
|
@@ -39,9 +44,14 @@ export async function defaultCallModel(userText, systemText, opts) {
|
|
|
39
44
|
if (systemText)
|
|
40
45
|
messages.push({ role: "system", content: systemText });
|
|
41
46
|
messages.push({ role: "user", content: userText });
|
|
47
|
+
// Platform.moonshot.ai marks max_tokens deprecated in favor of
|
|
48
|
+
// max_completion_tokens. Kimi's coding endpoint still accepts max_tokens.
|
|
49
|
+
// Sending both is safe — OpenAI, Moonshot, DeepSeek, and Kimi all tolerate
|
|
50
|
+
// the extra field, and we're future-proof against the deprecation.
|
|
42
51
|
body = JSON.stringify({
|
|
43
52
|
model: opts.model,
|
|
44
|
-
max_tokens:
|
|
53
|
+
max_tokens: maxOut,
|
|
54
|
+
max_completion_tokens: maxOut,
|
|
45
55
|
messages,
|
|
46
56
|
});
|
|
47
57
|
}
|
|
@@ -183,7 +183,7 @@ Your laptop can be off the whole time.
|
|
|
183
183
|
npm run evolve -- --prompt 10_planning/10-3_plan --eval-model claude-haiku-4-5 --generations 10
|
|
184
184
|
|
|
185
185
|
# Evolve an MCP-browser supervision prompt
|
|
186
|
-
npm run evolve -- --target mcp-browser --prompt-kind plan-supervision --eval-model kimi-
|
|
186
|
+
npm run evolve -- --target mcp-browser --prompt-kind plan-supervision --eval-model kimi-for-coding --generations 10
|
|
187
187
|
```
|
|
188
188
|
|
|
189
189
|
### Via Platform API (runs on server)
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "claude-overnight",
|
|
3
|
-
"version": "1.57.4",
|
|
3
|
+
"version": "1.59.0",
|
|
4
4
|
"description": "Overnight parallel coding agents in git worktrees, with a self-curating skill memory that improves while the run is going. Mix Claude Opus as planner, Kimi 2.6 or Cursor composer-2 as cheap fast worker, Gemini or Qwen for bulk implementation. Multi-wave autonomous loop that plans, executes, reviews, and steers itself until the objective is met. Crash-safe resume, rate-limit aware, usage cap preserves headroom for your interactive Claude Code.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "claude-overnight",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.59.0",
|
|
4
4
|
"description": "Claude Code skill for understanding, installing, and inspecting claude-overnight runs: overnight parallel coding agents in git worktrees with a self-curating skill memory, multi-wave steering, three-layer review, and crash-safe resume. Mix Opus planner with Kimi 2.6, Cursor composer-2, Gemini, Qwen, or any Anthropic-compatible worker.",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "Francesco Fornace"
|
|
@@ -1,54 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Batch-API transport for prompt evolution.
|
|
3
|
-
*
|
|
4
|
-
* 50% cheaper than online calls on every major provider that supports
|
|
5
|
-
* batch. Perfect fit for generations=1 benchmark rounds where interactive
|
|
6
|
-
* progress isn't needed — we submit 120-1000 requests, poll every 30-300s,
|
|
7
|
-
* then pull the results in one shot.
|
|
8
|
-
*
|
|
9
|
-
* Provider detection from baseUrl:
|
|
10
|
-
* - api.anthropic.com → Anthropic Message Batches API (one-shot submit)
|
|
11
|
-
* - kimi / moonshot / openai → OpenAI-compatible file-based batch
|
|
12
|
-
* - openrouter → NO batch support; throws (caller must fall back to online)
|
|
13
|
-
*
|
|
14
|
-
* Custom IDs route results back to the right (variant, case, model, rep)
|
|
15
|
-
* cell. The evaluator builds ids like `v0:h_abc:kimi-k2-6:r0`.
|
|
16
|
-
*
|
|
17
|
-
* Poll state is persisted via `persistBatchState` so a crashed or
|
|
18
|
-
* restarted run can resume without resubmitting.
|
|
19
|
-
*/
|
|
20
|
-
import type { CallModelResult } from "./transport.js";
|
|
21
|
-
export interface BatchJob {
|
|
22
|
-
customId: string;
|
|
23
|
-
userText: string;
|
|
24
|
-
systemText?: string;
|
|
25
|
-
model: string;
|
|
26
|
-
}
|
|
27
|
-
export interface BatchOpts {
|
|
28
|
-
baseUrl?: string;
|
|
29
|
-
authToken?: string;
|
|
30
|
-
maxTokens?: number;
|
|
31
|
-
/** Poll interval starts here and doubles to `pollMaxMs`. Defaults 30s → 5min. */
|
|
32
|
-
pollStartMs?: number;
|
|
33
|
-
pollMaxMs?: number;
|
|
34
|
-
/** Overall timeout for the whole batch. Default 24h — matches provider SLAs. */
|
|
35
|
-
batchTimeoutMs?: number;
|
|
36
|
-
/** Called with progress snapshots during polling. */
|
|
37
|
-
onProgress?: (p: BatchProgress) => void;
|
|
38
|
-
/** Restore a previously-submitted batch instead of resubmitting. */
|
|
39
|
-
resumeBatchId?: string;
|
|
40
|
-
/** Called after submit returns an id — use to persist for crash resume. */
|
|
41
|
-
onSubmitted?: (batchId: string, provider: BatchProvider) => void;
|
|
42
|
-
}
|
|
43
|
-
/** Snapshot handed to `BatchOpts.onProgress` as a batch moves through its lifecycle. */
export interface BatchProgress {
    /** Which batch API flavour is handling the batch. */
    provider: BatchProvider;
    /** Provider-assigned id of the batch being tracked. */
    batchId: string;
    /** Lifecycle stage this snapshot was taken in. */
    phase:
        | "submitted"
        | "polling"
        | "downloading"
        | "done";
    /** Request counters; each one is present only when the emitting phase supplies it. */
    total?: number;
    processing?: number;
    succeeded?: number;
    failed?: number;
}
|
|
52
|
-
/** Batch API flavour inferred from a base URL; `"unsupported"` means no batch endpoint is available. */
export type BatchProvider =
    | "anthropic"
    | "openai-compatible"
    | "unsupported";

/** Classify `baseUrl` into the batch API flavour it should be driven with. */
export declare function detectBatchProvider(baseUrl: string | undefined): BatchProvider;

/** Submit `jobs` as one batch and resolve with a map from each job's custom id to its result. */
export declare function batchCallModel(jobs: Array<BatchJob>, opts: BatchOpts): Promise<Map<string, CallModelResult>>;
|
|
@@ -1,216 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Batch-API transport for prompt evolution.
|
|
3
|
-
*
|
|
4
|
-
* 50% cheaper than online calls on every major provider that supports
|
|
5
|
-
* batch. Perfect fit for generations=1 benchmark rounds where interactive
|
|
6
|
-
* progress isn't needed — we submit 120-1000 requests, poll every 30-300s,
|
|
7
|
-
* then pull the results in one shot.
|
|
8
|
-
*
|
|
9
|
-
* Provider detection from baseUrl:
|
|
10
|
-
* - api.anthropic.com → Anthropic Message Batches API (one-shot submit)
|
|
11
|
-
* - kimi / moonshot / openai → OpenAI-compatible file-based batch
|
|
12
|
-
* - openrouter → NO batch support; throws (caller must fall back to online)
|
|
13
|
-
*
|
|
14
|
-
* Custom IDs route results back to the right (variant, case, model, rep)
|
|
15
|
-
* cell. The evaluator builds ids like `v0:h_abc:kimi-k2-6:r0`.
|
|
16
|
-
*
|
|
17
|
-
* Poll state is persisted via `persistBatchState` so a crashed or
|
|
18
|
-
* restarted run can resume without resubmitting.
|
|
19
|
-
*/
|
|
20
|
-
/**
 * Decide which batch API flavour a base URL should be driven with.
 *
 * The URL is lower-cased before matching; a missing URL is treated as the
 * Anthropic default endpoint.
 *
 * @param {string | undefined} baseUrl Provider base URL, if any.
 * @returns {"anthropic" | "openai-compatible" | "unsupported"}
 *   `"anthropic"` for anthropic.com hosts, `"unsupported"` for anything
 *   mentioning openrouter, `"openai-compatible"` for everything else.
 */
export function detectBatchProvider(baseUrl) {
    const normalized = (baseUrl ?? "https://api.anthropic.com").toLowerCase();
    const looksLikeAnthropic = /(^|\/\/)(api\.)?anthropic\.com/.test(normalized);
    if (looksLikeAnthropic) {
        return "anthropic";
    }
    // Anything else that is not OpenRouter is assumed to expose an
    // OpenAI-compatible batch endpoint (OpenAI, Kimi, Moonshot, DeepSeek).
    return /openrouter/.test(normalized) ? "unsupported" : "openai-compatible";
}
|
|
30
|
-
/**
 * Submit `jobs` through the batch API that matches `opts.baseUrl`.
 *
 * @param {Array<object>} jobs Batch jobs; an empty list short-circuits to an empty map.
 * @param {object} opts Batch options; `baseUrl` selects the provider.
 * @returns {Promise<Map<string, object>>} Results keyed by each job's custom id.
 * @throws {Error} When the provider has no batch support.
 */
export async function batchCallModel(jobs, opts) {
    // Nothing to submit — skip provider detection entirely.
    if (jobs.length === 0) {
        return new Map();
    }
    switch (detectBatchProvider(opts.baseUrl)) {
        case "unsupported":
            throw new Error(`Batch API not supported for baseUrl=${opts.baseUrl}; use online transport`);
        case "anthropic":
            return runAnthropicBatch(jobs, opts);
        default:
            return runOpenAIBatch(jobs, opts);
    }
}
|
|
41
|
-
// ── Anthropic ──────────────────────────────────────────────────────────────
|
|
42
|
-
/**
 * Run `jobs` through the Anthropic Message Batches API.
 *
 * Flow: submit the batch (skipped when `opts.resumeBatchId` is set), poll it
 * until `processing_status` is `"ended"`, download the JSONL results and map
 * every row by its `custom_id`.
 *
 * Rows that did not succeed are still returned, with `raw` set to
 * `"batch error: <detail>"` and zeroed usage, so callers can tell which cells failed.
 *
 * @param {Array<{customId: string, userText: string, systemText?: string, model: string}>} jobs
 *   Requests to submit. Ignored for submission when resuming.
 * @param {object} opts Batch options (`baseUrl`, `authToken`, `maxTokens`,
 *   `resumeBatchId`, `onSubmitted`, `onProgress`, plus the polling knobs that
 *   are forwarded to `pollUntilDone`).
 * @returns {Promise<Map<string, {raw: string, costUsd: number, inputTokens: number, outputTokens: number}>>}
 * @throws {Error} On a non-2xx submit/poll/results response, on a submit
 *   response that carries no batch id, or when polling gives up.
 */
async function runAnthropicBatch(jobs, opts) {
    const baseUrl = (opts.baseUrl ?? "https://api.anthropic.com").replace(/\/$/, "");
    const authToken = opts.authToken ?? process.env.ANTHROPIC_AUTH_TOKEN ?? process.env.ANTHROPIC_API_KEY ?? "";
    // NOTE(review): the credential is always sent as a Bearer token, even when it
    // came from ANTHROPIC_API_KEY — confirm the target endpoint accepts that.
    const headers = {
        "Content-Type": "application/json",
        "Authorization": `Bearer ${authToken}`,
        "anthropic-version": "2023-06-01",
        "anthropic-beta": "message-batches-2024-09-24",
    };
    let batchId = opts.resumeBatchId;
    if (!batchId) {
        const requests = jobs.map((j) => {
            const params = {
                model: j.model,
                max_tokens: opts.maxTokens ?? 4096,
                messages: [{ role: "user", content: j.userText }],
            };
            if (j.systemText)
                params.system = j.systemText;
            return { custom_id: j.customId, params };
        });
        const submitRes = await fetch(`${baseUrl}/v1/messages/batches`, {
            method: "POST",
            headers,
            body: JSON.stringify({ requests }),
        });
        if (!submitRes.ok)
            throw new Error(`Anthropic batch submit: HTTP ${submitRes.status} ${await submitRes.text()}`);
        const data = await submitRes.json();
        // Without an id every later URL would point at ".../batches/undefined".
        if (!data?.id)
            throw new Error("Anthropic batch submit: response did not include a batch id");
        batchId = data.id;
        opts.onSubmitted?.(batchId, "anthropic");
    }
    opts.onProgress?.({ provider: "anthropic", batchId, phase: "submitted", total: jobs.length });
    const endedAt = await pollUntilDone(async () => {
        const res = await fetch(`${baseUrl}/v1/messages/batches/${batchId}`, { headers });
        if (!res.ok)
            throw new Error(`Anthropic batch poll: HTTP ${res.status}`);
        const d = await res.json();
        opts.onProgress?.({
            provider: "anthropic",
            batchId: batchId,
            phase: "polling",
            processing: d.request_counts?.processing,
            succeeded: d.request_counts?.succeeded,
            failed: (d.request_counts?.errored ?? 0) + (d.request_counts?.canceled ?? 0) + (d.request_counts?.expired ?? 0),
            total: jobs.length,
        });
        return d.processing_status === "ended" ? d : null;
    }, opts);
    opts.onProgress?.({ provider: "anthropic", batchId, phase: "downloading" });
    const resultsUrl = endedAt.results_url ?? `${baseUrl}/v1/messages/batches/${batchId}/results`;
    const res = await fetch(resultsUrl, { headers });
    if (!res.ok)
        throw new Error(`Anthropic batch results: HTTP ${res.status}`);
    const text = await res.text();
    const out = new Map();
    let succeeded = 0;
    for (const line of text.split("\n")) {
        if (!line.trim())
            continue;
        const row = JSON.parse(line);
        const result = row.result;
        if (result?.type === "succeeded") {
            const raw = result.message.content.map((c) => c.text ?? "").join("");
            const inp = result.message.usage?.input_tokens ?? 0;
            const outp = result.message.usage?.output_tokens ?? 0;
            // Fixed per-token rates, halved for the batch discount; not model-specific.
            out.set(row.custom_id, { raw, costUsd: (inp * 0.000003 + outp * 0.000015) * 0.5, inputTokens: inp, outputTokens: outp });
            succeeded += 1;
            continue;
        }
        // Errored rows wrap the API error object (`error.error.message`); the flat
        // `error.message` form is still honoured for endpoints that return it.
        const detail = result?.type === "errored"
            ? (result.error?.error?.message ?? result.error?.message ?? "errored")
            : (result?.type ?? "unknown");
        out.set(row.custom_id, { raw: `batch error: ${detail}`, costUsd: 0, inputTokens: 0, outputTokens: 0 });
    }
    // Report real successes; failed rows are in `out` too, so `out.size` would overcount.
    opts.onProgress?.({
        provider: "anthropic",
        batchId,
        phase: "done",
        succeeded,
        failed: out.size - succeeded,
        total: jobs.length,
    });
    return out;
}
|
|
114
|
-
// ── OpenAI-compatible (OpenAI, Kimi/Moonshot, DeepSeek) ────────────────────
|
|
115
|
-
/**
 * Run `jobs` through an OpenAI-compatible file-based batch endpoint.
 *
 * Flow: upload the requests as a JSONL file and create a batch for
 * `/v1/chat/completions` (both skipped when `opts.resumeBatchId` is set), poll
 * until the batch is `"completed"`, then download the output file and, when
 * present, the error file, mapping every row by its `custom_id`.
 *
 * Rows that failed are still returned, with `raw` set to
 * `"batch error: <detail>"` and zeroed usage.
 *
 * @param {Array<{customId: string, userText: string, systemText?: string, model: string}>} jobs
 *   Requests to submit. Ignored for submission when resuming.
 * @param {object} opts Batch options (`baseUrl`, `authToken`, `maxTokens`,
 *   `resumeBatchId`, `onSubmitted`, `onProgress`, plus the polling knobs that
 *   are forwarded to `pollUntilDone`).
 * @returns {Promise<Map<string, {raw: string, costUsd: number, inputTokens: number, outputTokens: number}>>}
 * @throws {Error} On a non-2xx upload/create/poll/download response, on a
 *   response missing the file or batch id, when the batch ends as
 *   failed/expired/cancelled, when it completes without any result file, or
 *   when polling gives up.
 */
async function runOpenAIBatch(jobs, opts) {
    const baseUrl = (opts.baseUrl ?? "https://api.openai.com").replace(/\/$/, "");
    const authToken = opts.authToken ?? process.env.ANTHROPIC_AUTH_TOKEN ?? process.env.ANTHROPIC_API_KEY ?? "";
    const authHeaders = { "Authorization": `Bearer ${authToken}` };
    let batchId = opts.resumeBatchId;
    if (!batchId) {
        // Build the JSONL payload and upload as a file.
        const jsonl = jobs.map((j) => {
            const messages = [];
            if (j.systemText)
                messages.push({ role: "system", content: j.systemText });
            messages.push({ role: "user", content: j.userText });
            return JSON.stringify({
                custom_id: j.customId,
                method: "POST",
                url: "/v1/chat/completions",
                body: { model: j.model, max_tokens: opts.maxTokens ?? 4096, messages },
            });
        }).join("\n");
        const form = new FormData();
        form.append("purpose", "batch");
        form.append("file", new Blob([jsonl], { type: "application/jsonl" }), "batch-input.jsonl");
        const fileRes = await fetch(`${baseUrl}/v1/files`, { method: "POST", headers: authHeaders, body: form });
        if (!fileRes.ok) {
            const body = await fileRes.text().catch(() => "");
            throw new Error(`Batch file-upload failed: HTTP ${fileRes.status} at ${baseUrl}/v1/files. ` +
                `This provider may not support OpenAI-compatible batch. Response: ${body.slice(0, 300)}`);
        }
        const fileData = await fileRes.json();
        if (!fileData?.id)
            throw new Error(`Batch file-upload at ${baseUrl}/v1/files returned no file id`);
        const createRes = await fetch(`${baseUrl}/v1/batches`, {
            method: "POST",
            headers: { ...authHeaders, "Content-Type": "application/json" },
            body: JSON.stringify({ input_file_id: fileData.id, endpoint: "/v1/chat/completions", completion_window: "24h" }),
        });
        if (!createRes.ok)
            throw new Error(`OpenAI-compat batch create: HTTP ${createRes.status} ${await createRes.text()}`);
        const createData = await createRes.json();
        // Without an id every later URL would point at ".../batches/undefined".
        if (!createData?.id)
            throw new Error("OpenAI-compat batch create: response did not include a batch id");
        batchId = createData.id;
        opts.onSubmitted?.(batchId, "openai-compatible");
    }
    opts.onProgress?.({ provider: "openai-compatible", batchId, phase: "submitted", total: jobs.length });
    const endedAt = await pollUntilDone(async () => {
        const res = await fetch(`${baseUrl}/v1/batches/${batchId}`, { headers: authHeaders });
        if (!res.ok)
            throw new Error(`OpenAI-compat batch poll: HTTP ${res.status}`);
        const d = await res.json();
        opts.onProgress?.({
            provider: "openai-compatible",
            batchId: batchId,
            phase: "polling",
            succeeded: d.request_counts?.completed,
            failed: d.request_counts?.failed,
            total: d.request_counts?.total ?? jobs.length,
        });
        if (d.status === "completed")
            return d;
        if (d.status === "failed" || d.status === "expired" || d.status === "cancelled") {
            throw new Error(`OpenAI-compat batch ${d.status}`);
        }
        return null;
    }, opts);
    // Successful requests land in the output file, failed ones in the error file.
    // A batch whose requests all failed has only the latter, so read both.
    const resultFileIds = [endedAt.output_file_id, endedAt.error_file_id].filter(Boolean);
    if (resultFileIds.length === 0)
        throw new Error("OpenAI-compat batch completed with no output_file_id");
    opts.onProgress?.({ provider: "openai-compatible", batchId, phase: "downloading" });
    const out = new Map();
    const succeededIds = new Set();
    for (const fileId of resultFileIds) {
        const contentRes = await fetch(`${baseUrl}/v1/files/${fileId}/content`, { headers: authHeaders });
        if (!contentRes.ok)
            throw new Error(`OpenAI-compat batch download: HTTP ${contentRes.status}`);
        const text = await contentRes.text();
        for (const line of text.split("\n")) {
            if (!line.trim())
                continue;
            const row = JSON.parse(line);
            const body = row.response?.body;
            const status = row.response?.status_code;
            // A row is a failure when it carries a transport-level error, has no
            // response, or the wrapped HTTP response itself reports an error.
            const failed = Boolean(row.error)
                || !row.response
                || (typeof status === "number" && status >= 400)
                || Boolean(body?.error);
            if (failed) {
                const detail = row.error?.message ?? body?.error?.message ?? "unknown";
                out.set(row.custom_id, { raw: `batch error: ${detail}`, costUsd: 0, inputTokens: 0, outputTokens: 0 });
                succeededIds.delete(row.custom_id);
                continue;
            }
            const raw = body?.choices?.[0]?.message?.content ?? "";
            const inp = body?.usage?.prompt_tokens ?? 0;
            const outp = body?.usage?.completion_tokens ?? 0;
            // Fixed per-token rates, halved for the batch discount; not model-specific.
            out.set(row.custom_id, { raw, costUsd: (inp * 0.000003 + outp * 0.000015) * 0.5, inputTokens: inp, outputTokens: outp });
            succeededIds.add(row.custom_id);
        }
    }
    // Report real successes; failed rows are in `out` too, so `out.size` would overcount.
    opts.onProgress?.({
        provider: "openai-compatible",
        batchId,
        phase: "done",
        succeeded: succeededIds.size,
        failed: out.size - succeededIds.size,
        total: jobs.length,
    });
    return out;
}
|
|
202
|
-
// ── Shared poll loop ───────────────────────────────────────────────────────
|
|
203
|
-
/**
 * Call `check` repeatedly with exponential backoff until it yields a result.
 *
 * `check` is always invoked at least once, so an already-finished batch is
 * picked up even when the timeout is zero. Each sleep is clamped to the time
 * left, so the loop does a final check at the deadline instead of overshooting
 * it by up to one full poll interval.
 *
 * @param {() => Promise<any>} check Resolves to a non-null value when done,
 *   `null`/`undefined` to keep polling. Rejections propagate to the caller.
 * @param {{pollStartMs?: number, pollMaxMs?: number, batchTimeoutMs?: number}} opts
 *   Initial delay (default 30s), delay cap (default 5min) and overall timeout
 *   (default 24h).
 * @returns {Promise<any>} The first non-null value produced by `check`.
 * @throws {Error} When the timeout elapses without `check` producing a result.
 */
async function pollUntilDone(check, opts) {
    const deadline = Date.now() + (opts.batchTimeoutMs ?? 24 * 60 * 60 * 1000);
    const maxDelay = opts.pollMaxMs ?? 5 * 60_000;
    let delay = opts.pollStartMs ?? 30_000;
    for (;;) {
        const result = await check();
        if (result != null)
            return result;
        const remaining = deadline - Date.now();
        if (remaining <= 0)
            break;
        // Never sleep past the deadline; the next iteration does the final check.
        await new Promise((resolve) => setTimeout(resolve, Math.min(delay, remaining)));
        delay = Math.min(maxDelay, delay * 2);
    }
    throw new Error("Batch exceeded batchTimeoutMs without completing");
}
|