claude-overnight 1.58.0 → 1.60.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/evolve-subcommands.d.ts +3 -0
- package/dist/bin/evolve-subcommands.js +234 -0
- package/dist/bin/evolve.js +23 -63
- package/dist/core/_version.d.ts +1 -1
- package/dist/core/_version.js +1 -1
- package/dist/prompt-evolution/evaluator-judge.d.ts +1 -6
- package/dist/prompt-evolution/evaluator-judge.js +2 -62
- package/dist/prompt-evolution/evaluator.d.ts +0 -22
- package/dist/prompt-evolution/evaluator.js +20 -120
- package/dist/prompt-evolution/index.d.ts +0 -8
- package/dist/prompt-evolution/index.js +0 -18
- package/dist/prompt-evolution/persistence.d.ts +0 -20
- package/dist/prompt-evolution/persistence.js +0 -39
- package/dist/prompts/load.d.ts +1 -0
- package/dist/prompts/load.js +1 -1
- package/package.json +1 -1
- package/plugins/claude-overnight/.claude-plugin/plugin.json +1 -1
- package/dist/prompt-evolution/transport-batch.d.ts +0 -62
- package/dist/prompt-evolution/transport-batch.js +0 -235
|
@@ -0,0 +1,3 @@
|
|
|
1
|
+
/**
 * Print a per-variant gmean comparison table for two persisted runs.
 * Prints a usage message and exits with code 2 when either run id is missing.
 */
export declare function runDiff(runIdA: string | undefined, runIdB: string | undefined): Promise<void>;
|
|
2
|
+
/**
 * Download a remote run into the local run directory.
 * `rest` carries the remaining CLI arguments: `--base-url <url>` (required),
 * `--token <token>` and `--project <id>` (both optional).
 * Prints a usage message and exits with code 2 when the run id or base URL is missing.
 */
export declare function runDownload(runIdArg?: string, ...rest: string[]): Promise<void>;
|
|
3
|
+
/**
 * Write a variant of a persisted run back into its source prompt file.
 * `rest` carries the remaining CLI arguments: `--variant <id>` and `--into <block>`
 * (both optional). Prints a usage message and exits with code 2 when the run id is missing.
 */
export declare function runPromote(runIdArg?: string, ...rest: string[]): Promise<void>;
|
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
/**
 * Print a markdown-style table comparing the gmean of every variant in two
 * persisted runs.
 *
 * For each run only the row with the highest generation per variantId is
 * kept, so the table compares the final state of each variant. Variants are
 * listed in default string sort order.
 *
 * @param runIdA id of the baseline run (column "A")
 * @param runIdB id of the run compared against it (column "B")
 */
export async function runDiff(runIdA, runIdB) {
    if (!runIdA || !runIdB) {
        console.error("usage: claude-overnight-evolve diff <runIdA> <runIdB>");
        process.exit(2);
    }
    const { loadRun } = await import("../prompt-evolution/persistence.js");
    const a = loadRun(runIdA);
    const b = loadRun(runIdB);

    // Reduce a run's matrix to one entry per variant: the latest generation seen.
    const latestByVariant = (run) => {
        const latest = new Map();
        for (const rec of run.matrix) {
            const seen = latest.get(rec.variantId);
            if (seen && !(rec.generation > seen.generation)) {
                continue;
            }
            latest.set(rec.variantId, { generation: rec.generation, variantId: rec.variantId, gmean: rec.gmean });
        }
        return latest;
    };
    // gmean is stored as a fraction; the table shows it as a percentage with one decimal.
    const asPercent = (value) => (value * 100).toFixed(1);

    const rowsA = latestByVariant(a);
    const rowsB = latestByVariant(b);
    const ids = new Set([...rowsA.keys(), ...rowsB.keys()]);

    console.log(`# Diff: ${runIdA} → ${runIdB}`);
    console.log("");
    console.log(`| Variant | A gmean | B gmean | Δ | note |`);
    console.log(`|-----------|-----------|-----------|-------|--------|`);

    for (const id of [...ids].sort()) {
        const ra = rowsA.get(id);
        const rb = rowsB.get(id);
        const ga = ra ? asPercent(ra.gmean) : "—";
        const gb = rb ? asPercent(rb.gmean) : "—";
        const delta = ra && rb ? asPercent(rb.gmean - ra.gmean) : "—";
        let note;
        if (!ra) {
            note = "new in B";
        }
        else if (!rb) {
            note = "missing in B";
        }
        else if (ra.gmean < rb.gmean) {
            note = "↑";
        }
        else if (ra.gmean > rb.gmean) {
            note = "↓";
        }
        else {
            note = "=";
        }
        console.log(`| ${id.padEnd(10)}| ${ga.padStart(9)} | ${gb.padStart(9)} | ${delta.padStart(5)} | ${note} |`);
    }
}
|
|
38
|
+
/**
 * Download a remote prompt-evolution run into the local run directory.
 *
 * Steps:
 *  1. Parse `--base-url` (required), `--token` and `--project` from `rest`.
 *  2. Fetch the run metadata and write a normalized `meta.json`.
 *  3. Write an inline report, if the metadata response carries one.
 *  4. Fetch the file listing (falling back to a fixed list of known files)
 *     and download each file.
 *  5. For every variantId found in `matrix.jsonl`, fetch its prompt file
 *     if it was not already downloaded.
 *
 * File names come from the remote server, so each one is resolved against
 * the run directory and skipped if it would land outside it.
 *
 * @param runIdArg id of the run to download; usage is printed and the process exits with code 2 when missing
 * @param rest remaining CLI arguments (flags and their values)
 */
export async function runDownload(runIdArg, ...rest) {
    if (!runIdArg) {
        console.error("usage: claude-overnight-evolve download <runId> --base-url <url> [--token <token>] [--project <id>]");
        process.exit(2);
    }
    const runId = runIdArg;
    let baseUrl;
    let token;
    let projectId;
    for (let i = 0; i < rest.length; i++) {
        if (rest[i] === "--base-url" && rest[i + 1]) {
            baseUrl = rest[i + 1];
            i++;
        }
        else if (rest[i] === "--token" && rest[i + 1]) {
            token = rest[i + 1];
            i++;
        }
        else if (rest[i] === "--project" && rest[i + 1]) {
            projectId = rest[i + 1];
            i++;
        }
    }
    if (!baseUrl) {
        console.error("--base-url is required (e.g. https://fornace.net or http://localhost:8787)");
        process.exit(2);
    }
    const authHeaders = {};
    if (token)
        authHeaders.Authorization = `Bearer ${token}`;
    // A project id selects the project-scoped API route; otherwise the plain /runs route is used.
    const origin = baseUrl.replace(/\/$/, "");
    const prefix = projectId
        ? `${origin}/api/projects/${projectId}/prompt-evolution/${runId}`
        : `${origin}/runs/${runId}`;
    const metaRes = await fetch(prefix, { headers: authHeaders });
    if (!metaRes.ok) {
        console.error(`Failed to fetch run metadata: HTTP ${metaRes.status}`);
        process.exit(1);
    }
    // A JSON body of `null` or a primitive would make every property read below throw.
    const rawBody = await metaRes.json();
    const metaBody = rawBody !== null && typeof rawBody === "object" ? rawBody : {};
    // The metadata may be nested under `meta` or be the response body itself.
    const remoteMeta = typeof metaBody.meta === "object" && metaBody.meta
        ? metaBody.meta
        : metaBody;
    const { runDir } = await import("../prompt-evolution/persistence.js");
    const { existsSync, mkdirSync, readFileSync, writeFileSync } = await import("node:fs");
    const { dirname, join, resolve, sep } = await import("node:path");
    const localDir = runDir(runId);
    const localRoot = resolve(localDir);
    // Resolve a server-supplied relative name; returns undefined when the
    // result would escape the run directory ("../x", absolute paths, ...).
    const resolveInsideRun = (name) => {
        const target = resolve(localRoot, name);
        return target.startsWith(localRoot + sep) ? target : undefined;
    };
    mkdirSync(localDir, { recursive: true });
    mkdirSync(join(localDir, "prompts"), { recursive: true });
    const meta = {
        runId,
        promptPath: (remoteMeta.promptPath ?? remoteMeta.prompt ?? ""),
        target: (remoteMeta.target ?? "claude-overnight"),
        evalModel: (remoteMeta.evalModel ?? ""),
        mutateModel: (remoteMeta.mutateModel ?? remoteMeta.evalModel ?? ""),
        generations: (remoteMeta.generations ?? 10),
        populationCap: (remoteMeta.populationCap ?? remoteMeta.population ?? 8),
        startedAt: (remoteMeta.startedAt ?? remoteMeta.queuedAt ?? new Date().toISOString()),
        status: (remoteMeta.status ?? "done"),
        caseNames: [],
    };
    writeFileSync(join(localDir, "meta.json"), JSON.stringify(meta, null, 2) + "\n");
    const inlineReport = typeof metaBody.report === "string" ? metaBody.report : metaBody.report_md;
    if (typeof inlineReport === "string") {
        writeFileSync(join(localDir, "report.md"), inlineReport);
        console.log(" ✓ report.md (inline)");
    }
    const listRes = await fetch(`${prefix}/files`, { headers: authHeaders });
    let files = [];
    if (listRes.ok) {
        const listBody = await listRes.json();
        // Only accept a real array of strings; anything else is treated as an empty listing.
        files = Array.isArray(listBody?.files)
            ? listBody.files.filter((name) => typeof name === "string")
            : [];
    }
    else {
        console.log(` ⚠ File listing not available (HTTP ${listRes.status}); trying known files...`);
        files = ["report.md", "best.md", "matrix.jsonl", "learning.jsonl"];
    }
    for (const file of files) {
        const localPath = resolveInsideRun(file);
        if (!localPath) {
            console.error(` ⚠ ${file}: skipped (path escapes run directory)`);
            continue;
        }
        const fileRes = await fetch(`${prefix}/files/${encodeURIComponent(file)}`, { headers: authHeaders });
        if (!fileRes.ok) {
            console.error(` ⚠ ${file}: HTTP ${fileRes.status}`);
            continue;
        }
        const data = Buffer.from(await fileRes.arrayBuffer());
        mkdirSync(dirname(localPath), { recursive: true });
        writeFileSync(localPath, data);
        console.log(` ✓ ${file}`);
    }
    // Second pass: fetch prompt files for variants named in the matrix that
    // the listing did not already provide.
    const matrixPath = join(localDir, "matrix.jsonl");
    if (existsSync(matrixPath)) {
        const variantIds = new Set();
        for (const line of readFileSync(matrixPath, "utf-8").trim().split("\n")) {
            if (!line)
                continue;
            try {
                const row = JSON.parse(line);
                // Non-string ids cannot be turned into a file name; skip them.
                if (typeof row.variantId === "string" && row.variantId)
                    variantIds.add(row.variantId);
            }
            catch { /* malformed matrix rows are skipped on purpose (best-effort scan) */ }
        }
        for (const vid of variantIds) {
            // Same sanitization as the promote subcommand uses for variant file names.
            const safeId = vid.replace(/[^a-zA-Z0-9_-]/g, "_");
            const promptFile = `prompts/${safeId}.md`;
            const promptPathLocal = join(localDir, promptFile);
            if (existsSync(promptPathLocal))
                continue;
            const fileRes = await fetch(`${prefix}/files/${encodeURIComponent(promptFile)}`, { headers: authHeaders });
            if (!fileRes.ok)
                continue;
            const data = Buffer.from(await fileRes.arrayBuffer());
            mkdirSync(dirname(promptPathLocal), { recursive: true });
            writeFileSync(promptPathLocal, data);
            console.log(` ✓ ${promptFile} (from matrix)`);
        }
    }
    console.log(`\nDownloaded to ${localDir}`);
}
|
|
156
|
+
/**
 * Promote one variant of a persisted run into its source prompt file.
 *
 * The variant is chosen in this order: the `--variant` flag, the variantId
 * recorded in the run's best.md, then the matrix row with the highest gmean.
 * The target marker block is the `--into` flag, or the variant's own name
 * when it is one of the named variants. The variant file's leading
 * `<!-- generation=... -->` header is stripped before it is written.
 *
 * @param runIdArg id of the run to promote from; usage is printed and the process exits with code 2 when missing
 * @param rest remaining CLI arguments (`--variant <id>`, `--into <block>`)
 */
export async function runPromote(runIdArg, ...rest) {
    if (!runIdArg) {
        console.error("usage: claude-overnight-evolve promote <runId> [--variant <id>] [--into <block>]");
        process.exit(2);
    }
    const runId = runIdArg;
    let variantId;
    let intoBlock;
    for (let idx = 0; idx < rest.length; idx++) {
        const value = rest[idx + 1];
        // A flag without a (non-empty) value is ignored, as is any unknown argument.
        if (!value) {
            continue;
        }
        switch (rest[idx]) {
            case "--variant":
                variantId = value;
                idx++;
                break;
            case "--into":
                intoBlock = value;
                idx++;
                break;
        }
    }
    const { loadRun, runDir } = await import("../prompt-evolution/persistence.js");
    const { PROMPTS_ROOT } = await import("../prompts/load.js");
    const { readFileSync, writeFileSync, existsSync } = await import("node:fs");
    const { join } = await import("node:path");
    const run = loadRun(runId);
    const promptPath = run.meta.promptPath;

    // Fallback 1: the variantId cell recorded in best.md.
    const fromBestMd = () => {
        const hit = run.bestMd.match(/variantId\s*\|\s*`([^`]+)`/);
        return hit ? hit[1] : undefined;
    };
    // Fallback 2: the matrix row with the highest gmean.
    const fromMatrix = () => {
        const rows = run.matrix;
        return rows.length
            ? [...rows].sort((a, b) => b.gmean - a.gmean)[0].variantId
            : undefined;
    };
    const sourceVariant = variantId || fromBestMd() || fromMatrix();
    if (!sourceVariant) {
        console.error("Could not determine best variant for run. Use --variant <id>.");
        process.exit(2);
    }

    const safeId = sourceVariant.replace(/[^a-zA-Z0-9_-]/g, "_");
    const variantFile = join(runDir(runId), "prompts", `${safeId}.md`);
    if (!existsSync(variantFile)) {
        console.error(`Variant file not found: ${variantFile}`);
        process.exit(2);
    }
    const rawVariant = readFileSync(variantFile, "utf-8");
    const variantText = rawVariant.replace(/^<!--\s*generation=[\s\S]*?-->\n\n?/, "");

    const namedVariants = ["tight", "standard", "large", "wrap", "amend", "wave", "run", "file", "all", "postfailed", "nofiles"];
    let targetBlock = intoBlock;
    if (targetBlock == null && namedVariants.includes(sourceVariant.toLowerCase())) {
        targetBlock = sourceVariant;
    }
    if (!targetBlock) {
        console.error(`Variant "${sourceVariant}" is not a named seed variant. Use --into <block> to specify which marker block to overwrite.`);
        process.exit(2);
    }

    const promptFile = join(PROMPTS_ROOT, `${promptPath}.md`);
    if (!existsSync(promptFile)) {
        console.error(`Prompt file not found: ${promptFile}`);
        process.exit(2);
    }
    const currentText = readFileSync(promptFile, "utf-8");
    writeFileSync(promptFile, replaceVariantBlock(currentText, targetBlock, variantText));
    console.log(`Promoted ${sourceVariant} → ${promptPath} (<!-- ${targetBlock.toUpperCase()} -->)`);
    console.log(` file: ${promptFile}`);
}
|
|
217
|
+
/**
 * Replace the content of one marker block inside a multi-variant prompt file.
 *
 * The file is split into sections on the `<!-- @@@ -->` separator line. In
 * every section containing a line that matches the block marker
 * (`<!-- NAME -->`, optionally with leading dashes before the name, matched
 * case-insensitively), everything after the marker line is replaced by
 * `newText`; lines up to and including the marker are kept.
 *
 * @param fileText full text of the prompt file
 * @param blockName marker name to look for; matched literally
 * @param newText replacement content; surrounding whitespace is trimmed
 * @returns the file text with the block content replaced
 * @throws Error when no section contains the marker
 */
function replaceVariantBlock(fileText, blockName, newText) {
    const separator = "\n<!-- @@@ -->\n";
    const sections = fileText.split(separator);
    // The name can come from a CLI flag: escape regex metacharacters so names
    // such as "c++" or "a.b" match literally instead of throwing or over-matching.
    const escapedName = blockName.toUpperCase().replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    const markerRegex = new RegExp(`<!--\\s*(?:[─\\-]+\\s*)?${escapedName}\\s*-->`, "i");
    let found = false;
    const newSections = sections.map((section) => {
        const lines = section.split("\n");
        const markerIndex = lines.findIndex((line) => markerRegex.test(line));
        if (markerIndex === -1)
            return section;
        found = true;
        // Keep everything up to and including the marker line; drop the old body.
        const before = lines.slice(0, markerIndex + 1);
        return [...before, "", newText.trim(), ""].join("\n").trimEnd() + "\n";
    });
    if (!found)
        throw new Error(`Variant block "${blockName.toUpperCase()}" not found in prompt file`);
    return newSections.join(separator);
}
|
package/dist/bin/evolve.js
CHANGED
|
@@ -19,6 +19,7 @@ import { evolvePrompt } from "../prompt-evolution/index.js";
|
|
|
19
19
|
import { PLAN_CASES } from "../prompt-evolution/fixtures/plan-cases.js";
|
|
20
20
|
import { harvestRealCases } from "../prompt-evolution/fixtures/harvest.js";
|
|
21
21
|
import { generateCases } from "../prompt-evolution/fixtures/generate.js";
|
|
22
|
+
import { runDiff, runDownload, runPromote } from "./evolve-subcommands.js";
|
|
22
23
|
import { scenariosToCases, PLANNING_SCENARIOS, REVIEW_SCENARIOS, SUPERVISION_SCENARIOS, STUCK_SCENARIOS, hydrateCases, extractPrompt, } from "../prompt-evolution/adapters/mcp-browser.js";
|
|
23
24
|
function help() {
|
|
24
25
|
process.stdout.write(`Usage: claude-overnight-evolve [options]
|
|
@@ -39,12 +40,6 @@ Options:
|
|
|
39
40
|
--plateau <n> Stop early if no improvement for N generations (default: 3)
|
|
40
41
|
--reps <n> Repetitions per (variant, case, model) for noise floor (default: 1)
|
|
41
42
|
--concurrency <n> Max in-flight eval calls (default: 8; bump for slow endpoints)
|
|
42
|
-
--batch Use provider batch API (50% cheaper, slower wall-clock)
|
|
43
|
-
--batch-base-url <url> Override base URL for batch only (e.g. api.moonshot.ai/v1
|
|
44
|
-
when online uses api.kimi.com/coding)
|
|
45
|
-
--batch-auth-token <t> Override auth token for batch only
|
|
46
|
-
--batch-model <model> Override model for batch only (e.g. "kimi-k2.6" for
|
|
47
|
-
Moonshot platform when online uses "kimi-for-coding")
|
|
48
43
|
--adaptive-cap <n> Adaptive sampling: extend reps up to N when σ > threshold (default: off)
|
|
49
44
|
--adaptive-threshold <x> σ threshold that triggers an extra rep (default: 0.1)
|
|
50
45
|
--judge Use llm-judge for content scoring (costs extra API calls)
|
|
@@ -63,6 +58,18 @@ Options:
|
|
|
63
58
|
--gen-model <model> Model used by the case generator (default: eval-model)
|
|
64
59
|
|
|
65
60
|
Subcommands:
|
|
61
|
+
claude-overnight-evolve download <runId> --base-url <url> [--token <token>]
|
|
62
|
+
[--project <id>]
|
|
63
|
+
Pull a remote run (fornace or self-host) into the local
|
|
64
|
+
~/.claude-overnight/prompt-evolution/<runId>/ directory
|
|
65
|
+
so you can audit, diff, or promote it offline. Use
|
|
66
|
+
--project for fornace; omit for self-host.
|
|
67
|
+
claude-overnight-evolve promote <runId> [--variant <id>] [--into <block>]
|
|
68
|
+
Write a run's winning variant back into the source
|
|
69
|
+
prompt file's <!-- BLOCK --> marker. If --variant is
|
|
70
|
+
omitted, uses the run's best variant. If the variant is
|
|
71
|
+
a seed (tight/standard/large) --into defaults to its
|
|
72
|
+
name; evo-* or default variants require --into.
|
|
66
73
|
claude-overnight-evolve diff <runIdA> <runIdB>
|
|
67
74
|
Print a per-variant diff of two persisted runs
|
|
68
75
|
--base-url <url> API base URL override
|
|
@@ -85,7 +92,6 @@ function parseArgs() {
|
|
|
85
92
|
population: 8,
|
|
86
93
|
plateau: 3,
|
|
87
94
|
reps: 1,
|
|
88
|
-
batch: false,
|
|
89
95
|
useJudge: false,
|
|
90
96
|
judgeTopN: 4,
|
|
91
97
|
cases: "",
|
|
@@ -143,21 +149,6 @@ function parseArgs() {
|
|
|
143
149
|
opts.concurrency = parseInt(v, 10);
|
|
144
150
|
i++;
|
|
145
151
|
break;
|
|
146
|
-
case "--batch":
|
|
147
|
-
opts.batch = true;
|
|
148
|
-
break;
|
|
149
|
-
case "--batch-base-url":
|
|
150
|
-
opts.batchBaseUrl = v;
|
|
151
|
-
i++;
|
|
152
|
-
break;
|
|
153
|
-
case "--batch-auth-token":
|
|
154
|
-
opts.batchAuthToken = v;
|
|
155
|
-
i++;
|
|
156
|
-
break;
|
|
157
|
-
case "--batch-model":
|
|
158
|
-
opts.batchModel = v;
|
|
159
|
-
i++;
|
|
160
|
-
break;
|
|
161
152
|
case "--adaptive-cap":
|
|
162
153
|
opts.adaptiveCap = parseInt(v, 10);
|
|
163
154
|
i++;
|
|
@@ -230,6 +221,16 @@ function parseArgs() {
|
|
|
230
221
|
return opts;
|
|
231
222
|
}
|
|
232
223
|
async function main() {
|
|
224
|
+
// Subcommand: download a remote run for local audit/promote.
|
|
225
|
+
if (process.argv[2] === "download") {
|
|
226
|
+
await runDownload(process.argv[3], ...process.argv.slice(4));
|
|
227
|
+
return;
|
|
228
|
+
}
|
|
229
|
+
// Subcommand: promote a run variant back into the source prompt file.
|
|
230
|
+
if (process.argv[2] === "promote") {
|
|
231
|
+
await runPromote(process.argv[3], ...process.argv.slice(4));
|
|
232
|
+
return;
|
|
233
|
+
}
|
|
233
234
|
// Subcommand: diff two persisted runs.
|
|
234
235
|
if (process.argv[2] === "diff") {
|
|
235
236
|
await runDiff(process.argv[3], process.argv[4]);
|
|
@@ -347,10 +348,6 @@ async function evolveOne(opts) {
|
|
|
347
348
|
plateauGenerations: opts.plateau,
|
|
348
349
|
repetitions: opts.reps > 1 ? opts.reps : undefined,
|
|
349
350
|
concurrency: opts.concurrency,
|
|
350
|
-
batch: opts.batch || undefined,
|
|
351
|
-
batchBaseUrl: opts.batchBaseUrl,
|
|
352
|
-
batchAuthToken: opts.batchAuthToken,
|
|
353
|
-
batchModel: opts.batchModel,
|
|
354
351
|
adaptiveReps: opts.adaptiveCap
|
|
355
352
|
? { cap: opts.adaptiveCap, threshold: opts.adaptiveThreshold }
|
|
356
353
|
: undefined,
|
|
@@ -383,43 +380,6 @@ async function evolveOne(opts) {
|
|
|
383
380
|
console.log(result.bestVariant.text);
|
|
384
381
|
return result;
|
|
385
382
|
}
|
|
386
|
-
async function runDiff(runIdA, runIdB) {
|
|
387
|
-
if (!runIdA || !runIdB) {
|
|
388
|
-
console.error("usage: claude-overnight-evolve diff <runIdA> <runIdB>");
|
|
389
|
-
process.exit(2);
|
|
390
|
-
}
|
|
391
|
-
const { loadRun } = await import("../prompt-evolution/persistence.js");
|
|
392
|
-
const a = loadRun(runIdA);
|
|
393
|
-
const b = loadRun(runIdB);
|
|
394
|
-
const collect = (run) => {
|
|
395
|
-
const out = new Map();
|
|
396
|
-
for (const rec of run.matrix) {
|
|
397
|
-
// Keep the latest-generation row per variantId so diff compares final state.
|
|
398
|
-
const existing = out.get(rec.variantId);
|
|
399
|
-
if (!existing || rec.generation > existing.generation) {
|
|
400
|
-
out.set(rec.variantId, { generation: rec.generation, variantId: rec.variantId, gmean: rec.gmean });
|
|
401
|
-
}
|
|
402
|
-
}
|
|
403
|
-
return out;
|
|
404
|
-
};
|
|
405
|
-
const rowsA = collect(a);
|
|
406
|
-
const rowsB = collect(b);
|
|
407
|
-
const ids = new Set([...rowsA.keys(), ...rowsB.keys()]);
|
|
408
|
-
console.log(`# Diff: ${runIdA} → ${runIdB}`);
|
|
409
|
-
console.log("");
|
|
410
|
-
console.log(`| Variant | A gmean | B gmean | Δ | note |`);
|
|
411
|
-
console.log(`|-----------|-----------|-----------|-------|--------|`);
|
|
412
|
-
const sorted = [...ids].sort();
|
|
413
|
-
for (const id of sorted) {
|
|
414
|
-
const ra = rowsA.get(id);
|
|
415
|
-
const rb = rowsB.get(id);
|
|
416
|
-
const ga = ra ? (ra.gmean * 100).toFixed(1) : "—";
|
|
417
|
-
const gb = rb ? (rb.gmean * 100).toFixed(1) : "—";
|
|
418
|
-
const delta = ra && rb ? ((rb.gmean - ra.gmean) * 100).toFixed(1) : "—";
|
|
419
|
-
const note = !ra ? "new in B" : !rb ? "missing in B" : ra.gmean < rb.gmean ? "↑" : ra.gmean > rb.gmean ? "↓" : "=";
|
|
420
|
-
console.log(`| ${id.padEnd(10)}| ${ga.padStart(9)} | ${gb.padStart(9)} | ${delta.padStart(5)} | ${note} |`);
|
|
421
|
-
}
|
|
422
|
-
}
|
|
423
383
|
main().catch((err) => {
|
|
424
384
|
console.error(err);
|
|
425
385
|
process.exit(1);
|
package/dist/core/_version.d.ts
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
export declare const VERSION = "1.58.0";
|
|
1
|
+
export declare const VERSION = "1.60.0";
|
package/dist/core/_version.js
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
// Auto-generated by build — do not edit manually.
|
|
2
|
-
export const VERSION = "1.58.0";
|
|
2
|
+
export const VERSION = "1.60.0";
|
|
@@ -1,20 +1,15 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* LLM-judge pass over a built evaluation matrix.
|
|
3
3
|
*
|
|
4
|
-
* Split out of evaluator.ts to keep each file under the 500-line cap and
|
|
5
|
-
* because the judge has its own concerns (top-N eligibility, batch vs
|
|
6
|
-
* online path, crash-resumable state).
|
|
7
|
-
*
|
|
8
4
|
* The judge REPLACES the heuristic content score with a semantic grade.
|
|
9
5
|
* We only judge top-N variants per generation to cap cost — a judge call
|
|
10
6
|
* per (variant, case, model) on a large population explodes fast.
|
|
11
7
|
*/
|
|
12
8
|
import { type JudgeOpts } from "./llm-judge.js";
|
|
13
9
|
import type { BenchmarkCase, EvaluationResult } from "./types.js";
|
|
14
|
-
import type { EvalOpts } from "./evaluator.js";
|
|
15
10
|
export declare function runJudge(variants: Array<{
|
|
16
11
|
id: string;
|
|
17
12
|
text: string;
|
|
18
13
|
}>, cases: BenchmarkCase[], models: string[], aggregated: Map<string, EvaluationResult>, judge: JudgeOpts & {
|
|
19
14
|
topN?: number;
|
|
20
|
-
}
|
|
15
|
+
}): Promise<void>;
|
|
@@ -1,20 +1,14 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* LLM-judge pass over a built evaluation matrix.
|
|
3
3
|
*
|
|
4
|
-
* Split out of evaluator.ts to keep each file under the 500-line cap and
|
|
5
|
-
* because the judge has its own concerns (top-N eligibility, batch vs
|
|
6
|
-
* online path, crash-resumable state).
|
|
7
|
-
*
|
|
8
4
|
* The judge REPLACES the heuristic content score with a semantic grade.
|
|
9
5
|
* We only judge top-N variants per generation to cap cost — a judge call
|
|
10
6
|
* per (variant, case, model) on a large population explodes fast.
|
|
11
7
|
*/
|
|
12
|
-
import { judgeOutput
|
|
13
|
-
import { batchCallModel } from "./transport-batch.js";
|
|
14
|
-
import { saveBatchState, loadBatchState, markBatchFinished } from "./persistence.js";
|
|
8
|
+
import { judgeOutput } from "./llm-judge.js";
|
|
15
9
|
import { gmean } from "./scorer.js";
|
|
16
10
|
import { averageDimensions } from "./evaluator-utils.js";
|
|
17
|
-
export async function runJudge(variants, cases, models, aggregated, judge, opts) {
|
|
11
|
+
export async function runJudge(variants, cases, models, aggregated, judge) {
|
|
18
12
|
const topN = judge.topN ?? 4;
|
|
19
13
|
const variantGmeans = variants.map((v) => {
|
|
20
14
|
const scores = [];
|
|
@@ -45,10 +39,6 @@ export async function runJudge(variants, cases, models, aggregated, judge, opts)
|
|
|
45
39
|
}
|
|
46
40
|
if (cells.length === 0)
|
|
47
41
|
return;
|
|
48
|
-
if (opts.batch) {
|
|
49
|
-
await runJudgeBatch(cells, judge, opts);
|
|
50
|
-
return;
|
|
51
|
-
}
|
|
52
42
|
const jobs = cells.map((cell) => async () => {
|
|
53
43
|
try {
|
|
54
44
|
const jr = await judgeOutput(cell.r.rawOutput, cell.c, judge);
|
|
@@ -71,53 +61,3 @@ export async function runJudge(variants, cases, models, aggregated, judge, opts)
|
|
|
71
61
|
};
|
|
72
62
|
await Promise.all(Array.from({ length: Math.min(judgeConcurrency, jobs.length) }, judgeWorker));
|
|
73
63
|
}
|
|
74
|
-
async function runJudgeBatch(cells, judge, opts) {
|
|
75
|
-
const batchJobs = cells.map((cell, i) => ({
|
|
76
|
-
customId: `j:${i}|k:${cell.key}`,
|
|
77
|
-
userText: buildJudgePrompt(cell.r.rawOutput, cell.c),
|
|
78
|
-
model: judge.model,
|
|
79
|
-
}));
|
|
80
|
-
const existing = opts.runId != null && opts.generation != null
|
|
81
|
-
? loadBatchState(opts.runId, opts.generation, "judge")
|
|
82
|
-
: null;
|
|
83
|
-
const transport = opts.batchCallModel ?? batchCallModel;
|
|
84
|
-
const results = await transport(batchJobs, {
|
|
85
|
-
// Judge batch follows the same override hierarchy as eval batch: if a
|
|
86
|
-
// dedicated batch endpoint is set on EvalOpts, use it; else fall back
|
|
87
|
-
// to the judge's own endpoint or the main one.
|
|
88
|
-
baseUrl: opts.batchBaseUrl ?? judge.baseUrl ?? opts.baseUrl,
|
|
89
|
-
authToken: opts.batchAuthToken ?? judge.authToken ?? opts.authToken,
|
|
90
|
-
modelOverride: opts.batchModel,
|
|
91
|
-
maxTokens: judge.maxTokens ?? 2048,
|
|
92
|
-
resumeBatchId: existing?.batchId,
|
|
93
|
-
onSubmitted: (batchId, p) => {
|
|
94
|
-
if (opts.runId != null && opts.generation != null && !existing) {
|
|
95
|
-
saveBatchState(opts.runId, {
|
|
96
|
-
generation: opts.generation,
|
|
97
|
-
phase: "judge",
|
|
98
|
-
batchId,
|
|
99
|
-
provider: p,
|
|
100
|
-
submittedAt: new Date().toISOString(),
|
|
101
|
-
});
|
|
102
|
-
}
|
|
103
|
-
opts.onBatchProgress?.(`judge batch submitted: ${batchId} (${p})`);
|
|
104
|
-
},
|
|
105
|
-
onProgress: (p) => opts.onBatchProgress?.(`judge batch ${p.batchId} ${p.phase}${p.succeeded != null ? `: ${p.succeeded}/${p.total ?? batchJobs.length}` : ""}`),
|
|
106
|
-
});
|
|
107
|
-
if (opts.runId != null && existing)
|
|
108
|
-
markBatchFinished(opts.runId, existing.batchId);
|
|
109
|
-
for (const cell of cells) {
|
|
110
|
-
const customId = batchJobs.find((b) => b.customId.includes(`|k:${cell.key}`))?.customId;
|
|
111
|
-
const got = customId ? results.get(customId) : undefined;
|
|
112
|
-
if (!got || !got.raw)
|
|
113
|
-
continue;
|
|
114
|
-
try {
|
|
115
|
-
const jr = parseJudgeOutput(got.raw);
|
|
116
|
-
cell.r.scores = { ...cell.r.scores, content: jr.score };
|
|
117
|
-
cell.r.judgeJustification = jr.justification;
|
|
118
|
-
}
|
|
119
|
-
catch {
|
|
120
|
-
// Judge parse failure is non-fatal — keep heuristic content.
|
|
121
|
-
}
|
|
122
|
-
}
|
|
123
|
-
}
|
|
@@ -17,7 +17,6 @@
|
|
|
17
17
|
*/
|
|
18
18
|
import { type JudgeOpts } from "./llm-judge.js";
|
|
19
19
|
import { type CallModel } from "./transport.js";
|
|
20
|
-
import { batchCallModel } from "./transport-batch.js";
|
|
21
20
|
import type { BenchmarkCase, VariantRow, PromptVars } from "./types.js";
|
|
22
21
|
export interface EvalOpts {
|
|
23
22
|
/** Primary generator model (retained for single-model compat). */
|
|
@@ -51,29 +50,8 @@ export interface EvalOpts {
|
|
|
51
50
|
};
|
|
52
51
|
/** Transport override for tests. */
|
|
53
52
|
callModel?: CallModel;
|
|
54
|
-
/** Use provider batch API instead of online calls (50% cheaper, slower wall-clock). */
|
|
55
|
-
batch?: boolean;
|
|
56
|
-
/**
|
|
57
|
-
* Override base URL for batch submissions only — lets batch hit a
|
|
58
|
-
* different endpoint than online. Key use-case: Kimi users whose online
|
|
59
|
-
* traffic runs through api.kimi.com/coding (which has no batch) but
|
|
60
|
-
* whose batch traffic should go to api.moonshot.ai/v1.
|
|
61
|
-
*/
|
|
62
|
-
batchBaseUrl?: string;
|
|
63
|
-
/** Override auth token for batch when batchBaseUrl needs a different key. */
|
|
64
|
-
batchAuthToken?: string;
|
|
65
|
-
/** Override model for batch submissions (e.g., kimi-k2.6 when online uses kimi-for-coding). */
|
|
66
|
-
batchModel?: string;
|
|
67
|
-
/** Run id — required when batch=true so state is crash-resumable. */
|
|
68
|
-
runId?: string;
|
|
69
|
-
/** Current generation number — used to key batch state. */
|
|
70
|
-
generation?: number;
|
|
71
|
-
/** Batch-transport override for tests. Same return shape as transport-batch.batchCallModel. */
|
|
72
|
-
batchCallModel?: typeof batchCallModel;
|
|
73
53
|
/** Optional callback for progress */
|
|
74
54
|
onProgress?: (done: number, total: number, caseName: string, variantId: string) => void;
|
|
75
|
-
/** Progress callback specific to batch-phase transitions. */
|
|
76
|
-
onBatchProgress?: (msg: string) => void;
|
|
77
55
|
}
|
|
78
56
|
export declare function buildMatrix(variants: Array<{
|
|
79
57
|
id: string;
|
|
@@ -18,8 +18,6 @@
|
|
|
18
18
|
import { renderPrompt } from "../prompts/load.js";
|
|
19
19
|
import { scoreOutput, gmean, aggregateReps, bootstrapCI, kendallTau } from "./scorer.js";
|
|
20
20
|
import { defaultCallModel, attemptJsonParse, } from "./transport.js";
|
|
21
|
-
import { batchCallModel, detectBatchProvider, } from "./transport-batch.js";
|
|
22
|
-
import { saveBatchState, loadBatchState, markBatchFinished } from "./persistence.js";
|
|
23
21
|
import { averageDimensions } from "./evaluator-utils.js";
|
|
24
22
|
import { runJudge } from "./evaluator-judge.js";
|
|
25
23
|
export async function buildMatrix(variants, cases, opts) {
|
|
@@ -38,53 +36,30 @@ export async function buildMatrix(variants, cases, opts) {
|
|
|
38
36
|
}
|
|
39
37
|
}
|
|
40
38
|
}
|
|
41
|
-
//
|
|
42
|
-
//
|
|
43
|
-
// results as they arrive. 50% cheaper, slower wall-clock.
|
|
44
|
-
// batch=false — work-stealing pool: keep `concurrency` jobs in flight so
|
|
45
|
-
// a slow call doesn't block the others in its slice.
|
|
39
|
+
// Work-stealing pool: keep `concurrency` jobs in flight so a slow call
|
|
40
|
+
// doesn't block the others in its slice.
|
|
46
41
|
const rawByKey = new Map();
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
opts.onProgress?.(done, jobs.length, r.caseName, r.variantId);
|
|
62
|
-
}
|
|
63
|
-
};
|
|
64
|
-
await Promise.all(Array.from({ length: Math.min(concurrency, jobs.length) }, worker));
|
|
65
|
-
};
|
|
66
|
-
if (opts.batch) {
|
|
67
|
-
try {
|
|
68
|
-
await runBatchPath(jobs, opts, rawByKey);
|
|
69
|
-
}
|
|
70
|
-
catch (err) {
|
|
71
|
-
// Batch submission failed (Kimi's /v1/files doesn't match OpenAI,
|
|
72
|
-
// OpenRouter has no batch at all, transient provider error, etc.).
|
|
73
|
-
// Fall back to the online pool so the whole run doesn't die — losing
|
|
74
|
-
// the 50% batch discount is better than losing the run.
|
|
75
|
-
const msg = err instanceof Error ? err.message : String(err);
|
|
76
|
-
opts.onBatchProgress?.(`batch path failed, falling back to online: ${msg.slice(0, 200)}`);
|
|
77
|
-
rawByKey.clear(); // discard any partial state
|
|
78
|
-
await runOnlinePool();
|
|
42
|
+
let done = 0;
|
|
43
|
+
let next = 0;
|
|
44
|
+
const worker = async () => {
|
|
45
|
+
while (true) {
|
|
46
|
+
const i = next++;
|
|
47
|
+
if (i >= jobs.length)
|
|
48
|
+
return;
|
|
49
|
+
const r = await runSingle(jobs[i], opts, transport);
|
|
50
|
+
const key = `${r.variantId}:${r.caseHash}:${r.model ?? ""}`;
|
|
51
|
+
const arr = rawByKey.get(key) ?? [];
|
|
52
|
+
arr.push(r);
|
|
53
|
+
rawByKey.set(key, arr);
|
|
54
|
+
done++;
|
|
55
|
+
opts.onProgress?.(done, jobs.length, r.caseName, r.variantId);
|
|
79
56
|
}
|
|
80
|
-
}
|
|
81
|
-
|
|
82
|
-
await runOnlinePool();
|
|
83
|
-
}
|
|
57
|
+
};
|
|
58
|
+
await Promise.all(Array.from({ length: Math.min(concurrency, jobs.length) }, worker));
|
|
84
59
|
// Adaptive sampling: for cells where any score-dim σ exceeds threshold,
|
|
85
60
|
// add one more rep and rerun — up to `cap` total reps. Converges on a
|
|
86
61
|
// stable estimate without wasting reps on already-stable cells.
|
|
87
|
-
if (
|
|
62
|
+
if (opts.adaptiveReps) {
|
|
88
63
|
const cap = opts.adaptiveReps.cap;
|
|
89
64
|
const threshold = opts.adaptiveReps.threshold ?? 0.1;
|
|
90
65
|
for (let round = 0; round < cap - reps; round++) {
|
|
@@ -131,7 +106,7 @@ export async function buildMatrix(variants, cases, opts) {
|
|
|
131
106
|
}
|
|
132
107
|
// Optional llm-judge pass on top-N variants (by current heuristic content).
|
|
133
108
|
if (opts.judge)
|
|
134
|
-
await runJudge(variants, cases, models, aggregated, opts.judge, opts);
|
|
109
|
+
await runJudge(variants, cases, models, aggregated, opts.judge);
|
|
135
110
|
// Assemble rows: per-variant aggregate across all cases and models.
|
|
136
111
|
const rows = [];
|
|
137
112
|
for (const v of variants) {
|
|
@@ -244,81 +219,6 @@ function halfSplitMatrix(variants, cases, models, rawByKey, side) {
|
|
|
244
219
|
scored.sort((a, b) => b.g - a.g);
|
|
245
220
|
return scored.map((s) => s.id);
|
|
246
221
|
}
|
|
247
|
-
async function runBatchPath(jobs, opts, rawByKey) {
|
|
248
|
-
const provider = detectBatchProvider(opts.baseUrl);
|
|
249
|
-
if (provider === "unsupported") {
|
|
250
|
-
throw new Error(`Batch API not supported for baseUrl=${opts.baseUrl}; rerun without --batch or point at an Anthropic / OpenAI-compatible endpoint.`);
|
|
251
|
-
}
|
|
252
|
-
// Build custom_ids that route results back to the right cell. Index is
|
|
253
|
-
// included so reps of the same (variant, case, model) don't collide.
|
|
254
|
-
const keyed = jobs.map((job, i) => ({
|
|
255
|
-
job,
|
|
256
|
-
index: i,
|
|
257
|
-
customId: `v:${job.variantId}|h:${job.case.hash}|m:${job.model}|r:${job.rep}|i:${i}`,
|
|
258
|
-
}));
|
|
259
|
-
const batchJobs = keyed.map((k) => ({
|
|
260
|
-
customId: k.customId,
|
|
261
|
-
userText: k.job.text,
|
|
262
|
-
systemText: k.job.systemText,
|
|
263
|
-
model: k.job.model,
|
|
264
|
-
}));
|
|
265
|
-
const started = Date.now();
|
|
266
|
-
const existing = opts.runId != null && opts.generation != null
|
|
267
|
-
? loadBatchState(opts.runId, opts.generation, "eval")
|
|
268
|
-
: null;
|
|
269
|
-
const transport = opts.batchCallModel ?? batchCallModel;
|
|
270
|
-
const results = await transport(batchJobs, {
|
|
271
|
-
baseUrl: opts.batchBaseUrl ?? opts.baseUrl,
|
|
272
|
-
authToken: opts.batchAuthToken ?? opts.authToken,
|
|
273
|
-
modelOverride: opts.batchModel,
|
|
274
|
-
maxTokens: opts.maxTokens,
|
|
275
|
-
resumeBatchId: existing?.batchId,
|
|
276
|
-
onSubmitted: (batchId, p) => {
|
|
277
|
-
if (opts.runId != null && opts.generation != null && !existing) {
|
|
278
|
-
saveBatchState(opts.runId, {
|
|
279
|
-
generation: opts.generation,
|
|
280
|
-
phase: "eval",
|
|
281
|
-
batchId,
|
|
282
|
-
provider: p,
|
|
283
|
-
submittedAt: new Date().toISOString(),
|
|
284
|
-
});
|
|
285
|
-
}
|
|
286
|
-
opts.onBatchProgress?.(`batch submitted: ${batchId} (${p})`);
|
|
287
|
-
},
|
|
288
|
-
onProgress: (p) => {
|
|
289
|
-
if (p.phase === "polling") {
|
|
290
|
-
const ok = p.succeeded ?? 0;
|
|
291
|
-
const failed = p.failed ?? 0;
|
|
292
|
-
const total = p.total ?? batchJobs.length;
|
|
293
|
-
opts.onBatchProgress?.(`batch ${p.batchId} polling: ${ok}/${total} done${failed ? `, ${failed} failed` : ""}`);
|
|
294
|
-
}
|
|
295
|
-
else {
|
|
296
|
-
opts.onBatchProgress?.(`batch ${p.batchId} ${p.phase}`);
|
|
297
|
-
}
|
|
298
|
-
},
|
|
299
|
-
});
|
|
300
|
-
// Mark the state entry as finished so a crash after this point doesn't
|
|
301
|
-
// cause the next run to try resuming an already-consumed batch.
|
|
302
|
-
if (opts.runId != null && existing)
|
|
303
|
-
markBatchFinished(opts.runId, existing.batchId);
|
|
304
|
-
// Score each result and populate rawByKey the same way runSingle does.
|
|
305
|
-
const durationMs = Math.round((Date.now() - started) / Math.max(1, jobs.length));
|
|
306
|
-
let done = 0;
|
|
307
|
-
for (const k of keyed) {
|
|
308
|
-
const r = results.get(k.customId);
|
|
309
|
-
const raw = r?.raw ?? "batch returned no result for this custom_id";
|
|
310
|
-
const costUsd = r?.costUsd ?? 0;
|
|
311
|
-
const parsed = attemptJsonParse(raw);
|
|
312
|
-
const scored = scoreOutput(raw, parsed, costUsd, durationMs, k.job.case, { model: k.job.model });
|
|
313
|
-
scored.variantId = k.job.variantId;
|
|
314
|
-
const mapKey = `${scored.variantId}:${scored.caseHash}:${scored.model ?? ""}`;
|
|
315
|
-
const arr = rawByKey.get(mapKey) ?? [];
|
|
316
|
-
arr.push(scored);
|
|
317
|
-
rawByKey.set(mapKey, arr);
|
|
318
|
-
done++;
|
|
319
|
-
opts.onProgress?.(done, jobs.length, k.job.case.name, k.job.variantId);
|
|
320
|
-
}
|
|
321
|
-
}
|
|
322
222
|
async function runSingle(job, opts, transport) {
|
|
323
223
|
const started = Date.now();
|
|
324
224
|
const callOpts = {
|
|
@@ -54,14 +54,6 @@ export interface EvolveOpts {
|
|
|
54
54
|
repetitions?: number;
|
|
55
55
|
/** Max in-flight eval calls. Default 8. Raise for slow endpoints, lower for strict rate limits. */
|
|
56
56
|
concurrency?: number;
|
|
57
|
-
/** Use provider batch API instead of online calls. 50% cheaper, slower wall-clock. */
|
|
58
|
-
batch?: boolean;
|
|
59
|
-
/** Override base URL for batch submissions only. */
|
|
60
|
-
batchBaseUrl?: string;
|
|
61
|
-
/** Override auth token for batch submissions only. */
|
|
62
|
-
batchAuthToken?: string;
|
|
63
|
-
/** Override model for batch submissions (e.g. kimi-k2.6 when online uses kimi-for-coding). */
|
|
64
|
-
batchModel?: string;
|
|
65
57
|
/** Adaptive sampling cap (opt-in). Keeps adding reps to noisy cells up to this count. */
|
|
66
58
|
adaptiveReps?: {
|
|
67
59
|
cap: number;
|
|
@@ -73,17 +73,10 @@ export async function evolvePrompt(opts) {
|
|
|
73
73
|
concurrency: opts.concurrency ?? 8,
|
|
74
74
|
repetitions: opts.repetitions,
|
|
75
75
|
judge: opts.judge,
|
|
76
|
-
batch: opts.batch,
|
|
77
|
-
batchBaseUrl: opts.batchBaseUrl,
|
|
78
|
-
batchAuthToken: opts.batchAuthToken,
|
|
79
|
-
batchModel: opts.batchModel,
|
|
80
76
|
adaptiveReps: opts.adaptiveReps,
|
|
81
|
-
runId,
|
|
82
|
-
generation: gen,
|
|
83
77
|
onProgress: (done, total, caseName, variantId) => {
|
|
84
78
|
log(` [${done}/${total}] ${variantId.slice(0, 16)} → ${caseName}`);
|
|
85
79
|
},
|
|
86
|
-
onBatchProgress: (msg) => log(` [batch] ${msg}`),
|
|
87
80
|
};
|
|
88
81
|
const matrix = await buildMatrix(population, trainCases, evalOpts);
|
|
89
82
|
generationMatrices.push(matrix);
|
|
@@ -198,11 +191,7 @@ export async function evolvePrompt(opts) {
|
|
|
198
191
|
concurrency: opts.concurrency ?? 8,
|
|
199
192
|
repetitions: opts.repetitions,
|
|
200
193
|
judge: opts.judge,
|
|
201
|
-
batch: opts.batch,
|
|
202
194
|
adaptiveReps: opts.adaptiveReps,
|
|
203
|
-
runId,
|
|
204
|
-
generation: generations,
|
|
205
|
-
onBatchProgress: (msg) => log(` [batch] ${msg}`),
|
|
206
195
|
});
|
|
207
196
|
generationMatrices.push(finalMatrix);
|
|
208
197
|
snapshotPrompts(runId, finalMatrix);
|
|
@@ -222,14 +211,7 @@ export async function evolvePrompt(opts) {
|
|
|
222
211
|
authToken: opts.authToken,
|
|
223
212
|
concurrency: opts.concurrency ?? 8,
|
|
224
213
|
repetitions: opts.repetitions,
|
|
225
|
-
batch: opts.batch,
|
|
226
|
-
batchBaseUrl: opts.batchBaseUrl,
|
|
227
|
-
batchAuthToken: opts.batchAuthToken,
|
|
228
|
-
batchModel: opts.batchModel,
|
|
229
214
|
adaptiveReps: opts.adaptiveReps,
|
|
230
|
-
runId,
|
|
231
|
-
generation: generations + 1,
|
|
232
|
-
onBatchProgress: (msg) => log(` [batch-test] ${msg}`),
|
|
233
215
|
});
|
|
234
216
|
log(formatMatrix(testMatrix, testCases.map((c) => c.name)));
|
|
235
217
|
}
|
|
@@ -37,26 +37,6 @@ export declare function appendLearning(runId: string, entries: LearningEntry[]):
|
|
|
37
37
|
export declare function snapshotPrompts(runId: string, rows: VariantRow[]): void;
|
|
38
38
|
/** Finalise the run: write best.md and update meta.json. */
|
|
39
39
|
export declare function finalizeRun(runId: string, result: EvolutionResult, metaPartial?: Partial<RunMeta>): void;
|
|
40
|
-
/**
|
|
41
|
-
* Persist batch submission state so a crashed or restarted run can resume
|
|
42
|
-
* polling instead of resubmitting (which would duplicate the bill).
|
|
43
|
-
*
|
|
44
|
-
* Keyed by (generation, phase) so multi-generation runs and eval-vs-judge
|
|
45
|
-
* submissions don't collide. Written append-only — the latest entry wins
|
|
46
|
-
* on load.
|
|
47
|
-
*/
|
|
48
|
-
export interface BatchStateEntry {
|
|
49
|
-
generation: number;
|
|
50
|
-
phase: "eval" | "judge";
|
|
51
|
-
batchId: string;
|
|
52
|
-
provider: "anthropic" | "openai-compatible";
|
|
53
|
-
submittedAt: string;
|
|
54
|
-
/** If set, we've already collected results for this entry — ignore on resume. */
|
|
55
|
-
finishedAt?: string;
|
|
56
|
-
}
|
|
57
|
-
export declare function saveBatchState(runId: string, entry: BatchStateEntry): void;
|
|
58
|
-
export declare function loadBatchState(runId: string, generation: number, phase: "eval" | "judge"): BatchStateEntry | null;
|
|
59
|
-
export declare function markBatchFinished(runId: string, batchId: string): void;
|
|
60
40
|
/** List all runs, newest first. */
|
|
61
41
|
export declare function listRuns(): Array<{
|
|
62
42
|
runId: string;
|
|
@@ -118,45 +118,6 @@ ${result.learningLog.map((l) => `| ${l.generation} | ${l.mutationSummary} | ${(l
|
|
|
118
118
|
`;
|
|
119
119
|
writeFileSync(join(root, "best.md"), report);
|
|
120
120
|
}
|
|
121
|
-
export function saveBatchState(runId, entry) {
|
|
122
|
-
const path = join(runDir(runId), "batch-jobs.jsonl");
|
|
123
|
-
writeFileSync(path, JSON.stringify(entry) + "\n", { flag: "a" });
|
|
124
|
-
}
|
|
125
|
-
export function loadBatchState(runId, generation, phase) {
|
|
126
|
-
const path = join(runDir(runId), "batch-jobs.jsonl");
|
|
127
|
-
if (!existsSync(path))
|
|
128
|
-
return null;
|
|
129
|
-
const lines = readFileSync(path, "utf-8").split("\n").filter(Boolean);
|
|
130
|
-
let latest = null;
|
|
131
|
-
for (const line of lines) {
|
|
132
|
-
try {
|
|
133
|
-
const e = JSON.parse(line);
|
|
134
|
-
if (e.generation === generation && e.phase === phase)
|
|
135
|
-
latest = e;
|
|
136
|
-
}
|
|
137
|
-
catch { /* skip malformed */ }
|
|
138
|
-
}
|
|
139
|
-
// Only return if not yet finished — otherwise caller would re-poll a consumed batch.
|
|
140
|
-
return latest && !latest.finishedAt ? latest : null;
|
|
141
|
-
}
|
|
142
|
-
export function markBatchFinished(runId, batchId) {
|
|
143
|
-
const path = join(runDir(runId), "batch-jobs.jsonl");
|
|
144
|
-
if (!existsSync(path))
|
|
145
|
-
return;
|
|
146
|
-
const lines = readFileSync(path, "utf-8").split("\n").filter(Boolean);
|
|
147
|
-
const updated = lines.map((line) => {
|
|
148
|
-
try {
|
|
149
|
-
const e = JSON.parse(line);
|
|
150
|
-
if (e.batchId === batchId && !e.finishedAt) {
|
|
151
|
-
e.finishedAt = new Date().toISOString();
|
|
152
|
-
return JSON.stringify(e);
|
|
153
|
-
}
|
|
154
|
-
}
|
|
155
|
-
catch { /* skip */ }
|
|
156
|
-
return line;
|
|
157
|
-
});
|
|
158
|
-
writeFileSync(path, updated.join("\n") + "\n");
|
|
159
|
-
}
|
|
160
121
|
/** List all runs, newest first. */
|
|
161
122
|
export function listRuns() {
|
|
162
123
|
const root = storeRoot();
|
package/dist/prompts/load.d.ts
CHANGED
package/dist/prompts/load.js
CHANGED
|
@@ -2,7 +2,7 @@ import { readFileSync } from "node:fs";
|
|
|
2
2
|
import { dirname, join } from "node:path";
|
|
3
3
|
import { fileURLToPath } from "node:url";
|
|
4
4
|
// Resolve <pkg>/prompts whether running from dist/ (installed) or src/ (dev).
|
|
5
|
-
const PROMPTS_ROOT = (() => {
|
|
5
|
+
export const PROMPTS_ROOT = (() => {
|
|
6
6
|
const here = dirname(fileURLToPath(import.meta.url));
|
|
7
7
|
for (const depth of [2, 3, 4]) {
|
|
8
8
|
const candidate = join(here, ...Array(depth).fill(".."), "prompts");
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "claude-overnight",
|
|
3
|
-
"version": "1.58.0",
|
|
3
|
+
"version": "1.60.0",
|
|
4
4
|
"description": "Overnight parallel coding agents in git worktrees, with a self-curating skill memory that improves while the run is going. Mix Claude Opus as planner, Kimi 2.6 or Cursor composer-2 as cheap fast worker, Gemini or Qwen for bulk implementation. Multi-wave autonomous loop that plans, executes, reviews, and steers itself until the objective is met. Crash-safe resume, rate-limit aware, usage cap preserves headroom for your interactive Claude Code.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "claude-overnight",
|
|
3
|
-
"version": "1.58.0",
|
|
3
|
+
"version": "1.60.0",
|
|
4
4
|
"description": "Claude Code skill for understanding, installing, and inspecting claude-overnight runs: overnight parallel coding agents in git worktrees with a self-curating skill memory, multi-wave steering, three-layer review, and crash-safe resume. Mix Opus planner with Kimi 2.6, Cursor composer-2, Gemini, Qwen, or any Anthropic-compatible worker.",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "Francesco Fornace"
|
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Batch-API transport for prompt evolution.
|
|
3
|
-
*
|
|
4
|
-
* 50% cheaper than online calls on every major provider that supports
|
|
5
|
-
* batch. Perfect fit for generations=1 benchmark rounds where interactive
|
|
6
|
-
* progress isn't needed — we submit 120-1000 requests, poll every 30-300s,
|
|
7
|
-
* then pull the results in one shot.
|
|
8
|
-
*
|
|
9
|
-
* Provider detection from baseUrl:
|
|
10
|
-
* - api.anthropic.com → Anthropic Message Batches API (one-shot submit)
|
|
11
|
-
* - kimi / moonshot / openai → OpenAI-compatible file-based batch
|
|
12
|
-
* - openrouter → NO batch support; throws (caller must fall back to online)
|
|
13
|
-
*
|
|
14
|
-
* Custom IDs route results back to the right (variant, case, model, rep)
|
|
15
|
-
* cell. The evaluator builds ids like `v0:h_abc:kimi-for-coding:r0`.
|
|
16
|
-
*
|
|
17
|
-
* Poll state is persisted via `persistBatchState` so a crashed or
|
|
18
|
-
* restarted run can resume without resubmitting.
|
|
19
|
-
*/
|
|
20
|
-
import type { CallModelResult } from "./transport.js";
|
|
21
|
-
export interface BatchJob {
|
|
22
|
-
customId: string;
|
|
23
|
-
userText: string;
|
|
24
|
-
systemText?: string;
|
|
25
|
-
model: string;
|
|
26
|
-
}
|
|
27
|
-
export interface BatchOpts {
|
|
28
|
-
baseUrl?: string;
|
|
29
|
-
authToken?: string;
|
|
30
|
-
/**
|
|
31
|
-
* Override model for the batch submission. Moonshot's batch API only
|
|
32
|
-
* accepts kimi-k2.5 or kimi-k2.6 — NOT the kimi-for-coding alias that the
|
|
33
|
-
* coding endpoint uses. When batch is enabled against a Kimi stack, set
|
|
34
|
-
* this so online eval keeps using kimi-for-coding while batch uses the
|
|
35
|
-
* concrete version.
|
|
36
|
-
*/
|
|
37
|
-
modelOverride?: string;
|
|
38
|
-
maxTokens?: number;
|
|
39
|
-
/** Poll interval starts here and doubles to `pollMaxMs`. Defaults 30s → 5min. */
|
|
40
|
-
pollStartMs?: number;
|
|
41
|
-
pollMaxMs?: number;
|
|
42
|
-
/** Overall timeout for the whole batch. Default 24h — matches provider SLAs. */
|
|
43
|
-
batchTimeoutMs?: number;
|
|
44
|
-
/** Called with progress snapshots during polling. */
|
|
45
|
-
onProgress?: (p: BatchProgress) => void;
|
|
46
|
-
/** Restore a previously-submitted batch instead of resubmitting. */
|
|
47
|
-
resumeBatchId?: string;
|
|
48
|
-
/** Called after submit returns an id — use to persist for crash resume. */
|
|
49
|
-
onSubmitted?: (batchId: string, provider: BatchProvider) => void;
|
|
50
|
-
}
|
|
51
|
-
export interface BatchProgress {
|
|
52
|
-
provider: BatchProvider;
|
|
53
|
-
batchId: string;
|
|
54
|
-
phase: "submitted" | "polling" | "downloading" | "done";
|
|
55
|
-
processing?: number;
|
|
56
|
-
succeeded?: number;
|
|
57
|
-
failed?: number;
|
|
58
|
-
total?: number;
|
|
59
|
-
}
|
|
60
|
-
export type BatchProvider = "anthropic" | "openai-compatible" | "unsupported";
|
|
61
|
-
export declare function detectBatchProvider(baseUrl: string | undefined): BatchProvider;
|
|
62
|
-
export declare function batchCallModel(jobs: BatchJob[], opts: BatchOpts): Promise<Map<string, CallModelResult>>;
|
|
@@ -1,235 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Batch-API transport for prompt evolution.
|
|
3
|
-
*
|
|
4
|
-
* 50% cheaper than online calls on every major provider that supports
|
|
5
|
-
* batch. Perfect fit for generations=1 benchmark rounds where interactive
|
|
6
|
-
* progress isn't needed — we submit 120-1000 requests, poll every 30-300s,
|
|
7
|
-
* then pull the results in one shot.
|
|
8
|
-
*
|
|
9
|
-
* Provider detection from baseUrl:
|
|
10
|
-
* - api.anthropic.com → Anthropic Message Batches API (one-shot submit)
|
|
11
|
-
* - kimi / moonshot / openai → OpenAI-compatible file-based batch
|
|
12
|
-
* - openrouter → NO batch support; throws (caller must fall back to online)
|
|
13
|
-
*
|
|
14
|
-
* Custom IDs route results back to the right (variant, case, model, rep)
|
|
15
|
-
* cell. The evaluator builds ids like `v0:h_abc:kimi-for-coding:r0`.
|
|
16
|
-
*
|
|
17
|
-
* Poll state is persisted via `persistBatchState` so a crashed or
|
|
18
|
-
* restarted run can resume without resubmitting.
|
|
19
|
-
*/
|
|
20
|
-
import { VERSION } from "../core/_version.js";
|
|
21
|
-
const USER_AGENT = `claude-overnight-evolve/${VERSION}`;
|
|
22
|
-
export function detectBatchProvider(baseUrl) {
|
|
23
|
-
const url = (baseUrl ?? "https://api.anthropic.com").toLowerCase();
|
|
24
|
-
if (/(^|\/\/)(api\.)?anthropic\.com/.test(url))
|
|
25
|
-
return "anthropic";
|
|
26
|
-
// Providers with no batch support — caller auto-falls back to online.
|
|
27
|
-
// - OpenRouter: no batch API at all.
|
|
28
|
-
// - api.kimi.com/coding: Moonshot's coding-specific endpoint; synchronous
|
|
29
|
-
// only (30 concurrent, 300-1200 req/5hr) with no /v1/files upload flow.
|
|
30
|
-
// Moonshot's generic platform.moonshot.ai might have batch; this one
|
|
31
|
-
// doesn't.
|
|
32
|
-
if (/openrouter/.test(url))
|
|
33
|
-
return "unsupported";
|
|
34
|
-
if (/(api\.)?kimi\.com\/coding/.test(url))
|
|
35
|
-
return "unsupported";
|
|
36
|
-
// Everything else that speaks /v1/chat/completions — OpenAI, DeepSeek,
|
|
37
|
-
// DashScope in OpenAI-compat mode — exposes an OpenAI-compatible batch
|
|
38
|
-
// endpoint we can ride.
|
|
39
|
-
return "openai-compatible";
|
|
40
|
-
}
|
|
41
|
-
export async function batchCallModel(jobs, opts) {
|
|
42
|
-
if (jobs.length === 0)
|
|
43
|
-
return new Map();
|
|
44
|
-
const provider = detectBatchProvider(opts.baseUrl);
|
|
45
|
-
if (provider === "unsupported") {
|
|
46
|
-
throw new Error(`Batch API not supported for baseUrl=${opts.baseUrl}. ` +
|
|
47
|
-
`Options: (1) omit --batch and use online transport, or (2) point ` +
|
|
48
|
-
`the batch call at a provider with batch support (e.g. set --batch-base-url ` +
|
|
49
|
-
`https://api.moonshot.ai/v1 --batch-model kimi-k2.6 for Kimi users whose ` +
|
|
50
|
-
`online endpoint is api.kimi.com/coding).`);
|
|
51
|
-
}
|
|
52
|
-
if (provider === "anthropic")
|
|
53
|
-
return runAnthropicBatch(jobs, opts);
|
|
54
|
-
return runOpenAIBatch(jobs, opts);
|
|
55
|
-
}
|
|
56
|
-
// ── Anthropic ──────────────────────────────────────────────────────────────
|
|
57
|
-
async function runAnthropicBatch(jobs, opts) {
|
|
58
|
-
const baseUrl = (opts.baseUrl ?? "https://api.anthropic.com").replace(/\/$/, "");
|
|
59
|
-
const authToken = opts.authToken ?? process.env.ANTHROPIC_AUTH_TOKEN ?? process.env.ANTHROPIC_API_KEY ?? "";
|
|
60
|
-
const headers = {
|
|
61
|
-
"Content-Type": "application/json",
|
|
62
|
-
"Authorization": `Bearer ${authToken}`,
|
|
63
|
-
"User-Agent": USER_AGENT,
|
|
64
|
-
"anthropic-version": "2023-06-01",
|
|
65
|
-
"anthropic-beta": "message-batches-2024-09-24",
|
|
66
|
-
};
|
|
67
|
-
let batchId = opts.resumeBatchId;
|
|
68
|
-
if (!batchId) {
|
|
69
|
-
const body = JSON.stringify({
|
|
70
|
-
requests: jobs.map((j) => {
|
|
71
|
-
const params = {
|
|
72
|
-
model: opts.modelOverride ?? j.model,
|
|
73
|
-
max_tokens: opts.maxTokens ?? 4096,
|
|
74
|
-
messages: [{ role: "user", content: j.userText }],
|
|
75
|
-
};
|
|
76
|
-
if (j.systemText)
|
|
77
|
-
params.system = j.systemText;
|
|
78
|
-
return { custom_id: j.customId, params };
|
|
79
|
-
}),
|
|
80
|
-
});
|
|
81
|
-
const res = await fetch(`${baseUrl}/v1/messages/batches`, { method: "POST", headers, body });
|
|
82
|
-
if (!res.ok)
|
|
83
|
-
throw new Error(`Anthropic batch submit: HTTP ${res.status} ${await res.text()}`);
|
|
84
|
-
const data = await res.json();
|
|
85
|
-
batchId = data.id;
|
|
86
|
-
opts.onSubmitted?.(batchId, "anthropic");
|
|
87
|
-
}
|
|
88
|
-
opts.onProgress?.({ provider: "anthropic", batchId, phase: "submitted", total: jobs.length });
|
|
89
|
-
const endedAt = await pollUntilDone(async () => {
|
|
90
|
-
const res = await fetch(`${baseUrl}/v1/messages/batches/${batchId}`, { headers });
|
|
91
|
-
if (!res.ok)
|
|
92
|
-
throw new Error(`Anthropic batch poll: HTTP ${res.status}`);
|
|
93
|
-
const d = await res.json();
|
|
94
|
-
opts.onProgress?.({
|
|
95
|
-
provider: "anthropic",
|
|
96
|
-
batchId: batchId,
|
|
97
|
-
phase: "polling",
|
|
98
|
-
processing: d.request_counts?.processing,
|
|
99
|
-
succeeded: d.request_counts?.succeeded,
|
|
100
|
-
failed: (d.request_counts?.errored ?? 0) + (d.request_counts?.canceled ?? 0) + (d.request_counts?.expired ?? 0),
|
|
101
|
-
total: jobs.length,
|
|
102
|
-
});
|
|
103
|
-
return d.processing_status === "ended" ? d : null;
|
|
104
|
-
}, opts);
|
|
105
|
-
opts.onProgress?.({ provider: "anthropic", batchId, phase: "downloading" });
|
|
106
|
-
const resultsUrl = endedAt.results_url ?? `${baseUrl}/v1/messages/batches/${batchId}/results`;
|
|
107
|
-
const res = await fetch(resultsUrl, { headers });
|
|
108
|
-
if (!res.ok)
|
|
109
|
-
throw new Error(`Anthropic batch results: HTTP ${res.status}`);
|
|
110
|
-
const text = await res.text();
|
|
111
|
-
const out = new Map();
|
|
112
|
-
for (const line of text.split("\n")) {
|
|
113
|
-
if (!line.trim())
|
|
114
|
-
continue;
|
|
115
|
-
const row = JSON.parse(line);
|
|
116
|
-
if (row.result.type === "succeeded") {
|
|
117
|
-
const raw = row.result.message.content.map((c) => c.text ?? "").join("");
|
|
118
|
-
const inp = row.result.message.usage?.input_tokens ?? 0;
|
|
119
|
-
const outp = row.result.message.usage?.output_tokens ?? 0;
|
|
120
|
-
out.set(row.custom_id, { raw, costUsd: (inp * 0.000003 + outp * 0.000015) * 0.5, inputTokens: inp, outputTokens: outp });
|
|
121
|
-
}
|
|
122
|
-
else {
|
|
123
|
-
const msg = row.result.type === "errored" ? row.result.error.message : row.result.type;
|
|
124
|
-
out.set(row.custom_id, { raw: `batch error: ${msg}`, costUsd: 0, inputTokens: 0, outputTokens: 0 });
|
|
125
|
-
}
|
|
126
|
-
}
|
|
127
|
-
opts.onProgress?.({ provider: "anthropic", batchId, phase: "done", succeeded: out.size, total: jobs.length });
|
|
128
|
-
return out;
|
|
129
|
-
}
|
|
130
|
-
// ── OpenAI-compatible (OpenAI, Kimi/Moonshot, DeepSeek) ────────────────────
|
|
131
|
-
async function runOpenAIBatch(jobs, opts) {
|
|
132
|
-
const baseUrl = (opts.baseUrl ?? "https://api.openai.com").replace(/\/$/, "");
|
|
133
|
-
const authToken = opts.authToken ?? process.env.ANTHROPIC_AUTH_TOKEN ?? process.env.ANTHROPIC_API_KEY ?? "";
|
|
134
|
-
const authHeaders = {
|
|
135
|
-
"Authorization": `Bearer ${authToken}`,
|
|
136
|
-
"User-Agent": USER_AGENT,
|
|
137
|
-
};
|
|
138
|
-
let batchId = opts.resumeBatchId;
|
|
139
|
-
let outputFileId;
|
|
140
|
-
if (!batchId) {
|
|
141
|
-
// Build the JSONL payload and upload as a file.
|
|
142
|
-
const jsonl = jobs.map((j) => {
|
|
143
|
-
const messages = [];
|
|
144
|
-
if (j.systemText)
|
|
145
|
-
messages.push({ role: "system", content: j.systemText });
|
|
146
|
-
messages.push({ role: "user", content: j.userText });
|
|
147
|
-
return JSON.stringify({
|
|
148
|
-
custom_id: j.customId,
|
|
149
|
-
method: "POST",
|
|
150
|
-
url: "/v1/chat/completions",
|
|
151
|
-
body: { model: opts.modelOverride ?? j.model, max_tokens: opts.maxTokens ?? 4096, max_completion_tokens: opts.maxTokens ?? 4096, messages },
|
|
152
|
-
});
|
|
153
|
-
}).join("\n");
|
|
154
|
-
const form = new FormData();
|
|
155
|
-
form.append("purpose", "batch");
|
|
156
|
-
form.append("file", new Blob([jsonl], { type: "application/jsonl" }), "batch-input.jsonl");
|
|
157
|
-
const fileRes = await fetch(`${baseUrl}/v1/files`, { method: "POST", headers: authHeaders, body: form });
|
|
158
|
-
if (!fileRes.ok) {
|
|
159
|
-
const body = await fileRes.text().catch(() => "");
|
|
160
|
-
throw new Error(`Batch file-upload failed: HTTP ${fileRes.status} at ${baseUrl}/v1/files. ` +
|
|
161
|
-
`This provider may not support OpenAI-compatible batch. Response: ${body.slice(0, 300)}`);
|
|
162
|
-
}
|
|
163
|
-
const fileData = await fileRes.json();
|
|
164
|
-
const createRes = await fetch(`${baseUrl}/v1/batches`, {
|
|
165
|
-
method: "POST",
|
|
166
|
-
headers: { ...authHeaders, "Content-Type": "application/json" },
|
|
167
|
-
body: JSON.stringify({ input_file_id: fileData.id, endpoint: "/v1/chat/completions", completion_window: "24h" }),
|
|
168
|
-
});
|
|
169
|
-
if (!createRes.ok)
|
|
170
|
-
throw new Error(`OpenAI-compat batch create: HTTP ${createRes.status} ${await createRes.text()}`);
|
|
171
|
-
const createData = await createRes.json();
|
|
172
|
-
batchId = createData.id;
|
|
173
|
-
opts.onSubmitted?.(batchId, "openai-compatible");
|
|
174
|
-
}
|
|
175
|
-
opts.onProgress?.({ provider: "openai-compatible", batchId, phase: "submitted", total: jobs.length });
|
|
176
|
-
const endedAt = await pollUntilDone(async () => {
|
|
177
|
-
const res = await fetch(`${baseUrl}/v1/batches/${batchId}`, { headers: authHeaders });
|
|
178
|
-
if (!res.ok)
|
|
179
|
-
throw new Error(`OpenAI-compat batch poll: HTTP ${res.status}`);
|
|
180
|
-
const d = await res.json();
|
|
181
|
-
opts.onProgress?.({
|
|
182
|
-
provider: "openai-compatible",
|
|
183
|
-
batchId: batchId,
|
|
184
|
-
phase: "polling",
|
|
185
|
-
succeeded: d.request_counts?.completed,
|
|
186
|
-
failed: d.request_counts?.failed,
|
|
187
|
-
total: d.request_counts?.total ?? jobs.length,
|
|
188
|
-
});
|
|
189
|
-
if (d.status === "completed")
|
|
190
|
-
return d;
|
|
191
|
-
if (d.status === "failed" || d.status === "expired" || d.status === "cancelled") {
|
|
192
|
-
throw new Error(`OpenAI-compat batch ${d.status}`);
|
|
193
|
-
}
|
|
194
|
-
return null;
|
|
195
|
-
}, opts);
|
|
196
|
-
outputFileId = endedAt.output_file_id;
|
|
197
|
-
if (!outputFileId)
|
|
198
|
-
throw new Error("OpenAI-compat batch completed with no output_file_id");
|
|
199
|
-
opts.onProgress?.({ provider: "openai-compatible", batchId, phase: "downloading" });
|
|
200
|
-
const contentRes = await fetch(`${baseUrl}/v1/files/${outputFileId}/content`, { headers: authHeaders });
|
|
201
|
-
if (!contentRes.ok)
|
|
202
|
-
throw new Error(`OpenAI-compat batch download: HTTP ${contentRes.status}`);
|
|
203
|
-
const text = await contentRes.text();
|
|
204
|
-
const out = new Map();
|
|
205
|
-
for (const line of text.split("\n")) {
|
|
206
|
-
if (!line.trim())
|
|
207
|
-
continue;
|
|
208
|
-
const row = JSON.parse(line);
|
|
209
|
-
if (row.error || !row.response) {
|
|
210
|
-
out.set(row.custom_id, { raw: `batch error: ${row.error?.message ?? "unknown"}`, costUsd: 0, inputTokens: 0, outputTokens: 0 });
|
|
211
|
-
continue;
|
|
212
|
-
}
|
|
213
|
-
const raw = row.response.body.choices?.[0]?.message?.content ?? "";
|
|
214
|
-
const inp = row.response.body.usage?.prompt_tokens ?? 0;
|
|
215
|
-
const outp = row.response.body.usage?.completion_tokens ?? 0;
|
|
216
|
-
out.set(row.custom_id, { raw, costUsd: (inp * 0.000003 + outp * 0.000015) * 0.5, inputTokens: inp, outputTokens: outp });
|
|
217
|
-
}
|
|
218
|
-
opts.onProgress?.({ provider: "openai-compatible", batchId, phase: "done", succeeded: out.size, total: jobs.length });
|
|
219
|
-
return out;
|
|
220
|
-
}
|
|
221
|
-
// ── Shared poll loop ───────────────────────────────────────────────────────
|
|
222
|
-
async function pollUntilDone(check, opts) {
|
|
223
|
-
const start = Date.now();
|
|
224
|
-
const deadline = start + (opts.batchTimeoutMs ?? 24 * 60 * 60 * 1000);
|
|
225
|
-
let delay = opts.pollStartMs ?? 30_000;
|
|
226
|
-
const maxDelay = opts.pollMaxMs ?? 5 * 60_000;
|
|
227
|
-
while (Date.now() < deadline) {
|
|
228
|
-
const result = await check();
|
|
229
|
-
if (result != null)
|
|
230
|
-
return result;
|
|
231
|
-
await new Promise((r) => setTimeout(r, delay));
|
|
232
|
-
delay = Math.min(maxDelay, delay * 2);
|
|
233
|
-
}
|
|
234
|
-
throw new Error("Batch exceeded batchTimeoutMs without completing");
|
|
235
|
-
}
|