claude-overnight 1.54.0 → 1.55.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/evolve.js +70 -1
- package/dist/core/_version.d.ts +1 -1
- package/dist/core/_version.js +1 -1
- package/dist/prompt-evolution/evaluator.d.ts +27 -7
- package/dist/prompt-evolution/evaluator.js +185 -143
- package/dist/prompt-evolution/fixtures/harvest.d.ts +30 -0
- package/dist/prompt-evolution/fixtures/harvest.js +88 -0
- package/dist/prompt-evolution/fixtures/plan-cases.d.ts +9 -6
- package/dist/prompt-evolution/fixtures/plan-cases.js +72 -23
- package/dist/prompt-evolution/index.d.ts +11 -0
- package/dist/prompt-evolution/index.js +10 -3
- package/dist/prompt-evolution/report.d.ts +8 -6
- package/dist/prompt-evolution/report.js +73 -30
- package/dist/prompt-evolution/scorer.d.ts +23 -5
- package/dist/prompt-evolution/scorer.js +106 -62
- package/dist/prompt-evolution/transport.d.ts +28 -0
- package/dist/prompt-evolution/transport.js +99 -0
- package/dist/prompt-evolution/types.d.ts +15 -5
- package/package.json +1 -1
- package/plugins/claude-overnight/.claude-plugin/plugin.json +1 -1
package/dist/bin/evolve.js
CHANGED
|
@@ -17,6 +17,7 @@
|
|
|
17
17
|
*/
|
|
18
18
|
import { evolvePrompt } from "../prompt-evolution/index.js";
|
|
19
19
|
import { PLAN_CASES } from "../prompt-evolution/fixtures/plan-cases.js";
|
|
20
|
+
import { harvestRealCases } from "../prompt-evolution/fixtures/harvest.js";
|
|
20
21
|
import { scenariosToCases, PLANNING_SCENARIOS, REVIEW_SCENARIOS, SUPERVISION_SCENARIOS, STUCK_SCENARIOS, hydrateCases, extractPrompt, } from "../prompt-evolution/adapters/mcp-browser.js";
|
|
21
22
|
function help() {
|
|
22
23
|
process.stdout.write(`Usage: claude-overnight-evolve [options]
|
|
@@ -27,12 +28,20 @@ Options:
|
|
|
27
28
|
--prompt-kind <kind> MCP-browser prompt kind: planning | review | evolution |
|
|
28
29
|
goal-refinement | plan-supervision | simple-supervision | stuck-analysis
|
|
29
30
|
--eval-model <model> Fast model for evaluation (default: claude-haiku-4-5)
|
|
31
|
+
--eval-models <list> Comma-separated list to run cross-model (overrides --eval-model)
|
|
30
32
|
--mutate-model <model> Smarter model for mutation (defaults to eval-model)
|
|
31
33
|
--generations <n> Number of evolution generations (default: 10)
|
|
32
34
|
--population <n> Max population size (default: 8)
|
|
33
35
|
--plateau <n> Stop early if no improvement for N generations (default: 3)
|
|
36
|
+
--reps <n> Repetitions per (variant, case, model) for noise floor (default: 1)
|
|
37
|
+
--concurrency <n> Max in-flight eval calls (default: 8; bump for slow endpoints)
|
|
38
|
+
--judge Use llm-judge for content scoring (costs extra API calls)
|
|
39
|
+
--judge-model <model> Model to use for the judge (default: same as eval-model)
|
|
40
|
+
--judge-top-n <n> Judge only the top-N variants per generation (default: 4)
|
|
34
41
|
--cases <suite> Benchmark suite: plan | mcp-planning | mcp-review |
|
|
35
42
|
mcp-supervision | mcp-stuck (default: plan)
|
|
43
|
+
--harvest Append cases harvested from <cwd>/.claude-overnight/runs/*
|
|
44
|
+
--harvest-limit <n> Max harvested cases (default: 10)
|
|
36
45
|
--base-url <url> API base URL override
|
|
37
46
|
--auth-token <token> Auth token override
|
|
38
47
|
--run-id <id> Preset run id (default: auto-generated)
|
|
@@ -52,7 +61,12 @@ function parseArgs() {
|
|
|
52
61
|
generations: 10,
|
|
53
62
|
population: 8,
|
|
54
63
|
plateau: 3,
|
|
64
|
+
reps: 1,
|
|
65
|
+
useJudge: false,
|
|
66
|
+
judgeTopN: 4,
|
|
55
67
|
cases: "",
|
|
68
|
+
harvest: false,
|
|
69
|
+
harvestLimit: 10,
|
|
56
70
|
baseUrl: process.env.ANTHROPIC_BASE_URL,
|
|
57
71
|
authToken: process.env.ANTHROPIC_AUTH_TOKEN ?? process.env.ANTHROPIC_API_KEY,
|
|
58
72
|
};
|
|
@@ -75,6 +89,10 @@ function parseArgs() {
|
|
|
75
89
|
opts.evalModel = v;
|
|
76
90
|
i++;
|
|
77
91
|
break;
|
|
92
|
+
case "--eval-models":
|
|
93
|
+
opts.evalModels = v.split(",").map((s) => s.trim()).filter(Boolean);
|
|
94
|
+
i++;
|
|
95
|
+
break;
|
|
78
96
|
case "--mutate-model":
|
|
79
97
|
opts.mutateModel = v;
|
|
80
98
|
i++;
|
|
@@ -91,10 +109,36 @@ function parseArgs() {
|
|
|
91
109
|
opts.plateau = parseInt(v, 10);
|
|
92
110
|
i++;
|
|
93
111
|
break;
|
|
112
|
+
case "--reps":
|
|
113
|
+
opts.reps = parseInt(v, 10);
|
|
114
|
+
i++;
|
|
115
|
+
break;
|
|
116
|
+
case "--concurrency":
|
|
117
|
+
opts.concurrency = parseInt(v, 10);
|
|
118
|
+
i++;
|
|
119
|
+
break;
|
|
120
|
+
case "--judge":
|
|
121
|
+
opts.useJudge = true;
|
|
122
|
+
break;
|
|
123
|
+
case "--judge-model":
|
|
124
|
+
opts.judgeModel = v;
|
|
125
|
+
i++;
|
|
126
|
+
break;
|
|
127
|
+
case "--judge-top-n":
|
|
128
|
+
opts.judgeTopN = parseInt(v, 10);
|
|
129
|
+
i++;
|
|
130
|
+
break;
|
|
94
131
|
case "--cases":
|
|
95
132
|
opts.cases = v;
|
|
96
133
|
i++;
|
|
97
134
|
break;
|
|
135
|
+
case "--harvest":
|
|
136
|
+
opts.harvest = true;
|
|
137
|
+
break;
|
|
138
|
+
case "--harvest-limit":
|
|
139
|
+
opts.harvestLimit = parseInt(v, 10);
|
|
140
|
+
i++;
|
|
141
|
+
break;
|
|
98
142
|
case "--base-url":
|
|
99
143
|
opts.baseUrl = v;
|
|
100
144
|
i++;
|
|
@@ -138,9 +182,23 @@ async function main() {
|
|
|
138
182
|
}
|
|
139
183
|
else {
|
|
140
184
|
if (opts.cases === "plan")
|
|
141
|
-
cases = PLAN_CASES;
|
|
185
|
+
cases = [...PLAN_CASES];
|
|
142
186
|
else
|
|
143
187
|
throw new Error(`Unknown case suite: ${opts.cases}`);
|
|
188
|
+
if (opts.harvest) {
|
|
189
|
+
const harvested = harvestRealCases({
|
|
190
|
+
cwd: process.cwd(),
|
|
191
|
+
promptPath,
|
|
192
|
+
limit: opts.harvestLimit,
|
|
193
|
+
});
|
|
194
|
+
if (harvested.length === 0) {
|
|
195
|
+
console.log(` (harvest: no runs found under <cwd>/.claude-overnight/runs)`);
|
|
196
|
+
}
|
|
197
|
+
else {
|
|
198
|
+
console.log(` (harvest: +${harvested.length} real objectives)`);
|
|
199
|
+
cases = cases.concat(harvested);
|
|
200
|
+
}
|
|
201
|
+
}
|
|
144
202
|
}
|
|
145
203
|
console.log(`Evolution config:`);
|
|
146
204
|
console.log(` target: ${opts.target}`);
|
|
@@ -156,10 +214,21 @@ async function main() {
|
|
|
156
214
|
promptPath,
|
|
157
215
|
cases,
|
|
158
216
|
evalModel: opts.evalModel,
|
|
217
|
+
evalModels: opts.evalModels,
|
|
159
218
|
mutateModel: opts.mutateModel,
|
|
160
219
|
generations: opts.generations,
|
|
161
220
|
populationCap: opts.population,
|
|
162
221
|
plateauGenerations: opts.plateau,
|
|
222
|
+
repetitions: opts.reps > 1 ? opts.reps : undefined,
|
|
223
|
+
concurrency: opts.concurrency,
|
|
224
|
+
judge: opts.useJudge
|
|
225
|
+
? {
|
|
226
|
+
model: opts.judgeModel ?? opts.evalModel,
|
|
227
|
+
baseUrl: opts.baseUrl,
|
|
228
|
+
authToken: opts.authToken,
|
|
229
|
+
topN: opts.judgeTopN,
|
|
230
|
+
}
|
|
231
|
+
: undefined,
|
|
163
232
|
baseUrl: opts.baseUrl,
|
|
164
233
|
authToken: opts.authToken,
|
|
165
234
|
seedText,
|
package/dist/core/_version.d.ts
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
export declare const VERSION = "1.54.0";
|
|
1
|
+
export declare const VERSION = "1.55.2";
|
package/dist/core/_version.js
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
// Auto-generated by build — do not edit manually.
|
|
2
|
-
export const VERSION = "1.54.0";
|
|
2
|
+
export const VERSION = "1.55.2";
|
package/dist/prompt-evolution/evaluator.d.ts
CHANGED
|
@@ -1,18 +1,28 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Evaluation matrix runner.
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
* cells = EvaluationResult with multi-dimensional scores
|
|
4
|
+
* rows = variants
|
|
5
|
+
* columns = cases (optionally × models)
|
|
6
|
+
* cells = EvaluationResult with multi-dimensional scores
|
|
8
7
|
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
8
|
+
* Repetitions (N) give us a noise floor: the same (variant, case) is run N
|
|
9
|
+
* times and results aggregate to mean + stddev. Without this we can't tell
|
|
10
|
+
* whether 56.7 vs 37.4 is signal or variance.
|
|
11
|
+
*
|
|
12
|
+
* Multi-model runs (models[].length > 1) give us cross-model stddev: a
|
|
13
|
+
* prompt that only works on one generator is fragile.
|
|
14
|
+
*
|
|
15
|
+
* All HTTP calls go through `transport.callModel` so tests can inject a
|
|
16
|
+
* deterministic mock (see prompt-evolution-discrimination.test.ts).
|
|
11
17
|
*/
|
|
18
|
+
import { type JudgeOpts } from "./llm-judge.js";
|
|
19
|
+
import { type CallModel } from "./transport.js";
|
|
12
20
|
import type { BenchmarkCase, VariantRow, PromptVars } from "./types.js";
|
|
13
21
|
export interface EvalOpts {
|
|
14
|
-
/**
|
|
22
|
+
/** Primary generator model (retained for single-model compat). */
|
|
15
23
|
model: string;
|
|
24
|
+
/** Multiple generator models — enables cross-model scoring. Overrides `model` when ≥2 entries. */
|
|
25
|
+
models?: string[];
|
|
16
26
|
/** Base URL for the API endpoint */
|
|
17
27
|
baseUrl?: string;
|
|
18
28
|
/** Auth token */
|
|
@@ -21,6 +31,16 @@ export interface EvalOpts {
|
|
|
21
31
|
maxTokens?: number;
|
|
22
32
|
/** Concurrency for parallel case evaluation */
|
|
23
33
|
concurrency?: number;
|
|
34
|
+
/** Per-call HTTP timeout. Defaults to 120s — bad endpoints can hang otherwise. */
|
|
35
|
+
timeoutMs?: number;
|
|
36
|
+
/** Repetitions per (variant, case, model). Default 1 — opt-in to 3+ for noise floor. */
|
|
37
|
+
repetitions?: number;
|
|
38
|
+
/** Inject an llm-judge call per case; content dimension is replaced by judge score. */
|
|
39
|
+
judge?: JudgeOpts & {
|
|
40
|
+
topN?: number;
|
|
41
|
+
};
|
|
42
|
+
/** Transport override for tests. */
|
|
43
|
+
callModel?: CallModel;
|
|
24
44
|
/** Optional callback for progress */
|
|
25
45
|
onProgress?: (done: number, total: number, caseName: string, variantId: string) => void;
|
|
26
46
|
}
|
package/dist/prompt-evolution/evaluator.js
CHANGED
|
@@ -1,64 +1,104 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Evaluation matrix runner.
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
* cells = EvaluationResult with multi-dimensional scores
|
|
4
|
+
* rows = variants
|
|
5
|
+
* columns = cases (optionally × models)
|
|
6
|
+
* cells = EvaluationResult with multi-dimensional scores
|
|
8
7
|
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
8
|
+
* Repetitions (N) give us a noise floor: the same (variant, case) is run N
|
|
9
|
+
* times and results aggregate to mean + stddev. Without this we can't tell
|
|
10
|
+
* whether 56.7 vs 37.4 is signal or variance.
|
|
11
|
+
*
|
|
12
|
+
* Multi-model runs (models[].length > 1) give us cross-model stddev: a
|
|
13
|
+
* prompt that only works on one generator is fragile.
|
|
14
|
+
*
|
|
15
|
+
* All HTTP calls go through `transport.callModel` so tests can inject a
|
|
16
|
+
* deterministic mock (see prompt-evolution-discrimination.test.ts).
|
|
11
17
|
*/
|
|
12
18
|
import { renderPrompt } from "../prompts/load.js";
|
|
13
|
-
import { scoreOutput, gmean } from "./scorer.js";
|
|
19
|
+
import { scoreOutput, gmean, aggregateReps } from "./scorer.js";
|
|
20
|
+
import { judgeOutput } from "./llm-judge.js";
|
|
21
|
+
import { defaultCallModel, attemptJsonParse, } from "./transport.js";
|
|
14
22
|
export async function buildMatrix(variants, cases, opts) {
|
|
23
|
+
const models = opts.models && opts.models.length > 0 ? opts.models : [opts.model];
|
|
24
|
+
const reps = Math.max(1, opts.repetitions ?? 1);
|
|
25
|
+
const concurrency = opts.concurrency ?? 8;
|
|
26
|
+
const transport = opts.callModel ?? defaultCallModel;
|
|
27
|
+
// Build the full job list: (variant × case × model × rep).
|
|
15
28
|
const jobs = [];
|
|
16
29
|
for (const v of variants) {
|
|
17
30
|
for (const c of cases) {
|
|
18
|
-
|
|
31
|
+
for (const model of models) {
|
|
32
|
+
for (let r = 0; r < reps; r++) {
|
|
33
|
+
jobs.push({ case: c, variantId: v.id, text: v.text, systemText: c.systemPrompt, model, rep: r });
|
|
34
|
+
}
|
|
35
|
+
}
|
|
19
36
|
}
|
|
20
37
|
}
|
|
21
|
-
|
|
22
|
-
|
|
38
|
+
// Work-stealing pool: keep `concurrency` jobs in flight at all times so a
|
|
39
|
+
// slow call (Kimi at 4 min/call is typical) doesn't block the others in its
|
|
40
|
+
// slice. Previous batch-and-wait loop serialized the slowest job in every
|
|
41
|
+
// window of `concurrency`.
|
|
42
|
+
const rawByKey = new Map();
|
|
23
43
|
let done = 0;
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
44
|
+
let next = 0;
|
|
45
|
+
const worker = async () => {
|
|
46
|
+
while (true) {
|
|
47
|
+
const i = next++;
|
|
48
|
+
if (i >= jobs.length)
|
|
49
|
+
return;
|
|
50
|
+
const r = await runSingle(jobs[i], opts, transport);
|
|
51
|
+
const key = `${r.variantId}:${r.caseHash}:${r.model ?? ""}`;
|
|
52
|
+
const arr = rawByKey.get(key) ?? [];
|
|
53
|
+
arr.push(r);
|
|
54
|
+
rawByKey.set(key, arr);
|
|
30
55
|
done++;
|
|
31
56
|
opts.onProgress?.(done, jobs.length, r.caseName, r.variantId);
|
|
32
57
|
}
|
|
58
|
+
};
|
|
59
|
+
await Promise.all(Array.from({ length: Math.min(concurrency, jobs.length) }, worker));
|
|
60
|
+
// Collapse reps: one aggregated EvaluationResult per (variant, case, model).
|
|
61
|
+
const aggregated = new Map();
|
|
62
|
+
for (const [key, runs] of rawByKey) {
|
|
63
|
+
aggregated.set(key, collapseReps(runs));
|
|
33
64
|
}
|
|
34
|
-
//
|
|
65
|
+
// Optional llm-judge pass on top-N variants (by current heuristic content).
|
|
66
|
+
if (opts.judge)
|
|
67
|
+
await runJudge(variants, cases, models, aggregated, opts.judge);
|
|
68
|
+
// Assemble rows: per-variant aggregate across all cases and models.
|
|
35
69
|
const rows = [];
|
|
36
70
|
for (const v of variants) {
|
|
37
71
|
const rowResults = new Map();
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
let
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
72
|
+
const perModel = {};
|
|
73
|
+
const modelGmeans = [];
|
|
74
|
+
let parseFailures = 0;
|
|
75
|
+
for (const model of models) {
|
|
76
|
+
const modelScores = [];
|
|
77
|
+
for (const c of cases) {
|
|
78
|
+
const key = `${v.id}:${c.hash}:${model}`;
|
|
79
|
+
const r = aggregated.get(key);
|
|
80
|
+
if (!r)
|
|
81
|
+
continue;
|
|
82
|
+
rowResults.set(models.length > 1 ? `${c.hash}:${model}` : c.hash, r);
|
|
83
|
+
modelScores.push(r.scores);
|
|
84
|
+
if (r.scores.parse < 0.5)
|
|
85
|
+
parseFailures++;
|
|
86
|
+
}
|
|
87
|
+
if (modelScores.length > 0) {
|
|
88
|
+
const modelAgg = averageDimensions(modelScores);
|
|
89
|
+
perModel[model] = modelAgg;
|
|
90
|
+
modelGmeans.push(gmean(modelAgg));
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
const allScores = [...rowResults.values()].map((r) => r.scores);
|
|
94
|
+
const aggregate = averageDimensions(allScores);
|
|
95
|
+
const g = gmean(aggregate);
|
|
96
|
+
let crossModelStddev;
|
|
97
|
+
if (modelGmeans.length > 1) {
|
|
98
|
+
const m = modelGmeans.reduce((a, b) => a + b, 0) / modelGmeans.length;
|
|
99
|
+
const variance = modelGmeans.reduce((a, b) => a + (b - m) ** 2, 0) / modelGmeans.length;
|
|
100
|
+
crossModelStddev = Math.sqrt(variance);
|
|
53
101
|
}
|
|
54
|
-
const n = cases.length;
|
|
55
|
-
const aggregate = {
|
|
56
|
-
parse: parseSum / n,
|
|
57
|
-
schema: schemaSum / n,
|
|
58
|
-
content: contentSum / n,
|
|
59
|
-
costEfficiency: costSum / n,
|
|
60
|
-
speed: speedSum / n,
|
|
61
|
-
};
|
|
62
102
|
rows.push({
|
|
63
103
|
variantId: v.id,
|
|
64
104
|
promptPath: v.promptPath,
|
|
@@ -66,127 +106,129 @@ export async function buildMatrix(variants, cases, opts) {
|
|
|
66
106
|
text: v.text,
|
|
67
107
|
results: rowResults,
|
|
68
108
|
aggregate,
|
|
69
|
-
gmean:
|
|
109
|
+
gmean: g,
|
|
110
|
+
crossModelStddev,
|
|
111
|
+
perModel: models.length > 1 ? perModel : undefined,
|
|
112
|
+
parseFailures,
|
|
70
113
|
});
|
|
71
114
|
}
|
|
72
115
|
return rows;
|
|
73
116
|
}
|
|
74
|
-
async function runSingle(job, opts) {
|
|
117
|
+
async function runSingle(job, opts, transport) {
|
|
75
118
|
const started = Date.now();
|
|
76
|
-
const
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
let headers = {
|
|
83
|
-
"Content-Type": "application/json",
|
|
84
|
-
"Authorization": `Bearer ${authToken}`,
|
|
119
|
+
const callOpts = {
|
|
120
|
+
model: job.model,
|
|
121
|
+
baseUrl: opts.baseUrl,
|
|
122
|
+
authToken: opts.authToken,
|
|
123
|
+
maxTokens: opts.maxTokens,
|
|
124
|
+
timeoutMs: opts.timeoutMs,
|
|
85
125
|
};
|
|
86
|
-
if (isKimi)
|
|
87
|
-
headers["User-Agent"] = "Kilo-Code/1.0";
|
|
88
|
-
if (isAnthropic) {
|
|
89
|
-
// Anthropic native format
|
|
90
|
-
endpoint = `${baseUrl}/v1/messages`;
|
|
91
|
-
headers["anthropic-version"] = "2023-06-01";
|
|
92
|
-
const messages = [{ role: "user", content: job.text }];
|
|
93
|
-
const payload = {
|
|
94
|
-
model: opts.model,
|
|
95
|
-
max_tokens: opts.maxTokens ?? 4096,
|
|
96
|
-
messages,
|
|
97
|
-
};
|
|
98
|
-
if (job.systemText)
|
|
99
|
-
payload.system = job.systemText;
|
|
100
|
-
body = JSON.stringify(payload);
|
|
101
|
-
}
|
|
102
|
-
else {
|
|
103
|
-
// OpenAI-compatible format (OpenRouter, local proxies, etc.)
|
|
104
|
-
endpoint = `${baseUrl}/v1/chat/completions`;
|
|
105
|
-
const messages = [];
|
|
106
|
-
if (job.systemText) {
|
|
107
|
-
messages.push({ role: "system", content: job.systemText });
|
|
108
|
-
}
|
|
109
|
-
messages.push({ role: "user", content: job.text });
|
|
110
|
-
body = JSON.stringify({
|
|
111
|
-
model: opts.model,
|
|
112
|
-
max_tokens: opts.maxTokens ?? 4096,
|
|
113
|
-
messages,
|
|
114
|
-
});
|
|
115
|
-
}
|
|
116
|
-
let raw = "";
|
|
117
|
-
let costUsd = 0;
|
|
118
126
|
try {
|
|
119
|
-
const
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
const errText = await res.text().catch(() => "");
|
|
126
|
-
return makeErrorResult(job, errText, 0, Date.now() - started);
|
|
127
|
-
}
|
|
128
|
-
let inp = 0;
|
|
129
|
-
let out = 0;
|
|
130
|
-
if (isAnthropic) {
|
|
131
|
-
const data = await res.json();
|
|
132
|
-
raw = data.content?.map((c) => c.text ?? "").join("") ?? "";
|
|
133
|
-
inp = data.usage?.input_tokens ?? 0;
|
|
134
|
-
out = data.usage?.output_tokens ?? 0;
|
|
135
|
-
}
|
|
136
|
-
else {
|
|
137
|
-
const data = await res.json();
|
|
138
|
-
raw = data.choices?.[0]?.message?.content ?? "";
|
|
139
|
-
inp = data.usage?.prompt_tokens ?? 0;
|
|
140
|
-
out = data.usage?.completion_tokens ?? 0;
|
|
141
|
-
}
|
|
142
|
-
// Rough cost estimate: varies by model. Using claude-3-haiku as baseline.
|
|
143
|
-
costUsd = inp * 0.000003 + out * 0.000015;
|
|
127
|
+
const { raw, costUsd } = await transport(job.text, job.systemText, callOpts);
|
|
128
|
+
const durationMs = Date.now() - started;
|
|
129
|
+
const parsed = attemptJsonParse(raw);
|
|
130
|
+
const scored = scoreOutput(raw, parsed, costUsd, durationMs, job.case, { model: job.model });
|
|
131
|
+
scored.variantId = job.variantId;
|
|
132
|
+
return scored;
|
|
144
133
|
}
|
|
145
134
|
catch (err) {
|
|
146
135
|
const msg = err instanceof Error ? err.message : String(err);
|
|
147
|
-
|
|
136
|
+
const durationMs = Date.now() - started;
|
|
137
|
+
return {
|
|
138
|
+
caseHash: job.case.hash,
|
|
139
|
+
caseName: job.case.name,
|
|
140
|
+
variantId: job.variantId,
|
|
141
|
+
promptPath: job.case.promptPath,
|
|
142
|
+
rawOutput: msg,
|
|
143
|
+
parsedOutput: null,
|
|
144
|
+
costUsd: 0,
|
|
145
|
+
durationMs,
|
|
146
|
+
scores: { parse: 0, schema: 0, content: 0, costEfficiency: 0, speed: 0 },
|
|
147
|
+
notes: [`HTTP/fetch error: ${msg.slice(0, 200)}`],
|
|
148
|
+
model: job.model,
|
|
149
|
+
};
|
|
148
150
|
}
|
|
149
|
-
const durationMs = Date.now() - started;
|
|
150
|
-
const parsed = attemptJsonParse(raw);
|
|
151
|
-
const scored = scoreOutput(raw, parsed, costUsd, durationMs, job.case);
|
|
152
|
-
scored.variantId = job.variantId;
|
|
153
|
-
return scored;
|
|
154
151
|
}
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
152
|
+
/** Collapse N repetitions into a single EvaluationResult carrying mean + stddev. */
|
|
153
|
+
function collapseReps(runs) {
|
|
154
|
+
if (runs.length === 1)
|
|
155
|
+
return runs[0];
|
|
156
|
+
const { mean, stddev } = aggregateReps(runs);
|
|
157
|
+
// Pick the median-quality run as the "representative" raw output, so the
|
|
158
|
+
// report shows a realistic sample rather than the best or worst rep.
|
|
159
|
+
const sorted = [...runs].sort((a, b) => gmean(a.scores) - gmean(b.scores));
|
|
160
|
+
const mid = sorted[Math.floor(sorted.length / 2)];
|
|
161
|
+
return {
|
|
162
|
+
...mid,
|
|
163
|
+
scores: mean,
|
|
164
|
+
stddev,
|
|
165
|
+
reps: runs.length,
|
|
166
|
+
};
|
|
167
|
+
}
|
|
168
|
+
async function runJudge(variants, cases, models, aggregated, judge) {
|
|
169
|
+
// Judge only the top-N variants to cap cost: a judge call per
|
|
170
|
+
// (variant, case, model) on a large population blows up fast.
|
|
171
|
+
const topN = judge.topN ?? 4;
|
|
172
|
+
const variantGmeans = variants.map((v) => {
|
|
173
|
+
const scores = [];
|
|
174
|
+
for (const c of cases) {
|
|
175
|
+
for (const model of models) {
|
|
176
|
+
const r = aggregated.get(`${v.id}:${c.hash}:${model}`);
|
|
177
|
+
if (r)
|
|
178
|
+
scores.push(r.scores);
|
|
170
179
|
}
|
|
171
|
-
|
|
172
|
-
|
|
180
|
+
}
|
|
181
|
+
return { id: v.id, g: scores.length > 0 ? gmean(averageDimensions(scores)) : 0 };
|
|
182
|
+
});
|
|
183
|
+
variantGmeans.sort((a, b) => b.g - a.g);
|
|
184
|
+
const eligible = new Set(variantGmeans.slice(0, topN).map((x) => x.id));
|
|
185
|
+
const jobs = [];
|
|
186
|
+
for (const v of variants) {
|
|
187
|
+
if (!eligible.has(v.id))
|
|
188
|
+
continue;
|
|
189
|
+
for (const c of cases) {
|
|
190
|
+
for (const model of models) {
|
|
191
|
+
const key = `${v.id}:${c.hash}:${model}`;
|
|
192
|
+
const r = aggregated.get(key);
|
|
193
|
+
if (!r || r.scores.parse < 0.5)
|
|
194
|
+
continue; // no point judging unparseable output
|
|
195
|
+
jobs.push(async () => {
|
|
196
|
+
try {
|
|
197
|
+
const jr = await judgeOutput(r.rawOutput, c, judge);
|
|
198
|
+
r.scores = { ...r.scores, content: jr.score };
|
|
199
|
+
r.judgeJustification = jr.justification;
|
|
200
|
+
}
|
|
201
|
+
catch {
|
|
202
|
+
// Judge failure is non-fatal — keep heuristic content.
|
|
203
|
+
}
|
|
204
|
+
});
|
|
173
205
|
}
|
|
174
206
|
}
|
|
175
|
-
return null;
|
|
176
207
|
}
|
|
208
|
+
// Work-stealing pool for judge calls — modest concurrency to stay under
|
|
209
|
+
// provider rate limits, but no slice-blocking.
|
|
210
|
+
const judgeConcurrency = 3;
|
|
211
|
+
let nextJob = 0;
|
|
212
|
+
const judgeWorker = async () => {
|
|
213
|
+
while (true) {
|
|
214
|
+
const i = nextJob++;
|
|
215
|
+
if (i >= jobs.length)
|
|
216
|
+
return;
|
|
217
|
+
await jobs[i]();
|
|
218
|
+
}
|
|
219
|
+
};
|
|
220
|
+
await Promise.all(Array.from({ length: Math.min(judgeConcurrency, jobs.length) }, judgeWorker));
|
|
177
221
|
}
|
|
178
|
-
function
|
|
222
|
+
function averageDimensions(scores) {
|
|
223
|
+
if (scores.length === 0)
|
|
224
|
+
return { parse: 0, schema: 0, content: 0, costEfficiency: 0, speed: 0 };
|
|
225
|
+
const n = scores.length;
|
|
179
226
|
return {
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
parsedOutput: null,
|
|
186
|
-
costUsd,
|
|
187
|
-
durationMs,
|
|
188
|
-
scores: { parse: 0, schema: 0, content: 0, costEfficiency: 0, speed: 0 },
|
|
189
|
-
notes: [`HTTP/fetch error: ${error.slice(0, 200)}`],
|
|
227
|
+
parse: scores.reduce((a, b) => a + b.parse, 0) / n,
|
|
228
|
+
schema: scores.reduce((a, b) => a + b.schema, 0) / n,
|
|
229
|
+
content: scores.reduce((a, b) => a + b.content, 0) / n,
|
|
230
|
+
costEfficiency: scores.reduce((a, b) => a + b.costEfficiency, 0) / n,
|
|
231
|
+
speed: scores.reduce((a, b) => a + b.speed, 0) / n,
|
|
190
232
|
};
|
|
191
233
|
}
|
|
192
234
|
/** Render a prompt variant given its source path and optional variant name */
|
package/dist/prompt-evolution/fixtures/harvest.d.ts
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Harvest real objectives from past claude-overnight runs to build
|
|
3
|
+
* benchmark cases from ground truth instead of synthetic ones.
|
|
4
|
+
*
|
|
5
|
+
* Source: <cwd>/.claude-overnight/runs/<runId>/
|
|
6
|
+
* - goal.md — the original objective the user ran with
|
|
7
|
+
* - state.json — RunState: phase ("done"/"capped"/"stopped"), accCompleted, budget
|
|
8
|
+
*
|
|
9
|
+
* Coarse fitness signal: `state.phase === "done"` and accCompleted/budget
|
|
10
|
+
* close to 1 means the user kept running to completion — the plan was
|
|
11
|
+
* actionable. Cases with "stopped" phase are likely broken plans.
|
|
12
|
+
*
|
|
13
|
+
* We do NOT pretend to have a per-case ground-truth plan. The harvested
|
|
14
|
+
* cases are meant to be scored with the llm-judge: real objective + a
|
|
15
|
+
* heuristic that the run actually finished.
|
|
16
|
+
*/
|
|
17
|
+
import type { BenchmarkCase } from "../types.js";
|
|
18
|
+
export interface HarvestOpts {
|
|
19
|
+
/** Repo root — harvest looks under <cwd>/.claude-overnight/runs/ */
|
|
20
|
+
cwd: string;
|
|
21
|
+
/** Which promptPath to target in the generated cases. */
|
|
22
|
+
promptPath: string;
|
|
23
|
+
/** Variant to attach to every harvested case. Default: STANDARD. */
|
|
24
|
+
variant?: string;
|
|
25
|
+
/** Max cases to return (newest first). */
|
|
26
|
+
limit?: number;
|
|
27
|
+
/** Only include runs whose phase matches — default ["done"] (successful runs). */
|
|
28
|
+
phaseAllowlist?: Array<"done" | "capped" | "stopped" | "planning">;
|
|
29
|
+
}
|
|
30
|
+
export declare function harvestRealCases(opts: HarvestOpts): BenchmarkCase[];
|