claude-overnight 1.54.0 → 1.55.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,6 +17,7 @@
17
17
  */
18
18
  import { evolvePrompt } from "../prompt-evolution/index.js";
19
19
  import { PLAN_CASES } from "../prompt-evolution/fixtures/plan-cases.js";
20
+ import { harvestRealCases } from "../prompt-evolution/fixtures/harvest.js";
20
21
  import { scenariosToCases, PLANNING_SCENARIOS, REVIEW_SCENARIOS, SUPERVISION_SCENARIOS, STUCK_SCENARIOS, hydrateCases, extractPrompt, } from "../prompt-evolution/adapters/mcp-browser.js";
21
22
  function help() {
22
23
  process.stdout.write(`Usage: claude-overnight-evolve [options]
@@ -27,12 +28,20 @@ Options:
27
28
  --prompt-kind <kind> MCP-browser prompt kind: planning | review | evolution |
28
29
  goal-refinement | plan-supervision | simple-supervision | stuck-analysis
29
30
  --eval-model <model> Fast model for evaluation (default: claude-haiku-4-5)
31
+ --eval-models <list> Comma-separated list to run cross-model (overrides --eval-model)
30
32
  --mutate-model <model> Smarter model for mutation (defaults to eval-model)
31
33
  --generations <n> Number of evolution generations (default: 10)
32
34
  --population <n> Max population size (default: 8)
33
35
  --plateau <n> Stop early if no improvement for N generations (default: 3)
36
+ --reps <n> Repetitions per (variant, case, model) for noise floor (default: 1)
37
+ --concurrency <n> Max in-flight eval calls (default: 8; bump for slow endpoints)
38
+ --judge Use llm-judge for content scoring (costs extra API calls)
39
+ --judge-model <model> Model to use for the judge (default: same as eval-model)
40
+ --judge-top-n <n> Judge only the top-N variants per generation (default: 4)
34
41
  --cases <suite> Benchmark suite: plan | mcp-planning | mcp-review |
35
42
  mcp-supervision | mcp-stuck (default: plan)
43
+ --harvest Append cases harvested from <cwd>/.claude-overnight/runs/*
44
+ --harvest-limit <n> Max harvested cases (default: 10)
36
45
  --base-url <url> API base URL override
37
46
  --auth-token <token> Auth token override
38
47
  --run-id <id> Preset run id (default: auto-generated)
@@ -52,7 +61,12 @@ function parseArgs() {
52
61
  generations: 10,
53
62
  population: 8,
54
63
  plateau: 3,
64
+ reps: 1,
65
+ useJudge: false,
66
+ judgeTopN: 4,
55
67
  cases: "",
68
+ harvest: false,
69
+ harvestLimit: 10,
56
70
  baseUrl: process.env.ANTHROPIC_BASE_URL,
57
71
  authToken: process.env.ANTHROPIC_AUTH_TOKEN ?? process.env.ANTHROPIC_API_KEY,
58
72
  };
@@ -75,6 +89,10 @@ function parseArgs() {
75
89
  opts.evalModel = v;
76
90
  i++;
77
91
  break;
92
+ case "--eval-models":
93
+ opts.evalModels = v.split(",").map((s) => s.trim()).filter(Boolean);
94
+ i++;
95
+ break;
78
96
  case "--mutate-model":
79
97
  opts.mutateModel = v;
80
98
  i++;
@@ -91,10 +109,36 @@ function parseArgs() {
91
109
  opts.plateau = parseInt(v, 10);
92
110
  i++;
93
111
  break;
112
+ case "--reps":
113
+ opts.reps = parseInt(v, 10);
114
+ i++;
115
+ break;
116
+ case "--concurrency":
117
+ opts.concurrency = parseInt(v, 10);
118
+ i++;
119
+ break;
120
+ case "--judge":
121
+ opts.useJudge = true;
122
+ break;
123
+ case "--judge-model":
124
+ opts.judgeModel = v;
125
+ i++;
126
+ break;
127
+ case "--judge-top-n":
128
+ opts.judgeTopN = parseInt(v, 10);
129
+ i++;
130
+ break;
94
131
  case "--cases":
95
132
  opts.cases = v;
96
133
  i++;
97
134
  break;
135
+ case "--harvest":
136
+ opts.harvest = true;
137
+ break;
138
+ case "--harvest-limit":
139
+ opts.harvestLimit = parseInt(v, 10);
140
+ i++;
141
+ break;
98
142
  case "--base-url":
99
143
  opts.baseUrl = v;
100
144
  i++;
@@ -138,9 +182,23 @@ async function main() {
138
182
  }
139
183
  else {
140
184
  if (opts.cases === "plan")
141
- cases = PLAN_CASES;
185
+ cases = [...PLAN_CASES];
142
186
  else
143
187
  throw new Error(`Unknown case suite: ${opts.cases}`);
188
+ if (opts.harvest) {
189
+ const harvested = harvestRealCases({
190
+ cwd: process.cwd(),
191
+ promptPath,
192
+ limit: opts.harvestLimit,
193
+ });
194
+ if (harvested.length === 0) {
195
+ console.log(` (harvest: no runs found under <cwd>/.claude-overnight/runs)`);
196
+ }
197
+ else {
198
+ console.log(` (harvest: +${harvested.length} real objectives)`);
199
+ cases = cases.concat(harvested);
200
+ }
201
+ }
144
202
  }
145
203
  console.log(`Evolution config:`);
146
204
  console.log(` target: ${opts.target}`);
@@ -156,10 +214,21 @@ async function main() {
156
214
  promptPath,
157
215
  cases,
158
216
  evalModel: opts.evalModel,
217
+ evalModels: opts.evalModels,
159
218
  mutateModel: opts.mutateModel,
160
219
  generations: opts.generations,
161
220
  populationCap: opts.population,
162
221
  plateauGenerations: opts.plateau,
222
+ repetitions: opts.reps > 1 ? opts.reps : undefined,
223
+ concurrency: opts.concurrency,
224
+ judge: opts.useJudge
225
+ ? {
226
+ model: opts.judgeModel ?? opts.evalModel,
227
+ baseUrl: opts.baseUrl,
228
+ authToken: opts.authToken,
229
+ topN: opts.judgeTopN,
230
+ }
231
+ : undefined,
163
232
  baseUrl: opts.baseUrl,
164
233
  authToken: opts.authToken,
165
234
  seedText,
@@ -1 +1 @@
1
- export declare const VERSION = "1.53.0";
1
+ export declare const VERSION = "1.55.2";
@@ -1,2 +1,2 @@
1
1
  // Auto-generated by build — do not edit manually.
2
- export const VERSION = "1.53.0";
2
+ export const VERSION = "1.55.2";
@@ -1,18 +1,28 @@
1
1
  /**
2
2
  * Evaluation matrix runner.
3
3
  *
4
- * Given a set of prompt variants and benchmark cases, produces a matrix:
5
- * rows = variants
6
- * columns = cases
7
- * cells = EvaluationResult with multi-dimensional scores
4
+ * rows = variants
5
+ * columns = cases (optionally × models)
6
+ * cells = EvaluationResult with multi-dimensional scores
8
7
  *
9
- * Uses direct HTTP fetch (not the full Agent SDK) so it's fast and works with
10
- * any Anthropic-compatible endpoint (OpenRouter, local proxies, etc.).
8
+ * Repetitions (N) give us a noise floor: the same (variant, case) is run N
9
+ * times and results aggregate to mean + stddev. Without this we can't tell
10
+ * whether 56.7 vs 37.4 is signal or variance.
11
+ *
12
+ * Multi-model runs (models[].length > 1) give us cross-model stddev: a
13
+ * prompt that only works on one generator is fragile.
14
+ *
15
+ * All HTTP calls go through `transport.callModel` so tests can inject a
16
+ * deterministic mock (see prompt-evolution-discrimination.test.ts).
11
17
  */
18
+ import { type JudgeOpts } from "./llm-judge.js";
19
+ import { type CallModel } from "./transport.js";
12
20
  import type { BenchmarkCase, VariantRow, PromptVars } from "./types.js";
13
21
  export interface EvalOpts {
14
- /** Model to run evaluations with. Should be fast/cheap (haiku, flash, etc.) */
22
+ /** Primary generator model (retained for single-model compat). */
15
23
  model: string;
24
+ /** Multiple generator models — enables cross-model scoring. Overrides `model` when ≥2 entries. */
25
+ models?: string[];
16
26
  /** Base URL for the API endpoint */
17
27
  baseUrl?: string;
18
28
  /** Auth token */
@@ -21,6 +31,16 @@ export interface EvalOpts {
21
31
  maxTokens?: number;
22
32
  /** Concurrency for parallel case evaluation */
23
33
  concurrency?: number;
34
+ /** Per-call HTTP timeout. Defaults to 120s — bad endpoints can hang otherwise. */
35
+ timeoutMs?: number;
36
+ /** Repetitions per (variant, case, model). Default 1 — opt-in to 3+ for noise floor. */
37
+ repetitions?: number;
38
+ /** Inject an llm-judge call per case; content dimension is replaced by judge score. */
39
+ judge?: JudgeOpts & {
40
+ topN?: number;
41
+ };
42
+ /** Transport override for tests. */
43
+ callModel?: CallModel;
24
44
  /** Optional callback for progress */
25
45
  onProgress?: (done: number, total: number, caseName: string, variantId: string) => void;
26
46
  }
@@ -1,64 +1,104 @@
1
1
  /**
2
2
  * Evaluation matrix runner.
3
3
  *
4
- * Given a set of prompt variants and benchmark cases, produces a matrix:
5
- * rows = variants
6
- * columns = cases
7
- * cells = EvaluationResult with multi-dimensional scores
4
+ * rows = variants
5
+ * columns = cases (optionally × models)
6
+ * cells = EvaluationResult with multi-dimensional scores
8
7
  *
9
- * Uses direct HTTP fetch (not the full Agent SDK) so it's fast and works with
10
- * any Anthropic-compatible endpoint (OpenRouter, local proxies, etc.).
8
+ * Repetitions (N) give us a noise floor: the same (variant, case) is run N
9
+ * times and results aggregate to mean + stddev. Without this we can't tell
10
+ * whether 56.7 vs 37.4 is signal or variance.
11
+ *
12
+ * Multi-model runs (models[].length > 1) give us cross-model stddev: a
13
+ * prompt that only works on one generator is fragile.
14
+ *
15
+ * All HTTP calls go through `transport.callModel` so tests can inject a
16
+ * deterministic mock (see prompt-evolution-discrimination.test.ts).
11
17
  */
12
18
  import { renderPrompt } from "../prompts/load.js";
13
- import { scoreOutput, gmean } from "./scorer.js";
19
+ import { scoreOutput, gmean, aggregateReps } from "./scorer.js";
20
+ import { judgeOutput } from "./llm-judge.js";
21
+ import { defaultCallModel, attemptJsonParse, } from "./transport.js";
14
22
  export async function buildMatrix(variants, cases, opts) {
23
+ const models = opts.models && opts.models.length > 0 ? opts.models : [opts.model];
24
+ const reps = Math.max(1, opts.repetitions ?? 1);
25
+ const concurrency = opts.concurrency ?? 8;
26
+ const transport = opts.callModel ?? defaultCallModel;
27
+ // Build the full job list: (variant × case × model × rep).
15
28
  const jobs = [];
16
29
  for (const v of variants) {
17
30
  for (const c of cases) {
18
- jobs.push({ case: c, variantId: v.id, text: v.text, systemText: c.systemPrompt });
31
+ for (const model of models) {
32
+ for (let r = 0; r < reps; r++) {
33
+ jobs.push({ case: c, variantId: v.id, text: v.text, systemText: c.systemPrompt, model, rep: r });
34
+ }
35
+ }
19
36
  }
20
37
  }
21
- const concurrency = opts.concurrency ?? 4;
22
- const results = new Map();
38
+ // Work-stealing pool: keep `concurrency` jobs in flight at all times so a
39
+ // slow call (Kimi at 4 min/call is typical) doesn't block the others in its
40
+ // slice. Previous batch-and-wait loop serialized the slowest job in every
41
+ // window of `concurrency`.
42
+ const rawByKey = new Map();
23
43
  let done = 0;
24
- // Process in batches
25
- for (let i = 0; i < jobs.length; i += concurrency) {
26
- const batch = jobs.slice(i, i + concurrency);
27
- const batchResults = await Promise.all(batch.map((job) => runSingle(job, opts)));
28
- for (const r of batchResults) {
29
- results.set(`${r.variantId}:${r.caseHash}`, r);
44
+ let next = 0;
45
+ const worker = async () => {
46
+ while (true) {
47
+ const i = next++;
48
+ if (i >= jobs.length)
49
+ return;
50
+ const r = await runSingle(jobs[i], opts, transport);
51
+ const key = `${r.variantId}:${r.caseHash}:${r.model ?? ""}`;
52
+ const arr = rawByKey.get(key) ?? [];
53
+ arr.push(r);
54
+ rawByKey.set(key, arr);
30
55
  done++;
31
56
  opts.onProgress?.(done, jobs.length, r.caseName, r.variantId);
32
57
  }
58
+ };
59
+ await Promise.all(Array.from({ length: Math.min(concurrency, jobs.length) }, worker));
60
+ // Collapse reps: one aggregated EvaluationResult per (variant, case, model).
61
+ const aggregated = new Map();
62
+ for (const [key, runs] of rawByKey) {
63
+ aggregated.set(key, collapseReps(runs));
33
64
  }
34
- // Assemble rows
65
+ // Optional llm-judge pass on top-N variants (by current heuristic content).
66
+ if (opts.judge)
67
+ await runJudge(variants, cases, models, aggregated, opts.judge);
68
+ // Assemble rows: per-variant aggregate across all cases and models.
35
69
  const rows = [];
36
70
  for (const v of variants) {
37
71
  const rowResults = new Map();
38
- let parseSum = 0;
39
- let schemaSum = 0;
40
- let contentSum = 0;
41
- let costSum = 0;
42
- let speedSum = 0;
43
- for (const c of cases) {
44
- const r = results.get(`${v.id}:${c.hash}`);
45
- if (!r)
46
- continue;
47
- rowResults.set(c.hash, r);
48
- parseSum += r.scores.parse;
49
- schemaSum += r.scores.schema;
50
- contentSum += r.scores.content;
51
- costSum += r.scores.costEfficiency;
52
- speedSum += r.scores.speed;
72
+ const perModel = {};
73
+ const modelGmeans = [];
74
+ let parseFailures = 0;
75
+ for (const model of models) {
76
+ const modelScores = [];
77
+ for (const c of cases) {
78
+ const key = `${v.id}:${c.hash}:${model}`;
79
+ const r = aggregated.get(key);
80
+ if (!r)
81
+ continue;
82
+ rowResults.set(models.length > 1 ? `${c.hash}:${model}` : c.hash, r);
83
+ modelScores.push(r.scores);
84
+ if (r.scores.parse < 0.5)
85
+ parseFailures++;
86
+ }
87
+ if (modelScores.length > 0) {
88
+ const modelAgg = averageDimensions(modelScores);
89
+ perModel[model] = modelAgg;
90
+ modelGmeans.push(gmean(modelAgg));
91
+ }
92
+ }
93
+ const allScores = [...rowResults.values()].map((r) => r.scores);
94
+ const aggregate = averageDimensions(allScores);
95
+ const g = gmean(aggregate);
96
+ let crossModelStddev;
97
+ if (modelGmeans.length > 1) {
98
+ const m = modelGmeans.reduce((a, b) => a + b, 0) / modelGmeans.length;
99
+ const variance = modelGmeans.reduce((a, b) => a + (b - m) ** 2, 0) / modelGmeans.length;
100
+ crossModelStddev = Math.sqrt(variance);
53
101
  }
54
- const n = cases.length;
55
- const aggregate = {
56
- parse: parseSum / n,
57
- schema: schemaSum / n,
58
- content: contentSum / n,
59
- costEfficiency: costSum / n,
60
- speed: speedSum / n,
61
- };
62
102
  rows.push({
63
103
  variantId: v.id,
64
104
  promptPath: v.promptPath,
@@ -66,127 +106,129 @@ export async function buildMatrix(variants, cases, opts) {
66
106
  text: v.text,
67
107
  results: rowResults,
68
108
  aggregate,
69
- gmean: gmean(aggregate),
109
+ gmean: g,
110
+ crossModelStddev,
111
+ perModel: models.length > 1 ? perModel : undefined,
112
+ parseFailures,
70
113
  });
71
114
  }
72
115
  return rows;
73
116
  }
74
- async function runSingle(job, opts) {
117
+ async function runSingle(job, opts, transport) {
75
118
  const started = Date.now();
76
- const baseUrl = (opts.baseUrl ?? process.env.ANTHROPIC_BASE_URL ?? "https://api.anthropic.com").replace(/\/$/, "");
77
- const authToken = opts.authToken ?? process.env.ANTHROPIC_AUTH_TOKEN ?? process.env.ANTHROPIC_API_KEY ?? "";
78
- const isAnthropic = /^https?:\/\/(api\.)?anthropic\.com/i.test(baseUrl);
79
- const isKimi = /kimi\.com/i.test(baseUrl);
80
- let body;
81
- let endpoint;
82
- let headers = {
83
- "Content-Type": "application/json",
84
- "Authorization": `Bearer ${authToken}`,
119
+ const callOpts = {
120
+ model: job.model,
121
+ baseUrl: opts.baseUrl,
122
+ authToken: opts.authToken,
123
+ maxTokens: opts.maxTokens,
124
+ timeoutMs: opts.timeoutMs,
85
125
  };
86
- if (isKimi)
87
- headers["User-Agent"] = "Kilo-Code/1.0";
88
- if (isAnthropic) {
89
- // Anthropic native format
90
- endpoint = `${baseUrl}/v1/messages`;
91
- headers["anthropic-version"] = "2023-06-01";
92
- const messages = [{ role: "user", content: job.text }];
93
- const payload = {
94
- model: opts.model,
95
- max_tokens: opts.maxTokens ?? 4096,
96
- messages,
97
- };
98
- if (job.systemText)
99
- payload.system = job.systemText;
100
- body = JSON.stringify(payload);
101
- }
102
- else {
103
- // OpenAI-compatible format (OpenRouter, local proxies, etc.)
104
- endpoint = `${baseUrl}/v1/chat/completions`;
105
- const messages = [];
106
- if (job.systemText) {
107
- messages.push({ role: "system", content: job.systemText });
108
- }
109
- messages.push({ role: "user", content: job.text });
110
- body = JSON.stringify({
111
- model: opts.model,
112
- max_tokens: opts.maxTokens ?? 4096,
113
- messages,
114
- });
115
- }
116
- let raw = "";
117
- let costUsd = 0;
118
126
  try {
119
- const res = await fetch(endpoint, {
120
- method: "POST",
121
- headers,
122
- body,
123
- });
124
- if (!res.ok) {
125
- const errText = await res.text().catch(() => "");
126
- return makeErrorResult(job, errText, 0, Date.now() - started);
127
- }
128
- let inp = 0;
129
- let out = 0;
130
- if (isAnthropic) {
131
- const data = await res.json();
132
- raw = data.content?.map((c) => c.text ?? "").join("") ?? "";
133
- inp = data.usage?.input_tokens ?? 0;
134
- out = data.usage?.output_tokens ?? 0;
135
- }
136
- else {
137
- const data = await res.json();
138
- raw = data.choices?.[0]?.message?.content ?? "";
139
- inp = data.usage?.prompt_tokens ?? 0;
140
- out = data.usage?.completion_tokens ?? 0;
141
- }
142
- // Rough cost estimate: varies by model. Using claude-3-haiku as baseline.
143
- costUsd = inp * 0.000003 + out * 0.000015;
127
+ const { raw, costUsd } = await transport(job.text, job.systemText, callOpts);
128
+ const durationMs = Date.now() - started;
129
+ const parsed = attemptJsonParse(raw);
130
+ const scored = scoreOutput(raw, parsed, costUsd, durationMs, job.case, { model: job.model });
131
+ scored.variantId = job.variantId;
132
+ return scored;
144
133
  }
145
134
  catch (err) {
146
135
  const msg = err instanceof Error ? err.message : String(err);
147
- return makeErrorResult(job, msg, 0, Date.now() - started);
136
+ const durationMs = Date.now() - started;
137
+ return {
138
+ caseHash: job.case.hash,
139
+ caseName: job.case.name,
140
+ variantId: job.variantId,
141
+ promptPath: job.case.promptPath,
142
+ rawOutput: msg,
143
+ parsedOutput: null,
144
+ costUsd: 0,
145
+ durationMs,
146
+ scores: { parse: 0, schema: 0, content: 0, costEfficiency: 0, speed: 0 },
147
+ notes: [`HTTP/fetch error: ${msg.slice(0, 200)}`],
148
+ model: job.model,
149
+ };
148
150
  }
149
- const durationMs = Date.now() - started;
150
- const parsed = attemptJsonParse(raw);
151
- const scored = scoreOutput(raw, parsed, costUsd, durationMs, job.case);
152
- scored.variantId = job.variantId;
153
- return scored;
154
151
  }
155
- function attemptJsonParse(text) {
156
- // Strip markdown fences and trailing noise
157
- const cleaned = text
158
- .replace(/^```(?:json)?\s*\n?/i, "")
159
- .replace(/\n?```\s*$/i, "")
160
- .trim();
161
- try {
162
- return JSON.parse(cleaned);
163
- }
164
- catch {
165
- // Try to find the first {…} block
166
- const m = cleaned.match(/\{[\s\S]*\}/);
167
- if (m) {
168
- try {
169
- return JSON.parse(m[0]);
152
+ /** Collapse N repetitions into a single EvaluationResult carrying mean + stddev. */
153
+ function collapseReps(runs) {
154
+ if (runs.length === 1)
155
+ return runs[0];
156
+ const { mean, stddev } = aggregateReps(runs);
157
+ // Pick the median-quality run as the "representative" raw output, so the
158
+ // report shows a realistic sample rather than the best or worst rep.
159
+ const sorted = [...runs].sort((a, b) => gmean(a.scores) - gmean(b.scores));
160
+ const mid = sorted[Math.floor(sorted.length / 2)];
161
+ return {
162
+ ...mid,
163
+ scores: mean,
164
+ stddev,
165
+ reps: runs.length,
166
+ };
167
+ }
168
+ async function runJudge(variants, cases, models, aggregated, judge) {
169
+ // Judge only the top-N variants to cap cost: a judge call per
170
+ // (variant, case, model) on a large population blows up fast.
171
+ const topN = judge.topN ?? 4;
172
+ const variantGmeans = variants.map((v) => {
173
+ const scores = [];
174
+ for (const c of cases) {
175
+ for (const model of models) {
176
+ const r = aggregated.get(`${v.id}:${c.hash}:${model}`);
177
+ if (r)
178
+ scores.push(r.scores);
170
179
  }
171
- catch {
172
- return null;
180
+ }
181
+ return { id: v.id, g: scores.length > 0 ? gmean(averageDimensions(scores)) : 0 };
182
+ });
183
+ variantGmeans.sort((a, b) => b.g - a.g);
184
+ const eligible = new Set(variantGmeans.slice(0, topN).map((x) => x.id));
185
+ const jobs = [];
186
+ for (const v of variants) {
187
+ if (!eligible.has(v.id))
188
+ continue;
189
+ for (const c of cases) {
190
+ for (const model of models) {
191
+ const key = `${v.id}:${c.hash}:${model}`;
192
+ const r = aggregated.get(key);
193
+ if (!r || r.scores.parse < 0.5)
194
+ continue; // no point judging unparseable output
195
+ jobs.push(async () => {
196
+ try {
197
+ const jr = await judgeOutput(r.rawOutput, c, judge);
198
+ r.scores = { ...r.scores, content: jr.score };
199
+ r.judgeJustification = jr.justification;
200
+ }
201
+ catch {
202
+ // Judge failure is non-fatal — keep heuristic content.
203
+ }
204
+ });
173
205
  }
174
206
  }
175
- return null;
176
207
  }
208
+ // Work-stealing pool for judge calls — modest concurrency to stay under
209
+ // provider rate limits, but no slice-blocking.
210
+ const judgeConcurrency = 3;
211
+ let nextJob = 0;
212
+ const judgeWorker = async () => {
213
+ while (true) {
214
+ const i = nextJob++;
215
+ if (i >= jobs.length)
216
+ return;
217
+ await jobs[i]();
218
+ }
219
+ };
220
+ await Promise.all(Array.from({ length: Math.min(judgeConcurrency, jobs.length) }, judgeWorker));
177
221
  }
178
- function makeErrorResult(job, error, costUsd, durationMs) {
222
+ function averageDimensions(scores) {
223
+ if (scores.length === 0)
224
+ return { parse: 0, schema: 0, content: 0, costEfficiency: 0, speed: 0 };
225
+ const n = scores.length;
179
226
  return {
180
- caseHash: job.case.hash,
181
- caseName: job.case.name,
182
- variantId: job.variantId,
183
- promptPath: job.case.promptPath,
184
- rawOutput: error,
185
- parsedOutput: null,
186
- costUsd,
187
- durationMs,
188
- scores: { parse: 0, schema: 0, content: 0, costEfficiency: 0, speed: 0 },
189
- notes: [`HTTP/fetch error: ${error.slice(0, 200)}`],
227
+ parse: scores.reduce((a, b) => a + b.parse, 0) / n,
228
+ schema: scores.reduce((a, b) => a + b.schema, 0) / n,
229
+ content: scores.reduce((a, b) => a + b.content, 0) / n,
230
+ costEfficiency: scores.reduce((a, b) => a + b.costEfficiency, 0) / n,
231
+ speed: scores.reduce((a, b) => a + b.speed, 0) / n,
190
232
  };
191
233
  }
192
234
  /** Render a prompt variant given its source path and optional variant name */
@@ -0,0 +1,30 @@
1
+ /**
2
+ * Harvest real objectives from past claude-overnight runs to build
3
+ * benchmark cases from ground truth instead of synthetic ones.
4
+ *
5
+ * Source: <cwd>/.claude-overnight/runs/<runId>/
6
+ * - goal.md — the original objective the user ran with
7
+ * - state.json — RunState: phase ("done"/"capped"/"stopped"), accCompleted, budget
8
+ *
9
+ * Coarse fitness signal: `state.phase === "done"` and accCompleted/budget
10
+ * close to 1 means the user kept running to completion — the plan was
11
+ * actionable. Cases with "stopped" phase are likely broken plans.
12
+ *
13
+ * We do NOT pretend to have a per-case ground-truth plan. The harvested
14
+ * cases are meant to be scored with the llm-judge: real objective + a
15
+ * heuristic that the run actually finished.
16
+ */
17
+ import type { BenchmarkCase } from "../types.js";
18
+ export interface HarvestOpts {
19
+ /** Repo root — harvest looks under <cwd>/.claude-overnight/runs/ */
20
+ cwd: string;
21
+ /** Which promptPath to target in the generated cases. */
22
+ promptPath: string;
23
+ /** Variant to attach to every harvested case. Default: STANDARD. */
24
+ variant?: string;
25
+ /** Max cases to return (newest first). */
26
+ limit?: number;
27
+ /** Only include runs whose phase matches — default ["done"] (successful runs). */
28
+ phaseAllowlist?: Array<"done" | "capped" | "stopped" | "planning">;
29
+ }
30
+ export declare function harvestRealCases(opts: HarvestOpts): BenchmarkCase[];