claude-overnight 1.54.0 → 1.55.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,6 +17,7 @@
17
17
  */
18
18
  import { evolvePrompt } from "../prompt-evolution/index.js";
19
19
  import { PLAN_CASES } from "../prompt-evolution/fixtures/plan-cases.js";
20
+ import { harvestRealCases } from "../prompt-evolution/fixtures/harvest.js";
20
21
  import { scenariosToCases, PLANNING_SCENARIOS, REVIEW_SCENARIOS, SUPERVISION_SCENARIOS, STUCK_SCENARIOS, hydrateCases, extractPrompt, } from "../prompt-evolution/adapters/mcp-browser.js";
21
22
  function help() {
22
23
  process.stdout.write(`Usage: claude-overnight-evolve [options]
@@ -27,12 +28,19 @@ Options:
27
28
  --prompt-kind <kind> MCP-browser prompt kind: planning | review | evolution |
28
29
  goal-refinement | plan-supervision | simple-supervision | stuck-analysis
29
30
  --eval-model <model> Fast model for evaluation (default: claude-haiku-4-5)
31
+ --eval-models <list> Comma-separated list to run cross-model (overrides --eval-model)
30
32
  --mutate-model <model> Smarter model for mutation (defaults to eval-model)
31
33
  --generations <n> Number of evolution generations (default: 10)
32
34
  --population <n> Max population size (default: 8)
33
35
  --plateau <n> Stop early if no improvement for N generations (default: 3)
36
+ --reps <n> Repetitions per (variant, case, model) for noise floor (default: 1)
37
+ --judge Use llm-judge for content scoring (costs extra API calls)
38
+ --judge-model <model> Model to use for the judge (default: same as eval-model)
39
+ --judge-top-n <n> Judge only the top-N variants per generation (default: 4)
34
40
  --cases <suite> Benchmark suite: plan | mcp-planning | mcp-review |
35
41
  mcp-supervision | mcp-stuck (default: plan)
42
+ --harvest Append cases harvested from <cwd>/.claude-overnight/runs/*
43
+ --harvest-limit <n> Max harvested cases (default: 10)
36
44
  --base-url <url> API base URL override
37
45
  --auth-token <token> Auth token override
38
46
  --run-id <id> Preset run id (default: auto-generated)
@@ -52,7 +60,12 @@ function parseArgs() {
52
60
  generations: 10,
53
61
  population: 8,
54
62
  plateau: 3,
63
+ reps: 1,
64
+ useJudge: false,
65
+ judgeTopN: 4,
55
66
  cases: "",
67
+ harvest: false,
68
+ harvestLimit: 10,
56
69
  baseUrl: process.env.ANTHROPIC_BASE_URL,
57
70
  authToken: process.env.ANTHROPIC_AUTH_TOKEN ?? process.env.ANTHROPIC_API_KEY,
58
71
  };
@@ -75,6 +88,10 @@ function parseArgs() {
75
88
  opts.evalModel = v;
76
89
  i++;
77
90
  break;
91
+ case "--eval-models":
92
+ opts.evalModels = v.split(",").map((s) => s.trim()).filter(Boolean);
93
+ i++;
94
+ break;
78
95
  case "--mutate-model":
79
96
  opts.mutateModel = v;
80
97
  i++;
@@ -91,10 +108,32 @@ function parseArgs() {
91
108
  opts.plateau = parseInt(v, 10);
92
109
  i++;
93
110
  break;
111
+ case "--reps":
112
+ opts.reps = parseInt(v, 10);
113
+ i++;
114
+ break;
115
+ case "--judge":
116
+ opts.useJudge = true;
117
+ break;
118
+ case "--judge-model":
119
+ opts.judgeModel = v;
120
+ i++;
121
+ break;
122
+ case "--judge-top-n":
123
+ opts.judgeTopN = parseInt(v, 10);
124
+ i++;
125
+ break;
94
126
  case "--cases":
95
127
  opts.cases = v;
96
128
  i++;
97
129
  break;
130
+ case "--harvest":
131
+ opts.harvest = true;
132
+ break;
133
+ case "--harvest-limit":
134
+ opts.harvestLimit = parseInt(v, 10);
135
+ i++;
136
+ break;
98
137
  case "--base-url":
99
138
  opts.baseUrl = v;
100
139
  i++;
@@ -138,9 +177,23 @@ async function main() {
138
177
  }
139
178
  else {
140
179
  if (opts.cases === "plan")
141
- cases = PLAN_CASES;
180
+ cases = [...PLAN_CASES];
142
181
  else
143
182
  throw new Error(`Unknown case suite: ${opts.cases}`);
183
+ if (opts.harvest) {
184
+ const harvested = harvestRealCases({
185
+ cwd: process.cwd(),
186
+ promptPath,
187
+ limit: opts.harvestLimit,
188
+ });
189
+ if (harvested.length === 0) {
190
+ console.log(` (harvest: no runs found under <cwd>/.claude-overnight/runs)`);
191
+ }
192
+ else {
193
+ console.log(` (harvest: +${harvested.length} real objectives)`);
194
+ cases = cases.concat(harvested);
195
+ }
196
+ }
144
197
  }
145
198
  console.log(`Evolution config:`);
146
199
  console.log(` target: ${opts.target}`);
@@ -156,10 +209,20 @@ async function main() {
156
209
  promptPath,
157
210
  cases,
158
211
  evalModel: opts.evalModel,
212
+ evalModels: opts.evalModels,
159
213
  mutateModel: opts.mutateModel,
160
214
  generations: opts.generations,
161
215
  populationCap: opts.population,
162
216
  plateauGenerations: opts.plateau,
217
+ repetitions: opts.reps > 1 ? opts.reps : undefined,
218
+ judge: opts.useJudge
219
+ ? {
220
+ model: opts.judgeModel ?? opts.evalModel,
221
+ baseUrl: opts.baseUrl,
222
+ authToken: opts.authToken,
223
+ topN: opts.judgeTopN,
224
+ }
225
+ : undefined,
163
226
  baseUrl: opts.baseUrl,
164
227
  authToken: opts.authToken,
165
228
  seedText,
@@ -1 +1 @@
1
- export declare const VERSION = "1.53.0";
1
+ export declare const VERSION = "1.55.1";
@@ -1,2 +1,2 @@
1
1
  // Auto-generated by build — do not edit manually.
2
- export const VERSION = "1.53.0";
2
+ export const VERSION = "1.55.1";
@@ -1,18 +1,28 @@
1
1
  /**
2
2
  * Evaluation matrix runner.
3
3
  *
4
- * Given a set of prompt variants and benchmark cases, produces a matrix:
5
- * rows = variants
6
- * columns = cases
7
- * cells = EvaluationResult with multi-dimensional scores
4
+ * rows = variants
5
+ * columns = cases (optionally × models)
6
+ * cells = EvaluationResult with multi-dimensional scores
8
7
  *
9
- * Uses direct HTTP fetch (not the full Agent SDK) so it's fast and works with
10
- * any Anthropic-compatible endpoint (OpenRouter, local proxies, etc.).
8
+ * Repetitions (N) give us a noise floor: the same (variant, case) is run N
9
+ * times and results aggregate to mean + stddev. Without this we can't tell
10
+ * whether 56.7 vs 37.4 is signal or variance.
11
+ *
12
+ * Multi-model runs (models[].length > 1) give us cross-model stddev: a
13
+ * prompt that only works on one generator is fragile.
14
+ *
15
+ * All HTTP calls go through `transport.callModel` so tests can inject a
16
+ * deterministic mock (see prompt-evolution-discrimination.test.ts).
11
17
  */
18
+ import { type JudgeOpts } from "./llm-judge.js";
19
+ import { type CallModel } from "./transport.js";
12
20
  import type { BenchmarkCase, VariantRow, PromptVars } from "./types.js";
13
21
  export interface EvalOpts {
14
- /** Model to run evaluations with. Should be fast/cheap (haiku, flash, etc.) */
22
+ /** Primary generator model (retained for single-model compat). */
15
23
  model: string;
24
+ /** Multiple generator models. Overrides `model` whenever the list is non-empty; cross-model scoring (stddev across models) needs ≥2 entries. */
25
+ models?: string[];
16
26
  /** Base URL for the API endpoint */
17
27
  baseUrl?: string;
18
28
  /** Auth token */
@@ -21,6 +31,16 @@ export interface EvalOpts {
21
31
  maxTokens?: number;
22
32
  /** Concurrency for parallel case evaluation */
23
33
  concurrency?: number;
34
+ /** Per-call HTTP timeout. Defaults to 120s — bad endpoints can hang otherwise. */
35
+ timeoutMs?: number;
36
+ /** Repetitions per (variant, case, model). Default 1 — opt-in to 3+ for noise floor. */
37
+ repetitions?: number;
38
+ /** Inject an llm-judge call per case; content dimension is replaced by judge score. */
39
+ judge?: JudgeOpts & {
40
+ topN?: number;
41
+ };
42
+ /** Transport override for tests. */
43
+ callModel?: CallModel;
24
44
  /** Optional callback for progress */
25
45
  onProgress?: (done: number, total: number, caseName: string, variantId: string) => void;
26
46
  }
@@ -1,64 +1,97 @@
1
1
  /**
2
2
  * Evaluation matrix runner.
3
3
  *
4
- * Given a set of prompt variants and benchmark cases, produces a matrix:
5
- * rows = variants
6
- * columns = cases
7
- * cells = EvaluationResult with multi-dimensional scores
4
+ * rows = variants
5
+ * columns = cases (optionally × models)
6
+ * cells = EvaluationResult with multi-dimensional scores
8
7
  *
9
- * Uses direct HTTP fetch (not the full Agent SDK) so it's fast and works with
10
- * any Anthropic-compatible endpoint (OpenRouter, local proxies, etc.).
8
+ * Repetitions (N) give us a noise floor: the same (variant, case) is run N
9
+ * times and results aggregate to mean + stddev. Without this we can't tell
10
+ * whether 56.7 vs 37.4 is signal or variance.
11
+ *
12
+ * Multi-model runs (models[].length > 1) give us cross-model stddev: a
13
+ * prompt that only works on one generator is fragile.
14
+ *
15
+ * All HTTP calls go through `transport.callModel` so tests can inject a
16
+ * deterministic mock (see prompt-evolution-discrimination.test.ts).
11
17
  */
12
18
  import { renderPrompt } from "../prompts/load.js";
13
- import { scoreOutput, gmean } from "./scorer.js";
19
+ import { scoreOutput, gmean, aggregateReps } from "./scorer.js";
20
+ import { judgeOutput } from "./llm-judge.js";
21
+ import { defaultCallModel, attemptJsonParse, } from "./transport.js";
14
22
  export async function buildMatrix(variants, cases, opts) {
23
+ const models = opts.models && opts.models.length > 0 ? opts.models : [opts.model];
24
+ const reps = Math.max(1, opts.repetitions ?? 1);
25
+ const concurrency = opts.concurrency ?? 4;
26
+ const transport = opts.callModel ?? defaultCallModel;
27
+ // Build the full job list: (variant × case × model × rep).
15
28
  const jobs = [];
16
29
  for (const v of variants) {
17
30
  for (const c of cases) {
18
- jobs.push({ case: c, variantId: v.id, text: v.text, systemText: c.systemPrompt });
31
+ for (const model of models) {
32
+ for (let r = 0; r < reps; r++) {
33
+ jobs.push({ case: c, variantId: v.id, text: v.text, systemText: c.systemPrompt, model, rep: r });
34
+ }
35
+ }
19
36
  }
20
37
  }
21
- const concurrency = opts.concurrency ?? 4;
22
- const results = new Map();
38
+ // Raw results, keyed by variant:case:model, each an array of per-rep results.
39
+ const rawByKey = new Map();
23
40
  let done = 0;
24
- // Process in batches
25
41
  for (let i = 0; i < jobs.length; i += concurrency) {
26
42
  const batch = jobs.slice(i, i + concurrency);
27
- const batchResults = await Promise.all(batch.map((job) => runSingle(job, opts)));
43
+ const batchResults = await Promise.all(batch.map((job) => runSingle(job, opts, transport)));
28
44
  for (const r of batchResults) {
29
- results.set(`${r.variantId}:${r.caseHash}`, r);
45
+ const key = `${r.variantId}:${r.caseHash}:${r.model ?? ""}`;
46
+ const arr = rawByKey.get(key) ?? [];
47
+ arr.push(r);
48
+ rawByKey.set(key, arr);
30
49
  done++;
31
50
  opts.onProgress?.(done, jobs.length, r.caseName, r.variantId);
32
51
  }
33
52
  }
34
- // Assemble rows
53
+ // Collapse reps: one aggregated EvaluationResult per (variant, case, model).
54
+ const aggregated = new Map();
55
+ for (const [key, runs] of rawByKey) {
56
+ aggregated.set(key, collapseReps(runs));
57
+ }
58
+ // Optional llm-judge pass on the top-N variants (ranked by the gmean of their current heuristic scores).
59
+ if (opts.judge)
60
+ await runJudge(variants, cases, models, aggregated, opts.judge);
61
+ // Assemble rows: per-variant aggregate across all cases and models.
35
62
  const rows = [];
36
63
  for (const v of variants) {
37
64
  const rowResults = new Map();
38
- let parseSum = 0;
39
- let schemaSum = 0;
40
- let contentSum = 0;
41
- let costSum = 0;
42
- let speedSum = 0;
43
- for (const c of cases) {
44
- const r = results.get(`${v.id}:${c.hash}`);
45
- if (!r)
46
- continue;
47
- rowResults.set(c.hash, r);
48
- parseSum += r.scores.parse;
49
- schemaSum += r.scores.schema;
50
- contentSum += r.scores.content;
51
- costSum += r.scores.costEfficiency;
52
- speedSum += r.scores.speed;
65
+ const perModel = {};
66
+ const modelGmeans = [];
67
+ let parseFailures = 0;
68
+ for (const model of models) {
69
+ const modelScores = [];
70
+ for (const c of cases) {
71
+ const key = `${v.id}:${c.hash}:${model}`;
72
+ const r = aggregated.get(key);
73
+ if (!r)
74
+ continue;
75
+ rowResults.set(models.length > 1 ? `${c.hash}:${model}` : c.hash, r);
76
+ modelScores.push(r.scores);
77
+ if (r.scores.parse < 0.5)
78
+ parseFailures++;
79
+ }
80
+ if (modelScores.length > 0) {
81
+ const modelAgg = averageDimensions(modelScores);
82
+ perModel[model] = modelAgg;
83
+ modelGmeans.push(gmean(modelAgg));
84
+ }
85
+ }
86
+ const allScores = [...rowResults.values()].map((r) => r.scores);
87
+ const aggregate = averageDimensions(allScores);
88
+ const g = gmean(aggregate);
89
+ let crossModelStddev;
90
+ if (modelGmeans.length > 1) {
91
+ const m = modelGmeans.reduce((a, b) => a + b, 0) / modelGmeans.length;
92
+ const variance = modelGmeans.reduce((a, b) => a + (b - m) ** 2, 0) / modelGmeans.length;
93
+ crossModelStddev = Math.sqrt(variance);
53
94
  }
54
- const n = cases.length;
55
- const aggregate = {
56
- parse: parseSum / n,
57
- schema: schemaSum / n,
58
- content: contentSum / n,
59
- costEfficiency: costSum / n,
60
- speed: speedSum / n,
61
- };
62
95
  rows.push({
63
96
  variantId: v.id,
64
97
  promptPath: v.promptPath,
@@ -66,127 +99,121 @@ export async function buildMatrix(variants, cases, opts) {
66
99
  text: v.text,
67
100
  results: rowResults,
68
101
  aggregate,
69
- gmean: gmean(aggregate),
102
+ gmean: g,
103
+ crossModelStddev,
104
+ perModel: models.length > 1 ? perModel : undefined,
105
+ parseFailures,
70
106
  });
71
107
  }
72
108
  return rows;
73
109
  }
74
- async function runSingle(job, opts) {
110
+ async function runSingle(job, opts, transport) {
75
111
  const started = Date.now();
76
- const baseUrl = (opts.baseUrl ?? process.env.ANTHROPIC_BASE_URL ?? "https://api.anthropic.com").replace(/\/$/, "");
77
- const authToken = opts.authToken ?? process.env.ANTHROPIC_AUTH_TOKEN ?? process.env.ANTHROPIC_API_KEY ?? "";
78
- const isAnthropic = /^https?:\/\/(api\.)?anthropic\.com/i.test(baseUrl);
79
- const isKimi = /kimi\.com/i.test(baseUrl);
80
- let body;
81
- let endpoint;
82
- let headers = {
83
- "Content-Type": "application/json",
84
- "Authorization": `Bearer ${authToken}`,
112
+ const callOpts = {
113
+ model: job.model,
114
+ baseUrl: opts.baseUrl,
115
+ authToken: opts.authToken,
116
+ maxTokens: opts.maxTokens,
117
+ timeoutMs: opts.timeoutMs,
85
118
  };
86
- if (isKimi)
87
- headers["User-Agent"] = "Kilo-Code/1.0";
88
- if (isAnthropic) {
89
- // Anthropic native format
90
- endpoint = `${baseUrl}/v1/messages`;
91
- headers["anthropic-version"] = "2023-06-01";
92
- const messages = [{ role: "user", content: job.text }];
93
- const payload = {
94
- model: opts.model,
95
- max_tokens: opts.maxTokens ?? 4096,
96
- messages,
97
- };
98
- if (job.systemText)
99
- payload.system = job.systemText;
100
- body = JSON.stringify(payload);
101
- }
102
- else {
103
- // OpenAI-compatible format (OpenRouter, local proxies, etc.)
104
- endpoint = `${baseUrl}/v1/chat/completions`;
105
- const messages = [];
106
- if (job.systemText) {
107
- messages.push({ role: "system", content: job.systemText });
108
- }
109
- messages.push({ role: "user", content: job.text });
110
- body = JSON.stringify({
111
- model: opts.model,
112
- max_tokens: opts.maxTokens ?? 4096,
113
- messages,
114
- });
115
- }
116
- let raw = "";
117
- let costUsd = 0;
118
119
  try {
119
- const res = await fetch(endpoint, {
120
- method: "POST",
121
- headers,
122
- body,
123
- });
124
- if (!res.ok) {
125
- const errText = await res.text().catch(() => "");
126
- return makeErrorResult(job, errText, 0, Date.now() - started);
127
- }
128
- let inp = 0;
129
- let out = 0;
130
- if (isAnthropic) {
131
- const data = await res.json();
132
- raw = data.content?.map((c) => c.text ?? "").join("") ?? "";
133
- inp = data.usage?.input_tokens ?? 0;
134
- out = data.usage?.output_tokens ?? 0;
135
- }
136
- else {
137
- const data = await res.json();
138
- raw = data.choices?.[0]?.message?.content ?? "";
139
- inp = data.usage?.prompt_tokens ?? 0;
140
- out = data.usage?.completion_tokens ?? 0;
141
- }
142
- // Rough cost estimate: varies by model. Using claude-3-haiku as baseline.
143
- costUsd = inp * 0.000003 + out * 0.000015;
120
+ const { raw, costUsd } = await transport(job.text, job.systemText, callOpts);
121
+ const durationMs = Date.now() - started;
122
+ const parsed = attemptJsonParse(raw);
123
+ const scored = scoreOutput(raw, parsed, costUsd, durationMs, job.case, { model: job.model });
124
+ scored.variantId = job.variantId;
125
+ return scored;
144
126
  }
145
127
  catch (err) {
146
128
  const msg = err instanceof Error ? err.message : String(err);
147
- return makeErrorResult(job, msg, 0, Date.now() - started);
129
+ const durationMs = Date.now() - started;
130
+ return {
131
+ caseHash: job.case.hash,
132
+ caseName: job.case.name,
133
+ variantId: job.variantId,
134
+ promptPath: job.case.promptPath,
135
+ rawOutput: msg,
136
+ parsedOutput: null,
137
+ costUsd: 0,
138
+ durationMs,
139
+ scores: { parse: 0, schema: 0, content: 0, costEfficiency: 0, speed: 0 },
140
+ notes: [`HTTP/fetch error: ${msg.slice(0, 200)}`],
141
+ model: job.model,
142
+ };
148
143
  }
149
- const durationMs = Date.now() - started;
150
- const parsed = attemptJsonParse(raw);
151
- const scored = scoreOutput(raw, parsed, costUsd, durationMs, job.case);
152
- scored.variantId = job.variantId;
153
- return scored;
154
144
  }
155
- function attemptJsonParse(text) {
156
- // Strip markdown fences and trailing noise
157
- const cleaned = text
158
- .replace(/^```(?:json)?\s*\n?/i, "")
159
- .replace(/\n?```\s*$/i, "")
160
- .trim();
161
- try {
162
- return JSON.parse(cleaned);
163
- }
164
- catch {
165
- // Try to find the first {…} block
166
- const m = cleaned.match(/\{[\s\S]*\}/);
167
- if (m) {
168
- try {
169
- return JSON.parse(m[0]);
145
+ /** Collapse N repetitions into a single EvaluationResult carrying mean + stddev. */
146
+ function collapseReps(runs) {
147
+ if (runs.length === 1)
148
+ return runs[0];
149
+ const { mean, stddev } = aggregateReps(runs);
150
+ // Pick the median-quality run as the "representative" raw output, so the
151
+ // report shows a realistic sample rather than the best or worst rep.
152
+ const sorted = [...runs].sort((a, b) => gmean(a.scores) - gmean(b.scores));
153
+ const mid = sorted[Math.floor(sorted.length / 2)];
154
+ return {
155
+ ...mid,
156
+ scores: mean,
157
+ stddev,
158
+ reps: runs.length,
159
+ };
160
+ }
161
+ async function runJudge(variants, cases, models, aggregated, judge) {
162
+ // Judge only the top-N variants to cap cost: a judge call per
163
+ // (variant, case, model) on a large population blows up fast.
164
+ const topN = judge.topN ?? 4;
165
+ const variantGmeans = variants.map((v) => {
166
+ const scores = [];
167
+ for (const c of cases) {
168
+ for (const model of models) {
169
+ const r = aggregated.get(`${v.id}:${c.hash}:${model}`);
170
+ if (r)
171
+ scores.push(r.scores);
170
172
  }
171
- catch {
172
- return null;
173
+ }
174
+ return { id: v.id, g: scores.length > 0 ? gmean(averageDimensions(scores)) : 0 };
175
+ });
176
+ variantGmeans.sort((a, b) => b.g - a.g);
177
+ const eligible = new Set(variantGmeans.slice(0, topN).map((x) => x.id));
178
+ const jobs = [];
179
+ for (const v of variants) {
180
+ if (!eligible.has(v.id))
181
+ continue;
182
+ for (const c of cases) {
183
+ for (const model of models) {
184
+ const key = `${v.id}:${c.hash}:${model}`;
185
+ const r = aggregated.get(key);
186
+ if (!r || r.scores.parse < 0.5)
187
+ continue; // no point judging unparseable output
188
+ jobs.push(async () => {
189
+ try {
190
+ const jr = await judgeOutput(r.rawOutput, c, judge);
191
+ r.scores = { ...r.scores, content: jr.score };
192
+ r.judgeJustification = jr.justification;
193
+ }
194
+ catch {
195
+ // Judge failure is non-fatal — keep heuristic content.
196
+ }
197
+ });
173
198
  }
174
199
  }
175
- return null;
200
+ }
201
+ // Run judge calls with modest concurrency to stay under provider rate limits.
202
+ const concurrency = 3;
203
+ for (let i = 0; i < jobs.length; i += concurrency) {
204
+ await Promise.all(jobs.slice(i, i + concurrency).map((fn) => fn()));
176
205
  }
177
206
  }
178
- function makeErrorResult(job, error, costUsd, durationMs) {
207
+ function averageDimensions(scores) {
208
+ if (scores.length === 0)
209
+ return { parse: 0, schema: 0, content: 0, costEfficiency: 0, speed: 0 };
210
+ const n = scores.length;
179
211
  return {
180
- caseHash: job.case.hash,
181
- caseName: job.case.name,
182
- variantId: job.variantId,
183
- promptPath: job.case.promptPath,
184
- rawOutput: error,
185
- parsedOutput: null,
186
- costUsd,
187
- durationMs,
188
- scores: { parse: 0, schema: 0, content: 0, costEfficiency: 0, speed: 0 },
189
- notes: [`HTTP/fetch error: ${error.slice(0, 200)}`],
212
+ parse: scores.reduce((a, b) => a + b.parse, 0) / n,
213
+ schema: scores.reduce((a, b) => a + b.schema, 0) / n,
214
+ content: scores.reduce((a, b) => a + b.content, 0) / n,
215
+ costEfficiency: scores.reduce((a, b) => a + b.costEfficiency, 0) / n,
216
+ speed: scores.reduce((a, b) => a + b.speed, 0) / n,
190
217
  };
191
218
  }
192
219
  /** Render a prompt variant given its source path and optional variant name */
@@ -0,0 +1,30 @@
1
+ /**
2
+ * Harvest real objectives from past claude-overnight runs to build
3
+ * benchmark cases from ground truth instead of synthetic ones.
4
+ *
5
+ * Source: <cwd>/.claude-overnight/runs/<runId>/
6
+ * - goal.md — the original objective the user ran with
7
+ * - state.json — RunState: phase ("done"/"capped"/"stopped"), accCompleted, budget
8
+ *
9
+ * Coarse fitness signal: `state.phase === "done"` and accCompleted/budget
10
+ * close to 1 means the user kept running to completion — the plan was
11
+ * actionable. Cases with "stopped" phase are likely broken plans.
12
+ *
13
+ * We do NOT pretend to have a per-case ground-truth plan. The harvested
14
+ * cases are meant to be scored with the llm-judge: real objective + a
15
+ * heuristic that the run actually finished.
16
+ */
17
+ import type { BenchmarkCase } from "../types.js";
18
+ export interface HarvestOpts {
19
+ /** Repo root — harvest looks under <cwd>/.claude-overnight/runs/ */
20
+ cwd: string;
21
+ /** Which promptPath to target in the generated cases. */
22
+ promptPath: string;
23
+ /** Variant to attach to every harvested case. Default: STANDARD. */
24
+ variant?: string;
25
+ /** Max cases to return (newest first). */
26
+ limit?: number;
27
+ /** Only include runs whose phase matches — default ["done"] (successful runs). Runs whose state.json has no phase are not filtered out. */
28
+ phaseAllowlist?: Array<"done" | "capped" | "stopped" | "planning">;
29
+ }
30
+ export declare function harvestRealCases(opts: HarvestOpts): BenchmarkCase[];
@@ -0,0 +1,88 @@
1
+ /**
2
+ * Harvest real objectives from past claude-overnight runs to build
3
+ * benchmark cases from ground truth instead of synthetic ones.
4
+ *
5
+ * Source: <cwd>/.claude-overnight/runs/<runId>/
6
+ * - goal.md — the original objective the user ran with
7
+ * - state.json — RunState: phase ("done"/"capped"/"stopped"), accCompleted, budget
8
+ *
9
+ * Coarse fitness signal: `state.phase === "done"` and accCompleted/budget
10
+ * close to 1 means the user kept running to completion — the plan was
11
+ * actionable. Cases with "stopped" phase are likely broken plans. Note that this module filters on `state.phase` only; accCompleted is not read here.
12
+ *
13
+ * We do NOT pretend to have a per-case ground-truth plan. The harvested
14
+ * cases are meant to be scored with the llm-judge: real objective + a
15
+ * heuristic that the run actually finished.
16
+ */
17
+ import { readdirSync, readFileSync, existsSync } from "node:fs";
18
+ import { join } from "node:path";
19
+ export function harvestRealCases(opts) {
20
+ const runsDir = join(opts.cwd, ".claude-overnight", "runs");
21
+ if (!existsSync(runsDir))
22
+ return [];
23
+ const allow = new Set(opts.phaseAllowlist ?? ["done"]);
24
+ const limit = opts.limit ?? 10;
25
+ const variant = opts.variant ?? "STANDARD";
26
+ const entries = [];
27
+ for (const id of readdirSync(runsDir)) {
28
+ const runDir = join(runsDir, id);
29
+ const goalPath = join(runDir, "goal.md");
30
+ const statePath = join(runDir, "state.json");
31
+ if (!existsSync(goalPath) || !existsSync(statePath))
32
+ continue;
33
+ try {
34
+ const state = JSON.parse(readFileSync(statePath, "utf-8"));
35
+ if (state.phase && !allow.has(state.phase))
36
+ continue;
37
+ const objective = extractObjective(readFileSync(goalPath, "utf-8"));
38
+ if (!objective)
39
+ continue;
40
+ entries.push({
41
+ id,
42
+ objective,
43
+ budget: typeof state.budget === "number" && state.budget > 0 ? state.budget : 8,
44
+ startedAt: state.startedAt ?? "",
45
+ });
46
+ }
47
+ catch {
48
+ // Skip unreadable runs.
49
+ }
50
+ }
51
+ entries.sort((a, b) => b.startedAt.localeCompare(a.startedAt));
52
+ return entries.slice(0, limit).map((e) => toCase(e, opts.promptPath, variant));
53
+ }
54
+ function extractObjective(goalMd) {
55
+ // goal.md is written as "## Original Objective\n<text>" — grab everything
56
+ // under the first "##" header line (through the end of the file, later sections included), or fall back to the whole file.
57
+ const m = goalMd.match(/##\s+[^\n]*\n([\s\S]+)$/);
58
+ const body = (m ? m[1] : goalMd).trim();
59
+ return body.slice(0, 2000); // keep cases shaped like the synthetic ones
60
+ }
61
+ function toCase(e, promptPath, variant) {
62
+ const c = {
63
+ name: `real:${e.id.slice(0, 12)}`,
64
+ hash: "",
65
+ promptPath,
66
+ variant,
67
+ vars: {
68
+ objective: e.objective,
69
+ budget: e.budget,
70
+ concurrency: Math.min(6, Math.max(2, Math.ceil(e.budget / 2))),
71
+ contextConstraintNote: "Context budget: use the claude-sonnet-4-6 model's context window efficiently.",
72
+ },
73
+ criteria: {
74
+ independentTasks: true,
75
+ specificTasks: false,
76
+ requiredJsonFields: ["tasks"],
77
+ },
78
+ };
79
+ c.hash = hashCase(c);
80
+ return c;
81
+ }
82
+ function hashCase(c) {
83
+ const key = `${c.promptPath}:${c.variant ?? "default"}:${JSON.stringify(c.vars)}`;
84
+ let h = 0;
85
+ for (let i = 0; i < key.length; i++)
86
+ h = ((h << 5) - h + key.charCodeAt(i)) | 0;
87
+ return Math.abs(h).toString(36).slice(0, 8);
88
+ }