npm - claude-overnight - Versions diffs - 1.54.0 → 1.55.1 - Mend

claude-overnight 1.54.0 → 1.55.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

package/dist/bin/evolve.js +64 -1
package/dist/core/_version.d.ts +1 -1
package/dist/core/_version.js +1 -1
package/dist/prompt-evolution/evaluator.d.ts +27 -7
package/dist/prompt-evolution/evaluator.js +167 -140
package/dist/prompt-evolution/fixtures/harvest.d.ts +30 -0
package/dist/prompt-evolution/fixtures/harvest.js +88 -0
package/dist/prompt-evolution/fixtures/plan-cases.d.ts +9 -6
package/dist/prompt-evolution/fixtures/plan-cases.js +72 -23
package/dist/prompt-evolution/index.d.ts +9 -0
package/dist/prompt-evolution/index.js +8 -1
package/dist/prompt-evolution/report.d.ts +8 -6
package/dist/prompt-evolution/report.js +73 -30
package/dist/prompt-evolution/scorer.d.ts +23 -5
package/dist/prompt-evolution/scorer.js +106 -62
package/dist/prompt-evolution/transport.d.ts +28 -0
package/dist/prompt-evolution/transport.js +99 -0
package/dist/prompt-evolution/types.d.ts +15 -5
package/package.json +1 -1
package/plugins/claude-overnight/.claude-plugin/plugin.json +1 -1

package/dist/bin/evolve.js CHANGED Viewed

@@ -17,6 +17,7 @@
  */
 import { evolvePrompt } from "../prompt-evolution/index.js";
 import { PLAN_CASES } from "../prompt-evolution/fixtures/plan-cases.js";
+import { harvestRealCases } from "../prompt-evolution/fixtures/harvest.js";
 import { scenariosToCases, PLANNING_SCENARIOS, REVIEW_SCENARIOS, SUPERVISION_SCENARIOS, STUCK_SCENARIOS, hydrateCases, extractPrompt, } from "../prompt-evolution/adapters/mcp-browser.js";
 function help() {
     process.stdout.write(`Usage: claude-overnight-evolve [options]
@@ -27,12 +28,19 @@ Options:
   --prompt-kind <kind>    MCP-browser prompt kind: planning | review | evolution |
                           goal-refinement | plan-supervision | simple-supervision | stuck-analysis
   --eval-model <model>    Fast model for evaluation (default: claude-haiku-4-5)
+  --eval-models <list>    Comma-separated list to run cross-model (overrides --eval-model)
   --mutate-model <model>  Smarter model for mutation (defaults to eval-model)
   --generations <n>       Number of evolution generations (default: 10)
   --population <n>        Max population size (default: 8)
   --plateau <n>           Stop early if no improvement for N generations (default: 3)
+  --reps <n>              Repetitions per (variant, case, model) for noise floor (default: 1)
+  --judge                 Use llm-judge for content scoring (costs extra API calls)
+  --judge-model <model>   Model to use for the judge (default: same as eval-model)
+  --judge-top-n <n>       Judge only the top-N variants per generation (default: 4)
   --cases <suite>         Benchmark suite: plan | mcp-planning | mcp-review |
                           mcp-supervision | mcp-stuck (default: plan)
+  --harvest               Append cases harvested from <cwd>/.claude-overnight/runs/*
+  --harvest-limit <n>     Max harvested cases (default: 10)
   --base-url <url>        API base URL override
   --auth-token <token>    Auth token override
   --run-id <id>           Preset run id (default: auto-generated)
@@ -52,7 +60,12 @@ function parseArgs() {
         generations: 10,
         population: 8,
         plateau: 3,
+        reps: 1,
+        useJudge: false,
+        judgeTopN: 4,
         cases: "",
+        harvest: false,
+        harvestLimit: 10,
         baseUrl: process.env.ANTHROPIC_BASE_URL,
         authToken: process.env.ANTHROPIC_AUTH_TOKEN ?? process.env.ANTHROPIC_API_KEY,
     };
@@ -75,6 +88,10 @@ function parseArgs() {
                 opts.evalModel = v;
                 i++;
                 break;
+            case "--eval-models":
+                opts.evalModels = v.split(",").map((s) => s.trim()).filter(Boolean);
+                i++;
+                break;
             case "--mutate-model":
                 opts.mutateModel = v;
                 i++;
@@ -91,10 +108,32 @@ function parseArgs() {
                 opts.plateau = parseInt(v, 10);
                 i++;
                 break;
+            case "--reps":
+                opts.reps = parseInt(v, 10);
+                i++;
+                break;
+            case "--judge":
+                opts.useJudge = true;
+                break;
+            case "--judge-model":
+                opts.judgeModel = v;
+                i++;
+                break;
+            case "--judge-top-n":
+                opts.judgeTopN = parseInt(v, 10);
+                i++;
+                break;
             case "--cases":
                 opts.cases = v;
                 i++;
                 break;
+            case "--harvest":
+                opts.harvest = true;
+                break;
+            case "--harvest-limit":
+                opts.harvestLimit = parseInt(v, 10);
+                i++;
+                break;
             case "--base-url":
                 opts.baseUrl = v;
                 i++;
@@ -138,9 +177,23 @@ async function main() {
     }
     else {
         if (opts.cases === "plan")
-            cases = PLAN_CASES;
+            cases = [...PLAN_CASES];
         else
             throw new Error(`Unknown case suite: ${opts.cases}`);
+        if (opts.harvest) {
+            const harvested = harvestRealCases({
+                cwd: process.cwd(),
+                promptPath,
+                limit: opts.harvestLimit,
+            });
+            if (harvested.length === 0) {
+                console.log(`  (harvest: no runs found under <cwd>/.claude-overnight/runs)`);
+            }
+            else {
+                console.log(`  (harvest: +${harvested.length} real objectives)`);
+                cases = cases.concat(harvested);
+            }
+        }
     }
     console.log(`Evolution config:`);
     console.log(`  target:      ${opts.target}`);
@@ -156,10 +209,20 @@ async function main() {
         promptPath,
         cases,
         evalModel: opts.evalModel,
+        evalModels: opts.evalModels,
         mutateModel: opts.mutateModel,
         generations: opts.generations,
         populationCap: opts.population,
         plateauGenerations: opts.plateau,
+        repetitions: opts.reps > 1 ? opts.reps : undefined,
+        judge: opts.useJudge
+            ? {
+                model: opts.judgeModel ?? opts.evalModel,
+                baseUrl: opts.baseUrl,
+                authToken: opts.authToken,
+                topN: opts.judgeTopN,
+            }
+            : undefined,
         baseUrl: opts.baseUrl,
         authToken: opts.authToken,
         seedText,

package/dist/core/_version.d.ts CHANGED Viewed

	@@ -1 +1 @@
1	- export declare const VERSION = "1.53.0";
1	+ export declare const VERSION = "1.55.1";

package/dist/core/_version.js CHANGED Viewed

@@ -1,2 +1,2 @@
 // Auto-generated by build — do not edit manually.
-export const VERSION = "1.53.0";
+export const VERSION = "1.55.1";

package/dist/prompt-evolution/evaluator.d.ts CHANGED Viewed

@@ -1,18 +1,28 @@
 /**
  * Evaluation matrix runner.
  *
- * Given a set of prompt variants and benchmark cases, produces a matrix:
- *   rows    = variants
- *   columns = cases
- *   cells   = EvaluationResult with multi-dimensional scores
+ * rows    = variants
+ * columns = cases (optionally × models)
+ * cells   = EvaluationResult with multi-dimensional scores
  *
- * Uses direct HTTP fetch (not the full Agent SDK) so it's fast and works with
- * any Anthropic-compatible endpoint (OpenRouter, local proxies, etc.).
+ * Repetitions (N) give us a noise floor: the same (variant, case) is run N
+ * times and results aggregate to mean + stddev. Without this we can't tell
+ * whether 56.7 vs 37.4 is signal or variance.
+ *
+ * Multi-model runs (models[].length > 1) give us cross-model stddev: a
+ * prompt that only works on one generator is fragile.
+ *
+ * All HTTP calls go through `transport.callModel` so tests can inject a
+ * deterministic mock (see prompt-evolution-discrimination.test.ts).
  */
+import { type JudgeOpts } from "./llm-judge.js";
+import { type CallModel } from "./transport.js";
 import type { BenchmarkCase, VariantRow, PromptVars } from "./types.js";
 export interface EvalOpts {
-    /** Model to run evaluations with. Should be fast/cheap (haiku, flash, etc.) */
+    /** Primary generator model (retained for single-model compat). */
     model: string;
+    /** Multiple generator models. Overrides `model` whenever non-empty; cross-model scoring needs ≥2 entries. */
+    models?: string[];
     /** Base URL for the API endpoint */
     baseUrl?: string;
     /** Auth token */
@@ -21,6 +31,16 @@ export interface EvalOpts {
     maxTokens?: number;
     /** Concurrency for parallel case evaluation */
     concurrency?: number;
+    /** Per-call HTTP timeout. Defaults to 120s — bad endpoints can hang otherwise. */
+    timeoutMs?: number;
+    /** Repetitions per (variant, case, model). Default 1 — opt-in to 3+ for noise floor. */
+    repetitions?: number;
+    /** Inject an llm-judge call per case; content dimension is replaced by judge score. */
+    judge?: JudgeOpts & {
+        topN?: number;
+    };
+    /** Transport override for tests. */
+    callModel?: CallModel;
     /** Optional callback for progress */
     onProgress?: (done: number, total: number, caseName: string, variantId: string) => void;
 }

package/dist/prompt-evolution/evaluator.js CHANGED Viewed

@@ -1,64 +1,97 @@
 /**
  * Evaluation matrix runner.
  *
- * Given a set of prompt variants and benchmark cases, produces a matrix:
- *   rows    = variants
- *   columns = cases
- *   cells   = EvaluationResult with multi-dimensional scores
+ * rows    = variants
+ * columns = cases (optionally × models)
+ * cells   = EvaluationResult with multi-dimensional scores
  *
- * Uses direct HTTP fetch (not the full Agent SDK) so it's fast and works with
- * any Anthropic-compatible endpoint (OpenRouter, local proxies, etc.).
+ * Repetitions (N) give us a noise floor: the same (variant, case) is run N
+ * times and results aggregate to mean + stddev. Without this we can't tell
+ * whether 56.7 vs 37.4 is signal or variance.
+ *
+ * Multi-model runs (models[].length > 1) give us cross-model stddev: a
+ * prompt that only works on one generator is fragile.
+ *
+ * All HTTP calls go through `transport.callModel` so tests can inject a
+ * deterministic mock (see prompt-evolution-discrimination.test.ts).
  */
 import { renderPrompt } from "../prompts/load.js";
-import { scoreOutput, gmean } from "./scorer.js";
+import { scoreOutput, gmean, aggregateReps } from "./scorer.js";
+import { judgeOutput } from "./llm-judge.js";
+import { defaultCallModel, attemptJsonParse, } from "./transport.js";
 export async function buildMatrix(variants, cases, opts) {
+    const models = opts.models && opts.models.length > 0 ? opts.models : [opts.model];
+    const reps = Math.max(1, opts.repetitions ?? 1);
+    const concurrency = opts.concurrency ?? 4;
+    const transport = opts.callModel ?? defaultCallModel;
+    // Build the full job list: (variant × case × model × rep).
     const jobs = [];
     for (const v of variants) {
         for (const c of cases) {
-            jobs.push({ case: c, variantId: v.id, text: v.text, systemText: c.systemPrompt });
+            for (const model of models) {
+                for (let r = 0; r < reps; r++) {
+                    jobs.push({ case: c, variantId: v.id, text: v.text, systemText: c.systemPrompt, model, rep: r });
+                }
+            }
         }
     }
-    const concurrency = opts.concurrency ?? 4;
-    const results = new Map();
+    // Raw results, keyed by variant:case:model, each an array of per-rep results.
+    const rawByKey = new Map();
     let done = 0;
-    // Process in batches
     for (let i = 0; i < jobs.length; i += concurrency) {
         const batch = jobs.slice(i, i + concurrency);
-        const batchResults = await Promise.all(batch.map((job) => runSingle(job, opts)));
+        const batchResults = await Promise.all(batch.map((job) => runSingle(job, opts, transport)));
         for (const r of batchResults) {
-            results.set(`${r.variantId}:${r.caseHash}`, r);
+            const key = `${r.variantId}:${r.caseHash}:${r.model ?? ""}`;
+            const arr = rawByKey.get(key) ?? [];
+            arr.push(r);
+            rawByKey.set(key, arr);
             done++;
             opts.onProgress?.(done, jobs.length, r.caseName, r.variantId);
         }
     }
-    // Assemble rows
+    // Collapse reps: one aggregated EvaluationResult per (variant, case, model).
+    const aggregated = new Map();
+    for (const [key, runs] of rawByKey) {
+        aggregated.set(key, collapseReps(runs));
+    }
+    // Optional llm-judge pass on top-N variants (by current heuristic content).
+    if (opts.judge)
+        await runJudge(variants, cases, models, aggregated, opts.judge);
+    // Assemble rows: per-variant aggregate across all cases and models.
     const rows = [];
     for (const v of variants) {
         const rowResults = new Map();
-        let parseSum = 0;
-        let schemaSum = 0;
-        let contentSum = 0;
-        let costSum = 0;
-        let speedSum = 0;
-        for (const c of cases) {
-            const r = results.get(`${v.id}:${c.hash}`);
-            if (!r)
-                continue;
-            rowResults.set(c.hash, r);
-            parseSum += r.scores.parse;
-            schemaSum += r.scores.schema;
-            contentSum += r.scores.content;
-            costSum += r.scores.costEfficiency;
-            speedSum += r.scores.speed;
+        const perModel = {};
+        const modelGmeans = [];
+        let parseFailures = 0;
+        for (const model of models) {
+            const modelScores = [];
+            for (const c of cases) {
+                const key = `${v.id}:${c.hash}:${model}`;
+                const r = aggregated.get(key);
+                if (!r)
+                    continue;
+                rowResults.set(models.length > 1 ? `${c.hash}:${model}` : c.hash, r);
+                modelScores.push(r.scores);
+                if (r.scores.parse < 0.5)
+                    parseFailures++;
+            }
+            if (modelScores.length > 0) {
+                const modelAgg = averageDimensions(modelScores);
+                perModel[model] = modelAgg;
+                modelGmeans.push(gmean(modelAgg));
+            }
+        }
+        const allScores = [...rowResults.values()].map((r) => r.scores);
+        const aggregate = averageDimensions(allScores);
+        const g = gmean(aggregate);
+        let crossModelStddev;
+        if (modelGmeans.length > 1) {
+            const m = modelGmeans.reduce((a, b) => a + b, 0) / modelGmeans.length;
+            const variance = modelGmeans.reduce((a, b) => a + (b - m) ** 2, 0) / modelGmeans.length;
+            crossModelStddev = Math.sqrt(variance);
         }
-        const n = cases.length;
-        const aggregate = {
-            parse: parseSum / n,
-            schema: schemaSum / n,
-            content: contentSum / n,
-            costEfficiency: costSum / n,
-            speed: speedSum / n,
-        };
         rows.push({
             variantId: v.id,
             promptPath: v.promptPath,
@@ -66,127 +99,121 @@ export async function buildMatrix(variants, cases, opts) {
             text: v.text,
             results: rowResults,
             aggregate,
-            gmean: gmean(aggregate),
+            gmean: g,
+            crossModelStddev,
+            perModel: models.length > 1 ? perModel : undefined,
+            parseFailures,
         });
     }
     return rows;
 }
-async function runSingle(job, opts) {
+async function runSingle(job, opts, transport) {
     const started = Date.now();
-    const baseUrl = (opts.baseUrl ?? process.env.ANTHROPIC_BASE_URL ?? "https://api.anthropic.com").replace(/\/$/, "");
-    const authToken = opts.authToken ?? process.env.ANTHROPIC_AUTH_TOKEN ?? process.env.ANTHROPIC_API_KEY ?? "";
-    const isAnthropic = /^https?:\/\/(api\.)?anthropic\.com/i.test(baseUrl);
-    const isKimi = /kimi\.com/i.test(baseUrl);
-    let body;
-    let endpoint;
-    let headers = {
-        "Content-Type": "application/json",
-        "Authorization": `Bearer ${authToken}`,
+    const callOpts = {
+        model: job.model,
+        baseUrl: opts.baseUrl,
+        authToken: opts.authToken,
+        maxTokens: opts.maxTokens,
+        timeoutMs: opts.timeoutMs,
     };
-    if (isKimi)
-        headers["User-Agent"] = "Kilo-Code/1.0";
-    if (isAnthropic) {
-        // Anthropic native format
-        endpoint = `${baseUrl}/v1/messages`;
-        headers["anthropic-version"] = "2023-06-01";
-        const messages = [{ role: "user", content: job.text }];
-        const payload = {
-            model: opts.model,
-            max_tokens: opts.maxTokens ?? 4096,
-            messages,
-        };
-        if (job.systemText)
-            payload.system = job.systemText;
-        body = JSON.stringify(payload);
-    }
-    else {
-        // OpenAI-compatible format (OpenRouter, local proxies, etc.)
-        endpoint = `${baseUrl}/v1/chat/completions`;
-        const messages = [];
-        if (job.systemText) {
-            messages.push({ role: "system", content: job.systemText });
-        }
-        messages.push({ role: "user", content: job.text });
-        body = JSON.stringify({
-            model: opts.model,
-            max_tokens: opts.maxTokens ?? 4096,
-            messages,
-        });
-    }
-    let raw = "";
-    let costUsd = 0;
     try {
-        const res = await fetch(endpoint, {
-            method: "POST",
-            headers,
-            body,
-        });
-        if (!res.ok) {
-            const errText = await res.text().catch(() => "");
-            return makeErrorResult(job, errText, 0, Date.now() - started);
-        }
-        let inp = 0;
-        let out = 0;
-        if (isAnthropic) {
-            const data = await res.json();
-            raw = data.content?.map((c) => c.text ?? "").join("") ?? "";
-            inp = data.usage?.input_tokens ?? 0;
-            out = data.usage?.output_tokens ?? 0;
-        }
-        else {
-            const data = await res.json();
-            raw = data.choices?.[0]?.message?.content ?? "";
-            inp = data.usage?.prompt_tokens ?? 0;
-            out = data.usage?.completion_tokens ?? 0;
-        }
-        // Rough cost estimate: varies by model. Using claude-3-haiku as baseline.
-        costUsd = inp * 0.000003 + out * 0.000015;
+        const { raw, costUsd } = await transport(job.text, job.systemText, callOpts);
+        const durationMs = Date.now() - started;
+        const parsed = attemptJsonParse(raw);
+        const scored = scoreOutput(raw, parsed, costUsd, durationMs, job.case, { model: job.model });
+        scored.variantId = job.variantId;
+        return scored;
     }
     catch (err) {
         const msg = err instanceof Error ? err.message : String(err);
-        return makeErrorResult(job, msg, 0, Date.now() - started);
+        const durationMs = Date.now() - started;
+        return {
+            caseHash: job.case.hash,
+            caseName: job.case.name,
+            variantId: job.variantId,
+            promptPath: job.case.promptPath,
+            rawOutput: msg,
+            parsedOutput: null,
+            costUsd: 0,
+            durationMs,
+            scores: { parse: 0, schema: 0, content: 0, costEfficiency: 0, speed: 0 },
+            notes: [`HTTP/fetch error: ${msg.slice(0, 200)}`],
+            model: job.model,
+        };
     }
-    const durationMs = Date.now() - started;
-    const parsed = attemptJsonParse(raw);
-    const scored = scoreOutput(raw, parsed, costUsd, durationMs, job.case);
-    scored.variantId = job.variantId;
-    return scored;
 }
-function attemptJsonParse(text) {
-    // Strip markdown fences and trailing noise
-    const cleaned = text
-        .replace(/^```(?:json)?\s*\n?/i, "")
-        .replace(/\n?```\s*$/i, "")
-        .trim();
-    try {
-        return JSON.parse(cleaned);
-    }
-    catch {
-        // Try to find the first {…} block
-        const m = cleaned.match(/\{[\s\S]*\}/);
-        if (m) {
-            try {
-                return JSON.parse(m[0]);
+/**
+ * Fold the repetitions of one (variant, case, model) cell into a single result.
+ * A lone run is returned untouched; otherwise the scores become the mean
+ * reported by `aggregateReps`, with `stddev` and `reps` attached.
+ */
+function collapseReps(runs) {
+    if (runs.length === 1) {
+        return runs[0];
+    }
+    const stats = aggregateReps(runs);
+    // Rank by gmean and take the middle entry so the raw output carried forward
+    // is a typical rep, not the best or worst one.
+    const ranked = runs.slice().sort((x, y) => gmean(x.scores) - gmean(y.scores));
+    const representative = ranked[Math.floor(ranked.length / 2)];
+    return {
+        ...representative,
+        scores: stats.mean,
+        stddev: stats.stddev,
+        reps: runs.length,
+    };
+}
+async function runJudge(variants, cases, models, aggregated, judge) {
+    // Judge only the top-N variants to cap cost: a judge call per
+    // (variant, case, model) on a large population blows up fast.
+    const topN = judge.topN ?? 4;
+    const variantGmeans = variants.map((v) => {
+        const scores = [];
+        for (const c of cases) {
+            for (const model of models) {
+                const r = aggregated.get(`${v.id}:${c.hash}:${model}`);
+                if (r)
+                    scores.push(r.scores);
             }
-            catch {
-                return null;
+        }
+        return { id: v.id, g: scores.length > 0 ? gmean(averageDimensions(scores)) : 0 };
+    });
+    variantGmeans.sort((a, b) => b.g - a.g);
+    const eligible = new Set(variantGmeans.slice(0, topN).map((x) => x.id));
+    const jobs = [];
+    for (const v of variants) {
+        if (!eligible.has(v.id))
+            continue;
+        for (const c of cases) {
+            for (const model of models) {
+                const key = `${v.id}:${c.hash}:${model}`;
+                const r = aggregated.get(key);
+                if (!r || r.scores.parse < 0.5)
+                    continue; // no point judging unparseable output
+                jobs.push(async () => {
+                    try {
+                        const jr = await judgeOutput(r.rawOutput, c, judge);
+                        r.scores = { ...r.scores, content: jr.score };
+                        r.judgeJustification = jr.justification;
+                    }
+                    catch {
+                        // Judge failure is non-fatal — keep heuristic content.
+                    }
+                });
             }
         }
-        return null;
+    }
+    // Run judge calls with modest concurrency to stay under provider rate limits.
+    const concurrency = 3;
+    for (let i = 0; i < jobs.length; i += concurrency) {
+        await Promise.all(jobs.slice(i, i + concurrency).map((fn) => fn()));
     }
 }
-function makeErrorResult(job, error, costUsd, durationMs) {
+function averageDimensions(scores) {
+    if (scores.length === 0)
+        return { parse: 0, schema: 0, content: 0, costEfficiency: 0, speed: 0 };
+    const n = scores.length;
     return {
-        caseHash: job.case.hash,
-        caseName: job.case.name,
-        variantId: job.variantId,
-        promptPath: job.case.promptPath,
-        rawOutput: error,
-        parsedOutput: null,
-        costUsd,
-        durationMs,
-        scores: { parse: 0, schema: 0, content: 0, costEfficiency: 0, speed: 0 },
-        notes: [`HTTP/fetch error: ${error.slice(0, 200)}`],
+        parse: scores.reduce((a, b) => a + b.parse, 0) / n,
+        schema: scores.reduce((a, b) => a + b.schema, 0) / n,
+        content: scores.reduce((a, b) => a + b.content, 0) / n,
+        costEfficiency: scores.reduce((a, b) => a + b.costEfficiency, 0) / n,
+        speed: scores.reduce((a, b) => a + b.speed, 0) / n,
     };
 }
 /** Render a prompt variant given its source path and optional variant name */

package/dist/prompt-evolution/fixtures/harvest.d.ts ADDED Viewed

@@ -0,0 +1,30 @@
+/**
+ * Harvest real objectives from past claude-overnight runs to build
+ * benchmark cases from ground truth instead of synthetic ones.
+ *
+ * Source: <cwd>/.claude-overnight/runs/<runId>/
+ *   - goal.md     — the original objective the user ran with
+ *   - state.json  — RunState: phase ("done"/"capped"/"stopped"), accCompleted, budget
+ *
+ * Coarse fitness signal: `state.phase === "done"` and accCompleted/budget
+ * close to 1 means the user kept running to completion — the plan was
+ * actionable. Cases with "stopped" phase are likely broken plans.
+ *
+ * We do NOT pretend to have a per-case ground-truth plan. The harvested
+ * cases are meant to be scored with the llm-judge: real objective + a
+ * heuristic that the run actually finished.
+ */
+import type { BenchmarkCase } from "../types.js";
+/** Options accepted by `harvestRealCases`. */
+export interface HarvestOpts {
+    /** Repo root — harvest looks under <cwd>/.claude-overnight/runs/ */
+    cwd: string;
+    /** Which promptPath to target in the generated cases. */
+    promptPath: string;
+    /** Variant to attach to every harvested case. Default: STANDARD. */
+    variant?: string;
+    /** Max cases to return (newest first). Default: 10. */
+    limit?: number;
+    /**
+     * Only include runs whose phase matches — default ["done"] (successful runs).
+     * A run whose state.json carries no `phase` at all is not filtered out.
+     */
+    phaseAllowlist?: Array<"done" | "capped" | "stopped" | "planning">;
+}
+/**
+ * Build benchmark cases from past runs found under `<cwd>/.claude-overnight/runs/`.
+ *
+ * @param opts Harvest settings; see {@link HarvestOpts}.
+ * @returns Harvested cases ordered newest run first and capped at `opts.limit`;
+ *          an empty array when the runs directory does not exist.
+ */
+export declare function harvestRealCases(opts: HarvestOpts): BenchmarkCase[];

package/dist/prompt-evolution/fixtures/harvest.js ADDED Viewed

@@ -0,0 +1,88 @@
+/**
+ * Harvest real objectives from past claude-overnight runs to build
+ * benchmark cases from ground truth instead of synthetic ones.
+ *
+ * Source: <cwd>/.claude-overnight/runs/<runId>/
+ *   - goal.md     — the original objective the user ran with
+ *   - state.json  — RunState: phase ("done"/"capped"/"stopped"), accCompleted, budget
+ *
+ * Coarse fitness signal: `state.phase === "done"` and accCompleted/budget
+ * close to 1 means the user kept running to completion — the plan was
+ * actionable. Cases with "stopped" phase are likely broken plans.
+ *
+ * We do NOT pretend to have a per-case ground-truth plan. The harvested
+ * cases are meant to be scored with the llm-judge: real objective + a
+ * heuristic that the run actually finished.
+ */
+import { readdirSync, readFileSync, existsSync } from "node:fs";
+import { join } from "node:path";
+/**
+ * Build benchmark cases from past runs stored under `<cwd>/.claude-overnight/runs/`.
+ *
+ * A run directory contributes a case when it holds both `goal.md` and a
+ * parseable `state.json`, its phase (if any) is in the allowlist, and a
+ * non-empty objective can be extracted. Runs without a `phase` field are kept.
+ *
+ * @param {object} opts
+ * @param {string} opts.cwd Repo root to look under.
+ * @param {string} opts.promptPath Prompt path stamped on every generated case.
+ * @param {string} [opts.variant="STANDARD"] Variant stamped on every generated case.
+ * @param {number} [opts.limit=10] Max cases to return; NaN falls back to 10, negatives clamp to 0.
+ * @param {string[]} [opts.phaseAllowlist=["done"]] Phases a run may be in to be harvested.
+ * @returns {object[]} Benchmark cases, newest `startedAt` first; `[]` when the runs directory is missing.
+ */
+export function harvestRealCases(opts) {
+    const runsDir = join(opts.cwd, ".claude-overnight", "runs");
+    if (!existsSync(runsDir))
+        return [];
+    const allow = new Set(opts.phaseAllowlist ?? ["done"]);
+    // A NaN limit (e.g. parseInt of a malformed CLI value) would make slice()
+    // silently return nothing, and a negative one would drop trailing entries.
+    const requestedLimit = Number(opts.limit ?? 10);
+    const limit = Number.isNaN(requestedLimit) ? 10 : Math.max(0, requestedLimit);
+    const variant = opts.variant ?? "STANDARD";
+    const entries = [];
+    for (const id of readdirSync(runsDir)) {
+        const runDir = join(runsDir, id);
+        const goalPath = join(runDir, "goal.md");
+        const statePath = join(runDir, "state.json");
+        if (!existsSync(goalPath) || !existsSync(statePath))
+            continue;
+        try {
+            const state = JSON.parse(readFileSync(statePath, "utf-8"));
+            if (state.phase && !allow.has(state.phase))
+                continue;
+            const objective = extractObjective(readFileSync(goalPath, "utf-8"));
+            if (!objective)
+                continue;
+            entries.push({
+                id,
+                objective,
+                budget: typeof state.budget === "number" && state.budget > 0 ? state.budget : 8,
+                // Keep numeric timestamps numeric and coerce everything else to a
+                // string, so the sort below (outside this try) cannot throw.
+                startedAt: typeof state.startedAt === "number" ? state.startedAt : String(state.startedAt ?? ""),
+            });
+        }
+        catch {
+            // Deliberately best-effort: skip runs whose files cannot be read or parsed.
+        }
+    }
+    // Arguments are swapped to sort descending (newest first).
+    entries.sort((a, b) => compareStartedAt(b.startedAt, a.startedAt));
+    return entries.slice(0, limit).map((e) => toCase(e, opts.promptPath, variant));
+}
+/** Order two `startedAt` values ascending: numerically when both are numbers, otherwise as strings. */
+function compareStartedAt(x, y) {
+    if (typeof x === "number" && typeof y === "number")
+        return x - y;
+    return String(x).localeCompare(String(y));
+}
+/**
+ * Pull the objective text out of a goal.md document.
+ *
+ * Takes whatever follows the first `## ...` header line; when no such header
+ * is found the whole document is used. The result is trimmed and capped at
+ * 2000 characters so harvested cases stay close in size to the synthetic ones.
+ */
+function extractObjective(goalMd) {
+    const [, underHeader = goalMd] = goalMd.match(/##\s+[^\n]*\n([\s\S]+)$/) ?? [];
+    return underHeader.trim().slice(0, 2000);
+}
+/**
+ * Turn one harvested run entry into a benchmark case for `promptPath`/`variant`.
+ * Concurrency is half the budget rounded up, kept within 2..6. The hash is
+ * filled in last, from the finished case.
+ */
+function toCase(e, promptPath, variant) {
+    const halfBudget = Math.ceil(e.budget / 2);
+    const vars = {
+        objective: e.objective,
+        budget: e.budget,
+        concurrency: Math.min(6, Math.max(2, halfBudget)),
+        contextConstraintNote: "Context budget: use the claude-sonnet-4-6 model's context window efficiently.",
+    };
+    const criteria = {
+        independentTasks: true,
+        specificTasks: false,
+        requiredJsonFields: ["tasks"],
+    };
+    const benchCase = {
+        name: `real:${e.id.slice(0, 12)}`,
+        hash: "",
+        promptPath,
+        variant,
+        vars,
+        criteria,
+    };
+    benchCase.hash = hashCase(benchCase);
+    return benchCase;
+}
+/**
+ * Short identifier for a case, derived from its prompt path, variant
+ * ("default" when absent) and JSON-serialised vars.
+ *
+ * Folds the UTF-16 code units of that key into a 32-bit value (h = h * 31 + unit)
+ * and returns its absolute value in base 36, at most 8 characters.
+ */
+function hashCase(c) {
+    const key = `${c.promptPath}:${c.variant ?? "default"}:${JSON.stringify(c.vars)}`;
+    let acc = 0;
+    let pos = 0;
+    while (pos < key.length) {
+        // Math.imul(31, acc) is the 32-bit product, i.e. the same as (acc << 5) - acc.
+        acc = (Math.imul(31, acc) + key.charCodeAt(pos)) | 0;
+        pos += 1;
+    }
+    const base36 = Math.abs(acc).toString(36);
+    return base36.slice(0, 8);
+}