npm - claude-overnight - Versions diffs - 1.55.2 → 1.57.1 - Mend

claude-overnight 1.55.2 → 1.57.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

package/dist/bin/evolve.js +144 -2
package/dist/core/_version.d.ts +1 -1
package/dist/core/_version.js +1 -1
package/dist/prompt-evolution/evaluator-judge.d.ts +20 -0
package/dist/prompt-evolution/evaluator-judge.js +119 -0
package/dist/prompt-evolution/evaluator-utils.d.ts +7 -0
package/dist/prompt-evolution/evaluator-utils.js +17 -0
package/dist/prompt-evolution/evaluator.d.ts +20 -0
package/dist/prompt-evolution/evaluator.js +227 -89
package/dist/prompt-evolution/fixtures/generate.d.ts +38 -0
package/dist/prompt-evolution/fixtures/generate.js +168 -0
package/dist/prompt-evolution/index.d.ts +16 -0
package/dist/prompt-evolution/index.js +64 -7
package/dist/prompt-evolution/llm-judge.d.ts +2 -0
package/dist/prompt-evolution/llm-judge.js +2 -2
package/dist/prompt-evolution/persistence.d.ts +20 -0
package/dist/prompt-evolution/persistence.js +39 -0
package/dist/prompt-evolution/report.d.ts +1 -1
package/dist/prompt-evolution/report.js +134 -7
package/dist/prompt-evolution/scorer.d.ts +34 -0
package/dist/prompt-evolution/scorer.js +94 -0
package/dist/prompt-evolution/transport-batch.d.ts +54 -0
package/dist/prompt-evolution/transport-batch.js +216 -0
package/dist/prompt-evolution/types.d.ts +10 -0
package/package.json +1 -1
package/plugins/claude-overnight/.claude-plugin/plugin.json +1 -1

package/dist/bin/evolve.js CHANGED Viewed

@@ -18,6 +18,7 @@
 import { evolvePrompt } from "../prompt-evolution/index.js";
 import { PLAN_CASES } from "../prompt-evolution/fixtures/plan-cases.js";
 import { harvestRealCases } from "../prompt-evolution/fixtures/harvest.js";
+import { generateCases } from "../prompt-evolution/fixtures/generate.js";
 import { scenariosToCases, PLANNING_SCENARIOS, REVIEW_SCENARIOS, SUPERVISION_SCENARIOS, STUCK_SCENARIOS, hydrateCases, extractPrompt, } from "../prompt-evolution/adapters/mcp-browser.js";
 function help() {
     process.stdout.write(`Usage: claude-overnight-evolve [options]
@@ -35,13 +36,27 @@ Options:
   --plateau <n>           Stop early if no improvement for N generations (default: 3)
   --reps <n>              Repetitions per (variant, case, model) for noise floor (default: 1)
   --concurrency <n>       Max in-flight eval calls (default: 8; bump for slow endpoints)
+  --batch                 Use provider batch API (50% cheaper, slower wall-clock)
+  --adaptive-cap <n>      Adaptive sampling: extend reps up to N when σ > threshold (default: off)
+  --adaptive-threshold <x> σ threshold that triggers an extra rep (default: 0.1)
   --judge                 Use llm-judge for content scoring (costs extra API calls)
   --judge-model <model>   Model to use for the judge (default: same as eval-model)
   --judge-top-n <n>       Judge only the top-N variants per generation (default: 4)
   --cases <suite>         Benchmark suite: plan | mcp-planning | mcp-review |
                           mcp-supervision | mcp-stuck (default: plan)
   --harvest               Append cases harvested from <cwd>/.claude-overnight/runs/*
+  --harvest-only          Use ONLY harvested real objectives (fails if none found)
   --harvest-limit <n>     Max harvested cases (default: 10)
+  --prompts <list>        Comma-separated prompt paths to evolve in sequence
+  --test-split <f>        Hold out fraction f of cases for a selection-bias-free
+                          final eval (default: 0 = no split). Use 0.3 for rigor.
+  --case-pool <n>         Target total case count; generates synthetic cases via
+                          LLM to top up if the current pool is smaller.
+  --gen-model <model>     Model used by the case generator (default: eval-model)
+Subcommands:
+  claude-overnight-evolve diff <runIdA> <runIdB>
+                          Print a per-variant diff of two persisted runs
   --base-url <url>        API base URL override
   --auth-token <token>    Auth token override
   --run-id <id>           Preset run id (default: auto-generated)
@@ -62,11 +77,14 @@ function parseArgs() {
         population: 8,
         plateau: 3,
         reps: 1,
+        batch: false,
         useJudge: false,
         judgeTopN: 4,
         cases: "",
         harvest: false,
+        harvestOnly: false,
         harvestLimit: 10,
+        testSplit: 0,
         baseUrl: process.env.ANTHROPIC_BASE_URL,
         authToken: process.env.ANTHROPIC_AUTH_TOKEN ?? process.env.ANTHROPIC_API_KEY,
     };
@@ -117,6 +135,17 @@ function parseArgs() {
                 opts.concurrency = parseInt(v, 10);
                 i++;
                 break;
+            case "--batch":
+                opts.batch = true;
+                break;
+            case "--adaptive-cap":
+                opts.adaptiveCap = parseInt(v, 10);
+                i++;
+                break;
+            case "--adaptive-threshold":
+                opts.adaptiveThreshold = parseFloat(v);
+                i++;
+                break;
             case "--judge":
                 opts.useJudge = true;
                 break;
@@ -135,10 +164,30 @@ function parseArgs() {
             case "--harvest":
                 opts.harvest = true;
                 break;
+            case "--harvest-only":
+                opts.harvest = true;
+                opts.harvestOnly = true;
+                break;
             case "--harvest-limit":
                 opts.harvestLimit = parseInt(v, 10);
                 i++;
                 break;
+            case "--prompts":
+                opts.prompts = v.split(",").map((s) => s.trim()).filter(Boolean);
+                i++;
+                break;
+            case "--test-split":
+                opts.testSplit = parseFloat(v);
+                i++;
+                break;
+            case "--case-pool":
+                opts.casePool = parseInt(v, 10);
+                i++;
+                break;
+            case "--gen-model":
+                opts.genModel = v;
+                i++;
+                break;
             case "--base-url":
                 opts.baseUrl = v;
                 i++;
@@ -161,7 +210,31 @@ function parseArgs() {
     return opts;
 }
 async function main() {
+    // Subcommand: diff two persisted runs. Handled before parseArgs(), straight from the raw argv positions.
+    if (process.argv[2] === "diff") {
+        await runDiff(process.argv[3], process.argv[4]); // argv[3] / argv[4] are the two run ids
+        return;
+    }
     const opts = parseArgs();
+    // Multi-prompt mode: run evolveOne once per entry in opts.prompts, sequentially.
+    // Each iteration returns its own runId and report path. A combined summary
+    // is printed at the end with the best variant's gmean and runId per prompt.
+    if (opts.prompts && opts.prompts.length > 0) {
+        const summary = [];
+        for (const p of opts.prompts) {
+            console.log(`\n========== Evolving ${p} ==========\n`);
+            const result = await evolveOne({ ...opts, prompt: p }); // same options, with only `prompt` overridden
+            summary.push({ prompt: p, runId: result.runId, gmean: result.bestVariant.gmean, reportPath: result.reportPath }); // reportPath is collected but not printed below
+        }
+        console.log("\n========== Multi-prompt summary ==========");
+        for (const s of summary) {
+            console.log(`  ${s.prompt.padEnd(40)} gmean=${(s.gmean * 100).toFixed(1)}% runId=${s.runId}`);
+        }
+        return; // multi-prompt run is done; skip the single-prompt path
+    }
+    await evolveOne(opts); // single-prompt mode
+}
+async function evolveOne(opts) {
     let cases;
     let promptPath = opts.prompt;
     let seedText;
@@ -182,7 +255,7 @@ async function main() {
     }
     else {
         if (opts.cases === "plan")
-            cases = [...PLAN_CASES];
+            cases = opts.harvestOnly ? [] : [...PLAN_CASES];
         else
             throw new Error(`Unknown case suite: ${opts.cases}`);
         if (opts.harvest) {
@@ -192,13 +265,39 @@ async function main() {
                 limit: opts.harvestLimit,
             });
             if (harvested.length === 0) {
+                if (opts.harvestOnly) {
+                    throw new Error("--harvest-only set but no runs found under <cwd>/.claude-overnight/runs");
+                }
                 console.log(`  (harvest: no runs found under <cwd>/.claude-overnight/runs)`);
             }
             else {
-                console.log(`  (harvest: +${harvested.length} real objectives)`);
+                console.log(`  (harvest: ${opts.harvestOnly ? "" : "+"}${harvested.length} real objectives)`);
                 cases = cases.concat(harvested);
             }
         }
+        // Top up to --case-pool with LLM-generated synthetic cases. The generator
+        // caches its output so successive runs share the pool — real cost is
+        // paid once, amortised across every subsequent round.
+        if (opts.casePool && cases.length < opts.casePool) {
+            console.log(`  (generating cases to reach pool size ${opts.casePool}…)`);
+            try {
+                const generated = await generateCases({
+                    targetCount: opts.casePool - cases.length,
+                    model: opts.genModel ?? opts.evalModel,
+                    baseUrl: opts.baseUrl,
+                    authToken: opts.authToken,
+                    promptPath,
+                    existing: cases,
+                });
+                console.log(`  (generated: +${generated.length} synthetic cases)`);
+                cases = cases.concat(generated);
+            }
+            catch (err) {
+                const msg = err.message ?? String(err);
+                console.log(`\n  ⚠ case generation failed: ${msg.slice(0, 500)}`);
+                console.log(`  Falling back to the existing ${cases.length} case(s). Try --gen-model with an Anthropic-compatible JSON-reliable model (e.g. claude-haiku-4-5) if this persists.\n`);
+            }
+        }
     }
     console.log(`Evolution config:`);
     console.log(`  target:      ${opts.target}`);
@@ -221,6 +320,11 @@ async function main() {
         plateauGenerations: opts.plateau,
         repetitions: opts.reps > 1 ? opts.reps : undefined,
         concurrency: opts.concurrency,
+        batch: opts.batch || undefined,
+        adaptiveReps: opts.adaptiveCap
+            ? { cap: opts.adaptiveCap, threshold: opts.adaptiveThreshold }
+            : undefined,
+        testFraction: opts.testSplit > 0 ? opts.testSplit : undefined,
         judge: opts.useJudge
             ? {
                 model: opts.judgeModel ?? opts.evalModel,
@@ -247,6 +351,44 @@ async function main() {
     console.log(`speed:      ${(result.bestVariant.aggregate.speed * 100).toFixed(1)}%`);
     console.log("\n--- Prompt text ---");
     console.log(result.bestVariant.text);
+    return result;
+}
+async function runDiff(runIdA, runIdB) { // `diff` subcommand: prints a markdown table comparing per-variant gmean between two persisted runs
+    if (!runIdA || !runIdB) {
+        console.error("usage: claude-overnight-evolve diff <runIdA> <runIdB>");
+        process.exit(2); // both run ids are required
+    }
+    const { loadRun } = await import("../prompt-evolution/persistence.js"); // dynamic import, evaluated only when this function runs
+    const a = loadRun(runIdA);
+    const b = loadRun(runIdB);
+    const collect = (run) => { // reduces run.matrix to one { generation, variantId, gmean } row per variantId
+        const out = new Map();
+        for (const rec of run.matrix) {
+            // Keep the latest-generation row per variantId so diff compares final state.
+            const existing = out.get(rec.variantId);
+            if (!existing || rec.generation > existing.generation) {
+                out.set(rec.variantId, { generation: rec.generation, variantId: rec.variantId, gmean: rec.gmean });
+            }
+        }
+        return out;
+    };
+    const rowsA = collect(a);
+    const rowsB = collect(b);
+    const ids = new Set([...rowsA.keys(), ...rowsB.keys()]); // union of the variant ids seen in either run
+    console.log(`# Diff: ${runIdA} → ${runIdB}`);
+    console.log("");
+    console.log(`|  Variant  |  A gmean  |  B gmean  |   Δ   |  note  |`);
+    console.log(`|-----------|-----------|-----------|-------|--------|`);
+    const sorted = [...ids].sort(); // default sort, so rows are printed in a stable id order
+    for (const id of sorted) {
+        const ra = rowsA.get(id);
+        const rb = rowsB.get(id);
+        const ga = ra ? (ra.gmean * 100).toFixed(1) : "—"; // gmean shown ×100 with one decimal; "—" when the variant is absent from that run
+        const gb = rb ? (rb.gmean * 100).toFixed(1) : "—";
+        const delta = ra && rb ? ((rb.gmean - ra.gmean) * 100).toFixed(1) : "—"; // B minus A, only when the variant exists in both runs
+        const note = !ra ? "new in B" : !rb ? "missing in B" : ra.gmean < rb.gmean ? "↑" : ra.gmean > rb.gmean ? "↓" : "="; // ↑ = B scored higher than A, ↓ = lower
+        console.log(`| ${id.padEnd(10)}| ${ga.padStart(9)} | ${gb.padStart(9)} | ${delta.padStart(5)} | ${note} |`);
+    }
 }
 main().catch((err) => {
     console.error(err);

package/dist/core/_version.d.ts CHANGED Viewed

	@@ -1 +1 @@
1	- export declare const VERSION = "1.55.2";
1	+ export declare const VERSION = "1.57.1";

package/dist/core/_version.js CHANGED Viewed

@@ -1,2 +1,2 @@
 // Auto-generated by build — do not edit manually.
-export const VERSION = "1.55.2";
+export const VERSION = "1.57.1";

package/dist/prompt-evolution/evaluator-judge.d.ts ADDED Viewed

@@ -0,0 +1,20 @@
+/**
+ * LLM-judge pass over a built evaluation matrix.
+ *
+ * Split out of evaluator.ts to keep each file under the 500-line cap and
+ * because the judge has its own concerns (top-N eligibility, batch vs
+ * online path, crash-resumable state).
+ *
+ * The judge REPLACES the heuristic content score with a semantic grade.
+ * We only judge top-N variants per generation to cap cost — a judge call
+ * per (variant, case, model) on a large population explodes fast.
+ */
+import { type JudgeOpts } from "./llm-judge.js";
+import type { BenchmarkCase, EvaluationResult } from "./types.js";
+import type { EvalOpts } from "./evaluator.js";
+export declare function runJudge(variants: Array<{
+    id: string; // variant identifier
+    text: string; // variant prompt text
+}>, cases: BenchmarkCase[], models: string[], aggregated: Map<string, EvaluationResult>, judge: JudgeOpts & {
+    topN?: number; // optional cap on how many variants are judged (see the file header: only top-N variants are judged)
+}, opts: EvalOpts): Promise<void>; // resolves with no value

package/dist/prompt-evolution/evaluator-judge.js ADDED Viewed

@@ -0,0 +1,119 @@
+/**
+ * LLM-judge pass over a built evaluation matrix.
+ *
+ * Split out of evaluator.ts to keep each file under the 500-line cap and
+ * because the judge has its own concerns (top-N eligibility, batch vs
+ * online path, crash-resumable state).
+ *
+ * The judge REPLACES the heuristic content score with a semantic grade.
+ * We only judge top-N variants per generation to cap cost — a judge call
+ * per (variant, case, model) on a large population explodes fast.
+ */
+import { judgeOutput, buildJudgePrompt, parseJudgeOutput } from "./llm-judge.js";
+import { batchCallModel } from "./transport-batch.js";
+import { saveBatchState, loadBatchState, markBatchFinished } from "./persistence.js";
+import { gmean } from "./scorer.js";
+import { averageDimensions } from "./evaluator-utils.js";
+export async function runJudge(variants, cases, models, aggregated, judge, opts) { // Re-scores the `content` dimension of the top-N variants' results with an LLM judge, updating the result objects held in `aggregated` in place
+    const topN = judge.topN ?? 4; // judge at most 4 variants unless the caller says otherwise
+    const variantGmeans = variants.map((v) => { // one { id, g } ranking entry per variant
+        const scores = [];
+        for (const c of cases) {
+            for (const model of models) {
+                const r = aggregated.get(`${v.id}:${c.hash}:${model}`); // results are keyed by "<variantId>:<caseHash>:<model>"
+                if (r)
+                    scores.push(r.scores);
+            }
+        }
+        return { id: v.id, g: scores.length > 0 ? gmean(averageDimensions(scores)) : 0 }; // a variant with no results ranks as 0
+    });
+    variantGmeans.sort((a, b) => b.g - a.g); // best first
+    const eligible = new Set(variantGmeans.slice(0, topN).map((x) => x.id));
+    const cells = [];
+    for (const v of variants) {
+        if (!eligible.has(v.id))
+            continue; // outside the top-N: never judged
+        for (const c of cases) {
+            for (const model of models) {
+                const key = `${v.id}:${c.hash}:${model}`;
+                const r = aggregated.get(key);
+                if (!r || r.scores.parse < 0.5)
+                    continue; // no result, or parse score below 0.5: unparseable output isn't worth judging
+                cells.push({ key, c, r });
+            }
+        }
+    }
+    if (cells.length === 0)
+        return;
+    if (opts.batch) {
+        await runJudgeBatch(cells, judge, opts); // batch path: all cells go out as one provider batch
+        return;
+    }
+    const jobs = cells.map((cell) => async () => { // online path: one deferred judge call per cell
+        try {
+            const jr = await judgeOutput(cell.r.rawOutput, cell.c, judge);
+            cell.r.scores = { ...cell.r.scores, content: jr.score }; // only `content` is replaced; other dimensions are kept
+            cell.r.judgeJustification = jr.justification;
+        }
+        catch {
+            // Judge failure is non-fatal — keep heuristic content.
+        }
+    });
+    const judgeConcurrency = 3; // hard-coded here; opts.concurrency is not consulted
+    let nextJob = 0; // shared cursor: each worker claims the next unclaimed job index
+    const judgeWorker = async () => {
+        while (true) {
+            const i = nextJob++;
+            if (i >= jobs.length)
+                return;
+            await jobs[i]();
+        }
+    };
+    await Promise.all(Array.from({ length: Math.min(judgeConcurrency, jobs.length) }, judgeWorker)); // never start more workers than there are jobs
+}
+async function runJudgeBatch(cells, judge, opts) {
+    const batchJobs = cells.map((cell, i) => ({
+        customId: `j:${i}|k:${cell.key}`,
+        userText: buildJudgePrompt(cell.r.rawOutput, cell.c),
+        model: judge.model,
+    }));
+    const existing = opts.runId != null && opts.generation != null
+        ? loadBatchState(opts.runId, opts.generation, "judge")
+        : null;
+    const transport = opts.batchCallModel ?? batchCallModel;
+    const results = await transport(batchJobs, {
+        baseUrl: judge.baseUrl ?? opts.baseUrl,
+        authToken: judge.authToken ?? opts.authToken,
+        maxTokens: judge.maxTokens ?? 2048,
+        resumeBatchId: existing?.batchId,
+        onSubmitted: (batchId, p) => {
+            if (opts.runId != null && opts.generation != null && !existing) {
+                saveBatchState(opts.runId, {
+                    generation: opts.generation,
+                    phase: "judge",
+                    batchId,
+                    provider: p,
+                    submittedAt: new Date().toISOString(),
+                });
+            }
+            opts.onBatchProgress?.(`judge batch submitted: ${batchId} (${p})`);
+        },
+        onProgress: (p) => opts.onBatchProgress?.(`judge batch ${p.batchId} ${p.phase}${p.succeeded != null ? `: ${p.succeeded}/${p.total ?? batchJobs.length}` : ""}`),
+    });
+    if (opts.runId != null && existing)
+        markBatchFinished(opts.runId, existing.batchId);
+    for (const cell of cells) {
+        const customId = batchJobs.find((b) => b.customId.includes(`|k:${cell.key}`))?.customId;
+        const got = customId ? results.get(customId) : undefined;
+        if (!got || !got.raw)
+            continue;
+        try {
+            const jr = parseJudgeOutput(got.raw);
+            cell.r.scores = { ...cell.r.scores, content: jr.score };
+            cell.r.judgeJustification = jr.justification;
+        }
+        catch {
+            // Judge parse failure is non-fatal — keep heuristic content.
+        }
+    }
+}

package/dist/prompt-evolution/evaluator-utils.d.ts ADDED Viewed

@@ -0,0 +1,7 @@
+/**
+ * Small shared helpers used by both evaluator.ts and evaluator-judge.ts.
+ * Extracted to break the import cycle that would otherwise form between
+ * the two (both call averageDimensions, judge also needs gmean aggregates).
+ */
+import type { ScoreDimensions } from "./types.js";
+export declare function averageDimensions(scores: ScoreDimensions[]): ScoreDimensions; // takes a list of per-result scores and returns one combined ScoreDimensions

package/dist/prompt-evolution/evaluator-utils.js ADDED Viewed

@@ -0,0 +1,17 @@
+/**
+ * Small shared helpers used by both evaluator.ts and evaluator-judge.ts.
+ * Extracted to break the import cycle that would otherwise form between
+ * the two (both call averageDimensions, judge also needs gmean aggregates).
+ */
+export function averageDimensions(scores) { // Arithmetic mean of each score dimension across `scores`
+    if (scores.length === 0)
+        return { parse: 0, schema: 0, content: 0, costEfficiency: 0, speed: 0 }; // empty input yields all zeros instead of dividing by zero
+    const n = scores.length;
+    return {
+        parse: scores.reduce((a, b) => a + b.parse, 0) / n, // each dimension is summed and divided by n independently
+        schema: scores.reduce((a, b) => a + b.schema, 0) / n,
+        content: scores.reduce((a, b) => a + b.content, 0) / n,
+        costEfficiency: scores.reduce((a, b) => a + b.costEfficiency, 0) / n,
+        speed: scores.reduce((a, b) => a + b.speed, 0) / n,
+    };
+}

package/dist/prompt-evolution/evaluator.d.ts CHANGED Viewed

@@ -17,6 +17,7 @@
  */
 import { type JudgeOpts } from "./llm-judge.js";
 import { type CallModel } from "./transport.js";
+import { batchCallModel } from "./transport-batch.js";
 import type { BenchmarkCase, VariantRow, PromptVars } from "./types.js";
 export interface EvalOpts {
     /** Primary generator model (retained for single-model compat). */
@@ -35,14 +36,33 @@ export interface EvalOpts {
     timeoutMs?: number;
     /** Repetitions per (variant, case, model). Default 1 — opt-in to 3+ for noise floor. */
     repetitions?: number;
+    /**
+     * Adaptive sampling: after initial `repetitions`, keep adding one rep per cell
+     * where any score-dim σ exceeds `threshold`, up to `cap` total reps. Prevents
+     * wasted reps on already-stable cells while driving noisy ones down.
+     */
+    adaptiveReps?: {
+        cap: number;
+        threshold?: number;
+    };
     /** Inject an llm-judge call per case; content dimension is replaced by judge score. */
     judge?: JudgeOpts & {
         topN?: number;
     };
     /** Transport override for tests. */
     callModel?: CallModel;
+    /** Use provider batch API instead of online calls (50% cheaper, slower wall-clock). */
+    batch?: boolean;
+    /** Run id — required when batch=true so state is crash-resumable. */
+    runId?: string;
+    /** Current generation number — used to key batch state. */
+    generation?: number;
+    /** Batch-transport override for tests. Same return shape as transport-batch.batchCallModel. */
+    batchCallModel?: typeof batchCallModel;
     /** Optional callback for progress */
     onProgress?: (done: number, total: number, caseName: string, variantId: string) => void;
+    /** Progress callback specific to batch-phase transitions. */
+    onBatchProgress?: (msg: string) => void;
 }
 export declare function buildMatrix(variants: Array<{
     id: string;