npm - claude-overnight - Versions diffs - 1.57.4 → 1.59.0 - Mend

claude-overnight 1.57.4 → 1.59.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

package/dist/bin/evolve.d.ts +1 -1
package/dist/bin/evolve.js +5 -8
package/dist/core/_version.d.ts +1 -1
package/dist/core/_version.js +1 -1
package/dist/prompt-evolution/evaluator-judge.d.ts +1 -6
package/dist/prompt-evolution/evaluator-judge.js +2 -58
package/dist/prompt-evolution/evaluator.d.ts +0 -11
package/dist/prompt-evolution/evaluator.js +20 -119
package/dist/prompt-evolution/index.d.ts +0 -2
package/dist/prompt-evolution/index.js +0 -12
package/dist/prompt-evolution/persistence.d.ts +0 -20
package/dist/prompt-evolution/persistence.js +0 -39
package/dist/prompt-evolution/transport.js +15 -5
package/docs/prompt-evolution-research.md +1 -1
package/package.json +1 -1
package/plugins/claude-overnight/.claude-plugin/plugin.json +1 -1
package/dist/prompt-evolution/transport-batch.d.ts +0 -54
package/dist/prompt-evolution/transport-batch.js +0 -216

package/dist/bin/evolve.d.ts CHANGED Viewed

@@ -8,7 +8,7 @@
  *
  * Examples:
  *   claude-overnight-evolve --prompt 10_planning/10-3_plan --eval-model claude-haiku-4-5 --generations 3
- *   claude-overnight-evolve --target mcp-browser --prompt-kind plan-supervision --eval-model kimi-k2-6
+ *   claude-overnight-evolve --target mcp-browser --prompt-kind plan-supervision --eval-model kimi-for-coding
  *
  * Requires ANTHROPIC_API_KEY (or ANTHROPIC_AUTH_TOKEN) in env. When `--target
  * mcp-browser` is used the cwd must be the MCP-browser repo root (so

package/dist/bin/evolve.js CHANGED Viewed

@@ -8,7 +8,7 @@
  *
  * Examples:
  *   claude-overnight-evolve --prompt 10_planning/10-3_plan --eval-model claude-haiku-4-5 --generations 3
- *   claude-overnight-evolve --target mcp-browser --prompt-kind plan-supervision --eval-model kimi-k2-6
+ *   claude-overnight-evolve --target mcp-browser --prompt-kind plan-supervision --eval-model kimi-for-coding
  *
  * Requires ANTHROPIC_API_KEY (or ANTHROPIC_AUTH_TOKEN) in env. When `--target
  * mcp-browser` is used the cwd must be the MCP-browser repo root (so
@@ -28,7 +28,10 @@ Options:
   --prompt <path>         Prompt file path (claude-overnight)
   --prompt-kind <kind>    MCP-browser prompt kind: planning | review | evolution |
                           goal-refinement | plan-supervision | simple-supervision | stuck-analysis
-  --eval-model <model>    Fast model for evaluation (default: claude-haiku-4-5)
+  --eval-model <model>    Fast model for evaluation (default: claude-haiku-4-5).
+                          For Kimi endpoints use "kimi-for-coding" (stable alias,
+                          auto-upgrades as flagship revs). For Moonshot platform
+                          API use "kimi-k2.6" (dot, not dash).
   --eval-models <list>    Comma-separated list to run cross-model (overrides --eval-model)
   --mutate-model <model>  Smarter model for mutation (defaults to eval-model)
   --generations <n>       Number of evolution generations (default: 10)
@@ -36,7 +39,6 @@ Options:
   --plateau <n>           Stop early if no improvement for N generations (default: 3)
   --reps <n>              Repetitions per (variant, case, model) for noise floor (default: 1)
   --concurrency <n>       Max in-flight eval calls (default: 8; bump for slow endpoints)
-  --batch                 Use provider batch API (50% cheaper, slower wall-clock)
   --adaptive-cap <n>      Adaptive sampling: extend reps up to N when σ > threshold (default: off)
   --adaptive-threshold <x> σ threshold that triggers an extra rep (default: 0.1)
   --judge                 Use llm-judge for content scoring (costs extra API calls)
@@ -77,7 +79,6 @@ function parseArgs() {
         population: 8,
         plateau: 3,
         reps: 1,
-        batch: false,
         useJudge: false,
         judgeTopN: 4,
         cases: "",
@@ -135,9 +136,6 @@ function parseArgs() {
                 opts.concurrency = parseInt(v, 10);
                 i++;
                 break;
-            case "--batch":
-                opts.batch = true;
-                break;
             case "--adaptive-cap":
                 opts.adaptiveCap = parseInt(v, 10);
                 i++;
@@ -327,7 +325,6 @@ async function evolveOne(opts) {
         plateauGenerations: opts.plateau,
         repetitions: opts.reps > 1 ? opts.reps : undefined,
         concurrency: opts.concurrency,
-        batch: opts.batch || undefined,
         adaptiveReps: opts.adaptiveCap
             ? { cap: opts.adaptiveCap, threshold: opts.adaptiveThreshold }
             : undefined,

package/dist/core/_version.d.ts CHANGED Viewed

@@ -1 +1 @@
-export declare const VERSION = "1.57.4";
+export declare const VERSION = "1.59.0";

package/dist/core/_version.js CHANGED Viewed

@@ -1,2 +1,2 @@
 // Auto-generated by build — do not edit manually.
-export const VERSION = "1.57.4";
+export const VERSION = "1.59.0";

package/dist/prompt-evolution/evaluator-judge.d.ts CHANGED Viewed

@@ -1,20 +1,15 @@
 /**
  * LLM-judge pass over a built evaluation matrix.
  *
- * Split out of evaluator.ts to keep each file under the 500-line cap and
- * because the judge has its own concerns (top-N eligibility, batch vs
- * online path, crash-resumable state).
- *
  * The judge REPLACES the heuristic content score with a semantic grade.
  * We only judge top-N variants per generation to cap cost — a judge call
  * per (variant, case, model) on a large population explodes fast.
  */
 import { type JudgeOpts } from "./llm-judge.js";
 import type { BenchmarkCase, EvaluationResult } from "./types.js";
-import type { EvalOpts } from "./evaluator.js";
 export declare function runJudge(variants: Array<{
     id: string;
     text: string;
 }>, cases: BenchmarkCase[], models: string[], aggregated: Map<string, EvaluationResult>, judge: JudgeOpts & {
     topN?: number;
-}, opts: EvalOpts): Promise<void>;
+}): Promise<void>;

package/dist/prompt-evolution/evaluator-judge.js CHANGED Viewed

@@ -1,20 +1,14 @@
 /**
  * LLM-judge pass over a built evaluation matrix.
  *
- * Split out of evaluator.ts to keep each file under the 500-line cap and
- * because the judge has its own concerns (top-N eligibility, batch vs
- * online path, crash-resumable state).
- *
  * The judge REPLACES the heuristic content score with a semantic grade.
  * We only judge top-N variants per generation to cap cost — a judge call
  * per (variant, case, model) on a large population explodes fast.
  */
-import { judgeOutput, buildJudgePrompt, parseJudgeOutput } from "./llm-judge.js";
-import { batchCallModel } from "./transport-batch.js";
-import { saveBatchState, loadBatchState, markBatchFinished } from "./persistence.js";
+import { judgeOutput } from "./llm-judge.js";
 import { gmean } from "./scorer.js";
 import { averageDimensions } from "./evaluator-utils.js";
-export async function runJudge(variants, cases, models, aggregated, judge, opts) {
+export async function runJudge(variants, cases, models, aggregated, judge) {
     const topN = judge.topN ?? 4;
     const variantGmeans = variants.map((v) => {
         const scores = [];
@@ -45,10 +39,6 @@ export async function runJudge(variants, cases, models, aggregated, judge, opts)
     }
     if (cells.length === 0)
         return;
-    if (opts.batch) {
-        await runJudgeBatch(cells, judge, opts);
-        return;
-    }
     const jobs = cells.map((cell) => async () => {
         try {
             const jr = await judgeOutput(cell.r.rawOutput, cell.c, judge);
@@ -71,49 +61,3 @@ export async function runJudge(variants, cases, models, aggregated, judge, opts)
     };
     await Promise.all(Array.from({ length: Math.min(judgeConcurrency, jobs.length) }, judgeWorker));
 }
-async function runJudgeBatch(cells, judge, opts) {
-    const batchJobs = cells.map((cell, i) => ({
-        customId: `j:${i}|k:${cell.key}`,
-        userText: buildJudgePrompt(cell.r.rawOutput, cell.c),
-        model: judge.model,
-    }));
-    const existing = opts.runId != null && opts.generation != null
-        ? loadBatchState(opts.runId, opts.generation, "judge")
-        : null;
-    const transport = opts.batchCallModel ?? batchCallModel;
-    const results = await transport(batchJobs, {
-        baseUrl: judge.baseUrl ?? opts.baseUrl,
-        authToken: judge.authToken ?? opts.authToken,
-        maxTokens: judge.maxTokens ?? 2048,
-        resumeBatchId: existing?.batchId,
-        onSubmitted: (batchId, p) => {
-            if (opts.runId != null && opts.generation != null && !existing) {
-                saveBatchState(opts.runId, {
-                    generation: opts.generation,
-                    phase: "judge",
-                    batchId,
-                    provider: p,
-                    submittedAt: new Date().toISOString(),
-                });
-            }
-            opts.onBatchProgress?.(`judge batch submitted: ${batchId} (${p})`);
-        },
-        onProgress: (p) => opts.onBatchProgress?.(`judge batch ${p.batchId} ${p.phase}${p.succeeded != null ? `: ${p.succeeded}/${p.total ?? batchJobs.length}` : ""}`),
-    });
-    if (opts.runId != null && existing)
-        markBatchFinished(opts.runId, existing.batchId);
-    for (const cell of cells) {
-        const customId = batchJobs.find((b) => b.customId.includes(`|k:${cell.key}`))?.customId;
-        const got = customId ? results.get(customId) : undefined;
-        if (!got || !got.raw)
-            continue;
-        try {
-            const jr = parseJudgeOutput(got.raw);
-            cell.r.scores = { ...cell.r.scores, content: jr.score };
-            cell.r.judgeJustification = jr.justification;
-        }
-        catch {
-            // Judge parse failure is non-fatal — keep heuristic content.
-        }
-    }
-}

package/dist/prompt-evolution/evaluator.d.ts CHANGED Viewed

@@ -17,7 +17,6 @@
  */
 import { type JudgeOpts } from "./llm-judge.js";
 import { type CallModel } from "./transport.js";
-import { batchCallModel } from "./transport-batch.js";
 import type { BenchmarkCase, VariantRow, PromptVars } from "./types.js";
 export interface EvalOpts {
     /** Primary generator model (retained for single-model compat). */
@@ -51,18 +50,8 @@ export interface EvalOpts {
     };
     /** Transport override for tests. */
     callModel?: CallModel;
-    /** Use provider batch API instead of online calls (50% cheaper, slower wall-clock). */
-    batch?: boolean;
-    /** Run id — required when batch=true so state is crash-resumable. */
-    runId?: string;
-    /** Current generation number — used to key batch state. */
-    generation?: number;
-    /** Batch-transport override for tests. Same return shape as transport-batch.batchCallModel. */
-    batchCallModel?: typeof batchCallModel;
     /** Optional callback for progress */
     onProgress?: (done: number, total: number, caseName: string, variantId: string) => void;
-    /** Progress callback specific to batch-phase transitions. */
-    onBatchProgress?: (msg: string) => void;
 }
 export declare function buildMatrix(variants: Array<{
     id: string;

package/dist/prompt-evolution/evaluator.js CHANGED Viewed

@@ -18,8 +18,6 @@
 import { renderPrompt } from "../prompts/load.js";
 import { scoreOutput, gmean, aggregateReps, bootstrapCI, kendallTau } from "./scorer.js";
 import { defaultCallModel, attemptJsonParse, } from "./transport.js";
-import { batchCallModel, detectBatchProvider, } from "./transport-batch.js";
-import { saveBatchState, loadBatchState, markBatchFinished } from "./persistence.js";
 import { averageDimensions } from "./evaluator-utils.js";
 import { runJudge } from "./evaluator-judge.js";
 export async function buildMatrix(variants, cases, opts) {
@@ -38,53 +36,30 @@ export async function buildMatrix(variants, cases, opts) {
             }
         }
     }
-    // Two execution paths:
-    //   batch=true  — submit every job to the provider batch API, poll, score
-    //                 results as they arrive. 50% cheaper, slower wall-clock.
-    //   batch=false — work-stealing pool: keep `concurrency` jobs in flight so
-    //                 a slow call doesn't block the others in its slice.
+    // Work-stealing pool: keep `concurrency` jobs in flight so a slow call
+    // doesn't block the others in its slice.
     const rawByKey = new Map();
-    const runOnlinePool = async () => {
-        let done = 0;
-        let next = 0;
-        const worker = async () => {
-            while (true) {
-                const i = next++;
-                if (i >= jobs.length)
-                    return;
-                const r = await runSingle(jobs[i], opts, transport);
-                const key = `${r.variantId}:${r.caseHash}:${r.model ?? ""}`;
-                const arr = rawByKey.get(key) ?? [];
-                arr.push(r);
-                rawByKey.set(key, arr);
-                done++;
-                opts.onProgress?.(done, jobs.length, r.caseName, r.variantId);
-            }
-        };
-        await Promise.all(Array.from({ length: Math.min(concurrency, jobs.length) }, worker));
-    };
-    if (opts.batch) {
-        try {
-            await runBatchPath(jobs, opts, rawByKey);
-        }
-        catch (err) {
-            // Batch submission failed (Kimi's /v1/files doesn't match OpenAI,
-            // OpenRouter has no batch at all, transient provider error, etc.).
-            // Fall back to the online pool so the whole run doesn't die — losing
-            // the 50% batch discount is better than losing the run.
-            const msg = err instanceof Error ? err.message : String(err);
-            opts.onBatchProgress?.(`batch path failed, falling back to online: ${msg.slice(0, 200)}`);
-            rawByKey.clear(); // discard any partial state
-            await runOnlinePool();
+    let done = 0;
+    let next = 0;
+    const worker = async () => {
+        while (true) {
+            const i = next++;
+            if (i >= jobs.length)
+                return;
+            const r = await runSingle(jobs[i], opts, transport);
+            const key = `${r.variantId}:${r.caseHash}:${r.model ?? ""}`;
+            const arr = rawByKey.get(key) ?? [];
+            arr.push(r);
+            rawByKey.set(key, arr);
+            done++;
+            opts.onProgress?.(done, jobs.length, r.caseName, r.variantId);
         }
-    }
-    else {
-        await runOnlinePool();
-    }
+    };
+    await Promise.all(Array.from({ length: Math.min(concurrency, jobs.length) }, worker));
     // Adaptive sampling: for cells where any score-dim σ exceeds threshold,
     // add one more rep and rerun — up to `cap` total reps. Converges on a
     // stable estimate without wasting reps on already-stable cells.
-    if (!opts.batch && opts.adaptiveReps) {
+    if (opts.adaptiveReps) {
         const cap = opts.adaptiveReps.cap;
         const threshold = opts.adaptiveReps.threshold ?? 0.1;
         for (let round = 0; round < cap - reps; round++) {
@@ -131,7 +106,7 @@ export async function buildMatrix(variants, cases, opts) {
     }
     // Optional llm-judge pass on top-N variants (by current heuristic content).
     if (opts.judge)
-        await runJudge(variants, cases, models, aggregated, opts.judge, opts);
+        await runJudge(variants, cases, models, aggregated, opts.judge);
     // Assemble rows: per-variant aggregate across all cases and models.
     const rows = [];
     for (const v of variants) {
@@ -244,80 +219,6 @@ function halfSplitMatrix(variants, cases, models, rawByKey, side) {
     scored.sort((a, b) => b.g - a.g);
     return scored.map((s) => s.id);
 }
-async function runBatchPath(jobs, opts, rawByKey) {
-    const provider = detectBatchProvider(opts.baseUrl);
-    if (provider === "unsupported") {
-        throw new Error(`Batch API not supported for baseUrl=${opts.baseUrl}; rerun without --batch or point at an Anthropic / OpenAI-compatible endpoint.`);
-    }
-    // Build custom_ids that route results back to the right cell. Index is
-    // included so reps of the same (variant, case, model) don't collide.
-    const keyed = jobs.map((job, i) => ({
-        job,
-        index: i,
-        customId: `v:${job.variantId}|h:${job.case.hash}|m:${job.model}|r:${job.rep}|i:${i}`,
-    }));
-    const batchJobs = keyed.map((k) => ({
-        customId: k.customId,
-        userText: k.job.text,
-        systemText: k.job.systemText,
-        model: k.job.model,
-    }));
-    const started = Date.now();
-    const existing = opts.runId != null && opts.generation != null
-        ? loadBatchState(opts.runId, opts.generation, "eval")
-        : null;
-    const transport = opts.batchCallModel ?? batchCallModel;
-    const results = await transport(batchJobs, {
-        baseUrl: opts.baseUrl,
-        authToken: opts.authToken,
-        maxTokens: opts.maxTokens,
-        resumeBatchId: existing?.batchId,
-        onSubmitted: (batchId, p) => {
-            if (opts.runId != null && opts.generation != null && !existing) {
-                saveBatchState(opts.runId, {
-                    generation: opts.generation,
-                    phase: "eval",
-                    batchId,
-                    provider: p,
-                    submittedAt: new Date().toISOString(),
-                });
-            }
-            opts.onBatchProgress?.(`batch submitted: ${batchId} (${p})`);
-        },
-        onProgress: (p) => {
-            if (p.phase === "polling") {
-                const ok = p.succeeded ?? 0;
-                const failed = p.failed ?? 0;
-                const total = p.total ?? batchJobs.length;
-                opts.onBatchProgress?.(`batch ${p.batchId} polling: ${ok}/${total} done${failed ? `, ${failed} failed` : ""}`);
-            }
-            else {
-                opts.onBatchProgress?.(`batch ${p.batchId} ${p.phase}`);
-            }
-        },
-    });
-    // Mark the state entry as finished so a crash after this point doesn't
-    // cause the next run to try resuming an already-consumed batch.
-    if (opts.runId != null && existing)
-        markBatchFinished(opts.runId, existing.batchId);
-    // Score each result and populate rawByKey the same way runSingle does.
-    const durationMs = Math.round((Date.now() - started) / Math.max(1, jobs.length));
-    let done = 0;
-    for (const k of keyed) {
-        const r = results.get(k.customId);
-        const raw = r?.raw ?? "batch returned no result for this custom_id";
-        const costUsd = r?.costUsd ?? 0;
-        const parsed = attemptJsonParse(raw);
-        const scored = scoreOutput(raw, parsed, costUsd, durationMs, k.job.case, { model: k.job.model });
-        scored.variantId = k.job.variantId;
-        const mapKey = `${scored.variantId}:${scored.caseHash}:${scored.model ?? ""}`;
-        const arr = rawByKey.get(mapKey) ?? [];
-        arr.push(scored);
-        rawByKey.set(mapKey, arr);
-        done++;
-        opts.onProgress?.(done, jobs.length, k.job.case.name, k.job.variantId);
-    }
-}
 async function runSingle(job, opts, transport) {
     const started = Date.now();
     const callOpts = {

package/dist/prompt-evolution/index.d.ts CHANGED Viewed

@@ -54,8 +54,6 @@ export interface EvolveOpts {
     repetitions?: number;
     /** Max in-flight eval calls. Default 8. Raise for slow endpoints, lower for strict rate limits. */
     concurrency?: number;
-    /** Use provider batch API instead of online calls. 50% cheaper, slower wall-clock. */
-    batch?: boolean;
     /** Adaptive sampling cap (opt-in). Keeps adding reps to noisy cells up to this count. */
     adaptiveReps?: {
         cap: number;

package/dist/prompt-evolution/index.js CHANGED Viewed

@@ -73,14 +73,10 @@ export async function evolvePrompt(opts) {
             concurrency: opts.concurrency ?? 8,
             repetitions: opts.repetitions,
             judge: opts.judge,
-            batch: opts.batch,
             adaptiveReps: opts.adaptiveReps,
-            runId,
-            generation: gen,
             onProgress: (done, total, caseName, variantId) => {
                 log(`  [${done}/${total}] ${variantId.slice(0, 16)} → ${caseName}`);
             },
-            onBatchProgress: (msg) => log(`  [batch] ${msg}`),
         };
         const matrix = await buildMatrix(population, trainCases, evalOpts);
         generationMatrices.push(matrix);
@@ -195,11 +191,7 @@ export async function evolvePrompt(opts) {
         concurrency: opts.concurrency ?? 8,
         repetitions: opts.repetitions,
         judge: opts.judge,
-        batch: opts.batch,
         adaptiveReps: opts.adaptiveReps,
-        runId,
-        generation: generations,
-        onBatchProgress: (msg) => log(`  [batch] ${msg}`),
     });
     generationMatrices.push(finalMatrix);
     snapshotPrompts(runId, finalMatrix);
@@ -219,11 +211,7 @@ export async function evolvePrompt(opts) {
             authToken: opts.authToken,
             concurrency: opts.concurrency ?? 8,
             repetitions: opts.repetitions,
-            batch: opts.batch,
             adaptiveReps: opts.adaptiveReps,
-            runId,
-            generation: generations + 1,
-            onBatchProgress: (msg) => log(`  [batch-test] ${msg}`),
         });
         log(formatMatrix(testMatrix, testCases.map((c) => c.name)));
     }

package/dist/prompt-evolution/persistence.d.ts CHANGED Viewed

@@ -37,26 +37,6 @@ export declare function appendLearning(runId: string, entries: LearningEntry[]):
 export declare function snapshotPrompts(runId: string, rows: VariantRow[]): void;
 /** Finalise the run: write best.md and update meta.json. */
 export declare function finalizeRun(runId: string, result: EvolutionResult, metaPartial?: Partial<RunMeta>): void;
-/**
- * Persist batch submission state so a crashed or restarted run can resume
- * polling instead of resubmitting (which would duplicate the bill).
- *
- * Keyed by (generation, phase) so multi-generation runs and eval-vs-judge
- * submissions don't collide. Written append-only — the latest entry wins
- * on load.
- */
-export interface BatchStateEntry {
-    generation: number;
-    phase: "eval" | "judge";
-    batchId: string;
-    provider: "anthropic" | "openai-compatible";
-    submittedAt: string;
-    /** If set, we've already collected results for this entry — ignore on resume. */
-    finishedAt?: string;
-}
-export declare function saveBatchState(runId: string, entry: BatchStateEntry): void;
-export declare function loadBatchState(runId: string, generation: number, phase: "eval" | "judge"): BatchStateEntry | null;
-export declare function markBatchFinished(runId: string, batchId: string): void;
 /** List all runs, newest first. */
 export declare function listRuns(): Array<{
     runId: string;

package/dist/prompt-evolution/persistence.js CHANGED Viewed

@@ -118,45 +118,6 @@ ${result.learningLog.map((l) => `| ${l.generation} | ${l.mutationSummary} | ${(l
 `;
     writeFileSync(join(root, "best.md"), report);
 }
-export function saveBatchState(runId, entry) {
-    const path = join(runDir(runId), "batch-jobs.jsonl");
-    writeFileSync(path, JSON.stringify(entry) + "\n", { flag: "a" });
-}
-export function loadBatchState(runId, generation, phase) {
-    const path = join(runDir(runId), "batch-jobs.jsonl");
-    if (!existsSync(path))
-        return null;
-    const lines = readFileSync(path, "utf-8").split("\n").filter(Boolean);
-    let latest = null;
-    for (const line of lines) {
-        try {
-            const e = JSON.parse(line);
-            if (e.generation === generation && e.phase === phase)
-                latest = e;
-        }
-        catch { /* skip malformed */ }
-    }
-    // Only return if not yet finished — otherwise caller would re-poll a consumed batch.
-    return latest && !latest.finishedAt ? latest : null;
-}
-export function markBatchFinished(runId, batchId) {
-    const path = join(runDir(runId), "batch-jobs.jsonl");
-    if (!existsSync(path))
-        return;
-    const lines = readFileSync(path, "utf-8").split("\n").filter(Boolean);
-    const updated = lines.map((line) => {
-        try {
-            const e = JSON.parse(line);
-            if (e.batchId === batchId && !e.finishedAt) {
-                e.finishedAt = new Date().toISOString();
-                return JSON.stringify(e);
-            }
-        }
-        catch { /* skip */ }
-        return line;
-    });
-    writeFileSync(path, updated.join("\n") + "\n");
-}
 /** List all runs, newest first. */
 export function listRuns() {
     const root = storeRoot();

package/dist/prompt-evolution/transport.js CHANGED Viewed

@@ -8,17 +8,22 @@
  * Supports both Anthropic-native and OpenAI-compatible endpoints so we can
  * run the same eval against Haiku, Kimi, and OpenRouter without a rewrite.
  */
+import { VERSION } from "../core/_version.js";
+const USER_AGENT = `claude-overnight-evolve/${VERSION}`;
 export async function defaultCallModel(userText, systemText, opts) {
     const baseUrl = (opts.baseUrl ?? process.env.ANTHROPIC_BASE_URL ?? "https://api.anthropic.com").replace(/\/$/, "");
     const authToken = opts.authToken ?? process.env.ANTHROPIC_AUTH_TOKEN ?? process.env.ANTHROPIC_API_KEY ?? "";
     const isAnthropic = /^https?:\/\/(api\.)?anthropic\.com/i.test(baseUrl);
-    const isKimi = /kimi\.com/i.test(baseUrl);
+    // Identify ourselves honestly. Kimi's coding-endpoint docs explicitly say
+    // "Tampering with the client identifier (User-Agent) is considered a
+    // violation." The previous "Kilo-Code/1.0" was impersonating a third-party
+    // tool; we now send our real binary name + version.
     const headers = {
         "Content-Type": "application/json",
         "Authorization": `Bearer ${authToken}`,
+        "User-Agent": USER_AGENT,
     };
-    if (isKimi)
-        headers["User-Agent"] = "Kilo-Code/1.0";
+    const maxOut = opts.maxTokens ?? 4096;
     let endpoint;
     let body;
     if (isAnthropic) {
@@ -26,7 +31,7 @@ export async function defaultCallModel(userText, systemText, opts) {
         headers["anthropic-version"] = "2023-06-01";
         const payload = {
             model: opts.model,
-            max_tokens: opts.maxTokens ?? 4096,
+            max_tokens: maxOut, // Anthropic uses max_tokens, not max_completion_tokens.
             messages: [{ role: "user", content: userText }],
         };
         if (systemText)
@@ -39,9 +44,14 @@ export async function defaultCallModel(userText, systemText, opts) {
         if (systemText)
             messages.push({ role: "system", content: systemText });
         messages.push({ role: "user", content: userText });
+        // Platform.moonshot.ai marks max_tokens deprecated in favor of
+        // max_completion_tokens. Kimi's coding endpoint still accepts max_tokens.
+        // Sending both is safe — OpenAI, Moonshot, DeepSeek, and Kimi all tolerate
+        // the extra field, and we're future-proof against the deprecation.
         body = JSON.stringify({
             model: opts.model,
-            max_tokens: opts.maxTokens ?? 4096,
+            max_tokens: maxOut,
+            max_completion_tokens: maxOut,
             messages,
         });
     }

package/docs/prompt-evolution-research.md CHANGED Viewed

@@ -183,7 +183,7 @@ Your laptop can be off the whole time.
 npm run evolve -- --prompt 10_planning/10-3_plan --eval-model claude-haiku-4-5 --generations 10
 # Evolve an MCP-browser supervision prompt
-npm run evolve -- --target mcp-browser --prompt-kind plan-supervision --eval-model kimi-k2-6 --generations 10
+npm run evolve -- --target mcp-browser --prompt-kind plan-supervision --eval-model kimi-for-coding --generations 10
 ```
 ### Via Platform API (runs on server)

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "claude-overnight",
-  "version": "1.57.4",
+  "version": "1.59.0",
   "description": "Overnight parallel coding agents in git worktrees, with a self-curating skill memory that improves while the run is going. Mix Claude Opus as planner, Kimi 2.6 or Cursor composer-2 as cheap fast worker, Gemini or Qwen for bulk implementation. Multi-wave autonomous loop that plans, executes, reviews, and steers itself until the objective is met. Crash-safe resume, rate-limit aware, usage cap preserves headroom for your interactive Claude Code.",
   "type": "module",
   "bin": {

package/plugins/claude-overnight/.claude-plugin/plugin.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "claude-overnight",
-  "version": "1.57.4",
+  "version": "1.59.0",
   "description": "Claude Code skill for understanding, installing, and inspecting claude-overnight runs: overnight parallel coding agents in git worktrees with a self-curating skill memory, multi-wave steering, three-layer review, and crash-safe resume. Mix Opus planner with Kimi 2.6, Cursor composer-2, Gemini, Qwen, or any Anthropic-compatible worker.",
   "author": {
     "name": "Francesco Fornace"

package/dist/prompt-evolution/transport-batch.d.ts DELETED Viewed

@@ -1,54 +0,0 @@
-/**
- * Batch-API transport for prompt evolution.
- *
- * 50% cheaper than online calls on every major provider that supports
- * batch. Perfect fit for generations=1 benchmark rounds where interactive
- * progress isn't needed — we submit 120-1000 requests, poll every 30-300s,
- * then pull the results in one shot.
- *
- * Provider detection from baseUrl:
- *   - api.anthropic.com → Anthropic Message Batches API (one-shot submit)
- *   - kimi / moonshot / openai → OpenAI-compatible file-based batch
- *   - openrouter → NO batch support; throws (caller must fall back to online)
- *
- * Custom IDs route results back to the right (variant, case, model, rep)
- * cell. The evaluator builds ids like `v0:h_abc:kimi-k2-6:r0`.
- *
- * Poll state is persisted via `persistBatchState` so a crashed or
- * restarted run can resume without resubmitting.
- */
-import type { CallModelResult } from "./transport.js";
-export interface BatchJob {
-    customId: string;
-    userText: string;
-    systemText?: string;
-    model: string;
-}
-export interface BatchOpts {
-    baseUrl?: string;
-    authToken?: string;
-    maxTokens?: number;
-    /** Poll interval starts here and doubles to `pollMaxMs`. Defaults 30s → 5min. */
-    pollStartMs?: number;
-    pollMaxMs?: number;
-    /** Overall timeout for the whole batch. Default 24h — matches provider SLAs. */
-    batchTimeoutMs?: number;
-    /** Called with progress snapshots during polling. */
-    onProgress?: (p: BatchProgress) => void;
-    /** Restore a previously-submitted batch instead of resubmitting. */
-    resumeBatchId?: string;
-    /** Called after submit returns an id — use to persist for crash resume. */
-    onSubmitted?: (batchId: string, provider: BatchProvider) => void;
-}
-export interface BatchProgress {
-    provider: BatchProvider;
-    batchId: string;
-    phase: "submitted" | "polling" | "downloading" | "done";
-    processing?: number;
-    succeeded?: number;
-    failed?: number;
-    total?: number;
-}
-export type BatchProvider = "anthropic" | "openai-compatible" | "unsupported";
-export declare function detectBatchProvider(baseUrl: string | undefined): BatchProvider;
-export declare function batchCallModel(jobs: BatchJob[], opts: BatchOpts): Promise<Map<string, CallModelResult>>;

package/dist/prompt-evolution/transport-batch.js DELETED Viewed

@@ -1,216 +0,0 @@
-/**
- * Batch-API transport for prompt evolution.
- *
- * 50% cheaper than online calls on every major provider that supports
- * batch. Perfect fit for generations=1 benchmark rounds where interactive
- * progress isn't needed — we submit 120-1000 requests, poll every 30-300s,
- * then pull the results in one shot.
- *
- * Provider detection from baseUrl:
- *   - api.anthropic.com → Anthropic Message Batches API (one-shot submit)
- *   - kimi / moonshot / openai → OpenAI-compatible file-based batch
- *   - openrouter → NO batch support; throws (caller must fall back to online)
- *
- * Custom IDs route results back to the right (variant, case, model, rep)
- * cell. The evaluator builds ids like `v0:h_abc:kimi-k2-6:r0`.
- *
- * Poll state is persisted via `persistBatchState` so a crashed or
- * restarted run can resume without resubmitting.
- */
-export function detectBatchProvider(baseUrl) {
-    const url = (baseUrl ?? "https://api.anthropic.com").toLowerCase();
-    if (/(^|\/\/)(api\.)?anthropic\.com/.test(url))
-        return "anthropic";
-    if (/openrouter/.test(url))
-        return "unsupported";
-    // Everything else that speaks /v1/chat/completions — OpenAI, Kimi, Moonshot,
-    // DeepSeek — exposes an OpenAI-compatible batch endpoint.
-    return "openai-compatible";
-}
-export async function batchCallModel(jobs, opts) {
-    if (jobs.length === 0)
-        return new Map();
-    const provider = detectBatchProvider(opts.baseUrl);
-    if (provider === "unsupported") {
-        throw new Error(`Batch API not supported for baseUrl=${opts.baseUrl}; use online transport`);
-    }
-    if (provider === "anthropic")
-        return runAnthropicBatch(jobs, opts);
-    return runOpenAIBatch(jobs, opts);
-}
-// ── Anthropic ──────────────────────────────────────────────────────────────
-async function runAnthropicBatch(jobs, opts) {
-    const baseUrl = (opts.baseUrl ?? "https://api.anthropic.com").replace(/\/$/, "");
-    const authToken = opts.authToken ?? process.env.ANTHROPIC_AUTH_TOKEN ?? process.env.ANTHROPIC_API_KEY ?? "";
-    const headers = {
-        "Content-Type": "application/json",
-        "Authorization": `Bearer ${authToken}`,
-        "anthropic-version": "2023-06-01",
-        "anthropic-beta": "message-batches-2024-09-24",
-    };
-    let batchId = opts.resumeBatchId;
-    if (!batchId) {
-        const body = JSON.stringify({
-            requests: jobs.map((j) => {
-                const params = {
-                    model: j.model,
-                    max_tokens: opts.maxTokens ?? 4096,
-                    messages: [{ role: "user", content: j.userText }],
-                };
-                if (j.systemText)
-                    params.system = j.systemText;
-                return { custom_id: j.customId, params };
-            }),
-        });
-        const res = await fetch(`${baseUrl}/v1/messages/batches`, { method: "POST", headers, body });
-        if (!res.ok)
-            throw new Error(`Anthropic batch submit: HTTP ${res.status} ${await res.text()}`);
-        const data = await res.json();
-        batchId = data.id;
-        opts.onSubmitted?.(batchId, "anthropic");
-    }
-    opts.onProgress?.({ provider: "anthropic", batchId, phase: "submitted", total: jobs.length });
-    const endedAt = await pollUntilDone(async () => {
-        const res = await fetch(`${baseUrl}/v1/messages/batches/${batchId}`, { headers });
-        if (!res.ok)
-            throw new Error(`Anthropic batch poll: HTTP ${res.status}`);
-        const d = await res.json();
-        opts.onProgress?.({
-            provider: "anthropic",
-            batchId: batchId,
-            phase: "polling",
-            processing: d.request_counts?.processing,
-            succeeded: d.request_counts?.succeeded,
-            failed: (d.request_counts?.errored ?? 0) + (d.request_counts?.canceled ?? 0) + (d.request_counts?.expired ?? 0),
-            total: jobs.length,
-        });
-        return d.processing_status === "ended" ? d : null;
-    }, opts);
-    opts.onProgress?.({ provider: "anthropic", batchId, phase: "downloading" });
-    const resultsUrl = endedAt.results_url ?? `${baseUrl}/v1/messages/batches/${batchId}/results`;
-    const res = await fetch(resultsUrl, { headers });
-    if (!res.ok)
-        throw new Error(`Anthropic batch results: HTTP ${res.status}`);
-    const text = await res.text();
-    const out = new Map();
-    for (const line of text.split("\n")) {
-        if (!line.trim())
-            continue;
-        const row = JSON.parse(line);
-        if (row.result.type === "succeeded") {
-            const raw = row.result.message.content.map((c) => c.text ?? "").join("");
-            const inp = row.result.message.usage?.input_tokens ?? 0;
-            const outp = row.result.message.usage?.output_tokens ?? 0;
-            out.set(row.custom_id, { raw, costUsd: (inp * 0.000003 + outp * 0.000015) * 0.5, inputTokens: inp, outputTokens: outp });
-        }
-        else {
-            const msg = row.result.type === "errored" ? row.result.error.message : row.result.type;
-            out.set(row.custom_id, { raw: `batch error: ${msg}`, costUsd: 0, inputTokens: 0, outputTokens: 0 });
-        }
-    }
-    opts.onProgress?.({ provider: "anthropic", batchId, phase: "done", succeeded: out.size, total: jobs.length });
-    return out;
-}
-// ── OpenAI-compatible (OpenAI, Kimi/Moonshot, DeepSeek) ────────────────────
-async function runOpenAIBatch(jobs, opts) {
-    const baseUrl = (opts.baseUrl ?? "https://api.openai.com").replace(/\/$/, "");
-    const authToken = opts.authToken ?? process.env.ANTHROPIC_AUTH_TOKEN ?? process.env.ANTHROPIC_API_KEY ?? "";
-    const authHeaders = { "Authorization": `Bearer ${authToken}` };
-    let batchId = opts.resumeBatchId;
-    let outputFileId;
-    if (!batchId) {
-        // Build the JSONL payload and upload as a file.
-        const jsonl = jobs.map((j) => {
-            const messages = [];
-            if (j.systemText)
-                messages.push({ role: "system", content: j.systemText });
-            messages.push({ role: "user", content: j.userText });
-            return JSON.stringify({
-                custom_id: j.customId,
-                method: "POST",
-                url: "/v1/chat/completions",
-                body: { model: j.model, max_tokens: opts.maxTokens ?? 4096, messages },
-            });
-        }).join("\n");
-        const form = new FormData();
-        form.append("purpose", "batch");
-        form.append("file", new Blob([jsonl], { type: "application/jsonl" }), "batch-input.jsonl");
-        const fileRes = await fetch(`${baseUrl}/v1/files`, { method: "POST", headers: authHeaders, body: form });
-        if (!fileRes.ok) {
-            const body = await fileRes.text().catch(() => "");
-            throw new Error(`Batch file-upload failed: HTTP ${fileRes.status} at ${baseUrl}/v1/files. ` +
-                `This provider may not support OpenAI-compatible batch. Response: ${body.slice(0, 300)}`);
-        }
-        const fileData = await fileRes.json();
-        const createRes = await fetch(`${baseUrl}/v1/batches`, {
-            method: "POST",
-            headers: { ...authHeaders, "Content-Type": "application/json" },
-            body: JSON.stringify({ input_file_id: fileData.id, endpoint: "/v1/chat/completions", completion_window: "24h" }),
-        });
-        if (!createRes.ok)
-            throw new Error(`OpenAI-compat batch create: HTTP ${createRes.status} ${await createRes.text()}`);
-        const createData = await createRes.json();
-        batchId = createData.id;
-        opts.onSubmitted?.(batchId, "openai-compatible");
-    }
-    opts.onProgress?.({ provider: "openai-compatible", batchId, phase: "submitted", total: jobs.length });
-    const endedAt = await pollUntilDone(async () => {
-        const res = await fetch(`${baseUrl}/v1/batches/${batchId}`, { headers: authHeaders });
-        if (!res.ok)
-            throw new Error(`OpenAI-compat batch poll: HTTP ${res.status}`);
-        const d = await res.json();
-        opts.onProgress?.({
-            provider: "openai-compatible",
-            batchId: batchId,
-            phase: "polling",
-            succeeded: d.request_counts?.completed,
-            failed: d.request_counts?.failed,
-            total: d.request_counts?.total ?? jobs.length,
-        });
-        if (d.status === "completed")
-            return d;
-        if (d.status === "failed" || d.status === "expired" || d.status === "cancelled") {
-            throw new Error(`OpenAI-compat batch ${d.status}`);
-        }
-        return null;
-    }, opts);
-    outputFileId = endedAt.output_file_id;
-    if (!outputFileId)
-        throw new Error("OpenAI-compat batch completed with no output_file_id");
-    opts.onProgress?.({ provider: "openai-compatible", batchId, phase: "downloading" });
-    const contentRes = await fetch(`${baseUrl}/v1/files/${outputFileId}/content`, { headers: authHeaders });
-    if (!contentRes.ok)
-        throw new Error(`OpenAI-compat batch download: HTTP ${contentRes.status}`);
-    const text = await contentRes.text();
-    const out = new Map();
-    for (const line of text.split("\n")) {
-        if (!line.trim())
-            continue;
-        const row = JSON.parse(line);
-        if (row.error || !row.response) {
-            out.set(row.custom_id, { raw: `batch error: ${row.error?.message ?? "unknown"}`, costUsd: 0, inputTokens: 0, outputTokens: 0 });
-            continue;
-        }
-        const raw = row.response.body.choices?.[0]?.message?.content ?? "";
-        const inp = row.response.body.usage?.prompt_tokens ?? 0;
-        const outp = row.response.body.usage?.completion_tokens ?? 0;
-        out.set(row.custom_id, { raw, costUsd: (inp * 0.000003 + outp * 0.000015) * 0.5, inputTokens: inp, outputTokens: outp });
-    }
-    opts.onProgress?.({ provider: "openai-compatible", batchId, phase: "done", succeeded: out.size, total: jobs.length });
-    return out;
-}
-// ── Shared poll loop ───────────────────────────────────────────────────────
-async function pollUntilDone(check, opts) {
-    const start = Date.now();
-    const deadline = start + (opts.batchTimeoutMs ?? 24 * 60 * 60 * 1000);
-    let delay = opts.pollStartMs ?? 30_000;
-    const maxDelay = opts.pollMaxMs ?? 5 * 60_000;
-    while (Date.now() < deadline) {
-        const result = await check();
-        if (result != null)
-            return result;
-        await new Promise((r) => setTimeout(r, delay));
-        delay = Math.min(maxDelay, delay * 2);
-    }
-    throw new Error("Batch exceeded batchTimeoutMs without completing");
-}