claude-overnight 1.51.3 → 1.53.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -0
- package/dist/bin/evolve.d.ts +18 -0
- package/dist/bin/evolve.js +185 -0
- package/dist/core/_version.d.ts +1 -1
- package/dist/core/_version.js +1 -1
- package/dist/prompt-evolution/adapters/mcp-browser.js +21 -3
- package/dist/prompt-evolution/evaluator.js +3 -0
- package/dist/prompt-evolution/index.d.ts +7 -6
- package/dist/prompt-evolution/index.js +71 -10
- package/dist/prompt-evolution/llm-judge.d.ts +40 -0
- package/dist/prompt-evolution/llm-judge.js +158 -0
- package/dist/prompt-evolution/mutator.js +9 -5
- package/dist/prompt-evolution/persistence.d.ts +51 -0
- package/dist/prompt-evolution/persistence.js +169 -0
- package/dist/prompt-evolution/report.d.ts +20 -0
- package/dist/prompt-evolution/report.js +134 -0
- package/dist/prompt-evolution/types.d.ts +7 -0
- package/docs/prompt-evolution-research.md +256 -0
- package/package.json +5 -4
- package/plugins/claude-overnight/.claude-plugin/plugin.json +1 -1
package/README.md
CHANGED
|
@@ -572,6 +572,19 @@ A fixed-plan `tasks.json` (without `flexiblePlan: true`) bypasses orchestration
|
|
|
572
572
|
| `1` | Some tasks failed |
|
|
573
573
|
| `2` | All failed or none completed |
|
|
574
574
|
|
|
575
|
+
## Prompt evolution (server-side)
|
|
576
|
+
|
|
577
|
+
The `src/prompt-evolution/` engine and `claude-overnight-evolve` CLI power a self-evolution pipeline that optimises prompts (the planner prompt here, MCP-browser's supervisor prompts, or any prompt in a user's repo) via Pareto-frontier mutation with LLM-as-judge and heuristic scoring.
|
|
578
|
+
|
|
579
|
+
**Multi-hour runs aren't meant for your laptop.** Two ways to run it:
|
|
580
|
+
|
|
581
|
+
1. **`npx claude-overnight-evolve …`** — quickest. Fine for smoke tests or short runs; needs `ANTHROPIC_API_KEY` in env and keeps running only as long as your shell is open. Output: `~/.claude-overnight/prompt-evolution/<runId>/`.
|
|
582
|
+
2. **Self-hosted Docker** — [`self-host/`](self-host/README.md) ships a tiny runner image + optional HTTP server (enqueue + read-back) you can run on any VPS. Laptop can be off.
|
|
583
|
+
|
|
584
|
+
Experiment credentials — any Anthropic-compatible provider (Anthropic direct, OpenRouter, Kimi, DashScope, a local proxy) — are injected via env vars: `ANTHROPIC_BASE_URL`, `ANTHROPIC_API_KEY`, `EVAL_MODEL`, `MUTATE_MODEL`. Self-host reads them from `self-host/.env` (or per-run `env:` in the enqueue body).
|
|
585
|
+
|
|
586
|
+
Full design: [docs/prompt-evolution-research.md](docs/prompt-evolution-research.md).
|
|
587
|
+
|
|
575
588
|
## License
|
|
576
589
|
|
|
577
590
|
MIT
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* `claude-overnight-evolve` — CLI for the prompt-evolution engine.
|
|
4
|
+
*
|
|
5
|
+
* Ships with the npm package (compiled to dist/bin/evolve.js). The MCP-browser
|
|
6
|
+
* platform runs this binary inside a per-project `raw`-mode container via
|
|
7
|
+
* `docker exec`. See docs/prompt-evolution-research.md.
|
|
8
|
+
*
|
|
9
|
+
* Examples:
|
|
10
|
+
* claude-overnight-evolve --prompt 10_planning/10-3_plan --eval-model claude-haiku-4-5 --generations 3
|
|
11
|
+
* claude-overnight-evolve --target mcp-browser --prompt-kind plan-supervision --eval-model kimi-k2-6
|
|
12
|
+
*
|
|
13
|
+
* Requires ANTHROPIC_API_KEY (or ANTHROPIC_AUTH_TOKEN) in env. When `--target
|
|
14
|
+
* mcp-browser` is used the cwd must be the MCP-browser repo root (so
|
|
15
|
+
* `platform/supervisor/gemini-client.ts` resolves), or pass the file via
|
|
16
|
+
* `MCP_BROWSER_GEMINI_CLIENT`.
|
|
17
|
+
*/
|
|
18
|
+
export {};
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* `claude-overnight-evolve` — CLI for the prompt-evolution engine.
|
|
4
|
+
*
|
|
5
|
+
* Ships with the npm package (compiled to dist/bin/evolve.js). The MCP-browser
|
|
6
|
+
* platform runs this binary inside a per-project `raw`-mode container via
|
|
7
|
+
* `docker exec`. See docs/prompt-evolution-research.md.
|
|
8
|
+
*
|
|
9
|
+
* Examples:
|
|
10
|
+
* claude-overnight-evolve --prompt 10_planning/10-3_plan --eval-model claude-haiku-4-5 --generations 3
|
|
11
|
+
* claude-overnight-evolve --target mcp-browser --prompt-kind plan-supervision --eval-model kimi-k2-6
|
|
12
|
+
*
|
|
13
|
+
* Requires ANTHROPIC_API_KEY (or ANTHROPIC_AUTH_TOKEN) in env. When `--target
|
|
14
|
+
* mcp-browser` is used the cwd must be the MCP-browser repo root (so
|
|
15
|
+
* `platform/supervisor/gemini-client.ts` resolves), or pass the file via
|
|
16
|
+
* `MCP_BROWSER_GEMINI_CLIENT`.
|
|
17
|
+
*/
|
|
18
|
+
import { evolvePrompt } from "../prompt-evolution/index.js";
|
|
19
|
+
import { PLAN_CASES } from "../prompt-evolution/fixtures/plan-cases.js";
|
|
20
|
+
import { scenariosToCases, PLANNING_SCENARIOS, REVIEW_SCENARIOS, SUPERVISION_SCENARIOS, STUCK_SCENARIOS, hydrateCases, extractPrompt, } from "../prompt-evolution/adapters/mcp-browser.js";
|
|
21
|
+
/**
 * Print the CLI usage text to stdout, then terminate the process with exit
 * code 0. Never returns to the caller.
 */
function help() {
    const usage = `Usage: claude-overnight-evolve [options]

Options:
--target <name> claude-overnight | mcp-browser (default: claude-overnight)
--prompt <path> Prompt file path (claude-overnight)
--prompt-kind <kind> MCP-browser prompt kind: planning | review | evolution |
goal-refinement | plan-supervision | simple-supervision | stuck-analysis
--eval-model <model> Fast model for evaluation (default: claude-haiku-4-5)
--mutate-model <model> Smarter model for mutation (defaults to eval-model)
--generations <n> Number of evolution generations (default: 10)
--population <n> Max population size (default: 8)
--plateau <n> Stop early if no improvement for N generations (default: 3)
--cases <suite> Benchmark suite: plan | mcp-planning | mcp-review |
mcp-supervision | mcp-stuck (default: plan)
--base-url <url> API base URL override
--auth-token <token> Auth token override
--run-id <id> Preset run id (default: auto-generated)
`;
    process.stdout.write(usage);
    process.exit(0);
}
|
|
42
|
+
/**
 * Parse `process.argv` into the option bag consumed by `main()`.
 *
 * Defaults come from the environment where available (`EVAL_MODEL`,
 * `MUTATE_MODEL`, `ANTHROPIC_BASE_URL`, `ANTHROPIC_AUTH_TOKEN` /
 * `ANTHROPIC_API_KEY`); explicit flags override them. `--help` / `-h` prints
 * usage and exits via `help()`.
 *
 * @returns {object} Parsed options: target, prompt, promptKind, evalModel,
 *   mutateModel, generations, population, plateau, cases, baseUrl, authToken
 *   and, only when `--run-id` was passed, runId.
 * @throws {Error} When a recognised flag has no value after it, or when a
 *   numeric flag is not a non-negative integer.
 */
function parseArgs() {
    const args = process.argv.slice(2);
    if (args.includes("--help") || args.includes("-h"))
        help();
    const opts = {
        target: "claude-overnight",
        prompt: "10_planning/10-3_plan",
        promptKind: "",
        evalModel: process.env.EVAL_MODEL ?? "claude-haiku-4-5",
        mutateModel: process.env.MUTATE_MODEL,
        generations: 10,
        population: 8,
        plateau: 3,
        cases: "",
        baseUrl: process.env.ANTHROPIC_BASE_URL,
        authToken: process.env.ANTHROPIC_AUTH_TOKEN ?? process.env.ANTHROPIC_API_KEY,
    };
    // Flag → option key. Kept in Maps so lookups never hit Object.prototype.
    const stringFlags = new Map([
        ["--target", "target"],
        ["--prompt", "prompt"],
        ["--prompt-kind", "promptKind"],
        ["--eval-model", "evalModel"],
        ["--mutate-model", "mutateModel"],
        ["--cases", "cases"],
        ["--base-url", "baseUrl"],
        ["--auth-token", "authToken"],
        ["--run-id", "runId"],
    ]);
    const intFlags = new Map([
        ["--generations", "generations"],
        ["--population", "population"],
        ["--plateau", "plateau"],
    ]);
    for (let i = 0; i < args.length; i++) {
        const flag = args[i];
        const stringKey = stringFlags.get(flag);
        const intKey = intFlags.get(flag);
        // Unrecognised tokens are skipped, matching the previous behaviour.
        if (stringKey === undefined && intKey === undefined)
            continue;
        const value = args[++i];
        if (value === undefined) {
            // Previously this stored `undefined` and failed later in confusing ways.
            throw new Error(`Missing value for ${flag}`);
        }
        if (stringKey !== undefined) {
            opts[stringKey] = value;
            continue;
        }
        const n = Number.parseInt(value, 10);
        if (Number.isNaN(n) || n < 0) {
            // A NaN generation count would make the evolution loop a silent no-op.
            throw new Error(`Invalid value for ${flag}: expected a non-negative integer, got "${value}"`);
        }
        opts[intKey] = n;
    }
    // MCP-browser runs derive their suite label from the prompt kind.
    if (opts.target === "mcp-browser" && !opts.cases) {
        opts.cases = `mcp-${opts.promptKind || "planning"}`;
    }
    if (!opts.cases)
        opts.cases = "plan";
    return opts;
}
|
|
119
|
+
/**
 * CLI entry point: resolve the benchmark cases and seed prompt for the chosen
 * target, print the run configuration, run `evolvePrompt`, then print the
 * best variant's scores and prompt text.
 *
 * @returns {Promise<void>}
 * @throws {Error} On an unknown `--prompt-kind` (mcp-browser target) or an
 *   unknown `--cases` suite (any other target); anything thrown by the
 *   evolution engine also propagates.
 */
async function main() {
    const opts = parseArgs();
    let cases;
    let promptPath = opts.prompt;
    let seedText;
    if (opts.target === "mcp-browser") {
        const kind = (opts.promptKind || "planning");
        // NOTE(review): "evolution" and "goal-refinement" map to empty scenario
        // lists, so such a run has zero benchmark cases — confirm this is intended.
        const scenarioMap = {
            planning: PLANNING_SCENARIOS,
            review: REVIEW_SCENARIOS,
            evolution: [],
            "goal-refinement": [],
            "plan-supervision": SUPERVISION_SCENARIOS,
            "simple-supervision": SUPERVISION_SCENARIOS,
            "stuck-analysis": STUCK_SCENARIOS,
        };
        if (!Object.hasOwn(scenarioMap, kind)) {
            // Fail here with the valid choices instead of handing `undefined`
            // scenarios to the adapter.
            throw new Error(`Unknown prompt kind: ${kind} (expected one of: ${Object.keys(scenarioMap).join(", ")})`);
        }
        cases = hydrateCases(scenariosToCases(kind, scenarioMap[kind]));
        promptPath = `mcp-browser/${kind}`;
        seedText = extractPrompt(kind);
    }
    else {
        if (opts.cases === "plan")
            cases = PLAN_CASES;
        else
            throw new Error(`Unknown case suite: ${opts.cases}`);
    }
    console.log(`Evolution config:`);
    console.log(` target: ${opts.target}`);
    console.log(` prompt: ${promptPath}`);
    console.log(` evalModel: ${opts.evalModel}`);
    console.log(` mutateModel: ${opts.mutateModel ?? opts.evalModel}`);
    console.log(` generations: ${opts.generations}`);
    console.log(` population: ${opts.population}`);
    console.log(` plateau: ${opts.plateau}`);
    console.log(` cases: ${cases.length} (${opts.cases})`);
    console.log("");
    const result = await evolvePrompt({
        promptPath,
        cases,
        evalModel: opts.evalModel,
        mutateModel: opts.mutateModel,
        generations: opts.generations,
        populationCap: opts.population,
        plateauGenerations: opts.plateau,
        baseUrl: opts.baseUrl,
        authToken: opts.authToken,
        seedText,
        target: opts.target,
        runId: opts.runId,
        onLog: (text) => console.log(text),
    });
    const best = result.bestVariant;
    // Scores are fractions in the result; print them as one-decimal percentages.
    const pct = (fraction) => `${(fraction * 100).toFixed(1)}%`;
    console.log("\n=== BEST VARIANT ===");
    console.log(`id: ${best.variantId}`);
    console.log(`generation: ${best.generation}`);
    console.log(`gmean: ${pct(best.gmean)}`);
    console.log(`parse: ${pct(best.aggregate.parse)}`);
    console.log(`schema: ${pct(best.aggregate.schema)}`);
    console.log(`content: ${pct(best.aggregate.content)}`);
    console.log(`cost: ${pct(best.aggregate.costEfficiency)}`);
    console.log(`speed: ${pct(best.aggregate.speed)}`);
    console.log("\n--- Prompt text ---");
    console.log(best.text);
}
// Any failure is printed and turned into exit code 1.
main().catch((err) => {
    console.error(err);
    process.exit(1);
});
|
package/dist/core/_version.d.ts
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
export declare const VERSION = "1.51.3";
|
|
1
|
+
export declare const VERSION = "1.53.0";
|
package/dist/core/_version.js
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
// Auto-generated by build — do not edit manually.
|
|
2
|
-
export const VERSION = "1.51.3";
|
|
2
|
+
export const VERSION = "1.53.0";
|
|
@@ -12,10 +12,28 @@
|
|
|
12
12
|
*/
|
|
13
13
|
import { readFileSync } from "node:fs";
|
|
14
14
|
import { resolve } from "node:path";
|
|
15
|
-
|
|
15
|
+
/**
 * Locate `gemini-client.ts` in whichever MCP-browser checkout is in scope.
 *
 * Lookup order:
 *   1. `MCP_BROWSER_GEMINI_CLIENT` — explicit path to the file itself.
 *   2. `MCP_BROWSER_REPO` — repo root; the relative file path is appended.
 *   3. The current working directory, treated as the repo root (the expected
 *      shape on fornace.net, where the repo is cloned at `/workspace` inside
 *      the project's raw container).
 *
 * NEVER hardcode an absolute host path here — this runs in a container on the
 * server and on any contributor's laptop.
 *
 * @returns {string} Absolute path to `gemini-client.ts`.
 */
function resolveGeminiClientPath() {
    const { MCP_BROWSER_GEMINI_CLIENT: explicitFile, MCP_BROWSER_REPO: repoRoot } = process.env;
    if (explicitFile) {
        return resolve(explicitFile);
    }
    return resolve(repoRoot ?? process.cwd(), "platform/supervisor/gemini-client.ts");
}
|
|
16
33
|
/** Extract a const prompt string from gemini-client.ts by name */
|
|
17
34
|
export function extractPrompt(kind) {
|
|
18
|
-
const
|
|
35
|
+
const path = resolveGeminiClientPath();
|
|
36
|
+
const source = readFileSync(path, "utf-8");
|
|
19
37
|
const nameMap = {
|
|
20
38
|
planning: "PLANNING_PROMPT",
|
|
21
39
|
review: "REVIEW_PROMPT",
|
|
@@ -29,7 +47,7 @@ export function extractPrompt(kind) {
|
|
|
29
47
|
const pattern = new RegExp(`const ${constName} = \`([\\s\\S]*?)\`;`);
|
|
30
48
|
const m = source.match(pattern);
|
|
31
49
|
if (!m)
|
|
32
|
-
throw new Error(`Prompt ${constName} not found in ${
|
|
50
|
+
throw new Error(`Prompt ${constName} not found in ${path}`);
|
|
33
51
|
return m[1].trim();
|
|
34
52
|
}
|
|
35
53
|
/** Build a synthetic user prompt for a given kind and scenario */
|
|
@@ -76,12 +76,15 @@ async function runSingle(job, opts) {
|
|
|
76
76
|
const baseUrl = (opts.baseUrl ?? process.env.ANTHROPIC_BASE_URL ?? "https://api.anthropic.com").replace(/\/$/, "");
|
|
77
77
|
const authToken = opts.authToken ?? process.env.ANTHROPIC_AUTH_TOKEN ?? process.env.ANTHROPIC_API_KEY ?? "";
|
|
78
78
|
const isAnthropic = /^https?:\/\/(api\.)?anthropic\.com/i.test(baseUrl);
|
|
79
|
+
const isKimi = /kimi\.com/i.test(baseUrl);
|
|
79
80
|
let body;
|
|
80
81
|
let endpoint;
|
|
81
82
|
let headers = {
|
|
82
83
|
"Content-Type": "application/json",
|
|
83
84
|
"Authorization": `Bearer ${authToken}`,
|
|
84
85
|
};
|
|
86
|
+
if (isKimi)
|
|
87
|
+
headers["User-Agent"] = "Kilo-Code/1.0";
|
|
85
88
|
if (isAnthropic) {
|
|
86
89
|
// Anthropic native format
|
|
87
90
|
endpoint = `${baseUrl}/v1/messages`;
|
|
@@ -17,7 +17,7 @@
|
|
|
17
17
|
* 4. Mutate worst-performing variants using failure traces
|
|
18
18
|
* 5. Repeat
|
|
19
19
|
*/
|
|
20
|
-
import type { BenchmarkCase,
|
|
20
|
+
import type { BenchmarkCase, EvolutionResult } from "./types.js";
|
|
21
21
|
export interface EvolveOpts {
|
|
22
22
|
/** Prompt file path, e.g. "10_planning/10-3_plan" or "mcp-browser/planning" */
|
|
23
23
|
promptPath: string;
|
|
@@ -31,6 +31,8 @@ export interface EvolveOpts {
|
|
|
31
31
|
generations?: number;
|
|
32
32
|
/** Population size cap */
|
|
33
33
|
populationCap?: number;
|
|
34
|
+
/** Stop early if no improvement for N generations (default: 3) */
|
|
35
|
+
plateauGenerations?: number;
|
|
34
36
|
/** Current canon gmean (0 if none) */
|
|
35
37
|
canonGmean?: number;
|
|
36
38
|
/** Optional logging callback */
|
|
@@ -41,10 +43,9 @@ export interface EvolveOpts {
|
|
|
41
43
|
authToken?: string;
|
|
42
44
|
/** Optional seed prompt text (for non-file prompts like MCP-browser) */
|
|
43
45
|
seedText?: string;
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
learningLog: LearningEntry[];
|
|
46
|
+
/** Target project label for persistence */
|
|
47
|
+
target?: string;
|
|
48
|
+
/** Run ID override (auto-generated if omitted) */
|
|
49
|
+
runId?: string;
|
|
49
50
|
}
|
|
50
51
|
export declare function evolvePrompt(opts: EvolveOpts): Promise<EvolutionResult>;
|
|
@@ -21,12 +21,31 @@ import { renderPrompt } from "../prompts/load.js";
|
|
|
21
21
|
import { buildMatrix, renderVariant } from "./evaluator.js";
|
|
22
22
|
import { mutate } from "./mutator.js";
|
|
23
23
|
import { curate, formatMatrix } from "./curator.js";
|
|
24
|
+
import { initRun, appendMatrix, appendLearning, snapshotPrompts, finalizeRun } from "./persistence.js";
|
|
25
|
+
import { generateReport } from "./report.js";
|
|
24
26
|
export async function evolvePrompt(opts) {
|
|
25
27
|
const log = opts.onLog ?? ((t) => process.stdout.write(t + "\n"));
|
|
26
|
-
const generations = opts.generations ??
|
|
27
|
-
const populationCap = opts.populationCap ??
|
|
28
|
+
const generations = opts.generations ?? 10;
|
|
29
|
+
const populationCap = opts.populationCap ?? 8;
|
|
30
|
+
const plateauGenerations = opts.plateauGenerations ?? 3;
|
|
28
31
|
const mutateModel = opts.mutateModel ?? opts.evalModel;
|
|
29
32
|
const canonGmean = opts.canonGmean ?? 0;
|
|
33
|
+
const target = opts.target ?? "claude-overnight";
|
|
34
|
+
const runId = opts.runId ?? `run_${Date.now().toString(36)}_${Math.random().toString(36).slice(2, 6)}`;
|
|
35
|
+
// ── 0. Initialise persistence ──
|
|
36
|
+
initRun({
|
|
37
|
+
runId,
|
|
38
|
+
promptPath: opts.promptPath,
|
|
39
|
+
target,
|
|
40
|
+
evalModel: opts.evalModel,
|
|
41
|
+
mutateModel,
|
|
42
|
+
generations,
|
|
43
|
+
populationCap,
|
|
44
|
+
startedAt: new Date().toISOString(),
|
|
45
|
+
status: "running",
|
|
46
|
+
caseNames: opts.cases.map((c) => c.name),
|
|
47
|
+
});
|
|
48
|
+
log(`Run directory: ~/.claude-overnight/prompt-evolution/${runId}/`);
|
|
30
49
|
// ── 1. Seed population from existing variants or seed text ──
|
|
31
50
|
let population = opts.seedText
|
|
32
51
|
? [{ id: "default", promptPath: opts.promptPath, generation: 0, text: opts.seedText }]
|
|
@@ -34,6 +53,8 @@ export async function evolvePrompt(opts) {
|
|
|
34
53
|
log(`Seeded ${population.length} variants from ${opts.promptPath}`);
|
|
35
54
|
const learningLog = [];
|
|
36
55
|
let bestOverall = null;
|
|
56
|
+
const generationMatrices = [];
|
|
57
|
+
let generationsWithoutImprovement = 0;
|
|
37
58
|
for (let gen = 0; gen < generations; gen++) {
|
|
38
59
|
log(`\n=== Generation ${gen + 1}/${generations} | Population: ${population.length} ===`);
|
|
39
60
|
// ── 2. Evaluate ──
|
|
@@ -47,11 +68,18 @@ export async function evolvePrompt(opts) {
|
|
|
47
68
|
},
|
|
48
69
|
};
|
|
49
70
|
const matrix = await buildMatrix(population, opts.cases, evalOpts);
|
|
71
|
+
generationMatrices.push(matrix);
|
|
72
|
+
snapshotPrompts(runId, matrix);
|
|
73
|
+
appendMatrix(runId, gen, matrix);
|
|
50
74
|
log(formatMatrix(matrix, opts.cases.map((c) => c.name)));
|
|
51
75
|
// Track best
|
|
52
76
|
const genBest = matrix.reduce((a, b) => (a.gmean > b.gmean ? a : b));
|
|
53
|
-
if (!bestOverall || genBest.gmean > bestOverall.gmean) {
|
|
77
|
+
if (!bestOverall || genBest.gmean > bestOverall.gmean + 0.001) {
|
|
54
78
|
bestOverall = genBest;
|
|
79
|
+
generationsWithoutImprovement = 0;
|
|
80
|
+
}
|
|
81
|
+
else {
|
|
82
|
+
generationsWithoutImprovement++;
|
|
55
83
|
}
|
|
56
84
|
// ── 3. Curate ──
|
|
57
85
|
const curateOpts = {
|
|
@@ -71,6 +99,12 @@ export async function evolvePrompt(opts) {
|
|
|
71
99
|
}));
|
|
72
100
|
// ── 5. Mutate to refill ──
|
|
73
101
|
const targetSize = Math.min(populationCap, keptRows.length + 2);
|
|
102
|
+
const newEntries = [];
|
|
103
|
+
// Early stopping check
|
|
104
|
+
if (generationsWithoutImprovement >= plateauGenerations && gen >= 2) {
|
|
105
|
+
log(`\n=== Early stop: no improvement for ${generationsWithoutImprovement} generations ===`);
|
|
106
|
+
break;
|
|
107
|
+
}
|
|
74
108
|
if (nextPop.length < targetSize && gen < generations - 1) {
|
|
75
109
|
const mutantsNeeded = targetSize - nextPop.length;
|
|
76
110
|
log(`Generating ${mutantsNeeded} mutant(s)...`);
|
|
@@ -115,12 +149,14 @@ export async function evolvePrompt(opts) {
|
|
|
115
149
|
generation: mutant.generation,
|
|
116
150
|
text: mutant.text,
|
|
117
151
|
});
|
|
118
|
-
|
|
152
|
+
const entry = {
|
|
119
153
|
generation: gen,
|
|
120
154
|
mutationSummary: mutant.mutationSummary,
|
|
121
155
|
fitnessDelta: 0, // filled next gen
|
|
122
156
|
status: "neutral",
|
|
123
|
-
}
|
|
157
|
+
};
|
|
158
|
+
learningLog.push(entry);
|
|
159
|
+
newEntries.push(entry);
|
|
124
160
|
log(` Mutant ${mutant.variantId} ← ${parent.variantId}: ${mutant.mutationSummary}`);
|
|
125
161
|
}
|
|
126
162
|
catch (err) {
|
|
@@ -129,6 +165,8 @@ export async function evolvePrompt(opts) {
|
|
|
129
165
|
}
|
|
130
166
|
}
|
|
131
167
|
}
|
|
168
|
+
if (newEntries.length)
|
|
169
|
+
appendLearning(runId, newEntries);
|
|
132
170
|
population = nextPop;
|
|
133
171
|
}
|
|
134
172
|
// Final evaluation of surviving population
|
|
@@ -139,13 +177,36 @@ export async function evolvePrompt(opts) {
|
|
|
139
177
|
authToken: opts.authToken,
|
|
140
178
|
concurrency: 4,
|
|
141
179
|
});
|
|
180
|
+
generationMatrices.push(finalMatrix);
|
|
181
|
+
snapshotPrompts(runId, finalMatrix);
|
|
182
|
+
appendMatrix(runId, generations, finalMatrix);
|
|
142
183
|
log(formatMatrix(finalMatrix, opts.cases.map((c) => c.name)));
|
|
143
184
|
const best = finalMatrix.reduce((a, b) => (a.gmean > b.gmean ? a : b));
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
185
|
+
const historicalBest = bestOverall && bestOverall.gmean > best.gmean ? bestOverall : best;
|
|
186
|
+
const result = {
|
|
187
|
+
bestVariant: historicalBest,
|
|
188
|
+
allRows: finalMatrix,
|
|
189
|
+
learningLog,
|
|
190
|
+
runId,
|
|
191
|
+
};
|
|
192
|
+
// Generate and save report
|
|
193
|
+
const baselineText = generationMatrices[0]?.find((r) => r.variantId === "default")?.text;
|
|
194
|
+
const report = generateReport({
|
|
195
|
+
runId,
|
|
196
|
+
promptPath: opts.promptPath,
|
|
197
|
+
target,
|
|
198
|
+
evalModel: opts.evalModel,
|
|
199
|
+
generations,
|
|
200
|
+
baselineText,
|
|
201
|
+
}, result, generationMatrices);
|
|
202
|
+
const { writeFileSync } = await import("node:fs");
|
|
203
|
+
const { runDir } = await import("./persistence.js");
|
|
204
|
+
const reportPath = `${runDir(runId)}/report.md`;
|
|
205
|
+
writeFileSync(reportPath, report);
|
|
206
|
+
result.reportPath = reportPath;
|
|
207
|
+
finalizeRun(runId, result);
|
|
208
|
+
log(`\nReport saved: ${reportPath}`);
|
|
209
|
+
return result;
|
|
149
210
|
}
|
|
150
211
|
// ── Helpers ──
|
|
151
212
|
function seedPopulation(promptPath) {
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM-as-judge scoring for prompt evolution.
|
|
3
|
+
*
|
|
4
|
+
* Inspired by Hermes Agent's autoresearch skill and self-evolution repo:
|
|
5
|
+
* - Structured rubric (5 criteria × 1-5 scale, normalised to 0-1)
|
|
6
|
+
* - The judge sees the prompt, the case, and the model's raw output
|
|
7
|
+
* - Returns both a scalar score and human-readable justification
|
|
8
|
+
*
|
|
9
|
+
* When to use:
|
|
10
|
+
* - Content criteria that are too fuzzy for deterministic regex
|
|
11
|
+
* (e.g. "was the plan creative?", "did the response follow the spirit of the prompt?")
|
|
12
|
+
* - Final validation gate before promoting a variant to canon
|
|
13
|
+
*
|
|
14
|
+
* Cost: ~1 judge call per case per generation (~$0.002-0.01 each).
|
|
15
|
+
*/
|
|
16
|
+
import type { BenchmarkCase, ScoreDimensions } from "./types.js";
|
|
17
|
+
/** Connection and model settings for a single `judgeOutput` call. */
export interface JudgeOpts {
|
|
18
|
+
/** Model id sent as `model` in the judge request body. */
model: string;
|
|
19
|
+
/** API base URL; the implementation falls back to `ANTHROPIC_BASE_URL`, then `https://api.anthropic.com`. */
baseUrl?: string;
|
|
20
|
+
/** Bearer token; the implementation falls back to `ANTHROPIC_AUTH_TOKEN`, then `ANTHROPIC_API_KEY`. */
authToken?: string;
|
|
21
|
+
/** `max_tokens` for the judge reply (implementation default: 2048). */
maxTokens?: number;
|
|
22
|
+
/** Request timeout in milliseconds (implementation default: 60000). */
timeoutMs?: number;
|
|
23
|
+
}
|
|
24
|
+
/** Verdict resolved by `judgeOutput`: a scalar score, per-dimension scores and the judge's justification. */
export interface JudgeResult {
|
|
25
|
+
/** 0-1 overall quality score */
|
|
26
|
+
score: number;
|
|
27
|
+
/** Dimension breakdown matching ScoreDimensions keys */
|
|
28
|
+
dimensions: Partial<ScoreDimensions>;
|
|
29
|
+
/** Human-readable rubric justification */
|
|
30
|
+
justification: string;
|
|
31
|
+
}
|
|
32
|
+
/**
|
|
33
|
+
* Score a single (case, output) pair with an LLM judge.
|
|
34
|
+
*
|
|
35
|
+
* The judge prompt is carefully structured to be reproducible:
|
|
36
|
+
* - Exact rubric with 1-5 Likert scale definitions
|
|
37
|
+
* - One-shot example in the prompt text
|
|
38
|
+
* - Forced JSON output schema
|
|
39
|
+
*
* @param rawOutput Model output to be graded.
* @param c Benchmark case the output is graded against.
* @param opts Judge model and connection settings.
* @returns Promise resolving to the judge's verdict.
*/
|
|
40
|
+
export declare function judgeOutput(rawOutput: string, c: BenchmarkCase, opts: JudgeOpts): Promise<JudgeResult>;
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM-as-judge scoring for prompt evolution.
|
|
3
|
+
*
|
|
4
|
+
* Inspired by Hermes Agent's autoresearch skill and self-evolution repo:
|
|
5
|
+
* - Structured rubric (5 criteria × 1-5 scale, normalised to 0-1)
|
|
6
|
+
* - The judge sees the prompt, the case, and the model's raw output
|
|
7
|
+
* - Returns both a scalar score and human-readable justification
|
|
8
|
+
*
|
|
9
|
+
* When to use:
|
|
10
|
+
* - Content criteria that are too fuzzy for deterministic regex
|
|
11
|
+
* (e.g. "was the plan creative?", "did the response follow the spirit of the prompt?")
|
|
12
|
+
* - Final validation gate before promoting a variant to canon
|
|
13
|
+
*
|
|
14
|
+
* Cost: ~1 judge call per case per generation (~$0.002-0.01 each).
|
|
15
|
+
*/
|
|
16
|
+
/**
 * The five rubric criteria put to the judge, in prompt order. Each entry is
 * `{ name, question }`; `name` is the JSON key the judge is asked to score.
 */
const DEFAULT_RUBRIC = [
    ["parse", "Is the output well-formed and parseable (valid JSON if expected, clear structure otherwise)?"],
    ["schema", "Does the output contain all required fields / follow the expected schema?"],
    ["content", "Is the content accurate, relevant, and satisfying the user's intent?"],
    ["concision", "Is the response concise without omitting necessary detail?"],
    ["instruction", "Does the output follow the explicit and implicit instructions in the system prompt?"],
].map(([name, question]) => ({ name, question }));
|
|
23
|
+
/**
 * Grade one model output against one benchmark case by asking an LLM judge.
 *
 * Builds the rubric prompt, POSTs it to the configured provider, pulls the
 * reply text out of the response and hands it to `parseJudgeOutput`.
 * Hosts matching `anthropic.com` use the native `/v1/messages` endpoint and
 * response shape; every other base URL uses `/v1/chat/completions`.
 *
 * @param {string} rawOutput Model output to grade.
 * @param {object} c Benchmark case (name, promptPath, criteria are read).
 * @param {object} opts Judge settings: model, baseUrl, authToken, maxTokens, timeoutMs.
 * @returns {Promise<object>} `{ score, dimensions, justification }`.
 * @throws {Error} `Judge HTTP <status>: …` when the provider replies non-2xx.
 */
export async function judgeOutput(rawOutput, c, opts) {
    const baseUrl = (opts.baseUrl ?? process.env.ANTHROPIC_BASE_URL ?? "https://api.anthropic.com").replace(/\/$/, "");
    const authToken = opts.authToken ?? process.env.ANTHROPIC_AUTH_TOKEN ?? process.env.ANTHROPIC_API_KEY ?? "";
    const isAnthropic = /^https?:\/\/(api\.)?anthropic\.com/i.test(baseUrl);
    const isKimi = /kimi\.com/i.test(baseUrl);
    const body = JSON.stringify({
        model: opts.model,
        max_tokens: opts.maxTokens ?? 2048,
        messages: [{ role: "user", content: buildJudgePrompt(rawOutput, c) }],
    });
    const headers = {
        "Content-Type": "application/json",
        "Authorization": `Bearer ${authToken}`,
    };
    if (isAnthropic) {
        headers["anthropic-version"] = "2023-06-01";
    }
    if (isKimi) {
        headers["User-Agent"] = "Kilo-Code/1.0";
    }
    const endpoint = `${baseUrl}${isAnthropic ? "/v1/messages" : "/v1/chat/completions"}`;
    const response = await fetch(endpoint, {
        method: "POST",
        headers,
        body,
        signal: AbortSignal.timeout(opts.timeoutMs ?? 60_000),
    });
    if (!response.ok) {
        // Body is best-effort: an unreadable error body must not mask the status.
        const detail = await response.text().catch(() => "");
        throw new Error(`Judge HTTP ${response.status}: ${detail.slice(0, 200)}`);
    }
    const payload = await response.json();
    const replyText = isAnthropic
        ? payload.content?.map((part) => part.text ?? "").join("") ?? ""
        : payload.choices?.[0]?.message?.content ?? "";
    return parseJudgeOutput(replyText);
}
|
|
72
|
+
/**
 * Assemble the judge prompt: case header, the case's expected criteria, the
 * model output (first 3000 characters only), the 1-5 rubric and the required
 * JSON reply shape.
 *
 * @param {string} rawOutput Model output to embed in the prompt.
 * @param {object} c Benchmark case; `name`, `promptPath` and `criteria` are read.
 * @returns {string} The full prompt text.
 */
function buildJudgePrompt(rawOutput, c) {
    const numberedRubric = [];
    DEFAULT_RUBRIC.forEach((criterion, idx) => {
        numberedRubric.push(`${idx + 1}. **${criterion.name}**: ${criterion.question}`);
    });
    const rubricLines = numberedRubric.join("\n");
    const criteriaLines = Object.entries(c.criteria)
        .map(([key, expected]) => ` - ${key}: ${expected}`)
        .join("\n");
    const outputExcerpt = rawOutput.slice(0, 3000);
    return `You are an expert prompt-evaluation judge. Your task is to score a language-model output against a benchmark case using a strict rubric.

## Benchmark Case

- **Name**: ${c.name}
- **Prompt path**: ${c.promptPath}
- **Expected criteria**:
${criteriaLines}

## Model Output

\`\`\`
${outputExcerpt}
\`\`\`

## Rubric

Score each criterion on a 1-5 Likert scale:
- 5 = Excellent / exceeds expectations
- 4 = Good / meets expectations
- 3 = Acceptable / minor issues
- 2 = Poor / significant issues
- 1 = Unacceptable / fails completely

${rubricLines}

## Response Format

Respond ONLY with a JSON object in this exact shape (no markdown fences, no extra text):

{"parse":5,"schema":5,"content":4,"concision":4,"instruction":5,"justification":"Brief justification here."}
`;
}
|
|
109
|
+
/**
 * Turn the judge model's raw reply into `{ score, dimensions, justification }`.
 *
 * The reply is expected to be a JSON object with 1-5 ratings under `parse`,
 * `schema`, `content`, `concision` and `instruction`. Surrounding markdown
 * fences are stripped; if the whole reply is not valid JSON, the first
 * `{ … }` span is tried. Ratings are clamped to 1-5 and normalised to 0-1;
 * `score` is their arithmetic mean.
 *
 * Anything that does not yield a JSON object (unparseable text, `null`,
 * a bare number/string, an array) produces the neutral fallback
 * (`score: 0.5`, empty `dimensions`) instead of throwing.
 *
 * @param {string} raw Reply text from the judge model.
 * @returns {{score: number, dimensions: object, justification: string}}
 */
function parseJudgeOutput(raw) {
    const neutralFallback = () => ({
        score: 0.5,
        dimensions: {},
        justification: "Judge returned unparseable JSON. Falling back to neutral.",
    });
    // Strip fences
    const cleaned = raw
        .replace(/^```(?:json)?\s*\n?/i, "")
        .replace(/\n?```\s*$/i, "")
        .trim();
    let obj;
    try {
        obj = JSON.parse(cleaned);
    }
    catch {
        // Try to extract first JSON object
        const m = cleaned.match(/\{[\s\S]*\}/);
        if (!m) {
            return neutralFallback();
        }
        try {
            obj = JSON.parse(m[0]);
        }
        catch {
            return neutralFallback();
        }
    }
    // JSON.parse also accepts `null`, primitives and arrays; none of them is a
    // verdict, and `null` would make the property reads below throw.
    if (obj === null || typeof obj !== "object" || Array.isArray(obj)) {
        return neutralFallback();
    }
    const getNum = (k) => {
        const v = obj[k];
        // Judges sometimes quote their ratings ("5"); accept numeric strings.
        const n = typeof v === "string" && v.trim() !== "" ? Number(v) : v;
        if (typeof n === "number" && !Number.isNaN(n))
            return Math.max(1, Math.min(5, n));
        return 3; // neutral default
    };
    const parse = (getNum("parse") - 1) / 4;
    const schema = (getNum("schema") - 1) / 4;
    const content = (getNum("content") - 1) / 4;
    const concision = (getNum("concision") - 1) / 4;
    const instruction = (getNum("instruction") - 1) / 4;
    // Map concision → costEfficiency, instruction → speed (proxy for "follows instructions quickly")
    const dimensions = {
        parse,
        schema,
        content,
        costEfficiency: concision,
        speed: instruction,
    };
    const vals = [parse, schema, content, concision, instruction];
    const score = vals.reduce((a, b) => a + b, 0) / vals.length;
    return {
        score,
        dimensions,
        justification: typeof obj.justification === "string" ? obj.justification : "(no justification)",
    };
}
|