kairn-cli 2.2.9 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +248 -82
- package/dist/cli.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
// src/cli.ts
|
|
2
2
|
import { Command as Command12 } from "commander";
|
|
3
3
|
import chalk15 from "chalk";
|
|
4
|
+
import { createRequire } from "module";
|
|
4
5
|
|
|
5
6
|
// src/commands/init.ts
|
|
6
7
|
import { Command } from "commander";
|
|
7
|
-
import { input, password, select } from "@inquirer/prompts";
|
|
8
|
+
import { confirm, input, password, select } from "@inquirer/prompts";
|
|
8
9
|
import chalk3 from "chalk";
|
|
9
10
|
import Anthropic from "@anthropic-ai/sdk";
|
|
10
11
|
import OpenAI from "openai";
|
|
@@ -62,6 +63,59 @@ async function saveConfig(config) {
|
|
|
62
63
|
await fs.writeFile(CONFIG_PATH, JSON.stringify(config, null, 2), "utf-8");
|
|
63
64
|
}
|
|
64
65
|
|
|
66
|
+
// src/auth/keychain.ts
|
|
67
|
+
import { exec } from "child_process";
|
|
68
|
+
import { promisify } from "util";
|
|
69
|
+
// Promise-returning wrapper around child_process.exec; used below to invoke the macOS `security` CLI.
var execAsync = promisify(exec);
// Service name passed to `security find-generic-password -s` when looking up the stored credentials.
var KEYCHAIN_SERVICE = "Claude Code-credentials";
// Safety margin (60 000 ms) added to "now" in isTokenExpired, so a token about to lapse already counts as expired.
var TOKEN_EXPIRY_BUFFER_MS = 6e4;
|
|
72
|
+
/**
 * Parse the raw JSON payload read from the keychain into a credentials record.
 *
 * The payload must be a JSON object with a `claudeAiOauth` object member holding
 * `accessToken` (non-empty string), `refreshToken` (string) and `expiresAt`
 * (finite number). `subscriptionType` is optional and falls back to "unknown"
 * when it is not a string.
 *
 * @param {string} raw - JSON text to parse.
 * @returns {{accessToken: string, refreshToken: string, expiresAt: number, subscriptionType: string} | null}
 *   The validated credentials, or null when the text is not valid JSON or fails validation.
 */
function parseKeychainCredentials(raw) {
  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch {
    // A malformed payload is an expected outcome here, not an error worth surfacing.
    return null;
  }
  if (typeof parsed !== "object" || parsed === null) return null;

  const oauth = parsed["claudeAiOauth"];
  if (typeof oauth !== "object" || oauth === null) return null;

  const { accessToken, refreshToken, expiresAt, subscriptionType } = oauth;
  if (typeof accessToken !== "string" || !accessToken) return null;
  if (typeof refreshToken !== "string") return null;
  // Number.isFinite rejects non-numbers as well as NaN/Infinity. JSON such as
  // `1e999` parses to Infinity, which would otherwise look like a token that never expires.
  if (!Number.isFinite(expiresAt)) return null;

  return {
    accessToken,
    refreshToken,
    expiresAt,
    subscriptionType: typeof subscriptionType === "string" ? subscriptionType : "unknown"
  };
}
|
|
98
|
+
/**
 * Report whether the credentials have expired or will expire within
 * TOKEN_EXPIRY_BUFFER_MS of the current time.
 *
 * @param {{expiresAt: number}} credentials - `expiresAt` is compared against Date.now() (epoch milliseconds).
 * @returns {boolean} true when the token should no longer be used.
 */
function isTokenExpired(credentials) {
  const { expiresAt } = credentials;
  // Fail closed: `x >= NaN` and `x >= undefined` are always false, so a corrupt
  // timestamp would otherwise make the token look permanently valid.
  if (!Number.isFinite(expiresAt)) return true;
  return Date.now() + TOKEN_EXPIRY_BUFFER_MS >= expiresAt;
}
|
|
101
|
+
/**
 * Read the Claude Code OAuth credentials from the macOS keychain via the
 * `security find-generic-password` CLI.
 *
 * Best-effort by design: resolves to null on any platform other than macOS,
 * when the lookup fails or exceeds the 5 s timeout, or when the stored payload
 * does not pass parseKeychainCredentials.
 *
 * @param {string} [account] - Optional keychain account (`-a`). When omitted or
 *   empty, the lookup is by service name only.
 * @returns {Promise<object|null>} The parsed credentials, or null.
 */
async function readClaudeCodeCredentials(account) {
  if (process.platform !== "darwin") return null;

  // exec() runs the command through a shell. Wrapping each value in single
  // quotes (and escaping embedded single quotes) keeps a caller-supplied
  // account name from being interpreted as shell syntax.
  const shellQuote = (value) => `'${String(value).replaceAll("'", "'\\''")}'`;

  try {
    const args = ["find-generic-password", "-s", shellQuote(KEYCHAIN_SERVICE)];
    if (account) args.push("-a", shellQuote(account));
    args.push("-w"); // -w: print only the stored secret
    const { stdout } = await execAsync(`security ${args.join(" ")}`, { timeout: 5e3 });
    return parseKeychainCredentials(stdout.trim());
  } catch {
    // Missing entry, denied keychain access, or timeout: treated as "no credentials".
    return null;
  }
}
|
|
112
|
+
/**
 * Resolve a usable Claude Code OAuth access token.
 *
 * @param {string} [account] - Optional keychain account forwarded to readClaudeCodeCredentials.
 * @returns {Promise<string|null>} The access token, or null when no credentials
 *   could be read or isTokenExpired reports them as expired.
 */
async function getAccessToken(account) {
  const creds = await readClaudeCodeCredentials(account);
  if (!creds || isTokenExpired(creds)) return null;
  return creds.accessToken;
}
|
|
118
|
+
|
|
65
119
|
// src/providers.ts
|
|
66
120
|
var PROVIDER_CONFIGS = {
|
|
67
121
|
anthropic: {
|
|
@@ -508,30 +562,49 @@ var initCommand = new Command("init").description("Set up Kairn with your API ke
|
|
|
508
562
|
choices: PROVIDER_MODELS[provider]
|
|
509
563
|
});
|
|
510
564
|
}
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
565
|
+
let apiKey = "";
|
|
566
|
+
let authType = "api-key";
|
|
567
|
+
if (provider === "anthropic") {
|
|
568
|
+
const oauthToken = await getAccessToken();
|
|
569
|
+
if (oauthToken) {
|
|
570
|
+
const useOAuth = await confirm({
|
|
571
|
+
message: "Claude Code subscription detected. Use it instead of an API key? (experimental \u2014 may break)",
|
|
572
|
+
default: true
|
|
573
|
+
});
|
|
574
|
+
if (useOAuth) {
|
|
575
|
+
authType = "claude-code-oauth";
|
|
576
|
+
console.log(ui.warn("Using Claude Code OAuth token. This is undocumented and may break at any time."));
|
|
577
|
+
console.log(ui.success("OAuth token validated"));
|
|
578
|
+
}
|
|
579
|
+
}
|
|
518
580
|
}
|
|
519
|
-
if (
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
581
|
+
if (authType === "api-key") {
|
|
582
|
+
apiKey = await password({
|
|
583
|
+
message: `${providerDisplayName} API key${provider === "other" ? " (Enter to skip)" : ""}`,
|
|
584
|
+
mask: "*"
|
|
585
|
+
});
|
|
586
|
+
if (!apiKey && provider !== "other") {
|
|
587
|
+
console.log(ui.error("No API key provided. Aborting."));
|
|
524
588
|
process.exit(1);
|
|
525
589
|
}
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
590
|
+
if (apiKey) {
|
|
591
|
+
console.log(chalk3.dim("\n Verifying API key..."));
|
|
592
|
+
const valid = await verifyKey(provider, apiKey, baseURL, model);
|
|
593
|
+
if (!valid) {
|
|
594
|
+
console.log(ui.error("Invalid API key. Check your key and try again."));
|
|
595
|
+
process.exit(1);
|
|
596
|
+
}
|
|
597
|
+
console.log(ui.success("API key verified"));
|
|
598
|
+
} else {
|
|
599
|
+
console.log(ui.warn("No API key \u2014 skipping verification"));
|
|
600
|
+
}
|
|
529
601
|
}
|
|
530
602
|
const config = {
|
|
531
603
|
provider,
|
|
532
|
-
api_key: apiKey
|
|
604
|
+
api_key: apiKey,
|
|
533
605
|
model,
|
|
534
606
|
...baseURL ? { base_url: baseURL } : {},
|
|
607
|
+
...authType !== "api-key" ? { auth_type: authType } : {},
|
|
535
608
|
default_runtime: "claude-code",
|
|
536
609
|
created_at: (/* @__PURE__ */ new Date()).toISOString()
|
|
537
610
|
};
|
|
@@ -555,7 +628,7 @@ var initCommand = new Command("init").description("Set up Kairn with your API ke
|
|
|
555
628
|
|
|
556
629
|
// src/commands/describe.ts
|
|
557
630
|
import { Command as Command2 } from "commander";
|
|
558
|
-
import { input as input2, confirm, select as select2 } from "@inquirer/prompts";
|
|
631
|
+
import { input as input2, confirm as confirm2, select as select2 } from "@inquirer/prompts";
|
|
559
632
|
import chalk5 from "chalk";
|
|
560
633
|
|
|
561
634
|
// src/compiler/compile.ts
|
|
@@ -1240,8 +1313,18 @@ async function callLLM(config, userMessage, options) {
|
|
|
1240
1313
|
const { systemPrompt } = options;
|
|
1241
1314
|
const jsonMode = options.jsonMode ?? false;
|
|
1242
1315
|
const providerName = getProviderName(config.provider);
|
|
1316
|
+
let apiKey = config.api_key;
|
|
1317
|
+
if (config.auth_type === "claude-code-oauth") {
|
|
1318
|
+
const oauthToken = await getAccessToken();
|
|
1319
|
+
if (!oauthToken) {
|
|
1320
|
+
throw new Error(
|
|
1321
|
+
"Claude Code OAuth token unavailable or expired. Run `kairn init` to reconfigure, or launch Claude Code to refresh the token."
|
|
1322
|
+
);
|
|
1323
|
+
}
|
|
1324
|
+
apiKey = oauthToken;
|
|
1325
|
+
}
|
|
1243
1326
|
if (config.provider === "anthropic") {
|
|
1244
|
-
const client2 = new Anthropic2({ apiKey
|
|
1327
|
+
const client2 = new Anthropic2({ apiKey });
|
|
1245
1328
|
const messages = [
|
|
1246
1329
|
{ role: "user", content: userMessage }
|
|
1247
1330
|
];
|
|
@@ -1262,7 +1345,7 @@ async function callLLM(config, userMessage, options) {
|
|
|
1262
1345
|
}
|
|
1263
1346
|
}
|
|
1264
1347
|
const resolvedBaseURL = getBaseURL(config.provider, config.base_url);
|
|
1265
|
-
const clientOptions = { apiKey
|
|
1348
|
+
const clientOptions = { apiKey };
|
|
1266
1349
|
if (resolvedBaseURL) clientOptions.baseURL = resolvedBaseURL;
|
|
1267
1350
|
const client = new OpenAI2(clientOptions);
|
|
1268
1351
|
try {
|
|
@@ -2441,7 +2524,7 @@ Autonomy level: ${autonomyLevel} (${autonomyLabel(autonomyLevel)})`;
|
|
|
2441
2524
|
console.log("");
|
|
2442
2525
|
}
|
|
2443
2526
|
}
|
|
2444
|
-
const proceed = options.yes || await
|
|
2527
|
+
const proceed = options.yes || await confirm2({
|
|
2445
2528
|
message: "Generate environment in current directory?",
|
|
2446
2529
|
default: true
|
|
2447
2530
|
});
|
|
@@ -2656,7 +2739,7 @@ var updateRegistryCommand = new Command5("update-registry").description("Fetch t
|
|
|
2656
2739
|
|
|
2657
2740
|
// src/commands/optimize.ts
|
|
2658
2741
|
import { Command as Command6 } from "commander";
|
|
2659
|
-
import { confirm as
|
|
2742
|
+
import { confirm as confirm3 } from "@inquirer/prompts";
|
|
2660
2743
|
import chalk9 from "chalk";
|
|
2661
2744
|
import ora from "ora";
|
|
2662
2745
|
import fs12 from "fs/promises";
|
|
@@ -2718,7 +2801,7 @@ function detectFramework(deps) {
|
|
|
2718
2801
|
];
|
|
2719
2802
|
const detected = [];
|
|
2720
2803
|
for (const [packages, name] of frameworks) {
|
|
2721
|
-
if (packages.some((
|
|
2804
|
+
if (packages.some((pkg2) => deps.includes(pkg2))) {
|
|
2722
2805
|
detected.push(name);
|
|
2723
2806
|
}
|
|
2724
2807
|
}
|
|
@@ -2742,11 +2825,11 @@ function extractEnvKeys(content) {
|
|
|
2742
2825
|
return keys;
|
|
2743
2826
|
}
|
|
2744
2827
|
async function scanProject(dir) {
|
|
2745
|
-
const
|
|
2746
|
-
const deps =
|
|
2747
|
-
const devDeps =
|
|
2828
|
+
const pkg2 = await readJsonSafe(path11.join(dir, "package.json"));
|
|
2829
|
+
const deps = pkg2?.dependencies ? Object.keys(pkg2.dependencies) : [];
|
|
2830
|
+
const devDeps = pkg2?.devDependencies ? Object.keys(pkg2.devDependencies) : [];
|
|
2748
2831
|
const allDeps = [...deps, ...devDeps];
|
|
2749
|
-
const scripts =
|
|
2832
|
+
const scripts = pkg2?.scripts || {};
|
|
2750
2833
|
const rootFiles = await listDirSafe(dir);
|
|
2751
2834
|
const keyFiles = rootFiles.filter(
|
|
2752
2835
|
(f) => [
|
|
@@ -2808,8 +2891,8 @@ async function scanProject(dir) {
|
|
|
2808
2891
|
existingSkills = await listDirSafe(path11.join(claudeDir, "skills"));
|
|
2809
2892
|
existingAgents = (await listDirSafe(path11.join(claudeDir, "agents"))).filter((f) => f.endsWith(".md")).map((f) => f.replace(".md", ""));
|
|
2810
2893
|
}
|
|
2811
|
-
const name =
|
|
2812
|
-
const description =
|
|
2894
|
+
const name = pkg2?.name || path11.basename(dir);
|
|
2895
|
+
const description = pkg2?.description || "";
|
|
2813
2896
|
return {
|
|
2814
2897
|
name,
|
|
2815
2898
|
description,
|
|
@@ -3028,7 +3111,7 @@ var optimizeCommand = new Command6("optimize").description("Scan an existing pro
|
|
|
3028
3111
|
}
|
|
3029
3112
|
if (!options.yes) {
|
|
3030
3113
|
console.log("");
|
|
3031
|
-
const proceed = await
|
|
3114
|
+
const proceed = await confirm3({
|
|
3032
3115
|
message: "Generate optimized environment? This will overwrite existing .claude/ files.",
|
|
3033
3116
|
default: false
|
|
3034
3117
|
});
|
|
@@ -3040,7 +3123,7 @@ var optimizeCommand = new Command6("optimize").description("Scan an existing pro
|
|
|
3040
3123
|
} else {
|
|
3041
3124
|
console.log(chalk9.dim("\n No existing .claude/ directory found \u2014 generating from scratch.\n"));
|
|
3042
3125
|
if (!options.yes) {
|
|
3043
|
-
const proceed = await
|
|
3126
|
+
const proceed = await confirm3({
|
|
3044
3127
|
message: "Generate Claude Code environment for this project?",
|
|
3045
3128
|
default: true
|
|
3046
3129
|
});
|
|
@@ -3103,7 +3186,7 @@ var optimizeCommand = new Command6("optimize").description("Scan an existing pro
|
|
|
3103
3186
|
}
|
|
3104
3187
|
}
|
|
3105
3188
|
console.log("");
|
|
3106
|
-
const apply = await
|
|
3189
|
+
const apply = await confirm3({
|
|
3107
3190
|
message: "Apply these changes?",
|
|
3108
3191
|
default: true
|
|
3109
3192
|
});
|
|
@@ -3702,7 +3785,7 @@ import ora2 from "ora";
|
|
|
3702
3785
|
import fs24 from "fs/promises";
|
|
3703
3786
|
import path24 from "path";
|
|
3704
3787
|
import { parse as yamlParse2 } from "yaml";
|
|
3705
|
-
import { confirm as
|
|
3788
|
+
import { confirm as confirm4, select as select4 } from "@inquirer/prompts";
|
|
3706
3789
|
|
|
3707
3790
|
// src/evolve/init.ts
|
|
3708
3791
|
import fs15 from "fs/promises";
|
|
@@ -3945,14 +4028,14 @@ async function buildProjectProfile(projectRoot) {
|
|
|
3945
4028
|
path15.join(projectRoot, "package.json"),
|
|
3946
4029
|
"utf-8"
|
|
3947
4030
|
);
|
|
3948
|
-
const
|
|
4031
|
+
const pkg2 = JSON.parse(pkgStr);
|
|
3949
4032
|
profile.language = "typescript";
|
|
3950
|
-
if (
|
|
3951
|
-
profile.scripts =
|
|
4033
|
+
if (pkg2.scripts && typeof pkg2.scripts === "object") {
|
|
4034
|
+
profile.scripts = pkg2.scripts;
|
|
3952
4035
|
}
|
|
3953
4036
|
const deps = {
|
|
3954
|
-
...
|
|
3955
|
-
...
|
|
4037
|
+
...pkg2.dependencies ?? {},
|
|
4038
|
+
...pkg2.devDependencies ?? {}
|
|
3956
4039
|
};
|
|
3957
4040
|
if (deps.next) {
|
|
3958
4041
|
profile.framework = "Next.js";
|
|
@@ -4051,8 +4134,8 @@ async function copyDir(src, dest) {
|
|
|
4051
4134
|
}
|
|
4052
4135
|
|
|
4053
4136
|
// src/evolve/runner.ts
|
|
4054
|
-
import { exec as
|
|
4055
|
-
import { promisify as
|
|
4137
|
+
import { exec as exec3, spawn } from "child_process";
|
|
4138
|
+
import { promisify as promisify3 } from "util";
|
|
4056
4139
|
import fs18 from "fs/promises";
|
|
4057
4140
|
import os3 from "os";
|
|
4058
4141
|
import path18 from "path";
|
|
@@ -4177,11 +4260,11 @@ async function loadIterationLog(workspacePath, iteration) {
|
|
|
4177
4260
|
}
|
|
4178
4261
|
|
|
4179
4262
|
// src/evolve/exec.ts
|
|
4180
|
-
import { exec } from "child_process";
|
|
4181
|
-
import { promisify } from "util";
|
|
4182
|
-
var
|
|
4263
|
+
import { exec as exec2 } from "child_process";
|
|
4264
|
+
import { promisify as promisify2 } from "util";
|
|
4265
|
+
var execAsync2 = promisify2(exec2);
|
|
4183
4266
|
/**
 * Run a shell command through the promisified child_process.exec.
 *
 * @param {string} cmd - Command line to execute.
 * @param {string} cwd - Working directory for the child process.
 * @param {number} [timeoutMs=30000] - Timeout passed to exec, in milliseconds.
 * @returns {Promise<object>} Whatever the promisified exec resolves with; rejects if exec reports an error.
 */
async function execCommand(cmd, cwd, timeoutMs = 3e4) {
  return execAsync2(cmd, { cwd, timeout: timeoutMs });
}
|
|
4186
4269
|
|
|
4187
4270
|
// src/evolve/scorers.ts
|
|
@@ -4341,21 +4424,47 @@ async function rubricScorer(task, workspacePath, stdout, stderr, config) {
|
|
|
4341
4424
|
breakdown
|
|
4342
4425
|
};
|
|
4343
4426
|
}
|
|
4427
|
+
/**
 * Annotate a failing score with a heuristic failure category and reason,
 * derived from keywords in the run's output and from the numeric score.
 *
 * Categories are checked in priority order: "task", "model", "repo", then
 * "harness"; anything else stays "unknown" with an empty reason.
 *
 * @param {{pass: boolean, score?: number}} score - Scorer result. Returned unchanged when `pass` is truthy.
 * @param {string} stdout - Captured standard output (null/undefined is treated as empty).
 * @param {string} stderr - Captured standard error (null/undefined is treated as empty).
 * @returns {object} `score` itself when passing; otherwise a copy with `failureCategory` and `failureReason` added.
 */
function classifyFailure(score, stdout, stderr) {
  if (score.pass) return score;

  // Coerce defensively so a missing stream cannot throw on `.includes`.
  const out = String(stdout ?? "");
  const err = String(stderr ?? "");
  const combined = `${out}\n${err}`.toLowerCase();
  const mentions = (...needles) => needles.some((needle) => combined.includes(needle));
  // Match 429 only as a standalone number, so digits inside larger numbers
  // (durations, byte counts, ports) are not mistaken for an HTTP status.
  const hasHttp429 = /(?<!\d)429(?!\d)/.test(combined);
  const scoreValue = score.score ?? 0;

  let failureCategory = "unknown";
  let failureReason = "";
  if (
    (err.includes("[setup]") && err.includes("Error")) ||
    mentions("command not found", "no such file or directory")
  ) {
    failureCategory = "task";
    failureReason = "Task setup failed or references missing resources";
  } else if (
    mentions("token limit", "context length", "rate limit", "api error", "overloaded") ||
    hasHttp429
  ) {
    failureCategory = "model";
    failureReason = "Model API error, token limit, or rate limit";
  } else if (
    (mentions("build failed") && mentions("before")) ||
    mentions("merge conflict", "git dirty", "uncommitted changes")
  ) {
    failureCategory = "repo";
    failureReason = "Pre-existing repo issues (build failure, dirty state)";
  } else if (scoreValue >= 20 && scoreValue < 80) {
    failureCategory = "harness";
    failureReason = "Agent attempted the task but did not follow harness conventions";
  }
  return { ...score, failureCategory, failureReason };
}
|
|
4344
4449
|
/**
 * Score a task run with the scorer selected by `task.scoring`, then annotate
 * failing results via classifyFailure.
 *
 * @param {object} task - Task definition; `task.scoring` selects the scorer.
 * @param {string} workspacePath - Forwarded to the scorer.
 * @param {string} stdout - Captured standard output of the run.
 * @param {string} stderr - Captured standard error of the run.
 * @param {object} [config] - Required for the "llm-judge" and "rubric" scorers.
 * @returns {Promise<object>} The scorer result, with failure classification added when it did not pass.
 */
async function scoreTask(task, workspacePath, stdout, stderr, config) {
  let score;
  if (task.scoring === "llm-judge" && config) {
    score = await llmJudgeScorer(task, workspacePath, stdout, stderr, config);
  } else if (task.scoring === "rubric" && config) {
    score = await rubricScorer(task, workspacePath, stdout, stderr, config);
  } else {
    // "pass-fail", unrecognised modes, and config-dependent modes without a
    // config all use the pass/fail scorer.
    score = await passFailScorer(task, workspacePath, stdout, stderr);
  }
  return score.pass ? score : classifyFailure(score, stdout, stderr);
}
|
|
4356
4465
|
|
|
4357
4466
|
// src/evolve/runner.ts
|
|
4358
|
-
var
|
|
4467
|
+
var execAsync3 = promisify3(exec3);
|
|
4359
4468
|
var COPY_SKIP_DIRS = /* @__PURE__ */ new Set([".git", "node_modules", ".kairn-evolve", ".claude"]);
|
|
4360
4469
|
async function deployMcpJson(harnessPath, workDir) {
|
|
4361
4470
|
const src = path18.join(harnessPath, ".mcp.json");
|
|
@@ -4365,12 +4474,12 @@ async function deployMcpJson(harnessPath, workDir) {
|
|
|
4365
4474
|
async function createIsolatedWorkspace(projectRoot, harnessPath) {
|
|
4366
4475
|
const suffix = `${Date.now()}-${Math.random().toString(36).slice(2)}`;
|
|
4367
4476
|
try {
|
|
4368
|
-
await
|
|
4477
|
+
await execAsync3("git rev-parse --is-inside-work-tree", {
|
|
4369
4478
|
cwd: projectRoot,
|
|
4370
4479
|
timeout: 5e3
|
|
4371
4480
|
});
|
|
4372
4481
|
const tmpDir2 = path18.join(os3.tmpdir(), `kairn-evolve-wt-${suffix}`);
|
|
4373
|
-
await
|
|
4482
|
+
await execAsync3(`git worktree add --detach "${tmpDir2}" HEAD`, {
|
|
4374
4483
|
cwd: projectRoot,
|
|
4375
4484
|
timeout: 3e4
|
|
4376
4485
|
});
|
|
@@ -4409,14 +4518,14 @@ async function copyProjectDir(src, dest) {
|
|
|
4409
4518
|
async function cleanupIsolatedWorkspace(workDir, isWorktree, projectRoot) {
|
|
4410
4519
|
if (isWorktree) {
|
|
4411
4520
|
try {
|
|
4412
|
-
await
|
|
4521
|
+
await execAsync3(`git worktree remove "${workDir}" --force`, {
|
|
4413
4522
|
cwd: projectRoot,
|
|
4414
4523
|
timeout: 1e4
|
|
4415
4524
|
});
|
|
4416
4525
|
} catch {
|
|
4417
4526
|
await fs18.rm(workDir, { recursive: true, force: true }).catch(() => {
|
|
4418
4527
|
});
|
|
4419
|
-
await
|
|
4528
|
+
await execAsync3("git worktree prune", {
|
|
4420
4529
|
cwd: projectRoot,
|
|
4421
4530
|
timeout: 5e3
|
|
4422
4531
|
}).catch(() => {
|
|
@@ -4437,7 +4546,7 @@ async function runTask(task, harnessPath, traceDir, iteration, projectRoot) {
|
|
|
4437
4546
|
let setupStderr = "";
|
|
4438
4547
|
if (task.setup.trim()) {
|
|
4439
4548
|
try {
|
|
4440
|
-
await
|
|
4549
|
+
await execAsync3(task.setup, { cwd: workDir, timeout: 6e4 });
|
|
4441
4550
|
} catch (err) {
|
|
4442
4551
|
setupStderr = err instanceof Error ? err.message : String(err);
|
|
4443
4552
|
}
|
|
@@ -5218,19 +5327,41 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
|
|
|
5218
5327
|
};
|
|
5219
5328
|
await writeIterationLog(workspacePath, rollbackLog);
|
|
5220
5329
|
history.push(rollbackLog);
|
|
5330
|
+
const bestHarnessPath = path21.join(
|
|
5331
|
+
workspacePath,
|
|
5332
|
+
"iterations",
|
|
5333
|
+
bestIteration.toString(),
|
|
5334
|
+
"harness"
|
|
5335
|
+
);
|
|
5221
5336
|
if (iter + 1 < evolveConfig.maxIterations) {
|
|
5222
|
-
|
|
5223
|
-
|
|
5224
|
-
|
|
5225
|
-
|
|
5226
|
-
|
|
5227
|
-
|
|
5228
|
-
|
|
5229
|
-
|
|
5230
|
-
|
|
5231
|
-
|
|
5232
|
-
|
|
5233
|
-
|
|
5337
|
+
onProgress?.({ type: "proposing", iteration: iter, message: "Proposing new mutations after rollback" });
|
|
5338
|
+
try {
|
|
5339
|
+
let rollbackProposal = await propose(
|
|
5340
|
+
iter,
|
|
5341
|
+
workspacePath,
|
|
5342
|
+
bestHarnessPath,
|
|
5343
|
+
history,
|
|
5344
|
+
tasks,
|
|
5345
|
+
kairnConfig,
|
|
5346
|
+
evolveConfig.proposerModel
|
|
5347
|
+
);
|
|
5348
|
+
if (rollbackProposal.mutations.length > evolveConfig.maxMutationsPerIteration) {
|
|
5349
|
+
rollbackProposal = {
|
|
5350
|
+
...rollbackProposal,
|
|
5351
|
+
mutations: rollbackProposal.mutations.slice(0, evolveConfig.maxMutationsPerIteration)
|
|
5352
|
+
};
|
|
5353
|
+
}
|
|
5354
|
+
const nextIterDir2 = path21.join(workspacePath, "iterations", (iter + 1).toString());
|
|
5355
|
+
await applyMutations(bestHarnessPath, nextIterDir2, rollbackProposal.mutations);
|
|
5356
|
+
onProgress?.({
|
|
5357
|
+
type: "mutations-applied",
|
|
5358
|
+
iteration: iter,
|
|
5359
|
+
mutationCount: rollbackProposal.mutations.length
|
|
5360
|
+
});
|
|
5361
|
+
} catch {
|
|
5362
|
+
const nextIterDir2 = path21.join(workspacePath, "iterations", (iter + 1).toString());
|
|
5363
|
+
await copyDir(bestHarnessPath, path21.join(nextIterDir2, "harness"));
|
|
5364
|
+
}
|
|
5234
5365
|
}
|
|
5235
5366
|
continue;
|
|
5236
5367
|
}
|
|
@@ -5431,6 +5562,7 @@ function buildLeaderboard(iterations, tasks) {
|
|
|
5431
5562
|
const taskIds = tasks.map((t) => t.id);
|
|
5432
5563
|
return taskIds.map((taskId) => {
|
|
5433
5564
|
const scores = {};
|
|
5565
|
+
const variance = {};
|
|
5434
5566
|
let bestScore = -1;
|
|
5435
5567
|
let bestIteration = 0;
|
|
5436
5568
|
for (const iter of iterations) {
|
|
@@ -5438,13 +5570,21 @@ function buildLeaderboard(iterations, tasks) {
|
|
|
5438
5570
|
if (s) {
|
|
5439
5571
|
const score = numericScore2(s);
|
|
5440
5572
|
scores[iter.iteration] = score;
|
|
5573
|
+
if (s.variance) {
|
|
5574
|
+
variance[iter.iteration] = {
|
|
5575
|
+
mean: s.variance.mean,
|
|
5576
|
+
stddev: s.variance.stddev,
|
|
5577
|
+
runs: s.variance.runs
|
|
5578
|
+
};
|
|
5579
|
+
}
|
|
5441
5580
|
if (score > bestScore) {
|
|
5442
5581
|
bestScore = score;
|
|
5443
5582
|
bestIteration = iter.iteration;
|
|
5444
5583
|
}
|
|
5445
5584
|
}
|
|
5446
5585
|
}
|
|
5447
|
-
|
|
5586
|
+
const hasVariance = Object.keys(variance).length > 0;
|
|
5587
|
+
return { taskId, scores, bestIteration, bestScore, ...hasVariance ? { variance } : {} };
|
|
5448
5588
|
});
|
|
5449
5589
|
}
|
|
5450
5590
|
function iterationStatus(iter, bestIteration) {
|
|
@@ -5480,13 +5620,29 @@ async function generateMarkdownReport(workspacePath) {
|
|
|
5480
5620
|
lines.push("");
|
|
5481
5621
|
lines.push("## Iterations");
|
|
5482
5622
|
lines.push("");
|
|
5483
|
-
|
|
5484
|
-
|
|
5623
|
+
const hasVariance = iterations.some(
|
|
5624
|
+
(iter) => Object.values(iter.taskResults).some((s) => s.variance)
|
|
5625
|
+
);
|
|
5626
|
+
if (hasVariance) {
|
|
5627
|
+
lines.push("| Iter | Score | Mutations | Status |");
|
|
5628
|
+
lines.push("|------|-------|-----------|--------|");
|
|
5629
|
+
} else {
|
|
5630
|
+
lines.push("| Iter | Score | Mutations | Status |");
|
|
5631
|
+
lines.push("|------|-------|-----------|--------|");
|
|
5632
|
+
}
|
|
5485
5633
|
for (const iter of iterations) {
|
|
5486
5634
|
const mutations = iter.proposal?.mutations.length ?? 0;
|
|
5487
5635
|
const mutStr = mutations > 0 ? mutations.toString() : "-";
|
|
5488
5636
|
const status = iterationStatus(iter, bestIter.iteration);
|
|
5489
|
-
|
|
5637
|
+
let scoreStr = `${iter.score.toFixed(1)}%`;
|
|
5638
|
+
if (hasVariance) {
|
|
5639
|
+
const stddevs = Object.values(iter.taskResults).map((s) => s.variance?.stddev).filter((v) => v !== void 0);
|
|
5640
|
+
if (stddevs.length > 0) {
|
|
5641
|
+
const avgStddev = stddevs.reduce((a, b) => a + b, 0) / stddevs.length;
|
|
5642
|
+
scoreStr = `${iter.score.toFixed(1)}% \xB1${avgStddev.toFixed(1)}`;
|
|
5643
|
+
}
|
|
5644
|
+
}
|
|
5645
|
+
lines.push(`| ${iter.iteration} | ${scoreStr} | ${mutStr} | ${status} |`);
|
|
5490
5646
|
}
|
|
5491
5647
|
lines.push("");
|
|
5492
5648
|
if (leaderboard.length > 0) {
|
|
@@ -5499,7 +5655,10 @@ async function generateMarkdownReport(workspacePath) {
|
|
|
5499
5655
|
for (const entry of leaderboard) {
|
|
5500
5656
|
const scoreCols = iterNums.map((n) => {
|
|
5501
5657
|
const s = entry.scores[n];
|
|
5502
|
-
|
|
5658
|
+
if (s === void 0) return "-";
|
|
5659
|
+
const v = entry.variance?.[n];
|
|
5660
|
+
if (v && v.runs > 1) return `${s.toFixed(0)}% \xB1${v.stddev.toFixed(1)}`;
|
|
5661
|
+
return `${s.toFixed(0)}%`;
|
|
5503
5662
|
});
|
|
5504
5663
|
lines.push(`| ${entry.taskId} | ${scoreCols.join(" | ")} | ${entry.bestScore.toFixed(0)}% (iter ${entry.bestIteration}) |`);
|
|
5505
5664
|
}
|
|
@@ -5549,12 +5708,17 @@ async function generateJsonReport(workspacePath) {
|
|
|
5549
5708
|
bestIteration: bestIter.iteration,
|
|
5550
5709
|
improvement
|
|
5551
5710
|
},
|
|
5552
|
-
iterations: iterations.map((iter) =>
|
|
5553
|
-
|
|
5554
|
-
|
|
5555
|
-
|
|
5556
|
-
|
|
5557
|
-
|
|
5711
|
+
iterations: iterations.map((iter) => {
|
|
5712
|
+
const stddevs = Object.values(iter.taskResults).map((s) => s.variance?.stddev).filter((v) => v !== void 0);
|
|
5713
|
+
const avgStddev = stddevs.length > 0 ? stddevs.reduce((a, b) => a + b, 0) / stddevs.length : void 0;
|
|
5714
|
+
return {
|
|
5715
|
+
iteration: iter.iteration,
|
|
5716
|
+
score: iter.score,
|
|
5717
|
+
...avgStddev !== void 0 ? { stddev: avgStddev } : {},
|
|
5718
|
+
mutationCount: iter.proposal?.mutations.length ?? 0,
|
|
5719
|
+
status: iterationStatus(iter, bestIter.iteration)
|
|
5720
|
+
};
|
|
5721
|
+
}),
|
|
5558
5722
|
leaderboard,
|
|
5559
5723
|
counterfactuals
|
|
5560
5724
|
};
|
|
@@ -5744,7 +5908,7 @@ evolveCommand.command("init").description("Initialize an evolution workspace wit
|
|
|
5744
5908
|
let addMore = true;
|
|
5745
5909
|
while (addMore) {
|
|
5746
5910
|
try {
|
|
5747
|
-
addMore = await
|
|
5911
|
+
addMore = await confirm4({ message: "Add another eval task?", default: false });
|
|
5748
5912
|
} catch {
|
|
5749
5913
|
addMore = false;
|
|
5750
5914
|
}
|
|
@@ -6157,10 +6321,12 @@ async function countFiles(dir) {
|
|
|
6157
6321
|
}
|
|
6158
6322
|
|
|
6159
6323
|
// src/cli.ts
|
|
6324
|
+
var require2 = createRequire(import.meta.url);
|
|
6325
|
+
var pkg = require2("../package.json");
|
|
6160
6326
|
var program = new Command12();
|
|
6161
6327
|
program.name("kairn").description(
|
|
6162
6328
|
"Compile natural language intent into optimized Claude Code environments"
|
|
6163
|
-
).version(
|
|
6329
|
+
).version(pkg.version).option("--no-color", "Disable colored output");
|
|
6164
6330
|
program.addCommand(initCommand);
|
|
6165
6331
|
program.addCommand(describeCommand);
|
|
6166
6332
|
program.addCommand(optimizeCommand);
|