kairn-cli 2.7.1 → 2.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +426 -29
- package/dist/cli.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -218,6 +218,7 @@ async function callLLM(config, userMessage, options) {
|
|
|
218
218
|
const maxTokens = options.maxTokens ?? 8192;
|
|
219
219
|
const { systemPrompt } = options;
|
|
220
220
|
const jsonMode = options.jsonMode ?? false;
|
|
221
|
+
const cacheControl = options.cacheControl ?? false;
|
|
221
222
|
const providerName = getProviderName(config.provider);
|
|
222
223
|
let apiKey = config.api_key;
|
|
223
224
|
if (config.auth_type === "claude-code-oauth") {
|
|
@@ -238,7 +239,7 @@ async function callLLM(config, userMessage, options) {
|
|
|
238
239
|
const response = await client2.messages.create({
|
|
239
240
|
model: config.model,
|
|
240
241
|
max_tokens: maxTokens,
|
|
241
|
-
system: systemPrompt,
|
|
242
|
+
system: cacheControl ? [{ type: "text", text: systemPrompt, cache_control: { type: "ephemeral" } }] : systemPrompt,
|
|
242
243
|
messages
|
|
243
244
|
});
|
|
244
245
|
const textBlock = response.content.find((block) => block.type === "text");
|
|
@@ -461,6 +462,96 @@ var init_exec = __esm({
|
|
|
461
462
|
});
|
|
462
463
|
|
|
463
464
|
// src/evolve/scorers.ts
|
|
465
|
+
/**
 * Attempts to grade a single rubric criterion from captured output alone,
 * using case-insensitive substring checks.
 *
 * The criterion's leading word selects the strategy:
 *   - "ran ..."        -> look for evidence strings from RAN_COMMAND_EVIDENCE
 *   - "zero/no ..."    -> look for prohibited strings from ABSENCE_PATTERNS
 *   - "use/uses ..."   -> look for required strings from PRESENCE_PATTERNS
 *   - "call/calls ..." -> look for the literal pattern from CALL_PATTERNS
 *
 * Within a strategy, only the FIRST table entry whose keyword appears in the
 * criterion is consulted; later entries are never tried.
 *
 * NOTE(review): matching is plain substring search, so a short pattern such as
 * "sync" in ABSENCE_PATTERNS is also satisfied by "async" in the output —
 * confirm whether that is intended.
 *
 * @param {string} criterionText - Rubric criterion as written by the task author.
 * @param {string} stdout - Captured standard output.
 * @param {string} stderr - Captured standard error.
 * @returns {{score: number, reasoning: string} | null} A 0/1 verdict, or null
 *   when no deterministic rule applies.
 */
function scoreCriterionDeterministic(criterionText, stdout, stderr) {
  const haystack = `${stdout}\n${stderr}`.toLowerCase();
  const trimmedCriterion = criterionText.trim();
  const criterionLower = criterionText.toLowerCase().trim();

  // Shared case-insensitive matchers for the two texts being compared.
  const appearsInOutput = (text) => haystack.includes(text.toLowerCase());
  const appearsInCriterion = (text) => criterionLower.includes(text.toLowerCase());

  if (/^ran\b/i.test(trimmedCriterion)) {
    const rule = RAN_COMMAND_EVIDENCE.find((entry) => entry.keywords.some((kw) => appearsInCriterion(kw)));
    if (rule === undefined) {
      return null;
    }
    const matchedEvidence = rule.evidence.find((ev) => appearsInOutput(ev));
    if (matchedEvidence !== undefined) {
      return {
        score: 1,
        reasoning: `Deterministic: found evidence of '${matchedEvidence}' in output`
      };
    }
    return {
      score: 0,
      reasoning: `Deterministic: no evidence of '${rule.keywords[0]}' found`
    };
  }

  if (/^(zero|no)\b/i.test(trimmedCriterion)) {
    const rule = ABSENCE_PATTERNS.find((entry) => entry.keywords.some((kw) => appearsInCriterion(kw)));
    if (rule === undefined) {
      return null;
    }
    const matchedPattern = rule.search.find((pat) => appearsInOutput(pat));
    if (matchedPattern !== undefined) {
      return {
        score: 0,
        reasoning: `Deterministic: found '${matchedPattern}' which should be absent`
      };
    }
    return {
      score: 1,
      reasoning: `Deterministic: no prohibited pattern found in output`
    };
  }

  if (/^uses?\b/i.test(trimmedCriterion)) {
    const rule = PRESENCE_PATTERNS.find((entry) => appearsInCriterion(entry.keyword));
    if (rule === undefined) {
      return null;
    }
    if (rule.search.some((s) => appearsInOutput(s))) {
      return {
        score: 1,
        reasoning: `Deterministic: found '${rule.keyword}' in output`
      };
    }
    return {
      score: 0,
      reasoning: `Deterministic: '${rule.keyword}' not found in output`
    };
  }

  if (/^calls?\b/i.test(trimmedCriterion)) {
    const pattern = CALL_PATTERNS.find((candidate) => appearsInCriterion(candidate));
    if (pattern === undefined) {
      return null;
    }
    if (appearsInOutput(pattern)) {
      return {
        score: 1,
        reasoning: `Deterministic: found '${pattern}' in output`
      };
    }
    return {
      score: 0,
      reasoning: `Deterministic: '${pattern}' not found in output`
    };
  }

  return null;
}
|
|
464
555
|
async function passFailScorer(task, workspacePath, stdout, stderr) {
|
|
465
556
|
const outcomes = Array.isArray(task.expected_outcome) ? task.expected_outcome : task.expected_outcome.split("\n");
|
|
466
557
|
const commands = outcomes.map((line) => line.replace(/^-\s*/, "").trim()).filter((line) => COMMAND_PATTERN.test(line));
|
|
@@ -513,7 +604,8 @@ async function llmJudgeScorer(task, workspacePath, stdout, stderr, config) {
|
|
|
513
604
|
try {
|
|
514
605
|
const response = await callLLM(config, userMessage, {
|
|
515
606
|
systemPrompt: JUDGE_SYSTEM_PROMPT,
|
|
516
|
-
maxTokens: 1024
|
|
607
|
+
maxTokens: 1024,
|
|
608
|
+
cacheControl: true
|
|
517
609
|
});
|
|
518
610
|
let cleaned = response.trim();
|
|
519
611
|
if (cleaned.startsWith("```")) {
|
|
@@ -544,6 +636,20 @@ async function rubricScorer(task, workspacePath, stdout, stderr, config) {
|
|
|
544
636
|
const breakdown = [];
|
|
545
637
|
let weightedSum = 0;
|
|
546
638
|
for (const criterion of task.rubric) {
|
|
639
|
+
const deterministicResult = scoreCriterionDeterministic(
|
|
640
|
+
criterion.criterion,
|
|
641
|
+
stdout,
|
|
642
|
+
stderr
|
|
643
|
+
);
|
|
644
|
+
if (deterministicResult !== null) {
|
|
645
|
+
breakdown.push({
|
|
646
|
+
criterion: criterion.criterion,
|
|
647
|
+
score: deterministicResult.score,
|
|
648
|
+
weight: criterion.weight
|
|
649
|
+
});
|
|
650
|
+
weightedSum += deterministicResult.score * criterion.weight;
|
|
651
|
+
continue;
|
|
652
|
+
}
|
|
547
653
|
const userMessage = [
|
|
548
654
|
"## Task",
|
|
549
655
|
task.description,
|
|
@@ -560,7 +666,8 @@ async function rubricScorer(task, workspacePath, stdout, stderr, config) {
|
|
|
560
666
|
try {
|
|
561
667
|
const response = await callLLM(config, userMessage, {
|
|
562
668
|
systemPrompt: RUBRIC_SYSTEM_PROMPT,
|
|
563
|
-
maxTokens: 512
|
|
669
|
+
maxTokens: 512,
|
|
670
|
+
cacheControl: true
|
|
564
671
|
});
|
|
565
672
|
let cleaned = response.trim();
|
|
566
673
|
if (cleaned.startsWith("```")) {
|
|
@@ -638,7 +745,7 @@ async function scoreTask(task, workspacePath, stdout, stderr, config) {
|
|
|
638
745
|
}
|
|
639
746
|
return score;
|
|
640
747
|
}
|
|
641
|
-
var COMMAND_PATTERN, SHELL_METACHAR_PATTERN, JUDGE_SYSTEM_PROMPT, RUBRIC_SYSTEM_PROMPT;
|
|
748
|
+
var COMMAND_PATTERN, SHELL_METACHAR_PATTERN, JUDGE_SYSTEM_PROMPT, RUBRIC_SYSTEM_PROMPT, RAN_COMMAND_EVIDENCE, ABSENCE_PATTERNS, PRESENCE_PATTERNS, CALL_PATTERNS;
|
|
642
749
|
var init_scorers = __esm({
|
|
643
750
|
"src/evolve/scorers.ts"() {
|
|
644
751
|
"use strict";
|
|
@@ -661,6 +768,31 @@ Return ONLY valid JSON:
|
|
|
661
768
|
"score": 0.0-1.0,
|
|
662
769
|
"reasoning": "Brief explanation"
|
|
663
770
|
}`;
|
|
771
|
+
RAN_COMMAND_EVIDENCE = [
|
|
772
|
+
{ keywords: ["npm run build", "build", "tsup"], evidence: ["build success", "tsup", "built in", "build completed"] },
|
|
773
|
+
{ keywords: ["tsc", "typecheck"], evidence: ["tsc", "typecheck"] },
|
|
774
|
+
{ keywords: ["npm run lint", "eslint", "lint"], evidence: ["lint", "eslint"] },
|
|
775
|
+
{ keywords: ["npm test", "vitest", "test"], evidence: ["vitest", "test files", "tests passed", "passed (", "tests "] }
|
|
776
|
+
];
|
|
777
|
+
ABSENCE_PATTERNS = [
|
|
778
|
+
{ keywords: [".then()", ".catch()"], search: [".then(", ".catch("] },
|
|
779
|
+
{ keywords: ["readfilesync", "writefilesync"], search: ["readfilesync", "writefilesync"] },
|
|
780
|
+
{ keywords: ["sync"], search: ["sync"] }
|
|
781
|
+
];
|
|
782
|
+
PRESENCE_PATTERNS = [
|
|
783
|
+
{ keyword: "chalk.green", search: ["chalk.green"] },
|
|
784
|
+
{ keyword: "chalk.yellow", search: ["chalk.yellow"] },
|
|
785
|
+
{ keyword: "chalk.red", search: ["chalk.red"] },
|
|
786
|
+
{ keyword: "chalk.cyan", search: ["chalk.cyan"] },
|
|
787
|
+
{ keyword: "fs.promises", search: ["fs.promises", "fs/promises"] },
|
|
788
|
+
{ keyword: "fs/promises", search: ["fs.promises", "fs/promises"] },
|
|
789
|
+
{ keyword: "async/await", search: ["async ", "await "] },
|
|
790
|
+
{ keyword: "@inquirer/prompts", search: ["@inquirer/prompts"] }
|
|
791
|
+
];
|
|
792
|
+
CALL_PATTERNS = [
|
|
793
|
+
"process.exit(1)",
|
|
794
|
+
"process.exit"
|
|
795
|
+
];
|
|
664
796
|
}
|
|
665
797
|
});
|
|
666
798
|
|
|
@@ -1329,7 +1461,8 @@ async function propose(iteration, workspacePath, harnessPath, history, tasks, co
|
|
|
1329
1461
|
const response = await callLLM(proposerConfig, userMessage, {
|
|
1330
1462
|
systemPrompt: PROPOSER_SYSTEM_PROMPT,
|
|
1331
1463
|
maxTokens: 8192,
|
|
1332
|
-
jsonMode: true
|
|
1464
|
+
jsonMode: true,
|
|
1465
|
+
cacheControl: true
|
|
1333
1466
|
});
|
|
1334
1467
|
return parseProposerResponse(response);
|
|
1335
1468
|
}
|
|
@@ -1831,7 +1964,22 @@ async function parseAgents(harnessPath) {
|
|
|
1831
1964
|
if (Array.isArray(disallowedTools)) {
|
|
1832
1965
|
node.disallowedTools = disallowedTools;
|
|
1833
1966
|
}
|
|
1834
|
-
const
|
|
1967
|
+
const modelRouting = frontmatter["modelRouting"];
|
|
1968
|
+
if (typeof modelRouting === "object" && modelRouting !== null) {
|
|
1969
|
+
const mr = modelRouting;
|
|
1970
|
+
if (typeof mr["default"] === "string") {
|
|
1971
|
+
node.modelRouting = {
|
|
1972
|
+
default: mr["default"]
|
|
1973
|
+
};
|
|
1974
|
+
if (typeof mr["escalateTo"] === "string") {
|
|
1975
|
+
node.modelRouting.escalateTo = mr["escalateTo"];
|
|
1976
|
+
}
|
|
1977
|
+
if (typeof mr["escalateWhen"] === "string") {
|
|
1978
|
+
node.modelRouting.escalateWhen = mr["escalateWhen"];
|
|
1979
|
+
}
|
|
1980
|
+
}
|
|
1981
|
+
}
|
|
1982
|
+
const knownKeys = /* @__PURE__ */ new Set(["name", "model", "disallowedTools", "modelRouting"]);
|
|
1835
1983
|
const extra = {};
|
|
1836
1984
|
for (const [key, value] of Object.entries(frontmatter)) {
|
|
1837
1985
|
if (!knownKeys.has(key)) {
|
|
@@ -2457,8 +2605,9 @@ function renderRuleWithFrontmatter(rule) {
|
|
|
2457
2605
|
function renderAgentWithFrontmatter(agent) {
|
|
2458
2606
|
const hasModel = agent.model !== void 0;
|
|
2459
2607
|
const hasDisallowed = agent.disallowedTools !== void 0 && agent.disallowedTools.length > 0;
|
|
2608
|
+
const hasRouting = agent.modelRouting !== void 0;
|
|
2460
2609
|
const hasExtra = agent.extraFrontmatter !== void 0 && Object.keys(agent.extraFrontmatter).length > 0;
|
|
2461
|
-
if (!hasModel && !hasDisallowed && !hasExtra) {
|
|
2610
|
+
if (!hasModel && !hasDisallowed && !hasRouting && !hasExtra) {
|
|
2462
2611
|
return agent.content;
|
|
2463
2612
|
}
|
|
2464
2613
|
const yamlLines = ["---"];
|
|
@@ -2471,6 +2620,16 @@ function renderAgentWithFrontmatter(agent) {
|
|
|
2471
2620
|
yamlLines.push(` - ${tool}`);
|
|
2472
2621
|
}
|
|
2473
2622
|
}
|
|
2623
|
+
if (hasRouting) {
|
|
2624
|
+
yamlLines.push("modelRouting:");
|
|
2625
|
+
yamlLines.push(` default: ${agent.modelRouting.default}`);
|
|
2626
|
+
if (agent.modelRouting.escalateTo) {
|
|
2627
|
+
yamlLines.push(` escalateTo: ${agent.modelRouting.escalateTo}`);
|
|
2628
|
+
}
|
|
2629
|
+
if (agent.modelRouting.escalateWhen) {
|
|
2630
|
+
yamlLines.push(` escalateWhen: ${agent.modelRouting.escalateWhen}`);
|
|
2631
|
+
}
|
|
2632
|
+
}
|
|
2474
2633
|
if (hasExtra) {
|
|
2475
2634
|
for (const [key, value] of Object.entries(agent.extraFrontmatter)) {
|
|
2476
2635
|
if (Array.isArray(value)) {
|
|
@@ -2904,6 +3063,14 @@ async function renderAffectedFiles(ir, targetDir, touchedCategories) {
|
|
|
2904
3063
|
if (touchedCategories.has("agents")) {
|
|
2905
3064
|
await deleteOrphanedFiles(targetDir, "agents", fileMap);
|
|
2906
3065
|
}
|
|
3066
|
+
if (touchedCategories.has("mcp") && !fileMap.has(".mcp.json")) {
|
|
3067
|
+
await fs22.unlink(path22.join(targetDir, ".mcp.json")).catch(() => {
|
|
3068
|
+
});
|
|
3069
|
+
}
|
|
3070
|
+
if (touchedCategories.has("settings") && !fileMap.has("settings.json")) {
|
|
3071
|
+
await fs22.unlink(path22.join(targetDir, "settings.json")).catch(() => {
|
|
3072
|
+
});
|
|
3073
|
+
}
|
|
2907
3074
|
}
|
|
2908
3075
|
function getFileCategory(relativePath) {
|
|
2909
3076
|
if (relativePath === "CLAUDE.md") return "claude_md";
|
|
@@ -3342,6 +3509,92 @@ var init_regularization = __esm({
|
|
|
3342
3509
|
}
|
|
3343
3510
|
});
|
|
3344
3511
|
|
|
3512
|
+
// src/evolve/targeting.ts
|
|
3513
|
+
/**
 * Classifies one harness mutation by the harness "aspect" it touches, so the
 * evolve loop can decide which tasks need to be re-evaluated.
 *
 * Section mutations are classified by section id; every other mutation is
 * classified by its type. Anything that cannot be classified — an unknown
 * section id, a missing section payload, or an unrecognised mutation type —
 * maps to "general". That is the conservative answer: shouldReEvaluate treats
 * a changed "general" aspect as affecting every task, so an unclassifiable
 * mutation can never cause a task to be skipped.
 *
 * @param {{type: string, sectionId?: string, section?: {id?: string}}} mutation
 *   Mutation to classify.
 * @returns {string} One of "conventions", "commands", "verification",
 *   "architecture", "rules", "agents", "mcp", "settings" or "general".
 */
function mutationToAspect(mutation) {
  switch (mutation.type) {
    case "update_section":
    case "add_section": {
      // update_section carries the id directly; add_section nests it in the new section.
      const sectionId = mutation.type === "update_section" ? mutation.sectionId : mutation.section?.id;
      switch (sectionId) {
        case "conventions":
        case "gotchas":
        case "debugging":
        case "git":
          return "conventions";
        case "commands":
        case "custom-key-commands":
          return "commands";
        case "verification":
          return "verification";
        case "architecture":
          return "architecture";
        default:
          return "general";
      }
    }
    case "add_command":
    case "update_command":
    case "remove_command":
      return "commands";
    case "add_rule":
    case "update_rule":
    case "remove_rule":
      return "rules";
    case "add_agent":
    case "update_agent":
    case "remove_agent":
      return "agents";
    case "add_mcp_server":
    case "remove_mcp_server":
      return "mcp";
    case "update_settings":
      return "settings";
    case "remove_section":
    case "reorder_section":
    case "raw_text":
      return "general";
    default:
      // Unknown mutation kinds previously fell out of the switch and produced
      // `undefined`; treat them as "general" so every task is re-run.
      return "general";
  }
}
|
|
3555
|
+
/**
 * Collects the distinct aspects touched by a batch of mutations.
 *
 * @param {Iterable<object>} mutations - Mutations to classify via mutationToAspect.
 * @returns {Set<string>} The set of aspects, in first-seen order.
 */
function mutationsToAspects(mutations) {
  return new Set([...mutations].map((mutation) => mutationToAspect(mutation)));
}
|
|
3562
|
+
/**
 * Looks up which harness aspects a task depends on, keyed by its template.
 *
 * @param {{template: string}} task - Task whose `template` is looked up in TEMPLATE_ASPECTS.
 * @returns {Set<string>} The template's aspects, or just "general" when the
 *   template has no entry in the table.
 */
function taskDependsOnAspects(task) {
  const templateAspects = TEMPLATE_ASPECTS[task.template];
  if (templateAspects === undefined || templateAspects === null) {
    return new Set(["general"]);
  }
  return new Set(templateAspects);
}
|
|
3566
|
+
/**
 * Decides whether a task must be re-run after a set of aspects changed.
 *
 * Rules, in order:
 *   1. a changed "general" aspect affects every task;
 *   2. with no changed aspects at all, nothing needs re-running;
 *   3. a task that depends on "general" is affected by any change;
 *   4. otherwise the task is affected only if one of its aspects changed.
 *
 * @param {{template: string}} task - Task under consideration.
 * @param {Set<string>} changedAspects - Aspects touched by the last mutations.
 * @returns {boolean} True when the task should be evaluated again.
 */
function shouldReEvaluate(task, changedAspects) {
  if (changedAspects.has("general")) {
    return true;
  }
  if (changedAspects.size === 0) {
    return false;
  }
  const requiredAspects = taskDependsOnAspects(task);
  return requiredAspects.has("general") || [...requiredAspects].some((aspect) => changedAspects.has(aspect));
}
|
|
3576
|
+
/**
 * Keeps only the tasks that shouldReEvaluate reports as affected by the
 * given changed aspects.
 *
 * @param {Array<object>} tasks - Candidate tasks.
 * @param {Set<string>} changedAspects - Aspects touched by the last mutations.
 * @returns {Array<object>} A new array containing the affected tasks, in order.
 */
function filterTasksByAspects(tasks, changedAspects) {
  const isAffected = (task) => shouldReEvaluate(task, changedAspects);
  return tasks.filter(isAffected);
}
|
|
3579
|
+
// Task template -> harness aspects that tasks of that template depend on.
// Declared with `var` and assigned inside the lazy module initializer below,
// so it stays undefined until init_targeting() has run.
var TEMPLATE_ASPECTS;
var init_targeting = __esm({
  "src/evolve/targeting.ts"() {
    "use strict";
    // Templates listed with "general" depend on the harness as a whole.
    TEMPLATE_ASPECTS = Object.fromEntries([
      ["convention-adherence", ["conventions", "rules"]],
      ["workflow-compliance", ["commands", "verification"]],
      ["rule-compliance", ["rules"]],
      ["intent-routing", ["settings"]],
      ["add-feature", ["general"]],
      ["fix-bug", ["general"]],
      ["refactor", ["architecture", "conventions"]],
      ["test-writing", ["verification", "commands"]],
      ["config-change", ["settings", "mcp"]],
      ["documentation", ["general"]]
    ]);
  }
});
|
|
3597
|
+
|
|
3345
3598
|
// src/evolve/loop.ts
|
|
3346
3599
|
import fs25 from "fs/promises";
|
|
3347
3600
|
import path25 from "path";
|
|
@@ -3374,6 +3627,7 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
|
|
|
3374
3627
|
}
|
|
3375
3628
|
}
|
|
3376
3629
|
}
|
|
3630
|
+
let lastChangedAspects = null;
|
|
3377
3631
|
let rngState = evolveConfig.rngSeed ?? 42;
|
|
3378
3632
|
const rng = () => {
|
|
3379
3633
|
rngState = rngState * 1664525 + 1013904223 & 4294967295;
|
|
@@ -3420,6 +3674,22 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
|
|
|
3420
3674
|
tasksToRun.push(task);
|
|
3421
3675
|
}
|
|
3422
3676
|
}
|
|
3677
|
+
if (lastChangedAspects !== null) {
|
|
3678
|
+
const targetedTasks = filterTasksByAspects(tasksToRun, lastChangedAspects);
|
|
3679
|
+
const skippedByTargeting = tasksToRun.filter((t) => !targetedTasks.includes(t));
|
|
3680
|
+
for (const task of skippedByTargeting) {
|
|
3681
|
+
const prev = prevLog.taskResults[task.id];
|
|
3682
|
+
const prevVal = prev ? prev.score ?? (prev.pass ? 100 : 0) : 0;
|
|
3683
|
+
carriedScores[task.id] = { pass: prevVal >= 50, score: prevVal };
|
|
3684
|
+
onProgress?.({
|
|
3685
|
+
type: "task-skipped",
|
|
3686
|
+
iteration: iter,
|
|
3687
|
+
taskId: task.id,
|
|
3688
|
+
message: `Skipped ${task.id} (unaffected by mutations)`
|
|
3689
|
+
});
|
|
3690
|
+
}
|
|
3691
|
+
tasksToRun = targetedTasks;
|
|
3692
|
+
}
|
|
3423
3693
|
const sampleSize = evolveConfig.evalSampleSize;
|
|
3424
3694
|
if (sampleSize > 0 && sampleSize < tasksToRun.length) {
|
|
3425
3695
|
let sampled;
|
|
@@ -3488,7 +3758,7 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
|
|
|
3488
3758
|
}
|
|
3489
3759
|
if (useThompson) {
|
|
3490
3760
|
const scoreMap = {};
|
|
3491
|
-
for (const [taskId, score] of Object.entries(
|
|
3761
|
+
for (const [taskId, score] of Object.entries(evalResults)) {
|
|
3492
3762
|
scoreMap[taskId] = score.score ?? (score.pass ? 100 : 0);
|
|
3493
3763
|
}
|
|
3494
3764
|
beliefs = updateBeliefs(beliefs, scoreMap);
|
|
@@ -3575,6 +3845,13 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
|
|
|
3575
3845
|
}
|
|
3576
3846
|
const nextIterDir2 = path25.join(workspacePath, "iterations", (iter + 1).toString());
|
|
3577
3847
|
await applyMutations(bestHarnessPath, nextIterDir2, rollbackProposal.mutations);
|
|
3848
|
+
try {
|
|
3849
|
+
const rollbackIR = await parseHarness(bestHarnessPath);
|
|
3850
|
+
const irMuts = translateMutations(rollbackProposal.mutations, rollbackIR);
|
|
3851
|
+
lastChangedAspects = mutationsToAspects(irMuts);
|
|
3852
|
+
} catch {
|
|
3853
|
+
lastChangedAspects = null;
|
|
3854
|
+
}
|
|
3578
3855
|
onProgress?.({
|
|
3579
3856
|
type: "mutations-applied",
|
|
3580
3857
|
iteration: iter,
|
|
@@ -3679,8 +3956,16 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
|
|
|
3679
3956
|
proposal.mutations
|
|
3680
3957
|
);
|
|
3681
3958
|
diffPatch = mutationResult.diffPatch;
|
|
3959
|
+
try {
|
|
3960
|
+
const currentIR = await parseHarness(harnessPath);
|
|
3961
|
+
const irMuts = translateMutations(proposal.mutations, currentIR);
|
|
3962
|
+
lastChangedAspects = mutationsToAspects(irMuts);
|
|
3963
|
+
} catch {
|
|
3964
|
+
lastChangedAspects = null;
|
|
3965
|
+
}
|
|
3682
3966
|
} catch {
|
|
3683
3967
|
await copyDir(harnessPath, path25.join(nextIterDir, "harness"));
|
|
3968
|
+
lastChangedAspects = null;
|
|
3684
3969
|
}
|
|
3685
3970
|
onProgress?.({
|
|
3686
3971
|
type: "mutations-applied",
|
|
@@ -3779,6 +4064,8 @@ var init_loop = __esm({
|
|
|
3779
4064
|
init_sampling();
|
|
3780
4065
|
init_regularization();
|
|
3781
4066
|
init_parser();
|
|
4067
|
+
init_translate();
|
|
4068
|
+
init_targeting();
|
|
3782
4069
|
}
|
|
3783
4070
|
});
|
|
3784
4071
|
|
|
@@ -3909,7 +4196,8 @@ ${userMessage}`;
|
|
|
3909
4196
|
const response = await callLLM(proposerConfig, fullMessage, {
|
|
3910
4197
|
systemPrompt,
|
|
3911
4198
|
maxTokens: 8192,
|
|
3912
|
-
jsonMode: true
|
|
4199
|
+
jsonMode: true,
|
|
4200
|
+
cacheControl: true
|
|
3913
4201
|
});
|
|
3914
4202
|
const proposal = parseProposerResponse(response);
|
|
3915
4203
|
return {
|
|
@@ -4744,6 +5032,12 @@ At the start of every session, before doing ANY work:
|
|
|
4744
5032
|
|
|
4745
5033
|
This saves 2-5 exploratory turns. Never ask "what files are here?" \u2014 look first.
|
|
4746
5034
|
|
|
5035
|
+
## Sprint Contract
|
|
5036
|
+
|
|
5037
|
+
Before implementing, confirm acceptance criteria exist in docs/SPRINT.md.
|
|
5038
|
+
Each criterion must be numbered, testable, and independently verifiable.
|
|
5039
|
+
After implementing, verify EACH criterion individually. Do not mark done until all pass.
|
|
5040
|
+
|
|
4747
5041
|
## Completion Standards
|
|
4748
5042
|
|
|
4749
5043
|
Never mark a task "done" without running the Completion Verification checklist.
|
|
@@ -4773,6 +5067,7 @@ Do not add generic filler. Every line must be specific to the user's workflow.
|
|
|
4773
5067
|
15. "Engineering Standards", "Tool Usage Policy", and "Code Philosophy" sections in CLAUDE.md
|
|
4774
5068
|
16. A "First Turn Protocol" section in CLAUDE.md (orient before working: pwd, ls, git status, check relevant runtimes, read task files)
|
|
4775
5069
|
17. A "Completion Standards" section in CLAUDE.md (never mark done without verifying: requirements met, tests passing, no debug artifacts, reviewed from 3 perspectives)
|
|
5070
|
+
18. A "Sprint Contract" section in CLAUDE.md (confirm acceptance criteria exist before implementing, verify each criterion after)
|
|
4776
5071
|
|
|
4777
5072
|
## Shell-Integrated Commands
|
|
4778
5073
|
|
|
@@ -4829,17 +5124,19 @@ Only generate scoped rules when the workflow involves multiple code domains.
|
|
|
4829
5124
|
|
|
4830
5125
|
Generate hooks in settings.json based on project type:
|
|
4831
5126
|
|
|
4832
|
-
**All code projects** \u2014 block destructive commands:
|
|
5127
|
+
**All code projects** \u2014 block destructive commands, credential leaks, injection, and network exfiltration:
|
|
4833
5128
|
\`\`\`json
|
|
4834
5129
|
{
|
|
4835
5130
|
"hooks": {
|
|
4836
|
-
"PreToolUse": [
|
|
4837
|
-
|
|
4838
|
-
|
|
4839
|
-
"
|
|
4840
|
-
|
|
4841
|
-
|
|
4842
|
-
|
|
5131
|
+
"PreToolUse": [
|
|
5132
|
+
{
|
|
5133
|
+
"matcher": "Bash",
|
|
5134
|
+
"hooks": [{
|
|
5135
|
+
"type": "command",
|
|
5136
|
+
"command": "CMD=$(cat | jq -r '.tool_input.command // empty') && echo \\"$CMD\\" | grep -qiE 'rm\\\\s+-rf\\\\s+/|DROP\\\\s+(TABLE|DATABASE)|curl.*\\\\|\\\\s*sh|:(){ :|:& };:|git\\\\s+push.*--force(?!-with-lease)|ch(mod|own).*-R\\\\s+/|npm\\\\s+publish(?!.*--dry-run)|(api[_-]?key|secret|token|password)\\\\s*[:=]|AKIA[0-9A-Z]{16}|BEGIN.*PRIVATE\\\\s+KEY|;\\\\s*(DROP|DELETE|ALTER|TRUNCATE)\\\\s+|\\\\.\\\\./\\\\.\\\\./\\\\.\\\\./|nc\\\\s+.*-e|/dev/tcp/|bash\\\\s+-i|curl.*-d.*@|wget.*--post-file' && echo 'Blocked dangerous command' >&2 && exit 2 || true"
|
|
5137
|
+
}]
|
|
5138
|
+
}
|
|
5139
|
+
]
|
|
4843
5140
|
}
|
|
4844
5141
|
}
|
|
4845
5142
|
\`\`\`
|
|
@@ -4881,6 +5178,17 @@ All projects should include a PostCompact hook to restore context after compacti
|
|
|
4881
5178
|
|
|
4882
5179
|
Merge this into the settings hooks alongside the PreToolUse and PostToolUse hooks.
|
|
4883
5180
|
|
|
5181
|
+
For long-running sessions (>2 hours or >3 compactions), prefer "Full Reset" over re-inject:
|
|
5182
|
+
replace the prompt-type PostCompact hook with a command-type hook that pipes CLAUDE.md + SPRINT.md + DECISIONS.md content directly into additionalContext.
|
|
5183
|
+
|
|
5184
|
+
## Memory Persistence Hooks
|
|
5185
|
+
|
|
5186
|
+
For projects with multi-session workflows, include SessionStart/End hooks that persist context to \`.claude/memory.json\`:
|
|
5187
|
+
- **SessionEnd:** Save recent decisions, sprint status, and known gotchas to \`.claude/memory.json\`
|
|
5188
|
+
- **SessionStart:** Load \`.claude/memory.json\` and inject as additionalContext
|
|
5189
|
+
|
|
5190
|
+
This ensures accumulated project knowledge survives session boundaries.
|
|
5191
|
+
|
|
4884
5192
|
## For Code Projects, Additionally Include
|
|
4885
5193
|
|
|
4886
5194
|
- \`/project:plan\` command (plan before coding)
|
|
@@ -4890,7 +5198,7 @@ Merge this into the settings hooks alongside the PreToolUse and PostToolUse hook
|
|
|
4890
5198
|
- \`/project:status\` command (live git status, recent commits, SPRINT.md overview using ! prefix)
|
|
4891
5199
|
- \`/project:fix\` command (takes $ARGUMENTS as issue number, plans fix, implements, tests, commits)
|
|
4892
5200
|
- \`/project:sprint\` command (define acceptance criteria before coding, writes to docs/SPRINT.md)
|
|
4893
|
-
- \`/project:develop\` command (full development pipeline \u2014 orchestrates @architect \u2192 @planner \u2192 @implementer \u2192 @verifier \u2192 @fixer \u2192 @grill \u2192 @doc-updater through spec, plan, TDD implement, review, and doc update phases). MUST include a Phase 7 "Completion Gate" that runs a Completion Verification checklist before marking the feature done: re-read original requirements, confirm each is met with evidence, run test suite + lint/typecheck, review git diff for unexpected changes or debug artifacts, answer 3 perspective questions (test engineer, code reviewer, requesting user). If ANY check fails, loop back to fix before completing.
|
|
5201
|
+
- \`/project:develop\` command (full development pipeline \u2014 orchestrates @architect \u2192 @planner \u2192 @implementer \u2192 @verifier \u2192 @fixer \u2192 @grill \u2192 @doc-updater through spec, plan, TDD implement, review, and doc update phases). Phase 4 (Verify) MUST validate EACH acceptance criterion from docs/SPRINT.md individually, reporting PASS/FAIL per item as a contract scorecard. MUST include a Phase 7 "Completion Gate" that runs a Completion Verification checklist before marking the feature done: re-read original requirements, confirm each is met with evidence, run test suite + lint/typecheck, review git diff for unexpected changes or debug artifacts, answer 3 perspective questions (test engineer, code reviewer, requesting user). If ANY check fails, loop back to fix before completing.
|
|
4894
5202
|
- A TDD skill using the 3-phase isolation pattern (RED \u2192 GREEN \u2192 REFACTOR):
|
|
4895
5203
|
- RED: Write failing test only. Verify it FAILS.
|
|
4896
5204
|
- GREEN: Write MINIMUM code to pass. Nothing extra.
|
|
@@ -4900,13 +5208,21 @@ Merge this into the settings hooks alongside the PreToolUse and PostToolUse hook
|
|
|
4900
5208
|
- \`@qa-orchestrator\` (sonnet) \u2014 delegates to linter and e2e-tester, compiles QA report
|
|
4901
5209
|
- \`@linter\` (haiku) \u2014 runs formatters, linters, security scanners
|
|
4902
5210
|
- \`@e2e-tester\` (sonnet, only when Playwright is in tools) \u2014 browser-based QA via Playwright
|
|
4903
|
-
-
|
|
4904
|
-
|
|
4905
|
-
|
|
4906
|
-
-
|
|
4907
|
-
-
|
|
4908
|
-
-
|
|
4909
|
-
|
|
5211
|
+
- A "Model Selection" section in generated agents:
|
|
5212
|
+
\`\`\`
|
|
5213
|
+
## Model Selection (all agents)
|
|
5214
|
+
- Haiku: simple file edits, linting, formatting, doc updates (<50 lines changed)
|
|
5215
|
+
- Sonnet: implementation, testing, debugging, code review (50-500 lines)
|
|
5216
|
+
- Opus: architecture decisions, spec writing, complex refactors (>500 lines or cross-cutting)
|
|
5217
|
+
Default: Sonnet. Only escalate to Opus when the task involves multi-file architecture or ambiguous requirements.
|
|
5218
|
+
\`\`\`
|
|
5219
|
+
- Development pipeline agents (used by /project:develop). Each agent should include a modelRouting field in its YAML frontmatter:
|
|
5220
|
+
- \`@architect\` (default: opus) \u2014 conducts spec interview with user, writes confirmed spec to docs/SPRINT.md with numbered acceptance criteria. Your spec is a CONTRACT \u2014 the verifier will check every criterion. Vague criteria = guaranteed rework.
|
|
5221
|
+
- \`@planner\` (default: sonnet, escalate to opus for cross-cutting changes) \u2014 reads spec and codebase, creates step-by-step implementation plan in docs/PLAN.md
|
|
5222
|
+
- \`@implementer\` (default: sonnet, escalate to opus for cross-cutting changes) \u2014 TDD-focused implementation, writes failing tests then minimum code to pass
|
|
5223
|
+
- \`@fixer\` (default: sonnet, use haiku for single-file fixes) \u2014 targeted bug fixing from verifier/review feedback
|
|
5224
|
+
- \`@doc-updater\` (default: haiku) \u2014 extracts decisions and learnings from completed work, updates docs/DECISIONS.md and docs/LEARNINGS.md
|
|
5225
|
+
- \`/project:spec\` command (interview-based spec creation \u2014 asks 5-8 questions one at a time, writes structured spec to docs/SPRINT.md with ## Acceptance Criteria containing 3-8 numbered, testable conditions. Each criterion must be independently verifiable. Does NOT start coding until confirmed)
|
|
4910
5226
|
- \`/project:prove\` command (runs tests, shows git diff vs main, rates confidence HIGH/MEDIUM/LOW with evidence)
|
|
4911
5227
|
- \`/project:grill\` command (adversarial code review \u2014 challenges each change with "why this approach?", "what if X input?", rates BLOCKER/SHOULD-FIX/NITPICK, blocks until BLOCKERs resolved)
|
|
4912
5228
|
- \`/project:reset\` command (reads DECISIONS.md and LEARNINGS.md, proposes clean restart, stashes current work, implements elegant solution)
|
|
@@ -5059,6 +5375,12 @@ At the start of every session, before doing ANY work:
|
|
|
5059
5375
|
|
|
5060
5376
|
This saves 2-5 exploratory turns. Never ask "what files are here?" \u2014 look first.
|
|
5061
5377
|
|
|
5378
|
+
## Sprint Contract
|
|
5379
|
+
|
|
5380
|
+
Before implementing, confirm acceptance criteria exist in docs/SPRINT.md.
|
|
5381
|
+
Each criterion must be numbered, testable, and independently verifiable.
|
|
5382
|
+
After implementing, verify EACH criterion individually. Do not mark done until all pass.
|
|
5383
|
+
|
|
5062
5384
|
## Completion Standards
|
|
5063
5385
|
|
|
5064
5386
|
Never mark a task "done" without running the Completion Verification checklist.
|
|
@@ -5088,6 +5410,7 @@ Do not add generic filler. Every line must be specific to the user's workflow.
|
|
|
5088
5410
|
15. "Engineering Standards", "Tool Usage Policy", and "Code Philosophy" sections in CLAUDE.md
|
|
5089
5411
|
16. A "First Turn Protocol" section in CLAUDE.md (orient before working: pwd, ls, git status, check relevant runtimes, read task files)
|
|
5090
5412
|
17. A "Completion Standards" section in CLAUDE.md (never mark done without verifying: requirements met, tests passing, no debug artifacts, reviewed from 3 perspectives)
|
|
5413
|
+
18. A "Sprint Contract" section in CLAUDE.md (confirm acceptance criteria exist before implementing, verify each criterion after)
|
|
5091
5414
|
|
|
5092
5415
|
## Tool Selection Rules
|
|
5093
5416
|
|
|
@@ -8719,7 +9042,81 @@ async function listFilesRecursive(dir) {
|
|
|
8719
9042
|
await walk(dir);
|
|
8720
9043
|
return results;
|
|
8721
9044
|
}
|
|
8722
|
-
async function
|
|
9045
|
+
/**
 * Finds the highest-scoring harness produced by a PBT run in a workspace.
 *
 * Every entry under `<workspace>/branches` is inspected: its best iteration is
 * chosen with findBestIteration and that iteration's logged score is compared
 * against the running best (a missing score counts as 0). Afterwards the
 * `<workspace>/synthesis/harness` directory, if accessible, is considered as
 * well, using the log loaded for iteration number 999 — presumably the slot
 * reserved for the synthesis result; confirm against the writer.
 *
 * Failures while checking the synthesis harness are deliberately ignored so
 * that a missing synthesis step does not hide branch results.
 *
 * @param {string} workspacePath - Root directory of the evolve workspace.
 * @returns {Promise<{harnessPath: string, label: string} | null>} The winning
 *   harness directory with a human-readable label, or null when the branches
 *   directory cannot be read or no candidate was found.
 */
async function findBestPBTHarness(workspacePath) {
  const branchesRoot = path27.join(workspacePath, "branches");

  let branchIds;
  try {
    branchIds = await fs27.readdir(branchesRoot);
  } catch {
    // No branches directory (or unreadable): there are no PBT results.
    return null;
  }

  let topScore = -Infinity;
  let topPath = "";
  let topLabel = "";

  for (const branchId of branchIds) {
    const branchDir = path27.join(branchesRoot, branchId);
    const iterations = await listIterations(branchDir);
    if (iterations.length === 0) {
      continue;
    }
    const winningIteration = await findBestIteration(branchDir, iterations);
    const iterationLog = await loadIterationLog(branchDir, winningIteration);
    const branchScore = iterationLog?.score ?? 0;
    if (!(branchScore > topScore)) {
      continue;
    }
    topScore = branchScore;
    topPath = path27.join(branchDir, "iterations", winningIteration.toString(), "harness");
    topLabel = `branch ${branchId}, iteration ${winningIteration} (${branchScore.toFixed(1)}%)`;
  }

  const synthesisHarness = path27.join(workspacePath, "synthesis", "harness");
  try {
    await fs27.access(synthesisHarness);
    const synthesisLog = await loadIterationLog(workspacePath, 999);
    const synthScore = synthesisLog?.score ?? 0;
    if (synthScore > topScore) {
      topScore = synthScore;
      topPath = synthesisHarness;
      topLabel = `Meta-Principal synthesis (${synthScore.toFixed(1)}%)`;
    }
  } catch {
    // Best effort: an absent or unreadable synthesis harness is not an error.
  }

  return topPath ? { harnessPath: topPath, label: topLabel } : null;
}
|
|
9084
|
+
async function applyEvolution(workspacePath, projectRoot, targetIteration, pbt) {
|
|
9085
|
+
if (pbt) {
|
|
9086
|
+
const pbtResult = await findBestPBTHarness(workspacePath);
|
|
9087
|
+
if (!pbtResult) {
|
|
9088
|
+
throw new Error("No PBT results found. Run `kairn evolve pbt` first.");
|
|
9089
|
+
}
|
|
9090
|
+
const claudeDir2 = path27.join(projectRoot, ".claude");
|
|
9091
|
+
const diffPreview2 = await generateDiff2(claudeDir2, pbtResult.harnessPath);
|
|
9092
|
+
const currentFiles2 = await listFilesRecursive(claudeDir2);
|
|
9093
|
+
const targetFiles2 = await listFilesRecursive(pbtResult.harnessPath);
|
|
9094
|
+
const allPaths2 = /* @__PURE__ */ new Set([...currentFiles2, ...targetFiles2]);
|
|
9095
|
+
const filesChanged2 = [];
|
|
9096
|
+
for (const filePath of allPaths2) {
|
|
9097
|
+
const currentContent = await fs27.readFile(path27.join(claudeDir2, filePath), "utf-8").catch(() => null);
|
|
9098
|
+
const targetContent = await fs27.readFile(path27.join(pbtResult.harnessPath, filePath), "utf-8").catch(() => null);
|
|
9099
|
+
if (currentContent !== targetContent) {
|
|
9100
|
+
filesChanged2.push(filePath);
|
|
9101
|
+
}
|
|
9102
|
+
}
|
|
9103
|
+
await fs27.rm(claudeDir2, { recursive: true, force: true });
|
|
9104
|
+
await copyDir(pbtResult.harnessPath, claudeDir2);
|
|
9105
|
+
const harnessMcpJson2 = path27.join(pbtResult.harnessPath, ".mcp.json");
|
|
9106
|
+
const projectMcpJson2 = path27.join(projectRoot, ".mcp.json");
|
|
9107
|
+
try {
|
|
9108
|
+
await fs27.access(harnessMcpJson2);
|
|
9109
|
+
await fs27.copyFile(harnessMcpJson2, projectMcpJson2);
|
|
9110
|
+
if (!filesChanged2.includes(".mcp.json")) filesChanged2.push(".mcp.json");
|
|
9111
|
+
} catch {
|
|
9112
|
+
}
|
|
9113
|
+
return {
|
|
9114
|
+
iteration: -1,
|
|
9115
|
+
// signals PBT source
|
|
9116
|
+
filesChanged: filesChanged2,
|
|
9117
|
+
diffPreview: diffPreview2
|
|
9118
|
+
};
|
|
9119
|
+
}
|
|
8723
9120
|
const iterations = await listIterations(workspacePath);
|
|
8724
9121
|
if (iterations.length === 0) {
|
|
8725
9122
|
throw new Error("No iterations found in workspace. Run `kairn evolve run` first.");
|
|
@@ -8778,7 +9175,7 @@ async function applyEvolution(workspacePath, projectRoot, targetIteration) {
|
|
|
8778
9175
|
// src/commands/evolve.ts
|
|
8779
9176
|
var DEFAULT_CONFIG = {
|
|
8780
9177
|
model: "claude-sonnet-4-6",
|
|
8781
|
-
proposerModel: "claude-
|
|
9178
|
+
proposerModel: "claude-sonnet-4-6",
|
|
8782
9179
|
scorer: "pass-fail",
|
|
8783
9180
|
maxIterations: 5,
|
|
8784
9181
|
parallelTasks: 1,
|
|
@@ -9290,7 +9687,7 @@ evolveCommand.command("pbt").description("Run Population-Based Training with par
|
|
|
9290
9687
|
process.exit(1);
|
|
9291
9688
|
}
|
|
9292
9689
|
});
|
|
9293
|
-
evolveCommand.command("apply").description("Apply the best evolved harness to your project").option("--iter <n>", "Apply a specific iteration instead of the best").option("--force", "Apply even if git working tree is dirty").option("--no-commit", "Skip automatic git commit after applying").action(async (options) => {
|
|
9690
|
+
evolveCommand.command("apply").description("Apply the best evolved harness to your project").option("--iter <n>", "Apply a specific iteration instead of the best").option("--pbt", "Apply best PBT result (branch winner or synthesis)").option("--force", "Apply even if git working tree is dirty").option("--no-commit", "Skip automatic git commit after applying").action(async (options) => {
|
|
9294
9691
|
try {
|
|
9295
9692
|
const projectRoot = process.cwd();
|
|
9296
9693
|
const workspace = path30.join(projectRoot, ".kairn-evolve");
|
|
@@ -9309,7 +9706,7 @@ evolveCommand.command("apply").description("Apply the best evolved harness to yo
|
|
|
9309
9706
|
process.exit(1);
|
|
9310
9707
|
}
|
|
9311
9708
|
}
|
|
9312
|
-
const result = await applyEvolution(workspace, projectRoot, targetIteration);
|
|
9709
|
+
const result = await applyEvolution(workspace, projectRoot, targetIteration, options.pbt);
|
|
9313
9710
|
if (result.diffPreview) {
|
|
9314
9711
|
console.log(ui.section("Changes"));
|
|
9315
9712
|
for (const line of result.diffPreview.split("\n")) {
|