kairn-cli 2.7.2 → 2.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +467 -33
- package/dist/cli.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -218,6 +218,7 @@ async function callLLM(config, userMessage, options) {
|
|
|
218
218
|
const maxTokens = options.maxTokens ?? 8192;
|
|
219
219
|
const { systemPrompt } = options;
|
|
220
220
|
const jsonMode = options.jsonMode ?? false;
|
|
221
|
+
const cacheControl = options.cacheControl ?? false;
|
|
221
222
|
const providerName = getProviderName(config.provider);
|
|
222
223
|
let apiKey = config.api_key;
|
|
223
224
|
if (config.auth_type === "claude-code-oauth") {
|
|
@@ -238,7 +239,7 @@ async function callLLM(config, userMessage, options) {
|
|
|
238
239
|
const response = await client2.messages.create({
|
|
239
240
|
model: config.model,
|
|
240
241
|
max_tokens: maxTokens,
|
|
241
|
-
system: systemPrompt,
|
|
242
|
+
system: cacheControl ? [{ type: "text", text: systemPrompt, cache_control: { type: "ephemeral" } }] : systemPrompt,
|
|
242
243
|
messages
|
|
243
244
|
});
|
|
244
245
|
const textBlock = response.content.find((block) => block.type === "text");
|
|
@@ -461,6 +462,96 @@ var init_exec = __esm({
|
|
|
461
462
|
});
|
|
462
463
|
|
|
463
464
|
// src/evolve/scorers.ts
|
|
465
|
+
/**
 * Try to score a rubric criterion from captured output alone, without an LLM call.
 *
 * The first word of the criterion selects the heuristic table:
 *   - "Ran ..."          -> RAN_COMMAND_EVIDENCE: score 1 when an evidence string is in the output.
 *   - "Zero ..."/"No ..." -> ABSENCE_PATTERNS: score 0 when a prohibited string is in the output.
 *   - "Use ..."/"Uses ..." -> PRESENCE_PATTERNS: score 1 when a search string is in the output.
 *   - "Call ..."/"Calls ..." -> CALL_PATTERNS: score 1 when the pattern itself is in the output.
 * Inside a table, the first entry whose keyword occurs in the criterion decides the verdict.
 * All comparisons are case-insensitive substring checks against stdout and stderr joined
 * by a newline.
 *
 * NOTE(review): because matching is by substring, a short search string also matches longer
 * words that contain it (for example "sync" inside "async").
 *
 * @param {string} criterionText - Rubric criterion text.
 * @param {string | null | undefined} stdout - Captured standard output; nullish counts as empty.
 * @param {string | null | undefined} stderr - Captured standard error; nullish counts as empty.
 * @returns {{ score: number, reasoning: string } | null} A verdict, or null when no heuristic
 *   applies (rubricScorer then falls through to its LLM-based scoring for that criterion).
 */
function scoreCriterionDeterministic(criterionText, stdout, stderr) {
  // A malformed rubric entry cannot be judged here; report "no verdict" instead of throwing.
  if (typeof criterionText !== "string") {
    return null;
  }
  const criterion = criterionText.trim();
  const criterionLower = criterionText.toLowerCase().trim();
  // Missing streams are treated as empty so they are not stringified into matchable text.
  const combined = `${stdout ?? ""}\n${stderr ?? ""}`.toLowerCase();

  const mentions = (keyword) => criterionLower.includes(keyword.toLowerCase());
  const firstInOutput = (candidates) =>
    candidates.find((candidate) => combined.includes(candidate.toLowerCase()));

  if (/^ran\b/i.test(criterion)) {
    const entry = RAN_COMMAND_EVIDENCE.find(
      (candidate) => candidate.keywords.some((keyword) => mentions(keyword))
    );
    if (entry === undefined) {
      return null;
    }
    const matchedEvidence = firstInOutput(entry.evidence);
    if (matchedEvidence !== undefined) {
      return {
        score: 1,
        reasoning: `Deterministic: found evidence of '${matchedEvidence}' in output`
      };
    }
    return {
      score: 0,
      reasoning: `Deterministic: no evidence of '${entry.keywords[0]}' found`
    };
  }

  if (/^(zero|no)\b/i.test(criterion)) {
    const entry = ABSENCE_PATTERNS.find(
      (candidate) => candidate.keywords.some((keyword) => mentions(keyword))
    );
    if (entry === undefined) {
      return null;
    }
    const matchedPattern = firstInOutput(entry.search);
    if (matchedPattern !== undefined) {
      return {
        score: 0,
        reasoning: `Deterministic: found '${matchedPattern}' which should be absent`
      };
    }
    return {
      score: 1,
      reasoning: `Deterministic: no prohibited pattern found in output`
    };
  }

  if (/^uses?\b/i.test(criterion)) {
    const entry = PRESENCE_PATTERNS.find((candidate) => mentions(candidate.keyword));
    if (entry === undefined) {
      return null;
    }
    if (firstInOutput(entry.search) !== undefined) {
      return {
        score: 1,
        reasoning: `Deterministic: found '${entry.keyword}' in output`
      };
    }
    return {
      score: 0,
      reasoning: `Deterministic: '${entry.keyword}' not found in output`
    };
  }

  if (/^calls?\b/i.test(criterion)) {
    const pattern = CALL_PATTERNS.find((candidate) => mentions(candidate));
    if (pattern === undefined) {
      return null;
    }
    if (combined.includes(pattern.toLowerCase())) {
      return {
        score: 1,
        reasoning: `Deterministic: found '${pattern}' in output`
      };
    }
    return {
      score: 0,
      reasoning: `Deterministic: '${pattern}' not found in output`
    };
  }

  return null;
}
|
|
464
555
|
async function passFailScorer(task, workspacePath, stdout, stderr) {
|
|
465
556
|
const outcomes = Array.isArray(task.expected_outcome) ? task.expected_outcome : task.expected_outcome.split("\n");
|
|
466
557
|
const commands = outcomes.map((line) => line.replace(/^-\s*/, "").trim()).filter((line) => COMMAND_PATTERN.test(line));
|
|
@@ -513,7 +604,8 @@ async function llmJudgeScorer(task, workspacePath, stdout, stderr, config) {
|
|
|
513
604
|
try {
|
|
514
605
|
const response = await callLLM(config, userMessage, {
|
|
515
606
|
systemPrompt: JUDGE_SYSTEM_PROMPT,
|
|
516
|
-
maxTokens: 1024
|
|
607
|
+
maxTokens: 1024,
|
|
608
|
+
cacheControl: true
|
|
517
609
|
});
|
|
518
610
|
let cleaned = response.trim();
|
|
519
611
|
if (cleaned.startsWith("```")) {
|
|
@@ -544,6 +636,20 @@ async function rubricScorer(task, workspacePath, stdout, stderr, config) {
|
|
|
544
636
|
const breakdown = [];
|
|
545
637
|
let weightedSum = 0;
|
|
546
638
|
for (const criterion of task.rubric) {
|
|
639
|
+
const deterministicResult = scoreCriterionDeterministic(
|
|
640
|
+
criterion.criterion,
|
|
641
|
+
stdout,
|
|
642
|
+
stderr
|
|
643
|
+
);
|
|
644
|
+
if (deterministicResult !== null) {
|
|
645
|
+
breakdown.push({
|
|
646
|
+
criterion: criterion.criterion,
|
|
647
|
+
score: deterministicResult.score,
|
|
648
|
+
weight: criterion.weight
|
|
649
|
+
});
|
|
650
|
+
weightedSum += deterministicResult.score * criterion.weight;
|
|
651
|
+
continue;
|
|
652
|
+
}
|
|
547
653
|
const userMessage = [
|
|
548
654
|
"## Task",
|
|
549
655
|
task.description,
|
|
@@ -560,7 +666,8 @@ async function rubricScorer(task, workspacePath, stdout, stderr, config) {
|
|
|
560
666
|
try {
|
|
561
667
|
const response = await callLLM(config, userMessage, {
|
|
562
668
|
systemPrompt: RUBRIC_SYSTEM_PROMPT,
|
|
563
|
-
maxTokens: 512
|
|
669
|
+
maxTokens: 512,
|
|
670
|
+
cacheControl: true
|
|
564
671
|
});
|
|
565
672
|
let cleaned = response.trim();
|
|
566
673
|
if (cleaned.startsWith("```")) {
|
|
@@ -638,7 +745,7 @@ async function scoreTask(task, workspacePath, stdout, stderr, config) {
|
|
|
638
745
|
}
|
|
639
746
|
return score;
|
|
640
747
|
}
|
|
641
|
-
var COMMAND_PATTERN, SHELL_METACHAR_PATTERN, JUDGE_SYSTEM_PROMPT, RUBRIC_SYSTEM_PROMPT;
|
|
748
|
+
var COMMAND_PATTERN, SHELL_METACHAR_PATTERN, JUDGE_SYSTEM_PROMPT, RUBRIC_SYSTEM_PROMPT, RAN_COMMAND_EVIDENCE, ABSENCE_PATTERNS, PRESENCE_PATTERNS, CALL_PATTERNS;
|
|
642
749
|
var init_scorers = __esm({
|
|
643
750
|
"src/evolve/scorers.ts"() {
|
|
644
751
|
"use strict";
|
|
@@ -661,6 +768,31 @@ Return ONLY valid JSON:
|
|
|
661
768
|
"score": 0.0-1.0,
|
|
662
769
|
"reasoning": "Brief explanation"
|
|
663
770
|
}`;
|
|
771
|
+
RAN_COMMAND_EVIDENCE = [
|
|
772
|
+
{ keywords: ["npm run build", "build", "tsup"], evidence: ["build success", "tsup", "built in", "build completed"] },
|
|
773
|
+
{ keywords: ["tsc", "typecheck"], evidence: ["tsc", "typecheck"] },
|
|
774
|
+
{ keywords: ["npm run lint", "eslint", "lint"], evidence: ["lint", "eslint"] },
|
|
775
|
+
{ keywords: ["npm test", "vitest", "test"], evidence: ["vitest", "test files", "tests passed", "passed (", "tests "] }
|
|
776
|
+
];
|
|
777
|
+
ABSENCE_PATTERNS = [
|
|
778
|
+
{ keywords: [".then()", ".catch()"], search: [".then(", ".catch("] },
|
|
779
|
+
{ keywords: ["readfilesync", "writefilesync"], search: ["readfilesync", "writefilesync"] },
|
|
780
|
+
{ keywords: ["sync"], search: ["sync"] }
|
|
781
|
+
];
|
|
782
|
+
PRESENCE_PATTERNS = [
|
|
783
|
+
{ keyword: "chalk.green", search: ["chalk.green"] },
|
|
784
|
+
{ keyword: "chalk.yellow", search: ["chalk.yellow"] },
|
|
785
|
+
{ keyword: "chalk.red", search: ["chalk.red"] },
|
|
786
|
+
{ keyword: "chalk.cyan", search: ["chalk.cyan"] },
|
|
787
|
+
{ keyword: "fs.promises", search: ["fs.promises", "fs/promises"] },
|
|
788
|
+
{ keyword: "fs/promises", search: ["fs.promises", "fs/promises"] },
|
|
789
|
+
{ keyword: "async/await", search: ["async ", "await "] },
|
|
790
|
+
{ keyword: "@inquirer/prompts", search: ["@inquirer/prompts"] }
|
|
791
|
+
];
|
|
792
|
+
CALL_PATTERNS = [
|
|
793
|
+
"process.exit(1)",
|
|
794
|
+
"process.exit"
|
|
795
|
+
];
|
|
664
796
|
}
|
|
665
797
|
});
|
|
666
798
|
|
|
@@ -1329,7 +1461,8 @@ async function propose(iteration, workspacePath, harnessPath, history, tasks, co
|
|
|
1329
1461
|
const response = await callLLM(proposerConfig, userMessage, {
|
|
1330
1462
|
systemPrompt: PROPOSER_SYSTEM_PROMPT,
|
|
1331
1463
|
maxTokens: 8192,
|
|
1332
|
-
jsonMode: true
|
|
1464
|
+
jsonMode: true,
|
|
1465
|
+
cacheControl: true
|
|
1333
1466
|
});
|
|
1334
1467
|
return parseProposerResponse(response);
|
|
1335
1468
|
}
|
|
@@ -1831,7 +1964,22 @@ async function parseAgents(harnessPath) {
|
|
|
1831
1964
|
if (Array.isArray(disallowedTools)) {
|
|
1832
1965
|
node.disallowedTools = disallowedTools;
|
|
1833
1966
|
}
|
|
1834
|
-
const
|
|
1967
|
+
const modelRouting = frontmatter["modelRouting"];
|
|
1968
|
+
if (typeof modelRouting === "object" && modelRouting !== null) {
|
|
1969
|
+
const mr = modelRouting;
|
|
1970
|
+
if (typeof mr["default"] === "string") {
|
|
1971
|
+
node.modelRouting = {
|
|
1972
|
+
default: mr["default"]
|
|
1973
|
+
};
|
|
1974
|
+
if (typeof mr["escalateTo"] === "string") {
|
|
1975
|
+
node.modelRouting.escalateTo = mr["escalateTo"];
|
|
1976
|
+
}
|
|
1977
|
+
if (typeof mr["escalateWhen"] === "string") {
|
|
1978
|
+
node.modelRouting.escalateWhen = mr["escalateWhen"];
|
|
1979
|
+
}
|
|
1980
|
+
}
|
|
1981
|
+
}
|
|
1982
|
+
const knownKeys = /* @__PURE__ */ new Set(["name", "model", "disallowedTools", "modelRouting"]);
|
|
1835
1983
|
const extra = {};
|
|
1836
1984
|
for (const [key, value] of Object.entries(frontmatter)) {
|
|
1837
1985
|
if (!knownKeys.has(key)) {
|
|
@@ -2457,8 +2605,9 @@ function renderRuleWithFrontmatter(rule) {
|
|
|
2457
2605
|
function renderAgentWithFrontmatter(agent) {
|
|
2458
2606
|
const hasModel = agent.model !== void 0;
|
|
2459
2607
|
const hasDisallowed = agent.disallowedTools !== void 0 && agent.disallowedTools.length > 0;
|
|
2608
|
+
const hasRouting = agent.modelRouting !== void 0;
|
|
2460
2609
|
const hasExtra = agent.extraFrontmatter !== void 0 && Object.keys(agent.extraFrontmatter).length > 0;
|
|
2461
|
-
if (!hasModel && !hasDisallowed && !hasExtra) {
|
|
2610
|
+
if (!hasModel && !hasDisallowed && !hasRouting && !hasExtra) {
|
|
2462
2611
|
return agent.content;
|
|
2463
2612
|
}
|
|
2464
2613
|
const yamlLines = ["---"];
|
|
@@ -2471,6 +2620,16 @@ function renderAgentWithFrontmatter(agent) {
|
|
|
2471
2620
|
yamlLines.push(` - ${tool}`);
|
|
2472
2621
|
}
|
|
2473
2622
|
}
|
|
2623
|
+
if (hasRouting) {
|
|
2624
|
+
yamlLines.push("modelRouting:");
|
|
2625
|
+
yamlLines.push(` default: ${agent.modelRouting.default}`);
|
|
2626
|
+
if (agent.modelRouting.escalateTo) {
|
|
2627
|
+
yamlLines.push(` escalateTo: ${agent.modelRouting.escalateTo}`);
|
|
2628
|
+
}
|
|
2629
|
+
if (agent.modelRouting.escalateWhen) {
|
|
2630
|
+
yamlLines.push(` escalateWhen: ${agent.modelRouting.escalateWhen}`);
|
|
2631
|
+
}
|
|
2632
|
+
}
|
|
2474
2633
|
if (hasExtra) {
|
|
2475
2634
|
for (const [key, value] of Object.entries(agent.extraFrontmatter)) {
|
|
2476
2635
|
if (Array.isArray(value)) {
|
|
@@ -3350,6 +3509,93 @@ var init_regularization = __esm({
|
|
|
3350
3509
|
}
|
|
3351
3510
|
});
|
|
3352
3511
|
|
|
3512
|
+
// src/evolve/targeting.ts
|
|
3513
|
+
/**
 * Map a single harness mutation to the harness aspect it touches.
 *
 * Possible results: "conventions", "commands", "verification", "architecture",
 * "rules", "agents", "mcp", "settings" or "general".
 *
 * @param {{ type: string, sectionId?: string, section?: { id: string } }} mutation
 *   Mutation to classify. `update_section` reads `sectionId`; `add_section` reads `section.id`.
 * @returns {string} The aspect name. Unrecognized mutation types and section ids map to
 *   "general", which shouldReEvaluate treats as "re-run every task".
 */
function mutationToAspect(mutation) {
  // Shared by update_section and add_section so both classify section ids identically.
  const aspectForSection = (id) => {
    switch (id) {
      case "conventions":
      case "gotchas":
      case "debugging":
      case "git":
        return "conventions";
      case "commands":
      case "custom-key-commands":
        return "commands";
      case "verification":
        return "verification";
      case "architecture":
        return "architecture";
      default:
        return "general";
    }
  };

  switch (mutation.type) {
    case "update_section":
      return aspectForSection(mutation.sectionId);
    case "add_section":
      return aspectForSection(mutation.section.id);
    case "remove_section":
    case "reorder_section":
      return "general";
    case "add_command":
    case "update_command":
    case "remove_command":
      return "commands";
    case "add_rule":
    case "update_rule":
    case "remove_rule":
      return "rules";
    case "add_agent":
    case "update_agent":
    case "remove_agent":
      return "agents";
    case "add_mcp_server":
    case "remove_mcp_server":
      return "mcp";
    case "update_settings":
      return "settings";
    case "raw_text":
      return "general";
    default:
      // An unknown mutation kind used to fall out of the switch and yield undefined,
      // which matches no task aspect. Classify it as "general" so nothing is skipped.
      return "general";
  }
}
|
|
3555
|
+
/**
 * Collect the distinct aspects touched by a list of mutations.
 *
 * @param {Iterable<object>} mutations - Mutations to classify via mutationToAspect.
 * @returns {Set<string>} One entry per distinct aspect, in first-seen order.
 */
function mutationsToAspects(mutations) {
  const touched = [...mutations].map((mutation) => mutationToAspect(mutation));
  return new Set(touched);
}
|
|
3562
|
+
/**
 * Look up which harness aspects a task depends on, based on its template name.
 *
 * @param {{ template: string }} task - Task whose `template` keys into TEMPLATE_ASPECTS.
 * @returns {Set<string>} The template's aspects, or a set holding only "general" when the
 *   template has no aspect list.
 */
function taskDependsOnAspects(task) {
  const aspects = TEMPLATE_ASPECTS[task.template];
  // Only accept a real aspect list. A template name that collides with an inherited
  // object property (e.g. "constructor") would otherwise reach new Set() as a
  // non-iterable value and throw.
  return new Set(Array.isArray(aspects) ? aspects : ["general"]);
}
|
|
3566
|
+
/**
 * Decide whether a task has to be re-run after a set of aspects changed.
 *
 * @param {object} task - Task passed on to taskDependsOnAspects.
 * @param {Set<string>} changedAspects - Aspects touched by the applied mutations.
 * @returns {boolean} true when "general" changed, when the task depends on "general",
 *   or when the task depends on at least one changed aspect; false otherwise
 *   (including when nothing changed at all).
 */
function shouldReEvaluate(task, changedAspects) {
  if (changedAspects.has("general")) {
    return true;
  }
  if (changedAspects.size === 0) {
    return false;
  }
  const dependencies = taskDependsOnAspects(task);
  return (
    dependencies.has("general") ||
    [...dependencies].some((aspect) => changedAspects.has(aspect))
  );
}
|
|
3576
|
+
/**
 * Keep only the tasks that shouldReEvaluate says must be re-run.
 *
 * @param {object[]} tasks - Candidate tasks.
 * @param {Set<string>} changedAspects - Aspects touched by the applied mutations.
 * @returns {object[]} A new array with the affected tasks, in their original order.
 */
function filterTasksByAspects(tasks, changedAspects) {
  const isAffected = (task) => shouldReEvaluate(task, changedAspects);
  return tasks.filter(isAffected);
}
|
|
3579
|
+
// Task template name -> harness aspects the template depends on.
// Assigned when the module initializer below runs.
var TEMPLATE_ASPECTS;
// Module initializer for src/evolve/targeting.ts, wrapped by the bundler's __esm helper.
var init_targeting = __esm({
  "src/evolve/targeting.ts"() {
    "use strict";
    TEMPLATE_ASPECTS = {
      // Templates tied to specific parts of the harness.
      "convention-adherence": ["conventions", "rules"],
      "workflow-compliance": ["commands", "verification"],
      "rule-compliance": ["rules"],
      "intent-routing": ["settings"],
      refactor: ["architecture", "conventions"],
      "test-writing": ["verification", "commands"],
      "config-change": ["settings", "mcp"],
      "persistence-completion": ["commands", "verification"],
      // Templates listed as depending on "general".
      "add-feature": ["general"],
      "fix-bug": ["general"],
      documentation: ["general"]
    };
  }
});
|
|
3598
|
+
|
|
3353
3599
|
// src/evolve/loop.ts
|
|
3354
3600
|
import fs25 from "fs/promises";
|
|
3355
3601
|
import path25 from "path";
|
|
@@ -3382,6 +3628,7 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
|
|
|
3382
3628
|
}
|
|
3383
3629
|
}
|
|
3384
3630
|
}
|
|
3631
|
+
let lastChangedAspects = null;
|
|
3385
3632
|
let rngState = evolveConfig.rngSeed ?? 42;
|
|
3386
3633
|
const rng = () => {
|
|
3387
3634
|
rngState = rngState * 1664525 + 1013904223 & 4294967295;
|
|
@@ -3428,6 +3675,22 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
|
|
|
3428
3675
|
tasksToRun.push(task);
|
|
3429
3676
|
}
|
|
3430
3677
|
}
|
|
3678
|
+
if (lastChangedAspects !== null) {
|
|
3679
|
+
const targetedTasks = filterTasksByAspects(tasksToRun, lastChangedAspects);
|
|
3680
|
+
const skippedByTargeting = tasksToRun.filter((t) => !targetedTasks.includes(t));
|
|
3681
|
+
for (const task of skippedByTargeting) {
|
|
3682
|
+
const prev = prevLog.taskResults[task.id];
|
|
3683
|
+
const prevVal = prev ? prev.score ?? (prev.pass ? 100 : 0) : 0;
|
|
3684
|
+
carriedScores[task.id] = { pass: prevVal >= 50, score: prevVal };
|
|
3685
|
+
onProgress?.({
|
|
3686
|
+
type: "task-skipped",
|
|
3687
|
+
iteration: iter,
|
|
3688
|
+
taskId: task.id,
|
|
3689
|
+
message: `Skipped ${task.id} (unaffected by mutations)`
|
|
3690
|
+
});
|
|
3691
|
+
}
|
|
3692
|
+
tasksToRun = targetedTasks;
|
|
3693
|
+
}
|
|
3431
3694
|
const sampleSize = evolveConfig.evalSampleSize;
|
|
3432
3695
|
if (sampleSize > 0 && sampleSize < tasksToRun.length) {
|
|
3433
3696
|
let sampled;
|
|
@@ -3583,6 +3846,13 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
|
|
|
3583
3846
|
}
|
|
3584
3847
|
const nextIterDir2 = path25.join(workspacePath, "iterations", (iter + 1).toString());
|
|
3585
3848
|
await applyMutations(bestHarnessPath, nextIterDir2, rollbackProposal.mutations);
|
|
3849
|
+
try {
|
|
3850
|
+
const rollbackIR = await parseHarness(bestHarnessPath);
|
|
3851
|
+
const irMuts = translateMutations(rollbackProposal.mutations, rollbackIR);
|
|
3852
|
+
lastChangedAspects = mutationsToAspects(irMuts);
|
|
3853
|
+
} catch {
|
|
3854
|
+
lastChangedAspects = null;
|
|
3855
|
+
}
|
|
3586
3856
|
onProgress?.({
|
|
3587
3857
|
type: "mutations-applied",
|
|
3588
3858
|
iteration: iter,
|
|
@@ -3687,8 +3957,16 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
|
|
|
3687
3957
|
proposal.mutations
|
|
3688
3958
|
);
|
|
3689
3959
|
diffPatch = mutationResult.diffPatch;
|
|
3960
|
+
try {
|
|
3961
|
+
const currentIR = await parseHarness(harnessPath);
|
|
3962
|
+
const irMuts = translateMutations(proposal.mutations, currentIR);
|
|
3963
|
+
lastChangedAspects = mutationsToAspects(irMuts);
|
|
3964
|
+
} catch {
|
|
3965
|
+
lastChangedAspects = null;
|
|
3966
|
+
}
|
|
3690
3967
|
} catch {
|
|
3691
3968
|
await copyDir(harnessPath, path25.join(nextIterDir, "harness"));
|
|
3969
|
+
lastChangedAspects = null;
|
|
3692
3970
|
}
|
|
3693
3971
|
onProgress?.({
|
|
3694
3972
|
type: "mutations-applied",
|
|
@@ -3787,6 +4065,8 @@ var init_loop = __esm({
|
|
|
3787
4065
|
init_sampling();
|
|
3788
4066
|
init_regularization();
|
|
3789
4067
|
init_parser();
|
|
4068
|
+
init_translate();
|
|
4069
|
+
init_targeting();
|
|
3790
4070
|
}
|
|
3791
4071
|
});
|
|
3792
4072
|
|
|
@@ -3917,7 +4197,8 @@ ${userMessage}`;
|
|
|
3917
4197
|
const response = await callLLM(proposerConfig, fullMessage, {
|
|
3918
4198
|
systemPrompt,
|
|
3919
4199
|
maxTokens: 8192,
|
|
3920
|
-
jsonMode: true
|
|
4200
|
+
jsonMode: true,
|
|
4201
|
+
cacheControl: true
|
|
3921
4202
|
});
|
|
3922
4203
|
const proposal = parseProposerResponse(response);
|
|
3923
4204
|
return {
|
|
@@ -4629,7 +4910,7 @@ You must output a JSON object matching the SkeletonSpec schema.
|
|
|
4629
4910
|
- MCP servers: maximum 6. Prefer fewer.
|
|
4630
4911
|
- Skills: maximum 3. Only include directly relevant ones.
|
|
4631
4912
|
- Agents: maximum 5. Orchestration pipeline (/develop) agents.
|
|
4632
|
-
- Hooks: maximum
|
|
4913
|
+
- Hooks: maximum 5 (auto-format, block-destructive, PostCompact, memory-persistence, plus one contextual).
|
|
4633
4914
|
|
|
4634
4915
|
If the workflow doesn't clearly need a tool, DO NOT include it.
|
|
4635
4916
|
Each MCP server costs 500-2000 tokens of context window.
|
|
@@ -4752,6 +5033,12 @@ At the start of every session, before doing ANY work:
|
|
|
4752
5033
|
|
|
4753
5034
|
This saves 2-5 exploratory turns. Never ask "what files are here?" \u2014 look first.
|
|
4754
5035
|
|
|
5036
|
+
## Sprint Contract
|
|
5037
|
+
|
|
5038
|
+
Before implementing, confirm acceptance criteria exist in docs/SPRINT.md.
|
|
5039
|
+
Each criterion must be numbered, testable, and independently verifiable.
|
|
5040
|
+
After implementing, verify EACH criterion individually. Do not mark done until all pass.
|
|
5041
|
+
|
|
4755
5042
|
## Completion Standards
|
|
4756
5043
|
|
|
4757
5044
|
Never mark a task "done" without running the Completion Verification checklist.
|
|
@@ -4781,6 +5068,7 @@ Do not add generic filler. Every line must be specific to the user's workflow.
|
|
|
4781
5068
|
15. "Engineering Standards", "Tool Usage Policy", and "Code Philosophy" sections in CLAUDE.md
|
|
4782
5069
|
16. A "First Turn Protocol" section in CLAUDE.md (orient before working: pwd, ls, git status, check relevant runtimes, read task files)
|
|
4783
5070
|
17. A "Completion Standards" section in CLAUDE.md (never mark done without verifying: requirements met, tests passing, no debug artifacts, reviewed from 3 perspectives)
|
|
5071
|
+
18. A "Sprint Contract" section in CLAUDE.md (confirm acceptance criteria exist before implementing, verify each criterion after)
|
|
4784
5072
|
|
|
4785
5073
|
## Shell-Integrated Commands
|
|
4786
5074
|
|
|
@@ -4837,17 +5125,19 @@ Only generate scoped rules when the workflow involves multiple code domains.
|
|
|
4837
5125
|
|
|
4838
5126
|
Generate hooks in settings.json based on project type:
|
|
4839
5127
|
|
|
4840
|
-
**All code projects** \u2014 block destructive commands:
|
|
5128
|
+
**All code projects** \u2014 block destructive commands, credential leaks, injection, and network exfiltration:
|
|
4841
5129
|
\`\`\`json
|
|
4842
5130
|
{
|
|
4843
5131
|
"hooks": {
|
|
4844
|
-
"PreToolUse": [
|
|
4845
|
-
|
|
4846
|
-
|
|
4847
|
-
"
|
|
4848
|
-
|
|
4849
|
-
|
|
4850
|
-
|
|
5132
|
+
"PreToolUse": [
|
|
5133
|
+
{
|
|
5134
|
+
"matcher": "Bash",
|
|
5135
|
+
"hooks": [{
|
|
5136
|
+
"type": "command",
|
|
5137
|
+
"command": "CMD=$(cat | jq -r '.tool_input.command // empty') && echo \\"$CMD\\" | grep -qiE 'rm\\\\s+-rf\\\\s+/|DROP\\\\s+(TABLE|DATABASE)|curl.*\\\\|\\\\s*sh|:(){ :|:& };:|git\\\\s+push.*--force(?!-with-lease)|ch(mod|own).*-R\\\\s+/|npm\\\\s+publish(?!.*--dry-run)|(api[_-]?key|secret|token|password)\\\\s*[:=]|AKIA[0-9A-Z]{16}|BEGIN.*PRIVATE\\\\s+KEY|;\\\\s*(DROP|DELETE|ALTER|TRUNCATE)\\\\s+|\\\\.\\\\./\\\\.\\\\./\\\\.\\\\./|nc\\\\s+.*-e|/dev/tcp/|bash\\\\s+-i|curl.*-d.*@|wget.*--post-file' && echo 'Blocked dangerous command' >&2 && exit 2 || true"
|
|
5138
|
+
}]
|
|
5139
|
+
}
|
|
5140
|
+
]
|
|
4851
5141
|
}
|
|
4852
5142
|
}
|
|
4853
5143
|
\`\`\`
|
|
@@ -4889,6 +5179,23 @@ All projects should include a PostCompact hook to restore context after compacti
|
|
|
4889
5179
|
|
|
4890
5180
|
Merge this into the settings hooks alongside the PreToolUse and PostToolUse hooks.
|
|
4891
5181
|
|
|
5182
|
+
For long-running sessions (>2 hours or >3 compactions), prefer "Full Reset" over re-inject:
|
|
5183
|
+
replace the prompt-type PostCompact hook with a command-type hook that pipes CLAUDE.md + SPRINT.md + DECISIONS.md content directly into additionalContext.
|
|
5184
|
+
|
|
5185
|
+
## Memory Persistence Hooks
|
|
5186
|
+
|
|
5187
|
+
For projects with multi-session workflows, include SessionStart/End hooks that persist context to \`.claude/memory.json\`:
|
|
5188
|
+
- **SessionEnd:** Save recent decisions, sprint status, and known gotchas to \`.claude/memory.json\`
|
|
5189
|
+
- **SessionStart:** Load \`.claude/memory.json\` and inject as additionalContext
|
|
5190
|
+
|
|
5191
|
+
This ensures accumulated project knowledge survives session boundaries.
|
|
5192
|
+
|
|
5193
|
+
## Persistence Routing Hook
|
|
5194
|
+
|
|
5195
|
+
For code projects with autonomy level 3+, include a \`UserPromptSubmit\` hook that routes complex tasks through \`/project:persist\`.
|
|
5196
|
+
Generate \`.claude/hooks/persist-router.mjs\` (the routing logic) and add a \`UserPromptSubmit\` hook entry in settings.json that invokes it.
|
|
5197
|
+
Also add \`"persistence_routing": "auto"\` (level 3-4) or \`"persistence_routing": "manual"\` (level 1-2) to settings.json.
|
|
5198
|
+
|
|
4892
5199
|
## For Code Projects, Additionally Include
|
|
4893
5200
|
|
|
4894
5201
|
- \`/project:plan\` command (plan before coding)
|
|
@@ -4898,7 +5205,7 @@ Merge this into the settings hooks alongside the PreToolUse and PostToolUse hook
|
|
|
4898
5205
|
- \`/project:status\` command (live git status, recent commits, SPRINT.md overview using ! prefix)
|
|
4899
5206
|
- \`/project:fix\` command (takes $ARGUMENTS as issue number, plans fix, implements, tests, commits)
|
|
4900
5207
|
- \`/project:sprint\` command (define acceptance criteria before coding, writes to docs/SPRINT.md)
|
|
4901
|
-
- \`/project:develop\` command (full development pipeline \u2014 orchestrates @architect \u2192 @planner \u2192 @implementer \u2192 @verifier \u2192 @fixer \u2192 @grill \u2192 @doc-updater through spec, plan, TDD implement, review, and doc update phases). MUST include a Phase 7 "Completion Gate" that runs a Completion Verification checklist before marking the feature done: re-read original requirements, confirm each is met with evidence, run test suite + lint/typecheck, review git diff for unexpected changes or debug artifacts, answer 3 perspective questions (test engineer, code reviewer, requesting user). If ANY check fails, loop back to fix before completing.
|
|
5208
|
+
- \`/project:develop\` command (full development pipeline \u2014 orchestrates @architect \u2192 @planner \u2192 @implementer \u2192 @verifier \u2192 @fixer \u2192 @grill \u2192 @doc-updater through spec, plan, TDD implement, review, and doc update phases). Phase 4 (Verify) MUST validate EACH acceptance criterion from docs/SPRINT.md individually, reporting PASS/FAIL per item as a contract scorecard. MUST include a Phase 7 "Completion Gate" that runs a Completion Verification checklist before marking the feature done: re-read original requirements, confirm each is met with evidence, run test suite + lint/typecheck, review git diff for unexpected changes or debug artifacts, answer 3 perspective questions (test engineer, code reviewer, requesting user). If ANY check fails, loop back to fix before completing.
|
|
4902
5209
|
- A TDD skill using the 3-phase isolation pattern (RED \u2192 GREEN \u2192 REFACTOR):
|
|
4903
5210
|
- RED: Write failing test only. Verify it FAILS.
|
|
4904
5211
|
- GREEN: Write MINIMUM code to pass. Nothing extra.
|
|
@@ -4908,16 +5215,31 @@ Merge this into the settings hooks alongside the PreToolUse and PostToolUse hook
|
|
|
4908
5215
|
- \`@qa-orchestrator\` (sonnet) \u2014 delegates to linter and e2e-tester, compiles QA report
|
|
4909
5216
|
- \`@linter\` (haiku) \u2014 runs formatters, linters, security scanners
|
|
4910
5217
|
- \`@e2e-tester\` (sonnet, only when Playwright is in tools) \u2014 browser-based QA via Playwright
|
|
4911
|
-
-
|
|
4912
|
-
|
|
4913
|
-
|
|
4914
|
-
-
|
|
4915
|
-
-
|
|
4916
|
-
-
|
|
4917
|
-
|
|
5218
|
+
- A "Model Selection" section in generated agents:
|
|
5219
|
+
\`\`\`
|
|
5220
|
+
## Model Selection (all agents)
|
|
5221
|
+
- Haiku: simple file edits, linting, formatting, doc updates (<50 lines changed)
|
|
5222
|
+
- Sonnet: implementation, testing, debugging, code review (50-500 lines)
|
|
5223
|
+
- Opus: architecture decisions, spec writing, complex refactors (>500 lines or cross-cutting)
|
|
5224
|
+
Default: Sonnet. Only escalate to Opus when the task involves multi-file architecture or ambiguous requirements.
|
|
5225
|
+
\`\`\`
|
|
5226
|
+
- Development pipeline agents (used by /project:develop). Each agent should include a modelRouting field in its YAML frontmatter:
|
|
5227
|
+
- \`@architect\` (default: opus) \u2014 conducts spec interview with user, writes confirmed spec to docs/SPRINT.md with numbered acceptance criteria. Your spec is a CONTRACT \u2014 the verifier will check every criterion. Vague criteria = guaranteed rework.
|
|
5228
|
+
- \`@planner\` (default: sonnet, escalate to opus for cross-cutting changes) \u2014 reads spec and codebase, creates step-by-step implementation plan in docs/PLAN.md
|
|
5229
|
+
- \`@implementer\` (default: sonnet, escalate to opus for cross-cutting changes) \u2014 TDD-focused implementation, writes failing tests then minimum code to pass
|
|
5230
|
+
- \`@fixer\` (default: sonnet, use haiku for single-file fixes) \u2014 targeted bug fixing from verifier/review feedback
|
|
5231
|
+
- \`@doc-updater\` (default: haiku) \u2014 extracts decisions and learnings from completed work, updates docs/DECISIONS.md and docs/LEARNINGS.md
|
|
5232
|
+
- \`/project:spec\` command (interview-based spec creation \u2014 asks 5-8 questions one at a time, writes structured spec to docs/SPRINT.md with ## Acceptance Criteria containing 3-8 numbered, testable conditions. Each criterion must be independently verifiable. Does NOT start coding until confirmed)
|
|
4918
5233
|
- \`/project:prove\` command (runs tests, shows git diff vs main, rates confidence HIGH/MEDIUM/LOW with evidence)
|
|
4919
5234
|
- \`/project:grill\` command (adversarial code review \u2014 challenges each change with "why this approach?", "what if X input?", rates BLOCKER/SHOULD-FIX/NITPICK, blocks until BLOCKERs resolved)
|
|
4920
5235
|
- \`/project:reset\` command (reads DECISIONS.md and LEARNINGS.md, proposes clean restart, stashes current work, implements elegant solution)
|
|
5236
|
+
- \`/project:persist\` command (persistent execution loop \u2014 reads acceptance criteria from docs/SPRINT.md, works criterion-by-criterion with structured progress tracking in .claude/progress.json, auto-retries on verification failure up to 3 times per criterion, delegates to @grill for review gate before completion, resumes from progress.json if session was interrupted). The command protocol:
|
|
5237
|
+
1. Load or initialize .claude/progress.json from docs/SPRINT.md numbered acceptance criteria
|
|
5238
|
+
2. For each incomplete criterion: implement, run verification (build/test/typecheck/lint), mark PASSED or retry (max 3 attempts per criterion, mark BLOCKED after 3 failures)
|
|
5239
|
+
3. After all criteria attempted: if any BLOCKED report which and why; if all PASSED proceed to review gate
|
|
5240
|
+
4. Review gate: delegate to @grill for adversarial review; fix blockers if found (max 1 fix cycle)
|
|
5241
|
+
5. Persist state: write final progress.json; include progress summary in memory.json for session resume
|
|
5242
|
+
Resume protocol: when progress.json exists, skip PASSED criteria, resume from first non-PASSED criterion, carry forward failure notes from prior attempts.
|
|
4921
5243
|
|
|
4922
5244
|
## For Research Projects, Additionally Include
|
|
4923
5245
|
|
|
@@ -4962,7 +5284,7 @@ Return ONLY valid JSON matching this structure:
|
|
|
4962
5284
|
\`\`\`json
|
|
4963
5285
|
{
|
|
4964
5286
|
"claude_md": "Full CLAUDE.md content (under 150 lines)",
|
|
4965
|
-
"commands": { "help": "...", "develop": "...", "status": "...", "fix": "...", "sprint": "...", "spec": "...", "prove": "...", "grill": "...", "reset": "..." },
|
|
5287
|
+
"commands": { "help": "...", "develop": "...", "status": "...", "fix": "...", "sprint": "...", "spec": "...", "prove": "...", "grill": "...", "reset": "...", "persist": "..." },
|
|
4966
5288
|
"rules": { "continuity": "...", "security": "..." },
|
|
4967
5289
|
"agents": { "architect": "...", "planner": "...", "implementer": "...", "fixer": "...", "doc-updater": "...", "qa-orchestrator": "...", "linter": "...", "e2e-tester": "..." },
|
|
4968
5290
|
"skills": { "skill-name/SKILL": "..." },
|
|
@@ -5067,6 +5389,12 @@ At the start of every session, before doing ANY work:
|
|
|
5067
5389
|
|
|
5068
5390
|
This saves 2-5 exploratory turns. Never ask "what files are here?" \u2014 look first.
|
|
5069
5391
|
|
|
5392
|
+
## Sprint Contract
|
|
5393
|
+
|
|
5394
|
+
Before implementing, confirm acceptance criteria exist in docs/SPRINT.md.
|
|
5395
|
+
Each criterion must be numbered, testable, and independently verifiable.
|
|
5396
|
+
After implementing, verify EACH criterion individually. Do not mark done until all pass.
|
|
5397
|
+
|
|
5070
5398
|
## Completion Standards
|
|
5071
5399
|
|
|
5072
5400
|
Never mark a task "done" without running the Completion Verification checklist.
|
|
@@ -5096,6 +5424,7 @@ Do not add generic filler. Every line must be specific to the user's workflow.
|
|
|
5096
5424
|
15. "Engineering Standards", "Tool Usage Policy", and "Code Philosophy" sections in CLAUDE.md
|
|
5097
5425
|
16. A "First Turn Protocol" section in CLAUDE.md (orient before working: pwd, ls, git status, check relevant runtimes, read task files)
|
|
5098
5426
|
17. A "Completion Standards" section in CLAUDE.md (never mark done without verifying: requirements met, tests passing, no debug artifacts, reviewed from 3 perspectives)
|
|
5427
|
+
18. A "Sprint Contract" section in CLAUDE.md (confirm acceptance criteria exist before implementing, verify each criterion after)
|
|
5099
5428
|
|
|
5100
5429
|
## Tool Selection Rules
|
|
5101
5430
|
|
|
@@ -5114,7 +5443,7 @@ Do not add generic filler. Every line must be specific to the user's workflow.
|
|
|
5114
5443
|
- Skills: maximum 3. Only include directly relevant ones.
|
|
5115
5444
|
- Agents: maximum 5. Orchestration pipeline (/develop) agents.
|
|
5116
5445
|
- Commands: no limit (loaded on demand, zero context cost).
|
|
5117
|
-
- Hooks: maximum
|
|
5446
|
+
- Hooks: maximum 5 (auto-format, block-destructive, PostCompact, memory-persistence, plus one contextual).
|
|
5118
5447
|
|
|
5119
5448
|
If the workflow doesn't clearly need a tool, DO NOT include it.
|
|
5120
5449
|
Each MCP server costs 500-2000 tokens of context window.
|
|
@@ -5143,7 +5472,8 @@ Return ONLY valid JSON matching this structure:
|
|
|
5143
5472
|
},
|
|
5144
5473
|
"commands": {
|
|
5145
5474
|
"help": "markdown content for /project:help",
|
|
5146
|
-
"develop": "markdown content for /project:develop"
|
|
5475
|
+
"develop": "markdown content for /project:develop",
|
|
5476
|
+
"persist": "markdown content for /project:persist"
|
|
5147
5477
|
},
|
|
5148
5478
|
"rules": {
|
|
5149
5479
|
"continuity": "markdown content for continuity rule",
|
|
@@ -6220,6 +6550,9 @@ function applyAutonomyLevel(spec) {
|
|
|
6220
6550
|
const agents = spec.harness.agents ?? {};
|
|
6221
6551
|
const docs = spec.harness.docs ?? {};
|
|
6222
6552
|
const settings = spec.harness.settings ?? {};
|
|
6553
|
+
if (!("persistence_routing" in settings)) {
|
|
6554
|
+
settings.persistence_routing = level >= 3 ? "auto" : "manual";
|
|
6555
|
+
}
|
|
6223
6556
|
if (level >= 1) {
|
|
6224
6557
|
if (!("tour" in commands)) {
|
|
6225
6558
|
commands.tour = TOUR_COMMAND;
|
|
@@ -6291,6 +6624,85 @@ var ENV_LOADER_HOOK = {
|
|
|
6291
6624
|
command: 'if [ -f .env ] && [ -n "$CLAUDE_ENV_FILE" ]; then grep -v "^#" .env | grep -v "^$" | grep "=" >> "$CLAUDE_ENV_FILE"; fi'
|
|
6292
6625
|
}]
|
|
6293
6626
|
};
|
|
6627
|
+
/**
 * Source text of the generated `.claude/hooks/persist-router.mjs` script.
 *
 * The script runs as a UserPromptSubmit hook: it reads the hook payload (JSON)
 * from stdin, inspects `prompt` for complexity signals and, when routing
 * applies, emits `additionalContext` steering the session into the
 * /project:persist workflow. In every other case it emits `{ continue: true }`.
 *
 * Hardening compared with the previous template (routing decisions and the
 * emitted context text are unchanged for well-formed input):
 * - stdin is read through file descriptor 0 rather than a device path, so the
 *   hook also works where that path does not exist (e.g. Windows);
 * - an unreadable/malformed payload or a non-string `prompt` results in a
 *   plain pass-through instead of an uncaught exception;
 * - the script writes its result once and ends naturally, instead of forcing
 *   termination right after the write (which can drop output when stdout is
 *   an asynchronous pipe);
 * - settings.json is looked up under $CLAUDE_PROJECT_DIR (falling back to the
 *   current directory), matching how the hook command locates the script.
 *
 * NOTE: this is a template literal, so every backslash that must reach the
 * generated file is doubled here.
 */
var PERSIST_ROUTER_TEMPLATE = `import { readFileSync } from 'fs';

// Pass-through patterns (fast exit)
const PASSTHROUGH = /^(what|how|why|where|when|can you|does|is |show me|find |search |list |\\/project:)/i;
const SINGLE_FILE = /^(edit|fix the typo|update the comment|change the|rename) .{3,60}$/i;

// Complexity signals, evaluated in order
const SIGNAL_RULES = [
  ['multi-step', /\\b(then|after that|and also|next|finally|step \\d|first .* then)\\b/i],
  ['feature-scope', /\\b(add|implement|build|create|integrate|set up)\\b.*\\b(feature|auth|api|endpoint|page|component|module|service|database|migration)\\b/i],
  ['refactor-scope', /\\b(migrate|convert|replace|upgrade|refactor|rewrite|restructure)\\b/i],
  ['bug-with-repro', /\\b(when .* happens|steps to reproduce|broken|crash|regression|fails when)\\b/i],
  ['explicit', /\\b(persist|keep working|don't stop|until done|until .* pass)\\b/i],
];

// Read the hook payload from fd 0. Any read/parse problem or a non-string
// prompt yields an empty prompt, which takes the pass-through path below.
function readPrompt() {
  try {
    const input = JSON.parse(readFileSync(0, 'utf8'));
    return typeof input?.prompt === 'string' ? input.prompt.trim() : '';
  } catch {
    return '';
  }
}

// Check config for routing mode (defaults to auto when unreadable)
function readRoutingMode() {
  try {
    const root = process.env.CLAUDE_PROJECT_DIR || '.';
    const settings = JSON.parse(readFileSync(root + '/.claude/settings.json', 'utf8'));
    return settings.persistence_routing ?? 'auto';
  } catch {
    return 'auto';
  }
}

function collectSignals(prompt) {
  const signals = SIGNAL_RULES
    .filter(([, pattern]) => pattern.test(prompt))
    .map(([name]) => name);
  if (prompt.split(/\\s+/).length > 50) {
    signals.push('long-prompt');
  }
  return signals;
}

function decide(prompt) {
  if (PASSTHROUGH.test(prompt) || SINGLE_FILE.test(prompt) || prompt.length < 20) {
    return { continue: true };
  }

  const routingMode = readRoutingMode();
  if (routingMode === 'off') {
    return { continue: true };
  }

  const signals = collectSignals(prompt);
  const shouldRoute = routingMode === 'manual'
    ? signals.includes('explicit')
    : signals.length >= 2 || signals.includes('explicit');
  if (!shouldRoute) {
    return { continue: true };
  }

  return {
    continue: true,
    hookSpecificOutput: {
      hookEventName: 'UserPromptSubmit',
      additionalContext: [
        'PERSISTENCE ROUTING: This task has complexity signals (' + signals.join(', ') + ').',
        'Execute this using the /project:persist workflow:',
        '1. Ensure acceptance criteria exist in docs/SPRINT.md (create from this prompt if needed)',
        '2. Initialize .claude/progress.json',
        '3. Work criterion-by-criterion until all pass',
        '4. Run review gate before marking complete',
      ].join('\\n'),
    },
  };
}

process.stdout.write(JSON.stringify(decide(readPrompt())));
`;
|
|
6698
|
+
/**
 * Hook entry that runs the generated persist-router script with node.
 * `resolveSettings` appends it to the `UserPromptSubmit` hook list for code
 * projects at autonomy level 3 or higher. The script path is anchored on
 * $CLAUDE_PROJECT_DIR, and the entry carries an empty matcher and a timeout
 * value of 5.
 */
var PERSIST_ROUTER_HOOK = {
  matcher: "",
  hooks: [
    {
      type: "command",
      command: 'node "$CLAUDE_PROJECT_DIR/.claude/hooks/persist-router.mjs"',
      timeout: 5
    }
  ]
};
|
|
6294
6706
|
function resolveSettings(spec, options) {
|
|
6295
6707
|
const settings = spec.harness.settings;
|
|
6296
6708
|
const base = settings && Object.keys(settings).length > 0 ? { ...settings } : {};
|
|
@@ -6304,6 +6716,13 @@ function resolveSettings(spec, options) {
|
|
|
6304
6716
|
hooks.SessionStart = sessionStart;
|
|
6305
6717
|
base.hooks = hooks;
|
|
6306
6718
|
}
|
|
6719
|
+
if (isCodeProject(spec) && (spec.autonomy_level ?? 1) >= 3) {
|
|
6720
|
+
const hooks = base.hooks ?? {};
|
|
6721
|
+
const userPromptSubmit = hooks.UserPromptSubmit ?? [];
|
|
6722
|
+
userPromptSubmit.push(PERSIST_ROUTER_HOOK);
|
|
6723
|
+
hooks.UserPromptSubmit = userPromptSubmit;
|
|
6724
|
+
base.hooks = hooks;
|
|
6725
|
+
}
|
|
6307
6726
|
const hasIntentHooks = spec.harness.hooks && Object.keys(spec.harness.hooks).length > 0;
|
|
6308
6727
|
if (hasIntentHooks) {
|
|
6309
6728
|
const hooks = base.hooks ?? {};
|
|
@@ -6395,6 +6814,9 @@ function buildFileMap(spec, options) {
|
|
|
6395
6814
|
files.set(".claude/hooks/intent-log.jsonl", "");
|
|
6396
6815
|
}
|
|
6397
6816
|
}
|
|
6817
|
+
if (isCodeProject(spec) && (spec.autonomy_level ?? 1) >= 3) {
|
|
6818
|
+
files.set(".claude/hooks/persist-router.mjs", PERSIST_ROUTER_TEMPLATE);
|
|
6819
|
+
}
|
|
6398
6820
|
return files;
|
|
6399
6821
|
}
|
|
6400
6822
|
async function writeEnvironment(spec, targetDir, options) {
|
|
@@ -6465,6 +6887,11 @@ async function writeEnvironment(spec, targetDir, options) {
|
|
|
6465
6887
|
written.push(".claude/hooks/intent-log.jsonl");
|
|
6466
6888
|
}
|
|
6467
6889
|
}
|
|
6890
|
+
if (isCodeProject(spec) && (spec.autonomy_level ?? 1) >= 3) {
|
|
6891
|
+
const p = path5.join(claudeDir, "hooks", "persist-router.mjs");
|
|
6892
|
+
await writeFile(p, PERSIST_ROUTER_TEMPLATE);
|
|
6893
|
+
written.push(".claude/hooks/persist-router.mjs");
|
|
6894
|
+
}
|
|
6468
6895
|
return written;
|
|
6469
6896
|
}
|
|
6470
6897
|
function summarizeSpec(spec, registry) {
|
|
@@ -8163,14 +8590,20 @@ var EVAL_TEMPLATES = {
|
|
|
8163
8590
|
name: "Intent Routing",
|
|
8164
8591
|
description: "Test that natural language prompts route to the correct workflow command via intent hooks",
|
|
8165
8592
|
bestFor: ["feature-development", "full-stack", "api-building"]
|
|
8593
|
+
},
|
|
8594
|
+
"persistence-completion": {
|
|
8595
|
+
id: "persistence-completion",
|
|
8596
|
+
name: "Persistence Completion",
|
|
8597
|
+
description: "Can the agent complete a multi-criterion task using the persistence loop?",
|
|
8598
|
+
bestFor: ["feature-development", "full-stack", "api-building", "maintenance"]
|
|
8166
8599
|
}
|
|
8167
8600
|
};
|
|
8168
8601
|
function selectTemplatesForWorkflow(workflowType) {
|
|
8169
8602
|
const mapping = {
|
|
8170
|
-
"feature-development": ["add-feature", "test-writing", "convention-adherence", "workflow-compliance", "intent-routing"],
|
|
8171
|
-
"api-building": ["add-feature", "fix-bug", "test-writing", "convention-adherence"],
|
|
8172
|
-
"full-stack": ["add-feature", "fix-bug", "test-writing", "convention-adherence"],
|
|
8173
|
-
"maintenance": ["fix-bug", "refactor", "test-writing", "rule-compliance"],
|
|
8603
|
+
"feature-development": ["add-feature", "test-writing", "convention-adherence", "workflow-compliance", "intent-routing", "persistence-completion"],
|
|
8604
|
+
"api-building": ["add-feature", "fix-bug", "test-writing", "convention-adherence", "persistence-completion"],
|
|
8605
|
+
"full-stack": ["add-feature", "fix-bug", "test-writing", "convention-adherence", "persistence-completion"],
|
|
8606
|
+
"maintenance": ["fix-bug", "refactor", "test-writing", "rule-compliance", "persistence-completion"],
|
|
8174
8607
|
"debugging": ["fix-bug", "test-writing", "rule-compliance"],
|
|
8175
8608
|
"qa": ["fix-bug", "test-writing", "add-feature", "workflow-compliance"],
|
|
8176
8609
|
"architecture": ["refactor", "test-writing", "config-change", "convention-adherence"],
|
|
@@ -8191,6 +8624,7 @@ IMPORTANT: For harness-aware templates (convention-adherence, workflow-complianc
|
|
|
8191
8624
|
- convention-adherence: Task must require following specific conventions from CLAUDE.md (naming, file structure, patterns). Judge by whether output matches the conventions.
|
|
8192
8625
|
- workflow-compliance: Task must require using project slash commands or workflow steps defined in .claude/commands/. Judge by whether the agent followed the defined workflow.
|
|
8193
8626
|
- rule-compliance: Task must create a scenario where .claude/rules/ content is relevant. Judge by whether the agent respected all rules.
|
|
8627
|
+
- persistence-completion: Task MUST have 3+ acceptance criteria that require sequential implementation. The task description should be a realistic feature request \u2014 the agent must parse it into criteria. Judge by: (a) all criteria met (progress.json status: complete), (b) structured tracking used (progress.json exists with 3+ criteria), (c) tests pass, (d) review gate executed (progress.json review field present).
|
|
8194
8628
|
|
|
8195
8629
|
These harness-aware tasks are critical \u2014 they test whether the .claude/ environment actually improves agent behavior.
|
|
8196
8630
|
|
|
@@ -8860,7 +9294,7 @@ async function applyEvolution(workspacePath, projectRoot, targetIteration, pbt)
|
|
|
8860
9294
|
// src/commands/evolve.ts
|
|
8861
9295
|
var DEFAULT_CONFIG = {
|
|
8862
9296
|
model: "claude-sonnet-4-6",
|
|
8863
|
-
proposerModel: "claude-
|
|
9297
|
+
proposerModel: "claude-sonnet-4-6",
|
|
8864
9298
|
scorer: "pass-fail",
|
|
8865
9299
|
maxIterations: 5,
|
|
8866
9300
|
parallelTasks: 1,
|