@kody-ade/kody-engine 0.4.106 → 0.4.108

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/bin/kody.js CHANGED
@@ -880,7 +880,7 @@ var init_loadPriorArt = __esm({
880
880
  // package.json
881
881
  var package_default = {
882
882
  name: "@kody-ade/kody-engine",
883
- version: "0.4.106",
883
+ version: "0.4.108",
884
884
  description: "kody \u2014 autonomous development engine. Single-session Claude Code agent behind a generic executor + declarative executable profiles.",
885
885
  license: "MIT",
886
886
  type: "module",
@@ -2711,6 +2711,50 @@ init_issue();
2711
2711
  import { execFileSync as execFileSync30, spawn as spawn6 } from "child_process";
2712
2712
  import * as fs38 from "fs";
2713
2713
  import * as path36 from "path";
2714
+
2715
+ // src/discipline.ts
2716
+ var DISCIPLINE = `# Working discipline (applies to this entire task)
2717
+
2718
+ These rules override any instinct to take a shortcut. They exist because this
2719
+ work runs unattended \u2014 no human will catch a hand-waved claim before it ships.
2720
+
2721
+ ## Prove before you claim
2722
+ You do not get to decide you are "done". Your job is to make the work correct
2723
+ and to PROVE every claim you make; a separate wrapper verifies the result and
2724
+ will re-invoke you with the gap if proof is missing.
2725
+
2726
+ Before writing ANY success or completion statement ("done", "fixed", "passes",
2727
+ "works", "verified"):
2728
+ 1. IDENTIFY the exact command to run, or the exact file:line to read, that
2729
+ would prove the claim.
2730
+ 2. RUN/READ it fresh in this run. Do not rely on memory of an earlier output
2731
+ or on "it should pass".
2732
+ 3. READ the full output and exit code (or the actual lines you cited).
2733
+ 4. Only then make the claim \u2014 and make it WITH the evidence beside it.
2734
+
2735
+ "Great!" / "Perfect!" / "Done!" with nothing checked this turn is the same as
2736
+ claiming something you did not verify. Don't.
2737
+
2738
+ ## Do not rationalize past a step
2739
+ Violating the letter of an instruction is violating its spirit. If you catch
2740
+ yourself thinking any of the following, STOP \u2014 it is a red flag, not a green
2741
+ light:
2742
+
2743
+ | The thought | The reality |
2744
+ |---|---|
2745
+ | "This is too simple to test/verify." | Simple changes break callers too; the check is cheap, the silent regression is not. |
2746
+ | "I already verified this earlier." | Earlier is not now \u2014 state changed. Run it again. |
2747
+ | "The diff looks right, so it works." | Reading code is not running it. Run it. |
2748
+ | "I'll skip this one step to save time." | The skipped step is the one that fails later with no human watching. |
2749
+ | "It probably passes." | "Probably" is not evidence. Make it certain, or say you could not. |
2750
+
2751
+ ## When you genuinely cannot finish
2752
+ If you cannot complete or cannot verify something, say so plainly and name what
2753
+ is blocking you. An honest "I could not verify X because Y" is correct and
2754
+ useful. A confident claim you never checked is the most expensive failure mode
2755
+ here \u2014 never substitute it for the truth.`;
2756
+
2757
+ // src/executor.ts
2714
2758
  init_events();
2715
2759
 
2716
2760
  // src/lifecycleLabels.ts
@@ -3578,6 +3622,7 @@ function loadSubagents(profile) {
3578
3622
  const tools = fm.tools.split(",").map((t) => t.trim()).filter(Boolean);
3579
3623
  if (tools.length > 0) def.tools = tools;
3580
3624
  }
3625
+ if (fm.model) def.model = fm.model;
3581
3626
  agents[fm.name || name] = def;
3582
3627
  }
3583
3628
  return agents;
@@ -11251,7 +11296,9 @@ async function runExecutable(profileName, input) {
11251
11296
  maxTurns: profile.claudeCode.maxTurns,
11252
11297
  maxThinkingTokens: profile.claudeCode.maxThinkingTokens,
11253
11298
  maxTurnTimeoutMs: typeof profile.claudeCode.maxTurnTimeoutSec === "number" ? Math.floor(profile.claudeCode.maxTurnTimeoutSec * 1e3) : void 0,
11254
- systemPromptAppend: [profile.claudeCode.systemPromptAppend, taskArtifacts?.promptAddendum].filter((s) => typeof s === "string" && s.length > 0).join("\n\n") || void 0,
11299
+ // DISCIPLINE leads so the stable, role-agnostic block sits at the front
11300
+ // of the cacheable system-prompt prefix; profile/task appends follow.
11301
+ systemPromptAppend: [DISCIPLINE, profile.claudeCode.systemPromptAppend, taskArtifacts?.promptAddendum].filter((s) => typeof s === "string" && s.length > 0).join("\n\n") || void 0,
11255
11302
  cacheable: profile.claudeCode.cacheable,
11256
11303
  enableVerifyTool: profile.claudeCode.enableVerifyTool,
11257
11304
  verifyToolMaxAttempts: profile.claudeCode.verifyAttempts ?? null,
@@ -37,7 +37,7 @@
37
37
  "mcp__kody-verify"
38
38
  ],
39
39
  "hooks": ["block-git"],
40
- "skills": [],
40
+ "skills": ["systematic-debugging"],
41
41
  "commands": [],
42
42
  "subagents": [],
43
43
  "plugins": [],
@@ -39,7 +39,7 @@
39
39
  "mcp__kody-verify"
40
40
  ],
41
41
  "hooks": ["block-git"],
42
- "skills": [],
42
+ "skills": ["systematic-debugging"],
43
43
  "commands": [],
44
44
  "subagents": [],
45
45
  "plugins": [],
@@ -18,8 +18,11 @@ Return ONLY a concise findings block — no preamble, no final-plan formatting (
18
18
 
19
19
  ```
20
20
  AREA: <the area/files you were assigned>
21
+ - status: DONE | NEEDS_CONTEXT | BLOCKED
21
22
  - changes: <file:line — current state → target state, exact edit location>
22
23
  - pattern to reuse: <sibling path + which idioms/APIs are mirrored, or "new convention because …">
23
24
  - API surface: <symbol → definition path, or UNVERIFIED>
24
25
  - risks/edge cases/tests: <bullets an implementer must handle>
25
26
  ```
27
+
28
+ `status`: `DONE` = area fully investigated. `NEEDS_CONTEXT` = you need a file, boundary, or decision the lead must supply before you can finish — say exactly what. `BLOCKED` = the assigned area doesn't exist or the assignment is wrong — say why. Report `NEEDS_CONTEXT`/`BLOCKED` honestly; never pad the block with guesses to look complete.
@@ -23,7 +23,7 @@
23
23
  "Grep",
24
24
  "Glob",
25
25
  "Bash",
26
- "Task"
26
+ "Agent"
27
27
  ],
28
28
  "hooks": ["block-write"],
29
29
  "skills": [],
@@ -62,11 +62,12 @@ If a file you need to read does not exist, say so explicitly in the plan under "
62
62
 
63
63
  # Parallel investigation (do this to meet the Research floor faster)
64
64
 
65
- You have a `plan-scout` subagent available via the `Task` tool. Use it to satisfy the Research floor in parallel:
65
+ You have a `plan-scout` subagent available via the `Agent` tool. Use it to satisfy the Research floor in parallel:
66
66
 
67
67
  1. **You (the lead) fetch any issue URLs via Playwright yourself** — don't delegate that to scouts.
68
- 2. Identify the distinct areas/files this change will touch (e.g. "the field component", "the data hook", "the migration", "the tests"). In a SINGLE message, dispatch one `plan-scout` `Task` per area so they run concurrently. Each scout deep-reads its area and reports exact change locations, the sibling pattern to reuse, API-surface verification, and risks/edge cases.
68
+ 2. Identify the distinct areas/files this change will touch (e.g. "the field component", "the data hook", "the migration", "the tests"). In a SINGLE message, dispatch one `plan-scout` `Agent` call per area so they run concurrently. Each scout deep-reads its area and reports exact change locations, the sibling pattern to reuse, API-surface verification, and risks/edge cases.
69
69
  3. Wait for all scouts, then synthesize their findings into the plan below. Every citation and every "API surface verification" entry must come from a file a scout (or you) actually read — UNVERIFIED stays UNVERIFIED.
70
+ 4. **Check each scout's `status`.** A scout that returns `NEEDS_CONTEXT` or `BLOCKED` did not finish its area. Do NOT re-dispatch the same scout with the same instructions — that just burns a turn for the same result. Instead, change something: supply the context it asked for, narrow or redefine its area, or read that area yourself. Never loop an unchanged dispatch.
70
71
 
71
72
  For a small single-file change, one scout (or your own reading) is fine — don't manufacture parallelism that isn't there.
72
73
 
@@ -82,6 +83,15 @@ COMMIT_MSG: plan: <very short title>
82
83
  PR_SUMMARY:
83
84
  <A deep, detailed implementation plan in markdown with the following sections, in order. Omit a section only if its trigger condition is not met — do not leave placeholders. Depth is expected; brevity for its own sake is not a goal.
84
85
 
86
+ ## Requirement coverage
87
+ Enumerate EVERY discrete ask in the issue (and any answered clarifications) as a
88
+ checklist, each mapped to where this plan delivers it:
89
+ - <verbatim ask> → <the section/file in this plan that addresses it>
90
+ - <verbatim ask> → ⚠ MISSING — <what's needed, or why it can't be planned>
91
+ Do not finalize a plan that still has a ⚠ MISSING row unless that row is also
92
+ listed under "Ambiguities & assumptions" with a concrete blocker. Silently
93
+ dropping an ask the issue made is a planning failure.
94
+
85
95
  ## Existing patterns found
86
96
  For each major part of the change, name the sibling module in this repo that
87
97
  already solves a similar problem and state how this plan reuses it.
@@ -175,6 +185,8 @@ No filler. No marketing language. Depth over brevity.>
175
185
  - Read-only. Do NOT modify any file.
176
186
  - Do NOT run git or gh commands.
177
187
  - No speculative scope — plan only what the issue asks for, but plan it THOROUGHLY.
188
+ - **Deliver the full ask or split it — never silently shrink it.** Planning a reduced version of what the issue requested is the most damaging failure mode. When any of these phrases (or their intent) describe a *stated requirement* rather than a genuine deferred phase, treat it as a BLOCKER: `"v1"`, `"v2 later"`, `"simplified"`, `"basic version"`, `"minimal"`, `"static for now"`, `"hardcoded for now"`, `"placeholder"`, `"stub"`, `"will be wired later"`, `"future enhancement"`. If the full ask is genuinely too large for one plan, output `FAILED: scope too large — split into <sub-issues>` — do NOT quietly plan less than was asked.
189
+ - **Authority limits on narrowing scope.** You may narrow or split scope ONLY for concrete constraints: output/context-token budget, information you cannot obtain, or a dependency conflict. You may NOT narrow scope because a part looks hard, complex, or time-consuming — difficulty is never a license to reduce the ask.
178
190
  - **Plan length ≤ ~1500 lines / ~15k tokens.** Larger plans get truncated by output token caps before the closing `DONE` marker — and a truncated plan is worse than a smaller one. If a feature legitimately needs more, output `FAILED: scope too large for single plan — split into <list of sub-issues>` instead of overrunning.
179
191
  - If the issue is ambiguous and you cannot make progress without input, output `FAILED: <what's unclear>` instead of a plan.
180
192
  - If the Research floor cannot be met because required files are missing or unreadable, output `FAILED: <what could not be read>` instead of a half-blind plan.
@@ -17,8 +17,11 @@ Return ONLY a concise findings block — no preamble, no final-doc formatting (t
17
17
 
18
18
  ```
19
19
  AREA: <the area you were assigned>
20
+ - status: DONE | NEEDS_CONTEXT | BLOCKED
20
21
  - findings:
21
22
  - <file:line — what's there and why it matters for this issue>
22
23
  - patterns to reuse: <sibling module path + one line, or "none found (searched X)">
23
24
  - open questions / gaps: <anything an implementer still wouldn't know, or "none">
24
25
  ```
26
+
27
+ `status`: `DONE` = area fully investigated. `NEEDS_CONTEXT` = you need a file, boundary, or decision the lead must supply before you can finish — say exactly what. `BLOCKED` = the assigned area doesn't exist or the assignment is wrong — say why. Report `NEEDS_CONTEXT`/`BLOCKED` honestly; never pad the block with guesses to look complete.
@@ -22,7 +22,7 @@
22
22
  "Grep",
23
23
  "Glob",
24
24
  "Bash",
25
- "Task",
25
+ "Agent",
26
26
  "mcp__playwright"
27
27
  ],
28
28
  "hooks": ["block-write"],
@@ -35,11 +35,12 @@ If a prior-art block is present above, scan the diffs and review comments — th
35
35
 
36
36
  # Parallel investigation (do this before writing the doc)
37
37
 
38
- You have a `research-scout` subagent available via the `Task` tool. Use it to investigate the repo in parallel:
38
+ You have a `research-scout` subagent available via the `Agent` tool. Use it to investigate the repo in parallel:
39
39
 
40
40
  1. **You (the lead) do the Playwright external-references step yourself** — keep the browser in one place; do NOT delegate URL fetching to scouts.
41
- 2. From the issue, identify 2–4 distinct investigation areas (e.g. "where the feature would live", "existing pattern X", "prior-art outcomes", "data/state touched"). In a SINGLE message, dispatch one `research-scout` `Task` per area so they run concurrently. Give each scout its specific area and the issue context.
41
+ 2. From the issue, identify 2–4 distinct investigation areas (e.g. "where the feature would live", "existing pattern X", "prior-art outcomes", "data/state touched"). In a SINGLE message, dispatch one `research-scout` `Agent` call per area so they run concurrently. Give each scout its specific area and the issue context.
42
42
  3. Wait for all scouts, then synthesize their findings into the doc below. Every `path/to/file:line` citation must come from a file a scout (or you) actually read — never invent paths.
43
+ 4. **Check each scout's `status`.** A scout that returns `NEEDS_CONTEXT` or `BLOCKED` did not finish its area. Do NOT re-dispatch the same scout with the same instructions — that just burns a turn for the same result. Instead, change something: supply the context it asked for, narrow or redefine its area, or read that area yourself. Never loop an unchanged dispatch.
43
44
 
44
45
  For a trivial issue where one area suffices, a single scout (or your own reading) is fine — don't manufacture parallelism that isn't there.
45
46
 
@@ -18,9 +18,12 @@ Return ONLY this block — no preamble:
18
18
 
19
19
  ```
20
20
  CORRECTNESS
21
+ - status: DONE | NEEDS_CONTEXT | BLOCKED
21
22
  - severity: BLOCK | WARN | NONE
22
23
  - findings:
23
24
  - <file:line — concrete bug/regression and how it manifests at runtime, or "None">
24
25
  ```
25
26
 
26
27
  Use `BLOCK` only for a clear correctness or regression risk (wrong output, broken caller, dropped tested case). Test-coverage gaps that aren't outright bugs are `WARN`.
28
+
29
+ `status`: `DONE` = you reviewed the full diff. `NEEDS_CONTEXT` = you need a file or context the lead must supply to finish — say exactly what. `BLOCKED` = you could not read the diff/files at all — say why. Never emit `severity: NONE` to fake a clean review when you were actually blocked; report the block.
@@ -17,9 +17,12 @@ Return ONLY this block — no preamble:
17
17
 
18
18
  ```
19
19
  SECURITY
20
+ - status: DONE | NEEDS_CONTEXT | BLOCKED
20
21
  - severity: BLOCK | WARN | NONE
21
22
  - findings:
22
23
  - <file:line — concrete issue and the exploit it enables, or "None">
23
24
  ```
24
25
 
25
26
  Use `BLOCK` only for a real, exploitable vulnerability introduced by this diff. Pre-existing issues the diff didn't touch are out of scope.
27
+
28
+ `status`: `DONE` = you reviewed the full diff. `NEEDS_CONTEXT` = you need a file or context the lead must supply to finish — say exactly what. `BLOCKED` = you could not read the diff/files at all — say why. Never emit `severity: NONE` to fake a clean review when you were actually blocked; report the block.
@@ -17,9 +17,12 @@ Return ONLY this block — no preamble:
17
17
 
18
18
  ```
19
19
  STRUCTURE
20
+ - status: DONE | NEEDS_CONTEXT | BLOCKED
20
21
  - severity: WARN | NONE
21
22
  - findings:
22
23
  - <file:line — concrete structural/convention/doc gap and the existing pattern it should follow, or "None">
23
24
  ```
24
25
 
25
26
  Structure findings never `BLOCK` — they are advisory. Use `WARN` for real gaps, `NONE` otherwise.
27
+
28
+ `status`: `DONE` = you reviewed the full diff. `NEEDS_CONTEXT` = you need a file or context the lead must supply to finish — say exactly what. `BLOCKED` = you could not read the diff/files at all — say why. Never emit `severity: NONE` to fake a clean review when you were actually blocked; report the block.
@@ -24,7 +24,7 @@
24
24
  "Grep",
25
25
  "Glob",
26
26
  "Bash",
27
- "Task"
27
+ "Agent"
28
28
  ],
29
29
  "hooks": ["block-write"],
30
30
  "skills": [],
@@ -16,19 +16,43 @@ Base: {{pr.baseRefName}} ← Head: {{pr.headRefName}}
16
16
 
17
17
  # How to run this review
18
18
 
19
- 1. **Fan out in parallel.** In a SINGLE message, issue three `Task` calls — one to each subagent — so they run concurrently:
19
+ 1. **Fan out in parallel.** In a SINGLE message, issue three `Agent` calls — one to each subagent — so they run concurrently:
20
20
  - `review-security` — security vulnerabilities.
21
21
  - `review-correctness` — logic bugs, regressions, test gaps.
22
22
  - `review-style` — structure, conventions, duplication, docs.
23
23
 
24
24
  Give each subagent the same context: PR #{{pr.number}}, the base/head refs above, and the diff. Instruct each to read the full changed files (not just hunks) before reporting, and to return only its structured block.
25
25
 
26
- 2. **Synthesize.** Once all three return, merge their findings into the single comment below. Resolve the verdict from the worst severity reported:
26
+ 2. **Check each reviewer's `status` before trusting its verdict.** A reviewer that returns `NEEDS_CONTEXT` or `BLOCKED` did not actually complete its review — do NOT treat its `severity: NONE` as a clean pass. Do NOT re-dispatch the same reviewer with the same instructions; change something: give it the context it asked for, or note in the comment that this dimension could not be reviewed. A review missing a whole dimension cannot be **PASS**.
27
+
28
+ 3. **Synthesize.** Once all three have genuinely completed, merge their findings into the single comment below. Resolve the verdict from the worst severity reported:
27
29
  - any `BLOCK` (security or correctness) → **FAIL**
28
30
  - no BLOCK but any `WARN` → **CONCERNS**
29
31
  - all `NONE` and every reviewer returned `status: DONE` → **PASS**
30
32
 
31
- 3. Drop duplicate findings, keep every distinct `file:line` citation. Do not invent citations — only pass through what the subagents reported from files they actually read.
33
+ 4. Drop duplicate findings, keep every distinct `file:line` citation. Do not invent citations — only pass through what the subagents reported from files they actually read.
34
+
35
+ # Review stance — do not go soft
36
+
37
+ Default to skepticism: assume the diff contains a defect until the code proves otherwise, and surface every issue you can demonstrate with a `file:line`. Watch for the ways a reviewer quietly goes easy — each is a failure here:
38
+
39
+ - Downgrading a real BLOCK to a WARN or a Suggestion so the review feels less harsh.
40
+ - Accepting "looks right" without confirming the change is actually wired (apply the depth ladder below).
41
+ - Treating a stub or placeholder shipped against a *stated* requirement as acceptable. Phrases like `"v1"`, `"basic version"`, `"simplified"`, `"minimal"`, `"static for now"`, `"hardcoded for now"`, `"placeholder"`, `"stub"`, `"will be wired later"`, `"future enhancement"` — when they describe a behavior the issue actually asked for — are a **FAIL**, not a note.
42
+ - Returning **PASS** when a whole dimension came back `BLOCKED`/`NEEDS_CONTEXT`.
43
+
44
+ Severity reflects the risk in the code, never how it feels to report it.
45
+
46
+ # Implementation depth — existence is not implementation
47
+
48
+ For every change in the diff, don't stop at "the code is there". Walk the ladder:
49
+
50
+ 1. **Exists** — the function / route / field / component is present.
51
+ 2. **Substantive** — it has real logic, not a stub, `TODO`, `return null`, or an echo of its input.
52
+ 3. **Wired** — its output is actually consumed: the query result is returned, the fetched response is used, the new config key is read where it matters, the handler is registered/exported, the component is rendered. Grep the symbol's usages to confirm it's consumed, not just defined.
53
+ 4. **Functional** — it produces the right result for the issue's cases.
54
+
55
+ Missing *wiring* is the most common defect — a query that exists but whose result is never returned, a fetch whose response is ignored, a config default added in code but absent from the schema. A change that reaches only Exists/Substantive but isn't wired is a correctness **FAIL**, not a style note.
32
56
 
33
57
  # Required output
34
58
 
@@ -0,0 +1,43 @@
1
+ ---
2
+ name: systematic-debugging
3
+ description: Use when a test fails, a build breaks, or runtime behavior is wrong and the cause is not immediately obvious — especially before changing code to "see if it helps".
4
+ ---
5
+
6
+ # Systematic debugging
7
+
8
+ A failing check is a question ("why did this happen?"), not a prompt to start
9
+ editing. Guessing-and-checking edits the code before you understand it; it
10
+ usually masks the symptom, leaves the real bug, and burns turns. Find the cause
11
+ first, then make one targeted fix.
12
+
13
+ ## The loop
14
+
15
+ 1. **Reproduce.** Run the exact failing command and read the FULL output and
16
+ exit code — not a summary, not memory of a past run. If you can't reproduce
17
+ it, you can't fix it; say so.
18
+ 2. **Isolate.** Narrow to the smallest input/file/line that still fails. Read
19
+ the full failing function and the code that calls it — the cause often sits
20
+ outside the line the error points at.
21
+ 3. **Find the root cause.** State, in one sentence, the actual mechanism: "X is
22
+ null here because Y never set it." If you can't write that sentence, you
23
+ don't understand it yet — keep reading, don't start editing.
24
+ 4. **Fix the cause, not the symptom.** Make the smallest change that addresses
25
+ the mechanism you named. Do not swallow the error, loosen the assertion, or
26
+ special-case the one failing input to make it pass.
27
+ 5. **Verify.** Re-run the exact command from step 1, fresh. Confirm it now
28
+ passes AND that you didn't break a sibling — run the surrounding tests too.
29
+
30
+ ## Red flags — stop if you think any of these
31
+
32
+ | The thought | The reality |
33
+ |---|---|
34
+ | "Let me just try this and see if it works." | You're editing before you understand. Reproduce and isolate first. |
35
+ | "I'll wrap it in try/catch / loosen the test." | That hides the bug, it doesn't fix it. Find why it throws. |
36
+ | "It only fails sometimes, probably flaky." | "Probably" is not a root cause. Isolate the condition that triggers it. |
37
+ | "The fix is obvious from the error message." | Confirm the mechanism by reading the code; error messages mislead. |
38
+
39
+ ## When you're stuck
40
+
41
+ If two genuine attempts at the root cause fail, stop and say what you tried,
42
+ what you ruled out, and what you'd need to get unblocked. An honest dead-end
43
+ beats a symptom-masking patch that ships a latent bug.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@kody-ade/kody-engine",
3
- "version": "0.4.106",
3
+ "version": "0.4.108",
4
4
  "description": "kody — autonomous development engine. Single-session Claude Code agent behind a generic executor + declarative executable profiles.",
5
5
  "license": "MIT",
6
6
  "type": "module",
@@ -90,4 +90,4 @@ jobs:
90
90
  INIT_MESSAGE: ${{ inputs.message }}
91
91
  MODEL: ${{ inputs.model }}
92
92
  DASHBOARD_URL: ${{ inputs.dashboardUrl }}
93
- run: npx -y -p @kody-ade/kody-engine@0.4.106 kody-engine
93
+ run: npx -y -p @kody-ade/kody-engine@0.4.108 kody-engine