pan-wizard 2.8.1 → 2.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -2
- package/bin/install.js +23 -0
- package/commands/pan/assumptions.md +38 -3
- package/commands/pan/audit-deployment.md +6 -0
- package/commands/pan/debug.md +71 -2
- package/commands/pan/exec-phase.md +90 -0
- package/commands/pan/focus-auto.md +181 -18
- package/commands/pan/focus-design.md +302 -14
- package/commands/pan/focus-doc-audit.md +530 -0
- package/commands/pan/focus-drift-walking.md +525 -0
- package/commands/pan/focus-exec.md +168 -46
- package/commands/pan/focus-plan.md +204 -12
- package/commands/pan/focus-scan.md +17 -5
- package/commands/pan/map-codebase.md +32 -6
- package/commands/pan/milestone-audit.md +23 -0
- package/commands/pan/new-project.md +64 -0
- package/commands/pan/pause.md +42 -1
- package/commands/pan/plan-phase.md +84 -0
- package/commands/pan/profile.md +2 -1
- package/commands/pan/quick.md +15 -0
- package/commands/pan/resume.md +62 -2
- package/commands/pan/verify-phase.md +42 -0
- package/package.json +1 -1
- package/pan-wizard-core/bin/lib/commands.cjs +29 -7
- package/pan-wizard-core/bin/lib/config.cjs +10 -0
- package/pan-wizard-core/bin/lib/constants.cjs +3 -1
- package/pan-wizard-core/bin/lib/core.cjs +168 -21
- package/pan-wizard-core/bin/lib/focus.cjs +5 -0
- package/pan-wizard-core/bin/lib/verify.cjs +283 -4
- package/pan-wizard-core/bin/pan-tools.cjs +11 -2
- package/pan-wizard-core/references/model-profiles.md +191 -62
- package/pan-wizard-core/workflows/help.md +11 -1
- package/pan-wizard-core/workflows/profile.md +8 -1
- package/pan-wizard-core/workflows/settings.md +14 -0
- package/scripts/generate-skills-docs.py +560 -0
package/README.md
CHANGED
|
@@ -47,7 +47,7 @@ PAN is the context engineering layer that makes Claude Code reliable. It breaks
|
|
|
47
47
|
└─────────────────────┬───────────────────────────────────────┘
|
|
48
48
|
│ invokes
|
|
49
49
|
┌─────────────────────▼───────────────────────────────────────┐
|
|
50
|
-
│ COMMANDS (
|
|
50
|
+
│ COMMANDS (42 .md files + 4 CLI operations) │
|
|
51
51
|
│ Thin orchestrators that spawn agents and route results │
|
|
52
52
|
└─────────────────────┬───────────────────────────────────────┘
|
|
53
53
|
│ spawns
|
|
@@ -149,7 +149,7 @@ node bin/install.js --claude --local
|
|
|
149
149
|
Installs to `./.claude/` for testing modifications before contributing.
|
|
150
150
|
|
|
151
151
|
```bash
|
|
152
|
-
npm test #
|
|
152
|
+
npm test # 1747 unit tests
|
|
153
153
|
npm run test:scenarios # Scenario tests (install + integration)
|
|
154
154
|
npm run test:all # All tests (unit + scenario)
|
|
155
155
|
```
|
|
@@ -580,6 +580,8 @@ PAN is not a replacement for your IDE or AI agent — it's the orchestration lay
|
|
|
580
580
|
| `/pan:focus-auto` | Continuous scan→plan→exec loop with purpose-driven categories and 5-layer safety harness |
|
|
581
581
|
| `/pan:focus-sync` | Detect and report stale documentation counts |
|
|
582
582
|
| `/pan:focus-design` | 10-phase strategic feature investigation pipeline |
|
|
583
|
+
| `/pan:focus-drift-walking` | Walk project tree, detect doc-code drift, score severity, auto-repair |
|
|
584
|
+
| `/pan:focus-doc-audit` | Multi-dimensional document audit with 8-dimension quality scoring |
|
|
583
585
|
|
|
584
586
|
<sup>¹ Contributed by reddit user OracleGreyBeard</sup>
|
|
585
587
|
|
package/bin/install.js
CHANGED
|
@@ -1501,8 +1501,11 @@ function install(isGlobal, runtime = 'claude') {
|
|
|
1501
1501
|
console.error(` ${yellow}✗${reset} package.json write failed: ${e.message}`);
|
|
1502
1502
|
failures.push('package.json');
|
|
1503
1503
|
}
|
|
1504
|
+
}
|
|
1504
1505
|
|
|
1506
|
+
if (!isCodex && !isOpencode) {
|
|
1505
1507
|
// Copy hooks from dist/ (bundled with dependencies)
|
|
1508
|
+
// Hooks are only supported by Claude Code, Gemini, and Copilot CLI
|
|
1506
1509
|
// Template paths for the target runtime (replaces '.claude' with correct config dir)
|
|
1507
1510
|
try {
|
|
1508
1511
|
const hooksSrc = path.join(src, 'hooks', 'dist');
|
|
@@ -1549,6 +1552,26 @@ function install(isGlobal, runtime = 'claude') {
|
|
|
1549
1552
|
// Report any backed-up local patches
|
|
1550
1553
|
reportLocalPatches(targetDir, runtime);
|
|
1551
1554
|
|
|
1555
|
+
// Post-install self-check: verify critical files exist before declaring success
|
|
1556
|
+
const selfCheckIssues = [];
|
|
1557
|
+
const panToolsPath = path.join(targetDir, 'pan-wizard-core', 'bin', 'pan-tools.cjs');
|
|
1558
|
+
if (!fs.existsSync(panToolsPath)) selfCheckIssues.push('pan-tools.cjs missing');
|
|
1559
|
+
const manifestFullPath = path.join(targetDir, MANIFEST_NAME);
|
|
1560
|
+
if (fs.existsSync(manifestFullPath)) {
|
|
1561
|
+
try {
|
|
1562
|
+
const mfst = JSON.parse(fs.readFileSync(manifestFullPath, 'utf8'));
|
|
1563
|
+
const fileCount = Object.keys(mfst.files || {}).length;
|
|
1564
|
+
if (fileCount < 50) selfCheckIssues.push(`manifest has only ${fileCount} files (expected 150+)`);
|
|
1565
|
+
} catch { selfCheckIssues.push('manifest unreadable'); }
|
|
1566
|
+
} else {
|
|
1567
|
+
selfCheckIssues.push('manifest missing');
|
|
1568
|
+
}
|
|
1569
|
+
if (selfCheckIssues.length > 0) {
|
|
1570
|
+
console.error(`\n ${yellow}⚠ Post-install check found issues:${reset}`);
|
|
1571
|
+
for (const issue of selfCheckIssues) console.error(` - ${issue}`);
|
|
1572
|
+
console.error(` Run ${cyan}pan-tools validate deployment${reset} for full diagnostics.\n`);
|
|
1573
|
+
}
|
|
1574
|
+
|
|
1552
1575
|
if (isCodex) {
|
|
1553
1576
|
return { settingsPath: null, settings: null, statuslineCommand: null, runtime };
|
|
1554
1577
|
}
|
|
@@ -27,15 +27,50 @@ Phase number: $ARGUMENTS (required)
|
|
|
27
27
|
Project state and roadmap are loaded in-workflow using targeted reads.
|
|
28
28
|
</context>
|
|
29
29
|
|
|
30
|
+
<investigate_before_claiming>
|
|
31
|
+
Before surfacing any assumption, read the actual codebase first.
|
|
32
|
+
- Read existing source files related to the phase's domain
|
|
33
|
+
- Grep for relevant function names, imports, patterns
|
|
34
|
+
- Base assumptions on what the code actually shows, not speculation
|
|
35
|
+
Do not claim "the project uses X" without verifying it in the files.
|
|
36
|
+
</investigate_before_claiming>
|
|
37
|
+
|
|
38
|
+
<citation_requirement>
|
|
39
|
+
Every assumption MUST cite the evidence that supports it.
|
|
40
|
+
|
|
41
|
+
**Before presenting assumptions to the user, scan your draft for unsourced claims.** Any assumption without file:line evidence is speculation, not a grounded assumption.
|
|
42
|
+
|
|
43
|
+
**Format:** "Assumption: [claim] — Evidence: [file:line or grep result]"
|
|
44
|
+
|
|
45
|
+
**Grounding rules:**
|
|
46
|
+
- Technical approach assumptions require: file:line showing the current pattern/framework in use
|
|
47
|
+
- Dependency assumptions require: import/require evidence from the relevant module
|
|
48
|
+
- Scope boundary assumptions require: file paths showing what exists vs what doesn't
|
|
49
|
+
- Risk assumptions require: file:line showing the fragile pattern or grep showing the coupling
|
|
50
|
+
|
|
51
|
+
**Anti-pattern:**
|
|
52
|
+
```
|
|
53
|
+
BAD: "Assumption: The project uses Express for routing"
|
|
54
|
+
→ Did you check? Maybe it uses Fastify, or has no server at all.
|
|
55
|
+
GOOD: "Assumption: The project uses Express for routing — Evidence: require('express')
|
|
56
|
+
at src/server.ts:3, route definitions at src/routes/index.ts:12-45"
|
|
57
|
+
```
|
|
58
|
+
</citation_requirement>
|
|
59
|
+
|
|
30
60
|
<process>
|
|
31
61
|
1. Validate phase number argument (error if missing or invalid)
|
|
32
62
|
2. Check if phase exists in roadmap
|
|
33
|
-
3.
|
|
63
|
+
3. Read relevant source files to ground assumptions in evidence
|
|
64
|
+
4. For each assumption, follow observe-think-conclude:
|
|
65
|
+
- OBSERVE: What does the code show?
|
|
66
|
+
- THINK: What does this imply for the phase approach?
|
|
67
|
+
- CONCLUDE: State the assumption with file:line evidence
|
|
68
|
+
5. Follow assumptions.md workflow:
|
|
34
69
|
- Analyze roadmap description
|
|
35
70
|
- Surface assumptions about: technical approach, implementation order, scope, risks, dependencies
|
|
36
|
-
- Present assumptions clearly
|
|
71
|
+
- Present assumptions clearly with file:line references where applicable
|
|
37
72
|
- Prompt "What do you think?"
|
|
38
|
-
|
|
73
|
+
6. Gather feedback and offer next steps
|
|
39
74
|
</process>
|
|
40
75
|
|
|
41
76
|
<success_criteria>
|
|
@@ -38,6 +38,12 @@ If no target directory is provided, ask the user:
|
|
|
38
38
|
Validate the directory exists before proceeding.
|
|
39
39
|
</step>
|
|
40
40
|
|
|
41
|
+
<investigate_before_judging>
|
|
42
|
+
Never report a file as missing, broken, or misconfigured without reading it first.
|
|
43
|
+
For every audit check: read the actual file, verify its contents, then state the finding with evidence.
|
|
44
|
+
Do not speculate about file contents based on filenames alone.
|
|
45
|
+
</investigate_before_judging>
|
|
46
|
+
|
|
41
47
|
<step name="installation_audit">
|
|
42
48
|
**Phase 1 — Installation Integrity Audit**
|
|
43
49
|
|
package/commands/pan/debug.md
CHANGED
|
@@ -49,6 +49,30 @@ If active sessions exist AND no $ARGUMENTS:
|
|
|
49
49
|
If $ARGUMENTS provided OR user describes new issue:
|
|
50
50
|
- Continue to symptom gathering
|
|
51
51
|
|
|
52
|
+
## Reasoning Protocol
|
|
53
|
+
|
|
54
|
+
For each debugging step, follow the observe-think-act pattern:
|
|
55
|
+
1. **OBSERVE** — State what you see (error message, unexpected output, file contents)
|
|
56
|
+
2. **THINK** — Reason about what this means and what to investigate next
|
|
57
|
+
3. **ACT** — Execute one targeted tool call based on the reasoning
|
|
58
|
+
This prevents random exploration and keeps investigation systematic.
|
|
59
|
+
|
|
60
|
+
## Meta-Prompting: Self-Generated Debug Strategy
|
|
61
|
+
|
|
62
|
+
After gathering symptoms (step 2), generate your own investigation plan before spawning the debugger:
|
|
63
|
+
|
|
64
|
+
```
|
|
65
|
+
Given symptoms: "{summary}"
|
|
66
|
+
My debug strategy:
|
|
67
|
+
1. Most likely cause: {hypothesis} → Test by: {specific check}
|
|
68
|
+
2. Second most likely: {hypothesis} → Test by: {specific check}
|
|
69
|
+
3. Long shot: {hypothesis} → Test by: {specific check}
|
|
70
|
+
4. Files to read first: {ordered list, most relevant first}
|
|
71
|
+
5. What would DISPROVE each hypothesis: {falsification criteria}
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
This self-generated strategy is passed to the pan-debugger agent as part of the prompt, giving it a targeted investigation plan rather than open-ended exploration. The falsification criteria are critical — they prevent the agent from confirming a hypothesis by only looking for supporting evidence.
|
|
75
|
+
|
|
52
76
|
## 2. Gather Symptoms (if new issue)
|
|
53
77
|
|
|
54
78
|
Use AskUserQuestion for each:
|
|
@@ -123,9 +147,47 @@ Task(
|
|
|
123
147
|
- "Manual investigation" - done
|
|
124
148
|
- "Add more context" - gather more symptoms, spawn again
|
|
125
149
|
|
|
150
|
+
<debug_handoff_schema>
|
|
151
|
+
Debug session files (`.planning/debug/{slug}.md`) MUST contain structured state for cross-agent handoff:
|
|
152
|
+
|
|
153
|
+
```yaml
|
|
154
|
+
# Required sections in debug session file
|
|
155
|
+
session: "{slug}"
|
|
156
|
+
status: "investigating | root-cause-found | fix-applied | resolved"
|
|
157
|
+
created: "{ISO-8601}"
|
|
158
|
+
updated: "{ISO-8601}"
|
|
159
|
+
|
|
160
|
+
symptoms:
|
|
161
|
+
expected: "{what should happen}"
|
|
162
|
+
actual: "{what happens instead}"
|
|
163
|
+
errors: "{error messages}"
|
|
164
|
+
reproduction: "{steps to reproduce}"
|
|
165
|
+
|
|
166
|
+
investigation:
|
|
167
|
+
hypotheses_tested:
|
|
168
|
+
- hypothesis: "{what we thought}"
|
|
169
|
+
result: "confirmed | eliminated"
|
|
170
|
+
evidence: "{file:line or command output}"
|
|
171
|
+
hypotheses_remaining:
|
|
172
|
+
- "{what still needs checking}"
|
|
173
|
+
|
|
174
|
+
root_cause: # Populated when found
|
|
175
|
+
description: "{what's actually wrong}"
|
|
176
|
+
evidence: "{file:line proof}"
|
|
177
|
+
confidence: "high | medium | low"
|
|
178
|
+
|
|
179
|
+
fix: # Populated when applied
|
|
180
|
+
files_changed: ["{paths}"]
|
|
181
|
+
approach: "{what was done}"
|
|
182
|
+
tests_added: ["{test paths}"]
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
**Why structured:** Each continuation agent starts with 0 context. Without structured state, it re-reads the entire investigation log and may re-test eliminated hypotheses. With structured state, it reads `hypotheses_tested` (skip these), checks `hypotheses_remaining` (do these next), and picks up exactly where the previous agent stopped.
|
|
186
|
+
</debug_handoff_schema>
|
|
187
|
+
|
|
126
188
|
## 5. Spawn Continuation Agent (After Checkpoint)
|
|
127
189
|
|
|
128
|
-
When user responds to checkpoint, spawn fresh agent:
|
|
190
|
+
When user responds to checkpoint, spawn fresh agent with the structured debug state:
|
|
129
191
|
|
|
130
192
|
```markdown
|
|
131
193
|
<objective>
|
|
@@ -134,7 +196,7 @@ Continue debugging {slug}. Evidence is in the debug file.
|
|
|
134
196
|
|
|
135
197
|
<prior_state>
|
|
136
198
|
<files_to_read>
|
|
137
|
-
- .planning/debug/{slug}.md (Debug session state)
|
|
199
|
+
- .planning/debug/{slug}.md (Debug session state — parse structured sections)
|
|
138
200
|
</files_to_read>
|
|
139
201
|
</prior_state>
|
|
140
202
|
|
|
@@ -146,6 +208,13 @@ Continue debugging {slug}. Evidence is in the debug file.
|
|
|
146
208
|
<mode>
|
|
147
209
|
goal: find_and_fix
|
|
148
210
|
</mode>
|
|
211
|
+
|
|
212
|
+
<handoff_instructions>
|
|
213
|
+
1. Parse the debug file's structured sections (symptoms, investigation, root_cause, fix)
|
|
214
|
+
2. Do NOT re-test hypotheses marked "eliminated" — they are dead ends
|
|
215
|
+
3. Start from hypotheses_remaining or the checkpoint's next action
|
|
216
|
+
4. Update the debug file's structured sections as you progress
|
|
217
|
+
</handoff_instructions>
|
|
149
218
|
```
|
|
150
219
|
|
|
151
220
|
```
|
|
@@ -27,6 +27,32 @@ Context budget: ~15% orchestrator, 100% fresh per subagent.
|
|
|
27
27
|
@~/.claude/pan-wizard-core/references/ui-brand.md
|
|
28
28
|
</execution_context>
|
|
29
29
|
|
|
30
|
+
<completion_contract>
|
|
31
|
+
Execution is complete when ALL conditions are met:
|
|
32
|
+
1. Every plan in the phase has been dispatched to a subagent
|
|
33
|
+
2. All subagents have returned (success or failure)
|
|
34
|
+
3. Full test suite passes with count >= pre-execution baseline
|
|
35
|
+
4. All verified tasks committed with accurate commit messages
|
|
36
|
+
5. state.md updated with phase progress
|
|
37
|
+
6. Failed tasks (if any) logged with error classification and root cause
|
|
38
|
+
|
|
39
|
+
Execution FAILS if: test count drops below baseline after all retries, or state corruption is detected.
|
|
40
|
+
</completion_contract>
|
|
41
|
+
|
|
42
|
+
<wave_dependencies>
|
|
43
|
+
Discovery → Baseline: Test baseline MUST be captured before any wave executes (regression detection requires it)
|
|
44
|
+
Baseline → Wave N: Each wave MUST wait for the previous wave to complete and pass verification
|
|
45
|
+
Wave N → Commit N: Wave changes MUST pass tests before committing (don't commit broken code)
|
|
46
|
+
All Waves → Final Verify: Full test suite MUST pass after all waves complete
|
|
47
|
+
Final Verify → State Update: state.md MUST only be updated after verification passes
|
|
48
|
+
|
|
49
|
+
HARD STOP conditions (halt the normal flow and apply the stated recovery before doing anything else):
|
|
50
|
+
- Baseline capture fails (test suite broken before we start) → STOP, report to user
|
|
51
|
+
- Wave N test count drops below baseline after 3 retries → revert wave, mark all wave tasks FAILED, continue to next wave
|
|
52
|
+
- State corruption detected (malformed state.md or plan files) → STOP execution entirely, report to user
|
|
53
|
+
- All waves complete but final test count < baseline → revert last wave, re-verify
|
|
54
|
+
</wave_dependencies>
|
|
55
|
+
|
|
30
56
|
<context>
|
|
31
57
|
Phase: $ARGUMENTS
|
|
32
58
|
|
|
@@ -39,7 +65,71 @@ Phase: $ARGUMENTS
|
|
|
39
65
|
Context files are resolved inside the workflow via `pan-tools init execute-phase` and per-subagent `<files_to_read>` blocks.
|
|
40
66
|
</context>
|
|
41
67
|
|
|
68
|
+
<action_gating>
|
|
69
|
+
Each execution stage has a restricted set of appropriate actions. Using the wrong tool at the wrong stage causes regressions.
|
|
70
|
+
|
|
71
|
+
| Stage | Read | Grep/Glob | Edit/Write | Bash (tests) | Bash (git) | Agent |
|
|
72
|
+
|-------|------|-----------|------------|--------------|------------|-------|
|
|
73
|
+
| Discovery (find plans) | YES | YES | NO | NO | NO | NO |
|
|
74
|
+
| Baseline capture | YES | NO | NO | YES | YES | NO |
|
|
75
|
+
| Wave execution | YES | YES | YES | YES | NO | YES |
|
|
76
|
+
| Wave verification | YES | YES | NO | YES | NO | NO |
|
|
77
|
+
| Wave commit | NO | NO | NO | NO | YES | NO |
|
|
78
|
+
| Final verification | YES | YES | NO | YES | NO | NO |
|
|
79
|
+
| State update | YES | NO | YES | NO | YES | NO |
|
|
80
|
+
|
|
81
|
+
**Key constraints:**
|
|
82
|
+
- Discovery: read-only — do not modify files while figuring out what to execute
|
|
83
|
+
- Baseline: run tests + git status only — no code changes before baseline is captured
|
|
84
|
+
- Wave verification: NO Edit/Write — you are checking work, not doing more work
|
|
85
|
+
- Wave commit: git operations only — all code changes must be done before committing
|
|
86
|
+
</action_gating>
|
|
87
|
+
|
|
42
88
|
<process>
|
|
43
89
|
Execute the execute-phase workflow from @~/.claude/pan-wizard-core/workflows/exec-phase.md end-to-end.
|
|
44
90
|
Preserve all workflow gates (wave execution, checkpoint handling, verification, state updates, routing).
|
|
91
|
+
|
|
92
|
+
**Context Management Across Waves:**
|
|
93
|
+
- KEEP: Phase goals, test baseline, current wave tasks, file paths being modified
|
|
94
|
+
- SUMMARIZE: Completed wave results to one-line summaries
|
|
95
|
+
- DISCARD: Raw tool output from previous waves
|
|
96
|
+
|
|
97
|
+
**Attention Anchor — emit after each wave completes:**
|
|
98
|
+
```
|
|
99
|
+
Wave {N}/{total_waves} complete | Tasks: {done}/{total_tasks} | Tests: {baseline} → {current}
|
|
100
|
+
Remaining waves: {list of wave numbers with task counts}
|
|
101
|
+
Next: Wave {N+1} — {task count} tasks [{task IDs}]
|
|
102
|
+
```
|
|
103
|
+
This prevents drift in multi-wave phases where the agent loses track of which waves remain and what the test baseline was.
|
|
104
|
+
|
|
105
|
+
**State Intent Before Implementing (M+ tasks):**
|
|
106
|
+
For each STANDARD or FULL task, state before coding: "I will modify [files], adding [what], to achieve [goal]. Risk: [what could break]."
|
|
107
|
+
|
|
108
|
+
**Pre-Commit Verification Checklist — apply before each wave commit:**
|
|
109
|
+
1. Every modified file was read before editing
|
|
110
|
+
2. `git diff --stat` contains only files related to the current wave's tasks
|
|
111
|
+
3. Test suite passes and count meets or exceeds pre-wave baseline
|
|
112
|
+
4. Commit message lists only tasks that are verified (tests ran, tests passed)
|
|
113
|
+
5. No secrets or credentials staged
|
|
114
|
+
|
|
115
|
+
If any check fails: fix and re-verify before committing.
|
|
116
|
+
|
|
117
|
+
**Error Recovery Classification — apply when any task fails:**
|
|
118
|
+
- RECOVERABLE (retry up to 3 times): test failure after code change, build syntax error, file not found (search for moved path)
|
|
119
|
+
- UNRECOVERABLE (mark task FAILED, continue to next): same failure after 3 retries, permission errors, state corruption, unrelated test regression
|
|
120
|
+
Never let a failed task block the rest of the wave.
|
|
121
|
+
|
|
122
|
+
**Anti-Overengineering:**
|
|
123
|
+
Implement exactly what the plan says. Do not add features, refactor surrounding code, add comments to unchanged files, or create abstractions for one-time operations.
|
|
124
|
+
|
|
125
|
+
**Common Anti-Patterns (avoid these):**
|
|
126
|
+
```
|
|
127
|
+
BAD: Task says "add input validation" → you also refactor the error handler, add logging, and rename variables
|
|
128
|
+
→ 3 unrelated changes pollute the diff, risk regressions in untested paths
|
|
129
|
+
GOOD: Add validation only → commit → let the next task handle error handling if planned
|
|
130
|
+
|
|
131
|
+
BAD: Test fails → change the test's expected output to match the broken code
|
|
132
|
+
→ Bug is now hidden, passes CI, breaks in production
|
|
133
|
+
GOOD: Test fails → read the test intent → fix the code to match the expected behavior
|
|
134
|
+
```
|
|
45
135
|
</process>
|
|
@@ -18,11 +18,11 @@ Run purpose-driven improvement campaigns with a single command. The auto-runner
|
|
|
18
18
|
|
|
19
19
|
**ADR:** ADR-0015 | **Heritage:** execplan budget + PanMonty categories + focus-exec pipeline
|
|
20
20
|
|
|
21
|
-
##
|
|
21
|
+
## Project Scope Boundary
|
|
22
22
|
|
|
23
|
-
This command runs improvement campaigns on the **host project's source code** —
|
|
23
|
+
This command runs improvement campaigns on the **host project's source code** — not on PAN Wizard's own infrastructure.
|
|
24
24
|
|
|
25
|
-
**
|
|
25
|
+
**Exclude these directories from scanning and execution:**
|
|
26
26
|
- `.claude/`, `.github/copilot-instructions.md`, `.opencode/`, `.gemini/`, `.codex/` — PAN runtime directories and files
|
|
27
27
|
- Any `pan-wizard-core/`, `pan-tools`, agent `.md`, or command `.md` files within PAN runtime directories
|
|
28
28
|
|
|
@@ -30,6 +30,18 @@ This command runs improvement campaigns on the **host project's source code**
|
|
|
30
30
|
|
|
31
31
|
---
|
|
32
32
|
|
|
33
|
+
<completion_contract>
|
|
34
|
+
A campaign is complete when ANY stop condition is met:
|
|
35
|
+
1. Max cycles reached (--max-cycles, default 10)
|
|
36
|
+
2. Total budget exhausted (--total-budget, default 200)
|
|
37
|
+
3. Scan returns zero items for the selected category
|
|
38
|
+
4. Context window drops below 25% (CRITICAL threshold)
|
|
39
|
+
5. User sends /pan:focus-auto --stop
|
|
40
|
+
6. Category-specific completion (e.g., prompts_remaining === 0)
|
|
41
|
+
|
|
42
|
+
Each cycle is complete when: scan → plan → exec → commit succeeds, OR a safety harness triggers and the cycle is cleanly aborted with state preserved.
|
|
43
|
+
</completion_contract>
|
|
44
|
+
|
|
33
45
|
## FIRST ACTION — Category Selection (if no --category argument)
|
|
34
46
|
|
|
35
47
|
If `$ARGUMENTS` does NOT contain `--category`, you MUST ask the user before doing anything else.
|
|
@@ -45,8 +57,9 @@ Which category should this auto campaign focus on?
|
|
|
45
57
|
4. **features** — Roadmap items, new capabilities (P3-P5)
|
|
46
58
|
5. **docs** — Stale documentation, missing command descriptions (P5-P6)
|
|
47
59
|
6. **optimize** — Performance bottlenecks, redundant computation, robustness hardening (P1-P4)
|
|
60
|
+
7. **prompts** — Execute micro-prompt documents sequentially, or generate them from specs (P0-P6)
|
|
48
61
|
|
|
49
|
-
Reply with a number (1-
|
|
62
|
+
Reply with a number (1-7) or category name.
|
|
50
63
|
```
|
|
51
64
|
|
|
52
65
|
**After the user replies, map their response to a category name:**
|
|
@@ -56,8 +69,9 @@ Reply with a number (1-6) or category name.
|
|
|
56
69
|
- "4" or "features" → SELECTED_CATEGORY = features
|
|
57
70
|
- "5" or "docs" → SELECTED_CATEGORY = docs
|
|
58
71
|
- "6" or "optimize" → SELECTED_CATEGORY = optimize
|
|
72
|
+
- "7" or "prompts" → SELECTED_CATEGORY = prompts
|
|
59
73
|
|
|
60
|
-
|
|
74
|
+
Wait for the user's reply before proceeding. Do not guess or pick a default category.
|
|
61
75
|
|
|
62
76
|
## AUTONOMY RULES (apply AFTER category is selected)
|
|
63
77
|
|
|
@@ -75,7 +89,7 @@ Reply with a number (1-6) or category name.
|
|
|
75
89
|
|
|
76
90
|
| Flag | Default | Description |
|
|
77
91
|
|------|---------|-------------|
|
|
78
|
-
| `--category` | null (all) | cleanup, tests, stability, features, docs |
|
|
92
|
+
| `--category` | null (all) | cleanup, tests, stability, features, docs, optimize, prompts |
|
|
79
93
|
| `--mode` | category-dependent | bugfix, balanced, features, full |
|
|
80
94
|
| `--budget` | category-dependent | Points per cycle (5-100) |
|
|
81
95
|
| `--max-cycles` | 10 | Maximum iterations (1-50) |
|
|
@@ -95,6 +109,7 @@ Reply with a number (1-6) or category name.
|
|
|
95
109
|
| features | P3-P5 | features | 50 |
|
|
96
110
|
| docs | P5-P6 | balanced | 30 |
|
|
97
111
|
| optimize | P1-P4 | balanced | 50 |
|
|
112
|
+
| prompts | P0-P6 | balanced | 100 |
|
|
98
113
|
|
|
99
114
|
## Pipeline
|
|
100
115
|
|
|
@@ -122,6 +137,20 @@ Reply with a number (1-6) or category name.
|
|
|
122
137
|
4. Run `git status` to verify clean working tree (warn if dirty, don't block)
|
|
123
138
|
5. Create safety tag: `git tag -f focus-auto-baseline`
|
|
124
139
|
|
|
140
|
+
<phase_dependencies>
|
|
141
|
+
Phase 0 → Phase 1: Init MUST succeed before baseline (state tracking requires valid run)
|
|
142
|
+
Phase 1 → Phase 2: Baseline MUST be captured before main loop (regression circuit breaker needs it)
|
|
143
|
+
Phase 2 (each cycle): Scan → Plan → Exec → Commit is strictly sequential within a cycle
|
|
144
|
+
- Scan MUST complete before plan (plan needs scan items)
|
|
145
|
+
- Plan MUST complete before exec (exec needs batch file)
|
|
146
|
+
- Exec MUST complete and tests pass before commit (never commit broken code)
|
|
147
|
+
|
|
148
|
+
HARD STOP conditions:
|
|
149
|
+
- Phase 1 fails (tests broken): Do not enter main loop — report and exit
|
|
150
|
+
- Any cycle: test count drops below baseline after revert → stop campaign, preserve state
|
|
151
|
+
- Context drops below 25%: stop campaign cleanly (safety harness 3)
|
|
152
|
+
</phase_dependencies>
|
|
153
|
+
|
|
125
154
|
### Phase 2: Main Loop
|
|
126
155
|
|
|
127
156
|
**For each cycle (1 to max_cycles), execute Steps 2.1 through 2.5 without stopping:**
|
|
@@ -144,6 +173,9 @@ Perform a deep codebase scan to find actionable work items with evidence.
|
|
|
144
173
|
- **features:** roadmap items not yet implemented, README promises without backing code
|
|
145
174
|
- **docs:** stale documentation, missing command descriptions
|
|
146
175
|
- **optimize:** N+1 operations (file I/O / network calls inside loops), redundant re-computation (`JSON.parse`/`stringify` of same data), synchronous blocking in async modules (`readFileSync`/`execSync` alongside async exports), algorithmic complexity (nested `.find()`/`.filter()` in loops creating O(n²)+), unnecessary allocations in hot paths (spread in loops, string concat vs `join()`), regex construction inside loops (should be hoisted), unbounded collection growth (`.push()` without size limits), swallowed errors (`catch {}` / `catch { /* */ }`), suboptimal data structures (array `.includes()` where Set is better), dead assignments, unguarded property access on nullable values (`.length`/`.split()`/`.match()[0]` without null check)
|
|
176
|
+
- **prompts:** Two operational modes — detect which applies:
|
|
177
|
+
- **Execute mode:** Find micro-prompt documents (`.md` files containing ordered prompt blocks, e.g., `## Prompt 1`, `## Prompt 2`, or numbered checklist items `- [ ] Prompt: ...`). Look in `.planning/`, project root, and `docs/` for files matching patterns: `*prompts*`, `*micro-prompt*`, `*prompt-plan*`, `*prompt-sequence*`. Each unchecked/incomplete prompt block is one work item.
|
|
178
|
+
- **Generate mode:** Find specification documents (files matching `*spec*`, `*prd*`, `*requirements*`, `*feature*` in `.planning/`, `docs/specs/`, project root) that do NOT already have a corresponding micro-prompt document. Each spec needing decomposition is one work item.
|
|
147
179
|
|
|
148
180
|
**Optimize category: convergent re-scan.** On cycles 2+, cross-reference scan findings against previous cycle completions (`cycles[].items` in auto-run state). Only pick genuinely new items — skip IDs already completed or failed. If the count of new findings drops AND cycle efficiency drops below 30% of the prior cycle's, this signals convergence and the `diminishing_returns` stop condition fires.
|
|
149
181
|
- Use the Agent tool with Explore subagent for thorough analysis if needed
|
|
@@ -226,6 +258,11 @@ Implement each item from the batch created in Step 2.2. Record `tests_before` by
|
|
|
226
258
|
6. Run the project's test suite
|
|
227
259
|
7. Pass = DONE | Fail = investigate (15 min max), then revert, mark FAILED
|
|
228
260
|
|
|
261
|
+
**Error Recovery Classification:**
|
|
262
|
+
- RECOVERABLE (retry up to 3 times): test failure after code change, build syntax error, file not found (search for moved path)
|
|
263
|
+
- UNRECOVERABLE (mark FAILED, move to next item): same failure after 3 retries, permission errors, state corruption, unrelated test regression
|
|
264
|
+
A failed item never blocks subsequent items.
|
|
265
|
+
|
|
229
266
|
**After all items in the batch:**
|
|
230
267
|
1. Run full test suite — ALL tests must pass
|
|
231
268
|
2. Record `tests_after` from the summary line
|
|
@@ -244,12 +281,25 @@ Check the response for stop conditions:
|
|
|
244
281
|
- `max_cycles`: Maximum iterations reached — go to Phase 3
|
|
245
282
|
- `zero_completed`: No items completed in this cycle — go to Phase 3
|
|
246
283
|
- `diminishing_returns`: Optimize only — cycle efficiency < 30% of previous cycle — go to Phase 3
|
|
284
|
+
- `prompts_complete`: Prompts only — all prompts in document executed — go to Phase 3
|
|
247
285
|
- `null`: Continue to next cycle
|
|
248
286
|
|
|
249
|
-
#### Step 2.5: Inter-Cycle
|
|
287
|
+
#### Step 2.5: Inter-Cycle Context Management
|
|
288
|
+
|
|
289
|
+
Between cycles, manage context to prevent quality degradation over long campaigns:
|
|
290
|
+
- **KEEP:** Current cycle goals, test baseline, error states, active file paths
|
|
291
|
+
- **SUMMARIZE:** Previous cycle results to a one-line summary each
|
|
292
|
+
- **DISCARD:** Raw tool output from previous cycles, superseded scan results
|
|
250
293
|
|
|
251
294
|
Display one-line cycle summary: `Cycle N/M | X/Y pts | Z items done | Tests: A -> B`
|
|
252
295
|
|
|
296
|
+
**Attention anchor — emit after every cycle summary:**
|
|
297
|
+
```
|
|
298
|
+
Remaining: {cycles_left} cycles | {budget_remaining}/{total_budget} pts | Safety: {active_harness_warnings}
|
|
299
|
+
Next: Cycle {N+1} — Scan → Plan → Exec → Commit
|
|
300
|
+
```
|
|
301
|
+
This prevents lost-in-the-middle drift in 10+ cycle campaigns where the agent forgets budget limits or stop conditions.
|
|
302
|
+
|
|
253
303
|
Then continue immediately to the next cycle (back to Step 2.1).
|
|
254
304
|
|
|
255
305
|
### Phase 3: Campaign End
|
|
@@ -294,20 +344,133 @@ Then continue immediately to the next cycle (back to Step 2.1).
|
|
|
294
344
|
7. **Verify Understanding** — State understanding for M+ items before coding.
|
|
295
345
|
8. **Preserve Tests** — Never change test expectations to match broken code.
|
|
296
346
|
9. **Accurate Commits** — Only claim verified items in commit messages. Include actual test counts.
|
|
347
|
+
10. **Vary Similar Fixes** — When 3+ items in a cycle share the same fix pattern (e.g., "add null check"), re-read each module's conventions before applying. The same pattern may need different implementations in different modules. Check after the 3rd fix whether a shared helper would be better than scattered copies.
|
|
348
|
+
|
|
349
|
+
## Prompts Category — Execution Details
|
|
350
|
+
|
|
351
|
+
The prompts category operates in two distinct modes. Detect which mode applies during the scan phase based on what the scan finds.
|
|
352
|
+
|
|
353
|
+
### Execute Mode (micro-prompt document found)
|
|
354
|
+
|
|
355
|
+
A micro-prompt document contains an ordered sequence of self-contained implementation prompts. Each prompt describes a single, testable change.
|
|
356
|
+
|
|
357
|
+
**Document format recognized:**
|
|
358
|
+
|
|
359
|
+
```markdown
|
|
360
|
+
# Micro-Prompts: <Feature Name>
|
|
361
|
+
|
|
362
|
+
Source: <spec file or description>
|
|
363
|
+
Generated: <date>
|
|
364
|
+
|
|
365
|
+
## Prompt 1: <title>
|
|
366
|
+
- [ ] Complete
|
|
367
|
+
|
|
368
|
+
<implementation instructions>
|
|
369
|
+
|
|
370
|
+
### Expected outcome
|
|
371
|
+
<what should work after this prompt>
|
|
372
|
+
|
|
373
|
+
### Test
|
|
374
|
+
<how to verify>
|
|
375
|
+
|
|
376
|
+
---
|
|
377
|
+
|
|
378
|
+
## Prompt 2: <title>
|
|
379
|
+
- [ ] Complete
|
|
380
|
+
...
|
|
381
|
+
```
|
|
382
|
+
|
|
383
|
+
Alternative format — checklist style:
|
|
384
|
+
```markdown
|
|
385
|
+
- [ ] Prompt 1: <title> — <instructions>
|
|
386
|
+
- [ ] Prompt 2: <title> — <instructions>
|
|
387
|
+
```
|
|
388
|
+
|
|
389
|
+
**Execution strategy:**
|
|
390
|
+
|
|
391
|
+
1. Read the micro-prompt document, identify all prompt blocks
|
|
392
|
+
2. Find the first uncompleted prompt (unchecked `- [ ]`), skipping any prompt already marked FAILED
|
|
393
|
+
3. Execute that prompt's instructions — implement the code changes described
|
|
394
|
+
4. Run the project's test suite (or the prompt-specific test if given)
|
|
395
|
+
5. If tests pass: mark the prompt as complete (`- [x]`), commit, move to next prompt
|
|
396
|
+
6. If tests fail: make one fix attempt; if tests still fail, revert the changes, mark the prompt as FAILED, and move to the next prompt
|
|
397
|
+
7. Each prompt = one batch item. Budget: 1 prompt per cycle, unless the prompt is trivial (XS) — then the next prompt may also run in the same cycle
|
|
398
|
+
8. Record the `prompts_remaining` count in the cycle update — when it reaches 0, the `prompts_complete` stop condition fires
|
|
399
|
+
|
|
400
|
+
**Key rules:**
|
|
401
|
+
- Execute prompts in document order — NEVER skip ahead or reorder
|
|
402
|
+
- Each prompt is atomic — commit after each successful prompt
|
|
403
|
+
- A failed prompt does NOT block subsequent prompts (mark failed, continue)
|
|
404
|
+
- The prompt document is the plan — do not re-plan or expand scope beyond what each prompt says
|
|
405
|
+
|
|
406
|
+
### Generate Mode (spec found without corresponding prompt document)
|
|
407
|
+
|
|
408
|
+
When a specification document is found that doesn't have a matching micro-prompt document, decompose it into ordered prompts.
|
|
409
|
+
|
|
410
|
+
**Generation strategy:**
|
|
411
|
+
|
|
412
|
+
1. Read the spec document thoroughly
|
|
413
|
+
2. Identify all discrete implementation steps
|
|
414
|
+
3. Order steps by dependency — foundation first, features that depend on earlier steps later
|
|
415
|
+
4. For each step, write a prompt block containing:
|
|
416
|
+
- Clear title describing the change
|
|
417
|
+
- Implementation instructions (files to create/modify, logic to implement)
|
|
418
|
+
- Expected outcome (what should work after this prompt)
|
|
419
|
+
- Test instruction (how to verify the prompt succeeded)
|
|
420
|
+
5. Write the micro-prompt document to `.planning/prompts/<spec-slug>-prompts.md`
|
|
421
|
+
6. Each generated document = one batch item (typically M or L size)
|
|
422
|
+
|
|
423
|
+
**Decomposition heuristics:**
|
|
424
|
+
- One prompt per logical unit of work (one function, one API endpoint, one component)
|
|
425
|
+
- Each prompt should be independently testable
|
|
426
|
+
- Prompts should be 5-30 minutes of implementation work each
|
|
427
|
+
- Aim for 5-20 prompts per spec (split large specs, combine trivial items)
|
|
428
|
+
- Include a "Prompt 0: Project setup" if the spec requires new dependencies or scaffolding
|
|
429
|
+
- Include a final "Prompt N: Integration test" that verifies the full feature end-to-end
|
|
430
|
+
|
|
431
|
+
**After generation:** The document is written and committed. The next cycle's scan will find it, select execute mode, and begin executing its prompts sequentially.
|
|
432
|
+
|
|
433
|
+
<failure_pattern_capture>
|
|
434
|
+
When the same failure pattern appears in 2+ items within a campaign, capture it for future runs.
|
|
435
|
+
|
|
436
|
+
**Detection:** After marking an item FAILED, check if the error classification matches any previous failure in this campaign:
|
|
437
|
+
- Same error type (e.g., "test regression in unrelated module")
|
|
438
|
+
- Same file or module involved
|
|
439
|
+
- Same root cause category (e.g., "missing null check pattern", "import path mismatch")
|
|
440
|
+
|
|
441
|
+
**Capture (when pattern repeats):**
|
|
442
|
+
Append to `.planning/focus/failure-patterns.md`:
|
|
443
|
+
```markdown
|
|
444
|
+
## Pattern: {short description}
|
|
445
|
+
- **First seen:** Cycle {N}, Item {ID}
|
|
446
|
+
- **Recurrence:** Cycle {M}, Item {ID2}
|
|
447
|
+
- **Error type:** {classification}
|
|
448
|
+
- **Root cause:** {what actually went wrong}
|
|
449
|
+
- **Avoidance rule:** {what to check before attempting similar items}
|
|
450
|
+
- **Files involved:** {paths}
|
|
451
|
+
```
|
|
452
|
+
|
|
453
|
+
**Use (on subsequent cycles):**
|
|
454
|
+
Before executing an item, check if its target files or error category match a known failure pattern. If so:
|
|
455
|
+
- Apply the avoidance rule BEFORE implementing
|
|
456
|
+
- If the pattern suggests the item will fail (e.g., "all items touching module X regress"), skip with reason "matches known failure pattern — defer to manual investigation"
|
|
457
|
+
|
|
458
|
+
This prevents the campaign from burning budget on items that will predictably fail.
|
|
459
|
+
</failure_pattern_capture>
|
|
297
460
|
|
|
298
461
|
## NEVER DO
|
|
299
462
|
|
|
300
|
-
- Invoke the Skill tool
|
|
301
|
-
- Stop or pause between phases
|
|
302
|
-
- Ask the user questions after category selection
|
|
303
|
-
- Skip the baseline test capture
|
|
304
|
-
- Continue after a test regression
|
|
305
|
-
- Expand scope beyond what the scan found
|
|
306
|
-
- Run more cycles than --max-cycles
|
|
307
|
-
- Spend more points than --total-budget
|
|
308
|
-
- Skip recording cycle results via --update
|
|
309
|
-
- Change test expectations to match broken code
|
|
310
|
-
- Use `git add -A` or `git add .` —
|
|
463
|
+
- Invoke the Skill tool — scan/plan/exec must run inline so state stays coherent across cycles
|
|
464
|
+
- Stop or pause between phases — interruptions break the autonomous loop and lose cycle momentum
|
|
465
|
+
- Ask the user questions after category selection — the whole point is autonomous execution; questions defeat that
|
|
466
|
+
- Skip the baseline test capture — without a baseline, the regression circuit breaker has nothing to compare against
|
|
467
|
+
- Continue after a test regression — a test count decrease means code was broken; continuing compounds the damage
|
|
468
|
+
- Expand scope beyond what the scan found — scope creep in an autonomous loop compounds unpredictably across cycles
|
|
469
|
+
- Run more cycles than --max-cycles — the limit exists to cap total cost and prevent runaway loops
|
|
470
|
+
- Spend more points than --total-budget — the budget cap is the user's cost control mechanism
|
|
471
|
+
- Skip recording cycle results via --update — unrecorded cycles break resume, status, and stop-condition checks
|
|
472
|
+
- Change test expectations to match broken code — this hides bugs instead of fixing them
|
|
473
|
+
- Use `git add -A` or `git add .` — bulk staging can accidentally commit secrets, build artifacts, or unrelated changes
|
|
311
474
|
|
|
312
475
|
## ALWAYS DO
|
|
313
476
|
|