pi-crew 0.3.9 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +23 -0
- package/package.json +1 -1
- package/skills/REFERENCE.md +136 -0
- package/skills/delegation-patterns/SKILL.md +1 -1
- package/skills/event-log-tracing/SKILL.md +1 -1
- package/skills/multi-perspective-review/SKILL.md +17 -1
- package/skills/orchestration/SKILL.md +1 -1
- package/skills/post-mortem/SKILL.md +90 -0
- package/skills/safe-bash/SKILL.md +1 -1
- package/skills/scrutinize/SKILL.md +67 -0
- package/skills/systematic-debugging/SKILL.md +60 -5
- package/skills/verification-before-done/SKILL.md +1 -1
- package/skills/workspace-isolation/SKILL.md +1 -1
- package/test-integration-check.ts +114 -0
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,28 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.4.0] — 9arm-skills Enforcement Patterns & Integration Tests (2026-05-26)
|
|
4
|
+
|
|
5
|
+
### Features
|
|
6
|
+
- **systematic-debugging: Refuse Gate** — Hard constraints before proposing fixes. Must verify that a repro exists, the root cause is known, and the hypothesis has survived a falsification attempt before proposing any fix.
|
|
7
|
+
- **systematic-debugging: Recite Ritual** — Psychological anchor at session start. Recite 4-step mantra before beginning any debug session.
|
|
8
|
+
- **systematic-debugging: Falsify-First** — Phase 3 now requires disproof before proof. Run disproof experiments first to save time on wrong hypotheses.
|
|
9
|
+
- **systematic-debugging: Breadcrumb Ledger** — Structured experiment tracking within debug sessions.
|
|
10
|
+
- **multi-perspective-review: Simpler Alternative Pass** — Mandatory pre-review step to question if the change should exist at all.
|
|
11
|
+
- **New skill: scrutinize** — Outsider-perspective review questioning intent before tracing code.
|
|
12
|
+
- **New skill: post-mortem** — Engineering RCA documentation with 4 required inputs gate.
|
|
13
|
+
- **skills/REFERENCE.md** — New documentation of skill chains, inventory, and anti-patterns.
|
|
14
|
+
- **Trigger conditions** added to all major skill descriptions for better skill invocation matching.
|
|
15
|
+
|
|
16
|
+
### Bug Fixes
|
|
17
|
+
- **CI reliability** — Fixed flaky tests on macOS: crew-widget and render-scheduler timing issues resolved.
|
|
18
|
+
- **Team-context import detection** — Fixed regex to correctly match only direct `/team-tool.ts` imports, not `/team-tool/context.ts`.
|
|
19
|
+
|
|
20
|
+
### Tests
|
|
21
|
+
- **New test-integration-check.ts** — Integration tests for core pi-crew functionality (agent/team/workflow discovery, fast-fix team run).
|
|
22
|
+
- **1740 tests passing** across all platforms (Ubuntu, macOS, Windows).
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
3
26
|
## [0.3.8] — Zombie Run Auto-Repair & Test Stability (2026-05-25)
|
|
4
27
|
|
|
5
28
|
### Features
|
package/package.json
CHANGED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
# pi-crew Skills Reference
|
|
2
|
+
|
|
3
|
+
## Skill Chains
|
|
4
|
+
|
|
5
|
+
### Bug Investigation
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
systematic-debugging (4 phases with refuse gate)
|
|
9
|
+
↓
|
|
10
|
+
verification-before-done (evidence before claim)
|
|
11
|
+
↓
|
|
12
|
+
post-mortem (RCA documentation)
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
### Multi-phase Work
|
|
16
|
+
|
|
17
|
+
```
|
|
18
|
+
orchestration (phase coordination)
|
|
19
|
+
↓
|
|
20
|
+
delegation-patterns (task splitting)
|
|
21
|
+
↓
|
|
22
|
+
verification-before-done (after each phase)
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
### Code Review (Quick)
|
|
26
|
+
|
|
27
|
+
```
|
|
28
|
+
scrutinize (outsider perspective + simpler alternative)
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
### Code Review (Deep)
|
|
32
|
+
|
|
33
|
+
```
|
|
34
|
+
scrutinize (outsider perspective)
|
|
35
|
+
↓
|
|
36
|
+
multi-perspective-review (8-pass deep review)
|
|
37
|
+
↓
|
|
38
|
+
secure-agent-orchestration-review (security focus)
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## When to Invoke
|
|
44
|
+
|
|
45
|
+
| Situation | Skill |
|
|
46
|
+
|-----------|-------|
|
|
47
|
+
| Bug / test failure / crash | `systematic-debugging` |
|
|
48
|
+
| Before claiming done | `verification-before-done` |
|
|
49
|
+
| Code review (quick) | `scrutinize` |
|
|
50
|
+
| Code review (deep) | `multi-perspective-review` |
|
|
51
|
+
| Task delegation | `delegation-patterns` |
|
|
52
|
+
| Complex multi-phase work | `orchestration` |
|
|
53
|
+
| After bug is fixed | `post-mortem` |
|
|
54
|
+
| Security review | `secure-agent-orchestration-review` |
|
|
55
|
+
| Workspace safety | `workspace-isolation` |
|
|
56
|
+
| Bash safety | `safe-bash` |
|
|
57
|
+
|
|
58
|
+
---
|
|
59
|
+
|
|
60
|
+
## Skills Inventory
|
|
61
|
+
|
|
62
|
+
### Core Discipline
|
|
63
|
+
|
|
64
|
+
| Skill | Description |
|
|
65
|
+
|-------|-------------|
|
|
66
|
+
| `systematic-debugging` | Four-phase debugging with refuse gates, falsify-first discipline |
|
|
67
|
+
| `verification-before-done` | Evidence before claims |
|
|
68
|
+
| `orchestration` | Multi-phase coordination, 8 rules including "respawn not absorb" |
|
|
69
|
+
|
|
70
|
+
### Review
|
|
71
|
+
|
|
72
|
+
| Skill | Description |
|
|
73
|
+
|-------|-------------|
|
|
74
|
+
| `scrutinize` | Outsider-perspective review questioning intent |
|
|
75
|
+
| `multi-perspective-review` | 8-pass deep code review |
|
|
76
|
+
| `secure-agent-orchestration-review` | Security-focused review |
|
|
77
|
+
|
|
78
|
+
### Documentation
|
|
79
|
+
|
|
80
|
+
| Skill | Description |
|
|
81
|
+
|-------|-------------|
|
|
82
|
+
| `post-mortem` | Engineering RCA record |
|
|
83
|
+
|
|
84
|
+
### Delegation
|
|
85
|
+
|
|
86
|
+
| Skill | Description |
|
|
87
|
+
|-------|-------------|
|
|
88
|
+
| `delegation-patterns` | Task splitting patterns |
|
|
89
|
+
| `requirements-to-task-packet` | Task packet creation |
|
|
90
|
+
|
|
91
|
+
### Runtime/Safety
|
|
92
|
+
|
|
93
|
+
| Skill | Description |
|
|
94
|
+
|-------|-------------|
|
|
95
|
+
| `workspace-isolation` | Security boundary enforcement |
|
|
96
|
+
| `worktree-isolation` | Git worktree safety |
|
|
97
|
+
| `safe-bash` | Bash command safety |
|
|
98
|
+
| `state-mutation-locking` | State mutation protection |
|
|
99
|
+
|
|
100
|
+
### Observability
|
|
101
|
+
|
|
102
|
+
| Skill | Description |
|
|
103
|
+
|-------|-------------|
|
|
104
|
+
| `event-log-tracing` | JSONL event log analysis |
|
|
105
|
+
| `runtime-state-reader` | Runtime state inspection |
|
|
106
|
+
| `observability-reliability` | Reliability patterns |
|
|
107
|
+
|
|
108
|
+
---
|
|
109
|
+
|
|
110
|
+
## Anti-patterns
|
|
111
|
+
|
|
112
|
+
| Anti-pattern | Skill | Rule |
|
|
113
|
+
|--------------|-------|------|
|
|
114
|
+
| Proposing fix before reproducing | `systematic-debugging` | Refuse Gate |
|
|
115
|
+
| Running proof before disproof | `systematic-debugging` | Phase 3 |
|
|
116
|
+
| Claiming "tests pass" without fresh run | `verification-before-done` | Gate Function |
|
|
117
|
+
| Reviewing diff-local without tracing path | `scrutinize` | Trace step |
|
|
118
|
+
| Skipping simpler-alternative pass | `multi-perspective-review` | Pre-review |
|
|
119
|
+
| Editing files yourself as orchestrator | `orchestration` | Rule 1 |
|
|
120
|
+
| Dispatching serially when parallel possible | `orchestration` | Rule 3 |
|
|
121
|
+
| Committing a red tree | `orchestration` | Rule 6 |
|
|
122
|
+
| Absorbing subagent's broken work | `orchestration` | Rule 7 |
|
|
123
|
+
| Rubber-stamp review | `multi-perspective-review` | Rules |
|
|
124
|
+
|
|
125
|
+
---
|
|
126
|
+
|
|
127
|
+
## Key Enforcement Patterns (from 9arm)
|
|
128
|
+
|
|
129
|
+
| Pattern | Implemented In |
|
|
130
|
+
|---------|---------------|
|
|
131
|
+
| **Refuse Gate** | `systematic-debugging` |
|
|
132
|
+
| **Recite Ritual** | `systematic-debugging` (Invocation) |
|
|
133
|
+
| **Falsify Before Proof** | `systematic-debugging` (Phase 3) |
|
|
134
|
+
| **Simpler Alternative Pass** | `scrutinize`, `multi-perspective-review` |
|
|
135
|
+
| **Required Inputs Gate** | `post-mortem` |
|
|
136
|
+
| **Respawn Not Absorb** | `orchestration` (Rule 7) |
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: delegation-patterns
|
|
3
|
-
description: Subagent/team delegation workflow. Use when splitting work across pi-crew teams, direct agents, async background workers, chains, or parallel
|
|
3
|
+
description: "Subagent/team delegation workflow. Use when splitting work across pi-crew teams, direct agents, async background workers, chains, or parallel tasks. Triggers: delegate this, split this task, parallelize, dispatch workers, assign to team, spawn agents."
|
|
4
4
|
---
|
|
5
5
|
|
|
6
6
|
# delegation-patterns
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: event-log-tracing
|
|
3
|
-
description: Structured event logging
|
|
3
|
+
description: "Structured event logging for worker lifecycle, live agents, crash recovery. Use when debugging crashes, tracing agent lifecycle, investigating stale runs. Triggers: event log, trace events, worker crashed, agent died, stale run, events.jsonl."
|
|
4
4
|
---
|
|
5
5
|
|
|
6
6
|
# event-log-tracing
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: multi-perspective-review
|
|
3
|
-
description: Use when reviewing a plan, diff, implementation, worker output, release candidate, or external review
|
|
3
|
+
description: "Multi-perspective code review with simpler-alternative pass. Use when reviewing a plan, diff, implementation, worker output, release candidate, or external feedback. Triggers: review this, look at this, LGTM check, sanity check, audit this, get a second opinion, check this PR, examine this code."
|
|
4
4
|
---
|
|
5
5
|
|
|
6
6
|
# multi-perspective-review
|
|
@@ -9,6 +9,22 @@ Core principle: review early, review often, and separate concerns. Reviewer outp
|
|
|
9
9
|
|
|
10
10
|
Distilled from detailed reads of requesting-code-review, receiving-code-review, subagent review checkpoints, differential review, and specialized review-agent patterns.
|
|
11
11
|
|
|
12
|
+
## Pre-review: Simpler Alternative Pass (Mandatory)
|
|
13
|
+
|
|
14
|
+
Before running any review passes, ask:
|
|
15
|
+
|
|
16
|
+
1. **Is there a simpler, smaller, or more elegant way to achieve the same goal?**
|
|
17
|
+
- Doing nothing (is the problem real and load-bearing?)
|
|
18
|
+
- Using something that already exists in the codebase
|
|
19
|
+
- A smaller change that solves 90% of the goal with 10% of the risk
|
|
20
|
+
- Solving it at a different layer (config vs code, framework vs app)
|
|
21
|
+
2. If a better alternative exists, surface it BEFORE the line-by-line review.
|
|
22
|
+
3. Skip only if the user explicitly says "don't question scope."
|
|
23
|
+
|
|
24
|
+
This is the most valuable finding you can produce — surfacing unnecessary complexity before reviewing its details.
|
|
25
|
+
|
|
26
|
+
---
|
|
27
|
+
|
|
12
28
|
## Review Passes
|
|
13
29
|
|
|
14
30
|
Run relevant passes separately:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: orchestration
|
|
3
|
-
description: Multi-phase orchestration
|
|
3
|
+
description: "Multi-phase orchestration for planners and executors. Use when decomposing complex tasks into parallel phases, dispatching workers, verifying gates, and iterating to closure. Triggers: orchestrate this, coordinate these tasks, run this multi-phase, dispatch workers, coordinate team."
|
|
4
4
|
---
|
|
5
5
|
|
|
6
6
|
# orchestration
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: post-mortem
|
|
3
|
+
description: "Write engineering RCA record after bug is fixed. Use when asking: write post-mortem, RCA, root cause analysis, document this fix, close out this bug. Triggers: post-mortem, postmortem, root cause, RCA, document this fix, write up the cause, close out bug."
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# post-mortem
|
|
7
|
+
|
|
8
|
+
The canonical engineering record of a bug fix. Written after debugging lands a real fix.
|
|
9
|
+
|
|
10
|
+
## Required Inputs — Refuse to Draft Without These
|
|
11
|
+
|
|
12
|
+
- [ ] **Reliable repro exists** (deterministic or high-rate flake)
|
|
13
|
+
- [ ] **Root cause is known** (mechanism identified, not a hypothesis)
|
|
14
|
+
- [ ] **Fix is identified** (PR / commit / branch)
|
|
15
|
+
- [ ] **Fix is validated** (original repro now passes)
|
|
16
|
+
|
|
17
|
+
If any are missing → list what's missing and stop. Do not draft.
|
|
18
|
+
|
|
19
|
+
## Structure
|
|
20
|
+
|
|
21
|
+
### 1. Summary
|
|
22
|
+
|
|
23
|
+
What broke (user terms), what fixed it (one sentence). JIRA key, PR, owner. A reader who stops here should have the right answer.
|
|
24
|
+
|
|
25
|
+
### 2. Symptom
|
|
26
|
+
|
|
27
|
+
Concrete: test output, error message, log line. No paraphrase. What was actually observed.
|
|
28
|
+
|
|
29
|
+
### 3. Root Cause
|
|
30
|
+
|
|
31
|
+
The actual bug mechanism. Code identifiers welcome — function names, file paths, branch conditions. Walk the cause chain end-to-end.
|
|
32
|
+
|
|
33
|
+
### 4. Why It Produced the Symptom
|
|
34
|
+
|
|
35
|
+
Walk the chain so reader connects symptom to cause. Often non-obvious — bug is in X but visible failure is in Y.
|
|
36
|
+
|
|
37
|
+
### 5. Fix
|
|
38
|
+
|
|
39
|
+
What changed and why this addresses root cause. Link to PR/commit. If a previous fix attempt papered over the symptom, name it and explain what was wrong.
|
|
40
|
+
|
|
41
|
+
### 6. How It Was Found
|
|
42
|
+
|
|
43
|
+
Short. The debugging path:
|
|
44
|
+
|
|
45
|
+
- What repro made it deterministic
|
|
46
|
+
- What tools cracked it
|
|
47
|
+
- Hypotheses tried and rejected (with one-line reason each)
|
|
48
|
+
- The single experiment that confirmed the cause
|
|
49
|
+
|
|
50
|
+
### 7. Why It Slipped Through
|
|
51
|
+
|
|
52
|
+
CI gap? Latent code? Workload gap? Incomplete prior fix? Review miss? Be specific.
|
|
53
|
+
|
|
54
|
+
If honest answer is "no good reason" — say so. **Blameless** — describe the gap, not the person.
|
|
55
|
+
|
|
56
|
+
### 8. Validation
|
|
57
|
+
|
|
58
|
+
How we know the fix works:
|
|
59
|
+
|
|
60
|
+
- Original failing test now passes (test name)
|
|
61
|
+
- Customer workload now completes (workload identifier)
|
|
62
|
+
- Other affected configs/workloads also tested
|
|
63
|
+
|
|
64
|
+
If only one config validated, say so explicitly.
|
|
65
|
+
|
|
66
|
+
### 9. Action Items
|
|
67
|
+
|
|
68
|
+
What + owner + tracking artifact:
|
|
69
|
+
|
|
70
|
+
- Regression test added at <seam>. (Owner, test name)
|
|
71
|
+
- CI gap closed: <new check>. (Owner, ticket)
|
|
72
|
+
- Doc/runbook updated. (Owner, link)
|
|
73
|
+
|
|
74
|
+
If none needed: "None — fix is sufficient and no class-of-bug follow-up warranted."
|
|
75
|
+
|
|
76
|
+
## Tone
|
|
77
|
+
|
|
78
|
+
This is engineer-to-engineer:
|
|
79
|
+
|
|
80
|
+
- **Code identifiers are first-class.** Keep them — future engineers grep their way back.
|
|
81
|
+
- **Mechanism over narrative.** Walk the cause chain, don't soften.
|
|
82
|
+
- **Blameless.** Describe gaps and bugs, never people.
|
|
83
|
+
- **No hedging.** State it or don't write it.
|
|
84
|
+
|
|
85
|
+
## Rules
|
|
86
|
+
|
|
87
|
+
- Never invent facts.
|
|
88
|
+
- Never strip code identifiers (they are the index).
|
|
89
|
+
- State validation coverage honestly.
|
|
90
|
+
- Get sign-off before posting to JIRA.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: safe-bash
|
|
3
|
-
description: Safe shell-command workflow. Use
|
|
3
|
+
description: "Safe shell-command workflow. Use when executing shell commands, prefer read-only, avoid destructive actions. Triggers: run this command, execute bash, safe bash, avoid rm, destructive command, shell injection."
|
|
4
4
|
---
|
|
5
5
|
|
|
6
6
|
# safe-bash
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scrutinize
|
|
3
|
+
description: "Outsider-perspective review questioning intent before tracing code. Use when asking: should this even exist?, is there a simpler way?, get a second opinion, before deep code review. Triggers: scrutinize this, question this, is there a better way?, simplify this, overkill?, too complex."
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Scrutinize
|
|
7
|
+
|
|
8
|
+
Stand outside the change and ask whether it should exist at all, then verify it actually does what it claims end-to-end.
|
|
9
|
+
|
|
10
|
+
## Operating Stance
|
|
11
|
+
|
|
12
|
+
- **Outsider.** Forget who wrote it and why they think it's right. Read the artifact cold.
|
|
13
|
+
- **End-to-end, not diff-local.** The diff is the entry point, not the scope.
|
|
14
|
+
- **Actionable, concise, with rationale.** Every finding states what to change, why, and what evidence led you there.
|
|
15
|
+
|
|
16
|
+
## Workflow
|
|
17
|
+
|
|
18
|
+
### 1. Intent — Is this necessary?
|
|
19
|
+
|
|
20
|
+
- State the goal in one sentence, in your own words. If you cannot, the artifact is underspecified — say so and stop.
|
|
21
|
+
- Ask: **Is there a simpler way?**
|
|
22
|
+
- Delete it or do nothing (is the problem real and load-bearing?)
|
|
23
|
+
- Use existing code (does this already exist?)
|
|
24
|
+
- Smaller change (solves 90% of goal with 10% of risk?)
|
|
25
|
+
- Different layer (config vs code, framework vs app, build vs runtime?)
|
|
26
|
+
- If a better alternative exists, name it BEFORE the line-by-line review.
|
|
27
|
+
|
|
28
|
+
### 2. Trace — Walk the actual code path
|
|
29
|
+
|
|
30
|
+
- For each behavior the change claims, trace end-to-end through real code — not just the lines in the diff.
|
|
31
|
+
- Include unchanged code on either side of the diff. Bugs hide at the seams.
|
|
32
|
+
- Entry point → call sites → branches taken → state mutated → exit/return/side effect.
|
|
33
|
+
|
|
34
|
+
### 3. Verify — Does it do what it claims?
|
|
35
|
+
|
|
36
|
+
- Does the traced code actually produce the behavior?
|
|
37
|
+
- What inputs/states would break it? (Edge cases, concurrent callers, error paths, partial failures, retries, empty/null/unicode/huge inputs)
|
|
38
|
+
- What does it silently change? (Performance, error semantics, observability, contracts)
|
|
39
|
+
- How is it tested? (Do tests exercise the traced path, or pass while skipping it?)
|
|
40
|
+
|
|
41
|
+
### 4. Report
|
|
42
|
+
|
|
43
|
+
Format per finding:
|
|
44
|
+
|
|
45
|
+
```text
|
|
46
|
+
[severity] file:line
|
|
47
|
+
Issue: ...
|
|
48
|
+
Impact: ...
|
|
49
|
+
Fix: ...
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Severity:
|
|
53
|
+
|
|
54
|
+
- critical: data loss, secret leak, arbitrary command/path escape
|
|
55
|
+
- high: broken core workflow, ownership bypass
|
|
56
|
+
- medium: regression, flaky behavior
|
|
57
|
+
- low: polish, maintainability
|
|
58
|
+
|
|
59
|
+
Close with verdict: **ship / fix-then-ship / rework / reject** — with single biggest reason.
|
|
60
|
+
|
|
61
|
+
## Rules
|
|
62
|
+
|
|
63
|
+
- **No rubber-stamps.** "LGTM" is not an output. If nothing found, say what you traced.
|
|
64
|
+
- **Cite or it didn't happen.** Every claim needs specific path/file/line.
|
|
65
|
+
- **One simpler-alternative pass is MANDATORY.** Skip only if user says "don't question scope."
|
|
66
|
+
- **Distinguish claim from verification.** "The PR says X" and "I traced X and confirmed" are different.
|
|
67
|
+
- **No flattery, no hedging.** State the finding.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: systematic-debugging
|
|
3
|
-
description: Use when encountering a bug, test failure, blocked run, provider error, stale state, crash, or unexpected behavior
|
|
3
|
+
description: "Four-phase debugging discipline with refuse gates. Use when encountering a bug, test failure, blocked run, provider error, stale state, crash, or unexpected behavior. Triggers: debug this, investigate, fix this bug, something is broken, crash, error, test failed, it broke, not working, unexpected."
|
|
4
4
|
---
|
|
5
5
|
|
|
6
6
|
# systematic-debugging
|
|
@@ -9,6 +9,36 @@ Core principle: no fixes without root-cause investigation first. Symptom patches
|
|
|
9
9
|
|
|
10
10
|
Distilled from detailed reads of systematic-debugging, root-cause tracing, TDD, and error-analysis skill patterns.
|
|
11
11
|
|
|
12
|
+
## Invocation — Read Before Debugging
|
|
13
|
+
|
|
14
|
+
Before beginning any debug session, recite these four steps:
|
|
15
|
+
|
|
16
|
+
> **1. First is reproducibility.** Can the issue be reproduced reliably?
|
|
17
|
+
> **2. Know the fail path.** Where does the code break and what stops it from breaking?
|
|
18
|
+
> **3. Question your hypothesis.** What would disprove it?
|
|
19
|
+
> **4. Every run is a breadcrumb.** Cross-reference all of them.
|
|
20
|
+
|
|
21
|
+
If the user says "skip the ritual" → skip the recitation but still apply the four phases silently.
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Refuse Gate — Do NOT Proceed Without These
|
|
26
|
+
|
|
27
|
+
Before proposing ANY fix:
|
|
28
|
+
|
|
29
|
+
- [ ] **Can you reproduce the issue reliably?** (deterministic or >50% flake rate)
|
|
30
|
+
- [ ] **Do you know the root cause?** (confirmed mechanism, not a hypothesis)
|
|
31
|
+
- [ ] **Have you tried to FALSIFY your hypothesis first?** (disproof before proof)
|
|
32
|
+
|
|
33
|
+
If ANY answer is NO:
|
|
34
|
+
→ Stop.
|
|
35
|
+
→ State what's missing.
|
|
36
|
+
→ Do not propose a fix.
|
|
37
|
+
|
|
38
|
+
Exception: if the user explicitly says "just patch the symptom" — proceed but flag it as a symptom patch, not a root-cause fix.
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
12
42
|
## Four Phases
|
|
13
43
|
|
|
14
44
|
### 1. Root Cause Investigation
|
|
@@ -34,9 +64,14 @@ user/tool params → config resolution → team/workflow/agent discovery → mod
|
|
|
34
64
|
- Identify dependencies: config home, project root markers, env vars, locks, stale caches, provider model capabilities.
|
|
35
65
|
- Do not assume small differences are irrelevant.
|
|
36
66
|
|
|
37
|
-
### 3. Hypothesis and Test
|
|
67
|
+
### 3. Hypothesis and Test — Falsify First
|
|
38
68
|
|
|
39
|
-
- State one hypothesis:
|
|
69
|
+
- State each hypothesis explicitly: "I think X is the root cause because Y."
|
|
70
|
+
- Generate 3-5 ranked hypotheses, not one. Single-hypothesis thinking anchors on the first plausible idea.
|
|
71
|
+
- For each hypothesis:
|
|
72
|
+
- What is the simplest **proof**? What is the cleanest **disproof**?
|
|
73
|
+
- Run the **disproof FIRST**. If the hypothesis survives, it's real. If it dies, you saved time chasing a phantom.
|
|
74
|
+
- Does it explain the symptom end-to-end? Walk it through.
|
|
40
75
|
- Test one variable at a time with the smallest read-only probe or targeted test.
|
|
41
76
|
- If wrong, discard the hypothesis instead of piling on fixes.
|
|
42
77
|
- After three failed fixes, question architecture or assumptions before continuing.
|
|
@@ -45,7 +80,7 @@ user/tool params → config resolution → team/workflow/agent discovery → mod
|
|
|
45
80
|
|
|
46
81
|
- Add or identify a failing regression test when practical.
|
|
47
82
|
- Fix the root cause, not the symptom.
|
|
48
|
-
- Avoid
|
|
83
|
+
- Avoid "while I'm here" refactors.
|
|
49
84
|
- Verify targeted behavior, then broader gates.
|
|
50
85
|
|
|
51
86
|
## Evidence to Collect
|
|
@@ -60,8 +95,28 @@ user/tool params → config resolution → team/workflow/agent discovery → mod
|
|
|
60
95
|
|
|
61
96
|
## Anti-patterns
|
|
62
97
|
|
|
63
|
-
-
|
|
98
|
+
- Proposing a fix before reproducing (the refuse gate exists for a reason).
|
|
99
|
+
- Running proof experiments before disproof (disproof first saves time).
|
|
100
|
+
- Trusting a single passing run as validation (check against all prior breadcrumbs).
|
|
64
101
|
- Assuming real user global config cannot pollute tests.
|
|
65
102
|
- Treating provider errors as only transient network failures.
|
|
66
103
|
- Removing guards because they reveal a blocked state.
|
|
67
104
|
- Editing unrelated layers before checking the hypothesis.
|
|
105
|
+
|
|
106
|
+
## Breadcrumb Ledger
|
|
107
|
+
|
|
108
|
+
Maintain a running ledger of every experiment in this session. Each entry:
|
|
109
|
+
|
|
110
|
+
| # | What Changed | What Happened | Ruled In/Out |
|
|
111
|
+
|---|-------------|--------------|-------------|
|
|
112
|
+
| 1 | Added `[DBG-001]` probe | Got `[output]` | Hypothesis A ruled out |
|
|
113
|
+
| 2 | Changed X to Y | Same error persists | Not X |
|
|
114
|
+
| 3 | Checked Z config | Found mismatch | Z is contributing |
|
|
115
|
+
|
|
116
|
+
When a new hypothesis surfaces, walk the ledger:
|
|
117
|
+
- Does it hold for **every** prior observation?
|
|
118
|
+
- If any past run contradicts it, the hypothesis is wrong or incomplete.
|
|
119
|
+
|
|
120
|
+
When in doubt, design the **single experiment** whose outcome makes it certain — run that next.
|
|
121
|
+
|
|
122
|
+
Update the ledger after every run. It is your memory across the session.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: verification-before-done
|
|
3
|
-
description:
|
|
3
|
+
description: "Evidence before claims. Use before claiming work is complete, fixed, passing, reviewed, committed, or ready to hand off. Triggers: done, fixed, complete, ready to merge, can I close, is it working, verify this, check if it passes, all good, LGTM, ready to ship."
|
|
4
4
|
---
|
|
5
5
|
|
|
6
6
|
# verification-before-done
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: workspace-isolation
|
|
3
|
-
description: Workspace isolation boundaries
|
|
3
|
+
description: "Workspace isolation boundaries. Use when ensuring agents from workspace A cannot access workspace B, or worktree-based parallel execution. Triggers: workspace isolation, cross-workspace access, escape boundary, worktree safety."
|
|
4
4
|
---
|
|
5
5
|
|
|
6
6
|
# workspace-isolation
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Integration check: validates pi-crew core discovery and team-run functionality.
|
|
3
|
+
* Run with: node --experimental-strip-types --test test-integration-check.ts
|
|
4
|
+
*/
|
|
5
|
+
import * as fs from "node:fs";
|
|
6
|
+
import * as os from "node:os";
|
|
7
|
+
import * as path from "node:path";
|
|
8
|
+
import test from "node:test";
|
|
9
|
+
import assert from "node:assert/strict";
|
|
10
|
+
|
|
11
|
+
import { discoverAgents, allAgents } from "./src/agents/discover-agents.ts";
|
|
12
|
+
import { discoverTeams, allTeams } from "./src/teams/discover-teams.ts";
|
|
13
|
+
import { discoverWorkflows, allWorkflows } from "./src/workflows/discover-workflows.ts";
|
|
14
|
+
import { handleTeamTool } from "./src/extension/team-tool.ts";
|
|
15
|
+
import { loadRunManifestById } from "./src/state/state-store.ts";
|
|
16
|
+
|
|
17
|
+
const pkgRoot = path.resolve(import.meta.dirname ?? ".");
|
|
18
|
+
|
|
19
|
+
// ── Discovery tests ──────────────────────────────────────────────────────
|
|
20
|
+
|
|
21
|
+
test("discovers builtin agents", () => {
|
|
22
|
+
const discovery = discoverAgents(pkgRoot);
|
|
23
|
+
assert.ok(discovery, "discoverAgents should return a result");
|
|
24
|
+
assert.ok(
|
|
25
|
+
discovery.builtin.length >= 10,
|
|
26
|
+
`Expected ≥10 builtin agents, got ${discovery.builtin.length}`,
|
|
27
|
+
);
|
|
28
|
+
const all = allAgents(discovery);
|
|
29
|
+
const names = all.map((a) => a.name);
|
|
30
|
+
assert.ok(names.includes("executor"), `Missing "executor" agent. Got: ${names.join(", ")}`);
|
|
31
|
+
});
|
|
32
|
+
|
|
33
|
+
test("discovers builtin teams", () => {
|
|
34
|
+
const discovery = discoverTeams(pkgRoot);
|
|
35
|
+
assert.ok(discovery, "discoverTeams should return a result");
|
|
36
|
+
assert.ok(
|
|
37
|
+
discovery.builtin.length >= 6,
|
|
38
|
+
`Expected ≥6 builtin teams, got ${discovery.builtin.length}`,
|
|
39
|
+
);
|
|
40
|
+
const all = allTeams(discovery);
|
|
41
|
+
const names = all.map((t) => t.name);
|
|
42
|
+
assert.ok(names.includes("fast-fix"), `Missing "fast-fix" team. Got: ${names.join(", ")}`);
|
|
43
|
+
});
|
|
44
|
+
|
|
45
|
+
test("discovers builtin workflows", () => {
|
|
46
|
+
const discovery = discoverWorkflows(pkgRoot);
|
|
47
|
+
assert.ok(discovery, "discoverWorkflows should return a result");
|
|
48
|
+
assert.ok(
|
|
49
|
+
discovery.builtin.length >= 6,
|
|
50
|
+
`Expected ≥6 builtin workflows, got ${discovery.builtin.length}`,
|
|
51
|
+
);
|
|
52
|
+
const all = allWorkflows(discovery);
|
|
53
|
+
const names = all.map((w) => w.name);
|
|
54
|
+
assert.ok(
|
|
55
|
+
names.includes("fast-fix"),
|
|
56
|
+
`Missing "fast-fix" workflow. Got: ${names.join(", ")}`,
|
|
57
|
+
);
|
|
58
|
+
});
|
|
59
|
+
|
|
60
|
+
// ── Team run test ─────────────────────────────────────────────────────────
|
|
61
|
+
|
|
62
|
+
test("fast-fix team run completes successfully with mock child Pi", async () => {
|
|
63
|
+
const cwd = fs.mkdtempSync(path.join(os.tmpdir(), "pi-crew-int-check-"));
|
|
64
|
+
fs.mkdirSync(path.join(cwd, ".crew"), { recursive: true });
|
|
65
|
+
|
|
66
|
+
const prevExec = process.env.PI_TEAMS_EXECUTE_WORKERS;
|
|
67
|
+
const prevMock = process.env.PI_TEAMS_MOCK_CHILD_PI;
|
|
68
|
+
process.env.PI_TEAMS_EXECUTE_WORKERS = "1";
|
|
69
|
+
process.env.PI_TEAMS_MOCK_CHILD_PI = "success";
|
|
70
|
+
|
|
71
|
+
try {
|
|
72
|
+
const run = await handleTeamTool(
|
|
73
|
+
{ action: "run", team: "fast-fix", goal: "create a hello.txt file" },
|
|
74
|
+
{ cwd },
|
|
75
|
+
);
|
|
76
|
+
|
|
77
|
+
// run result is not an error
|
|
78
|
+
assert.equal(run.isError, false, `handleTeamTool returned error: ${JSON.stringify(run)}`);
|
|
79
|
+
|
|
80
|
+
const runId = run.details.runId;
|
|
81
|
+
assert.ok(runId, "Expected a runId in details");
|
|
82
|
+
|
|
83
|
+
// manifest should be persisted and completed
|
|
84
|
+
const loaded = loadRunManifestById(cwd, runId!);
|
|
85
|
+
assert.ok(loaded, "loadRunManifestById should return data");
|
|
86
|
+
assert.equal(
|
|
87
|
+
loaded!.manifest.status,
|
|
88
|
+
"completed",
|
|
89
|
+
`Expected manifest status "completed", got "${loaded!.manifest.status}"`,
|
|
90
|
+
);
|
|
91
|
+
|
|
92
|
+
// all tasks should be completed
|
|
93
|
+
const taskStatuses = loaded!.tasks.map((t) => t.status);
|
|
94
|
+
assert.ok(
|
|
95
|
+
taskStatuses.every((s) => s === "completed"),
|
|
96
|
+
`Not all tasks completed: ${JSON.stringify(taskStatuses)}`,
|
|
97
|
+
);
|
|
98
|
+
|
|
99
|
+
// artifacts directory should exist
|
|
100
|
+
const artifactsDir = path.join(cwd, ".crew", "artifacts", runId!);
|
|
101
|
+
assert.ok(
|
|
102
|
+
fs.existsSync(artifactsDir),
|
|
103
|
+
`Artifacts directory should exist: ${artifactsDir}`,
|
|
104
|
+
);
|
|
105
|
+
|
|
106
|
+
console.log(`✅ fast-fix run ${runId} completed successfully with ${loaded!.tasks.length} tasks`);
|
|
107
|
+
} finally {
|
|
108
|
+
if (prevExec === undefined) delete process.env.PI_TEAMS_EXECUTE_WORKERS;
|
|
109
|
+
else process.env.PI_TEAMS_EXECUTE_WORKERS = prevExec;
|
|
110
|
+
if (prevMock === undefined) delete process.env.PI_TEAMS_MOCK_CHILD_PI;
|
|
111
|
+
else process.env.PI_TEAMS_MOCK_CHILD_PI = prevMock;
|
|
112
|
+
fs.rmSync(cwd, { recursive: true, force: true });
|
|
113
|
+
}
|
|
114
|
+
});
|