pi-crew 0.3.8 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +23 -0
- package/package.json +1 -1
- package/skills/REFERENCE.md +136 -0
- package/skills/delegation-patterns/SKILL.md +1 -1
- package/skills/event-log-tracing/SKILL.md +1 -1
- package/skills/multi-perspective-review/SKILL.md +17 -1
- package/skills/orchestration/SKILL.md +1 -1
- package/skills/post-mortem/SKILL.md +90 -0
- package/skills/safe-bash/SKILL.md +1 -1
- package/skills/scrutinize/SKILL.md +67 -0
- package/skills/systematic-debugging/SKILL.md +60 -5
- package/skills/verification-before-done/SKILL.md +1 -1
- package/skills/workspace-isolation/SKILL.md +1 -1
- package/src/agents/discover-agents.ts +352 -14
- package/src/runtime/skill-instructions.ts +8 -2
- package/src/runtime/task-packet.ts +48 -1
- package/test-integration-check.ts +114 -0
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,28 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.4.0] — 9arm-skills Enforcement Patterns & Integration Tests (2026-05-26)
|
|
4
|
+
|
|
5
|
+
### Features
|
|
6
|
+
- **systematic-debugging: Refuse Gate** — Hard constraints before proposing fixes. Must verify that a repro exists, the root cause is known, and an attempt to falsify the hypothesis has been made before proposing any fix.
|
|
7
|
+
- **systematic-debugging: Recite Ritual** — Psychological anchor at session start. Recite 4-step mantra before beginning any debug session.
|
|
8
|
+
- **systematic-debugging: Falsify-First** — Phase 3 now requires disproof before proof. Run disproof experiments first to save time on wrong hypotheses.
|
|
9
|
+
- **systematic-debugging: Breadcrumb Ledger** — Structured experiment tracking within debug sessions.
|
|
10
|
+
- **multi-perspective-review: Simpler Alternative Pass** — Mandatory pre-review step to question if the change should exist at all.
|
|
11
|
+
- **New skill: scrutinize** — Outsider-perspective review questioning intent before tracing code.
|
|
12
|
+
- **New skill: post-mortem** — Engineering RCA documentation with 4 required inputs gate.
|
|
13
|
+
- **skills/REFERENCE.md** — New documentation of skill chains, inventory, and anti-patterns.
|
|
14
|
+
- **Trigger conditions** added to all major skill descriptions for better skill invocation matching.
|
|
15
|
+
|
|
16
|
+
### Bug Fixes
|
|
17
|
+
- **CI reliability** — Fixed flaky tests on macOS: crew-widget and render-scheduler timing issues resolved.
|
|
18
|
+
- **Team-context import detection** — Fixed regex to correctly match only direct `/team-tool.ts` imports, not `/team-tool/context.ts`.
|
|
19
|
+
|
|
20
|
+
### Tests
|
|
21
|
+
- **New test-integration-check.ts** — Integration tests for core pi-crew functionality (agent/team/workflow discovery, fast-fix team run).
|
|
22
|
+
- **1740 tests passing** across all platforms (Ubuntu, macOS, Windows).
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
3
26
|
## [0.3.8] — Zombie Run Auto-Repair & Test Stability (2026-05-25)
|
|
4
27
|
|
|
5
28
|
### Features
|
package/package.json
CHANGED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
# pi-crew Skills Reference
|
|
2
|
+
|
|
3
|
+
## Skill Chains
|
|
4
|
+
|
|
5
|
+
### Bug Investigation
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
systematic-debugging (4 phases with refuse gate)
|
|
9
|
+
↓
|
|
10
|
+
verification-before-done (evidence before claim)
|
|
11
|
+
↓
|
|
12
|
+
post-mortem (RCA documentation)
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
### Multi-phase Work
|
|
16
|
+
|
|
17
|
+
```
|
|
18
|
+
orchestration (phase coordination)
|
|
19
|
+
↓
|
|
20
|
+
delegation-patterns (task splitting)
|
|
21
|
+
↓
|
|
22
|
+
verification-before-done (after each phase)
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
### Code Review (Quick)
|
|
26
|
+
|
|
27
|
+
```
|
|
28
|
+
scrutinize (outsider perspective + simpler alternative)
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
### Code Review (Deep)
|
|
32
|
+
|
|
33
|
+
```
|
|
34
|
+
scrutinize (outsider perspective)
|
|
35
|
+
↓
|
|
36
|
+
multi-perspective-review (8-pass deep review)
|
|
37
|
+
↓
|
|
38
|
+
secure-agent-orchestration-review (security focus)
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## When to Invoke
|
|
44
|
+
|
|
45
|
+
| Situation | Skill |
|
|
46
|
+
|-----------|-------|
|
|
47
|
+
| Bug / test failure / crash | `systematic-debugging` |
|
|
48
|
+
| Before claiming done | `verification-before-done` |
|
|
49
|
+
| Code review (quick) | `scrutinize` |
|
|
50
|
+
| Code review (deep) | `multi-perspective-review` |
|
|
51
|
+
| Task delegation | `delegation-patterns` |
|
|
52
|
+
| Complex multi-phase work | `orchestration` |
|
|
53
|
+
| After bug is fixed | `post-mortem` |
|
|
54
|
+
| Security review | `secure-agent-orchestration-review` |
|
|
55
|
+
| Workspace safety | `workspace-isolation` |
|
|
56
|
+
| Bash safety | `safe-bash` |
|
|
57
|
+
|
|
58
|
+
---
|
|
59
|
+
|
|
60
|
+
## Skills Inventory
|
|
61
|
+
|
|
62
|
+
### Core Discipline
|
|
63
|
+
|
|
64
|
+
| Skill | Description |
|
|
65
|
+
|-------|-------------|
|
|
66
|
+
| `systematic-debugging` | Four-phase debugging with refuse gates, falsify-first discipline |
|
|
67
|
+
| `verification-before-done` | Evidence before claims |
|
|
68
|
+
| `orchestration` | Multi-phase coordination, 8 rules including "respawn not absorb" |
|
|
69
|
+
|
|
70
|
+
### Review
|
|
71
|
+
|
|
72
|
+
| Skill | Description |
|
|
73
|
+
|-------|-------------|
|
|
74
|
+
| `scrutinize` | Outsider-perspective review questioning intent |
|
|
75
|
+
| `multi-perspective-review` | 8-pass deep code review |
|
|
76
|
+
| `secure-agent-orchestration-review` | Security-focused review |
|
|
77
|
+
|
|
78
|
+
### Documentation
|
|
79
|
+
|
|
80
|
+
| Skill | Description |
|
|
81
|
+
|-------|-------------|
|
|
82
|
+
| `post-mortem` | Engineering RCA record |
|
|
83
|
+
|
|
84
|
+
### Delegation
|
|
85
|
+
|
|
86
|
+
| Skill | Description |
|
|
87
|
+
|-------|-------------|
|
|
88
|
+
| `delegation-patterns` | Task splitting patterns |
|
|
89
|
+
| `requirements-to-task-packet` | Task packet creation |
|
|
90
|
+
|
|
91
|
+
### Runtime/Safety
|
|
92
|
+
|
|
93
|
+
| Skill | Description |
|
|
94
|
+
|-------|-------------|
|
|
95
|
+
| `workspace-isolation` | Security boundary enforcement |
|
|
96
|
+
| `worktree-isolation` | Git worktree safety |
|
|
97
|
+
| `safe-bash` | Bash command safety |
|
|
98
|
+
| `state-mutation-locking` | State mutation protection |
|
|
99
|
+
|
|
100
|
+
### Observability
|
|
101
|
+
|
|
102
|
+
| Skill | Description |
|
|
103
|
+
|-------|-------------|
|
|
104
|
+
| `event-log-tracing` | JSONL event log analysis |
|
|
105
|
+
| `runtime-state-reader` | Runtime state inspection |
|
|
106
|
+
| `observability-reliability` | Reliability patterns |
|
|
107
|
+
|
|
108
|
+
---
|
|
109
|
+
|
|
110
|
+
## Anti-patterns
|
|
111
|
+
|
|
112
|
+
| Anti-pattern | Skill | Rule |
|
|
113
|
+
|--------------|-------|------|
|
|
114
|
+
| Proposing fix before reproducing | `systematic-debugging` | Refuse Gate |
|
|
115
|
+
| Running proof before disproof | `systematic-debugging` | Phase 3 |
|
|
116
|
+
| Claiming "tests pass" without fresh run | `verification-before-done` | Gate Function |
|
|
117
|
+
| Reviewing diff-local without tracing path | `scrutinize` | Trace step |
|
|
118
|
+
| Skipping simpler-alternative pass | `multi-perspective-review` | Pre-review |
|
|
119
|
+
| Editing files yourself as orchestrator | `orchestration` | Rule 1 |
|
|
120
|
+
| Dispatching serially when parallel possible | `orchestration` | Rule 3 |
|
|
121
|
+
| Committing a red tree | `orchestration` | Rule 6 |
|
|
122
|
+
| Absorbing subagent's broken work | `orchestration` | Rule 7 |
|
|
123
|
+
| Rubber-stamp review | `multi-perspective-review` | Rules |
|
|
124
|
+
|
|
125
|
+
---
|
|
126
|
+
|
|
127
|
+
## Key Enforcement Patterns (from 9arm)
|
|
128
|
+
|
|
129
|
+
| Pattern | Implemented In |
|
|
130
|
+
|---------|---------------|
|
|
131
|
+
| **Refuse Gate** | `systematic-debugging` |
|
|
132
|
+
| **Recite Ritual** | `systematic-debugging` (Invocation) |
|
|
133
|
+
| **Falsify Before Proof** | `systematic-debugging` (Phase 3) |
|
|
134
|
+
| **Simpler Alternative Pass** | `scrutinize`, `multi-perspective-review` |
|
|
135
|
+
| **Required Inputs Gate** | `post-mortem` |
|
|
136
|
+
| **Respawn Not Absorb** | `orchestration` (Rule 7) |
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: delegation-patterns
|
|
3
|
-
description: Subagent/team delegation workflow. Use when splitting work across pi-crew teams, direct agents, async background workers, chains, or parallel
|
|
3
|
+
description: "Subagent/team delegation workflow. Use when splitting work across pi-crew teams, direct agents, async background workers, chains, or parallel tasks. Triggers: delegate this, split this task, parallelize, dispatch workers, assign to team, spawn agents."
|
|
4
4
|
---
|
|
5
5
|
|
|
6
6
|
# delegation-patterns
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: event-log-tracing
|
|
3
|
-
description: Structured event logging
|
|
3
|
+
description: "Structured event logging for worker lifecycle, live agents, crash recovery. Use when debugging crashes, tracing agent lifecycle, investigating stale runs. Triggers: event log, trace events, worker crashed, agent died, stale run, events.jsonl."
|
|
4
4
|
---
|
|
5
5
|
|
|
6
6
|
# event-log-tracing
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: multi-perspective-review
|
|
3
|
-
description: Use when reviewing a plan, diff, implementation, worker output, release candidate, or external review
|
|
3
|
+
description: "Multi-perspective code review with simpler-alternative pass. Use when reviewing a plan, diff, implementation, worker output, release candidate, or external feedback. Triggers: review this, look at this, LGTM check, sanity check, audit this, get a second opinion, check this PR, examine this code."
|
|
4
4
|
---
|
|
5
5
|
|
|
6
6
|
# multi-perspective-review
|
|
@@ -9,6 +9,22 @@ Core principle: review early, review often, and separate concerns. Reviewer outp
|
|
|
9
9
|
|
|
10
10
|
Distilled from detailed reads of requesting-code-review, receiving-code-review, subagent review checkpoints, differential review, and specialized review-agent patterns.
|
|
11
11
|
|
|
12
|
+
## Pre-review: Simpler Alternative Pass (Mandatory)
|
|
13
|
+
|
|
14
|
+
Before running any review passes, ask:
|
|
15
|
+
|
|
16
|
+
1. **Is there a simpler, smaller, or more elegant way to achieve the same goal?**
|
|
17
|
+
- Doing nothing (is the problem real and load-bearing?)
|
|
18
|
+
- Using something that already exists in the codebase
|
|
19
|
+
- A smaller change that solves 90% of the goal with 10% of the risk
|
|
20
|
+
- Solving it at a different layer (config vs code, framework vs app)
|
|
21
|
+
2. If a better alternative exists, surface it BEFORE the line-by-line review.
|
|
22
|
+
3. Skip only if the user explicitly says "don't question scope."
|
|
23
|
+
|
|
24
|
+
This is the most valuable finding you can produce — surfacing unnecessary complexity before reviewing its details.
|
|
25
|
+
|
|
26
|
+
---
|
|
27
|
+
|
|
12
28
|
## Review Passes
|
|
13
29
|
|
|
14
30
|
Run relevant passes separately:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: orchestration
|
|
3
|
-
description: Multi-phase orchestration
|
|
3
|
+
description: "Multi-phase orchestration for planners and executors. Use when decomposing complex tasks into parallel phases, dispatching workers, verifying gates, and iterating to closure. Triggers: orchestrate this, coordinate these tasks, run this multi-phase, dispatch workers, coordinate team."
|
|
4
4
|
---
|
|
5
5
|
|
|
6
6
|
# orchestration
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: post-mortem
|
|
3
|
+
description: "Write engineering RCA record after bug is fixed. Use when asking: write post-mortem, RCA, root cause analysis, document this fix, close out this bug. Triggers: post-mortem, postmortem, root cause, RCA, document this fix, write up the cause, close out bug."
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# post-mortem
|
|
7
|
+
|
|
8
|
+
The canonical engineering record of a bug fix. Written after debugging lands a real fix.
|
|
9
|
+
|
|
10
|
+
## Required Inputs — Refuse to Draft Without These
|
|
11
|
+
|
|
12
|
+
- [ ] **Reliable repro exists** (deterministic or high-rate flake)
|
|
13
|
+
- [ ] **Root cause is known** (mechanism identified, not a hypothesis)
|
|
14
|
+
- [ ] **Fix is identified** (PR / commit / branch)
|
|
15
|
+
- [ ] **Fix is validated** (original repro now passes)
|
|
16
|
+
|
|
17
|
+
If any are missing → list what's missing and stop. Do not draft.
|
|
18
|
+
|
|
19
|
+
## Structure
|
|
20
|
+
|
|
21
|
+
### 1. Summary
|
|
22
|
+
|
|
23
|
+
What broke (user terms), what fixed it (one sentence). JIRA key, PR, owner. A reader who stops here should have the right answer.
|
|
24
|
+
|
|
25
|
+
### 2. Symptom
|
|
26
|
+
|
|
27
|
+
Concrete: test output, error message, log line. No paraphrase. What was actually observed.
|
|
28
|
+
|
|
29
|
+
### 3. Root Cause
|
|
30
|
+
|
|
31
|
+
The actual bug mechanism. Code identifiers welcome — function names, file paths, branch conditions. Walk the cause chain end-to-end.
|
|
32
|
+
|
|
33
|
+
### 4. Why It Produced the Symptom
|
|
34
|
+
|
|
35
|
+
Walk the chain so reader connects symptom to cause. Often non-obvious — bug is in X but visible failure is in Y.
|
|
36
|
+
|
|
37
|
+
### 5. Fix
|
|
38
|
+
|
|
39
|
+
What changed and why this addresses root cause. Link to PR/commit. If a previous fix attempt papered over the symptom, name it and explain what was wrong.
|
|
40
|
+
|
|
41
|
+
### 6. How It Was Found
|
|
42
|
+
|
|
43
|
+
Short. The debugging path:
|
|
44
|
+
|
|
45
|
+
- What repro made it deterministic
|
|
46
|
+
- What tools cracked it
|
|
47
|
+
- Hypotheses tried and rejected (with one-line reason each)
|
|
48
|
+
- The single experiment that confirmed the cause
|
|
49
|
+
|
|
50
|
+
### 7. Why It Slipped Through
|
|
51
|
+
|
|
52
|
+
CI gap? Latent code? Workload gap? Incomplete prior fix? Review miss? Be specific.
|
|
53
|
+
|
|
54
|
+
If honest answer is "no good reason" — say so. **Blameless** — describe the gap, not the person.
|
|
55
|
+
|
|
56
|
+
### 8. Validation
|
|
57
|
+
|
|
58
|
+
How we know the fix works:
|
|
59
|
+
|
|
60
|
+
- Original failing test now passes (test name)
|
|
61
|
+
- Customer workload now completes (workload identifier)
|
|
62
|
+
- Other affected configs/workloads also tested
|
|
63
|
+
|
|
64
|
+
If only one config validated, say so explicitly.
|
|
65
|
+
|
|
66
|
+
### 9. Action Items
|
|
67
|
+
|
|
68
|
+
What + owner + tracking artifact:
|
|
69
|
+
|
|
70
|
+
- Regression test added at <seam>. (Owner, test name)
|
|
71
|
+
- CI gap closed: <new check>. (Owner, ticket)
|
|
72
|
+
- Doc/runbook updated. (Owner, link)
|
|
73
|
+
|
|
74
|
+
If none needed: "None — fix is sufficient and no class-of-bug follow-up warranted."
|
|
75
|
+
|
|
76
|
+
## Tone
|
|
77
|
+
|
|
78
|
+
This is engineer-to-engineer:
|
|
79
|
+
|
|
80
|
+
- **Code identifiers are first-class.** Keep them — future engineers grep their way back.
|
|
81
|
+
- **Mechanism over narrative.** Walk the cause chain, don't soften.
|
|
82
|
+
- **Blameless.** Describe gaps and bugs, never people.
|
|
83
|
+
- **No hedging.** State it or don't write it.
|
|
84
|
+
|
|
85
|
+
## Rules
|
|
86
|
+
|
|
87
|
+
- Never invent facts.
|
|
88
|
+
- Never strip code identifiers (they are the index).
|
|
89
|
+
- State validation coverage honestly.
|
|
90
|
+
- Get sign-off before posting to JIRA.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: safe-bash
|
|
3
|
-
description: Safe shell-command workflow. Use
|
|
3
|
+
description: "Safe shell-command workflow. Use when executing shell commands: prefer read-only commands and avoid destructive actions. Triggers: run this command, execute bash, safe bash, avoid rm, destructive command, shell injection."
|
|
4
4
|
---
|
|
5
5
|
|
|
6
6
|
# safe-bash
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scrutinize
|
|
3
|
+
description: "Outsider-perspective review questioning intent before tracing code. Use when asking: should this even exist?, is there a simpler way?, get a second opinion, before deep code review. Triggers: scrutinize this, question this, is there a better way?, simplify this, overkill?, too complex."
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Scrutinize
|
|
7
|
+
|
|
8
|
+
Stand outside the change and ask whether it should exist at all, then verify it actually does what it claims end-to-end.
|
|
9
|
+
|
|
10
|
+
## Operating Stance
|
|
11
|
+
|
|
12
|
+
- **Outsider.** Forget who wrote it and why they think it's right. Read the artifact cold.
|
|
13
|
+
- **End-to-end, not diff-local.** The diff is the entry point, not the scope.
|
|
14
|
+
- **Actionable, concise, with rationale.** Every finding states what to change, why, and what evidence led you there.
|
|
15
|
+
|
|
16
|
+
## Workflow
|
|
17
|
+
|
|
18
|
+
### 1. Intent — Is this necessary?
|
|
19
|
+
|
|
20
|
+
- State the goal in one sentence, in your own words. If you cannot, the artifact is underspecified — say so and stop.
|
|
21
|
+
- Ask: **Is there a simpler way?**
|
|
22
|
+
- Delete it or do nothing (is the problem real and load-bearing?)
|
|
23
|
+
- Use existing code (does this already exist?)
|
|
24
|
+
- Smaller change (solves 90% of goal with 10% of risk?)
|
|
25
|
+
- Different layer (config vs code, framework vs app, build vs runtime?)
|
|
26
|
+
- If a better alternative exists, name it BEFORE the line-by-line review.
|
|
27
|
+
|
|
28
|
+
### 2. Trace — Walk the actual code path
|
|
29
|
+
|
|
30
|
+
- For each behavior the change claims, trace end-to-end through real code — not just the lines in the diff.
|
|
31
|
+
- Include unchanged code on either side of the diff. Bugs hide at the seams.
|
|
32
|
+
- Entry point → call sites → branches taken → state mutated → exit/return/side effect.
|
|
33
|
+
|
|
34
|
+
### 3. Verify — Does it do what it claims?
|
|
35
|
+
|
|
36
|
+
- Does the traced code actually produce the behavior?
|
|
37
|
+
- What inputs/states would break it? (Edge cases, concurrent callers, error paths, partial failures, retries, empty/null/unicode/huge inputs)
|
|
38
|
+
- What does it silently change? (Performance, error semantics, observability, contracts)
|
|
39
|
+
- How is it tested? (Do tests exercise the traced path, or pass while skipping it?)
|
|
40
|
+
|
|
41
|
+
### 4. Report
|
|
42
|
+
|
|
43
|
+
Format per finding:
|
|
44
|
+
|
|
45
|
+
```text
|
|
46
|
+
[severity] file:line
|
|
47
|
+
Issue: ...
|
|
48
|
+
Impact: ...
|
|
49
|
+
Fix: ...
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Severity:
|
|
53
|
+
|
|
54
|
+
- critical: data loss, secret leak, arbitrary command/path escape
|
|
55
|
+
- high: broken core workflow, ownership bypass
|
|
56
|
+
- medium: regression, flaky behavior
|
|
57
|
+
- low: polish, maintainability
|
|
58
|
+
|
|
59
|
+
Close with verdict: **ship / fix-then-ship / rework / reject** — with single biggest reason.
|
|
60
|
+
|
|
61
|
+
## Rules
|
|
62
|
+
|
|
63
|
+
- **No rubber-stamps.** "LGTM" is not an output. If nothing found, say what you traced.
|
|
64
|
+
- **Cite or it didn't happen.** Every claim needs specific path/file/line.
|
|
65
|
+
- **One simpler-alternative pass is MANDATORY.** Skip only if user says "don't question scope."
|
|
66
|
+
- **Distinguish claim from verification.** "The PR says X" and "I traced X and confirmed" are different.
|
|
67
|
+
- **No flattery, no hedging.** State the finding.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: systematic-debugging
|
|
3
|
-
description: Use when encountering a bug, test failure, blocked run, provider error, stale state, crash, or unexpected behavior
|
|
3
|
+
description: "Four-phase debugging discipline with refuse gates. Use when encountering a bug, test failure, blocked run, provider error, stale state, crash, or unexpected behavior. Triggers: debug this, investigate, fix this bug, something is broken, crash, error, test failed, it broke, not working, unexpected."
|
|
4
4
|
---
|
|
5
5
|
|
|
6
6
|
# systematic-debugging
|
|
@@ -9,6 +9,36 @@ Core principle: no fixes without root-cause investigation first. Symptom patches
|
|
|
9
9
|
|
|
10
10
|
Distilled from detailed reads of systematic-debugging, root-cause tracing, TDD, and error-analysis skill patterns.
|
|
11
11
|
|
|
12
|
+
## Invocation — Read Before Debugging
|
|
13
|
+
|
|
14
|
+
Before beginning any debug session, recite these four steps:
|
|
15
|
+
|
|
16
|
+
> **1. First is reproducibility.** Can the issue be reproduced reliably?
|
|
17
|
+
> **2. Know the fail path.** Where does the code break and what stops it from breaking?
|
|
18
|
+
> **3. Question your hypothesis.** What would disprove it?
|
|
19
|
+
> **4. Every run is a breadcrumb.** Cross-reference all of them.
|
|
20
|
+
|
|
21
|
+
If the user says "skip the ritual" → skip the recitation but still apply the four phases silently.
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Refuse Gate — Do NOT Proceed Without These
|
|
26
|
+
|
|
27
|
+
Before proposing ANY fix:
|
|
28
|
+
|
|
29
|
+
- [ ] **Can you reproduce the issue reliably?** (deterministic or >50% flake rate)
|
|
30
|
+
- [ ] **Do you know the root cause?** (confirmed mechanism, not a hypothesis)
|
|
31
|
+
- [ ] **Have you tried to FALSIFY your hypothesis first?** (disproof before proof)
|
|
32
|
+
|
|
33
|
+
If ANY answer is NO:
|
|
34
|
+
→ Stop.
|
|
35
|
+
→ State what's missing.
|
|
36
|
+
→ Do not propose a fix.
|
|
37
|
+
|
|
38
|
+
Exception: if the user explicitly says "just patch the symptom" — proceed but flag it as a symptom patch, not a root-cause fix.
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
12
42
|
## Four Phases
|
|
13
43
|
|
|
14
44
|
### 1. Root Cause Investigation
|
|
@@ -34,9 +64,14 @@ user/tool params → config resolution → team/workflow/agent discovery → mod
|
|
|
34
64
|
- Identify dependencies: config home, project root markers, env vars, locks, stale caches, provider model capabilities.
|
|
35
65
|
- Do not assume small differences are irrelevant.
|
|
36
66
|
|
|
37
|
-
### 3. Hypothesis and Test
|
|
67
|
+
### 3. Hypothesis and Test — Falsify First
|
|
38
68
|
|
|
39
|
-
- State one hypothesis:
|
|
69
|
+
- State each hypothesis explicitly: "I think X is the root cause because Y."
|
|
70
|
+
- Generate 3-5 ranked hypotheses, not one. Single-hypothesis thinking anchors on the first plausible idea.
|
|
71
|
+
- For each hypothesis:
|
|
72
|
+
- What is the simplest **proof**? What is the cleanest **disproof**?
|
|
73
|
+
- Run the **disproof FIRST**. If the hypothesis survives, it's real. If it dies, you saved time chasing a phantom.
|
|
74
|
+
- Does it explain the symptom end-to-end? Walk it through.
|
|
40
75
|
- Test one variable at a time with the smallest read-only probe or targeted test.
|
|
41
76
|
- If wrong, discard the hypothesis instead of piling on fixes.
|
|
42
77
|
- After three failed fixes, question architecture or assumptions before continuing.
|
|
@@ -45,7 +80,7 @@ user/tool params → config resolution → team/workflow/agent discovery → mod
|
|
|
45
80
|
|
|
46
81
|
- Add or identify a failing regression test when practical.
|
|
47
82
|
- Fix the root cause, not the symptom.
|
|
48
|
-
- Avoid
|
|
83
|
+
- Avoid "while I'm here" refactors.
|
|
49
84
|
- Verify targeted behavior, then broader gates.
|
|
50
85
|
|
|
51
86
|
## Evidence to Collect
|
|
@@ -60,8 +95,28 @@ user/tool params → config resolution → team/workflow/agent discovery → mod
|
|
|
60
95
|
|
|
61
96
|
## Anti-patterns
|
|
62
97
|
|
|
63
|
-
-
|
|
98
|
+
- Proposing a fix before reproducing (the refuse gate exists for a reason).
|
|
99
|
+
- Running proof experiments before disproof (disproof first saves time).
|
|
100
|
+
- Trusting a single passing run as validation (check against all prior breadcrumbs).
|
|
64
101
|
- Assuming real user global config cannot pollute tests.
|
|
65
102
|
- Treating provider errors as only transient network failures.
|
|
66
103
|
- Removing guards because they reveal a blocked state.
|
|
67
104
|
- Editing unrelated layers before checking the hypothesis.
|
|
105
|
+
|
|
106
|
+
## Breadcrumb Ledger
|
|
107
|
+
|
|
108
|
+
Maintain a running ledger of every experiment in this session. Each entry:
|
|
109
|
+
|
|
110
|
+
| # | What Changed | What Happened | Ruled In/Out |
|
|
111
|
+
|---|-------------|--------------|-------------|
|
|
112
|
+
| 1 | Added `[DBG-001]` probe | Got `[output]` | Hypothesis A ruled out |
|
|
113
|
+
| 2 | Changed X to Y | Same error persists | Not X |
|
|
114
|
+
| 3 | Checked Z config | Found mismatch | Z is contributing |
|
|
115
|
+
|
|
116
|
+
When a new hypothesis surfaces, walk the ledger:
|
|
117
|
+
- Does it hold for **every** prior observation?
|
|
118
|
+
- If any past run contradicts it, the hypothesis is wrong or incomplete.
|
|
119
|
+
|
|
120
|
+
When in doubt, design the **single experiment** whose outcome makes it certain — run that next.
|
|
121
|
+
|
|
122
|
+
Update the ledger after every run. It is your memory across the session.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: verification-before-done
|
|
3
|
-
description:
|
|
3
|
+
description: "Evidence before claims. Use before claiming work is complete, fixed, passing, reviewed, committed, or ready to hand off. Triggers: done, fixed, complete, ready to merge, can I close, is it working, verify this, check if it passes, all good, LGTM, ready to ship."
|
|
4
4
|
---
|
|
5
5
|
|
|
6
6
|
# verification-before-done
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: workspace-isolation
|
|
3
|
-
description: Workspace isolation boundaries
|
|
3
|
+
description: "Workspace isolation boundaries. Use when ensuring agents from workspace A cannot access workspace B, or when running worktree-based parallel execution. Triggers: workspace isolation, cross-workspace access, escape boundary, worktree safety."
|
|
4
4
|
---
|
|
5
5
|
|
|
6
6
|
# workspace-isolation
|
|
@@ -6,6 +6,215 @@ import { parseCsv, parseFrontmatter } from "../utils/frontmatter.ts";
|
|
|
6
6
|
import { logInternalError } from "../utils/internal-error.ts";
|
|
7
7
|
import { packageRoot, projectCrewRoot, userPiRoot } from "../utils/paths.ts";
|
|
8
8
|
|
|
9
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
10
|
+
// SEC-001 Fix: Protected Agent Names Blocklist
|
|
11
|
+
// Prevents privilege escalation via agent shadowing attacks.
|
|
12
|
+
// See: SECURITY-ISSUES.md SEC-001
|
|
13
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
14
|
+
|
|
15
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
16
|
+
// SEC-005 Fix: Version-based Cache for Atomic Invalidation
|
|
17
|
+
// Uses a global version counter for atomic cache invalidation instead of
|
|
18
|
+
// relying on TTL alone. This eliminates race conditions where concurrent
|
|
19
|
+
// callers might get stale cached snapshots.
|
|
20
|
+
// See: SECURITY-ISSUES.md SEC-005
|
|
21
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
/** Version counter for atomic cache invalidation. Incremented on every mutation. */
|
|
25
|
+
let cacheVersion = 0;
|
|
26
|
+
|
|
27
|
+
/** Get current cache version. Used for atomic cache stamping. */
|
|
28
|
+
export function getCacheVersion(): number {
|
|
29
|
+
return cacheVersion;
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
/**
|
|
33
|
+
* Increment cache version for atomic invalidation.
|
|
34
|
+
* All cached entries with versions older than this are considered stale.
|
|
35
|
+
*/
|
|
36
|
+
function incrementCacheVersion(): void {
|
|
37
|
+
cacheVersion++;
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
/** Exact match blocklist for protected builtin agent names. */
|
|
41
|
+
const PROTECTED_AGENT_NAMES = new Set([
|
|
42
|
+
"executor",
|
|
43
|
+
"test-engineer",
|
|
44
|
+
"explorer",
|
|
45
|
+
"planner",
|
|
46
|
+
"analyst",
|
|
47
|
+
"critic",
|
|
48
|
+
"reviewer",
|
|
49
|
+
"verifier",
|
|
50
|
+
"writer",
|
|
51
|
+
"security-reviewer",
|
|
52
|
+
]);
|
|
53
|
+
|
|
54
|
+
/**
|
|
55
|
+
* Pattern blocklist for agent names that would likely confuse or deceive
|
|
56
|
+
* workflows looking for builtin agents.
|
|
57
|
+
*
|
|
58
|
+
* Covers:
|
|
59
|
+
* - Name variations: "executor-v2", "my-executor", "custom-executor"
|
|
60
|
+
* - Misspellings that could be typo-squatted: "execator", "expl0rer", "plannar"
|
|
61
|
+
* - Prefix/suffix combinations with protected names
|
|
62
|
+
*/
|
|
63
|
+
const PROTECTED_AGENT_PATTERNS: Array<{ pattern: RegExp; example: string }> = [
|
|
64
|
+
// Exact variations with delimiters
|
|
65
|
+
{ pattern: /^executor[-_]?v?[0-9]/i, example: "executor-v2, executor_1" },
|
|
66
|
+
{ pattern: /^test[-_]?engineer/i, example: "test-engineer-proxy" },
|
|
67
|
+
{ pattern: /^explorer[-_]/i, example: "explorer-debug" },
|
|
68
|
+
{ pattern: /^planner[-_]/i, example: "planner-v3" },
|
|
69
|
+
// Generic prefixes that could impersonate builtins
|
|
70
|
+
{ pattern: /^(my|custom|new|local)[-_](executor|test[-_]?engineer|explorer|planner)$/i, example: "my-executor" },
|
|
71
|
+
{ pattern: /^(executor|test[-_]?engineer|explorer|planner)[-_]?(proxy|hook|override)$/i, example: "executor-override" },
|
|
72
|
+
// Common typosquatting patterns (intentional misspellings)
|
|
73
|
+
{ pattern: /^exec[au]t[o0]r$/i, example: "execator" },
|
|
74
|
+
{ pattern: /^expl[o0]rer$/i, example: "explorer" },
|
|
75
|
+
{ pattern: /^plann[ae]r$/i, example: "plannar" },
|
|
76
|
+
// Suffixes that indicate override intent
|
|
77
|
+
{ pattern: /^(executor|test[-_]?engineer|explorer|planner)[-_]?(override|replacement|shadow)$/i, example: "executor-override" },
|
|
78
|
+
];
|
|
79
|
+
|
|
80
|
+
/**
|
|
81
|
+
* Check if an agent name matches any protected pattern.
|
|
82
|
+
* Returns the matched pattern description for error messages.
|
|
83
|
+
*/
|
|
84
|
+
function matchProtectedPattern(name: string): string | null {
|
|
85
|
+
const key = name.toLowerCase();
|
|
86
|
+
for (const { pattern, example } of PROTECTED_AGENT_PATTERNS) {
|
|
87
|
+
if (pattern.test(key)) {
|
|
88
|
+
return `pattern "${pattern}" (example: ${example})`;
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
return null;
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
/**
|
|
95
|
+
* Security event types for audit logging.
|
|
96
|
+
*/
|
|
97
|
+
interface SecurityEvent {
|
|
98
|
+
type: "AGENT_REGISTRATION_BLOCKED" | "PROJECT_AGENT_SHADOW_WARNING";
|
|
99
|
+
name: string;
|
|
100
|
+
reason: string;
|
|
101
|
+
timestamp: number;
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
/**
|
|
105
|
+
* Security event log. In production, this should be sent to a security SIEM.
|
|
106
|
+
*/
|
|
107
|
+
const securityEventLog: SecurityEvent[] = [];
|
|
108
|
+
|
|
109
|
+
/**
|
|
110
|
+
* Log a security event for audit purposes.
|
|
111
|
+
* TODO: In production, integrate with project's logging infrastructure
|
|
112
|
+
* (e.g., send to SIEM, log aggregator, or security webhook).
|
|
113
|
+
*/
|
|
114
|
+
function logSecurityEvent(event: SecurityEvent): void {
|
|
115
|
+
securityEventLog.push(event);
|
|
116
|
+
|
|
117
|
+
// Console output for development/debugging (NOTE: currently emitted unconditionally; not yet suppressed or redacted in production)
|
|
118
|
+
const prefix = "\x1b[33m[SECURITY]\x1b[0m"; // Yellow warning
|
|
119
|
+
console.warn(
|
|
120
|
+
`${prefix} ${event.type}: agent="${event.name}" reason="${event.reason}" time=${new Date(event.timestamp).toISOString()}`
|
|
121
|
+
);
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
/**
|
|
125
|
+
* Get recent security events (for debugging/testing).
|
|
126
|
+
*/
|
|
127
|
+
export function getSecurityEventLog(): readonly SecurityEvent[] {
|
|
128
|
+
return securityEventLog;
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
/**
 * Empty the security event log in place (intended for tests).
 *
 * The same array instance is kept, so references previously obtained from
 * getSecurityEventLog() observe the cleared state.
 */
export function clearSecurityEventLog(): void {
	// Remove every element without replacing the array object.
	securityEventLog.splice(0);
}
|
|
137
|
+
|
|
138
|
+
/**
 * Guard that rejects agent names colliding with protected builtins.
 *
 * The lowercased name is checked in two stages:
 *   1. exact membership in PROTECTED_AGENT_NAMES
 *   2. a match against PROTECTED_AGENT_PATTERNS (only if stage 1 passed)
 *
 * On a violation the attempt is recorded via logSecurityEvent as
 * AGENT_REGISTRATION_BLOCKED, then an Error describing the violation is thrown.
 * Returns normally when the name is allowed.
 *
 * @param name - Candidate agent name as supplied by the caller.
 * @throws Error when the name is protected or matches a protected pattern.
 */
function assertAgentNameAllowed(name: string): void {
	const normalized = name.toLowerCase();
	let violation: { reason: string; message: string } | null = null;

	if (PROTECTED_AGENT_NAMES.has(normalized)) {
		// Stage 1: exact match
		violation = {
			reason: `exact_match:${normalized}`,
			message:
				`SECURITY: Cannot register agent '${name}': protected builtin name. ` +
				`Dynamic agents cannot shadow builtin agents (executor, explorer, planner, etc.) to prevent privilege escalation.`,
		};
	} else {
		// Stage 2: pattern match (custom-executor, my-planner, etc.)
		const patternHit = matchProtectedPattern(normalized);
		if (patternHit !== null) {
			violation = {
				reason: `pattern_match:${patternHit}`,
				message:
					`SECURITY: Cannot register agent '${name}': name matches protected pattern (${patternHit}). ` +
					`This pattern is blocked to prevent privilege escalation via similar-named agents.`,
			};
		}
	}

	if (violation === null) return;

	// Audit first, then abort the registration.
	logSecurityEvent({
		type: "AGENT_REGISTRATION_BLOCKED",
		name,
		reason: violation.reason,
		timestamp: Date.now(),
	});
	throw new Error(violation.message);
}
|
|
180
|
+
|
|
181
|
+
/**
 * Flag a project agent whose name collides with a protected builtin.
 *
 * Never throws and never blocks loading: it only records a
 * PROJECT_AGENT_SHADOW_WARNING event. An exact name collision additionally
 * prints an explanatory console warning; a pattern-only collision is recorded
 * without that extra message.
 *
 * @param name - Name of the project agent being discovered.
 */
function checkProjectAgentShadowsBuiltin(name: string): void {
	const normalized = name.toLowerCase();
	const recordWarning = (reason: string): void => {
		logSecurityEvent({
			type: "PROJECT_AGENT_SHADOW_WARNING",
			name,
			reason,
			timestamp: Date.now(),
		});
	};

	if (!PROTECTED_AGENT_NAMES.has(normalized)) {
		// Not an exact collision: fall back to the pattern check.
		const patternHit = matchProtectedPattern(normalized);
		if (patternHit !== null) {
			recordWarning(`project_shadows_pattern:${patternHit}`);
		}
		return;
	}

	// Exact collision with a protected builtin name.
	recordWarning("project_shadows_protected_builtin");
	console.warn(
		`\x1b[33m[SECURITY WARNING]\x1b[0m Project agent "${name}" shadows a protected builtin. ` +
		`This agent will be loaded but builtin agents take priority. ` +
		`If this is intentional, consider using a different name.`
	);
}
|
|
217
|
+
|
|
9
218
|
export interface AgentDiscoveryResult {
|
|
10
219
|
builtin: AgentConfig[];
|
|
11
220
|
user: AgentConfig[];
|
|
@@ -28,6 +237,101 @@ function parseContextMode(value: string | undefined): "fresh" | "fork" | undefin
|
|
|
28
237
|
return value === "fresh" || value === "fork" ? value : undefined;
|
|
29
238
|
}
|
|
30
239
|
|
|
240
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
241
|
+
// SEC-002 Fix: Agent System Prompt Sanitization
|
|
242
|
+
// Prevents prompt injection via malicious agent files.
|
|
243
|
+
// See: SECURITY-ISSUES.md SEC-002
|
|
244
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
245
|
+
|
|
246
|
+
/**
|
|
247
|
+
* Trust levels for agent source classification.
|
|
248
|
+
* Determines how strictly to sanitize the system prompt.
|
|
249
|
+
*/
|
|
250
|
+
type TrustLevel = "builtin" | "user" | "project";
|
|
251
|
+
|
|
252
|
+
/**
 * Map a resource source onto the trust level used by prompt sanitization.
 *
 * "builtin" and "user" map to themselves; "project" and every other source
 * value are treated as "project", the strictest level.
 */
function sourceToTrustLevel(source: ResourceSource): TrustLevel {
	if (source === "builtin" || source === "user") {
		return source;
	}
	// "project" and any unrecognised source fall back to the least-trusted level.
	return "project";
}
|
|
267
|
+
|
|
268
|
+
/**
 * Sanitize agent system prompt content to reduce prompt injection risk.
 *
 * Uses OWASP Agent Memory Guard-inspired patterns:
 * - Strip zero-width Unicode (potential bypass vectors)
 * - Strip HTML/JS comments and script tags (repeated until the text is stable,
 *   so nested fragments cannot reassemble into a comment or tag)
 * - Strip known prompt injection directives, including the multi-word forms
 *   "IGNORE [ALL] [PREVIOUS] [INSTRUCTIONS]:"
 * - Strip encoded payloads (base64, hex)
 * - Collapse excessive whitespace
 *
 * Trust levels affect sanitization strictness:
 * - builtin: Minimal sanitization (trusted source)
 * - user: Standard sanitization
 * - project: Strict sanitization (untrusted source)
 *
 * @param content - Raw system prompt text taken from the agent file body.
 * @param source - Where the agent was loaded from; mapped to a trust level.
 * @returns The sanitized prompt with leading/trailing whitespace trimmed.
 */
export function sanitizeAgentSystemPrompt(
	content: string,
	source: ResourceSource
): string {
	const trustLevel = sourceToTrustLevel(source);
	let sanitized = content;

	// 1. Strip zero-width and invisible Unicode characters (all trust levels).
	//    Done first so invisible characters cannot split keywords matched below.
	sanitized = sanitized.replace(/[\u200B-\u200F\u2028-\u202F\u2060-\u206F\uFEFF]/g, "");

	// 2. Strip HTML/JS comments (instruction hiding) — all trust levels.
	//    A single pass can leave a new comment/tag behind (e.g. "<scr<script>ipt>"),
	//    so repeat until nothing changes. Each pass only shortens the string,
	//    so the loop always terminates.
	const hiddenMarkup = /<!--[\s\S]*?-->|<\/?script[^>]*>/gi;
	let beforePass: string;
	do {
		beforePass = sanitized;
		sanitized = sanitized.replace(hiddenMarkup, "");
	} while (sanitized !== beforePass);

	// 3. Strip known prompt injection directive patterns — user and project
	if (trustLevel !== "builtin") {
		// Strip lines that look like system directives. ALL, PREVIOUS and
		// INSTRUCTIONS are each optional after IGNORE so that the full phrase
		// "IGNORE ALL PREVIOUS INSTRUCTIONS:" is matched as well as shorter forms.
		sanitized = sanitized.replace(
			/^\s*(?:SYSTEM|INSTRUCTION|IGNORE(?:\s+ALL)?(?:\s+PREVIOUS)?(?:\s+INSTRUCTIONS)?|OVERRIDE|YOUR\s+ROLE\s+IS|MALICIOUS|BACKDOOR)\s*:.*$/gim,
			""
		);

		// Strip embedded instruction patterns in brackets
		sanitized = sanitized.replace(/\[(?:SYSTEM|INSTRUCTION|OVERRIDE|MALICIOUS)\s*:[^\]]*\]/gi, "");

		// Strip base64/hex-encoded command payloads
		sanitized = sanitized.replace(/\b(base64|base32|hex)\s*['":]\s*([A-Za-z0-9+\/=]{20,})/gi, "[encoded-command-redacted]");

		// Strip eval/exec patterns with encoded content
		sanitized = sanitized.replace(/\b(eval|exec|spawn|subprocess)\s*\(\s*(?:base64|Buffer\.from)\s*\(/gi, "[suspicious-call-redacted]");

		// Strip markdown that attempts to hide instructions
		sanitized = sanitized.replace(/```\s*(?:system|instruction|prompt)\n[\s\S]*?```/gi, "");
	}

	// 4. Project-level strict sanitization
	if (trustLevel === "project") {
		// Strip YAML-like assignment patterns that could override behavior
		sanitized = sanitized.replace(/^\s*(?:role|persona|behavior|directive)\s*[=:].*$/gim, "");

		// Strip potential exfiltration patterns
		sanitized = sanitized.replace(/\b(write|append)\s+.*(?:secrets?|keys?|token|credential)/gi, "[suspicious-write-redacted]");

		// Strip network exfiltration patterns
		sanitized = sanitized.replace(/\b(fetch|curl|wget|axios)\s+.*(?:exfil|steal|leak|send)/gi, "[suspicious-network-redacted]");
	}

	// 5. Collapse multiple blank lines (cleanup after removals)
	sanitized = sanitized.replace(/\n{3,}/g, "\n\n");

	return sanitized.trim();
}
|
|
334
|
+
|
|
31
335
|
function parseAgentFile(filePath: string, source: ResourceSource): AgentConfig | undefined {
|
|
32
336
|
try {
|
|
33
337
|
const content = fs.readFileSync(filePath, "utf-8");
|
|
@@ -39,12 +343,18 @@ function parseAgentFile(filePath: string, source: ResourceSource): AgentConfig |
|
|
|
39
343
|
const avoidWhen = parseCsv(frontmatter.avoidWhen);
|
|
40
344
|
const cost = parseCost(frontmatter.cost);
|
|
41
345
|
const category = frontmatter.category?.trim() || undefined;
|
|
346
|
+
|
|
347
|
+
// SEC-002: Sanitize system prompt based on source trust level
|
|
348
|
+
const rawSystemPrompt = body.trim();
|
|
349
|
+
const systemPrompt = sanitizeAgentSystemPrompt(rawSystemPrompt, source);
|
|
350
|
+
|
|
42
351
|
return {
|
|
43
352
|
name,
|
|
44
353
|
description,
|
|
45
354
|
source,
|
|
46
355
|
filePath,
|
|
47
|
-
systemPrompt
|
|
356
|
+
systemPrompt,
|
|
357
|
+
// ... rest unchanged
|
|
48
358
|
model: frontmatter.model === "false" ? undefined : frontmatter.model || undefined,
|
|
49
359
|
fallbackModels: parseCsv(frontmatter.fallbackModels),
|
|
50
360
|
thinking: frontmatter.thinking === "false" ? undefined : frontmatter.thinking || undefined,
|
|
@@ -70,11 +380,20 @@ function parseAgentFile(filePath: string, source: ResourceSource): AgentConfig |
|
|
|
70
380
|
|
|
71
381
|
/**
 * Load every agent definition found directly inside `dir`.
 *
 * Only `*.md` files are considered; `*.team.md` and `*.workflow.md` files are
 * ignored. Files that fail to parse (parseAgentFile returns undefined) are
 * dropped. The result is sorted by agent name. For the "project" source each
 * loaded agent is additionally checked for shadowing a protected builtin
 * (SEC-001), which only logs a warning.
 *
 * @returns The parsed agents, or an empty array when `dir` does not exist.
 */
function readAgentDir(dir: string, source: ResourceSource): AgentConfig[] {
	if (!fs.existsSync(dir)) return [];

	const isAgentFile = (entry: string): boolean =>
		entry.endsWith(".md") && !entry.endsWith(".team.md") && !entry.endsWith(".workflow.md");

	const agents: AgentConfig[] = [];
	for (const entry of fs.readdirSync(dir)) {
		if (!isAgentFile(entry)) continue;
		const parsed = parseAgentFile(path.join(dir, entry), source);
		if (parsed !== undefined) agents.push(parsed);
	}
	agents.sort((a, b) => a.name.localeCompare(b.name));

	// SEC-001: Warn about project agents that shadow protected builtins
	if (source === "project") {
		agents.forEach((agent) => checkProjectAgentShadowsBuiltin(agent.name));
	}

	return agents;
}
|
|
79
398
|
|
|
80
399
|
function applyAgentOverrides(agents: AgentConfig[], cwd: string, loadedConfig?: LoadedPiTeamsConfig): AgentConfig[] {
|
|
@@ -101,22 +420,30 @@ function applyAgentOverrides(agents: AgentConfig[], cwd: string, loadedConfig?:
|
|
|
101
420
|
}
|
|
102
421
|
|
|
103
422
|
// ─── Agent Discovery Cache (Phase 3a) ────────────────────────────────────
|
|
104
|
-
//
|
|
105
|
-
//
|
|
423
|
+
// SEC-005 Fix: Uses version-based cache for atomic invalidation.
|
|
424
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
106
425
|
|
|
107
426
|
const DISCOVERY_CACHE_TTL_MS = 500;
|
|
108
|
-
|
|
427
|
+
// One cached discovery result, stored per working directory in discoveryCache.
interface CachedDiscoveryEntry {
	// The discovery result handed back to callers on a cache hit.
	result: AgentDiscoveryResult;
	// Absolute expiry time in epoch milliseconds (Date.now() + DISCOVERY_CACHE_TTL_MS at store time).
	expiresAt: number;
	cacheVersion: number; // SEC-005: Version stamp for atomic invalidation
}
|
|
432
|
+
const discoveryCache = new Map<string, CachedDiscoveryEntry>();
|
|
109
433
|
const DISCOVERY_CACHE_MAX_ENTRIES = 32;
|
|
110
434
|
|
|
111
435
|
/**
 * Drop cache entries that are no longer usable: those whose TTL has elapsed
 * and those stamped with a version older than the current cache version.
 */
function pruneDiscoveryCache(): void {
	const now = Date.now();
	const currentVersion = cacheVersion;
	for (const [cwdKey, cached] of discoveryCache) {
		const expired = cached.expiresAt <= now;
		const outdated = cached.cacheVersion < currentVersion;
		if (expired || outdated) discoveryCache.delete(cwdKey);
	}
}
|
|
117
444
|
|
|
118
|
-
/** Invalidate cached discovery result for a given cwd (or all if omitted). */
|
|
119
445
|
export function invalidateAgentDiscoveryCache(cwd?: string): void {
|
|
446
|
+
incrementCacheVersion();
|
|
120
447
|
if (cwd) {
|
|
121
448
|
discoveryCache.delete(cwd);
|
|
122
449
|
} else {
|
|
@@ -126,8 +453,10 @@ export function invalidateAgentDiscoveryCache(cwd?: string): void {
|
|
|
126
453
|
|
|
127
454
|
export function discoverAgents(cwd: string): AgentDiscoveryResult {
|
|
128
455
|
pruneDiscoveryCache();
|
|
456
|
+
const currentVersion = cacheVersion;
|
|
129
457
|
const cached = discoveryCache.get(cwd);
|
|
130
|
-
|
|
458
|
+
// SEC-005: Check both TTL expiry AND version stamp
|
|
459
|
+
if (cached && cached.expiresAt > Date.now() && cached.cacheVersion >= currentVersion) {
|
|
131
460
|
return cached.result;
|
|
132
461
|
}
|
|
133
462
|
const loaded = loadConfig(cwd);
|
|
@@ -136,7 +465,8 @@ export function discoverAgents(cwd: string): AgentDiscoveryResult {
|
|
|
136
465
|
user: applyAgentOverrides(readAgentDir(path.join(userPiRoot(), "agents"), "user"), cwd, loaded),
|
|
137
466
|
project: applyAgentOverrides(readAgentDir(path.join(projectCrewRoot(cwd), "agents"), "project"), cwd, loaded),
|
|
138
467
|
};
|
|
139
|
-
|
|
468
|
+
// SEC-005: Store with current version stamp
|
|
469
|
+
discoveryCache.set(cwd, { result, expiresAt: Date.now() + DISCOVERY_CACHE_TTL_MS, cacheVersion: currentVersion });
|
|
140
470
|
while (discoveryCache.size > DISCOVERY_CACHE_MAX_ENTRIES) {
|
|
141
471
|
const oldest = discoveryCache.keys().next().value;
|
|
142
472
|
if (oldest !== undefined) discoveryCache.delete(oldest);
|
|
@@ -150,13 +480,15 @@ export function discoverAgents(cwd: string): AgentDiscoveryResult {
|
|
|
150
480
|
|
|
151
481
|
const dynamicAgents = new Map<string, AgentConfig>();
|
|
152
482
|
|
|
153
|
-
/** Register a dynamic agent at runtime. Throws if already registered. */
|
|
483
|
+
/** Register a dynamic agent at runtime. Throws if already registered or if name is protected. */
|
|
154
484
|
export function registerDynamicAgent(config: AgentConfig): void {
	// Security check: prevent shadowing of builtin agents (SEC-001).
	// Throws before anything is stored if the name is protected.
	assertAgentNameAllowed(config.name);

	// Registrations are keyed by lowercased name.
	const normalizedName = config.name.toLowerCase();
	if (dynamicAgents.has(normalizedName)) {
		throw new Error(`Agent already registered: ${config.name}`);
	}

	// The stored source is always "dynamic", whatever the caller supplied — cannot be spoofed.
	const registered = { ...config, source: "dynamic" };
	dynamicAgents.set(normalizedName, registered);

	// Cached discovery results are invalidated after every registration.
	invalidateAgentDiscoveryCache();
}
|
|
162
494
|
|
|
@@ -183,10 +515,16 @@ export function allAgents(discovery: AgentDiscoveryResult | undefined): AgentCon
|
|
|
183
515
|
for (const agent of [...discovery.project, ...discovery.builtin, ...discovery.user]) {
|
|
184
516
|
byName.set(agent.name.toLowerCase(), agent);
|
|
185
517
|
}
|
|
186
|
-
// Dynamic agents
|
|
187
|
-
//
|
|
518
|
+
// Dynamic agents only fill gaps — they cannot override builtin/user agents.
|
|
519
|
+
// SECURITY: Dynamic agents are less trusted (registered at runtime by extensions/hooks).
|
|
520
|
+
// They are only used if no builtin/user agent with the same name exists.
|
|
188
521
|
for (const agent of dynamicAgents.values()) {
|
|
189
|
-
|
|
522
|
+
const key = agent.name.toLowerCase();
|
|
523
|
+
if (!byName.has(key)) {
|
|
524
|
+
byName.set(key, agent);
|
|
525
|
+
}
|
|
526
|
+
// NOTE: If an agent with the same name exists, the dynamic version is ignored.
|
|
527
|
+
// This prevents privilege escalation via agent shadowing (SEC-001).
|
|
190
528
|
}
|
|
191
529
|
return [...byName.values()].filter((agent) => !agent.disabled).sort((a, b) => a.name.localeCompare(b.name));
|
|
192
530
|
}
|
|
@@ -91,10 +91,16 @@ export function resolveTaskSkillNames(input: ResolveTaskSkillsInput): string[] {
|
|
|
91
91
|
return collectTaskSkillNames(input).slice(0, MAX_SELECTED_SKILLS);
|
|
92
92
|
}
|
|
93
93
|
|
|
94
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
95
|
+
// SEC-003 Fix: Reverse skill search order (package first, project second)
|
|
96
|
+
// Prevents malicious project skills from overriding trusted package skills.
|
|
97
|
+
// See: SECURITY-ISSUES.md SEC-003
|
|
98
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
99
|
+
|
|
94
100
|
/**
 * List the directories searched for skills, in search order.
 *
 * Per SEC-003 the package's own skills directory comes first and the
 * project's `skills` directory (resolved against `cwd`) comes second.
 */
function candidateSkillDirs(cwd: string): Array<{ root: string; source: "project" | "package" }> {
	// Trusted skills directory shipped with the package.
	const packageDir = { root: PACKAGE_SKILLS_DIR, source: "package" as const };
	// Project-local skills directory.
	const projectDir = { root: path.resolve(cwd, "skills"), source: "project" as const };
	return [packageDir, projectDir];
}
|
|
100
106
|
|
|
@@ -2,6 +2,51 @@ import * as path from "node:path";
|
|
|
2
2
|
import type { TeamRunManifest, TaskPacket, TaskScope, VerificationContract } from "../state/types.ts";
|
|
3
3
|
import type { WorkflowStep } from "../workflows/workflow-config.ts";
|
|
4
4
|
|
|
5
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
6
|
+
// SEC-007 Fix: Workflow Step Task Sanitization
|
|
7
|
+
// Context provided by workers comes from workflow definitions that could
|
|
8
|
+
// be user-controlled. Sanitize task text to prevent injection.
|
|
9
|
+
// See: SECURITY-ISSUES.md SEC-007
|
|
10
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
/**
 * Sanitize workflow step task text to reduce injection risk.
 *
 * The task text is used as a prompt for worker agents. In a multi-tenant
 * or shared workflow scenario, malicious workflow definitions could
 * embed injection instructions.
 *
 * Sanitization:
 * - Strip zero-width Unicode characters
 * - Strip known prompt injection directive patterns, including
 *   "IGNORE [ALL] [PREVIOUS] INSTRUCTIONS:"
 * - Strip base64/hex encoded payloads
 * - Strip bracketed directives such as "[SYSTEM: ...]"
 * - Collapse excessive whitespace
 *
 * @param task - Raw task text from the workflow step.
 * @returns The sanitized task text with leading/trailing whitespace trimmed.
 */
export function sanitizeTaskText(task: string): string {
	let sanitized = task;

	// 1. Strip zero-width and invisible Unicode characters.
	//    Done first so invisible characters cannot split keywords matched below.
	sanitized = sanitized.replace(/[\u200B-\u200F\u2028-\u202F\u2060-\u206F\uFEFF]/g, "");

	// 2. Strip known prompt injection directive patterns.
	//    PREVIOUS is optional so the common phrasing
	//    "IGNORE ALL PREVIOUS INSTRUCTIONS:" is removed, not just "IGNORE ALL INSTRUCTIONS:".
	sanitized = sanitized.replace(
		/^\s*(?:SYSTEM|INSTRUCTION|IGNORE(?:\s+ALL)?(?:\s+PREVIOUS)?\s+INSTRUCTIONS|OVERRIDE|YOUR\s+ROLE\s+IS|MALICIOUS)\s*:.*$/gim,
		""
	);

	// 3. Strip base64/hex encoded command payloads
	sanitized = sanitized.replace(/\b(?:base64|base32|hex)\s*['":]\s*([A-Za-z0-9+\/=]{16,})/gi, "[encoded-redacted]");

	// 4. Strip embedded instruction patterns in brackets
	sanitized = sanitized.replace(/\[(?:SYSTEM|INSTRUCTION|OVERRIDE)\s*:[^\]]*\]/gi, "");

	// 5. Collapse multiple blank lines
	sanitized = sanitized.replace(/\n{3,}/g, "\n\n");

	return sanitized.trim();
}
|
|
49
|
+
|
|
5
50
|
export interface BuildTaskPacketInput {
|
|
6
51
|
manifest: TeamRunManifest;
|
|
7
52
|
step: WorkflowStep;
|
|
@@ -34,8 +79,10 @@ export function buildTaskPacket(input: BuildTaskPacketInput): TaskPacket {
|
|
|
34
79
|
const scope = inferTaskScope(input.step);
|
|
35
80
|
const reads = input.step.reads === false ? [] : input.step.reads ?? [];
|
|
36
81
|
const scopePath = reads.length === 1 ? reads[0] : reads.length > 1 ? reads.join(", ") : undefined;
|
|
82
|
+
// SEC-007: Sanitize task text before inserting into task packet
|
|
83
|
+
const sanitizedTask = sanitizeTaskText(input.step.task);
|
|
37
84
|
return {
|
|
38
|
-
objective:
|
|
85
|
+
objective: sanitizedTask.replaceAll("{goal}", input.manifest.goal),
|
|
39
86
|
scope,
|
|
40
87
|
scopePath,
|
|
41
88
|
repo: path.basename(input.manifest.cwd) || input.manifest.cwd,
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Integration check: validates pi-crew core discovery and team-run functionality.
|
|
3
|
+
* Run with: node --experimental-strip-types --test test-integration-check.ts
|
|
4
|
+
*/
|
|
5
|
+
import * as fs from "node:fs";
|
|
6
|
+
import * as os from "node:os";
|
|
7
|
+
import * as path from "node:path";
|
|
8
|
+
import test from "node:test";
|
|
9
|
+
import assert from "node:assert/strict";
|
|
10
|
+
|
|
11
|
+
import { discoverAgents, allAgents } from "./src/agents/discover-agents.ts";
|
|
12
|
+
import { discoverTeams, allTeams } from "./src/teams/discover-teams.ts";
|
|
13
|
+
import { discoverWorkflows, allWorkflows } from "./src/workflows/discover-workflows.ts";
|
|
14
|
+
import { handleTeamTool } from "./src/extension/team-tool.ts";
|
|
15
|
+
import { loadRunManifestById } from "./src/state/state-store.ts";
|
|
16
|
+
|
|
17
|
+
const pkgRoot = path.resolve(import.meta.dirname ?? ".");
|
|
18
|
+
|
|
19
|
+
// ── Discovery tests ──────────────────────────────────────────────────────
|
|
20
|
+
|
|
21
|
+
// Agent discovery from the package root must yield the builtin set, including "executor".
test("discovers builtin agents", () => {
	const discovery = discoverAgents(pkgRoot);
	assert.ok(discovery, "discoverAgents should return a result");

	const builtinCount = discovery.builtin.length;
	assert.ok(builtinCount >= 10, `Expected ≥10 builtin agents, got ${builtinCount}`);

	// The merged view must expose the "executor" agent.
	const names = allAgents(discovery).map((agent) => agent.name);
	assert.ok(names.includes("executor"), `Missing "executor" agent. Got: ${names.join(", ")}`);
});
|
|
32
|
+
|
|
33
|
+
// Team discovery from the package root must yield the builtin set, including "fast-fix".
test("discovers builtin teams", () => {
	const discovery = discoverTeams(pkgRoot);
	assert.ok(discovery, "discoverTeams should return a result");

	const builtinCount = discovery.builtin.length;
	assert.ok(builtinCount >= 6, `Expected ≥6 builtin teams, got ${builtinCount}`);

	// The merged view must expose the "fast-fix" team.
	const names = allTeams(discovery).map((team) => team.name);
	assert.ok(names.includes("fast-fix"), `Missing "fast-fix" team. Got: ${names.join(", ")}`);
});
|
|
44
|
+
|
|
45
|
+
// Workflow discovery from the package root must yield the builtin set, including "fast-fix".
test("discovers builtin workflows", () => {
	const discovery = discoverWorkflows(pkgRoot);
	assert.ok(discovery, "discoverWorkflows should return a result");

	const builtinCount = discovery.builtin.length;
	assert.ok(builtinCount >= 6, `Expected ≥6 builtin workflows, got ${builtinCount}`);

	// The merged view must expose the "fast-fix" workflow.
	const names = allWorkflows(discovery).map((workflow) => workflow.name);
	assert.ok(names.includes("fast-fix"), `Missing "fast-fix" workflow. Got: ${names.join(", ")}`);
});
|
|
59
|
+
|
|
60
|
+
// ── Team run test ─────────────────────────────────────────────────────────
|
|
61
|
+
|
|
62
|
+
// End-to-end check: run the builtin "fast-fix" team against a scratch project
// with worker execution enabled and the child Pi process mocked to succeed.
test("fast-fix team run completes successfully with mock child Pi", async () => {
	// Throwaway project directory so the run leaves nothing behind.
	const cwd = fs.mkdtempSync(path.join(os.tmpdir(), "pi-crew-int-check-"));
	fs.mkdirSync(path.join(cwd, ".crew"), { recursive: true });

	// Remember the previous environment so it can be restored exactly in `finally`.
	const prevExec = process.env.PI_TEAMS_EXECUTE_WORKERS;
	const prevMock = process.env.PI_TEAMS_MOCK_CHILD_PI;
	process.env.PI_TEAMS_EXECUTE_WORKERS = "1";
	process.env.PI_TEAMS_MOCK_CHILD_PI = "success";

	try {
		const run = await handleTeamTool(
			{ action: "run", team: "fast-fix", goal: "create a hello.txt file" },
			{ cwd },
		);

		// run result is not an error
		assert.equal(run.isError, false, `handleTeamTool returned error: ${JSON.stringify(run)}`);

		const runId = run.details.runId;
		assert.ok(runId, "Expected a runId in details");

		// manifest should be persisted and completed
		const loaded = loadRunManifestById(cwd, runId!);
		assert.ok(loaded, "loadRunManifestById should return data");
		assert.equal(
			loaded!.manifest.status,
			"completed",
			`Expected manifest status "completed", got "${loaded!.manifest.status}"`,
		);

		// all tasks should be completed; require at least one task, otherwise
		// the `every` check below would pass vacuously on an empty list
		const taskStatuses = loaded!.tasks.map((t) => t.status);
		assert.ok(taskStatuses.length > 0, "Expected the run to record at least one task");
		assert.ok(
			taskStatuses.every((s) => s === "completed"),
			`Not all tasks completed: ${JSON.stringify(taskStatuses)}`,
		);

		// artifacts directory should exist
		const artifactsDir = path.join(cwd, ".crew", "artifacts", runId!);
		assert.ok(
			fs.existsSync(artifactsDir),
			`Artifacts directory should exist: ${artifactsDir}`,
		);

		console.log(`✅ fast-fix run ${runId} completed successfully with ${loaded!.tasks.length} tasks`);
	} finally {
		// Restore the environment (delete when it was originally unset) and clean up.
		if (prevExec === undefined) delete process.env.PI_TEAMS_EXECUTE_WORKERS;
		else process.env.PI_TEAMS_EXECUTE_WORKERS = prevExec;
		if (prevMock === undefined) delete process.env.PI_TEAMS_MOCK_CHILD_PI;
		else process.env.PI_TEAMS_MOCK_CHILD_PI = prevMock;
		fs.rmSync(cwd, { recursive: true, force: true });
	}
});
|