pi-crew 0.3.8 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,28 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.4.0] — 9arm-skills Enforcement Patterns & Integration Tests (2026-05-26)
4
+
5
+ ### Features
6
+ - **systematic-debugging: Refuse Gate** — Hard constraints before proposing fixes. Must verify that a reliable repro exists, the root cause is known, and an attempt to falsify the hypothesis has been made before any fix.
7
+ - **systematic-debugging: Recite Ritual** — Psychological anchor at session start. Recite 4-step mantra before beginning any debug session.
8
+ - **systematic-debugging: Falsify-First** — Phase 3 now requires disproof before proof. Run disproof experiments first to save time on wrong hypotheses.
9
+ - **systematic-debugging: Breadcrumb Ledger** — Structured experiment tracking within debug sessions.
10
+ - **multi-perspective-review: Simpler Alternative Pass** — Mandatory pre-review step to question if the change should exist at all.
11
+ - **New skill: scrutinize** — Outsider-perspective review questioning intent before tracing code.
12
+ - **New skill: post-mortem** — Engineering RCA documentation with 4 required inputs gate.
13
+ - **skills/REFERENCE.md** — New documentation of skill chains, inventory, and anti-patterns.
14
+ - **Trigger conditions** added to all major skill descriptions for better skill invocation matching.
15
+
16
+ ### Bug Fixes
17
+ - **CI reliability** — Fixed flaky tests on macOS: crew-widget and render-scheduler timing issues resolved.
18
+ - **Team-context import detection** — Fixed regex to correctly match only direct `/team-tool.ts` imports, not `/team-tool/context.ts`.
19
+
20
+ ### Tests
21
+ - **New test-integration-check.ts** — Integration tests for core pi-crew functionality (agent/team/workflow discovery, fast-fix team run).
22
+ - **1740 tests passing** across all platforms (Ubuntu, macOS, Windows).
23
+
24
+ ---
25
+
3
26
  ## [0.3.8] — Zombie Run Auto-Repair & Test Stability (2026-05-25)
4
27
 
5
28
  ### Features
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pi-crew",
3
- "version": "0.3.8",
3
+ "version": "0.4.0",
4
4
  "description": "Pi extension for coordinated AI teams, workflows, worktrees, and async task orchestration",
5
5
  "author": "baphuongna",
6
6
  "license": "MIT",
@@ -0,0 +1,136 @@
1
+ # pi-crew Skills Reference
2
+
3
+ ## Skill Chains
4
+
5
+ ### Bug Investigation
6
+
7
+ ```
8
+ systematic-debugging (4 phases with refuse gate)
9
+
10
+ verification-before-done (evidence before claim)
11
+
12
+ post-mortem (RCA documentation)
13
+ ```
14
+
15
+ ### Multi-phase Work
16
+
17
+ ```
18
+ orchestration (phase coordination)
19
+
20
+ delegation-patterns (task splitting)
21
+
22
+ verification-before-done (after each phase)
23
+ ```
24
+
25
+ ### Code Review (Quick)
26
+
27
+ ```
28
+ scrutinize (outsider perspective + simpler alternative)
29
+ ```
30
+
31
+ ### Code Review (Deep)
32
+
33
+ ```
34
+ scrutinize (outsider perspective)
35
+
36
+ multi-perspective-review (8-pass deep review)
37
+
38
+ secure-agent-orchestration-review (security focus)
39
+ ```
40
+
41
+ ---
42
+
43
+ ## When to Invoke
44
+
45
+ | Situation | Skill |
46
+ |-----------|-------|
47
+ | Bug / test failure / crash | `systematic-debugging` |
48
+ | Before claiming done | `verification-before-done` |
49
+ | Code review (quick) | `scrutinize` |
50
+ | Code review (deep) | `multi-perspective-review` |
51
+ | Task delegation | `delegation-patterns` |
52
+ | Complex multi-phase work | `orchestration` |
53
+ | After bug is fixed | `post-mortem` |
54
+ | Security review | `secure-agent-orchestration-review` |
55
+ | Workspace safety | `workspace-isolation` |
56
+ | Bash safety | `safe-bash` |
57
+
58
+ ---
59
+
60
+ ## Skills Inventory
61
+
62
+ ### Core Discipline
63
+
64
+ | Skill | Description |
65
+ |-------|-------------|
66
+ | `systematic-debugging` | Four-phase debugging with refuse gates, falsify-first discipline |
67
+ | `verification-before-done` | Evidence before claims |
68
+ | `orchestration` | Multi-phase coordination, 8 rules including "respawn not absorb" |
69
+
70
+ ### Review
71
+
72
+ | Skill | Description |
73
+ |-------|-------------|
74
+ | `scrutinize` | Outsider-perspective review questioning intent |
75
+ | `multi-perspective-review` | 8-pass deep code review |
76
+ | `secure-agent-orchestration-review` | Security-focused review |
77
+
78
+ ### Documentation
79
+
80
+ | Skill | Description |
81
+ |-------|-------------|
82
+ | `post-mortem` | Engineering RCA record |
83
+
84
+ ### Delegation
85
+
86
+ | Skill | Description |
87
+ |-------|-------------|
88
+ | `delegation-patterns` | Task splitting patterns |
89
+ | `requirements-to-task-packet` | Task packet creation |
90
+
91
+ ### Runtime/Safety
92
+
93
+ | Skill | Description |
94
+ |-------|-------------|
95
+ | `workspace-isolation` | Security boundary enforcement |
96
+ | `worktree-isolation` | Git worktree safety |
97
+ | `safe-bash` | Bash command safety |
98
+ | `state-mutation-locking` | State mutation protection |
99
+
100
+ ### Observability
101
+
102
+ | Skill | Description |
103
+ |-------|-------------|
104
+ | `event-log-tracing` | JSONL event log analysis |
105
+ | `runtime-state-reader` | Runtime state inspection |
106
+ | `observability-reliability` | Reliability patterns |
107
+
108
+ ---
109
+
110
+ ## Anti-patterns
111
+
112
+ | Anti-pattern | Skill | Rule |
113
+ |--------------|-------|------|
114
+ | Proposing fix before reproducing | `systematic-debugging` | Refuse Gate |
115
+ | Running proof before disproof | `systematic-debugging` | Phase 3 |
116
+ | Claiming "tests pass" without fresh run | `verification-before-done` | Gate Function |
117
+ | Reviewing diff-local without tracing path | `scrutinize` | Trace step |
118
+ | Skipping simpler-alternative pass | `multi-perspective-review` | Pre-review |
119
+ | Editing files yourself as orchestrator | `orchestration` | Rule 1 |
120
+ | Dispatching serially when parallel possible | `orchestration` | Rule 3 |
121
+ | Committing a red tree | `orchestration` | Rule 6 |
122
+ | Absorbing subagent's broken work | `orchestration` | Rule 7 |
123
+ | Rubber-stamp review | `multi-perspective-review` | Rules |
124
+
125
+ ---
126
+
127
+ ## Key Enforcement Patterns (from 9arm)
128
+
129
+ | Pattern | Implemented In |
130
+ |---------|---------------|
131
+ | **Refuse Gate** | `systematic-debugging` |
132
+ | **Recite Ritual** | `systematic-debugging` (Invocation) |
133
+ | **Falsify Before Proof** | `systematic-debugging` (Phase 3) |
134
+ | **Simpler Alternative Pass** | `scrutinize`, `multi-perspective-review` |
135
+ | **Required Inputs Gate** | `post-mortem` |
136
+ | **Respawn Not Absorb** | `orchestration` (Rule 7) |
@@ -1,6 +1,6 @@
1
1
  ---
2
2
  name: delegation-patterns
3
- description: Subagent/team delegation workflow. Use when splitting work across pi-crew teams, direct agents, async background workers, chains, or parallel research/review tasks.
3
+ description: "Subagent/team delegation workflow. Use when splitting work across pi-crew teams, direct agents, async background workers, chains, or parallel tasks. Triggers: delegate this, split this task, parallelize, dispatch workers, assign to team, spawn agents."
4
4
  ---
5
5
 
6
6
  # delegation-patterns
@@ -1,6 +1,6 @@
1
1
  ---
2
2
  name: event-log-tracing
3
- description: Structured event logging system for worker lifecycle, live agents, and crash recovery. Use when debugging worker crashes, tracing agent lifecycle, or investigating stale runs.
3
+ description: "Structured event logging for worker lifecycle, live agents, crash recovery. Use when debugging crashes, tracing agent lifecycle, investigating stale runs. Triggers: event log, trace events, worker crashed, agent died, stale run, events.jsonl."
4
4
  ---
5
5
 
6
6
  # event-log-tracing
@@ -1,6 +1,6 @@
1
1
  ---
2
2
  name: multi-perspective-review
3
- description: Use when reviewing a plan, diff, implementation, worker output, release candidate, or external review feedback.
3
+ description: "Multi-perspective code review with simpler-alternative pass. Use when reviewing a plan, diff, implementation, worker output, release candidate, or external feedback. Triggers: review this, look at this, LGTM check, sanity check, audit this, get a second opinion, check this PR, examine this code."
4
4
  ---
5
5
 
6
6
  # multi-perspective-review
@@ -9,6 +9,22 @@ Core principle: review early, review often, and separate concerns. Reviewer outp
9
9
 
10
10
  Distilled from detailed reads of requesting-code-review, receiving-code-review, subagent review checkpoints, differential review, and specialized review-agent patterns.
11
11
 
12
+ ## Pre-review: Simpler Alternative Pass (Mandatory)
13
+
14
+ Before running any review passes, ask:
15
+
16
+ 1. **Is there a simpler, smaller, or more elegant way to achieve the same goal?**
17
+ - Doing nothing (is the problem real and load-bearing?)
18
+ - Using something that already exists in the codebase
19
+ - A smaller change that solves 90% of the goal with 10% of the risk
20
+ - Solving it at a different layer (config vs code, framework vs app)
21
+ 2. If a better alternative exists, surface it BEFORE the line-by-line review.
22
+ 3. Skip only if the user explicitly says "don't question scope."
23
+
24
+ This is the most valuable finding you can produce — surfacing unnecessary complexity before reviewing its details.
25
+
26
+ ---
27
+
12
28
  ## Review Passes
13
29
 
14
30
  Run relevant passes separately:
@@ -1,6 +1,6 @@
1
1
  ---
2
2
  name: orchestration
3
- description: Multi-phase orchestration skill for pi-crew planners and executors. Use when decomposing complex tasks into parallel phases, dispatching workers, verifying gates, and iterating until closure.
3
+ description: "Multi-phase orchestration for planners and executors. Use when decomposing complex tasks into parallel phases, dispatching workers, verifying gates, and iterating to closure. Triggers: orchestrate this, coordinate these tasks, run this multi-phase, dispatch workers, coordinate team."
4
4
  ---
5
5
 
6
6
  # orchestration
@@ -0,0 +1,90 @@
1
+ ---
2
+ name: post-mortem
3
+ description: "Write engineering RCA record after bug is fixed. Use when asking: write post-mortem, RCA, root cause analysis, document this fix, close out this bug. Triggers: post-mortem, postmortem, root cause, RCA, document this fix, write up the cause, close out bug."
4
+ ---
5
+
6
+ # post-mortem
7
+
8
+ The canonical engineering record of a bug fix. Written after debugging lands a real fix.
9
+
10
+ ## Required Inputs — Refuse to Draft Without These
11
+
12
+ - [ ] **Reliable repro exists** (deterministic or high-rate flake)
13
+ - [ ] **Root cause is known** (mechanism identified, not a hypothesis)
14
+ - [ ] **Fix is identified** (PR / commit / branch)
15
+ - [ ] **Fix is validated** (original repro now passes)
16
+
17
+ If any are missing → list what's missing and stop. Do not draft.
18
+
19
+ ## Structure
20
+
21
+ ### 1. Summary
22
+
23
+ What broke (user terms), what fixed it (one sentence). JIRA key, PR, owner. A reader who stops here should have the right answer.
24
+
25
+ ### 2. Symptom
26
+
27
+ Concrete: test output, error message, log line. No paraphrase. What was actually observed.
28
+
29
+ ### 3. Root Cause
30
+
31
+ The actual bug mechanism. Code identifiers welcome — function names, file paths, branch conditions. Walk the cause chain end-to-end.
32
+
33
+ ### 4. Why It Produced the Symptom
34
+
35
+ Walk the chain so reader connects symptom to cause. Often non-obvious — bug is in X but visible failure is in Y.
36
+
37
+ ### 5. Fix
38
+
39
+ What changed and why this addresses root cause. Link to PR/commit. If a previous fix attempt papered over the symptom, name it and explain what was wrong.
40
+
41
+ ### 6. How It Was Found
42
+
43
+ Short. The debugging path:
44
+
45
+ - What repro made it deterministic
46
+ - What tools cracked it
47
+ - Hypotheses tried and rejected (with one-line reason each)
48
+ - The single experiment that confirmed the cause
49
+
50
+ ### 7. Why It Slipped Through
51
+
52
+ CI gap? Latent code? Workload gap? Incomplete prior fix? Review miss? Be specific.
53
+
54
+ If honest answer is "no good reason" — say so. **Blameless** — describe the gap, not the person.
55
+
56
+ ### 8. Validation
57
+
58
+ How we know the fix works:
59
+
60
+ - Original failing test now passes (test name)
61
+ - Customer workload now completes (workload identifier)
62
+ - Other affected configs/workloads also tested
63
+
64
+ If only one config validated, say so explicitly.
65
+
66
+ ### 9. Action Items
67
+
68
+ What + owner + tracking artifact:
69
+
70
+ - Regression test added at <seam>. (Owner, test name)
71
+ - CI gap closed: <new check>. (Owner, ticket)
72
+ - Doc/runbook updated. (Owner, link)
73
+
74
+ If none needed: "None — fix is sufficient and no class-of-bug follow-up warranted."
75
+
76
+ ## Tone
77
+
78
+ This is engineer-to-engineer:
79
+
80
+ - **Code identifiers are first-class.** Keep them — future engineers grep their way back.
81
+ - **Mechanism over narrative.** Walk the cause chain, don't soften.
82
+ - **Blameless.** Describe gaps and bugs, never people.
83
+ - **No hedging.** State it or don't write it.
84
+
85
+ ## Rules
86
+
87
+ - Never invent facts.
88
+ - Never strip code identifiers (they are the index).
89
+ - State validation coverage honestly.
90
+ - Get sign-off before posting to JIRA.
@@ -1,6 +1,6 @@
1
1
  ---
2
2
  name: safe-bash
3
- description: Safe shell-command workflow. Use whenever a task may execute shell commands, especially to prefer read-only commands and avoid destructive actions without confirmation.
3
+ description: "Safe shell-command workflow. Use when executing shell commands, prefer read-only, avoid destructive actions. Triggers: run this command, execute bash, safe bash, avoid rm, destructive command, shell injection."
4
4
  ---
5
5
 
6
6
  # safe-bash
@@ -0,0 +1,67 @@
1
+ ---
2
+ name: scrutinize
3
+ description: "Outsider-perspective review questioning intent before tracing code. Use when asking: should this even exist?, is there a simpler way?, get a second opinion, before deep code review. Triggers: scrutinize this, question this, is there a better way?, simplify this, overkill?, too complex."
4
+ ---
5
+
6
+ # Scrutinize
7
+
8
+ Stand outside the change and ask whether it should exist at all, then verify it actually does what it claims end-to-end.
9
+
10
+ ## Operating Stance
11
+
12
+ - **Outsider.** Forget who wrote it and why they think it's right. Read the artifact cold.
13
+ - **End-to-end, not diff-local.** The diff is the entry point, not the scope.
14
+ - **Actionable, concise, with rationale.** Every finding states what to change, why, and what evidence led you there.
15
+
16
+ ## Workflow
17
+
18
+ ### 1. Intent — Is this necessary?
19
+
20
+ - State the goal in one sentence, in your own words. If you cannot, the artifact is underspecified — say so and stop.
21
+ - Ask: **Is there a simpler way?**
22
+ - Delete it / do nothing (is the problem real and load-bearing?)
23
+ - Use existing code (does this already exist?)
24
+ - Smaller change (solves 90% of goal with 10% of risk?)
25
+ - Different layer (config vs code, framework vs app, build vs runtime?)
26
+ - If a better alternative exists, name it BEFORE the line-by-line review.
27
+
28
+ ### 2. Trace — Walk the actual code path
29
+
30
+ - For each behavior the change claims, trace end-to-end through real code — not just the lines in the diff.
31
+ - Include unchanged code on either side of the diff. Bugs hide at the seams.
32
+ - Entry point → call sites → branches taken → state mutated → exit/return/side effect.
33
+
34
+ ### 3. Verify — Does it do what it claims?
35
+
36
+ - Does the traced code actually produce the behavior?
37
+ - What inputs/states would break it? (Edge cases, concurrent callers, error paths, partial failures, retries, empty/null/unicode/huge inputs)
38
+ - What does it silently change? (Performance, error semantics, observability, contracts)
39
+ - How is it tested? (Do tests exercise the traced path, or pass while skipping it?)
40
+
41
+ ### 4. Report
42
+
43
+ Format per finding:
44
+
45
+ ```text
46
+ [severity] file:line
47
+ Issue: ...
48
+ Impact: ...
49
+ Fix: ...
50
+ ```
51
+
52
+ Severity:
53
+
54
+ - critical: data loss, secret leak, arbitrary command/path escape
55
+ - high: broken core workflow, ownership bypass
56
+ - medium: regression, flaky behavior
57
+ - low: polish, maintainability
58
+
59
+ Close with verdict: **ship / fix-then-ship / rework / reject** — with single biggest reason.
60
+
61
+ ## Rules
62
+
63
+ - **No rubber-stamps.** "LGTM" is not an output. If nothing found, say what you traced.
64
+ - **Cite or it didn't happen.** Every claim needs specific path/file/line.
65
+ - **One simpler-alternative pass is MANDATORY.** Skip only if user says "don't question scope."
66
+ - **Distinguish claim from verification.** "The PR says X" and "I traced X and confirmed" are different.
67
+ - **No flattery, no hedging.** State the finding.
@@ -1,6 +1,6 @@
1
1
  ---
2
2
  name: systematic-debugging
3
- description: Use when encountering a bug, test failure, blocked run, provider error, stale state, crash, or unexpected behavior before proposing fixes.
3
+ description: "Four-phase debugging discipline with refuse gates. Use when encountering a bug, test failure, blocked run, provider error, stale state, crash, or unexpected behavior. Triggers: debug this, investigate, fix this bug, something is broken, crash, error, test failed, it broke, not working, unexpected."
4
4
  ---
5
5
 
6
6
  # systematic-debugging
@@ -9,6 +9,36 @@ Core principle: no fixes without root-cause investigation first. Symptom patches
9
9
 
10
10
  Distilled from detailed reads of systematic-debugging, root-cause tracing, TDD, and error-analysis skill patterns.
11
11
 
12
+ ## Invocation — Read Before Debugging
13
+
14
+ Before beginning any debug session, recite these four steps:
15
+
16
+ > **1. First is reproducibility.** Can the issue be reproduced reliably?
17
+ > **2. Know the fail path.** Where does the code break and what stops it from breaking?
18
+ > **3. Question your hypothesis.** What would disprove it?
19
+ > **4. Every run is a breadcrumb.** Cross-reference all of them.
20
+
21
+ If the user says "skip the ritual" → skip the recitation but still apply the four phases silently.
22
+
23
+ ---
24
+
25
+ ## Refuse Gate — Do NOT Proceed Without These
26
+
27
+ Before proposing ANY fix:
28
+
29
+ - [ ] **Can you reproduce the issue reliably?** (deterministic or >50% flake rate)
30
+ - [ ] **Do you know the root cause?** (confirmed mechanism, not a hypothesis)
31
+ - [ ] **Have you tried to FALSIFY your hypothesis first?** (disproof before proof)
32
+
33
+ If ANY answer is NO:
34
+ → Stop.
35
+ → State what's missing.
36
+ → Do not propose a fix.
37
+
38
+ Exception: if the user explicitly says "just patch the symptom" — proceed but flag it as a symptom patch, not a root-cause fix.
39
+
40
+ ---
41
+
12
42
  ## Four Phases
13
43
 
14
44
  ### 1. Root Cause Investigation
@@ -34,9 +64,14 @@ user/tool params → config resolution → team/workflow/agent discovery → mod
34
64
  - Identify dependencies: config home, project root markers, env vars, locks, stale caches, provider model capabilities.
35
65
  - Do not assume small differences are irrelevant.
36
66
 
37
- ### 3. Hypothesis and Test
67
+ ### 3. Hypothesis and Test — Falsify First
38
68
 
39
- - State one hypothesis: I think X is the root cause because Y.”
69
+ - State each hypothesis explicitly, in the form: "I think X is the root cause because Y."
70
+ - Generate 3-5 ranked hypotheses, not one. Single-hypothesis thinking anchors on the first plausible idea.
71
+ - For each hypothesis:
72
+ - What is the simplest **proof**? What is the cleanest **disproof**?
73
+ - Run the **disproof FIRST**. If the hypothesis survives, it's real. If it dies, you saved time chasing a phantom.
74
+ - Does it explain the symptom end-to-end? Walk it through.
40
75
  - Test one variable at a time with the smallest read-only probe or targeted test.
41
76
  - If wrong, discard the hypothesis instead of piling on fixes.
42
77
  - After three failed fixes, question architecture or assumptions before continuing.
@@ -45,7 +80,7 @@ user/tool params → config resolution → team/workflow/agent discovery → mod
45
80
 
46
81
  - Add or identify a failing regression test when practical.
47
82
  - Fix the root cause, not the symptom.
48
- - Avoid while Im here refactors.
83
+ - Avoid "while I'm here" refactors.
49
84
  - Verify targeted behavior, then broader gates.
50
85
 
51
86
  ## Evidence to Collect
@@ -60,8 +95,28 @@ user/tool params → config resolution → team/workflow/agent discovery → mod
60
95
 
61
96
  ## Anti-patterns
62
97
 
63
- - Fixing before reproducing.
98
+ - Proposing a fix before reproducing (the refuse gate exists for a reason).
99
+ - Running proof experiments before disproof (disproof first saves time).
100
+ - Trusting a single passing run as validation (check against all prior breadcrumbs).
64
101
  - Assuming real user global config cannot pollute tests.
65
102
  - Treating provider errors as only transient network failures.
66
103
  - Removing guards because they reveal a blocked state.
67
104
  - Editing unrelated layers before checking the hypothesis.
105
+
106
+ ## Breadcrumb Ledger
107
+
108
+ Maintain a running ledger of every experiment in this session. Each entry:
109
+
110
+ | # | What Changed | What Happened | Ruled In/Out |
111
+ |---|-------------|--------------|-------------|
112
+ | 1 | Added `[DBG-001]` probe | Got `[output]` | Hypothesis A ruled out |
113
+ | 2 | Changed X to Y | Same error persists | Not X |
114
+ | 3 | Checked Z config | Found mismatch | Z is contributing |
115
+
116
+ When a new hypothesis surfaces, walk the ledger:
117
+ - Does it hold for **every** prior observation?
118
+ - If any past run contradicts it, the hypothesis is wrong or incomplete.
119
+
120
+ When in doubt, design the **single experiment** whose outcome makes it certain — run that next.
121
+
122
+ Update the ledger after every run. It is your memory across the session.
@@ -1,6 +1,6 @@
1
1
  ---
2
2
  name: verification-before-done
3
- description: Use when about to claim work is complete, fixed, passing, reviewed, committed, or ready to hand off.
3
+ description: "Evidence before claims. Use before claiming work is complete, fixed, passing, reviewed, committed, or ready to hand off. Triggers: done, fixed, complete, ready to merge, can I close, is it working, verify this, check if it passes, all good, LGTM, ready to ship."
4
4
  ---
5
5
 
6
6
  # verification-before-done
@@ -1,6 +1,6 @@
1
1
  ---
2
2
  name: workspace-isolation
3
- description: Workspace isolation boundaries in pi-crew. Use when ensuring agents from workspace A cannot access workspace B, or when implementing worktree-based parallel execution.
3
+ description: "Workspace isolation boundaries. Use when ensuring agents from workspace A cannot access workspace B, or worktree-based parallel execution. Triggers: workspace isolation, cross-workspace access, escape boundary, worktree safety."
4
4
  ---
5
5
 
6
6
  # workspace-isolation
@@ -6,6 +6,215 @@ import { parseCsv, parseFrontmatter } from "../utils/frontmatter.ts";
6
6
  import { logInternalError } from "../utils/internal-error.ts";
7
7
  import { packageRoot, projectCrewRoot, userPiRoot } from "../utils/paths.ts";
8
8
 
9
+ // ═══════════════════════════════════════════════════════════════════════════
10
+ // SEC-001 Fix: Protected Agent Names Blocklist
11
+ // Prevents privilege escalation via agent shadowing attacks.
12
+ // See: SECURITY-ISSUES.md SEC-001
13
+ // ═══════════════════════════════════════════════════════════════════════════
14
+
15
+ // ═══════════════════════════════════════════════════════════════════════════
16
+ // SEC-005 Fix: Version-based Cache for Atomic Invalidation
17
+ // Uses a global version counter for atomic cache invalidation instead of
18
+ // relying on TTL alone. This eliminates race conditions where concurrent
19
+ // callers might get stale cached snapshots.
20
+ // See: SECURITY-ISSUES.md SEC-005
21
+ // ═══════════════════════════════════════════════════════════════════════════
22
+
23
+
/**
 * Module-wide version stamp for cached data. It starts at zero and only ever
 * moves forward, one step per call to `incrementCacheVersion`.
 */
let cacheVersion = 0;

/**
 * Read the current cache version without changing it.
 *
 * @returns The value of the module-wide version counter at call time.
 */
export function getCacheVersion(): number {
  const current = cacheVersion;
  return current;
}

/**
 * Advance the cache version by exactly one.
 *
 * Any snapshot stamped with an earlier version can then be told apart from
 * the current one by comparing against `getCacheVersion()`.
 */
function incrementCacheVersion(): void {
  cacheVersion += 1;
}
39
+
/**
 * Builtin agent names that are matched exactly (after lowercasing by the
 * callers in this file). Insertion order is kept as originally declared.
 */
const PROTECTED_AGENT_NAMES = new Set<string>([
  "executor", "test-engineer", "explorer", "planner", "analyst",
  "critic", "reviewer", "verifier", "writer", "security-reviewer",
]);
53
+
/**
 * Regex blocklist for agent names that resemble a protected builtin closely
 * enough to confuse or deceive a workflow looking for the real one.
 *
 * Each entry pairs a case-insensitive regex with a sample name that the regex
 * matches; the sample is surfaced in error/audit text, so it must be a name
 * that this entry (and no earlier entry) would actually report.
 *
 * Only `executor`, `test-engineer`, `explorer` and `planner` are covered here:
 *   - versioned or delimited variants: "executor-v2", "explorer-debug"
 *   - impersonating prefixes/suffixes:  "my-executor", "executor-override"
 *   - look-alike spellings:             "execator", "expl0rer", "plannar"
 *
 * Order matters: `matchProtectedPattern` returns the first entry that matches.
 * The typosquat regexes also match the correctly spelled builtin names; those
 * are expected to be caught by the exact-name blocklist before patterns run.
 */
const PROTECTED_AGENT_PATTERNS: Array<{ pattern: RegExp; example: string }> = [
  // Exact variations with delimiters
  { pattern: /^executor[-_]?v?[0-9]/i, example: "executor-v2, executor_1" },
  { pattern: /^test[-_]?engineer/i, example: "test-engineer-proxy" },
  { pattern: /^explorer[-_]/i, example: "explorer-debug" },
  { pattern: /^planner[-_]/i, example: "planner-v3" },
  // Generic prefixes that could impersonate builtins
  { pattern: /^(my|custom|new|local)[-_](executor|test[-_]?engineer|explorer|planner)$/i, example: "my-executor" },
  { pattern: /^(executor|test[-_]?engineer|explorer|planner)[-_]?(proxy|hook|override)$/i, example: "executor-override" },
  // Common typosquatting patterns (intentional misspellings)
  { pattern: /^exec[au]t[o0]r$/i, example: "execator" },
  { pattern: /^expl[o0]rer$/i, example: "expl0rer" },
  { pattern: /^plann[ae]r$/i, example: "plannar" },
  // Suffixes that indicate override intent. "override" is already claimed by
  // the proxy/hook/override entry above, so the sample uses "shadow".
  { pattern: /^(executor|test[-_]?engineer|explorer|planner)[-_]?(override|replacement|shadow)$/i, example: "executor-shadow" },
];
79
+
/**
 * Look up the first protected pattern that an agent name matches.
 *
 * The name is lowercased before testing, and entries of
 * `PROTECTED_AGENT_PATTERNS` are tried in declaration order.
 *
 * @param name Candidate agent name.
 * @returns A description of the matching entry (regex plus its example) for
 *   use in error messages, or `null` when no pattern matches.
 */
function matchProtectedPattern(name: string): string | null {
  const lowered = name.toLowerCase();
  const hit = PROTECTED_AGENT_PATTERNS.find(({ pattern }) => pattern.test(lowered));
  if (hit === undefined) {
    return null;
  }
  return `pattern "${hit.pattern}" (example: ${hit.example})`;
}
93
+
/**
 * One audit record produced by the agent-name security checks.
 */
interface SecurityEvent {
  /** Which check produced the record. */
  type: "AGENT_REGISTRATION_BLOCKED" | "PROJECT_AGENT_SHADOW_WARNING";
  /** Agent name exactly as supplied by the caller (untrusted text). */
  name: string;
  /** Machine-readable reason string chosen by the check that fired. */
  reason: string;
  /** Epoch milliseconds at which the event was recorded. */
  timestamp: number;
}

/**
 * Maximum number of events kept in memory. Without a bound, a long-running
 * process that keeps rejecting names would grow the log indefinitely.
 */
const MAX_SECURITY_EVENTS = 1000;

/**
 * In-memory security event log, oldest entry first.
 * TODO: forward events to the project's logging infrastructure
 * (e.g. SIEM, log aggregator, or security webhook).
 */
const securityEventLog: SecurityEvent[] = [];

/**
 * Record a security event and echo it to the console.
 *
 * The event is appended to the in-memory log; once the log exceeds
 * `MAX_SECURITY_EVENTS`, the oldest entries are dropped in place so the array
 * returned by `getSecurityEventLog` keeps its identity.
 *
 * @param event Event to record.
 * @throws RangeError if `event.timestamp` is not a valid date value
 *   (raised by `Date.prototype.toISOString`).
 */
function logSecurityEvent(event: SecurityEvent): void {
  securityEventLog.push(event);
  const overflow = securityEventLog.length - MAX_SECURITY_EVENTS;
  if (overflow > 0) {
    securityEventLog.splice(0, overflow);
  }

  // The agent name is attacker-controllable. JSON-escaping it keeps newlines,
  // quotes and ANSI escapes from forging or recolouring log lines, while
  // producing the same `agent="name"` text for ordinary names.
  const prefix = "\x1b[33m[SECURITY]\x1b[0m"; // Yellow warning
  console.warn(
    `${prefix} ${event.type}: agent=${JSON.stringify(event.name)} reason="${event.reason}" time=${new Date(event.timestamp).toISOString()}`
  );
}

/**
 * Get the recorded security events (for debugging/testing).
 *
 * @returns The live log array, typed read-only; it reflects later calls to
 *   `logSecurityEvent` and `clearSecurityEventLog`.
 */
export function getSecurityEventLog(): readonly SecurityEvent[] {
  return securityEventLog;
}

/**
 * Remove every recorded security event (for testing). The array is emptied
 * in place, so previously returned references observe the reset.
 */
export function clearSecurityEventLog(): void {
  securityEventLog.length = 0;
}
137
+
/**
 * Reject agent names that collide with, or imitate, a protected builtin.
 *
 * The lowercased name is first looked up in `PROTECTED_AGENT_NAMES`; only if
 * that misses is it tested against `PROTECTED_AGENT_PATTERNS`. On a hit, an
 * `AGENT_REGISTRATION_BLOCKED` event is recorded via `logSecurityEvent` and
 * then an error is thrown. Names that hit neither list return normally.
 *
 * @param name Agent name as supplied by the caller.
 * @throws Error describing which rule (exact name or pattern) was violated.
 */
function assertAgentNameAllowed(name: string): void {
  const normalized = name.toLowerCase();
  const exactHit = PROTECTED_AGENT_NAMES.has(normalized);
  // Patterns are consulted only when the exact lookup misses.
  const patternHit = exactHit ? null : matchProtectedPattern(normalized);

  if (!exactHit && patternHit === null) {
    return;
  }

  logSecurityEvent({
    type: "AGENT_REGISTRATION_BLOCKED",
    name,
    reason: exactHit ? `exact_match:${normalized}` : `pattern_match:${patternHit}`,
    timestamp: Date.now(),
  });

  const detail = exactHit
    ? "protected builtin name. Dynamic agents cannot shadow builtin agents (executor, explorer, planner, etc.) to prevent privilege escalation."
    : `name matches protected pattern (${patternHit}). This pattern is blocked to prevent privilege escalation via similar-named agents.`;
  throw new Error(`SECURITY: Cannot register agent '${name}': ${detail}`);
}
180
+
/**
 * Flag, without blocking, a project agent whose name collides with or
 * imitates a protected builtin.
 *
 * - Exact (lowercased) match in `PROTECTED_AGENT_NAMES`: records a
 *   `PROJECT_AGENT_SHADOW_WARNING` event and prints an additional console
 *   warning.
 * - Otherwise, a match against `PROTECTED_AGENT_PATTERNS`: records the event
 *   only.
 * - No match: does nothing.
 *
 * This function never throws on a match; it only reports.
 *
 * @param name Project agent name to inspect.
 */
function checkProjectAgentShadowsBuiltin(name: string): void {
  const normalized = name.toLowerCase();

  if (!PROTECTED_AGENT_NAMES.has(normalized)) {
    // Not an exact builtin name; fall back to the look-alike patterns.
    const patternHit = matchProtectedPattern(normalized);
    if (patternHit === null) {
      return;
    }
    logSecurityEvent({
      type: "PROJECT_AGENT_SHADOW_WARNING",
      name,
      reason: `project_shadows_pattern:${patternHit}`,
      timestamp: Date.now(),
    });
    return;
  }

  logSecurityEvent({
    type: "PROJECT_AGENT_SHADOW_WARNING",
    name,
    reason: "project_shadows_protected_builtin",
    timestamp: Date.now(),
  });
  const warning = [
    `\x1b[33m[SECURITY WARNING]\x1b[0m Project agent "${name}" shadows a protected builtin. `,
    `This agent will be loaded but builtin agents take priority. `,
    `If this is intentional, consider using a different name.`,
  ].join("");
  console.warn(warning);
}
217
+
9
218
  export interface AgentDiscoveryResult {
10
219
  builtin: AgentConfig[];
11
220
  user: AgentConfig[];
@@ -28,6 +237,101 @@ function parseContextMode(value: string | undefined): "fresh" | "fork" | undefin
28
237
  return value === "fresh" || value === "fork" ? value : undefined;
29
238
  }
30
239
 
240
+ // ═══════════════════════════════════════════════════════════════════════════
241
+ // SEC-002 Fix: Agent System Prompt Sanitization
242
+ // Prevents prompt injection via malicious agent files.
243
+ // See: SECURITY-ISSUES.md SEC-002
244
+ // ═══════════════════════════════════════════════════════════════════════════
245
+
/**
 * How much an agent definition is trusted, based on where it was loaded from.
 * The sanitizer uses this to decide how aggressively to filter a prompt.
 */
type TrustLevel = "builtin" | "user" | "project";

/**
 * Map a resource source onto a trust level.
 *
 * "builtin" and "user" map to themselves; every other source (including
 * "project") is treated as "project".
 */
function sourceToTrustLevel(source: ResourceSource): TrustLevel {
  if (source === "builtin" || source === "user") {
    return source;
  }
  return "project";
}
267
+
/**
 * Sanitize agent system prompt content to reduce prompt injection risk.
 *
 * One sanitization pass does the following, in order:
 * - all trust levels: remove zero-width / invisible Unicode characters;
 * - all trust levels: remove HTML comments and `<script>` / `</script>` tags
 *   (the text between the tags is kept);
 * - user and project: remove directive-looking lines (`SYSTEM:`, `OVERRIDE:`,
 *   `IGNORE ALL PREVIOUS INSTRUCTIONS:` ...), bracketed directives, and
 *   fenced blocks labelled system/instruction/prompt; replace encoded
 *   payloads and eval/exec-of-encoded-content calls with a redaction marker;
 * - project only: remove role/persona/behavior/directive assignment lines and
 *   redact lines that look like secret writes or network exfiltration;
 * - collapse runs of blank lines and trim.
 *
 * The pass is repeated until the text stops changing (bounded), because a
 * single pass can be bypassed by nesting: removing the inner match of
 * `<scr<script>ipt>` or `[SYS[SYSTEM: a]TEM: ...]` re-assembles a new
 * payload that only a further pass will catch.
 *
 * This is a best-effort filter, not a guarantee against prompt injection.
 *
 * @param content Raw system prompt text (agent file body).
 * @param source  Where the agent was loaded from; determines strictness.
 * @returns The sanitized, trimmed prompt text.
 */
export function sanitizeAgentSystemPrompt(
  content: string,
  source: ResourceSource
): string {
  const trustLevel = sourceToTrustLevel(source);

  // Upper bound on repeated passes; each pass removes or redacts at least one
  // match, so realistic inputs converge in two or three passes.
  const MAX_SANITIZE_PASSES = 8;

  const sanitizeOnce = (input: string): string => {
    let text = input;

    // 1. Strip zero-width and invisible Unicode characters (all trust levels)
    text = text.replace(/[\u200B-\u200F\u2028-\u202F\u2060-\u206F\uFEFF]/g, "");

    // 2. Strip HTML comments (instruction hiding) and script tags — all trust levels
    text = text.replace(/<!--[\s\S]*?-->|<\/?script[^>]*>/gi, "");

    // 3. Strip known prompt injection directive patterns — user and project
    if (trustLevel !== "builtin") {
      // Strip lines that look like system directives. The IGNORE alternative
      // also covers the full "IGNORE [ALL] PREVIOUS INSTRUCTIONS:" phrase.
      text = text.replace(
        /^\s*(?:SYSTEM|INSTRUCTION|IGNORE(?:\s+ALL)?\s+(?:PREVIOUS(?:\s+INSTRUCTIONS)?|INSTRUCTIONS)?|OVERRIDE|YOUR\s+ROLE\s+IS|MALICIOUS|BACKDOOR)\s*:.*$/gim,
        ""
      );

      // Strip embedded instruction patterns in brackets
      text = text.replace(/\[(?:SYSTEM|INSTRUCTION|OVERRIDE|MALICIOUS)\s*:[^\]]*\]/gi, "");

      // Redact base64/hex-encoded command payloads
      text = text.replace(/\b(base64|base32|hex)\s*['":]\s*([A-Za-z0-9+\/=]{20,})/gi, "[encoded-command-redacted]");

      // Redact eval/exec patterns with encoded content
      text = text.replace(/\b(eval|exec|spawn|subprocess)\s*\(\s*(?:base64|Buffer\.from)\s*\(/gi, "[suspicious-call-redacted]");

      // Strip fenced blocks that attempt to hide instructions
      text = text.replace(/```\s*(?:system|instruction|prompt)\n[\s\S]*?```/gi, "");
    }

    // 4. Project-level strict sanitization
    if (trustLevel === "project") {
      // Strip YAML-like assignment patterns that could override behavior
      text = text.replace(/^\s*(?:role|persona|behavior|directive)\s*[=:].*$/gim, "");

      // Redact potential exfiltration patterns
      text = text.replace(/\b(write|append)\s+.*(?:secrets?|keys?|token|credential)/gi, "[suspicious-write-redacted]");

      // Redact network exfiltration patterns
      text = text.replace(/\b(fetch|curl|wget|axios)\s+.*(?:exfil|steal|leak|send)/gi, "[suspicious-network-redacted]");
    }

    // 5. Collapse multiple blank lines (cleanup after removals)
    text = text.replace(/\n{3,}/g, "\n\n");

    return text.trim();
  };

  let sanitized = sanitizeOnce(content);
  for (let pass = 1; pass < MAX_SANITIZE_PASSES; pass++) {
    const next = sanitizeOnce(sanitized);
    if (next === sanitized) break;
    sanitized = next;
  }
  return sanitized;
}
334
+
31
335
  function parseAgentFile(filePath: string, source: ResourceSource): AgentConfig | undefined {
32
336
  try {
33
337
  const content = fs.readFileSync(filePath, "utf-8");
@@ -39,12 +343,18 @@ function parseAgentFile(filePath: string, source: ResourceSource): AgentConfig |
39
343
  const avoidWhen = parseCsv(frontmatter.avoidWhen);
40
344
  const cost = parseCost(frontmatter.cost);
41
345
  const category = frontmatter.category?.trim() || undefined;
346
+
347
+ // SEC-002: Sanitize system prompt based on source trust level
348
+ const rawSystemPrompt = body.trim();
349
+ const systemPrompt = sanitizeAgentSystemPrompt(rawSystemPrompt, source);
350
+
42
351
  return {
43
352
  name,
44
353
  description,
45
354
  source,
46
355
  filePath,
47
- systemPrompt: body.trim(),
356
+ systemPrompt,
357
+ // ... rest unchanged
48
358
  model: frontmatter.model === "false" ? undefined : frontmatter.model || undefined,
49
359
  fallbackModels: parseCsv(frontmatter.fallbackModels),
50
360
  thinking: frontmatter.thinking === "false" ? undefined : frontmatter.thinking || undefined,
@@ -70,11 +380,20 @@ function parseAgentFile(filePath: string, source: ResourceSource): AgentConfig |
70
380
 
71
381
  function readAgentDir(dir: string, source: ResourceSource): AgentConfig[] {
72
382
  if (!fs.existsSync(dir)) return [];
73
- return fs.readdirSync(dir)
383
+ const agents = fs.readdirSync(dir)
74
384
  .filter((entry) => entry.endsWith(".md") && !entry.endsWith(".team.md") && !entry.endsWith(".workflow.md"))
75
385
  .map((entry) => parseAgentFile(path.join(dir, entry), source))
76
386
  .filter((agent): agent is AgentConfig => agent !== undefined)
77
387
  .sort((a, b) => a.name.localeCompare(b.name));
388
+
389
+ // SEC-001: Warn about project agents that shadow protected builtins
390
+ if (source === "project") {
391
+ for (const agent of agents) {
392
+ checkProjectAgentShadowsBuiltin(agent.name);
393
+ }
394
+ }
395
+
396
+ return agents;
78
397
  }
79
398
 
80
399
  function applyAgentOverrides(agents: AgentConfig[], cwd: string, loadedConfig?: LoadedPiTeamsConfig): AgentConfig[] {
@@ -101,22 +420,30 @@ function applyAgentOverrides(agents: AgentConfig[], cwd: string, loadedConfig?:
101
420
  }
102
421
 
103
422
  // ─── Agent Discovery Cache (Phase 3a) ────────────────────────────────────
104
- // Caches discoverAgents results by cwd with a short TTL to avoid repeated
105
- // disk I/O when multiple callers request agents for the same project.
423
+ // SEC-005 Fix: Uses version-based cache for atomic invalidation.
424
+ // ═══════════════════════════════════════════════════════════════════════════
106
425
 
107
426
  const DISCOVERY_CACHE_TTL_MS = 500;
108
- const discoveryCache = new Map<string, { result: AgentDiscoveryResult; expiresAt: number }>();
427
+ interface CachedDiscoveryEntry {
428
+ result: AgentDiscoveryResult;
429
+ expiresAt: number;
430
+ cacheVersion: number; // SEC-005: Version stamp for atomic invalidation
431
+ }
432
+ const discoveryCache = new Map<string, CachedDiscoveryEntry>();
109
433
  const DISCOVERY_CACHE_MAX_ENTRIES = 32;
110
434
 
111
435
  function pruneDiscoveryCache(): void {
112
436
  const now = Date.now();
437
+ const currentVersion = cacheVersion;
113
438
  for (const [key, entry] of discoveryCache) {
114
- if (entry.expiresAt <= now) discoveryCache.delete(key);
439
+ if (entry.expiresAt <= now || entry.cacheVersion < currentVersion) {
440
+ discoveryCache.delete(key);
441
+ }
115
442
  }
116
443
  }
117
444
 
118
- /** Invalidate cached discovery result for a given cwd (or all if omitted). */
119
445
  export function invalidateAgentDiscoveryCache(cwd?: string): void {
446
+ incrementCacheVersion();
120
447
  if (cwd) {
121
448
  discoveryCache.delete(cwd);
122
449
  } else {
@@ -126,8 +453,10 @@ export function invalidateAgentDiscoveryCache(cwd?: string): void {
126
453
 
127
454
  export function discoverAgents(cwd: string): AgentDiscoveryResult {
128
455
  pruneDiscoveryCache();
456
+ const currentVersion = cacheVersion;
129
457
  const cached = discoveryCache.get(cwd);
130
- if (cached && cached.expiresAt > Date.now()) {
458
+ // SEC-005: Check both TTL expiry AND version stamp
459
+ if (cached && cached.expiresAt > Date.now() && cached.cacheVersion >= currentVersion) {
131
460
  return cached.result;
132
461
  }
133
462
  const loaded = loadConfig(cwd);
@@ -136,7 +465,8 @@ export function discoverAgents(cwd: string): AgentDiscoveryResult {
136
465
  user: applyAgentOverrides(readAgentDir(path.join(userPiRoot(), "agents"), "user"), cwd, loaded),
137
466
  project: applyAgentOverrides(readAgentDir(path.join(projectCrewRoot(cwd), "agents"), "project"), cwd, loaded),
138
467
  };
139
- discoveryCache.set(cwd, { result, expiresAt: Date.now() + DISCOVERY_CACHE_TTL_MS });
468
+ // SEC-005: Store with current version stamp
469
+ discoveryCache.set(cwd, { result, expiresAt: Date.now() + DISCOVERY_CACHE_TTL_MS, cacheVersion: currentVersion });
140
470
  while (discoveryCache.size > DISCOVERY_CACHE_MAX_ENTRIES) {
141
471
  const oldest = discoveryCache.keys().next().value;
142
472
  if (oldest !== undefined) discoveryCache.delete(oldest);
@@ -150,13 +480,15 @@ export function discoverAgents(cwd: string): AgentDiscoveryResult {
150
480
 
151
481
  const dynamicAgents = new Map<string, AgentConfig>();
152
482
 
153
- /** Register a dynamic agent at runtime. Throws if already registered. */
483
+ /** Register a dynamic agent at runtime. Throws if already registered or if name is protected. */
154
484
  export function registerDynamicAgent(config: AgentConfig): void {
155
485
  const key = config.name.toLowerCase();
486
+ // Security check: prevent shadowing of builtin agents (SEC-001)
487
+ assertAgentNameAllowed(config.name);
156
488
  if (dynamicAgents.has(key)) {
157
489
  throw new Error(`Agent already registered: ${config.name}`);
158
490
  }
159
- dynamicAgents.set(key, { ...config, source: config.source ?? "project" });
491
+ dynamicAgents.set(key, { ...config, source: "dynamic" }); // Always "dynamic" — cannot be spoofed
160
492
  invalidateAgentDiscoveryCache();
161
493
  }
162
494
 
@@ -183,10 +515,16 @@ export function allAgents(discovery: AgentDiscoveryResult | undefined): AgentCon
183
515
  for (const agent of [...discovery.project, ...discovery.builtin, ...discovery.user]) {
184
516
  byName.set(agent.name.toLowerCase(), agent);
185
517
  }
186
- // Dynamic agents (registered at runtime) take highest precedence.
187
- // They can override any discovered agent (project/builtin/user).
518
+ // Dynamic agents only fill gaps; they cannot override discovered (project/builtin/user) agents.
519
+ // SECURITY: Dynamic agents are less trusted (registered at runtime by extensions/hooks).
520
+ // They are only used if no builtin/user agent with the same name exists.
188
521
  for (const agent of dynamicAgents.values()) {
189
- byName.set(agent.name.toLowerCase(), agent);
522
+ const key = agent.name.toLowerCase();
523
+ if (!byName.has(key)) {
524
+ byName.set(key, agent);
525
+ }
526
+ // NOTE: If an agent with the same name exists, the dynamic version is ignored.
527
+ // This prevents privilege escalation via agent shadowing (SEC-001).
190
528
  }
191
529
  return [...byName.values()].filter((agent) => !agent.disabled).sort((a, b) => a.name.localeCompare(b.name));
192
530
  }
@@ -91,10 +91,16 @@ export function resolveTaskSkillNames(input: ResolveTaskSkillsInput): string[] {
91
91
  return collectTaskSkillNames(input).slice(0, MAX_SELECTED_SKILLS);
92
92
  }
93
93
 
94
+ // ═══════════════════════════════════════════════════════════════════════════
95
+ // SEC-003 Fix: Reverse skill search order (package first, project second)
96
+ // Prevents malicious project skills from overriding trusted package skills.
97
+ // See: SECURITY-ISSUES.md SEC-003
98
+ // ═══════════════════════════════════════════════════════════════════════════
99
+
94
100
  function candidateSkillDirs(cwd: string): Array<{ root: string; source: "project" | "package" }> {
95
101
  return [
96
- { root: path.resolve(cwd, "skills"), source: "project" },
97
- { root: PACKAGE_SKILLS_DIR, source: "package" },
102
+ { root: PACKAGE_SKILLS_DIR, source: "package" }, // ✓ Trusted first
103
+ { root: path.resolve(cwd, "skills"), source: "project" }, // ⚠️ Override second
98
104
  ];
99
105
  }
100
106
 
@@ -2,6 +2,51 @@ import * as path from "node:path";
2
2
  import type { TeamRunManifest, TaskPacket, TaskScope, VerificationContract } from "../state/types.ts";
3
3
  import type { WorkflowStep } from "../workflows/workflow-config.ts";
4
4
 
5
+ // ═══════════════════════════════════════════════════════════════════════════
6
+ // SEC-007 Fix: Workflow Step Task Sanitization
7
+ // Task text handed to workers comes from workflow definitions that could
8
+ // be user-controlled. Sanitize task text to prevent injection.
9
+ // See: SECURITY-ISSUES.md SEC-007
10
+ // ═══════════════════════════════════════════════════════════════════════════
11
+
12
+
/**
 * Sanitize workflow step task text to reduce injection risk.
 *
 * The task text is used as a prompt for worker agents. In a multi-tenant
 * or shared workflow scenario, malicious workflow definitions could
 * embed injection instructions.
 *
 * One sanitization pass:
 * - removes zero-width / invisible Unicode characters;
 * - removes directive-looking lines (`SYSTEM:`, `OVERRIDE:`,
 *   `IGNORE [ALL] [PREVIOUS] INSTRUCTIONS:` ...);
 * - replaces base64/base32/hex payloads with `[encoded-redacted]`;
 * - removes bracketed directives such as `[SYSTEM: ...]`;
 * - collapses runs of blank lines and trims.
 *
 * The pass is repeated until the text stops changing (bounded). A single
 * pass is not enough: removing `[SYSTEM: a]` from `[SYSTEM: a]OVERRIDE: x`
 * leaves a fresh `OVERRIDE:` directive line behind.
 *
 * @param task Raw task text from a workflow step.
 * @returns The sanitized, trimmed task text.
 */
export function sanitizeTaskText(task: string): string {
  // Upper bound on repeated passes; every effective pass shortens the text.
  const MAX_SANITIZE_PASSES = 8;

  const sanitizeOnce = (input: string): string => {
    let text = input;

    // 1. Strip zero-width and invisible Unicode characters
    text = text.replace(/[\u200B-\u200F\u2028-\u202F\u2060-\u206F\uFEFF]/g, "");

    // 2. Strip known prompt injection directive patterns
    //    ("PREVIOUS" is optional so the canonical phrase is covered too).
    text = text.replace(
      /^\s*(?:SYSTEM|INSTRUCTION|IGNORE(?:\s+ALL)?\s+(?:PREVIOUS\s+)?INSTRUCTIONS|OVERRIDE|YOUR\s+ROLE\s+IS|MALICIOUS)\s*:.*$/gim,
      ""
    );

    // 3. Redact base64/hex encoded command payloads
    text = text.replace(/\b(?:base64|base32|hex)\s*['":]\s*([A-Za-z0-9+\/=]{16,})/gi, "[encoded-redacted]");

    // 4. Strip embedded instruction patterns in brackets
    text = text.replace(/\[(?:SYSTEM|INSTRUCTION|OVERRIDE)\s*:[^\]]*\]/gi, "");

    // 5. Collapse multiple blank lines
    text = text.replace(/\n{3,}/g, "\n\n");

    return text.trim();
  };

  let sanitized = sanitizeOnce(task);
  for (let pass = 1; pass < MAX_SANITIZE_PASSES; pass++) {
    const next = sanitizeOnce(sanitized);
    if (next === sanitized) break;
    sanitized = next;
  }
  return sanitized;
}
49
+
5
50
  export interface BuildTaskPacketInput {
6
51
  manifest: TeamRunManifest;
7
52
  step: WorkflowStep;
@@ -34,8 +79,10 @@ export function buildTaskPacket(input: BuildTaskPacketInput): TaskPacket {
34
79
  const scope = inferTaskScope(input.step);
35
80
  const reads = input.step.reads === false ? [] : input.step.reads ?? [];
36
81
  const scopePath = reads.length === 1 ? reads[0] : reads.length > 1 ? reads.join(", ") : undefined;
82
+ // SEC-007: Sanitize task text before inserting into task packet
83
+ const sanitizedTask = sanitizeTaskText(input.step.task);
37
84
  return {
38
- objective: input.step.task.replaceAll("{goal}", input.manifest.goal),
85
+ objective: sanitizedTask.replaceAll("{goal}", input.manifest.goal),
39
86
  scope,
40
87
  scopePath,
41
88
  repo: path.basename(input.manifest.cwd) || input.manifest.cwd,
@@ -0,0 +1,114 @@
1
+ /**
2
+ * Integration check: validates pi-crew core discovery and team-run functionality.
3
+ * Run with: node --experimental-strip-types --test test-integration-check.ts
4
+ */
5
+ import * as fs from "node:fs";
6
+ import * as os from "node:os";
7
+ import * as path from "node:path";
8
+ import test from "node:test";
9
+ import assert from "node:assert/strict";
10
+
11
+ import { discoverAgents, allAgents } from "./src/agents/discover-agents.ts";
12
+ import { discoverTeams, allTeams } from "./src/teams/discover-teams.ts";
13
+ import { discoverWorkflows, allWorkflows } from "./src/workflows/discover-workflows.ts";
14
+ import { handleTeamTool } from "./src/extension/team-tool.ts";
15
+ import { loadRunManifestById } from "./src/state/state-store.ts";
16
+
17
+ const pkgRoot = path.resolve(import.meta.dirname ?? ".");
18
+
19
+ // ── Discovery tests ──────────────────────────────────────────────────────
20
+
// Agent discovery from the package root must yield at least ten builtin
// agents, and the merged agent list must contain "executor".
test("discovers builtin agents", () => {
  const found = discoverAgents(pkgRoot);
  assert.ok(found, "discoverAgents should return a result");

  const builtinCount = found.builtin.length;
  assert.ok(builtinCount >= 10, `Expected ≥10 builtin agents, got ${builtinCount}`);

  const agentNames = allAgents(found).map((agent) => agent.name);
  assert.ok(
    agentNames.includes("executor"),
    `Missing "executor" agent. Got: ${agentNames.join(", ")}`,
  );
});
32
+
// Team discovery from the package root must yield at least six builtin
// teams, and the merged team list must contain "fast-fix".
test("discovers builtin teams", () => {
  const found = discoverTeams(pkgRoot);
  assert.ok(found, "discoverTeams should return a result");

  const builtinCount = found.builtin.length;
  assert.ok(builtinCount >= 6, `Expected ≥6 builtin teams, got ${builtinCount}`);

  const teamNames = allTeams(found).map((team) => team.name);
  assert.ok(
    teamNames.includes("fast-fix"),
    `Missing "fast-fix" team. Got: ${teamNames.join(", ")}`,
  );
});
44
+
// Workflow discovery from the package root must yield at least six builtin
// workflows, and the merged workflow list must contain "fast-fix".
test("discovers builtin workflows", () => {
  const found = discoverWorkflows(pkgRoot);
  assert.ok(found, "discoverWorkflows should return a result");

  const builtinCount = found.builtin.length;
  assert.ok(builtinCount >= 6, `Expected ≥6 builtin workflows, got ${builtinCount}`);

  const workflowNames = allWorkflows(found).map((workflow) => workflow.name);
  assert.ok(
    workflowNames.includes("fast-fix"),
    `Missing "fast-fix" workflow. Got: ${workflowNames.join(", ")}`,
  );
});
59
+
60
+ // ── Team run test ─────────────────────────────────────────────────────────
61
+
// End-to-end run of the "fast-fix" team in a throwaway directory, with
// PI_TEAMS_EXECUTE_WORKERS=1 and PI_TEAMS_MOCK_CHILD_PI=success set for the
// duration of the test. Both variables and the temp directory are restored /
// removed afterwards, whether or not the assertions pass.
test("fast-fix team run completes successfully with mock child Pi", async () => {
  const workDir = fs.mkdtempSync(path.join(os.tmpdir(), "pi-crew-int-check-"));
  fs.mkdirSync(path.join(workDir, ".crew"), { recursive: true });

  // Snapshot the environment variables this test overrides.
  const savedEnv = {
    PI_TEAMS_EXECUTE_WORKERS: process.env.PI_TEAMS_EXECUTE_WORKERS,
    PI_TEAMS_MOCK_CHILD_PI: process.env.PI_TEAMS_MOCK_CHILD_PI,
  };
  process.env.PI_TEAMS_EXECUTE_WORKERS = "1";
  process.env.PI_TEAMS_MOCK_CHILD_PI = "success";

  try {
    const outcome = await handleTeamTool(
      { action: "run", team: "fast-fix", goal: "create a hello.txt file" },
      { cwd: workDir },
    );

    // run result is not an error
    assert.equal(outcome.isError, false, `handleTeamTool returned error: ${JSON.stringify(outcome)}`);

    const runId = outcome.details.runId;
    assert.ok(runId, "Expected a runId in details");

    // manifest should be persisted and completed
    const persisted = loadRunManifestById(workDir, runId!);
    assert.ok(persisted, "loadRunManifestById should return data");
    const manifestStatus = persisted!.manifest.status;
    assert.equal(
      manifestStatus,
      "completed",
      `Expected manifest status "completed", got "${manifestStatus}"`,
    );

    // all tasks should be completed
    const statuses = persisted!.tasks.map((task) => task.status);
    const allDone = statuses.every((status) => status === "completed");
    assert.ok(allDone, `Not all tasks completed: ${JSON.stringify(statuses)}`);

    // artifacts directory should exist
    const artifactsDir = path.join(workDir, ".crew", "artifacts", runId!);
    assert.ok(
      fs.existsSync(artifactsDir),
      `Artifacts directory should exist: ${artifactsDir}`,
    );

    console.log(`✅ fast-fix run ${runId} completed successfully with ${persisted!.tasks.length} tasks`);
  } finally {
    // Put each overridden variable back exactly as it was (unset stays unset).
    for (const [key, previous] of Object.entries(savedEnv)) {
      if (previous === undefined) delete process.env[key];
      else process.env[key] = previous;
    }
    fs.rmSync(workDir, { recursive: true, force: true });
  }
});