pi-crew 0.3.9 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,28 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.4.0] — 9arm-skills Enforcement Patterns & Integration Tests (2026-05-26)
4
+
5
+ ### Features
6
+ - **systematic-debugging: Refuse Gate** — Hard constraints before proposing fixes. Must verify that a reliable repro exists, the root cause is known, and an attempt to falsify the hypothesis has been made before any fix.
7
+ - **systematic-debugging: Recite Ritual** — Psychological anchor at session start. Recite 4-step mantra before beginning any debug session.
8
+ - **systematic-debugging: Falsify-First** — Phase 3 now requires disproof before proof. Run disproof experiments first to save time on wrong hypotheses.
9
+ - **systematic-debugging: Breadcrumb Ledger** — Structured experiment tracking within debug sessions.
10
+ - **multi-perspective-review: Simpler Alternative Pass** — Mandatory pre-review step to question if the change should exist at all.
11
+ - **New skill: scrutinize** — Outsider-perspective review questioning intent before tracing code.
12
+ - **New skill: post-mortem** — Engineering RCA documentation with 4 required inputs gate.
13
+ - **skills/REFERENCE.md** — New documentation of skill chains, inventory, and anti-patterns.
14
+ - **Trigger conditions** added to all major skill descriptions for better skill invocation matching.
15
+
16
+ ### Bug Fixes
17
+ - **CI reliability** — Fixed flaky tests on macOS: crew-widget and render-scheduler timing issues resolved.
18
+ - **Team-context import detection** — Fixed regex to correctly match only direct `/team-tool.ts` imports, not `/team-tool/context.ts`.
19
+
20
+ ### Tests
21
+ - **New test-integration-check.ts** — Integration tests for core pi-crew functionality (agent/team/workflow discovery, fast-fix team run).
22
+ - **1740 tests passing** across all platforms (Ubuntu, macOS, Windows).
23
+
24
+ ---
25
+
3
26
  ## [0.3.8] — Zombie Run Auto-Repair & Test Stability (2026-05-25)
4
27
 
5
28
  ### Features
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pi-crew",
3
- "version": "0.3.9",
3
+ "version": "0.4.0",
4
4
  "description": "Pi extension for coordinated AI teams, workflows, worktrees, and async task orchestration",
5
5
  "author": "baphuongna",
6
6
  "license": "MIT",
@@ -0,0 +1,136 @@
1
+ # pi-crew Skills Reference
2
+
3
+ ## Skill Chains
4
+
5
+ ### Bug Investigation
6
+
7
+ ```
8
+ systematic-debugging (4 phases with refuse gate)
9
+
10
+ verification-before-done (evidence before claim)
11
+
12
+ post-mortem (RCA documentation)
13
+ ```
14
+
15
+ ### Multi-phase Work
16
+
17
+ ```
18
+ orchestration (phase coordination)
19
+
20
+ delegation-patterns (task splitting)
21
+
22
+ verification-before-done (after each phase)
23
+ ```
24
+
25
+ ### Code Review (Quick)
26
+
27
+ ```
28
+ scrutinize (outsider perspective + simpler alternative)
29
+ ```
30
+
31
+ ### Code Review (Deep)
32
+
33
+ ```
34
+ scrutinize (outsider perspective)
35
+
36
+ multi-perspective-review (8-pass deep review)
37
+
38
+ secure-agent-orchestration-review (security focus)
39
+ ```
40
+
41
+ ---
42
+
43
+ ## When to Invoke
44
+
45
+ | Situation | Skill |
46
+ |-----------|-------|
47
+ | Bug / test failure / crash | `systematic-debugging` |
48
+ | Before claiming done | `verification-before-done` |
49
+ | Code review (quick) | `scrutinize` |
50
+ | Code review (deep) | `multi-perspective-review` |
51
+ | Task delegation | `delegation-patterns` |
52
+ | Complex multi-phase work | `orchestration` |
53
+ | After bug is fixed | `post-mortem` |
54
+ | Security review | `secure-agent-orchestration-review` |
55
+ | Workspace safety | `workspace-isolation` |
56
+ | Bash safety | `safe-bash` |
57
+
58
+ ---
59
+
60
+ ## Skills Inventory
61
+
62
+ ### Core Discipline
63
+
64
+ | Skill | Description |
65
+ |-------|-------------|
66
+ | `systematic-debugging` | Four-phase debugging with refuse gates, falsify-first discipline |
67
+ | `verification-before-done` | Evidence before claims |
68
+ | `orchestration` | Multi-phase coordination, 8 rules including "respawn not absorb" |
69
+
70
+ ### Review
71
+
72
+ | Skill | Description |
73
+ |-------|-------------|
74
+ | `scrutinize` | Outsider-perspective review questioning intent |
75
+ | `multi-perspective-review` | 8-pass deep code review |
76
+ | `secure-agent-orchestration-review` | Security-focused review |
77
+
78
+ ### Documentation
79
+
80
+ | Skill | Description |
81
+ |-------|-------------|
82
+ | `post-mortem` | Engineering RCA record |
83
+
84
+ ### Delegation
85
+
86
+ | Skill | Description |
87
+ |-------|-------------|
88
+ | `delegation-patterns` | Task splitting patterns |
89
+ | `requirements-to-task-packet` | Task packet creation |
90
+
91
+ ### Runtime/Safety
92
+
93
+ | Skill | Description |
94
+ |-------|-------------|
95
+ | `workspace-isolation` | Security boundary enforcement |
96
+ | `worktree-isolation` | Git worktree safety |
97
+ | `safe-bash` | Bash command safety |
98
+ | `state-mutation-locking` | State mutation protection |
99
+
100
+ ### Observability
101
+
102
+ | Skill | Description |
103
+ |-------|-------------|
104
+ | `event-log-tracing` | JSONL event log analysis |
105
+ | `runtime-state-reader` | Runtime state inspection |
106
+ | `observability-reliability` | Reliability patterns |
107
+
108
+ ---
109
+
110
+ ## Anti-patterns
111
+
112
+ | Anti-pattern | Skill | Rule |
113
+ |--------------|-------|------|
114
+ | Proposing fix before reproducing | `systematic-debugging` | Refuse Gate |
115
+ | Running proof before disproof | `systematic-debugging` | Phase 3 |
116
+ | Claiming "tests pass" without fresh run | `verification-before-done` | Gate Function |
117
+ | Reviewing diff-local without tracing path | `scrutinize` | Trace step |
118
+ | Skipping simpler-alternative pass | `multi-perspective-review` | Pre-review |
119
+ | Editing files yourself as orchestrator | `orchestration` | Rule 1 |
120
+ | Dispatching serially when parallel possible | `orchestration` | Rule 3 |
121
+ | Committing a red tree | `orchestration` | Rule 6 |
122
+ | Absorbing subagent's broken work | `orchestration` | Rule 7 |
123
+ | Rubber-stamp review | `multi-perspective-review` | Rules |
124
+
125
+ ---
126
+
127
+ ## Key Enforcement Patterns (from 9arm)
128
+
129
+ | Pattern | Implemented In |
130
+ |---------|---------------|
131
+ | **Refuse Gate** | `systematic-debugging` |
132
+ | **Recite Ritual** | `systematic-debugging` (Invocation) |
133
+ | **Falsify Before Proof** | `systematic-debugging` (Phase 3) |
134
+ | **Simpler Alternative Pass** | `scrutinize`, `multi-perspective-review` |
135
+ | **Required Inputs Gate** | `post-mortem` |
136
+ | **Respawn Not Absorb** | `orchestration` (Rule 7) |
@@ -1,6 +1,6 @@
1
1
  ---
2
2
  name: delegation-patterns
3
- description: Subagent/team delegation workflow. Use when splitting work across pi-crew teams, direct agents, async background workers, chains, or parallel research/review tasks.
3
+ description: "Subagent/team delegation workflow. Use when splitting work across pi-crew teams, direct agents, async background workers, chains, or parallel tasks. Triggers: delegate this, split this task, parallelize, dispatch workers, assign to team, spawn agents."
4
4
  ---
5
5
 
6
6
  # delegation-patterns
@@ -1,6 +1,6 @@
1
1
  ---
2
2
  name: event-log-tracing
3
- description: Structured event logging system for worker lifecycle, live agents, and crash recovery. Use when debugging worker crashes, tracing agent lifecycle, or investigating stale runs.
3
+ description: "Structured event logging for worker lifecycle, live agents, crash recovery. Use when debugging crashes, tracing agent lifecycle, investigating stale runs. Triggers: event log, trace events, worker crashed, agent died, stale run, events.jsonl."
4
4
  ---
5
5
 
6
6
  # event-log-tracing
@@ -1,6 +1,6 @@
1
1
  ---
2
2
  name: multi-perspective-review
3
- description: Use when reviewing a plan, diff, implementation, worker output, release candidate, or external review feedback.
3
+ description: "Multi-perspective code review with simpler-alternative pass. Use when reviewing a plan, diff, implementation, worker output, release candidate, or external feedback. Triggers: review this, look at this, LGTM check, sanity check, audit this, get a second opinion, check this PR, examine this code."
4
4
  ---
5
5
 
6
6
  # multi-perspective-review
@@ -9,6 +9,22 @@ Core principle: review early, review often, and separate concerns. Reviewer outp
9
9
 
10
10
  Distilled from detailed reads of requesting-code-review, receiving-code-review, subagent review checkpoints, differential review, and specialized review-agent patterns.
11
11
 
12
+ ## Pre-review: Simpler Alternative Pass (Mandatory)
13
+
14
+ Before running any review passes, ask:
15
+
16
+ 1. **Is there a simpler, smaller, or more elegant way to achieve the same goal?**
17
+ - Doing nothing (is the problem real and load-bearing?)
18
+ - Using something that already exists in the codebase
19
+ - A smaller change that solves 90% of the goal with 10% of the risk
20
+ - Solving it at a different layer (config vs code, framework vs app)
21
+ 2. If a better alternative exists, surface it BEFORE the line-by-line review.
22
+ 3. Skip only if the user explicitly says "don't question scope."
23
+
24
+ This is the most valuable finding you can produce — surfacing unnecessary complexity before reviewing its details.
25
+
26
+ ---
27
+
12
28
  ## Review Passes
13
29
 
14
30
  Run relevant passes separately:
@@ -1,6 +1,6 @@
1
1
  ---
2
2
  name: orchestration
3
- description: Multi-phase orchestration skill for pi-crew planners and executors. Use when decomposing complex tasks into parallel phases, dispatching workers, verifying gates, and iterating until closure.
3
+ description: "Multi-phase orchestration for planners and executors. Use when decomposing complex tasks into parallel phases, dispatching workers, verifying gates, and iterating to closure. Triggers: orchestrate this, coordinate these tasks, run this multi-phase, dispatch workers, coordinate team."
4
4
  ---
5
5
 
6
6
  # orchestration
@@ -0,0 +1,90 @@
1
+ ---
2
+ name: post-mortem
3
+ description: "Write engineering RCA record after bug is fixed. Use when asking: write post-mortem, RCA, root cause analysis, document this fix, close out this bug. Triggers: post-mortem, postmortem, root cause, RCA, document this fix, write up the cause, close out bug."
4
+ ---
5
+
6
+ # post-mortem
7
+
8
+ The canonical engineering record of a bug fix. Written after debugging lands a real fix.
9
+
10
+ ## Required Inputs — Refuse to Draft Without These
11
+
12
+ - [ ] **Reliable repro exists** (deterministic or high-rate flake)
13
+ - [ ] **Root cause is known** (mechanism identified, not a hypothesis)
14
+ - [ ] **Fix is identified** (PR / commit / branch)
15
+ - [ ] **Fix is validated** (original repro now passes)
16
+
17
+ If any are missing → list what's missing and stop. Do not draft.
18
+
19
+ ## Structure
20
+
21
+ ### 1. Summary
22
+
23
+ What broke (user terms), what fixed it (one sentence). JIRA key, PR, owner. A reader who stops here should have the right answer.
24
+
25
+ ### 2. Symptom
26
+
27
+ Concrete: test output, error message, log line. No paraphrase. What was actually observed.
28
+
29
+ ### 3. Root Cause
30
+
31
+ The actual bug mechanism. Code identifiers welcome — function names, file paths, branch conditions. Walk the cause chain end-to-end.
32
+
33
+ ### 4. Why It Produced the Symptom
34
+
35
+ Walk the chain so the reader connects symptom to cause. Often non-obvious — the bug is in X but the visible failure is in Y.
36
+
37
+ ### 5. Fix
38
+
39
+ What changed and why this addresses root cause. Link to PR/commit. If a previous fix attempt papered over the symptom, name it and explain what was wrong.
40
+
41
+ ### 6. How It Was Found
42
+
43
+ Short. The debugging path:
44
+
45
+ - What repro made it deterministic
46
+ - What tools cracked it
47
+ - Hypotheses tried and rejected (with one-line reason each)
48
+ - The single experiment that confirmed the cause
49
+
50
+ ### 7. Why It Slipped Through
51
+
52
+ CI gap? Latent code? Workload gap? Incomplete prior fix? Review miss? Be specific.
53
+
54
+ If the honest answer is "no good reason" — say so. **Blameless** — describe the gap, not the person.
55
+
56
+ ### 8. Validation
57
+
58
+ How we know the fix works:
59
+
60
+ - Original failing test now passes (test name)
61
+ - Customer workload now completes (workload identifier)
62
+ - Other affected configs/workloads also tested
63
+
64
+ If only one config validated, say so explicitly.
65
+
66
+ ### 9. Action Items
67
+
68
+ What + owner + tracking artifact:
69
+
70
+ - Regression test added at <seam>. (Owner, test name)
71
+ - CI gap closed: <new check>. (Owner, ticket)
72
+ - Doc/runbook updated. (Owner, link)
73
+
74
+ If none needed: "None — fix is sufficient and no class-of-bug follow-up warranted."
75
+
76
+ ## Tone
77
+
78
+ This is engineer-to-engineer:
79
+
80
+ - **Code identifiers are first-class.** Keep them — future engineers grep their way back.
81
+ - **Mechanism over narrative.** Walk the cause chain, don't soften.
82
+ - **Blameless.** Describe gaps and bugs, never people.
83
+ - **No hedging.** State it or don't write it.
84
+
85
+ ## Rules
86
+
87
+ - Never invent facts.
88
+ - Never strip code identifiers (they are the index).
89
+ - State validation coverage honestly.
90
+ - Get sign-off before posting to JIRA.
@@ -1,6 +1,6 @@
1
1
  ---
2
2
  name: safe-bash
3
- description: Safe shell-command workflow. Use whenever a task may execute shell commands, especially to prefer read-only commands and avoid destructive actions without confirmation.
3
+ description: "Safe shell-command workflow. Use when executing shell commands; prefer read-only commands and avoid destructive actions without confirmation. Triggers: run this command, execute bash, safe bash, avoid rm, destructive command, shell injection."
4
4
  ---
5
5
 
6
6
  # safe-bash
@@ -0,0 +1,67 @@
1
+ ---
2
+ name: scrutinize
3
+ description: "Outsider-perspective review questioning intent before tracing code. Use when asking: should this even exist?, is there a simpler way?, get a second opinion, before deep code review. Triggers: scrutinize this, question this, is there a better way?, simplify this, overkill?, too complex."
4
+ ---
5
+
6
+ # Scrutinize
7
+
8
+ Stand outside the change and ask whether it should exist at all, then verify it actually does what it claims end-to-end.
9
+
10
+ ## Operating Stance
11
+
12
+ - **Outsider.** Forget who wrote it and why they think it's right. Read the artifact cold.
13
+ - **End-to-end, not diff-local.** The diff is the entry point, not the scope.
14
+ - **Actionable, concise, with rationale.** Every finding states what to change, why, and what evidence led you there.
15
+
16
+ ## Workflow
17
+
18
+ ### 1. Intent — Is this necessary?
19
+
20
+ - State the goal in one sentence, in your own words. If you cannot, the artifact is underspecified — say so and stop.
21
+ - Ask: **Is there a simpler way?**
22
+ - Delete it / do nothing (is the problem real and load-bearing?)
23
+ - Use existing code (does this already exist?)
24
+ - Smaller change (solves 90% of goal with 10% of risk?)
25
+ - Different layer (config vs code, framework vs app, build vs runtime?)
26
+ - If a better alternative exists, name it BEFORE the line-by-line review.
27
+
28
+ ### 2. Trace — Walk the actual code path
29
+
30
+ - For each behavior the change claims, trace end-to-end through real code — not just the lines in the diff.
31
+ - Include unchanged code on either side of the diff. Bugs hide at the seams.
32
+ - Entry point → call sites → branches taken → state mutated → exit/return/side effect.
33
+
34
+ ### 3. Verify — Does it do what it claims?
35
+
36
+ - Does the traced code actually produce the behavior?
37
+ - What inputs/states would break it? (Edge cases, concurrent callers, error paths, partial failures, retries, empty/null/unicode/huge inputs)
38
+ - What does it silently change? (Performance, error semantics, observability, contracts)
39
+ - How is it tested? (Do tests exercise the traced path, or pass while skipping it?)
40
+
41
+ ### 4. Report
42
+
43
+ Format per finding:
44
+
45
+ ```text
46
+ [severity] file:line
47
+ Issue: ...
48
+ Impact: ...
49
+ Fix: ...
50
+ ```
51
+
52
+ Severity:
53
+
54
+ - critical: data loss, secret leak, arbitrary command/path escape
55
+ - high: broken core workflow, ownership bypass
56
+ - medium: regression, flaky behavior
57
+ - low: polish, maintainability
58
+
59
+ Close with verdict: **ship / fix-then-ship / rework / reject** — with single biggest reason.
60
+
61
+ ## Rules
62
+
63
+ - **No rubber-stamps.** "LGTM" is not an output. If nothing found, say what you traced.
64
+ - **Cite or it didn't happen.** Every claim needs specific path/file/line.
65
+ - **One simpler-alternative pass is MANDATORY.** Skip only if user says "don't question scope."
66
+ - **Distinguish claim from verification.** "The PR says X" and "I traced X and confirmed" are different.
67
+ - **No flattery, no hedging.** State the finding.
@@ -1,6 +1,6 @@
1
1
  ---
2
2
  name: systematic-debugging
3
- description: Use when encountering a bug, test failure, blocked run, provider error, stale state, crash, or unexpected behavior before proposing fixes.
3
+ description: "Four-phase debugging discipline with refuse gates. Use when encountering a bug, test failure, blocked run, provider error, stale state, crash, or unexpected behavior. Triggers: debug this, investigate, fix this bug, something is broken, crash, error, test failed, it broke, not working, unexpected."
4
4
  ---
5
5
 
6
6
  # systematic-debugging
@@ -9,6 +9,36 @@ Core principle: no fixes without root-cause investigation first. Symptom patches
9
9
 
10
10
  Distilled from detailed reads of systematic-debugging, root-cause tracing, TDD, and error-analysis skill patterns.
11
11
 
12
+ ## Invocation — Read Before Debugging
13
+
14
+ Before beginning any debug session, recite these four steps:
15
+
16
+ > **1. First is reproducibility.** Can the issue be reproduced reliably?
17
+ > **2. Know the fail path.** Where does the code break and what stops it from breaking?
18
+ > **3. Question your hypothesis.** What would disprove it?
19
+ > **4. Every run is a breadcrumb.** Cross-reference all of them.
20
+
21
+ If the user says "skip the ritual" → skip the recitation but still apply the four phases silently.
22
+
23
+ ---
24
+
25
+ ## Refuse Gate — Do NOT Proceed Without These
26
+
27
+ Before proposing ANY fix:
28
+
29
+ - [ ] **Can you reproduce the issue reliably?** (deterministic or >50% flake rate)
30
+ - [ ] **Do you know the root cause?** (confirmed mechanism, not a hypothesis)
31
+ - [ ] **Have you tried to FALSIFY your hypothesis first?** (disproof before proof)
32
+
33
+ If ANY answer is NO:
34
+ → Stop.
35
+ → State what's missing.
36
+ → Do not propose a fix.
37
+
38
+ Exception: if the user explicitly says "just patch the symptom" — proceed but flag it as a symptom patch, not a root-cause fix.
39
+
40
+ ---
41
+
12
42
  ## Four Phases
13
43
 
14
44
  ### 1. Root Cause Investigation
@@ -34,9 +64,14 @@ user/tool params → config resolution → team/workflow/agent discovery → mod
34
64
  - Identify dependencies: config home, project root markers, env vars, locks, stale caches, provider model capabilities.
35
65
  - Do not assume small differences are irrelevant.
36
66
 
37
- ### 3. Hypothesis and Test
67
+ ### 3. Hypothesis and Test — Falsify First
38
68
 
39
- - State one hypothesis: I think X is the root cause because Y.”
69
+ - State each hypothesis explicitly: "I think X is the root cause because Y."
70
+ - Generate 3-5 ranked hypotheses, not just one. Single-hypothesis thinking anchors on the first plausible idea.
71
+ - For each hypothesis:
72
+ - What is the simplest **proof**? What is the cleanest **disproof**?
73
+ - Run the **disproof FIRST**. If the hypothesis survives, it's real. If it dies, you saved time chasing a phantom.
74
+ - Does it explain the symptom end-to-end? Walk it through.
40
75
  - Test one variable at a time with the smallest read-only probe or targeted test.
41
76
  - If wrong, discard the hypothesis instead of piling on fixes.
42
77
  - After three failed fixes, question architecture or assumptions before continuing.
@@ -45,7 +80,7 @@ user/tool params → config resolution → team/workflow/agent discovery → mod
45
80
 
46
81
  - Add or identify a failing regression test when practical.
47
82
  - Fix the root cause, not the symptom.
48
- - Avoid while Im here refactors.
83
+ - Avoid "while I'm here" refactors.
49
84
  - Verify targeted behavior, then broader gates.
50
85
 
51
86
  ## Evidence to Collect
@@ -60,8 +95,28 @@ user/tool params → config resolution → team/workflow/agent discovery → mod
60
95
 
61
96
  ## Anti-patterns
62
97
 
63
- - Fixing before reproducing.
98
+ - Proposing a fix before reproducing (the refuse gate exists for a reason).
99
+ - Running proof experiments before disproof (disproof first saves time).
100
+ - Trusting a single passing run as validation (check against all prior breadcrumbs).
64
101
  - Assuming real user global config cannot pollute tests.
65
102
  - Treating provider errors as only transient network failures.
66
103
  - Removing guards because they reveal a blocked state.
67
104
  - Editing unrelated layers before checking the hypothesis.
105
+
106
+ ## Breadcrumb Ledger
107
+
108
+ Maintain a running ledger of every experiment in this session. Each entry:
109
+
110
+ | # | What Changed | What Happened | Ruled In/Out |
111
+ |---|-------------|--------------|-------------|
112
+ | 1 | Added `[DBG-001]` probe | Got `[output]` | Hypothesis A ruled out |
113
+ | 2 | Changed X to Y | Same error persists | Not X |
114
+ | 3 | Checked Z config | Found mismatch | Z is contributing |
115
+
116
+ When a new hypothesis surfaces, walk the ledger:
117
+ - Does it hold for **every** prior observation?
118
+ - If any past run contradicts it, the hypothesis is wrong or incomplete.
119
+
120
+ When in doubt, design the **single experiment** whose outcome makes it certain — run that next.
121
+
122
+ Update the ledger after every run. It is your memory across the session.
@@ -1,6 +1,6 @@
1
1
  ---
2
2
  name: verification-before-done
3
- description: Use when about to claim work is complete, fixed, passing, reviewed, committed, or ready to hand off.
3
+ description: "Evidence before claims. Use before claiming work is complete, fixed, passing, reviewed, committed, or ready to hand off. Triggers: done, fixed, complete, ready to merge, can I close, is it working, verify this, check if it passes, all good, LGTM, ready to ship."
4
4
  ---
5
5
 
6
6
  # verification-before-done
@@ -1,6 +1,6 @@
1
1
  ---
2
2
  name: workspace-isolation
3
- description: Workspace isolation boundaries in pi-crew. Use when ensuring agents from workspace A cannot access workspace B, or when implementing worktree-based parallel execution.
3
+ description: "Workspace isolation boundaries. Use when ensuring agents from workspace A cannot access workspace B, or when implementing worktree-based parallel execution. Triggers: workspace isolation, cross-workspace access, escape boundary, worktree safety."
4
4
  ---
5
5
 
6
6
  # workspace-isolation
@@ -0,0 +1,114 @@
1
+ /**
2
+ * Integration check: validates pi-crew core discovery and team-run functionality.
3
+ * Run with: node --experimental-strip-types --test test-integration-check.ts
4
+ */
5
+ import * as fs from "node:fs";
6
+ import * as os from "node:os";
7
+ import * as path from "node:path";
8
+ import test from "node:test";
9
+ import assert from "node:assert/strict";
10
+
11
+ import { discoverAgents, allAgents } from "./src/agents/discover-agents.ts";
12
+ import { discoverTeams, allTeams } from "./src/teams/discover-teams.ts";
13
+ import { discoverWorkflows, allWorkflows } from "./src/workflows/discover-workflows.ts";
14
+ import { handleTeamTool } from "./src/extension/team-tool.ts";
15
+ import { loadRunManifestById } from "./src/state/state-store.ts";
16
+
17
+ const pkgRoot = path.resolve(import.meta.dirname ?? ".");
18
+
19
+ // ── Discovery tests ──────────────────────────────────────────────────────
20
+
21
+ test("discovers builtin agents", () => {
22
+ const discovery = discoverAgents(pkgRoot);
23
+ assert.ok(discovery, "discoverAgents should return a result");
24
+ assert.ok(
25
+ discovery.builtin.length >= 10,
26
+ `Expected ≥10 builtin agents, got ${discovery.builtin.length}`,
27
+ );
28
+ const all = allAgents(discovery);
29
+ const names = all.map((a) => a.name);
30
+ assert.ok(names.includes("executor"), `Missing "executor" agent. Got: ${names.join(", ")}`);
31
+ });
32
+
33
+ test("discovers builtin teams", () => {
34
+ const discovery = discoverTeams(pkgRoot);
35
+ assert.ok(discovery, "discoverTeams should return a result");
36
+ assert.ok(
37
+ discovery.builtin.length >= 6,
38
+ `Expected ≥6 builtin teams, got ${discovery.builtin.length}`,
39
+ );
40
+ const all = allTeams(discovery);
41
+ const names = all.map((t) => t.name);
42
+ assert.ok(names.includes("fast-fix"), `Missing "fast-fix" team. Got: ${names.join(", ")}`);
43
+ });
44
+
45
+ test("discovers builtin workflows", () => {
46
+ const discovery = discoverWorkflows(pkgRoot);
47
+ assert.ok(discovery, "discoverWorkflows should return a result");
48
+ assert.ok(
49
+ discovery.builtin.length >= 6,
50
+ `Expected ≥6 builtin workflows, got ${discovery.builtin.length}`,
51
+ );
52
+ const all = allWorkflows(discovery);
53
+ const names = all.map((w) => w.name);
54
+ assert.ok(
55
+ names.includes("fast-fix"),
56
+ `Missing "fast-fix" workflow. Got: ${names.join(", ")}`,
57
+ );
58
+ });
59
+
60
+ // ── Team run test ─────────────────────────────────────────────────────────
61
+
62
+ test("fast-fix team run completes successfully with mock child Pi", async () => {
63
+ const cwd = fs.mkdtempSync(path.join(os.tmpdir(), "pi-crew-int-check-"));
64
+ fs.mkdirSync(path.join(cwd, ".crew"), { recursive: true });
65
+
66
+ const prevExec = process.env.PI_TEAMS_EXECUTE_WORKERS;
67
+ const prevMock = process.env.PI_TEAMS_MOCK_CHILD_PI;
68
+ process.env.PI_TEAMS_EXECUTE_WORKERS = "1";
69
+ process.env.PI_TEAMS_MOCK_CHILD_PI = "success";
70
+
71
+ try {
72
+ const run = await handleTeamTool(
73
+ { action: "run", team: "fast-fix", goal: "create a hello.txt file" },
74
+ { cwd },
75
+ );
76
+
77
+ // run result is not an error
78
+ assert.equal(run.isError, false, `handleTeamTool returned error: ${JSON.stringify(run)}`);
79
+
80
+ const runId = run.details.runId;
81
+ assert.ok(runId, "Expected a runId in details");
82
+
83
+ // manifest should be persisted and completed
84
+ const loaded = loadRunManifestById(cwd, runId!);
85
+ assert.ok(loaded, "loadRunManifestById should return data");
86
+ assert.equal(
87
+ loaded!.manifest.status,
88
+ "completed",
89
+ `Expected manifest status "completed", got "${loaded!.manifest.status}"`,
90
+ );
91
+
92
+ // all tasks should be completed
93
+ const taskStatuses = loaded!.tasks.map((t) => t.status);
94
+ assert.ok(
95
+ taskStatuses.every((s) => s === "completed"),
96
+ `Not all tasks completed: ${JSON.stringify(taskStatuses)}`,
97
+ );
98
+
99
+ // artifacts directory should exist
100
+ const artifactsDir = path.join(cwd, ".crew", "artifacts", runId!);
101
+ assert.ok(
102
+ fs.existsSync(artifactsDir),
103
+ `Artifacts directory should exist: ${artifactsDir}`,
104
+ );
105
+
106
+ console.log(`✅ fast-fix run ${runId} completed successfully with ${loaded!.tasks.length} tasks`);
107
+ } finally {
108
+ if (prevExec === undefined) delete process.env.PI_TEAMS_EXECUTE_WORKERS;
109
+ else process.env.PI_TEAMS_EXECUTE_WORKERS = prevExec;
110
+ if (prevMock === undefined) delete process.env.PI_TEAMS_MOCK_CHILD_PI;
111
+ else process.env.PI_TEAMS_MOCK_CHILD_PI = prevMock;
112
+ fs.rmSync(cwd, { recursive: true, force: true });
113
+ }
114
+ });