pi-crew 0.5.2 → 0.5.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. package/CHANGELOG.md +183 -0
  2. package/README.md +17 -1
  3. package/docs/architecture.md +2 -0
  4. package/docs/bugs/cross-session-notification-leakage.md +82 -0
  5. package/docs/coding-agent-optimization.md +268 -0
  6. package/docs/deep-review-report.md +384 -0
  7. package/docs/distillation/cybersecurity-patterns.md +294 -0
  8. package/docs/migration-v0.4-v0.5.md +208 -0
  9. package/docs/optimization-plan.md +642 -0
  10. package/docs/pi-crew-v0.5.5-audit-fix-plan.md +133 -0
  11. package/docs/pi-mono-opportunities.md +969 -0
  12. package/docs/pi-mono-review.md +291 -0
  13. package/docs/skills/REFERENCE.md +144 -0
  14. package/package.json +12 -9
  15. package/skills/artifact-analysis-loop/SKILL.md +302 -0
  16. package/skills/async-worker-recovery/SKILL.md +19 -1
  17. package/skills/child-pi-spawning/SKILL.md +19 -6
  18. package/skills/context-artifact-hygiene/SKILL.md +19 -2
  19. package/skills/delegation-patterns/SKILL.md +68 -3
  20. package/skills/detection-pipeline-design/SKILL.md +285 -0
  21. package/skills/event-log-tracing/SKILL.md +20 -6
  22. package/skills/git-master/SKILL.md +20 -6
  23. package/skills/hunting-investigation-loop/SKILL.md +401 -0
  24. package/skills/incident-playbook-construction/SKILL.md +383 -0
  25. package/skills/live-agent-lifecycle/SKILL.md +20 -6
  26. package/skills/mailbox-interactive/SKILL.md +19 -6
  27. package/skills/model-routing-context/SKILL.md +19 -1
  28. package/skills/multi-perspective-review/SKILL.md +19 -4
  29. package/skills/observability-reliability/SKILL.md +19 -2
  30. package/skills/orchestration/SKILL.md +20 -2
  31. package/skills/ownership-session-security/SKILL.md +20 -2
  32. package/skills/pi-extension-lifecycle/SKILL.md +20 -2
  33. package/skills/post-mortem/SKILL.md +7 -2
  34. package/skills/read-only-explorer/SKILL.md +20 -6
  35. package/skills/requirements-to-task-packet/SKILL.md +23 -3
  36. package/skills/resource-discovery-config/SKILL.md +20 -2
  37. package/skills/runtime-state-reader/SKILL.md +20 -2
  38. package/skills/safe-bash/SKILL.md +21 -6
  39. package/skills/scrutinize/SKILL.md +20 -2
  40. package/skills/secure-agent-orchestration-review/SKILL.md +29 -2
  41. package/skills/security-review/SKILL.md +560 -0
  42. package/skills/state-mutation-locking/SKILL.md +22 -2
  43. package/skills/systematic-debugging/SKILL.md +8 -6
  44. package/skills/threat-hypothesis-framework/SKILL.md +175 -0
  45. package/skills/ui-render-performance/SKILL.md +20 -2
  46. package/skills/verification-before-done/SKILL.md +17 -2
  47. package/skills/widget-rendering/SKILL.md +21 -6
  48. package/skills/workspace-isolation/SKILL.md +20 -6
  49. package/skills/worktree-isolation/SKILL.md +20 -6
  50. package/src/agents/agent-config.ts +40 -1
  51. package/src/benchmark/benchmark-runner.ts +45 -0
  52. package/src/benchmark/feedback-loop.ts +5 -0
  53. package/src/config/config.ts +32 -5
  54. package/src/config/role-tools.ts +82 -0
  55. package/src/config/suggestions.ts +8 -0
  56. package/src/config/types.ts +4 -0
  57. package/src/extension/async-notifier.ts +10 -1
  58. package/src/extension/crew-cleanup.ts +114 -0
  59. package/src/extension/cross-extension-rpc.ts +1 -1
  60. package/src/extension/notification-router.ts +18 -0
  61. package/src/extension/register.ts +27 -19
  62. package/src/extension/registration/subagent-tools.ts +1 -1
  63. package/src/extension/team-tool/anchor.ts +201 -0
  64. package/src/extension/team-tool/api.ts +2 -1
  65. package/src/extension/team-tool/auto-summarize.ts +154 -0
  66. package/src/extension/team-tool/run.ts +42 -7
  67. package/src/extension/team-tool.ts +44 -2
  68. package/src/hooks/registry.ts +1 -3
  69. package/src/observability/event-bus.ts +69 -0
  70. package/src/observability/event-to-metric.ts +0 -2
  71. package/src/runtime/anchor-manager.ts +473 -0
  72. package/src/runtime/async-runner.ts +8 -4
  73. package/src/runtime/auto-summarize.ts +350 -0
  74. package/src/runtime/background-runner.ts +10 -3
  75. package/src/runtime/budget-tracker.ts +354 -0
  76. package/src/runtime/chain-runner.ts +507 -0
  77. package/src/runtime/child-pi.ts +123 -35
  78. package/src/runtime/crash-recovery.ts +5 -4
  79. package/src/runtime/crew-agent-runtime.ts +1 -0
  80. package/src/runtime/custom-tools/irc-tool.ts +13 -0
  81. package/src/runtime/custom-tools/submit-result-tool.ts +3 -2
  82. package/src/runtime/delivery-coordinator.ts +10 -3
  83. package/src/runtime/dynamic-script-runner.ts +482 -0
  84. package/src/runtime/foreground-control.ts +87 -17
  85. package/src/runtime/handoff-manager.ts +589 -0
  86. package/src/runtime/hidden-handoff.ts +424 -0
  87. package/src/runtime/live-agent-manager.ts +20 -4
  88. package/src/runtime/live-session-runtime.ts +39 -4
  89. package/src/runtime/manifest-cache.ts +2 -1
  90. package/src/runtime/model-resolver.ts +16 -4
  91. package/src/runtime/phase-tracker.ts +373 -0
  92. package/src/runtime/pi-args.ts +11 -1
  93. package/src/runtime/pi-json-output.ts +31 -0
  94. package/src/runtime/pipeline-runner.ts +514 -0
  95. package/src/runtime/progress-tracker.ts +124 -0
  96. package/src/runtime/retry-runner.ts +354 -0
  97. package/src/runtime/sandbox.ts +252 -0
  98. package/src/runtime/scheduler.ts +7 -2
  99. package/src/runtime/skill-effectiveness.ts +473 -0
  100. package/src/runtime/skill-instructions.ts +37 -3
  101. package/src/runtime/subagent-manager.ts +1 -1
  102. package/src/runtime/task-graph.ts +11 -1
  103. package/src/runtime/task-runner.ts +92 -18
  104. package/src/runtime/team-runner.ts +13 -12
  105. package/src/runtime/tool-progress.ts +10 -3
  106. package/src/runtime/verification-gates.ts +367 -0
  107. package/src/schema/team-tool-schema.ts +37 -0
  108. package/src/skills/discover-skills.ts +5 -0
  109. package/src/state/active-run-registry.ts +9 -2
  110. package/src/state/contracts.ts +9 -0
  111. package/src/state/crew-init.ts +3 -3
  112. package/src/state/decision-ledger.ts +98 -55
  113. package/src/state/event-log-rotation.ts +2 -2
  114. package/src/state/event-log.ts +144 -10
  115. package/src/state/hook-instinct-bridge.ts +5 -5
  116. package/src/state/mailbox.ts +10 -0
  117. package/src/state/run-cache.ts +18 -8
  118. package/src/state/state-store.ts +3 -1
  119. package/src/state/types.ts +4 -0
  120. package/src/tools/safe-bash-extension.ts +1 -0
  121. package/src/tools/safe-bash.ts +152 -20
  122. package/src/types/new-api-types.ts +34 -0
  123. package/src/ui/agent-management-overlay.ts +5 -1
  124. package/src/ui/crew-widget.ts +29 -15
  125. package/src/ui/overlays/mailbox-detail-overlay.ts +13 -2
  126. package/src/ui/powerbar-publisher.ts +101 -7
  127. package/src/ui/tool-render.ts +15 -15
  128. package/src/ui/transcript-cache.ts +13 -0
  129. package/src/utils/bm25-search.ts +16 -8
  130. package/src/utils/env-filter.ts +8 -5
  131. package/src/utils/redaction.ts +169 -15
  132. package/src/utils/session-utils.ts +52 -0
  133. package/src/utils/sse-parser.ts +10 -1
  134. package/src/worktree/cleanup.ts +6 -1
  135. package/src/worktree/worktree-manager.ts +32 -13
  136. package/workflows/chain.workflow.md +252 -0
  137. package/workflows/pipeline.workflow.md +27 -0
@@ -0,0 +1,969 @@
1
+ # pi-crew Enhancement Opportunities: Detailed Implementation Plans
2
+
3
+ **Date:** 2026-05-28
4
+ **Based on:** pi-mono `origin/main` review
5
+ **Priority:** Ordered by impact-to-effort ratio
6
+
7
+ ---
8
+
9
+ ## Opportunity 1: BM25 Semantic Reranking for `team action='recommend'`
10
+
11
+ **Priority:** HIGH
12
+ **Effort:** Medium (2–3 days)
13
+ **Impact:** Significant improvement in team/agent recommendation accuracy
14
+
15
+ ### Problem Statement
16
+
17
+ Current `recommendTeam()` in `src/extension/team-recommendation.ts` uses **keyword matching** — a simple term-overlap approach. It checks if goal text contains specific keywords (`"review"`, `"security"`, `"implement"`, etc.) to select teams and workflows.
18
+
19
+ **Weaknesses:**
20
+ - `"review my security setup"` → works by accident (contains both keywords)
21
+ - `"check if my code has vulnerabilities"` → **fails** (no keyword match, misclassifies as generic)
22
+ - `"analyze the authentication flow"` → **fails** (research-like phrasing, but actually review)
23
+ - `"find all uses of eval()"` → **fails** (investigation but not security review)
24
+ - `"audit the dependency tree"` → **fails** (`audit` is not among the review keywords recognized by the current implementation)
25
+
26
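+ BM25 search (`src/utils/bm25-search.ts`) already exists with `searchAgents()` and `searchTeams()`. It's used in `team action='search'` but **not** in `team action='recommend'`. Note that BM25 is a lexical (term-frequency) ranking function rather than an embedding-based semantic model; it helps here because it scores the goal against the full team/agent names and descriptions instead of a short hard-coded keyword list.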
27
+
28
+ ### Current Architecture
29
+
30
+ ```
31
+ recommendTeam(goal)
32
+ ├── detectTeamIntent() ← keyword + pattern matching
33
+ ├── decomposeGoal() ← parses numbered/bulleted/conjunction lists
34
+ └── metadata routing ← team routing metadata (triggers, useWhen)
35
+ └── NOT using: BM25 search
36
+ ```
37
+
38
+ ### Proposed Architecture
39
+
40
+ ```
41
+ recommendTeam(goal)
42
+ ├── Phase 1: Keyword Intent (fast path for obvious cases)
43
+ │ ├── detectTeamIntent() ← keep for explicit triggers
44
+ │ └── metadata routing ← keep for exact matches
45
+
46
+ ├── Phase 2: BM25 Semantic Reranking (fallback + nuance)
47
+ │ ├── searchTeams(goal) ← BM25 over team name/description/roles
48
+ │ └── searchAgents(goal) ← BM25 over agent name/description/skills
49
+
50
+ ├── Phase 3: Score Fusion
51
+ │ ├── Combine keyword score + BM25 score
52
+ │ └── Boost agents matched on skills (weighted)
53
+
54
+ ├── decomposeGoal() ← keep as-is
55
+ └── Format + return
56
+ ```
57
+
58
+ ### Implementation Details
59
+
60
+ #### Step 1: Create a scoring fusion module
61
+
62
+ **File:** `src/extension/recommendation-scoring.ts` (new)
63
+
64
+ ```typescript
65
+ import { searchAgents, searchTeams } from "../utils/bm25-search.ts";
66
+ import type { TeamConfig } from "../teams/team-config.ts";
67
+ import type { AgentConfig } from "../agents/agent-config.ts";
68
+
69
+ /**
70
+ * BM25-boosted team/agent score with normalized scores.
71
+ */
72
+ export interface SemanticTeamScore {
73
+ team: string;
74
+ bm25Score: number; // normalized 0-1
75
+ matchedOn: string[];
76
+ blendedScore?: number; // after fusion
77
+ }
78
+
79
+ export interface SemanticAgentScore {
80
+ agent: string;
81
+ bm25Score: number; // normalized 0-1
82
+ matchedOn: string[];
83
+ skills: string[];
84
+ blendedScore?: number; // after fusion
85
+ }
86
+
87
+ /**
88
+ * Fuse keyword-based intent with BM25 semantic search.
89
+ *
90
+ * Algorithm:
91
+ * 1. Run keyword intent (existing) → base score per team
92
+ * 2. Run BM25 search → semantic score per team/agent
93
+ * 3. Normalize BM25 scores to [0, 1]
94
+ * 4. Blend: final_score = α × keyword_score + (1-α) × bm25_score
95
+ * where α = 0.4 (keyword still matters for explicit triggers)
96
+ */
97
+ export async function computeSemanticScores(
98
+ goal: string,
99
+ resources?: { teams?: TeamConfig[]; agents?: AgentConfig[] }
100
+ ): Promise<{
101
+ teamScores: Map<string, SemanticTeamScore>;
102
+ agentScores: Map<string, SemanticAgentScore>;
103
+ }> {
104
+ const [teamResults, agentResults] = await Promise.all([
105
+ searchTeams(goal, { limit: 10 }),
106
+ searchAgents(goal, { limit: 20 }),
107
+ ]);
108
+
109
+ const teamScores = new Map<string, SemanticTeamScore>();
110
+ const agentScores = new Map<string, SemanticAgentScore>();
111
+
112
+ const maxTeamScore = teamResults[0]?.score ?? 1;
113
+ const maxAgentScore = agentResults[0]?.score ?? 1;
114
+
115
+ for (const r of teamResults) {
116
+ const normalized = maxTeamScore > 0 ? r.score / maxTeamScore : 0;
117
+ teamScores.set(r.team.name, {
118
+ team: r.team.name,
119
+ bm25Score: normalized,
120
+ matchedOn: r.matchedOn,
121
+ });
122
+ }
123
+
124
+ for (const r of agentResults) {
125
+ const normalized = maxAgentScore > 0 ? r.score / maxAgentScore : 0;
126
+ agentScores.set(r.agent.name, {
127
+ agent: r.agent.name,
128
+ bm25Score: normalized,
129
+ matchedOn: r.matchedOn,
130
+ skills: r.agent.skills ?? [],
131
+ });
132
+ }
133
+
134
+ return { teamScores, agentScores };
135
+ }
136
+
137
+ /**
138
+ * Blend keyword intent with semantic BM25 scores.
139
+ *
140
+ * Blend formula:
141
+ * team_score = α × base_intent_score + (1-α) × bm25_score
142
+ * where:
143
+ * α = 0.4
144
+ * base_intent_score = 1.0 if keyword matches, else 0.3
145
+ * bm25_score = normalized BM25 from searchTeams()
146
+ */
147
+ export function blendScores(
148
+ keywordTeam: string,
149
+ teamScores: Map<string, SemanticTeamScore>,
150
+ agentScores: Map<string, SemanticAgentScore>,
151
+ ALPHA = 0.4
152
+ ): void {
153
+ const intentScore = keywordTeam ? 1.0 : 0.3;
154
+
155
+ // Team: blend keyword + BM25
156
+ for (const [team, score] of teamScores) {
157
+ const bm25Component = score.bm25Score * (1 - ALPHA);
158
+ const intentComponent = (team === keywordTeam ? intentScore : 0.3) * ALPHA;
159
+ score.blendedScore = intentComponent + bm25Component;
160
+ }
161
+
162
+ // Agent: BM25 + 0.05 for each skill domain whose keywords appear in the agent name (bonus capped at 0.2)
163
+ const SKILL_DOMAINS: Record<string, string[]> = {
164
+ "test-engineer": ["test", "spec", "coverage", "verify", "qa", "unit", "integration"],
165
+ "security-reviewer": ["security", "vulnerability", "auth", "owasp", "penetration", "audit"],
166
+ "reviewer": ["review", "check", "verify", "lint", "style"],
167
+ "writer": ["write", "doc", "readme", "guide", "document"],
168
+ "explorer": ["research", "investigate", "find", "trace", "explore", "discover"],
169
+ "planner": ["plan", "design", "architecture", "strategy"],
170
+ "executor": ["implement", "code", "build", "create", "add", "fix"],
171
+ };
172
+
173
+ for (const [, score] of agentScores) {
174
+ const skillBonus = Object.entries(SKILL_DOMAINS)
175
+ .filter(([_, keywords]) => keywords.some((kw) => score.agent.toLowerCase().includes(kw)))
176
+ .length * 0.05;
177
+ score.blendedScore = score.bm25Score + Math.min(skillBonus, 0.2);
178
+ }
179
+ }
180
+ ```
181
+
182
+ #### Step 2: Integrate into `recommendTeam()`
183
+
184
+ **File:** `src/extension/team-recommendation.ts` (modify)
185
+
186
+ ```typescript
187
+ import { computeSemanticScores, blendScores } from "./recommendation-scoring.ts";
188
+
189
+ // In recommendTeam(), after keyword intent detection.
190
+ // Add as optional enhancement with try/catch:
191
+
192
+ // Insert after the metadata routing section (metadata routing itself is kept, per the proposed architecture):
193
+ const bm25BoostedTeam = await (async () => {
194
+ try {
195
+ const { teamScores } = await computeSemanticScores(goal, resources);
196
+ if (teamScores.size > 0) {
197
+ blendScores(team, teamScores, new Map(), 0.4);
198
+ const sorted = [...teamScores.values()].sort(
199
+ (a, b) => (b.blendedScore ?? 0) - (a.blendedScore ?? 0)
200
+ );
201
+ const top = sorted[0];
202
+ // If BM25 strongly prefers a different team, override
203
+ if (top && top.team !== team && (top.blendedScore ?? 0) > (teamScores.get(team)?.blendedScore ?? 0) + 0.2) {
204
+ return { team: top.team, reason: `BM25 semantic match (${top.matchedOn.join(", ")})` };
205
+ }
206
+ }
207
+ } catch {
208
+ // BM25 scoring is best-effort
209
+ }
210
+ return null;
211
+ })();
212
+
213
+ if (bm25BoostedTeam) {
214
+ team = bm25BoostedTeam.team as typeof team;
215
+ reasons.push(bm25BoostedTeam.reason);
216
+ confidence = "high";
217
+ }
218
+ ```
219
+
220
+ #### Step 3: Add config flag
221
+
222
+ **File:** `src/config/types.ts`
223
+
224
+ ```typescript
225
+ export interface PiTeamsAutonomousConfig {
226
+ // ... existing fields ...
227
+ /** Use BM25 semantic reranking (default: true) */
228
+ useSemanticReranking?: boolean;
229
+ }
230
+ ```
231
+
232
+ #### Step 4: Add tests
233
+
234
+ **File:** `test/unit/recommendation-semantic.test.ts` (new)
235
+
236
+ Key test cases:
237
+ - `"audit dependency tree"` → should suggest `review` team
238
+ - `"find XSS vulnerabilities"` → should suggest `security-reviewer` agent
239
+ - `"analyze auth flow"` → should suggest `review` team (not research)
240
+ - `"check code quality"` → should suggest `review` team (not executor)
241
+ - Existing keyword matches should still work (regression)
242
+
243
+ ### Expected Outcomes
244
+
245
+ | Scenario | Before | After |
246
+ |----------|--------|-------|
247
+ | `"audit dependency tree"` | `default`, low confidence | `review`, high confidence |
248
+ | `"find XSS vulnerabilities"` | `default`, medium | `security-reviewer`, high |
249
+ | `"analyze auth flow"` | `default`, low | `review`, high |
250
+ | `"implement feature X"` | `implementation` (keyword) | `implementation`, high |
251
+
252
+ ---
253
+
254
+ ## Opportunity 2: Extended Hook Phases (`before_turn` / `after_turn`)
255
+
256
+ **Priority:** MEDIUM
257
+ **Effort:** Medium (2 days)
258
+ **Impact:** Enables observability, per-turn policies, early abort
259
+
260
+ ### Problem Statement
261
+
262
+ pi-crew currently has **no turn-level hooks**. When a task runs:
263
+ 1. `before_task_start` — fires once per task
264
+ 2. [Task executes — many turns silently]
265
+ 3. `task_result` — fires once when task completes
266
+
267
+ Users can't:
268
+ - Abort a task mid-execution based on turn content
269
+ - Log per-turn metrics (turn count, token usage, thinking time)
270
+ - Inject turn-specific instructions
271
+ - Detect dangerous operations before they complete
272
+
273
+ pi-mono's `AgentHarness` formalizes `turn` as a first-class phase with `turn_end` events.
274
+
275
+ ### Key Discovery
276
+
277
+ `child-pi.ts` already tracks `turnCount` via `onJsonEvent` listening for `turn_end` events from pi:
278
+
279
+ ```typescript
280
+ // child-pi.ts line ~457
281
+ onJsonEvent: (event) => {
282
+ if (event && typeof event === "object" && !Array.isArray(event)) {
283
+ const obj = event as Record<string, unknown>;
284
+ if (obj.type === "turn_end") {
285
+ turnCount += 1; // ← turn tracking already exists!
286
+ // ... soft/hard limit logic ...
287
+ }
288
+ }
289
+ }
290
+ ```
291
+
292
+ The `turn_end` event from pi contains:
293
+ - `message: AgentMessage` — assistant's response
294
+ - `toolResults: ToolResultMessage[]` — tools called in this turn
295
+
296
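+ We can hook into this to fire `before_turn` / `after_turn` hooks. Because pi only reports `turn_end`, both hooks are derived from that single event: `before_turn` fires when a turn has just completed (i.e. ahead of the next turn), so it cannot run before the first turn starts.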
297
+
298
+ ### Design
299
+
300
+ Add two new **non-blocking** hooks:
301
+
302
+ ```typescript
303
+ // src/hooks/types.ts
304
+ export interface TurnContext extends HookContext {
305
+ taskId: string;
306
+ runId: string;
307
+ turnNumber: number;
308
+ messageLength: number;
309
+ toolCallCount: number;
310
+ thinkingMs?: number;
311
+ model?: string;
312
+ }
313
+ ```
314
+
315
+ ### Implementation
316
+
317
+ #### Step 1: Extend hook types
318
+
319
+ **File:** `src/hooks/types.ts`
320
+
321
+ ```typescript
322
+ // Add TurnContext
323
+ export interface TurnContext extends HookContext {
324
+ taskId: string;
325
+ runId: string;
326
+ turnNumber: number;
327
+ messageLength: number;
328
+ toolCallCount: number;
329
+ thinkingMs?: number;
330
+ model?: string;
331
+ }
332
+
333
+ // Add to HookName (no registry change is needed — the registry is generic over HookName, see Step 2)
334
+ export type HookName =
335
+ | "before_run_start" | "before_task_start" | "task_result"
336
+ | "before_cancel" | "before_retry" | "before_forget"
337
+ | "before_cleanup" | "before_publish" | "session_before_switch"
338
+ | "run_recovery"
339
+ | "before_turn" | "after_turn"; // NEW
340
+ ```
341
+
342
+ #### Step 2: Add hook to registry
343
+
344
+ **File:** `src/hooks/registry.ts`
345
+
346
+ ```typescript
347
+ // No changes needed — registry is generic over HookName.
348
+ // Just need to add "before_turn" | "after_turn" to the HookName union in types.ts.
349
+ // All executeHook() calls will work automatically.
350
+ ```
351
+
352
+ #### Step 3: Extend ChildPiLifecycleEvent
353
+
354
+ **File:** `src/runtime/child-pi.ts`
355
+
356
+ ```typescript
357
+ // In ChildPiLifecycleEvent type (line ~109):
358
+ export interface ChildPiLifecycleEvent {
359
+ type: "spawned" | "spawn_error" | "response_timeout" | "final_drain"
360
+ | "hard_kill" | "exit" | "close" | "turn_begin" | "turn_end"; // NEW
361
+ pid?: number;
362
+ exitCode?: number | null;
363
+ error?: string;
364
+ stderr?: string;
365
+ ts: string;
366
+ // NEW fields for turn events:
367
+ turnNumber?: number;
368
+ messageLength?: number;
369
+ toolCallCount?: number;
370
+ thinkingMs?: number;
371
+ }
372
+ ```
373
+
374
+ #### Step 4: Instrument turn tracking in child-pi
375
+
376
+ **File:** `src/runtime/child-pi.ts` (around line 450-470)
377
+
378
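+ Replace the existing `onJsonEvent` block (note: `JSON.stringify(undefined)` returns `undefined`, so the `.length` reads below should guard against a `turn_end` event that carries no `message`):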
379
+
380
+ ```typescript
381
+ onJsonEvent: (event) => {
382
+ restartNoResponseTimer();
383
+ if (event && typeof event === "object" && !Array.isArray(event)) {
384
+ const obj = event as Record<string, unknown>;
385
+
386
+ // Emit before_turn hook BEFORE processing turn_end
387
+ if (obj.type === "turn_end") {
388
+ const turnNumber = turnCount + 1; // next turn number
389
+ const message = obj.message as Record<string, unknown> | undefined;
390
+ const toolResults = obj.toolResults as unknown[] | undefined;
391
+ const messageLength = JSON.stringify(message).length;
392
+ const toolCallCount = toolResults?.length ?? 0;
393
+
394
+ // Fire before_turn via lifecycle event
395
+ input.onLifecycleEvent?.({
396
+ type: "turn_begin",
397
+ pid: child.pid,
398
+ turnNumber,
399
+ messageLength,
400
+ toolCallCount,
401
+ ts: new Date().toISOString(),
402
+ });
403
+ }
404
+
405
+ // Existing turn-count-based steering
406
+ if (obj.type === "turn_end") {
407
+ turnCount += 1;
408
+ // ... existing soft/hard limit logic ...
409
+
410
+ // Fire after_turn via lifecycle event
411
+ input.onLifecycleEvent?.({
412
+ type: "turn_end",
413
+ pid: child.pid,
414
+ turnNumber: turnCount,
415
+ messageLength: JSON.stringify(obj.message).length,
416
+ toolCallCount: (obj.toolResults as unknown[])?.length ?? 0,
417
+ ts: new Date().toISOString(),
418
+ });
419
+ }
420
+ }
421
+ input.onJsonEvent?.(event);
422
+ // ... rest unchanged ...
423
+ }
424
+ ```
425
+
426
+ #### Step 5: Wire lifecycle events to hooks in task-runner
427
+
428
+ **File:** `src/runtime/task-runner.ts` (around line 429)
429
+
430
+ ```typescript
431
+ // In runChildPi call, add onLifecycleEvent handler:
432
+ const childResult = await runChildPi({
433
+ // ... existing params ...
434
+ onLifecycleEvent: async (event) => {
435
+ // Existing logging logic ...
436
+
437
+ // NEW: Fire turn hooks
438
+ if (event.type === "turn_begin") {
439
+ await executeHook("before_turn", {
440
+ taskId: task.id,
441
+ runId: manifest.runId,
442
+ turnNumber: event.turnNumber ?? 0,
443
+ messageLength: event.messageLength ?? 0,
444
+ toolCallCount: event.toolCallCount ?? 0,
445
+ thinkingMs: event.thinkingMs,
446
+ cwd: task.cwd,
447
+ }).catch(() => {}); // non-blocking
448
+ }
449
+
450
+ if (event.type === "turn_end") {
451
+ await executeHook("after_turn", {
452
+ taskId: task.id,
453
+ runId: manifest.runId,
454
+ turnNumber: event.turnNumber ?? 0,
455
+ messageLength: event.messageLength ?? 0,
456
+ toolCallCount: event.toolCallCount ?? 0,
457
+ thinkingMs: event.thinkingMs,
458
+ cwd: task.cwd,
459
+ }).catch(() => {}); // non-blocking
460
+ }
461
+ },
462
+ });
463
+ ```
464
+
465
+ #### Step 6: Add tests
466
+
467
+ **File:** `test/unit/turn-hooks.test.ts` (new)
468
+
469
+ ```typescript
470
+ import { describe, it, beforeEach, afterEach } from "node:test";
471
+ import assert from "node:assert/strict";
472
+ import { registerHook, clearHooks, executeHook } from "../../src/hooks/registry.ts";
473
+
474
+ describe("before_turn hook", () => {
475
+ beforeEach(() => clearHooks());
476
+ afterEach(() => clearHooks());
477
+
478
+ it("fires with correct turn context", async () => {
479
+ registerHook({
480
+ name: "before_turn",
481
+ mode: "non_blocking",
482
+ handler: (ctx) => {
483
+ assert.equal(ctx.taskId, "task-1");
484
+ assert.equal(ctx.turnNumber, 3);
485
+ assert.equal(ctx.messageLength, 150);
486
+ assert.equal(ctx.toolCallCount, 2);
487
+ return { outcome: "allow" };
488
+ },
489
+ });
490
+
491
+ const report = await executeHook("before_turn", {
492
+ taskId: "task-1",
493
+ runId: "run-1",
494
+ turnNumber: 3,
495
+ messageLength: 150,
496
+ toolCallCount: 2,
497
+ cwd: "/tmp",
498
+ });
499
+
500
+ assert.equal(report.outcome, "allow");
501
+ });
502
+
503
+ it("does not block task execution (non-blocking)", async () => {
504
+ registerHook({
505
+ name: "before_turn",
506
+ mode: "non_blocking",
507
+ handler: async () => {
508
+ await new Promise((r) => setTimeout(r, 100));
509
+ return { outcome: "allow" };
510
+ },
511
+ });
512
+
513
+ const start = Date.now();
514
+ const report = await executeHook("before_turn", {
515
+ taskId: "task-1",
516
+ runId: "run-1",
517
+ turnNumber: 1,
518
+ messageLength: 0,
519
+ toolCallCount: 0,
520
+ cwd: "/tmp",
521
+ });
522
+ const elapsed = Date.now() - start;
523
+
524
+ assert.equal(report.outcome, "allow");
525
+ assert.ok(elapsed < 50, "Non-blocking hook should not delay execution");
526
+ });
527
+ });
528
+
529
+ describe("after_turn hook", () => {
530
+ beforeEach(() => clearHooks());
531
+ afterEach(() => clearHooks());
532
+
533
+ it("receives complete turn metrics", async () => {
534
+ const received: Record<string, unknown> = {};
535
+ registerHook({
536
+ name: "after_turn",
537
+ mode: "non_blocking",
538
+ handler: (ctx) => {
539
+ Object.assign(received, ctx);
540
+ return { outcome: "allow" };
541
+ },
542
+ });
543
+
544
+ await executeHook("after_turn", {
545
+ taskId: "task-1",
546
+ runId: "run-1",
547
+ turnNumber: 5,
548
+ messageLength: 1024,
549
+ toolCallCount: 3,
550
+ thinkingMs: 3500,
551
+ cwd: "/tmp",
552
+ });
553
+
554
+ assert.equal(received.turnNumber, 5);
555
+ assert.equal(received.messageLength, 1024);
556
+ assert.equal(received.toolCallCount, 3);
557
+ assert.equal(received.thinkingMs, 3500);
558
+ });
559
+ });
560
+ ```
561
+
562
+ ### Expected Outcomes
563
+
564
+ | Use Case | Before | After |
565
+ |----------|--------|-------|
566
+ | Per-turn observability | None | `before_turn`/`after_turn` fire per turn |
567
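+ | Dangerous operation detection | Only at task end | Detectable after each turn via `before_turn`/`after_turn`; actually aborting mid-task would require a blocking hook mode, which this design (non-blocking hooks only) does not add |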
568
+ | Turn metrics logging | None | Available: turn count, message length, tool calls, thinking time |
569
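+ | Thinking time tracking | Not exposed | `thinkingMs` field defined on the turn context; the Step 4 instrumentation does not populate it yet, so it stays `undefined` until child-pi measures it |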
570
+
571
+ ---
572
+
573
+ ## Opportunity 3: Hook Lifecycle Test Suite
574
+
575
+ **Priority:** MEDIUM
576
+ **Effort:** Small (1 day)
577
+ **Impact:** Ensures hook reliability, prevents regressions
578
+
579
+ ### Current State
580
+
581
+ pi-crew has 3 hook test files:
582
+ - `test/unit/hooks.test.ts` — basic registry/execution tests (8 tests)
583
+ - `test/unit/lifecycle-hooks.test.ts` — lifecycle integration (6 tests)
584
+ - `test/unit/recovery-hooks.test.ts` — recovery hooks
585
+
586
+ **Gap:** No tests for `task_result`, `before_publish`, `session_before_switch`, `run_recovery`, `before_retry` hooks.
587
+
588
+ ### Test Suite Plan
589
+
590
+ **File:** `test/unit/hook-full-lifecycle.test.ts` (new)
591
+
592
+ ```typescript
593
+ import { describe, it, beforeEach, afterEach } from "node:test";
594
+ import assert from "node:assert/strict";
595
+ import { registerHook, clearHooks, executeHook } from "../../src/hooks/registry.ts";
596
+ import type { HookResult } from "../../src/hooks/types.ts";
597
+
598
+ describe("task_result hook", () => {
599
+ beforeEach(() => clearHooks());
600
+ afterEach(() => clearHooks());
601
+
602
+ it("receives task context and result data", async () => {
603
+ let receivedCtx: Record<string, unknown> = {};
604
+ registerHook({
605
+ name: "task_result",
606
+ mode: "non_blocking",
607
+ handler: (ctx) => {
608
+ receivedCtx = { ...ctx };
609
+ return { outcome: "allow" };
610
+ },
611
+ });
612
+
613
+ await executeHook("task_result", {
614
+ taskId: "task-1",
615
+ runId: "run-1",
616
+ cwd: "/tmp",
617
+ data: { status: "success", outputLength: 2048 },
618
+ });
619
+
620
+ assert.equal(receivedCtx.taskId, "task-1");
621
+ assert.deepEqual((receivedCtx as Record<string, unknown>).data, { status: "success", outputLength: 2048 });
622
+ });
623
+
624
+ it("non-blocking hook does not affect task completion", async () => {
625
+ registerHook({
626
+ name: "task_result",
627
+ mode: "non_blocking",
628
+ handler: async () => {
629
+ await new Promise((r) => setTimeout(r, 500));
630
+ return { outcome: "allow" };
631
+ },
632
+ });
633
+
634
+ const start = Date.now();
635
+ const report = await executeHook("task_result", {
636
+ taskId: "task-1",
637
+ runId: "run-1",
638
+ cwd: "/tmp",
639
+ });
640
+ assert.ok(Date.now() - start < 100);
641
+ assert.equal(report.outcome, "allow");
642
+ });
643
+ });
644
+
645
+ describe("run_recovery hook", () => {
646
+ beforeEach(() => clearHooks());
647
+ afterEach(() => clearHooks());
648
+
649
+ it("fires with run context on crash recovery", async () => {
650
+ registerHook({
651
+ name: "run_recovery",
652
+ mode: "blocking",
653
+ handler: (ctx) => ({ outcome: "allow" }),
654
+ });
655
+
656
+ const report = await executeHook("run_recovery", {
657
+ runId: "run-crash-1",
658
+ cwd: "/tmp",
659
+ data: { crashReason: "child process exit", pid: 12345 },
660
+ });
661
+
662
+ assert.equal(report.outcome, "allow");
663
+ });
664
+
665
+ it("can block recovery", async () => {
666
+ registerHook({
667
+ name: "run_recovery",
668
+ mode: "blocking",
669
+ handler: () => ({ outcome: "block", reason: "Maintenance hold" }),
670
+ });
671
+
672
+ const report = await executeHook("run_recovery", {
673
+ runId: "run-blocked",
674
+ cwd: "/tmp",
675
+ });
676
+
677
+ assert.equal(report.outcome, "block");
678
+ assert.match(report.reason ?? "", /Maintenance hold/);
679
+ });
680
+ });
681
+
682
+ describe("before_retry hook", () => {
683
+ beforeEach(() => clearHooks());
684
+ afterEach(() => clearHooks());
685
+
686
+ it("can allow retry", async () => {
687
+ registerHook({
688
+ name: "before_retry",
689
+ mode: "blocking",
690
+ handler: () => ({ outcome: "allow" }),
691
+ });
692
+
693
+ const report = await executeHook("before_retry", {
694
+ runId: "run-1",
695
+ cwd: "/tmp",
696
+ data: { attemptNumber: 2 },
697
+ });
698
+
699
+ assert.equal(report.outcome, "allow");
700
+ });
701
+
702
+ it("can block retry with reason", async () => {
703
+ registerHook({
704
+ name: "before_retry",
705
+ mode: "blocking",
706
+ handler: () => ({ outcome: "block", reason: "Max retries exceeded" }),
707
+ });
708
+
709
+ const report = await executeHook("before_retry", {
710
+ runId: "run-max",
711
+ cwd: "/tmp",
712
+ });
713
+
714
+ assert.equal(report.outcome, "block");
715
+ });
716
+ });
717
+
718
+ describe("before_publish hook", () => {
719
+ beforeEach(() => clearHooks());
720
+ afterEach(() => clearHooks());
721
+
722
+ it("fires before run publication", async () => {
723
+ registerHook({
724
+ name: "before_publish",
725
+ mode: "blocking",
726
+ handler: () => ({ outcome: "allow" }),
727
+ });
728
+
729
+ const report = await executeHook("before_publish", {
730
+ runId: "run-pub",
731
+ cwd: "/tmp",
732
+ });
733
+
734
+ assert.equal(report.outcome, "allow");
735
+ });
736
+ });
737
+
738
+ describe("session_before_switch hook", () => {
739
+ beforeEach(() => clearHooks());
740
+ afterEach(() => clearHooks());
741
+
742
+ it("receives session context", async () => {
743
+ registerHook({
744
+ name: "session_before_switch",
745
+ mode: "blocking",
746
+ handler: () => ({ outcome: "allow" }),
747
+ });
748
+
749
+ const report = await executeHook("session_before_switch", {
750
+ runId: "run-session",
751
+ cwd: "/tmp",
752
+ data: { fromSession: "old", toSession: "new" },
753
+ });
754
+
755
+ assert.equal(report.outcome, "allow");
756
+ });
757
+ });
758
+ ```
759
+
760
+ ### Additional Coverage
761
+
762
+ Add edge case tests to `test/unit/hooks.test.ts`:
763
+ - Hook timeout: if a non-blocking hook hangs for more than 5s, it must not block the operation it is attached to
764
+ - Multiple hooks same name: all execute in registration order
765
+ - Hook with modify outcome: context is mutated correctly
766
+ - Error in non-blocking hook: error is logged, execution continues
767
+ - Dynamic hook registration during hook execution: safe (no concurrent modification)
768
+
769
+ ---
770
+
771
+ ## Opportunity 4: Phase Tracking + Hook Documentation
772
+
773
+ **Priority:** LOW-MEDIUM
774
+ **Effort:** Small-Medium (1-2 days)
775
+ **Impact:** Developer experience, observability
776
+
777
+ ### A. Task Phase Tracking
778
+
779
+ Add a `phase` field to task records for observability.
780
+
781
+ **File:** `src/state/types.ts`
782
+
783
+ ```typescript
784
+ // Add to TaskRecord or create new TaskPhase type
785
+ export type TaskPhase =
786
+ | "pending" // Queued, not started
787
+ | "exploring" // Initial research/discovery
788
+ | "planning" // Planning subtasks
789
+ | "executing" // Actively running
790
+ | "verifying" // Running verification
791
+ | "finalizing" // Wrapping up
792
+ | "done" // Complete
793
+ | "failed" // Error
794
+ | "cancelled"; // User cancelled
795
+
796
+ // In TaskRecord:
797
+ export interface TaskRecord {
798
+ // ... existing fields ...
799
+ phase?: TaskPhase;
800
+ phaseHistory?: Array<{ phase: TaskPhase; at: string; turnNumber?: number }>;
801
+ }
802
+ ```
803
+
804
+ **File:** `src/runtime/task-runner.ts` — update phase at key points:
805
+
806
+ ```typescript
807
+ // On task start:
808
+ updateTaskPhase(task.id, "exploring");
809
+
810
+ // When the planner starts producing the plan:
811
+ updateTaskPhase(task.id, "planning");
812
+
813
+ // During execution:
814
+ updateTaskPhase(task.id, "executing");
815
+
816
+ // When verification starts:
817
+ updateTaskPhase(task.id, "verifying");
818
+
819
+ // On completion:
820
+ updateTaskPhase(task.id, "done");
821
+
822
+ // On error:
823
+ updateTaskPhase(task.id, "failed");
824
+ ```
825
+
826
+ **File:** `src/extension/team-tool/status.ts` — surface phase in `team action='status'`:
827
+
828
+ ```
829
+ Task: 01_explore [exploring] ████████░░ 80%
830
+ Task: 02_plan [pending] ░░░░░░░░░░ 0%
831
+ ```
832
+
833
+ ### B. Hook Documentation
834
+
835
+ **File:** `docs/hooks.md` (new)
836
+
837
+ ```markdown
838
+ # pi-crew Hook System
839
+
840
+ Hooks allow you to intercept and modify pi-crew lifecycle events. They can block operations,
841
+ inject data, or log for observability.
842
+
843
+ ## Available Hooks
844
+
845
+ | Hook | Type | Blocking | Description |
846
+ |------|------|----------|-------------|
847
+ | `before_run_start` | Run | ✅ | Fires before a team run begins. Return `block` to prevent the run. |
848
+ | `before_task_start` | Task | ✅ | Fires before each task begins. Return `block` to skip the task. |
849
+ | `task_result` | Task | ❌ | Fires after each task completes. Non-blocking — won't affect task outcome. |
850
+ | `before_cancel` | Run | ✅ | Fires before a run is cancelled. Return `block` to prevent cancellation. |
851
+ | `before_retry` | Run | ✅ | Fires before a failed run is retried. Return `block` to prevent retry. |
852
+ | `before_forget` | Run | ✅ | Fires before run state is deleted. Return `block` to preserve state. |
853
+ | `before_cleanup` | Run | ✅ | Fires before cleanup operation. Return `block` to prevent cleanup. |
854
+ | `before_publish` | Run | ✅ | Fires before a run is published. Return `block` to prevent publishing. |
855
+ | `session_before_switch` | Session | ✅ | Fires before switching sessions. Return `block` to prevent switch. |
856
+ | `run_recovery` | Run | ✅ | Fires during crash recovery. Return `block` to abort recovery. |
857
+ | `before_turn` | Turn | ❌ | Fires before each turn (not yet implemented). |
858
+ | `after_turn` | Turn | ❌ | Fires after each turn completes (not yet implemented). |
859
+
860
+ ## Hook Modes
861
+
862
+ ### Blocking
863
+ Blocking hooks receive the context and return a decision. If the decision is `block`,
864
+ the operation is aborted immediately.
865
+
866
+ ```typescript
867
+ registerHook({
868
+ name: "before_run_start",
869
+ mode: "blocking",
870
+ handler: (ctx) => {
871
+ if (ctx.data?.someCondition) {
872
+ return { outcome: "block", reason: "Condition not met" };
873
+ }
874
+ return { outcome: "allow" };
875
+ },
876
+ });
877
+ ```
878
+
879
+ ### Non-Blocking
880
+ Non-blocking hooks run asynchronously. Errors are caught and logged but don't affect the operation.
881
+
882
+ ```typescript
883
+ registerHook({
884
+ name: "task_result",
885
+ mode: "non_blocking",
886
+ handler: async (ctx) => {
887
+ await sendToExternalSystem(ctx.data);
888
+ return { outcome: "allow" };
889
+ },
890
+ });
891
+ ```
892
+
893
+ ## Modify Outcome
894
+
895
+ Hooks can modify context data for subsequent hooks or the operation itself.
896
+
897
+ ```typescript
898
+ registerHook({
899
+ name: "before_task_start",
900
+ mode: "non_blocking",
901
+ handler: (ctx) => ({
902
+ outcome: "modify",
903
+ data: { ...ctx.data, injectedField: "value" },
904
+ }),
905
+ });
906
+ ```
907
+
908
+ ## Example: Block duplicate runs
909
+
910
+ ```typescript
911
+ registerHook({
912
+ name: "before_run_start",
913
+ mode: "blocking",
914
+ handler: (ctx) => {
915
+ const activeRuns = getActiveRuns();
916
+ if (activeRuns.some(r => r.goal === ctx.data?.goal && r.runId !== ctx.runId)) {
917
+ return { outcome: "block", reason: "Another run with the same goal is already active." };
918
+ }
919
+ return { outcome: "allow" };
920
+ },
921
+ });
922
+ ```
923
+
924
+ ## Example: External logging
925
+
926
+ ```typescript
927
+ registerHook({
928
+ name: "task_result",
929
+ mode: "non_blocking",
930
+ handler: async (ctx) => {
931
+ await fetch("https://metrics.example.com/hook", {
932
+ method: "POST",
933
+ body: JSON.stringify({ runId: ctx.runId, taskId: ctx.taskId, ...ctx.data }),
934
+ });
935
+ return { outcome: "allow" };
936
+ },
937
+ });
938
+ ```
939
+
940
+ ## Example: Rate limiting
941
+
942
+ ```typescript
943
+ const recentCancellations = new Map<string, number>();
944
+ registerHook({
945
+ name: "before_cancel",
946
+ mode: "blocking",
947
+ handler: (ctx) => {
948
+ const count = (recentCancellations.get(ctx.runId) ?? 0) + 1;
949
+ recentCancellations.set(ctx.runId, count);
950
+ if (count > 3) {
951
+ return { outcome: "block", reason: "Too many cancellations. Wait before cancelling again." };
952
+ }
953
+ return { outcome: "allow" };
954
+ },
955
+ });
956
+ ```
957
+ ```
958
+
959
+ ---
960
+
961
+ ## Implementation Priority
962
+
963
+ | # | Opportunity | Priority | Effort | Impact | Action |
964
+ |---|-------------|----------|--------|--------|--------|
965
+ | 1 | BM25 Semantic Reranking | HIGH | Medium | High | Start next sprint |
966
+ | 2 | Extended Hook Phases | MEDIUM | Medium | Medium | Design review needed |
967
+ | 3 | Hook Lifecycle Tests | MEDIUM | Small | Medium | Write tests now |
968
+ | 4A | Task Phase Tracking | LOW | Small | Low | Nice-to-have |
969
+ | 4B | Hook Documentation | LOW | Small | Medium | Write docs now |