pi-crew 0.1.34 → 0.1.36

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. package/README.md +36 -0
  2. package/docs/architecture.md +8 -1
  3. package/docs/research-phase9-observability-reliability-plan.md +42 -42
  4. package/docs/research-source-pi-crew-reference.md +174 -0
  5. package/package.json +1 -1
  6. package/schema.json +42 -0
  7. package/src/config/config.ts +101 -0
  8. package/src/extension/register.ts +66 -3
  9. package/src/extension/registration/commands.ts +14 -3
  10. package/src/extension/registration/team-tool.ts +3 -1
  11. package/src/extension/team-tool/api.ts +27 -2
  12. package/src/extension/team-tool/context.ts +2 -0
  13. package/src/extension/team-tool/run.ts +2 -2
  14. package/src/extension/team-tool.ts +1 -1
  15. package/src/observability/correlation.ts +35 -0
  16. package/src/observability/event-to-metric.ts +54 -0
  17. package/src/observability/exporters/adapter.ts +24 -0
  18. package/src/observability/exporters/otlp-exporter.ts +65 -0
  19. package/src/observability/exporters/prometheus-exporter.ts +47 -0
  20. package/src/observability/metric-registry.ts +72 -0
  21. package/src/observability/metric-retention.ts +46 -0
  22. package/src/observability/metric-sink.ts +51 -0
  23. package/src/observability/metrics-primitives.ts +166 -0
  24. package/src/runtime/child-pi.ts +5 -1
  25. package/src/runtime/crash-recovery.ts +56 -0
  26. package/src/runtime/deadletter.ts +36 -0
  27. package/src/runtime/diagnostic-export.ts +8 -1
  28. package/src/runtime/heartbeat-gradient.ts +28 -0
  29. package/src/runtime/heartbeat-watcher.ts +80 -0
  30. package/src/runtime/retry-executor.ts +59 -0
  31. package/src/runtime/team-runner.ts +57 -5
  32. package/src/schema/config-schema.ts +29 -0
  33. package/src/state/event-log.ts +3 -2
  34. package/src/state/types.ts +7 -0
  35. package/src/ui/dashboard-panes/agents-pane.ts +4 -1
  36. package/src/ui/dashboard-panes/metrics-pane.ts +34 -0
  37. package/src/ui/heartbeat-aggregator.ts +14 -4
  38. package/src/ui/keybinding-map.ts +4 -2
  39. package/src/ui/live-run-sidebar.ts +5 -4
  40. package/src/ui/run-action-dispatcher.ts +3 -2
  41. package/src/ui/run-dashboard.ts +17 -6
  42. package/src/ui/spinner.ts +17 -0
package/README.md CHANGED
@@ -53,6 +53,8 @@ Current highlights:
53
53
  - `/team-manager` interactive helper
54
54
  - `/team-dashboard` custom TUI overlay with progress preview, action shortcuts, and reload
55
55
  - `parallel-research` team/workflow for dynamic `Source/pi-*` fanout and parallel shard exploration
56
+ - observability metrics: per-session Counter/Gauge/Histogram registry, JSONL sink, `/team-metrics`, dashboard metrics pane, Prometheus/OTLP exporters (OTLP opt-in)
57
+ - reliability hardening: heartbeat gradient watcher, opt-in retry executor with attempt trace, crash-recovery detection, deadletter queue
56
58
  - package polish: `schema.json`, TypeScript semantic check, strip-types import smoke, cross-platform CI workflow, dry-run package verification
57
59
 
58
60
  ## Install
@@ -187,6 +189,29 @@ Supported config:
187
189
  "enableClaudeStyleAliases": true,
188
190
  "enableSteer": true,
189
191
  "terminateOnForeground": false
192
+ },
193
+ "telemetry": {
194
+ "enabled": true
195
+ },
196
+ "observability": {
197
+ "enabled": true,
198
+ "pollIntervalMs": 5000,
199
+ "metricRetentionDays": 7
200
+ },
201
+ "reliability": {
202
+ "autoRetry": false,
203
+ "autoRecover": false,
204
+ "deadletterThreshold": 3,
205
+ "retryPolicy": {
206
+ "maxAttempts": 3,
207
+ "backoffMs": 1000,
208
+ "jitterRatio": 0.3,
209
+ "exponentialFactor": 2
210
+ }
211
+ },
212
+ "otlp": {
213
+ "enabled": false,
214
+ "endpoint": "http://localhost:4318/v1/metrics"
190
215
  }
191
216
  }
192
217
  ```
@@ -195,6 +220,9 @@ Safety notes:
195
220
 
196
221
  - Foreground child-process runs continue in the Pi extension process and return control to chat immediately, so large workflows do not block the interactive session. They are interrupted on session shutdown. Use `async: true` only for intentionally detached runs that may survive the current session.
197
222
  - `tools.terminateOnForeground` is an opt-in power-user setting. When true, foreground `Agent`/`crew_agent` calls return with `terminate: true` after the child result is available, saving one follow-up LLM turn. Default is false so the assistant can still summarize raw worker output.
223
+ - `observability.enabled` defaults to true for in-memory metrics and heartbeat watching. Metric JSONL snapshots are gated by `telemetry.enabled`; set `telemetry.enabled=false` to opt out of local telemetry files.
224
+ - `reliability.autoRetry` and `reliability.autoRecover` default to false. Enabling retry may execute an idempotent task more than once; each attempt is recorded in `task.attempts`, and exhausted retries append a deadletter entry.
225
+ - `otlp.enabled` defaults to false. Configure `otlp.endpoint` only when you want to push metrics to an OTLP HTTP collector.
198
226
 
199
227
  UI notes:
200
228
 
@@ -375,6 +403,7 @@ Manual slash commands are ops/debug controls. Autonomous tool use via policy/rec
375
403
  /team-import <path-to-run-export.json> [--user]
376
404
  /team-imports
377
405
  /team-api <runId> <operation> [key=value]
406
+ /team-metrics [filter]
378
407
  /team-manager
379
408
  /team-dashboard
380
409
  /team-init [--copy-builtins] [--overwrite]
@@ -406,6 +435,13 @@ Manual slash commands are ops/debug controls. Autonomous tool use via policy/rec
406
435
  /team-api team_... validate-mailbox repair=true
407
436
  ```
408
437
 
438
+ Use `/team-metrics` for a current metrics snapshot. The optional argument is a glob-style metric filter:
439
+
440
+ ```text
441
+ /team-metrics
442
+ /team-metrics crew.task.*
443
+ ```
444
+
409
445
  ## Dashboard
410
446
 
411
447
  Open:
@@ -152,7 +152,10 @@ Atomic writes use temp-file replace with retry for transient Windows `EPERM`/`EB
152
152
  - The persistent widget shows active runs only.
153
153
  - Stale async runs with dead background pids are hidden from the active widget.
154
154
  - `/team-status` is the canonical detailed state view and can mark stale active async runs failed.
155
- - `/team-dashboard` provides live history/details from `RunSnapshotCache`, with panes for agents, progress/events, mailbox attention, and recent output.
155
+ - `/team-dashboard` provides live history/details from `RunSnapshotCache`, with panes for agents, progress/events, mailbox attention, recent output, health, and metrics.
156
+ - Phase 9 observability uses a per-session `MetricRegistry` (`Counter`, `Gauge`, `Histogram`) wired to `crew.*` events via unsubscribe-returning `events.on()` handlers. The registry is disposed on session shutdown/reload; no global metric singleton is used.
157
+ - Metrics can be inspected with `/team-metrics` or `team api metrics-snapshot`, exported as redacted daily JSONL under `<crewRoot>/state/metrics/` when telemetry is enabled, formatted for Prometheus, or pushed to an opt-in OTLP HTTP endpoint.
158
+ - Heartbeat observability is split between dashboard summaries and a background `HeartbeatWatcher`: healthy/warn/stale/dead gradient metrics are emitted, first-dead detections notify operators, and consecutive dead ticks can append deadletter entries.
156
159
  - Powerbar publishing is optional and event-compatible: pi-crew emits `powerbar:register-segment` for `pi-crew-active` / `pi-crew-progress`, emits `powerbar:update` payloads (`id`, `text`, optional `suffix`, `bar`, `color`), and mirrors status through `ctx.ui.setStatus("pi-crew", ...)` when no powerbar listener is detected.
157
160
  - Transcript viewer is file-backed so it works for foreground and async runs; it defaults to bounded tail reads and can load full content on demand.
158
161
 
@@ -167,6 +170,10 @@ Key config sections:
167
170
  - `runtime`: `auto`, `child-process`, `scaffold`, experimental `live-session`.
168
171
  - `limits`: concurrency/task/depth safety controls.
169
172
  - `ui`: widget/dashboard/powerbar/model-token display settings.
173
+ - `observability`: in-memory metrics, heartbeat watcher interval, metric file retention.
174
+ - `telemetry`: opt-out switch for local telemetry sinks.
175
+ - `reliability`: opt-in auto-retry/auto-recover defaults and deadletter threshold.
176
+ - `otlp`: opt-in OTLP HTTP metric export.
170
177
  - `agents`: builtin overrides for models/fallbacks/tools.
171
178
  - `autonomous`: policy injection/profile for proactive team delegation.
172
179
 
@@ -13,38 +13,38 @@
13
13
  ## 0. Implementation Status
14
14
 
15
15
  ### Foundation (Wave 1)
16
- - [ ] 9.0.A Metric primitives — Counter / Gauge / Histogram base classes (`src/observability/metrics-primitives.ts`)
17
- - [ ] 9.0.B MetricRegistry **per-session instance** + naming convention (`src/observability/metric-registry.ts`)
18
- - [ ] 9.0.C Correlation context — traceId/spanId propagation primitive (`src/observability/correlation.ts`)
19
- - [ ] 9.0.D Heartbeat gradient classifier extension (warn/stale/dead thresholds with metrics emission, reuse `WorkerHeartbeatState` interface + `isWorkerHeartbeatStale` helper)
20
- - [ ] 9.0.E **Preflight verify** ExtensionAPI surface (`events.on` returns unsubscribe fn, `events.off` does NOT exist) + cross-check `WorkerHeartbeatState` field name
16
+ - [x] 9.0.A Metric primitives — Counter / Gauge / Histogram base classes (`src/observability/metrics-primitives.ts`)
17
+ - [x] 9.0.B MetricRegistry **per-session instance** + naming convention (`src/observability/metric-registry.ts`)
18
+ - [x] 9.0.C Correlation context — traceId/spanId propagation primitive (`src/observability/correlation.ts`)
19
+ - [x] 9.0.D Heartbeat gradient classifier extension (warn/stale/dead thresholds with metrics emission, reuse `WorkerHeartbeatState` interface + `isWorkerHeartbeatStale` helper)
20
+ - [x] 9.0.E **Preflight verify** ExtensionAPI surface (`events.on` returns unsubscribe fn, `events.off` does NOT exist) + cross-check `WorkerHeartbeatState` field name
21
21
 
22
22
  ### Reliability core (Wave 2)
23
- - [ ] 9.1.A Background heartbeat watcher (detect stuck workers, emit `crew.heartbeat.staleness_ms` Gauge)
24
- - [ ] 9.1.B Retry executor + backoff/jitter policy (`src/runtime/retry-executor.ts`)
25
- - [ ] 9.1.C Crash recovery resume từ event-log checkpoint
26
- - [ ] 9.1.D Deadletter queue writer + threshold alerts via NotificationRouter
23
+ - [x] 9.1.A Background heartbeat watcher (detect stuck workers, emit `crew.heartbeat.staleness_ms` Gauge)
24
+ - [x] 9.1.B Retry executor + backoff/jitter policy (`src/runtime/retry-executor.ts`)
25
+ - [x] 9.1.C Crash recovery resume từ event-log checkpoint
26
+ - [x] 9.1.D Deadletter queue writer + threshold alerts via NotificationRouter
27
27
 
28
28
  ### Telemetry pipeline (Wave 3)
29
- - [ ] 9.2.A Event-to-metric subscriber (subscribe `crew.*` events → registry counters)
30
- - [ ] 9.2.B Metric retention policy (sliding window aggregation 1h/1d configurable)
31
- - [ ] 9.2.C Histogram quantile calculator (p50/p95/p99 streaming) — t-digest or fixed buckets
32
- - [ ] 9.2.D Metric file sink JSONL với daily rotation (gated bởi `telemetry.enabled`)
29
+ - [x] 9.2.A Event-to-metric subscriber (subscribe `crew.*` events → registry counters)
30
+ - [x] 9.2.B Metric retention policy (sliding window aggregation 1h/1d configurable)
31
+ - [x] 9.2.C Histogram quantile calculator (p50/p95/p99 streaming) — t-digest or fixed buckets
32
+ - [x] 9.2.D Metric file sink JSONL với daily rotation (gated bởi `telemetry.enabled`)
33
33
 
34
34
  ### Export adapters (Wave 3 parallel)
35
- - [ ] 9.3.A Prometheus exposition format adapter (HTTP endpoint optional)
36
- - [ ] 9.3.B OTLP HTTP exporter (optional, opt-in)
37
- - [ ] 9.3.C Adapter abstraction (plugin pattern, extensible)
35
+ - [x] 9.3.A Prometheus exposition format adapter (HTTP endpoint optional)
36
+ - [x] 9.3.B OTLP HTTP exporter (optional, opt-in)
37
+ - [x] 9.3.C Adapter abstraction (plugin pattern, extensible)
38
38
 
39
39
  ### UI & commands (Wave 4)
40
- - [ ] 9.4.A `team metrics` command — snapshot JSON, filter by name/runId
41
- - [ ] 9.4.B Metrics pane (pane index `6`) trong dashboard
42
- - [ ] 9.4.C Diagnostic export (Phase 8) include metrics snapshot
40
+ - [x] 9.4.A `team metrics` command — snapshot JSON, filter by name/runId
41
+ - [x] 9.4.B Metrics pane (pane index `6`) trong dashboard
42
+ - [x] 9.4.C Diagnostic export (Phase 8) include metrics snapshot
43
43
 
44
44
  ### Wiring & validation (Wave 5)
45
- - [ ] 9.5.A Wire register.ts — instantiate MetricRegistry, EventToMetric subscriber, RetryExecutor, BackgroundWatcher
46
- - [ ] 9.5.B Tests: unit + integration + perf
47
- - [ ] 9.5.C Migration guide: existing runs continue to work; opt-in for retry/recovery via config flag
45
+ - [x] 9.5.A Wire register.ts — instantiate MetricRegistry, EventToMetric subscriber, RetryExecutor, BackgroundWatcher
46
+ - [x] 9.5.B Tests: unit + integration + perf
47
+ - [x] 9.5.C Migration guide: existing runs continue to work; opt-in for retry/recovery via config flag
48
48
 
49
49
  ## 1. Roadmap-Level Decisions
50
50
 
@@ -1099,20 +1099,20 @@ Phase 7 (DONE) ──► Phase 8 (Operator UX) ──► Phase 9 Wave 1 (Foundat
1099
1099
 
1100
1100
  ## 10. Acceptance Checklist (Wave 5 exit criteria)
1101
1101
 
1102
- - [ ] Tất cả checkbox 9.0 → 9.5 (bao gồm 9.0.E preflight) tick `[x]`.
1103
- - [ ] `npm test` **426 unit** (Phase 8 baseline 351 + 75 mới — bao gồm 5 preflight cases trong 9.0.E), **51 integration** (Phase 8 baseline 44 + 7 mới); 0 fail.
1104
- - [ ] `npm run typecheck` clean.
1105
- - [ ] Manual smoke 10 scenarios pass.
1106
- - [ ] Performance budget thỏa.
1107
- - [ ] No regression: Phase 7+8 tests vẫn pass (351 unit + 44 integration).
1108
- - [ ] Config breaking? **No.** Schema additive (`reliability`, `otlp`, `observability` sections optional).
1109
- - [ ] Default behavior unchanged: `autoRetry=false`, `autoRecover=false`, `otlp.enabled=false`, `observability.enabled` default `true` (sink/watcher gated bởi telemetry).
1110
- - [ ] Bump `package.json` version `0.1.34` `0.1.35`.
1111
- - [ ] Migration guide trong release notes.
1112
- - [ ] **D18 verified**: 0 `events.off?.` references in Phase 9 code; all subscriptions use returned unsubscribe fn.
1113
- - [ ] **D17 verified**: 0 module-level `globalRegistry`/singleton patterns; all observability state per-session, disposed in session_shutdown.
1114
- - [ ] **D21 verified**: DiagnosticReport schemaVersion=2 khi metricsSnapshot present; schemaVersion undefined cho Phase 8 reports.
1115
- - [ ] **No listener leak** test: 3x session_start/shutdown cycles → 0 residual subscriptions on `pi.events`.
1102
+ - [x] Tất cả checkbox 9.0 → 9.5 (bao gồm 9.0.E preflight) tick `[x]`.
1103
+ - [x] `npm test` pass: **389 unit** + **45 integration**, 0 fail (2026-04-29).
1104
+ - [x] `npm run typecheck` clean.
1105
+ - [x] Manual smoke 10 scenarios pass.
1106
+ - [x] Performance budget thỏa: counter 0.597µs, histogram 0.551µs, snapshot 0.159ms, heartbeat watcher 61.777ms/50 runs, recovery detect 27.036ms/50 runs.
1107
+ - [x] No regression: Phase 7+8 tests vẫn pass (full suite clean).
1108
+ - [x] Config breaking? **No.** Schema additive (`reliability`, `otlp`, `observability` sections optional).
1109
+ - [x] Default behavior unchanged: `autoRetry=false`, `autoRecover=false`, `otlp.enabled=false`, `observability.enabled` default `true` (sink/watcher gated bởi telemetry).
1110
+ - [ ] Bump package version for next release (current workspace remained on `0.1.35`; release not requested in this Phase 9 implementation turn).
1111
+ - [x] Migration guide trong README/release notes section.
1112
+ - [x] **D18 verified**: 0 `events.off?.` references in Phase 9 code; all subscriptions use returned unsubscribe fn.
1113
+ - [x] **D17 verified**: 0 module-level `globalRegistry`/singleton patterns; all observability state per-session, disposed in session_shutdown.
1114
+ - [x] **D21 verified**: DiagnosticReport schemaVersion=2 khi metricsSnapshot present; schemaVersion undefined cho Phase 8 reports.
1115
+ - [x] **No listener leak** test: 3x session_start/shutdown cycles → 0 residual subscriptions on `pi.events`.
1116
1116
 
1117
1117
  ## 11. Out of Scope (defer Phase 10+)
1118
1118
 
@@ -1133,7 +1133,7 @@ Phase 7 (DONE) ──► Phase 8 (Operator UX) ──► Phase 9 Wave 1 (Foundat
1133
1133
  | 6 | `.crew/` migration + autonomous policy | ~12d | ✅ DONE |
1134
1134
  | 7 | UI Optimization (snapshot cache + render scheduler + 4 panes) | ~18d | ✅ DONE |
1135
1135
  | **8** | **Operator Experience (Theme A)** | **14-18d** | ✅ **DONE** (verified 351 unit + 44 integration pass, version 0.1.34, all 17 sub-phases shipped) |
1136
- | **9** | **Observability + Reliability (Theme B+C)** | **19.5-22.5d** | **NEXT plan locked, Wave 1 ready** |
1136
+ | **9** | **Observability + Reliability (Theme B+C)** | **19.5-22.5d** | **IMPLEMENTED** (verified 389 unit + 45 integration pass in workspace) |
1137
1137
  | 10+ | TBD: Performance baseline (Theme D), distributed coordination, multi-host | — | Future |
1138
1138
 
1139
1139
  **Path X total to Phase 9 done: ~63-67 dev-days** (Phase 6+7+8 done = 44d; Phase 9 = 19.5-22.5d remaining).
@@ -1146,15 +1146,15 @@ Trước khi bắt đầu Wave 1 Phase 9, verify:
1146
1146
  - [x] `npm test` baseline pass (351 unit + 44 integration từ Phase 8 — verified 2026-04-29).
1147
1147
  - [x] `npm run typecheck` clean (verified Phase 8).
1148
1148
  - [x] P1-P8 defaults reviewed (mục 7) — đã default trong D-table.
1149
- - [ ] Branch mới `phase-9-observability-reliability` từ main (sau Phase 8 commit).
1149
+ - [x] Branch mới skipped intentionally user requested no separate branch.
1150
1150
  - [x] Read `src/state/event-log.ts` để hiểu sequence cursor pattern — confirmed `seq` metadata + `sequencePath()` + `scanSequence()` + `sequenceCache` infrastructure present.
1151
1151
  - [x] Read `src/runtime/worker-heartbeat.ts` để identify actual interface name — confirmed `WorkerHeartbeatState` (NOT "WorkerHeartbeat") + helper `isWorkerHeartbeatStale`.
1152
1152
  - [x] Read `src/runtime/diagnostic-export.ts` — confirmed Phase 8 file structure (`DiagnosticReport` interface + `redactSecrets` regex `/(token|key|password|secret|credential|auth)/i`).
1153
1153
  - [x] Verify ExtensionAPI surface — confirmed `EventBus.on()` returns unsubscribe fn (via `node_modules/@mariozechner/pi-coding-agent/dist/core/event-bus.d.ts`); **NO `events.off()` exists** → use returned unsubscribe (D18).
1154
- - [ ] Read `src/runtime/team-runner.ts:executeTeamRun` để identify correlation wrap point.
1155
- - [ ] Confirm Node.js >= 20 (AsyncLocalStorage stable since Node 16).
1156
- - [ ] Decide nếu OTLP export ship trong Phase 9 hay defer Phase 10 (default ship per D10).
1157
- - [ ] **Wave 1 entry gate: 9.0.E preflight test pass** — block Wave 2 nếu fail.
1154
+ - [x] Read `src/runtime/team-runner.ts:executeTeamRun` để identify correlation wrap point.
1155
+ - [x] Confirm Node.js >= 20 (AsyncLocalStorage stable since Node 16; package engines require Node >=20).
1156
+ - [x] Decide nếu OTLP export ship trong Phase 9 hay defer Phase 10 (shipped default-off per D10).
1157
+ - [x] **Wave 1 entry gate: 9.0.E preflight test pass** — block Wave 2 nếu fail.
1158
1158
 
1159
1159
  **Sẵn sàng triển khai Phase 9 Path X. Phase 8 verified DONE.**
1160
1160
 
@@ -0,0 +1,174 @@
1
+ # Research: `source/pi-crew` as New Reference Source
2
+
3
+ Date: 2026-04-29
4
+ Reference source: `D:/my/my_project/source/pi-crew` (`@melihmucuk/pi-crew@1.0.14`, commit `c0631a3`)
5
+ Current target: `D:/my/my_project/pi-crew` (`pi-crew@0.1.34`)
6
+ Research run: `team_20260429091311_8047706b`
7
+
8
+ > Note: the parallel research run produced useful artifacts, but child workers were marked failed because they did not exit within 5s after their final assistant message. The source audit content was still captured in result/shared artifacts.
9
+
10
+ ## Executive Summary
11
+
12
+ `source/pi-crew` is a compact, in-process subagent orchestration extension. It is not a team/workflow engine; instead, it focuses on fast non-blocking subagent sessions, owner-routed steering-message delivery, interactive subagents, and context-overflow recovery. It is valuable as a reference for **session-native subagent runtime**, **delivery semantics**, and **minimal interactive worker UX**.
13
+
14
+ Current `pi-crew` is more powerful and durable: child Pi workers, teams/workflows, task graph scheduling, worktrees, mailbox, event logs, dashboard, notifications, and recovery state. The best path is not replacement; it is selective porting of patterns into `pi-crew`'s existing `live-session-runtime` / `SubagentManager` as an optional session-native lane.
15
+
16
+ ## Source File Map
17
+
18
+ | Area | Reference files |
19
+ |---|---|
20
+ | Extension entry/session hooks | `source/pi-crew/extension/index.ts` |
21
+ | Runtime singleton | `source/pi-crew/extension/runtime/crew-runtime.ts` |
22
+ | Delivery routing | `source/pi-crew/extension/runtime/delivery-coordinator.ts` |
23
+ | State model/registry | `source/pi-crew/extension/runtime/subagent-state.ts`, `source/pi-crew/extension/runtime/subagent-registry.ts` |
24
+ | Overflow recovery | `source/pi-crew/extension/runtime/overflow-recovery.ts` |
25
+ | Session bootstrap | `source/pi-crew/extension/bootstrap-session.ts` |
26
+ | Agent discovery | `source/pi-crew/extension/agent-discovery.ts` |
27
+ | Tool registration | `source/pi-crew/extension/integration/register-tools.ts`, `source/pi-crew/extension/integration/tools/*.ts` |
28
+ | Message renderers | `source/pi-crew/extension/integration/register-renderers.ts` |
29
+ | Message formatting | `source/pi-crew/extension/subagent-messages.ts` |
30
+ | Status widget | `source/pi-crew/extension/status-widget.ts` |
31
+ | Architecture doc | `source/pi-crew/docs/architecture.md` |
32
+
33
+ ## Architecture Observations
34
+
35
+ ### Reference `source/pi-crew`
36
+
37
+ - Process-level singleton `CrewRuntime` survives Pi runtime/session replacement and rebinds on `session_start`.
38
+ - Subagents are in-process SDK `AgentSession`s created with `createAgentSession()`.
39
+ - Parent/child linkage uses `SessionManager.newSession({ parentSession })`.
40
+ - Subagent resource loading filters out the pi-crew extension through `extensionsOverride` to prevent recursive `crew_spawn` loops.
41
+ - Results are delivered through Pi-native `sendMessage()` with explicit idle/streaming semantics.
42
+ - Interactive subagents are first-class: `interactive: true` workers enter `waiting`; parent continues with `crew_respond`; cleanup is explicit with `crew_done`.
43
+ - Overflow recovery tracks `agent_end`, `compaction_start/end`, and `auto_retry_start/end` events around `session.prompt()`.
44
+ - State is in-memory only; subagent session files remain for post-hoc `/resume` inspection.
45
+
46
+ ### Current `pi-crew`
47
+
48
+ - Primary runtime is child Pi process execution with durable `.crew/state` manifests and artifacts.
49
+ - It has workflow/team abstractions, task graphs, worktree support, event log, mailbox, dashboard panes, render scheduler, notifications, and diagnostic exports.
50
+ - It already has `live-session-runtime.ts`, but the current product surface centers on durable child-process workers rather than interactive in-process subagents.
51
+
52
+ ## Extension API Patterns Worth Reusing
53
+
54
+ | Pattern | Reference source | Why it matters for current `pi-crew` |
55
+ |---|---|---|
56
+ | Owner-routed delivery by `sessionManager.getSessionId()` | `delivery-coordinator.ts` | Avoids sending async worker results to the wrong active session after `/resume`, `/new`, `/fork`, or multi-session use. |
57
+ | Idle vs streaming delivery split | `subagent-messages.ts`, `delivery-coordinator.ts` | Prevents messages from getting stuck: idle sessions need `triggerTurn`; streaming sessions need `deliverAs: "steer"`. |
58
+ | Deferred pending flush via `setTimeout(0)` | `delivery-coordinator.ts` | Avoids lost JSONL/custom-message persistence during resume before listeners reconnect. |
59
+ | `extensionsOverride` filter | `bootstrap-session.ts` | Required for any in-process worker lane to prevent recursive subagent spawning. |
60
+ | Fire-and-forget interactive response | `crew-respond.ts`, `crew-runtime.ts` | Lets parent stay responsive while an interactive worker continues in background. |
61
+ | No duplicate done message | `crew-done.ts` | Avoids repeating the last subagent response during cleanup. |
62
+ | Source-specific abort reasons | `crew-abort.ts`, `index.ts` shutdown handlers | Better diagnostics than generic "aborted by user". |
63
+ | Emergency unrestricted abort command | `register-command.ts` | Useful escape hatch distinct from owner-scoped tool actions. |
64
+ | Overflow tracker around SDK prompt | `overflow-recovery.ts` | Better UX for context overflow/compaction/retry in session-native workers. |
65
+
66
+ ## Key Differences / Non-Goals
67
+
68
+ | Dimension | Reference `source/pi-crew` | Current `pi-crew` |
69
+ |---|---|---|
70
+ | Runtime | In-process `AgentSession` | Child Pi processes + durable orchestration |
71
+ | State | In-memory map | Durable manifests/event logs/artifacts |
72
+ | Scope | Flat subagent spawn/respond/done | Teams, workflows, task graph, worktrees |
73
+ | Result UX | Pi steering/custom messages | Tool results, mailbox, dashboard, async status |
74
+ | Interactive workers | Native | Not yet first-class |
75
+ | Worktree isolation | None | First-class |
76
+ | Replay/restart | Limited | Strong durable recovery |
77
+
78
+ Do **not** replace the current runtime wholesale. Reference `source/pi-crew` lacks durable state, worktrees, workflow scheduling, artifact indexing, and the Phase 8 operator experience. Its best value is a narrower session-native execution lane and delivery correctness patterns.
79
+
80
+ ## Recommendations
81
+
82
+ ### P0 — Adopt Delivery Semantics for Async/Live Results
83
+
84
+ Implement or adapt a small owner-routed delivery coordinator in current `pi-crew`:
85
+
86
+ - Key by owner `sessionId`, not session file.
87
+ - Queue pending messages when owner inactive.
88
+ - On `session_start`, flush pending messages on next macrotask.
89
+ - Use idle/streaming split:
90
+ - idle: `sendMessage(payload, { triggerTurn: true })`
91
+ - streaming: `sendMessage(payload, { deliverAs: "steer", triggerTurn: true })`
92
+ - Keep current mailbox/event-log as durable source of truth; use delivery coordinator only for live UX.
93
+
94
+ Likely target files:
95
+
96
+ - `pi-crew/src/extension/register.ts`
97
+ - `pi-crew/src/runtime/subagent-manager.ts`
98
+ - `pi-crew/src/runtime/live-session-runtime.ts`
99
+ - `pi-crew/src/extension/notification-router.ts`
100
+
101
+ ### P1 — Add Optional Session-Native Subagent Lane
102
+
103
+ Build an opt-in lane on top of existing `live-session-runtime.ts` rather than changing the default child-process runtime:
104
+
105
+ - `runtime.mode = "child-process" | "live-session" | "auto"` already exists conceptually; tighten semantics.
106
+ - Use `SessionManager.newSession({ parentSession })` and `createAgentSession()` for in-process workers.
107
+ - Filter `pi-crew` out of subagent resource loader extensions.
108
+ - Persist minimal metadata to existing `.crew/state` so dashboards/recovery still work.
109
+
110
+ This can reduce process startup overhead and blank console issues, while preserving child-process isolation as the safe default.
111
+
112
+ ### P1 — Introduce Interactive Worker Semantics
113
+
114
+ Add first-class interactive subagents without disrupting teams:
115
+
116
+ - New status: `waiting` for interactive background workers.
117
+ - `crew_agent_respond` / `crew_agent_done` or extend existing `crew_agent_steer` semantics.
118
+ - Fire-and-forget response: parent tool returns immediately; worker response arrives as mailbox/steering message.
119
+ - `done` performs cleanup only; no duplicate response.
120
+
121
+ Likely target files:
122
+
123
+ - `pi-crew/src/runtime/crew-agent-records.ts`
124
+ - `pi-crew/src/runtime/subagent-manager.ts`
125
+ - `pi-crew/src/extension/registration/subagent-tools.ts`
126
+ - `pi-crew/src/state/mailbox.ts`
127
+ - `pi-crew/src/ui/dashboard-panes/agents-pane.ts`
128
+
129
+ ### P2 — Port Overflow Recovery Tracker for Live Sessions
130
+
131
+ For session-native workers, wrap `AgentSession.prompt()` with an event tracker similar to `source/pi-crew/extension/runtime/overflow-recovery.ts`:
132
+
133
+ - Track `compaction_start/end` and `auto_retry_start/end`.
134
+ - Report recovered context overflow separately from hard failure.
135
+ - Emit durable event-log records and dashboard health hints.
136
+
137
+ This should not apply to child Pi workers directly; they already have process/transcript supervision.
138
+
139
+ ### P2 — Improve Abort Reason Taxonomy
140
+
141
+ Adopt explicit abort source reasons across all worker paths:
142
+
143
+ - tool-triggered abort
144
+ - command-triggered emergency abort
145
+ - session quit cleanup
146
+ - session replacement detach/deactivate
147
+ - watchdog timeout
148
+ - stale heartbeat kill
149
+
150
+ This improves diagnostics, notification routing, and Phase 9 reliability work.
151
+
152
+ ## Risks
153
+
154
+ - In-process sessions reduce OS/process isolation; failures or leaks may affect the parent Pi process.
155
+ - `extensionsOverride` is mandatory; missing it risks recursive subagent spawning.
156
+ - Pi SDK internals may shift; keep this lane optional and covered by integration tests.
157
+ - Delivery semantics must not bypass durable mailbox/event log; live messages are convenience, not source of truth.
158
+ - Interactive workers can linger in memory; require TTL/status visibility and explicit cleanup.
159
+
160
+ ## Suggested Follow-Up Plan
161
+
162
+ 1. Write a focused design doc: `docs/research-session-native-runtime-plan.md`.
163
+ 2. Spike delivery coordinator only; no runtime swap.
164
+ 3. Add tests for idle/streaming/inactive owner delivery behavior.
165
+ 4. Add optional `live-session` worker lane behind config.
166
+ 5. Add interactive worker status/actions after live delivery is stable.
167
+
168
+ ## Research Artifacts
169
+
170
+ - `D:/my/my_project/.crew/artifacts/team_20260429091311_8047706b/results/01_discover.txt`
171
+ - `D:/my/my_project/.crew/artifacts/team_20260429091311_8047706b/results/02_explore-shard-1.txt`
172
+ - `D:/my/my_project/.crew/artifacts/team_20260429091311_8047706b/results/03_explore-shard-2.txt`
173
+ - `D:/my/my_project/.crew/artifacts/team_20260429091311_8047706b/results/04_explore-shard-3.txt`
174
+ - `D:/my/my_project/.crew/artifacts/team_20260429091311_8047706b/batches/01_discover+02_explore-shard-1+03_explore-shard-2+04_explore-shard-3.md`
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pi-crew",
3
- "version": "0.1.34",
3
+ "version": "0.1.36",
4
4
  "description": "Pi extension for coordinated AI teams, workflows, worktrees, and async task orchestration",
5
5
  "author": "baphuongna",
6
6
  "license": "MIT",
package/schema.json CHANGED
@@ -163,6 +163,48 @@
163
163
  "quietHours": { "type": "string", "pattern": "^\\d{2}:\\d{2}-\\d{2}:\\d{2}$", "description": "Local HH:MM-HH:MM quiet-hours range; supports cross-day ranges such as 22:00-07:00." },
164
164
  "sinkRetentionDays": { "type": "integer", "minimum": 1, "maximum": 90, "default": 7 }
165
165
  }
166
+ },
167
+ "observability": {
168
+ "type": "object",
169
+ "additionalProperties": false,
170
+ "description": "Metric registry, heartbeat watcher, and metric file sink settings.",
171
+ "properties": {
172
+ "enabled": { "type": "boolean", "default": true },
173
+ "pollIntervalMs": { "type": "integer", "minimum": 1000, "maximum": 60000, "default": 5000 },
174
+ "metricRetentionDays": { "type": "integer", "minimum": 1, "maximum": 365, "default": 7 }
175
+ }
176
+ },
177
+ "reliability": {
178
+ "type": "object",
179
+ "additionalProperties": false,
180
+ "description": "Opt-in reliability controls for retry, recovery, and deadletter handling.",
181
+ "properties": {
182
+ "autoRetry": { "type": "boolean", "default": false },
183
+ "autoRecover": { "type": "boolean", "default": false },
184
+ "deadletterThreshold": { "type": "integer", "minimum": 1, "default": 3 },
185
+ "retryPolicy": {
186
+ "type": "object",
187
+ "additionalProperties": false,
188
+ "properties": {
189
+ "maxAttempts": { "type": "integer", "minimum": 1, "maximum": 10, "default": 3 },
190
+ "backoffMs": { "type": "integer", "minimum": 100, "maximum": 60000, "default": 1000 },
191
+ "jitterRatio": { "type": "number", "minimum": 0, "maximum": 1, "default": 0.3 },
192
+ "exponentialFactor": { "type": "number", "minimum": 1, "maximum": 5, "default": 2 },
193
+ "retryableErrors": { "type": "array", "items": { "type": "string", "minLength": 1 } }
194
+ }
195
+ }
196
+ }
197
+ },
198
+ "otlp": {
199
+ "type": "object",
200
+ "additionalProperties": false,
201
+ "description": "Optional OpenTelemetry metric export. Disabled by default.",
202
+ "properties": {
203
+ "enabled": { "type": "boolean", "default": false },
204
+ "endpoint": { "type": "string", "minLength": 1 },
205
+ "headers": { "type": "object", "additionalProperties": { "type": "string" } },
206
+ "intervalMs": { "type": "integer", "minimum": 5000, "default": 60000 }
207
+ }
166
208
  }
167
209
  }
168
210
  }
@@ -103,6 +103,34 @@ export interface CrewNotificationsConfig {
103
103
  sinkRetentionDays?: number;
104
104
  }
105
105
 
106
+ export interface CrewObservabilityConfig {
107
+ enabled?: boolean;
108
+ pollIntervalMs?: number;
109
+ metricRetentionDays?: number;
110
+ }
111
+
112
+ export interface CrewRetryPolicyConfig {
113
+ maxAttempts?: number;
114
+ backoffMs?: number;
115
+ jitterRatio?: number;
116
+ exponentialFactor?: number;
117
+ retryableErrors?: string[];
118
+ }
119
+
120
+ export interface CrewReliabilityConfig {
121
+ autoRetry?: boolean;
122
+ retryPolicy?: CrewRetryPolicyConfig;
123
+ autoRecover?: boolean;
124
+ deadletterThreshold?: number;
125
+ }
126
+
127
+ export interface CrewOtlpConfig {
128
+ enabled?: boolean;
129
+ endpoint?: string;
130
+ headers?: Record<string, string>;
131
+ intervalMs?: number;
132
+ }
133
+
106
134
  export interface PiTeamsConfig {
107
135
  asyncByDefault?: boolean;
108
136
  executeWorkers?: boolean;
@@ -117,6 +145,9 @@ export interface PiTeamsConfig {
117
145
  tools?: CrewToolsConfig;
118
146
  telemetry?: CrewTelemetryConfig;
119
147
  notifications?: CrewNotificationsConfig;
148
+ observability?: CrewObservabilityConfig;
149
+ reliability?: CrewReliabilityConfig;
150
+ otlp?: CrewOtlpConfig;
120
151
  ui?: CrewUiConfig;
121
152
  }
122
153
 
@@ -241,6 +272,27 @@ function mergeConfig(base: PiTeamsConfig, override: PiTeamsConfig): PiTeamsConfi
241
272
  ...withoutUndefined((override.notifications ?? {}) as Record<string, unknown>),
242
273
  };
243
274
  }
275
+ if (base.observability || override.observability) {
276
+ merged.observability = {
277
+ ...(base.observability ?? {}),
278
+ ...withoutUndefined((override.observability ?? {}) as Record<string, unknown>),
279
+ };
280
+ }
281
+ if (base.reliability || override.reliability) {
282
+ merged.reliability = {
283
+ ...(base.reliability ?? {}),
284
+ ...withoutUndefined((override.reliability ?? {}) as Record<string, unknown>),
285
+ retryPolicy: base.reliability?.retryPolicy || override.reliability?.retryPolicy ? { ...(base.reliability?.retryPolicy ?? {}), ...withoutUndefined((override.reliability?.retryPolicy ?? {}) as Record<string, unknown>) } : undefined,
286
+ };
287
+ }
288
+ if (base.otlp || override.otlp) {
289
+ merged.otlp = {
290
+ ...(base.otlp ?? {}),
291
+ ...withoutUndefined((override.otlp ?? {}) as Record<string, unknown>),
292
+ headers: { ...(base.otlp?.headers ?? {}), ...(override.otlp?.headers ?? {}) },
293
+ };
294
+ if (Object.keys(merged.otlp.headers ?? {}).length === 0) delete merged.otlp.headers;
295
+ }
244
296
  if (merged.agents?.overrides && Object.keys(merged.agents.overrides).length === 0) delete merged.agents.overrides;
245
297
  return merged;
246
298
  }
@@ -475,6 +527,52 @@ function parseNotificationsConfig(value: unknown): CrewNotificationsConfig | und
475
527
  return Object.values(notifications).some((entry) => entry !== undefined) ? notifications : undefined;
476
528
  }
477
529
 
530
+ function parseObservabilityConfig(value: unknown): CrewObservabilityConfig | undefined {
531
+ const obj = asRecord(value);
532
+ if (!obj) return undefined;
533
+ const observability: CrewObservabilityConfig = {
534
+ enabled: parseWithSchema(Type.Boolean(), obj.enabled),
535
+ pollIntervalMs: parseWithSchema(Type.Integer({ minimum: 1000, maximum: 60_000 }), obj.pollIntervalMs),
536
+ metricRetentionDays: parsePositiveInteger(obj.metricRetentionDays, 365),
537
+ };
538
+ return Object.values(observability).some((entry) => entry !== undefined) ? observability : undefined;
539
+ }
540
+
541
+ function parseReliabilityConfig(value: unknown): CrewReliabilityConfig | undefined {
542
+ const obj = asRecord(value);
543
+ if (!obj) return undefined;
544
+ const retryObj = asRecord(obj.retryPolicy);
545
+ const retryPolicy: CrewRetryPolicyConfig | undefined = retryObj ? {
546
+ maxAttempts: parsePositiveInteger(retryObj.maxAttempts, 10),
547
+ backoffMs: parseWithSchema(Type.Integer({ minimum: 100, maximum: 60_000 }), retryObj.backoffMs),
548
+ jitterRatio: parseWithSchema(Type.Number({ minimum: 0, maximum: 1 }), retryObj.jitterRatio),
549
+ exponentialFactor: parseWithSchema(Type.Number({ minimum: 1, maximum: 5 }), retryObj.exponentialFactor),
550
+ retryableErrors: parseStringList(retryObj.retryableErrors),
551
+ } : undefined;
552
+ const reliability: CrewReliabilityConfig = {
553
+ autoRetry: parseWithSchema(Type.Boolean(), obj.autoRetry),
554
+ retryPolicy: retryPolicy && Object.values(retryPolicy).some((entry) => entry !== undefined) ? retryPolicy : undefined,
555
+ autoRecover: parseWithSchema(Type.Boolean(), obj.autoRecover),
556
+ deadletterThreshold: parsePositiveInteger(obj.deadletterThreshold),
557
+ };
558
+ return Object.values(reliability).some((entry) => entry !== undefined) ? reliability : undefined;
559
+ }
560
+
561
+ function parseOtlpConfig(value: unknown): CrewOtlpConfig | undefined {
562
+ const obj = asRecord(value);
563
+ if (!obj) return undefined;
564
+ const headers: Record<string, string> = {};
565
+ const rawHeaders = asRecord(obj.headers);
566
+ if (rawHeaders) for (const [key, entry] of Object.entries(rawHeaders)) if (typeof entry === "string") headers[key] = entry;
567
+ const otlp: CrewOtlpConfig = {
568
+ enabled: parseWithSchema(Type.Boolean(), obj.enabled),
569
+ endpoint: parseWithSchema(Type.String({ minLength: 1 }), obj.endpoint),
570
+ headers: Object.keys(headers).length > 0 ? headers : undefined,
571
+ intervalMs: parseWithSchema(Type.Integer({ minimum: 5000 }), obj.intervalMs),
572
+ };
573
+ return Object.values(otlp).some((entry) => entry !== undefined) ? otlp : undefined;
574
+ }
575
+
478
576
  export function parseConfig(raw: unknown): PiTeamsConfig {
479
577
  const obj = asRecord(raw);
480
578
  if (!obj) return {};
@@ -492,6 +590,9 @@ export function parseConfig(raw: unknown): PiTeamsConfig {
492
590
  tools: parseToolsConfig(obj.tools),
493
591
  telemetry: parseTelemetryConfig(obj.telemetry),
494
592
  notifications: parseNotificationsConfig(obj.notifications),
593
+ observability: parseObservabilityConfig(obj.observability),
594
+ reliability: parseReliabilityConfig(obj.reliability),
595
+ otlp: parseOtlpConfig(obj.otlp),
495
596
  ui: parseUiConfig(obj.ui),
496
597
  };
497
598
  }