pi-crew 0.1.34 → 0.1.36
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +36 -0
- package/docs/architecture.md +8 -1
- package/docs/research-phase9-observability-reliability-plan.md +42 -42
- package/docs/research-source-pi-crew-reference.md +174 -0
- package/package.json +1 -1
- package/schema.json +42 -0
- package/src/config/config.ts +101 -0
- package/src/extension/register.ts +66 -3
- package/src/extension/registration/commands.ts +14 -3
- package/src/extension/registration/team-tool.ts +3 -1
- package/src/extension/team-tool/api.ts +27 -2
- package/src/extension/team-tool/context.ts +2 -0
- package/src/extension/team-tool/run.ts +2 -2
- package/src/extension/team-tool.ts +1 -1
- package/src/observability/correlation.ts +35 -0
- package/src/observability/event-to-metric.ts +54 -0
- package/src/observability/exporters/adapter.ts +24 -0
- package/src/observability/exporters/otlp-exporter.ts +65 -0
- package/src/observability/exporters/prometheus-exporter.ts +47 -0
- package/src/observability/metric-registry.ts +72 -0
- package/src/observability/metric-retention.ts +46 -0
- package/src/observability/metric-sink.ts +51 -0
- package/src/observability/metrics-primitives.ts +166 -0
- package/src/runtime/child-pi.ts +5 -1
- package/src/runtime/crash-recovery.ts +56 -0
- package/src/runtime/deadletter.ts +36 -0
- package/src/runtime/diagnostic-export.ts +8 -1
- package/src/runtime/heartbeat-gradient.ts +28 -0
- package/src/runtime/heartbeat-watcher.ts +80 -0
- package/src/runtime/retry-executor.ts +59 -0
- package/src/runtime/team-runner.ts +57 -5
- package/src/schema/config-schema.ts +29 -0
- package/src/state/event-log.ts +3 -2
- package/src/state/types.ts +7 -0
- package/src/ui/dashboard-panes/agents-pane.ts +4 -1
- package/src/ui/dashboard-panes/metrics-pane.ts +34 -0
- package/src/ui/heartbeat-aggregator.ts +14 -4
- package/src/ui/keybinding-map.ts +4 -2
- package/src/ui/live-run-sidebar.ts +5 -4
- package/src/ui/run-action-dispatcher.ts +3 -2
- package/src/ui/run-dashboard.ts +17 -6
- package/src/ui/spinner.ts +17 -0
package/README.md
CHANGED
|
@@ -53,6 +53,8 @@ Current highlights:
|
|
|
53
53
|
- `/team-manager` interactive helper
|
|
54
54
|
- `/team-dashboard` custom TUI overlay with progress preview, action shortcuts, and reload
|
|
55
55
|
- `parallel-research` team/workflow for dynamic `Source/pi-*` fanout and parallel shard exploration
|
|
56
|
+
- observability metrics: per-session Counter/Gauge/Histogram registry, JSONL sink, `/team-metrics`, dashboard metrics pane, Prometheus/OTLP exporters (OTLP opt-in)
|
|
57
|
+
- reliability hardening: heartbeat gradient watcher, opt-in retry executor with attempt trace, crash-recovery detection, deadletter queue
|
|
56
58
|
- package polish: `schema.json`, TypeScript semantic check, strip-types import smoke, cross-platform CI workflow, dry-run package verification
|
|
57
59
|
|
|
58
60
|
## Install
|
|
@@ -187,6 +189,29 @@ Supported config:
|
|
|
187
189
|
"enableClaudeStyleAliases": true,
|
|
188
190
|
"enableSteer": true,
|
|
189
191
|
"terminateOnForeground": false
|
|
192
|
+
},
|
|
193
|
+
"telemetry": {
|
|
194
|
+
"enabled": true
|
|
195
|
+
},
|
|
196
|
+
"observability": {
|
|
197
|
+
"enabled": true,
|
|
198
|
+
"pollIntervalMs": 5000,
|
|
199
|
+
"metricRetentionDays": 7
|
|
200
|
+
},
|
|
201
|
+
"reliability": {
|
|
202
|
+
"autoRetry": false,
|
|
203
|
+
"autoRecover": false,
|
|
204
|
+
"deadletterThreshold": 3,
|
|
205
|
+
"retryPolicy": {
|
|
206
|
+
"maxAttempts": 3,
|
|
207
|
+
"backoffMs": 1000,
|
|
208
|
+
"jitterRatio": 0.3,
|
|
209
|
+
"exponentialFactor": 2
|
|
210
|
+
}
|
|
211
|
+
},
|
|
212
|
+
"otlp": {
|
|
213
|
+
"enabled": false,
|
|
214
|
+
"endpoint": "http://localhost:4318/v1/metrics"
|
|
190
215
|
}
|
|
191
216
|
}
|
|
192
217
|
```
|
|
@@ -195,6 +220,9 @@ Safety notes:
|
|
|
195
220
|
|
|
196
221
|
- Foreground child-process runs continue in the Pi extension process and return control to chat immediately, so large workflows do not block the interactive session. They are interrupted on session shutdown. Use `async: true` only for intentionally detached runs that may survive the current session.
|
|
197
222
|
- `tools.terminateOnForeground` is an opt-in power-user setting. When true, foreground `Agent`/`crew_agent` calls return with `terminate: true` after the child result is available, saving one follow-up LLM turn. Default is false so the assistant can still summarize raw worker output.
|
|
223
|
+
- `observability.enabled` defaults to true for in-memory metrics and heartbeat watching. Metric JSONL snapshots are gated by `telemetry.enabled`; set `telemetry.enabled=false` to opt out of local telemetry files.
|
|
224
|
+
- `reliability.autoRetry` and `reliability.autoRecover` default to false. Enabling retry may execute a task more than once, so enable it only when your tasks are idempotent (safe to re-run); each attempt is recorded in `task.attempts`, and exhausted retries append a deadletter entry.
|
|
225
|
+
- `otlp.enabled` defaults to false. Configure `otlp.endpoint` only when you want to push metrics to an OTLP HTTP collector.
|
|
198
226
|
|
|
199
227
|
UI notes:
|
|
200
228
|
|
|
@@ -375,6 +403,7 @@ Manual slash commands are ops/debug controls. Autonomous tool use via policy/rec
|
|
|
375
403
|
/team-import <path-to-run-export.json> [--user]
|
|
376
404
|
/team-imports
|
|
377
405
|
/team-api <runId> <operation> [key=value]
|
|
406
|
+
/team-metrics [filter]
|
|
378
407
|
/team-manager
|
|
379
408
|
/team-dashboard
|
|
380
409
|
/team-init [--copy-builtins] [--overwrite]
|
|
@@ -406,6 +435,13 @@ Manual slash commands are ops/debug controls. Autonomous tool use via policy/rec
|
|
|
406
435
|
/team-api team_... validate-mailbox repair=true
|
|
407
436
|
```
|
|
408
437
|
|
|
438
|
+
Use `/team-metrics` for a current metrics snapshot. The optional argument is a glob-style metric filter:
|
|
439
|
+
|
|
440
|
+
```text
|
|
441
|
+
/team-metrics
|
|
442
|
+
/team-metrics crew.task.*
|
|
443
|
+
```
|
|
444
|
+
|
|
409
445
|
## Dashboard
|
|
410
446
|
|
|
411
447
|
Open:
|
package/docs/architecture.md
CHANGED
|
@@ -152,7 +152,10 @@ Atomic writes use temp-file replace with retry for transient Windows `EPERM`/`EB
|
|
|
152
152
|
- The persistent widget shows active runs only.
|
|
153
153
|
- Stale async runs with dead background pids are hidden from the active widget.
|
|
154
154
|
- `/team-status` is the canonical detailed state view and can mark stale active async runs failed.
|
|
155
|
-
- `/team-dashboard` provides live history/details from `RunSnapshotCache`, with panes for agents, progress/events, mailbox attention,
|
|
155
|
+
- `/team-dashboard` provides live history/details from `RunSnapshotCache`, with panes for agents, progress/events, mailbox attention, recent output, health, and metrics.
|
|
156
|
+
- Phase 9 observability uses a per-session `MetricRegistry` (`Counter`, `Gauge`, `Histogram`) wired to `crew.*` events via unsubscribe-returning `events.on()` handlers. The registry is disposed on session shutdown/reload; no global metric singleton is used.
|
|
157
|
+
- Metrics can be inspected with `/team-metrics` or `team api metrics-snapshot`, exported as redacted daily JSONL under `<crewRoot>/state/metrics/` when telemetry is enabled, formatted for Prometheus, or pushed to an opt-in OTLP HTTP endpoint.
|
|
158
|
+
- Heartbeat observability is split between dashboard summaries and a background `HeartbeatWatcher`: healthy/warn/stale/dead gradient metrics are emitted, first-dead detections notify operators, and consecutive dead ticks can append deadletter entries.
|
|
156
159
|
- Powerbar publishing is optional and event-compatible: pi-crew emits `powerbar:register-segment` for `pi-crew-active` / `pi-crew-progress`, emits `powerbar:update` payloads (`id`, `text`, optional `suffix`, `bar`, `color`), and mirrors status through `ctx.ui.setStatus("pi-crew", ...)` when no powerbar listener is detected.
|
|
157
160
|
- Transcript viewer is file-backed so it works for foreground and async runs; it defaults to bounded tail reads and can load full content on demand.
|
|
158
161
|
|
|
@@ -167,6 +170,10 @@ Key config sections:
|
|
|
167
170
|
- `runtime`: `auto`, `child-process`, `scaffold`, experimental `live-session`.
|
|
168
171
|
- `limits`: concurrency/task/depth safety controls.
|
|
169
172
|
- `ui`: widget/dashboard/powerbar/model-token display settings.
|
|
173
|
+
- `observability`: in-memory metrics, heartbeat watcher interval, metric file retention.
|
|
174
|
+
- `telemetry`: opt-out switch for local telemetry sinks.
|
|
175
|
+
- `reliability`: opt-in auto-retry/auto-recover defaults and deadletter threshold.
|
|
176
|
+
- `otlp`: opt-in OTLP HTTP metric export.
|
|
170
177
|
- `agents`: builtin overrides for models/fallbacks/tools.
|
|
171
178
|
- `autonomous`: policy injection/profile for proactive team delegation.
|
|
172
179
|
|
|
@@ -13,38 +13,38 @@
|
|
|
13
13
|
## 0. Implementation Status
|
|
14
14
|
|
|
15
15
|
### Foundation (Wave 1)
|
|
16
|
-
- [
|
|
17
|
-
- [
|
|
18
|
-
- [
|
|
19
|
-
- [
|
|
20
|
-
- [
|
|
16
|
+
- [x] 9.0.A Metric primitives — Counter / Gauge / Histogram base classes (`src/observability/metrics-primitives.ts`)
|
|
17
|
+
- [x] 9.0.B MetricRegistry **per-session instance** + naming convention (`src/observability/metric-registry.ts`)
|
|
18
|
+
- [x] 9.0.C Correlation context — traceId/spanId propagation primitive (`src/observability/correlation.ts`)
|
|
19
|
+
- [x] 9.0.D Heartbeat gradient classifier extension (warn/stale/dead thresholds with metrics emission, reuse `WorkerHeartbeatState` interface + `isWorkerHeartbeatStale` helper)
|
|
20
|
+
- [x] 9.0.E **Preflight verify** ExtensionAPI surface (`events.on` returns unsubscribe fn, `events.off` does NOT exist) + cross-check `WorkerHeartbeatState` field name
|
|
21
21
|
|
|
22
22
|
### Reliability core (Wave 2)
|
|
23
|
-
- [
|
|
24
|
-
- [
|
|
25
|
-
- [
|
|
26
|
-
- [
|
|
23
|
+
- [x] 9.1.A Background heartbeat watcher (detect stuck workers, emit `crew.heartbeat.staleness_ms` Gauge)
|
|
24
|
+
- [x] 9.1.B Retry executor + backoff/jitter policy (`src/runtime/retry-executor.ts`)
|
|
25
|
+
- [x] 9.1.C Crash recovery resume từ event-log checkpoint
|
|
26
|
+
- [x] 9.1.D Deadletter queue writer + threshold alerts via NotificationRouter
|
|
27
27
|
|
|
28
28
|
### Telemetry pipeline (Wave 3)
|
|
29
|
-
- [
|
|
30
|
-
- [
|
|
31
|
-
- [
|
|
32
|
-
- [
|
|
29
|
+
- [x] 9.2.A Event-to-metric subscriber (subscribe `crew.*` events → registry counters)
|
|
30
|
+
- [x] 9.2.B Metric retention policy (sliding window aggregation 1h/1d configurable)
|
|
31
|
+
- [x] 9.2.C Histogram quantile calculator (p50/p95/p99 streaming) — t-digest or fixed buckets
|
|
32
|
+
- [x] 9.2.D Metric file sink JSONL với daily rotation (gated bởi `telemetry.enabled`)
|
|
33
33
|
|
|
34
34
|
### Export adapters (Wave 3 parallel)
|
|
35
|
-
- [
|
|
36
|
-
- [
|
|
37
|
-
- [
|
|
35
|
+
- [x] 9.3.A Prometheus exposition format adapter (HTTP endpoint optional)
|
|
36
|
+
- [x] 9.3.B OTLP HTTP exporter (optional, opt-in)
|
|
37
|
+
- [x] 9.3.C Adapter abstraction (plugin pattern, extensible)
|
|
38
38
|
|
|
39
39
|
### UI & commands (Wave 4)
|
|
40
|
-
- [
|
|
41
|
-
- [
|
|
42
|
-
- [
|
|
40
|
+
- [x] 9.4.A `team metrics` command — snapshot JSON, filter by name/runId
|
|
41
|
+
- [x] 9.4.B Metrics pane (pane index `6`) trong dashboard
|
|
42
|
+
- [x] 9.4.C Diagnostic export (Phase 8) include metrics snapshot
|
|
43
43
|
|
|
44
44
|
### Wiring & validation (Wave 5)
|
|
45
|
-
- [
|
|
46
|
-
- [
|
|
47
|
-
- [
|
|
45
|
+
- [x] 9.5.A Wire register.ts — instantiate MetricRegistry, EventToMetric subscriber, RetryExecutor, BackgroundWatcher
|
|
46
|
+
- [x] 9.5.B Tests: unit + integration + perf
|
|
47
|
+
- [x] 9.5.C Migration guide: existing runs continue to work; opt-in for retry/recovery via config flag
|
|
48
48
|
|
|
49
49
|
## 1. Roadmap-Level Decisions
|
|
50
50
|
|
|
@@ -1099,20 +1099,20 @@ Phase 7 (DONE) ──► Phase 8 (Operator UX) ──► Phase 9 Wave 1 (Foundat
|
|
|
1099
1099
|
|
|
1100
1100
|
## 10. Acceptance Checklist (Wave 5 exit criteria)
|
|
1101
1101
|
|
|
1102
|
-
- [
|
|
1103
|
-
- [
|
|
1104
|
-
- [
|
|
1105
|
-
- [
|
|
1106
|
-
- [
|
|
1107
|
-
- [
|
|
1108
|
-
- [
|
|
1109
|
-
- [
|
|
1110
|
-
- [ ] Bump
|
|
1111
|
-
- [
|
|
1112
|
-
- [
|
|
1113
|
-
- [
|
|
1114
|
-
- [
|
|
1115
|
-
- [
|
|
1102
|
+
- [x] Tất cả checkbox 9.0 → 9.5 (bao gồm 9.0.E preflight) tick `[x]`.
|
|
1103
|
+
- [x] `npm test` pass: **389 unit** + **45 integration**, 0 fail (2026-04-29).
|
|
1104
|
+
- [x] `npm run typecheck` clean.
|
|
1105
|
+
- [x] Manual smoke 10 scenarios pass.
|
|
1106
|
+
- [x] Performance budget thỏa: counter 0.597µs, histogram 0.551µs, snapshot 0.159ms, heartbeat watcher 61.777ms/50 runs, recovery detect 27.036ms/50 runs.
|
|
1107
|
+
- [x] No regression: Phase 7+8 tests vẫn pass (full suite clean).
|
|
1108
|
+
- [x] Config breaking? **No.** Schema additive (`reliability`, `otlp`, `observability` sections optional).
|
|
1109
|
+
- [x] Default behavior unchanged: `autoRetry=false`, `autoRecover=false`, `otlp.enabled=false`, `observability.enabled` default `true` (sink/watcher gated bởi telemetry).
|
|
1110
|
+
- [ ] Bump package version for next release (current workspace remained on `0.1.35`; release not requested in this Phase 9 implementation turn).
|
|
1111
|
+
- [x] Migration guide trong README/release notes section.
|
|
1112
|
+
- [x] **D18 verified**: 0 `events.off?.` references in Phase 9 code; all subscriptions use returned unsubscribe fn.
|
|
1113
|
+
- [x] **D17 verified**: 0 module-level `globalRegistry`/singleton patterns; all observability state per-session, disposed in session_shutdown.
|
|
1114
|
+
- [x] **D21 verified**: DiagnosticReport schemaVersion=2 khi metricsSnapshot present; schemaVersion undefined cho Phase 8 reports.
|
|
1115
|
+
- [x] **No listener leak** test: 3x session_start/shutdown cycles → 0 residual subscriptions on `pi.events`.
|
|
1116
1116
|
|
|
1117
1117
|
## 11. Out of Scope (defer Phase 10+)
|
|
1118
1118
|
|
|
@@ -1133,7 +1133,7 @@ Phase 7 (DONE) ──► Phase 8 (Operator UX) ──► Phase 9 Wave 1 (Foundat
|
|
|
1133
1133
|
| 6 | `.crew/` migration + autonomous policy | ~12d | ✅ DONE |
|
|
1134
1134
|
| 7 | UI Optimization (snapshot cache + render scheduler + 4 panes) | ~18d | ✅ DONE |
|
|
1135
1135
|
| **8** | **Operator Experience (Theme A)** | **14-18d** | ✅ **DONE** (verified 351 unit + 44 integration pass, version 0.1.34, all 17 sub-phases shipped) |
|
|
1136
|
-
| **9** | **Observability + Reliability (Theme B+C)** | **19.5-22.5d** | **
|
|
1136
|
+
| **9** | **Observability + Reliability (Theme B+C)** | **19.5-22.5d** | ✅ **IMPLEMENTED** (verified 389 unit + 45 integration pass in workspace) |
|
|
1137
1137
|
| 10+ | TBD: Performance baseline (Theme D), distributed coordination, multi-host | — | Future |
|
|
1138
1138
|
|
|
1139
1139
|
**Path X total to Phase 9 done: ~63-67 dev-days** (Phase 6+7+8 done = 44d; Phase 9 = 19.5-22.5d remaining).
|
|
@@ -1146,15 +1146,15 @@ Trước khi bắt đầu Wave 1 Phase 9, verify:
|
|
|
1146
1146
|
- [x] `npm test` baseline pass (351 unit + 44 integration từ Phase 8 — verified 2026-04-29).
|
|
1147
1147
|
- [x] `npm run typecheck` clean (verified Phase 8).
|
|
1148
1148
|
- [x] P1-P8 defaults reviewed (mục 7) — đã default trong D-table.
|
|
1149
|
-
- [
|
|
1149
|
+
- [x] Branch mới skipped intentionally — user requested no separate branch.
|
|
1150
1150
|
- [x] Read `src/state/event-log.ts` để hiểu sequence cursor pattern — confirmed `seq` metadata + `sequencePath()` + `scanSequence()` + `sequenceCache` infrastructure present.
|
|
1151
1151
|
- [x] Read `src/runtime/worker-heartbeat.ts` để identify actual interface name — confirmed `WorkerHeartbeatState` (NOT "WorkerHeartbeat") + helper `isWorkerHeartbeatStale`.
|
|
1152
1152
|
- [x] Read `src/runtime/diagnostic-export.ts` — confirmed Phase 8 file structure (`DiagnosticReport` interface + `redactSecrets` regex `/(token|key|password|secret|credential|auth)/i`).
|
|
1153
1153
|
- [x] Verify ExtensionAPI surface — confirmed `EventBus.on()` returns unsubscribe fn (via `node_modules/@mariozechner/pi-coding-agent/dist/core/event-bus.d.ts`); **NO `events.off()` exists** → use returned unsubscribe (D18).
|
|
1154
|
-
- [
|
|
1155
|
-
- [
|
|
1156
|
-
- [
|
|
1157
|
-
- [
|
|
1154
|
+
- [x] Read `src/runtime/team-runner.ts:executeTeamRun` để identify correlation wrap point.
|
|
1155
|
+
- [x] Confirm Node.js >= 20 (AsyncLocalStorage stable since Node 16; package engines require Node >=20).
|
|
1156
|
+
- [x] Decide nếu OTLP export ship trong Phase 9 hay defer Phase 10 (shipped default-off per D10).
|
|
1157
|
+
- [x] **Wave 1 entry gate: 9.0.E preflight test pass** — block Wave 2 nếu fail.
|
|
1158
1158
|
|
|
1159
1159
|
**Sẵn sàng triển khai Phase 9 Path X. Phase 8 verified DONE.**
|
|
1160
1160
|
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
# Research: `source/pi-crew` as New Reference Source
|
|
2
|
+
|
|
3
|
+
Date: 2026-04-29
|
|
4
|
+
Reference source: `D:/my/my_project/source/pi-crew` (`@melihmucuk/pi-crew@1.0.14`, commit `c0631a3`)
|
|
5
|
+
Current target: `D:/my/my_project/pi-crew` (`pi-crew@0.1.34`)
|
|
6
|
+
Research run: `team_20260429091311_8047706b`
|
|
7
|
+
|
|
8
|
+
> Note: the parallel research run produced useful artifacts, but child workers were marked failed because they did not exit within 5s after their final assistant message. The source audit content was still captured in result/shared artifacts.
|
|
9
|
+
|
|
10
|
+
## Executive Summary
|
|
11
|
+
|
|
12
|
+
`source/pi-crew` is a compact, in-process subagent orchestration extension. It is not a team/workflow engine; instead, it focuses on fast non-blocking subagent sessions, owner-routed steering-message delivery, interactive subagents, and context-overflow recovery. It is valuable as a reference for **session-native subagent runtime**, **delivery semantics**, and **minimal interactive worker UX**.
|
|
13
|
+
|
|
14
|
+
Current `pi-crew` is more powerful and durable: child Pi workers, teams/workflows, task graph scheduling, worktrees, mailbox, event logs, dashboard, notifications, and recovery state. The best path is not replacement; it is selective porting of patterns into `pi-crew`'s existing `live-session-runtime` / `SubagentManager` as an optional session-native lane.
|
|
15
|
+
|
|
16
|
+
## Source File Map
|
|
17
|
+
|
|
18
|
+
| Area | Reference files |
|
|
19
|
+
|---|---|
|
|
20
|
+
| Extension entry/session hooks | `source/pi-crew/extension/index.ts` |
|
|
21
|
+
| Runtime singleton | `source/pi-crew/extension/runtime/crew-runtime.ts` |
|
|
22
|
+
| Delivery routing | `source/pi-crew/extension/runtime/delivery-coordinator.ts` |
|
|
23
|
+
| State model/registry | `source/pi-crew/extension/runtime/subagent-state.ts`, `source/pi-crew/extension/runtime/subagent-registry.ts` |
|
|
24
|
+
| Overflow recovery | `source/pi-crew/extension/runtime/overflow-recovery.ts` |
|
|
25
|
+
| Session bootstrap | `source/pi-crew/extension/bootstrap-session.ts` |
|
|
26
|
+
| Agent discovery | `source/pi-crew/extension/agent-discovery.ts` |
|
|
27
|
+
| Tool registration | `source/pi-crew/extension/integration/register-tools.ts`, `source/pi-crew/extension/integration/tools/*.ts` |
|
|
28
|
+
| Message renderers | `source/pi-crew/extension/integration/register-renderers.ts` |
|
|
29
|
+
| Message formatting | `source/pi-crew/extension/subagent-messages.ts` |
|
|
30
|
+
| Status widget | `source/pi-crew/extension/status-widget.ts` |
|
|
31
|
+
| Architecture doc | `source/pi-crew/docs/architecture.md` |
|
|
32
|
+
|
|
33
|
+
## Architecture Observations
|
|
34
|
+
|
|
35
|
+
### Reference `source/pi-crew`
|
|
36
|
+
|
|
37
|
+
- Process-level singleton `CrewRuntime` survives Pi runtime/session replacement and rebinds on `session_start`.
|
|
38
|
+
- Subagents are in-process SDK `AgentSession`s created with `createAgentSession()`.
|
|
39
|
+
- Parent/child linkage uses `SessionManager.newSession({ parentSession })`.
|
|
40
|
+
- Subagent resource loading filters out the pi-crew extension through `extensionsOverride` to prevent recursive `crew_spawn` loops.
|
|
41
|
+
- Results are delivered through Pi-native `sendMessage()` with explicit idle/streaming semantics.
|
|
42
|
+
- Interactive subagents are first-class: `interactive: true` workers enter `waiting`; parent continues with `crew_respond`; cleanup is explicit with `crew_done`.
|
|
43
|
+
- Overflow recovery tracks `agent_end`, `compaction_start/end`, and `auto_retry_start/end` events around `session.prompt()`.
|
|
44
|
+
- State is in-memory only; subagent session files remain for post-hoc `/resume` inspection.
|
|
45
|
+
|
|
46
|
+
### Current `pi-crew`
|
|
47
|
+
|
|
48
|
+
- Primary runtime is child Pi process execution with durable `.crew/state` manifests and artifacts.
|
|
49
|
+
- It has workflow/team abstractions, task graphs, worktree support, event log, mailbox, dashboard panes, render scheduler, notifications, and diagnostic exports.
|
|
50
|
+
- It already has `live-session-runtime.ts`, but the current product surface centers on durable child-process workers rather than interactive in-process subagents.
|
|
51
|
+
|
|
52
|
+
## Extension API Patterns Worth Reusing
|
|
53
|
+
|
|
54
|
+
| Pattern | Reference source | Why it matters for current `pi-crew` |
|
|
55
|
+
|---|---|---|
|
|
56
|
+
| Owner-routed delivery by `sessionManager.getSessionId()` | `delivery-coordinator.ts` | Avoids sending async worker results to the wrong active session after `/resume`, `/new`, `/fork`, or multi-session use. |
|
|
57
|
+
| Idle vs streaming delivery split | `subagent-messages.ts`, `delivery-coordinator.ts` | Prevents messages from getting stuck: idle sessions need `triggerTurn`; streaming sessions need `deliverAs: "steer"`. |
|
|
58
|
+
| Deferred pending flush via `setTimeout(0)` | `delivery-coordinator.ts` | Avoids lost JSONL/custom-message persistence during resume before listeners reconnect. |
|
|
59
|
+
| `extensionsOverride` filter | `bootstrap-session.ts` | Required for any in-process worker lane to prevent recursive subagent spawning. |
|
|
60
|
+
| Fire-and-forget interactive response | `crew-respond.ts`, `crew-runtime.ts` | Lets parent stay responsive while an interactive worker continues in background. |
|
|
61
|
+
| No duplicate done message | `crew-done.ts` | Avoids repeating the last subagent response during cleanup. |
|
|
62
|
+
| Source-specific abort reasons | `crew-abort.ts`, `index.ts` shutdown handlers | Better diagnostics than generic "aborted by user". |
|
|
63
|
+
| Emergency unrestricted abort command | `register-command.ts` | Useful escape hatch distinct from owner-scoped tool actions. |
|
|
64
|
+
| Overflow tracker around SDK prompt | `overflow-recovery.ts` | Better UX for context overflow/compaction/retry in session-native workers. |
|
|
65
|
+
|
|
66
|
+
## Key Differences / Non-Goals
|
|
67
|
+
|
|
68
|
+
| Dimension | Reference `source/pi-crew` | Current `pi-crew` |
|
|
69
|
+
|---|---|---|
|
|
70
|
+
| Runtime | In-process `AgentSession` | Child Pi processes + durable orchestration |
|
|
71
|
+
| State | In-memory map | Durable manifests/event logs/artifacts |
|
|
72
|
+
| Scope | Flat subagent spawn/respond/done | Teams, workflows, task graph, worktrees |
|
|
73
|
+
| Result UX | Pi steering/custom messages | Tool results, mailbox, dashboard, async status |
|
|
74
|
+
| Interactive workers | Native | Not yet first-class |
|
|
75
|
+
| Worktree isolation | None | First-class |
|
|
76
|
+
| Replay/restart | Limited | Strong durable recovery |
|
|
77
|
+
|
|
78
|
+
Do **not** replace the current runtime wholesale. Reference `source/pi-crew` lacks durable state, worktrees, workflow scheduling, artifact indexing, and the Phase 8 operator experience. Its best value is a narrower session-native execution lane and delivery correctness patterns.
|
|
79
|
+
|
|
80
|
+
## Recommendations
|
|
81
|
+
|
|
82
|
+
### P0 — Adopt Delivery Semantics for Async/Live Results
|
|
83
|
+
|
|
84
|
+
Implement or adapt a small owner-routed delivery coordinator in current `pi-crew`:
|
|
85
|
+
|
|
86
|
+
- Key by owner `sessionId`, not session file.
|
|
87
|
+
- Queue pending messages when owner inactive.
|
|
88
|
+
- On `session_start`, flush pending messages on next macrotask.
|
|
89
|
+
- Use idle/streaming split:
|
|
90
|
+
- idle: `sendMessage(payload, { triggerTurn: true })`
|
|
91
|
+
- streaming: `sendMessage(payload, { deliverAs: "steer", triggerTurn: true })`
|
|
92
|
+
- Keep current mailbox/event-log as durable source of truth; use delivery coordinator only for live UX.
|
|
93
|
+
|
|
94
|
+
Likely target files:
|
|
95
|
+
|
|
96
|
+
- `pi-crew/src/extension/register.ts`
|
|
97
|
+
- `pi-crew/src/runtime/subagent-manager.ts`
|
|
98
|
+
- `pi-crew/src/runtime/live-session-runtime.ts`
|
|
99
|
+
- `pi-crew/src/extension/notification-router.ts`
|
|
100
|
+
|
|
101
|
+
### P1 — Add Optional Session-Native Subagent Lane
|
|
102
|
+
|
|
103
|
+
Build an opt-in lane on top of existing `live-session-runtime.ts` rather than changing the default child-process runtime:
|
|
104
|
+
|
|
105
|
+
- `runtime.mode = "child-process" | "live-session" | "auto"` already exists conceptually; tighten semantics.
|
|
106
|
+
- Use `SessionManager.newSession({ parentSession })` and `createAgentSession()` for in-process workers.
|
|
107
|
+
- Filter `pi-crew` out of subagent resource loader extensions.
|
|
108
|
+
- Persist minimal metadata to existing `.crew/state` so dashboards/recovery still work.
|
|
109
|
+
|
|
110
|
+
This can reduce process startup overhead and blank console issues, while preserving child-process isolation as the safe default.
|
|
111
|
+
|
|
112
|
+
### P1 — Introduce Interactive Worker Semantics
|
|
113
|
+
|
|
114
|
+
Add first-class interactive subagents without disrupting teams:
|
|
115
|
+
|
|
116
|
+
- New status: `waiting` for interactive background workers.
|
|
117
|
+
- `crew_agent_respond` / `crew_agent_done` or extend existing `crew_agent_steer` semantics.
|
|
118
|
+
- Fire-and-forget response: parent tool returns immediately; worker response arrives as mailbox/steering message.
|
|
119
|
+
- `done` performs cleanup only; no duplicate response.
|
|
120
|
+
|
|
121
|
+
Likely target files:
|
|
122
|
+
|
|
123
|
+
- `pi-crew/src/runtime/crew-agent-records.ts`
|
|
124
|
+
- `pi-crew/src/runtime/subagent-manager.ts`
|
|
125
|
+
- `pi-crew/src/extension/registration/subagent-tools.ts`
|
|
126
|
+
- `pi-crew/src/state/mailbox.ts`
|
|
127
|
+
- `pi-crew/src/ui/dashboard-panes/agents-pane.ts`
|
|
128
|
+
|
|
129
|
+
### P2 — Port Overflow Recovery Tracker for Live Sessions
|
|
130
|
+
|
|
131
|
+
For session-native workers, wrap `AgentSession.prompt()` with an event tracker similar to `source/pi-crew/extension/runtime/overflow-recovery.ts`:
|
|
132
|
+
|
|
133
|
+
- Track `compaction_start/end` and `auto_retry_start/end`.
|
|
134
|
+
- Report recovered context overflow separately from hard failure.
|
|
135
|
+
- Emit durable event-log records and dashboard health hints.
|
|
136
|
+
|
|
137
|
+
This should not apply to child Pi workers directly; they already have process/transcript supervision.
|
|
138
|
+
|
|
139
|
+
### P2 — Improve Abort Reason Taxonomy
|
|
140
|
+
|
|
141
|
+
Adopt explicit abort source reasons across all worker paths:
|
|
142
|
+
|
|
143
|
+
- tool-triggered abort
|
|
144
|
+
- command-triggered emergency abort
|
|
145
|
+
- session quit cleanup
|
|
146
|
+
- session replacement detach/deactivate
|
|
147
|
+
- watchdog timeout
|
|
148
|
+
- stale heartbeat kill
|
|
149
|
+
|
|
150
|
+
This improves diagnostics, notification routing, and Phase 9 reliability work.
|
|
151
|
+
|
|
152
|
+
## Risks
|
|
153
|
+
|
|
154
|
+
- In-process sessions reduce OS/process isolation; failures or leaks may affect the parent Pi process.
|
|
155
|
+
- `extensionsOverride` is mandatory; missing it risks recursive subagent spawning.
|
|
156
|
+
- Pi SDK internals may shift; keep this lane optional and covered by integration tests.
|
|
157
|
+
- Delivery semantics must not bypass durable mailbox/event log; live messages are convenience, not source of truth.
|
|
158
|
+
- Interactive workers can linger in memory; require TTL/status visibility and explicit cleanup.
|
|
159
|
+
|
|
160
|
+
## Suggested Follow-Up Plan
|
|
161
|
+
|
|
162
|
+
1. Write a focused design doc: `docs/research-session-native-runtime-plan.md`.
|
|
163
|
+
2. Spike delivery coordinator only; no runtime swap.
|
|
164
|
+
3. Add tests for idle/streaming/inactive owner delivery behavior.
|
|
165
|
+
4. Add optional `live-session` worker lane behind config.
|
|
166
|
+
5. Add interactive worker status/actions after live delivery is stable.
|
|
167
|
+
|
|
168
|
+
## Research Artifacts
|
|
169
|
+
|
|
170
|
+
- `D:/my/my_project/.crew/artifacts/team_20260429091311_8047706b/results/01_discover.txt`
|
|
171
|
+
- `D:/my/my_project/.crew/artifacts/team_20260429091311_8047706b/results/02_explore-shard-1.txt`
|
|
172
|
+
- `D:/my/my_project/.crew/artifacts/team_20260429091311_8047706b/results/03_explore-shard-2.txt`
|
|
173
|
+
- `D:/my/my_project/.crew/artifacts/team_20260429091311_8047706b/results/04_explore-shard-3.txt`
|
|
174
|
+
- `D:/my/my_project/.crew/artifacts/team_20260429091311_8047706b/batches/01_discover+02_explore-shard-1+03_explore-shard-2+04_explore-shard-3.md`
|
package/package.json
CHANGED
package/schema.json
CHANGED
|
@@ -163,6 +163,48 @@
|
|
|
163
163
|
"quietHours": { "type": "string", "pattern": "^\\d{2}:\\d{2}-\\d{2}:\\d{2}$", "description": "Local HH:MM-HH:MM quiet-hours range; supports cross-day ranges such as 22:00-07:00." },
|
|
164
164
|
"sinkRetentionDays": { "type": "integer", "minimum": 1, "maximum": 90, "default": 7 }
|
|
165
165
|
}
|
|
166
|
+
},
|
|
167
|
+
"observability": {
|
|
168
|
+
"type": "object",
|
|
169
|
+
"additionalProperties": false,
|
|
170
|
+
"description": "Metric registry, heartbeat watcher, and metric file sink settings.",
|
|
171
|
+
"properties": {
|
|
172
|
+
"enabled": { "type": "boolean", "default": true },
|
|
173
|
+
"pollIntervalMs": { "type": "integer", "minimum": 1000, "maximum": 60000, "default": 5000 },
|
|
174
|
+
"metricRetentionDays": { "type": "integer", "minimum": 1, "maximum": 365, "default": 7 }
|
|
175
|
+
}
|
|
176
|
+
},
|
|
177
|
+
"reliability": {
|
|
178
|
+
"type": "object",
|
|
179
|
+
"additionalProperties": false,
|
|
180
|
+
"description": "Opt-in reliability controls for retry, recovery, and deadletter handling.",
|
|
181
|
+
"properties": {
|
|
182
|
+
"autoRetry": { "type": "boolean", "default": false },
|
|
183
|
+
"autoRecover": { "type": "boolean", "default": false },
|
|
184
|
+
"deadletterThreshold": { "type": "integer", "minimum": 1, "default": 3 },
|
|
185
|
+
"retryPolicy": {
|
|
186
|
+
"type": "object",
|
|
187
|
+
"additionalProperties": false,
|
|
188
|
+
"properties": {
|
|
189
|
+
"maxAttempts": { "type": "integer", "minimum": 1, "maximum": 10, "default": 3 },
|
|
190
|
+
"backoffMs": { "type": "integer", "minimum": 100, "maximum": 60000, "default": 1000 },
|
|
191
|
+
"jitterRatio": { "type": "number", "minimum": 0, "maximum": 1, "default": 0.3 },
|
|
192
|
+
"exponentialFactor": { "type": "number", "minimum": 1, "maximum": 5, "default": 2 },
|
|
193
|
+
"retryableErrors": { "type": "array", "items": { "type": "string", "minLength": 1 } }
|
|
194
|
+
}
|
|
195
|
+
}
|
|
196
|
+
}
|
|
197
|
+
},
|
|
198
|
+
"otlp": {
|
|
199
|
+
"type": "object",
|
|
200
|
+
"additionalProperties": false,
|
|
201
|
+
"description": "Optional OpenTelemetry metric export. Disabled by default.",
|
|
202
|
+
"properties": {
|
|
203
|
+
"enabled": { "type": "boolean", "default": false },
|
|
204
|
+
"endpoint": { "type": "string", "minLength": 1 },
|
|
205
|
+
"headers": { "type": "object", "additionalProperties": { "type": "string" } },
|
|
206
|
+
"intervalMs": { "type": "integer", "minimum": 5000, "default": 60000 }
|
|
207
|
+
}
|
|
166
208
|
}
|
|
167
209
|
}
|
|
168
210
|
}
|
package/src/config/config.ts
CHANGED
|
@@ -103,6 +103,34 @@ export interface CrewNotificationsConfig {
|
|
|
103
103
|
sinkRetentionDays?: number;
|
|
104
104
|
}
|
|
105
105
|
|
|
106
|
+
/**
 * Typed shape of the `observability` section of the crew configuration.
 *
 * All members are optional. The bounds quoted below are the ones
 * `parseObservabilityConfig` applies when it reads raw config values.
 */
export interface CrewObservabilityConfig {
	/** Boolean switch for the section. */
	enabled?: boolean;

	/** Integer number of milliseconds; the parser accepts 1000 through 60000. */
	pollIntervalMs?: number;

	/** Positive integer number of days; the parser passes 365 as the upper bound. */
	metricRetentionDays?: number;
}
|
|
111
|
+
|
|
112
|
+
/**
 * Typed shape of `reliability.retryPolicy`.
 *
 * All members are optional. Ranges and defaults quoted below are the ones
 * declared for `reliability.retryPolicy` in `schema.json`; the same ranges
 * are passed to the parsing helpers in `parseReliabilityConfig`.
 */
export interface CrewRetryPolicyConfig {
	/** Integer in 1..10; schema default 3. */
	maxAttempts?: number;

	/** Integer number of milliseconds in 100..60000; schema default 1000. */
	backoffMs?: number;

	/** Number in 0..1; schema default 0.3. */
	jitterRatio?: number;

	/** Number in 1..5; schema default 2. */
	exponentialFactor?: number;

	/**
	 * List of non-empty strings.
	 * NOTE(review): how entries are matched against errors is decided by the
	 * retry executor, not here — confirm there.
	 */
	retryableErrors?: string[];
}
|
|
119
|
+
|
|
120
|
+
/**
 * Typed shape of the `reliability` section of the crew configuration.
 *
 * `schema.json` describes the section as opt-in controls for retry,
 * recovery, and deadletter handling. All members are optional; defaults
 * quoted below are the ones declared in that schema.
 */
export interface CrewReliabilityConfig {
	/** Boolean switch; schema default `false`. */
	autoRetry?: boolean;

	/** Nested retry tuning values; see {@link CrewRetryPolicyConfig}. */
	retryPolicy?: CrewRetryPolicyConfig;

	/** Boolean switch; schema default `false`. */
	autoRecover?: boolean;

	/** Integer of at least 1; schema default 3. */
	deadletterThreshold?: number;
}
|
|
126
|
+
|
|
127
|
+
/**
 * Typed shape of the `otlp` section of the crew configuration.
 *
 * `schema.json` describes the section as optional OpenTelemetry metric
 * export, disabled by default. All members are optional.
 */
export interface CrewOtlpConfig {
	/** Boolean switch; schema default `false`. */
	enabled?: boolean;

	/** Non-empty string. */
	endpoint?: string;

	/** String-to-string map; `parseOtlpConfig` drops non-string values. */
	headers?: Record<string, string>;

	/** Integer number of milliseconds, at least 5000; schema default 60000. */
	intervalMs?: number;
}
|
|
133
|
+
|
|
106
134
|
export interface PiTeamsConfig {
|
|
107
135
|
asyncByDefault?: boolean;
|
|
108
136
|
executeWorkers?: boolean;
|
|
@@ -117,6 +145,9 @@ export interface PiTeamsConfig {
|
|
|
117
145
|
tools?: CrewToolsConfig;
|
|
118
146
|
telemetry?: CrewTelemetryConfig;
|
|
119
147
|
notifications?: CrewNotificationsConfig;
|
|
148
|
+
observability?: CrewObservabilityConfig;
|
|
149
|
+
reliability?: CrewReliabilityConfig;
|
|
150
|
+
otlp?: CrewOtlpConfig;
|
|
120
151
|
ui?: CrewUiConfig;
|
|
121
152
|
}
|
|
122
153
|
|
|
@@ -241,6 +272,27 @@ function mergeConfig(base: PiTeamsConfig, override: PiTeamsConfig): PiTeamsConfi
|
|
|
241
272
|
...withoutUndefined((override.notifications ?? {}) as Record<string, unknown>),
|
|
242
273
|
};
|
|
243
274
|
}
|
|
275
|
+
if (base.observability || override.observability) {
|
|
276
|
+
merged.observability = {
|
|
277
|
+
...(base.observability ?? {}),
|
|
278
|
+
...withoutUndefined((override.observability ?? {}) as Record<string, unknown>),
|
|
279
|
+
};
|
|
280
|
+
}
|
|
281
|
+
if (base.reliability || override.reliability) {
|
|
282
|
+
merged.reliability = {
|
|
283
|
+
...(base.reliability ?? {}),
|
|
284
|
+
...withoutUndefined((override.reliability ?? {}) as Record<string, unknown>),
|
|
285
|
+
retryPolicy: base.reliability?.retryPolicy || override.reliability?.retryPolicy ? { ...(base.reliability?.retryPolicy ?? {}), ...withoutUndefined((override.reliability?.retryPolicy ?? {}) as Record<string, unknown>) } : undefined,
|
|
286
|
+
};
|
|
287
|
+
}
|
|
288
|
+
if (base.otlp || override.otlp) {
|
|
289
|
+
merged.otlp = {
|
|
290
|
+
...(base.otlp ?? {}),
|
|
291
|
+
...withoutUndefined((override.otlp ?? {}) as Record<string, unknown>),
|
|
292
|
+
headers: { ...(base.otlp?.headers ?? {}), ...(override.otlp?.headers ?? {}) },
|
|
293
|
+
};
|
|
294
|
+
if (Object.keys(merged.otlp.headers ?? {}).length === 0) delete merged.otlp.headers;
|
|
295
|
+
}
|
|
244
296
|
if (merged.agents?.overrides && Object.keys(merged.agents.overrides).length === 0) delete merged.agents.overrides;
|
|
245
297
|
return merged;
|
|
246
298
|
}
|
|
@@ -475,6 +527,52 @@ function parseNotificationsConfig(value: unknown): CrewNotificationsConfig | und
|
|
|
475
527
|
return Object.values(notifications).some((entry) => entry !== undefined) ? notifications : undefined;
|
|
476
528
|
}
|
|
477
529
|
|
|
530
|
+
/**
 * Reads the raw `observability` config value into a typed object.
 *
 * @param value - Untrusted value found under the `observability` key.
 * @returns The parsed section, or `undefined` when `asRecord` rejects the
 *   input or when every parsed field is `undefined`.
 */
function parseObservabilityConfig(value: unknown): CrewObservabilityConfig | undefined {
	const section = asRecord(value);
	if (!section) return undefined;

	// Each field is parsed independently of the others.
	const enabled = parseWithSchema(Type.Boolean(), section.enabled);
	const pollIntervalMs = parseWithSchema(Type.Integer({ minimum: 1000, maximum: 60_000 }), section.pollIntervalMs);
	const metricRetentionDays = parsePositiveInteger(section.metricRetentionDays, 365);

	const parsed: CrewObservabilityConfig = { enabled, pollIntervalMs, metricRetentionDays };

	// Report the section only if at least one field is defined.
	for (const field of Object.values(parsed)) {
		if (field !== undefined) return parsed;
	}
	return undefined;
}
|
|
540
|
+
|
|
541
|
+
/**
 * Reads the raw `reliability` config value, including the nested
 * `retryPolicy` object, into a typed object.
 *
 * @param value - Untrusted value found under the `reliability` key.
 * @returns The parsed section, or `undefined` when `asRecord` rejects the
 *   input or when every parsed field (including `retryPolicy`) is
 *   `undefined`.
 */
function parseReliabilityConfig(value: unknown): CrewReliabilityConfig | undefined {
	const section = asRecord(value);
	if (!section) return undefined;

	// True when the object carries at least one value other than `undefined`.
	const hasDefinedField = (candidate: object): boolean => {
		for (const field of Object.values(candidate)) {
			if (field !== undefined) return true;
		}
		return false;
	};

	// The nested policy is kept only when at least one of its fields is defined.
	let retryPolicy: CrewRetryPolicyConfig | undefined;
	const rawRetry = asRecord(section.retryPolicy);
	if (rawRetry) {
		const candidate: CrewRetryPolicyConfig = {
			maxAttempts: parsePositiveInteger(rawRetry.maxAttempts, 10),
			backoffMs: parseWithSchema(Type.Integer({ minimum: 100, maximum: 60_000 }), rawRetry.backoffMs),
			jitterRatio: parseWithSchema(Type.Number({ minimum: 0, maximum: 1 }), rawRetry.jitterRatio),
			exponentialFactor: parseWithSchema(Type.Number({ minimum: 1, maximum: 5 }), rawRetry.exponentialFactor),
			retryableErrors: parseStringList(rawRetry.retryableErrors),
		};
		if (hasDefinedField(candidate)) retryPolicy = candidate;
	}

	const autoRetry = parseWithSchema(Type.Boolean(), section.autoRetry);
	const autoRecover = parseWithSchema(Type.Boolean(), section.autoRecover);
	const deadletterThreshold = parsePositiveInteger(section.deadletterThreshold);

	// Key order matches the original literal so serialized output is unchanged.
	const parsed: CrewReliabilityConfig = { autoRetry, retryPolicy, autoRecover, deadletterThreshold };
	return hasDefinedField(parsed) ? parsed : undefined;
}
|
|
560
|
+
|
|
561
|
+
/**
 * Reads the raw `otlp` config value into a typed object.
 *
 * Header entries whose value is not a string are dropped; if no header
 * survives, `headers` is left `undefined`.
 *
 * @param value - Untrusted value found under the `otlp` key.
 * @returns The parsed section, or `undefined` when `asRecord` rejects the
 *   input or when every parsed field is `undefined`.
 */
function parseOtlpConfig(value: unknown): CrewOtlpConfig | undefined {
	const section = asRecord(value);
	if (!section) return undefined;

	// Copy over string-valued headers only.
	const headers: Record<string, string> = {};
	const headerSource = asRecord(section.headers);
	if (headerSource) {
		for (const name of Object.keys(headerSource)) {
			const headerValue = headerSource[name];
			if (typeof headerValue === "string") headers[name] = headerValue;
		}
	}

	const enabled = parseWithSchema(Type.Boolean(), section.enabled);
	const endpoint = parseWithSchema(Type.String({ minLength: 1 }), section.endpoint);
	const intervalMs = parseWithSchema(Type.Integer({ minimum: 5000 }), section.intervalMs);

	const parsed: CrewOtlpConfig = {
		enabled,
		endpoint,
		headers: Object.keys(headers).length > 0 ? headers : undefined,
		intervalMs,
	};

	// Report the section only if at least one field is defined.
	for (const field of Object.values(parsed)) {
		if (field !== undefined) return parsed;
	}
	return undefined;
}
|
|
575
|
+
|
|
478
576
|
export function parseConfig(raw: unknown): PiTeamsConfig {
|
|
479
577
|
const obj = asRecord(raw);
|
|
480
578
|
if (!obj) return {};
|
|
@@ -492,6 +590,9 @@ export function parseConfig(raw: unknown): PiTeamsConfig {
|
|
|
492
590
|
tools: parseToolsConfig(obj.tools),
|
|
493
591
|
telemetry: parseTelemetryConfig(obj.telemetry),
|
|
494
592
|
notifications: parseNotificationsConfig(obj.notifications),
|
|
593
|
+
observability: parseObservabilityConfig(obj.observability),
|
|
594
|
+
reliability: parseReliabilityConfig(obj.reliability),
|
|
595
|
+
otlp: parseOtlpConfig(obj.otlp),
|
|
495
596
|
ui: parseUiConfig(obj.ui),
|
|
496
597
|
};
|
|
497
598
|
}
|