pi-crew 0.9.4 → 0.9.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/CHANGELOG.md +592 -0
  2. package/README.md +55 -3
  3. package/docs/HARNESS_BACKLOG.md +51 -3
  4. package/docs/dynamic-workflows.md +315 -2
  5. package/docs/fix-plan-disabletools-exit-null.md +219 -0
  6. package/docs/troubleshooting.md +102 -0
  7. package/package.json +8 -2
  8. package/src/extension/command-completions.ts +1 -0
  9. package/src/extension/crew-shortcuts.ts +1 -0
  10. package/src/extension/register.ts +2 -0
  11. package/src/extension/registration/commands.ts +3 -0
  12. package/src/extension/team-tool/doctor.ts +14 -0
  13. package/src/extension/team-tool/goal.ts +1 -0
  14. package/src/extension/team-tool/run.ts +4 -0
  15. package/src/runtime/background-runner.ts +24 -2
  16. package/src/runtime/chain-runner.ts +1 -0
  17. package/src/runtime/child-pi.ts +101 -10
  18. package/src/runtime/crash-recovery.ts +78 -36
  19. package/src/runtime/deterministic-ast.ts +161 -0
  20. package/src/runtime/dwf-state-store.ts +97 -0
  21. package/src/runtime/dynamic-workflow-context.ts +381 -7
  22. package/src/runtime/dynamic-workflow-runner.ts +94 -2
  23. package/src/runtime/goal-loop-runner.ts +2 -0
  24. package/src/runtime/live-session-runtime.ts +1 -0
  25. package/src/runtime/model-scope.ts +1 -0
  26. package/src/runtime/peer-dep.ts +1 -0
  27. package/src/runtime/pi-args.ts +11 -0
  28. package/src/runtime/resilient-edit.ts +1 -0
  29. package/src/runtime/result-extractor.ts +72 -7
  30. package/src/runtime/task-runner.ts +1 -0
  31. package/src/runtime/team-runner.ts +8 -3
  32. package/src/runtime/zombie-scanner.ts +297 -0
  33. package/src/schema/team-tool-schema.ts +28 -0
  34. package/src/state/contracts.ts +1 -0
  35. package/src/state/hook-instinct-bridge.ts +3 -0
  36. package/src/state/state-store.ts +3 -0
  37. package/src/state/types.ts +9 -0
  38. package/src/ui/dashboard-panes/progress-pane.ts +5 -0
  39. package/src/ui/dwf-phase-display.ts +151 -0
  40. package/src/ui/run-snapshot-cache.ts +4 -0
  41. package/src/ui/snapshot-types.ts +3 -0
  42. package/src/utils/bm25-search.ts +2 -0
  43. package/src/workflows/workflow-config.ts +3 -0
  44. package/src/worktree/worktree-manager.ts +94 -0
  45. package/types/dwf.d.ts +187 -0
@@ -121,6 +121,7 @@ export const realGoalEvaluator = async (
121
121
  }
122
122
  if (!verificationCompromised) {
123
123
  try {
124
+ // LAZY: defer dynamic import of ./verification-gates.ts to its call site.
124
125
  const { executeVerificationCommands } = await import("./verification-gates.ts");
125
126
  const contract = { requiredGreenLevel: "none" as const, commands: goal.verification.commands, allowManualEvidence: goal.verification.allowManualEvidence ?? false };
126
127
  // Phase 1.5 #2 (RFC 16): run verification in a pristine git worktree at
@@ -131,6 +132,7 @@ export const realGoalEvaluator = async (
131
132
  let worktreeCwd: string | undefined;
132
133
  let worktreeCleanup: (() => void) | undefined;
133
134
  try {
135
+ // LAZY: defer dynamic import of ./verification-worktree.ts to its call site.
134
136
  const { checkWorktreeSandboxAvailable, prepareVerificationWorktree } = await import("./verification-worktree.ts");
135
137
  const availability = checkWorktreeSandboxAvailable(goal.cwd);
136
138
  if (availability.available) {
@@ -36,6 +36,7 @@ import { listLiveAgents } from "./live-agent-manager.ts";
36
36
  * Module-scoped latch for the optional peer dependency import. When N
37
37
  * in-process live-session subagents spawn CONCURRENTLY (e.g. several
38
38
  * `Agent({run_in_background:true})` started at once), each used to call
39
+ // LAZY: defer dynamic import of @earendil-works/pi-coding-agent to its call site.
39
40
  * `await import("@earendil-works/pi-coding-agent")` independently. Under the
40
41
  * tsx loader (registering load/resolve hooks), concurrent first-imports can
41
42
  * each enter the loader and race module-record instantiation, yielding
@@ -128,6 +128,7 @@ export async function readEnabledModelsPatterns(cwd: string, agentDir?: string):
128
128
  // SDK. SettingsManager is dynamically imported because the module
129
129
  // shape differs across pi versions; the create() factory is the
130
130
  // canonical, version-stable entry point.
131
+ // LAZY: defer dynamic import of @earendil-works/pi-coding-agent to its call site.
131
132
  const mod = await import("@earendil-works/pi-coding-agent" as string).catch(() => null);
132
133
  if (!mod) return [];
133
134
  const SettingsManagerCtor = (mod as { SettingsManager?: { create?: (cwd: string, agentDir?: string) => { getEnabledModels?: () => string[] | undefined } } }).SettingsManager;
@@ -239,6 +239,7 @@ export function primePeerDep(): Promise<PeerDepModule> {
239
239
  if (!resolved) {
240
240
  throw new Error(buildMissingMessage());
241
241
  }
242
+ // LAZY: defer dynamic import of the resolved peer-dependency module (resolved.mainUrl) to its call site.
242
243
  cachedModule = (await import(resolved.mainUrl)) as PeerDepModule;
243
244
  return cachedModule;
244
245
  })();
@@ -243,6 +243,12 @@ export function createSafeTempDir(base: string, prefix: string): string {
243
243
  }
244
244
 
245
245
  export function buildPiWorkerArgs(input: BuildPiWorkerArgsInput): BuildPiWorkerArgsResult {
246
+ // NOTE: do NOT add an argv flag like `--crew-subagent` here. Pi uses a strict
247
+ // option parser and REJECTS unknown flags with a non-zero exit, which would
248
+ // break every ctx.agent() call. The authoritative sub-agent identity signal
249
+ // is the PI_CREW_KIND=subagent ENV var (set below) — the zombie scanner and
250
+ // doctor --zombies read it from /proc/<pid>/environ. The user's main session
251
+ // never sets it, so it can never be matched as a sub-agent.
246
252
  const args = ["--mode", "json", "-p"];
247
253
  if (input.sessionEnabled === false) args.push("--no-session");
248
254
 
@@ -327,6 +333,11 @@ export function buildPiWorkerArgs(input: BuildPiWorkerArgsInput): BuildPiWorkerA
327
333
  return {
328
334
  args,
329
335
  env: {
336
+ // PI_CREW_KIND is the authoritative machine-readable sub-agent marker. It is always
337
+ // present on a child-pi process and NEVER present on a user's interactive main session.
338
+ // doctor --zombies uses it to safely list orphaned sub-agents without ever matching a
339
+ // main session (the lesson from an accidental `kill` of a live main session).
340
+ PI_CREW_KIND: "subagent",
330
341
  PI_CREW_INHERIT_PROJECT_CONTEXT: input.agent.inheritProjectContext ? "1" : "0",
331
342
  PI_CREW_INHERIT_SKILLS: input.agent.inheritSkills ? "1" : "0",
332
343
  PI_CREW_DEPTH: String(parentDepth + 1),
@@ -133,6 +133,7 @@ export function wrapEditWithResilientReplace(pi: ExtensionAPI, tools?: { edit: T
133
133
  throw new Error("old_string not found (and resilient retry skipped: missing path/old/new)");
134
134
  }
135
135
 
136
+ // LAZY: defer dynamic import of node:fs/promises to its call site.
136
137
  const fs = await import("node:fs/promises");
137
138
  let content: string;
138
139
  try {
@@ -1,11 +1,19 @@
1
1
  /**
2
2
  * Structured Result Extractor — attempts to extract structured data from worker output.
3
3
  * Tries multiple extraction strategies before falling back to raw text.
4
+ *
5
+ * Round-13 P0-3: optional `schema` (TypeBox `TSchema`) — when provided, extracted
6
+ * data is validated against the schema via `Value.Check`. On mismatch, the result
7
+ * is `structured:false` with an explanatory `error`. Backward compatible: when
8
+ * schema is undefined, behavior is identical to the previous regex-based extractor.
4
9
  */
10
+ import type { TSchema } from "@sinclair/typebox";
11
+ import { Value } from "@sinclair/typebox/value";
12
+
5
13
  export interface ExtractedResult {
6
14
  /** Whether structured data was successfully extracted */
7
15
  structured: boolean;
8
- /** Parsed structured data (if structured=true) */
16
+ /** Parsed structured data (if structured=true AND validated against schema if provided) */
9
17
  data: unknown;
10
18
  /** Raw text output (always available) */
11
19
  rawText: string;
@@ -15,9 +23,13 @@ export interface ExtractedResult {
15
23
 
16
24
  /**
17
25
  * Extract structured result from raw worker output text.
18
- * Tries strategies in order: direct JSON, fenced JSON, key-value markers.
26
+ * Tries strategies in order: direct JSON, fenced JSON, key-value markers, scan.
27
+ *
28
+ * @param raw - the raw text output from a worker
29
+ * @param schema - optional TypeBox schema. When provided, the extracted value is
30
+ * validated; mismatch produces `{structured:false, error:...}`.
19
31
  */
20
- export function extractStructuredResult(raw: string, _schema?: Record<string, unknown>): ExtractedResult {
32
+ export function extractStructuredResult(raw: string, schema?: TSchema): ExtractedResult {
21
33
  const trimmed = raw.trim();
22
34
  if (!trimmed) {
23
35
  return { structured: false, data: null, rawText: raw };
@@ -26,19 +38,19 @@ export function extractStructuredResult(raw: string, _schema?: Record<string, un
26
38
  // Strategy 1: Direct JSON parse (entire output is JSON)
27
39
  const directResult = tryDirectJson(trimmed);
28
40
  if (directResult !== undefined) {
29
- return { structured: true, data: directResult, rawText: raw };
41
+ return finalize(directResult, raw, schema);
30
42
  }
31
43
 
32
44
  // Strategy 2: Extract from ```json ... ``` fence
33
45
  const fencedResult = tryFencedJson(trimmed);
34
46
  if (fencedResult !== undefined) {
35
- return { structured: true, data: fencedResult, rawText: raw };
47
+ return finalize(fencedResult, raw, schema);
36
48
  }
37
49
 
38
50
  // Strategy 3: Extract from markers like "RESULT:" or "OUTPUT:"
39
51
  const markerResult = tryMarkerExtraction(trimmed);
40
52
  if (markerResult !== undefined) {
41
- return { structured: true, data: markerResult, rawText: raw };
53
+ return finalize(markerResult, raw, schema);
42
54
  }
43
55
 
44
56
  // Strategy 4: Scan for the first JSON object/array anywhere in text.
@@ -46,12 +58,65 @@ export function extractStructuredResult(raw: string, _schema?: Record<string, un
46
58
  // around the JSON. This catches JSON embedded in sentences, lists, or prose.
47
59
  const scannedResult = tryScanJson(trimmed);
48
60
  if (scannedResult !== undefined) {
49
- return { structured: true, data: scannedResult, rawText: raw };
61
+ return finalize(scannedResult, raw, schema);
50
62
  }
51
63
 
52
64
  return { structured: false, data: null, rawText: raw };
53
65
  }
54
66
 
67
+ /**
68
+ * After extracting a candidate object, validate it against the optional TypeBox schema.
69
+ * When no schema is given, behavior is the legacy "structured:true" path.
70
+ * When a schema is given and validation fails, return structured:false with a
71
+ * clear error message (caller can surface this in the AgentResult).
72
+ *
73
+ * NOTE: TypeBox 0.34.49's `Value.Check` returns only a boolean; per-error paths are
74
+ * available separately via `Value.Errors`, which is not used here. We use the boolean
75
+ * + a fallback "type mismatch" description. Scripts that need detailed diagnostics can wrap their own validator.
76
+ */
77
+ function finalize(candidate: unknown, raw: string, schema: TSchema | undefined): ExtractedResult {
78
+ if (!schema) {
79
+ return { structured: true, data: candidate, rawText: raw };
80
+ }
81
+ const ok = Value.Check(schema, candidate);
82
+ if (ok) {
83
+ return { structured: true, data: candidate, rawText: raw };
84
+ }
85
+ return {
86
+ structured: false,
87
+ data: null,
88
+ rawText: raw,
89
+ error: `structured output does not match schema: expected shape ${describeSchemaShape(schema)}, got ${describeValue(candidate)}`,
90
+ };
91
+ }
92
+
93
+ function describeValue(value: unknown): string {
94
+ try {
95
+ const json = JSON.stringify(value);
96
+ return json.length > 200 ? `${json.slice(0, 200)}…` : json;
97
+ } catch {
98
+ return typeof value;
99
+ }
100
+ }
101
+
102
+ function describeSchemaShape(schema: unknown): string {
103
+ if (!schema || typeof schema !== "object") return "any";
104
+ const obj = schema as Record<string, unknown>;
105
+ const kind = obj.kind as string | undefined;
106
+ const type = obj.type as string | undefined;
107
+ if (kind === "object" || type === "object") {
108
+ const properties = obj.properties;
109
+ if (!properties || typeof properties !== "object") return "object";
110
+ return `object<${Object.keys(properties as Record<string, unknown>).join(",")}>`;
111
+ }
112
+ if (kind === "array" || type === "array") return "array";
113
+ if (type === "string") return "string";
114
+ if (type === "number" || type === "integer") return "number";
115
+ if (type === "boolean") return "boolean";
116
+ if (Array.isArray(obj.anyOf) || Array.isArray(obj.oneOf)) return "union";
117
+ return "any";
118
+ }
119
+
55
120
  function tryDirectJson(text: string): unknown | undefined {
56
121
  if (!text.startsWith("{") && !text.startsWith("[")) return undefined;
57
122
  try {
@@ -289,6 +289,7 @@ export async function runTeamTask(
289
289
  // follow it and execute a script outside cwd. Throws on escape.
290
290
  resolveRealContainedPath(manifest.cwd, input.step.preStepScript);
291
291
  try {
292
+ // LAZY: defer dynamic import of node:child_process to its call site.
292
293
  const { execFileSync } = await import("node:child_process");
293
294
  preStepOutput = execFileSync(input.step.preStepScript, scriptArgs, {
294
295
  timeout: scriptTimeout,
@@ -63,16 +63,21 @@ builtInRegistry.register(VitePlugin);
63
63
  * executing. The team-runner has no periodic heartbeat today, so any
64
64
  * team run lasting >5min is at risk.
65
65
  */
66
- function startTeamRunHeartbeat(stateRoot: string, runId: string, lastTaskUpdateAt?: string): () => void {
66
+ function startTeamRunHeartbeat(stateRoot: string, runId: string): () => void {
67
67
  const heartbeatPath = path.join(stateRoot, "heartbeat.json");
68
68
  const writeHeartbeat = (): void => {
69
69
  try {
70
+ // lastTaskUpdateAt is written fresh on each tick so the heartbeat
71
+ // never carries a stale creation-time timestamp. Previously this
72
+ // captured manifest.updatedAt once at startup, making the value
73
+ // permanently stale throughout the run.
74
+ const now = new Date().toISOString();
70
75
  fs.writeFileSync(heartbeatPath, JSON.stringify({
71
76
  pid: process.pid,
72
77
  at: Date.now(),
73
78
  runId,
74
79
  kind: "team-runner",
75
- lastTaskUpdateAt,
80
+ lastTaskUpdateAt: now,
76
81
  }), { encoding: "utf-8", mode: 0o600 });
77
82
  } catch {
78
83
  // best-effort
@@ -439,7 +444,7 @@ export async function executeTeamRun(input: ExecuteTeamRunInput): Promise<{ mani
439
444
  // (NO_PID_HEARTBEAT_STALE_MS). Previously only sub-task runners wrote
440
445
  // heartbeats; the team-level run had no heartbeat, so any multi-phase
441
446
  // workflow lasting >5min was marked stale and cancelled.
442
- const stopTeamHeartbeat = startTeamRunHeartbeat(manifest.stateRoot, manifest.runId, manifest.updatedAt);
447
+ const stopTeamHeartbeat = startTeamRunHeartbeat(manifest.stateRoot, manifest.runId);
443
448
 
444
449
  const cleanupUsage = (): void => {
445
450
  for (const task of input.tasks) clearTrackedTaskUsage(task.id);
@@ -0,0 +1,297 @@
1
+ /**
2
+ * zombie-scanner.ts — safely detect orphaned pi-crew sub-agent processes.
3
+ *
4
+ * LESSON (learned the hard way): a heuristic like "old `pi` process + high RSS +
5
+ * orphaned (ppid=1/bash)" will match a user's interactive MAIN session just as
6
+ * readily as a real zombie. The result is a live main session being killed by
7
+ * accident. This module replaces that heuristic with an authoritative signal.
8
+ *
9
+ * Authoritative marker (set by buildPiWorkerArgs on every child-pi spawn):
10
+ * - argv: none — no `--crew-subagent` flag is added, because pi's strict option parser rejects unknown flags
11
+ * - env: `PI_CREW_KIND=subagent` is the machine-readable signal
12
+ *
13
+ * A process is a "pi-crew sub-agent" ONLY IF it carries `PI_CREW_KIND=subagent`
14
+ * in its environment. The user's main `pi` session NEVER has this var, so it can
15
+ * never be matched here — by construction.
16
+ *
17
+ * A sub-agent is a "zombie" ONLY IF its `PI_CREW_PARENT_PID` points at a PID that
18
+ * is no longer alive (parent crashed/exited without reaping the child). A sub-agent
19
+ * whose parent is still running is NOT a zombie — it's a legitimate in-flight task.
20
+ *
21
+ * This module is READ-ONLY. It never kills anything. The caller (doctor --zombies)
22
+ * prints the list and asks for explicit confirmation before any kill.
23
+ */
24
+
25
+ import * as fs from "node:fs";
26
+
27
+ export interface ZombieSubagent {
28
+ pid: number;
29
+ ppid: number;
30
+ /** PID recorded in PI_CREW_PARENT_PID (may differ from ppid if re-parented to init/bash). */
31
+ crewParentPid: number;
32
+ /** Whether the recorded crew parent PID is still alive. */
33
+ parentAlive: boolean;
34
+ role: string | undefined;
35
+ rssKb: number;
36
+ elapsedSec: number | undefined;
37
+ cmd: string;
38
+ }
39
+
40
+ export interface ZombieScanResult {
41
+ zombies: ZombieSubagent[];
42
+ /** Sub-agents whose parent is still alive — shown for transparency, never killed. */
43
+ live: ZombieSubagent[];
44
+ /** Errors encountered while scanning (per-pid). Never aborts the whole scan. */
45
+ errors: string[];
46
+ }
47
+
48
+ /** Read /proc/<pid>/environ as a key=value record. Returns {} if unreadable. */
49
+ function readProcEnviron(pid: number): Record<string, string> {
50
+ try {
51
+ // /proc/<pid>/environ is NUL-separated key=value pairs.
52
+ const raw = fs.readFileSync(`/proc/${pid}/environ`, "utf-8");
53
+ const out: Record<string, string> = {};
54
+ for (const entry of raw.split("\0")) {
55
+ const eq = entry.indexOf("=");
56
+ if (eq > 0) out[entry.slice(0, eq)] = entry.slice(eq + 1);
57
+ }
58
+ return out;
59
+ } catch {
60
+ return {};
61
+ }
62
+ }
63
+
64
+ /** Read /proc/<pid>/stat to get ppid + elapsed. Returns undefined if unreadable. */
65
+ function readProcStat(pid: number): { ppid: number; elapsedSec: number | undefined } | undefined {
66
+ try {
67
+ const stat = fs.readFileSync(`/proc/${pid}/stat`, "utf-8");
68
+ // stat format: pid (comm) state ppid ... starttime ...
69
+ // comm may contain spaces/parens, so parse from the LAST ')' backwards.
70
+ const closeParen = stat.lastIndexOf(")");
71
+ if (closeParen < 0) return undefined;
72
+ const rest = stat.slice(closeParen + 2).trim().split(/\s+/);
73
+ // rest[0] = state, rest[1] = ppid
74
+ const ppid = Number.parseInt(rest[1] ?? "", 10);
75
+ // starttime (clock ticks since boot) is field 22 in the full stat → index 19 in `rest`
76
+ const starttimeTicksRaw = Number.parseInt(rest[19] ?? "", 10);
77
+ const starttimeTicks = Number.isFinite(starttimeTicksRaw) ? starttimeTicksRaw : undefined;
78
+ const elapsedSec = computeElapsedSec(starttimeTicks);
79
+ return { ppid: Number.isFinite(ppid) ? ppid : 0, elapsedSec };
80
+ } catch {
81
+ return undefined;
82
+ }
83
+ }
84
+
85
+ function computeElapsedSec(starttimeTicks: number | undefined): number | undefined {
86
+ if (starttimeTicks === undefined || !Number.isFinite(starttimeTicks)) return undefined;
87
+ try {
88
+ // Linux CLK_TCK is virtually always 100 (sysconf(_SC_CLK_TCK)). Reading it
89
+ // portably from Node requires a native addon; hardcoding 100 matches every
90
+ // mainstream Linux distro and keeps this dependency-free.
91
+ const ticksPerSec = 100;
92
+ // /proc/uptime: first field is seconds since boot.
93
+ const uptimeRaw = fs.readFileSync("/proc/uptime", "utf-8");
94
+ const uptimeSec = Number.parseFloat(uptimeRaw.split(" ")[0] ?? "");
95
+ if (!Number.isFinite(uptimeSec)) return undefined;
96
+ // starttime (ticks since boot) → process age in seconds = uptime - starttime/ticksPerSec.
97
+ const startAgeSec = starttimeTicks / ticksPerSec;
98
+ return Math.max(0, uptimeSec - startAgeSec);
99
+ } catch {
100
+ return undefined;
101
+ }
102
+ }
103
+
104
+ function isPidAlive(pid: number): boolean {
105
+ if (!Number.isFinite(pid) || pid <= 0) return false;
106
+ try {
107
+ // process.kill(pid, 0) throws when the pid does not exist (ESRCH) or exists but is not ours (EPERM); both cases return false here.
108
+ process.kill(pid, 0);
109
+ return true;
110
+ } catch {
111
+ return false;
112
+ }
113
+ }
114
+
115
+ function readProcCmdline(pid: number): string {
116
+ try {
117
+ // /proc/<pid>/cmdline is NUL-separated argv.
118
+ const raw = fs.readFileSync(`/proc/${pid}/cmdline`, "utf-8");
119
+ return raw.split("\0").filter(Boolean).join(" ").trim() || `pid ${pid}`;
120
+ } catch {
121
+ return `pid ${pid}`;
122
+ }
123
+ }
124
+
125
+ function readProcRssKb(pid: number): number {
126
+ try {
127
+ const status = fs.readFileSync(`/proc/${pid}/status`, "utf-8");
128
+ const match = status.match(/^VmRSS:\s+(\d+)\s+kB/m);
129
+ return match ? Number.parseInt(match[1] ?? "", 10) : 0;
130
+ } catch {
131
+ return 0;
132
+ }
133
+ }
134
+
135
+ /**
136
+ * Enumerate every numeric /proc entry as a candidate PID; uid and PI_CREW_KIND filtering happen in scanZombieSubagents.
137
+ *
138
+ * Reads /proc directly (Linux only) — no shelling out to pgrep/ps, so the
139
+ * result is deterministic and unaffected by shell quoting or locale. On
140
+ * non-Linux platforms the scanner returns an empty result with a note in
141
+ * `errors` (zombie detection is best-effort; the doctor report still renders).
142
+ */
143
+ function listCandidatePids(): number[] {
144
+ if (process.platform !== "linux") return [];
145
+ const pids: number[] = [];
146
+ try {
147
+ for (const entry of fs.readdirSync("/proc")) {
148
+ if (/^\d+$/.test(entry)) pids.push(Number.parseInt(entry, 10));
149
+ }
150
+ } catch {
151
+ // /proc unreadable (e.g. sandboxed): best-effort — fall through and return whatever PIDs were collected (usually none).
152
+ }
153
+ return pids;
154
+ }
155
+
156
+ /**
157
+ * Scan for orphaned pi-crew sub-agent processes. READ-ONLY — never kills.
158
+ *
159
+ * Returns the full picture: zombies (parent dead), live (parent alive), and
160
+ * any scan errors. Callers decide what to do with the result; this module
161
+ * has no side effects.
162
+ */
163
+ export function scanZombieSubagents(): ZombieScanResult {
164
+ const result: ZombieScanResult = { zombies: [], live: [], errors: [] };
165
+ if (process.platform !== "linux") {
166
+ result.errors.push("zombie scan is Linux-only (/proc required); skipping on " + process.platform);
167
+ return result;
168
+ }
169
+
170
+ const myUid = tryGetUid();
171
+ for (const pid of listCandidatePids()) {
172
+ try {
173
+ // Cheap rejection first: only inspect processes we own (avoid scanning system procs).
174
+ if (myUid !== undefined && getProcUid(pid) !== myUid) continue;
175
+
176
+ const environ = readProcEnviron(pid);
177
+ // AUTHORITATIVE GATE: a process is a pi-crew sub-agent ONLY if it carries
178
+ // PI_CREW_KIND=subagent. The user's main session never sets this, so it can
179
+ // never be matched — this is the fix for accidentally killing main sessions.
180
+ if (environ.PI_CREW_KIND !== "subagent") continue;
181
+
182
+ const crewParentPid = Number.parseInt(environ.PI_CREW_PARENT_PID ?? "", 10);
183
+ const stat = readProcStat(pid);
184
+ const entry: ZombieSubagent = {
185
+ pid,
186
+ ppid: stat?.ppid ?? 0,
187
+ crewParentPid: Number.isFinite(crewParentPid) ? crewParentPid : 0,
188
+ parentAlive: Number.isFinite(crewParentPid) && isPidAlive(crewParentPid),
189
+ role: environ.PI_CREW_ROLE,
190
+ rssKb: readProcRssKb(pid),
191
+ elapsedSec: stat?.elapsedSec,
192
+ cmd: readProcCmdline(pid),
193
+ };
194
+
195
+ if (entry.parentAlive) {
196
+ result.live.push(entry);
197
+ } else {
198
+ result.zombies.push(entry);
199
+ }
200
+ } catch (error) {
201
+ // Race: process may have exited between readdir and read. Don't abort the scan.
202
+ result.errors.push(`pid ${pid}: ${error instanceof Error ? error.message : String(error)}`);
203
+ }
204
+ }
205
+
206
+ // Sort: zombies by descending RSS (biggest leaks first); live sub-agents by ascending pid.
207
+ result.zombies.sort((a, b) => b.rssKb - a.rssKb);
208
+ result.live.sort((a, b) => a.pid - b.pid);
209
+ return result;
210
+ }
211
+
212
+ function tryGetUid(): number | undefined {
213
+ try {
214
+ return process.getuid?.();
215
+ } catch {
216
+ return undefined;
217
+ }
218
+ }
219
+
220
+ function getProcUid(pid: number): number | undefined {
221
+ try {
222
+ // /proc/<pid>/status has Uid: <real> <eff> <sav> <fs>
223
+ const status = fs.readFileSync(`/proc/${pid}/status`, "utf-8");
224
+ const match = status.match(/^Uid:\s+(\d+)/m);
225
+ return match ? Number.parseInt(match[1] ?? "", 10) : undefined;
226
+ } catch {
227
+ return undefined;
228
+ }
229
+ }
230
+
231
+ /**
232
+ * Render a ZombieScanResult as human-readable text for the doctor report.
233
+ * Explicitly labels main-session safety and never suggests killing live parents.
234
+ */
235
+ export function formatZombieReport(scan: ZombieScanResult): string {
236
+ const lines: string[] = [];
237
+ lines.push("## Zombie sub-agent scan (read-only — nothing killed)");
238
+ lines.push("");
239
+ lines.push(
240
+ `Sub-agents identified by PI_CREW_KIND=subagent marker. Main sessions (no marker) are never listed.`,
241
+ );
242
+ lines.push("");
243
+
244
+ if (scan.zombies.length === 0 && scan.live.length === 0) {
245
+ lines.push("No pi-crew sub-agent processes found.");
246
+ if (scan.errors.length > 0) {
247
+ lines.push("");
248
+ lines.push(`Scan notes (${scan.errors.length}):`);
249
+ for (const err of scan.errors.slice(0, 5)) lines.push(` - ${err}`);
250
+ }
251
+ return lines.join("\n");
252
+ }
253
+
254
+ if (scan.zombies.length > 0) {
255
+ lines.push(`### Zombies — parent dead (${scan.zombies.length})`);
256
+ lines.push("These sub-agents are orphaned. Safe to kill after review:");
257
+ lines.push("");
258
+ lines.push(" PID PARENT RSS ROLE CMD");
259
+ for (const z of scan.zombies) {
260
+ lines.push(
261
+ ` ${String(z.pid).padEnd(9)}${String(z.crewParentPid).padEnd(8)}${formatRss(z.rssKb).padEnd(10)}${(z.role ?? "?").padEnd(14)}${z.cmd.slice(0, 60)}`,
262
+ );
263
+ }
264
+ lines.push("");
265
+ }
266
+
267
+ if (scan.live.length > 0) {
268
+ lines.push(`### Live — parent still running (${scan.live.length})`);
269
+ lines.push("NOT zombies. Do not kill (parent PID is alive and may still reap them).");
270
+ lines.push("");
271
+ lines.push(" PID PARENT RSS ROLE CMD");
272
+ for (const l of scan.live) {
273
+ lines.push(
274
+ ` ${String(l.pid).padEnd(9)}${String(l.crewParentPid).padEnd(8)}${formatRss(l.rssKb).padEnd(10)}${(l.role ?? "?").padEnd(14)}${l.cmd.slice(0, 60)}`,
275
+ );
276
+ }
277
+ lines.push("");
278
+ }
279
+
280
+ if (scan.errors.length > 0) {
281
+ lines.push(`Scan errors (${scan.errors.length}, first 5 shown):`);
282
+ for (const err of scan.errors.slice(0, 5)) lines.push(` - ${err}`);
283
+ lines.push("");
284
+ }
285
+
286
+ lines.push("To kill a zombie: `kill <PID>` (the OS will reap it). This tool never kills.");
287
+ return lines.join("\n");
288
+ }
289
+
290
+ function formatRss(kb: number): string {
291
+ if (kb >= 1024 * 1024) return `${(kb / 1024 / 1024).toFixed(1)}G`;
292
+ if (kb >= 1024) return `${(kb / 1024).toFixed(0)}M`;
293
+ return `${kb}K`;
294
+ }
295
+
296
+ // Re-export for tests + callers that want to inspect proc helpers in isolation.
297
+ export const __test = { readProcEnviron, isPidAlive, computeElapsedSec };
@@ -289,6 +289,26 @@ export const TeamToolParams = Type.Object({
289
289
  },
290
290
  ),
291
291
  ),
292
+ tokenBudget: Type.Optional(
293
+ Type.Number({
294
+ description:
295
+ "Per-workflow token budget for dynamic-workflow runs. When set, ctx.agent() auto-rejects with ok:false once exhausted. Accumulated from each agent run's reported usage. Overrides workflow.maxTokenBudget.",
296
+ minimum: 0,
297
+ }),
298
+ ),
299
+ args: Type.Optional(
300
+ // round-14 P1-5: typed workflow arguments. Type.Any() generates an empty {} schema
301
+ // (matches any JSON value) which is strict-provider friendly — no array type union.
302
+ // Description lives in the JSDoc / TeamToolParamsValue below to avoid the
303
+ // "description-only schema" strict-provider check.
304
+ Type.Any(),
305
+ ),
306
+ focus: Type.Optional(
307
+ Type.String({
308
+ description:
309
+ "Sub-focus for the doctor action. 'zombies' runs a READ-ONLY scan for orphaned pi-crew sub-agent processes (identified by PI_CREW_KIND=subagent); it never kills and never matches the user's interactive main session.",
310
+ }),
311
+ ),
292
312
  });
293
313
 
294
314
  export interface TeamToolParamsValue {
@@ -365,6 +385,10 @@ export interface TeamToolParamsValue {
365
385
  skill?: string | string[] | boolean;
366
386
  scope?: "user" | "project" | "both";
367
387
  config?: Record<string, unknown>;
388
+ /** Sub-focus for the `doctor` action. `"zombies"` runs a READ-ONLY scan for
389
+ * orphaned pi-crew sub-agent processes (identified by PI_CREW_KIND=subagent);
390
+ * it never kills and never matches the user's interactive main session. */
391
+ focus?: string;
368
392
  dryRun?: boolean;
369
393
  confirm?: boolean;
370
394
  force?: boolean;
@@ -393,4 +417,8 @@ export interface TeamToolParamsValue {
393
417
  budgetAbort?: number;
394
418
  /** Background dispatch discriminator. Default "team-run". "goal-loop"/"dynamic-workflow" dispatch to their runners (P0/P2). */
395
419
  runKind?: "team-run" | "goal-loop" | "dynamic-workflow";
420
+ /** Per-workflow token budget for dynamic-workflow runs (round-14 P1-2). */
421
+ tokenBudget?: number;
422
+ /** Typed workflow arguments for .dwf.ts scripts, accessible via ctx.args<T>() (round-14 P1-5). */
423
+ args?: unknown;
396
424
  }
@@ -91,6 +91,7 @@ const TEAM_EVENT_TYPES = [
91
91
  "dwf.phase_completed",
92
92
  "dwf.completed",
93
93
  "dwf.failed",
94
+ "dwf.log",
94
95
  ] as const;
95
96
  export type TeamEventType = typeof TEAM_EVENT_TYPES[number];
96
97
 
@@ -11,7 +11,9 @@ let pathsInstance: typeof import("../utils/paths.js") | null = null;
11
11
 
12
12
  async function getStore() {
13
13
  if (!storeInstance) {
14
+ // LAZY: defer dynamic import of ./instinct-store.js to its call site.
14
15
  const { InstinctStore } = await import("./instinct-store.js");
16
+ // LAZY: defer dynamic import of ../utils/paths.js to its call site.
15
17
  const paths = await import("../utils/paths.js");
16
18
  storeInstance = new InstinctStore(paths.projectCrewRoot(process.cwd()));
17
19
  }
@@ -20,6 +22,7 @@ async function getStore() {
20
22
 
21
23
  async function getPaths() {
22
24
  if (!pathsInstance) {
25
+ // LAZY: defer dynamic import of ../utils/paths.js to its call site.
23
26
  pathsInstance = await import("../utils/paths.js");
24
27
  }
25
28
  return pathsInstance;
@@ -228,6 +228,8 @@ export function createRunManifest(params: {
228
228
  workspaceMode?: "single" | "worktree";
229
229
  ownerSessionId?: string;
230
230
  runKind?: "team-run" | "goal-loop" | "dynamic-workflow";
231
+ /** round-14 P1-5: typed workflow arguments for .dwf.ts scripts (ctx.args<T>()). */
232
+ args?: unknown;
231
233
  }): { manifest: TeamRunManifest; tasks: TeamTaskState[]; paths: RunPaths } {
232
234
  const paths = createRunPaths(params.cwd);
233
235
  const now = new Date().toISOString();
@@ -251,6 +253,7 @@ export function createRunManifest(params: {
251
253
  artifacts: [],
252
254
  ...(params.ownerSessionId ? { ownerSessionId: params.ownerSessionId } : {}),
253
255
  runKind: params.runKind ?? "team-run",
256
+ ...(params.args !== undefined ? { args: params.args } : {}),
254
257
  };
255
258
  fs.mkdirSync(paths.stateRoot, { recursive: true });
256
259
  fs.mkdirSync(paths.artifactsRoot, { recursive: true });
@@ -116,6 +116,13 @@ export interface WorkerExitStatus {
116
116
  signal?: string;
117
117
  cleanupErrors: string[];
118
118
  finalDrainMs: number;
119
+ /** Phase-0 diagnostic (HB-003a): final-drain race state for the exit-null
120
+ * disableTools bug. Optional + read-only — absent when no drain timer was
121
+ * ever armed. Phase 1 will use `finalDrainArmed` to decide whether a
122
+ * signal-death (exitCode=null) should be treated as a forced final drain. */
123
+ finalDrainArmed?: boolean;
124
+ forcedFinalDrain?: boolean;
125
+ finalDrainFiredMonotonicMs?: number;
119
126
  }
120
127
 
121
128
  export interface OperationTerminalEvidence {
@@ -185,6 +192,8 @@ export interface TeamRunManifest {
185
192
  runConfig?: unknown;
186
193
  /** Background dispatch discriminator. Default "team-run" runs executeTeamRun; "goal-loop" / "dynamic-workflow" dispatch to their respective runners. Absent = "team-run" for backward compatibility. */
187
194
  runKind?: "team-run" | "goal-loop" | "dynamic-workflow";
195
+ /** round-14 P1-5: typed workflow arguments accessible in .dwf.ts scripts via ctx.args<T>(). Any JSON value; default {} when unset. */
196
+ args?: unknown;
188
197
  summary?: string;
189
198
  policyDecisions?: PolicyDecision[];
190
199
  }