pi-crew 0.9.4 → 0.9.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +592 -0
- package/README.md +55 -3
- package/docs/HARNESS_BACKLOG.md +51 -3
- package/docs/dynamic-workflows.md +315 -2
- package/docs/fix-plan-disabletools-exit-null.md +219 -0
- package/docs/troubleshooting.md +102 -0
- package/package.json +8 -2
- package/src/extension/command-completions.ts +1 -0
- package/src/extension/crew-shortcuts.ts +1 -0
- package/src/extension/register.ts +2 -0
- package/src/extension/registration/commands.ts +3 -0
- package/src/extension/team-tool/doctor.ts +14 -0
- package/src/extension/team-tool/goal.ts +1 -0
- package/src/extension/team-tool/run.ts +4 -0
- package/src/runtime/background-runner.ts +24 -2
- package/src/runtime/chain-runner.ts +1 -0
- package/src/runtime/child-pi.ts +101 -10
- package/src/runtime/crash-recovery.ts +78 -36
- package/src/runtime/deterministic-ast.ts +161 -0
- package/src/runtime/dwf-state-store.ts +97 -0
- package/src/runtime/dynamic-workflow-context.ts +381 -7
- package/src/runtime/dynamic-workflow-runner.ts +94 -2
- package/src/runtime/goal-loop-runner.ts +2 -0
- package/src/runtime/live-session-runtime.ts +1 -0
- package/src/runtime/model-scope.ts +1 -0
- package/src/runtime/peer-dep.ts +1 -0
- package/src/runtime/pi-args.ts +11 -0
- package/src/runtime/resilient-edit.ts +1 -0
- package/src/runtime/result-extractor.ts +72 -7
- package/src/runtime/task-runner.ts +1 -0
- package/src/runtime/team-runner.ts +8 -3
- package/src/runtime/zombie-scanner.ts +297 -0
- package/src/schema/team-tool-schema.ts +28 -0
- package/src/state/contracts.ts +1 -0
- package/src/state/hook-instinct-bridge.ts +3 -0
- package/src/state/state-store.ts +3 -0
- package/src/state/types.ts +9 -0
- package/src/ui/dashboard-panes/progress-pane.ts +5 -0
- package/src/ui/dwf-phase-display.ts +151 -0
- package/src/ui/run-snapshot-cache.ts +4 -0
- package/src/ui/snapshot-types.ts +3 -0
- package/src/utils/bm25-search.ts +2 -0
- package/src/workflows/workflow-config.ts +3 -0
- package/src/worktree/worktree-manager.ts +94 -0
- package/types/dwf.d.ts +187 -0
|
@@ -121,6 +121,7 @@ export const realGoalEvaluator = async (
|
|
|
121
121
|
}
|
|
122
122
|
if (!verificationCompromised) {
|
|
123
123
|
try {
|
|
124
|
+
// LAZY: defer dynamic import of ./verification-gates.ts to its call site.
|
|
124
125
|
const { executeVerificationCommands } = await import("./verification-gates.ts");
|
|
125
126
|
const contract = { requiredGreenLevel: "none" as const, commands: goal.verification.commands, allowManualEvidence: goal.verification.allowManualEvidence ?? false };
|
|
126
127
|
// Phase 1.5 #2 (RFC 16): run verification in a pristine git worktree at
|
|
@@ -131,6 +132,7 @@ export const realGoalEvaluator = async (
|
|
|
131
132
|
let worktreeCwd: string | undefined;
|
|
132
133
|
let worktreeCleanup: (() => void) | undefined;
|
|
133
134
|
try {
|
|
135
|
+
// LAZY: defer dynamic import of ./verification-worktree.ts to its call site.
|
|
134
136
|
const { checkWorktreeSandboxAvailable, prepareVerificationWorktree } = await import("./verification-worktree.ts");
|
|
135
137
|
const availability = checkWorktreeSandboxAvailable(goal.cwd);
|
|
136
138
|
if (availability.available) {
|
|
@@ -36,6 +36,7 @@ import { listLiveAgents } from "./live-agent-manager.ts";
|
|
|
36
36
|
* Module-scoped latch for the optional peer dependency import. When N
|
|
37
37
|
* in-process live-session subagents spawn CONCURRENTLY (e.g. several
|
|
38
38
|
* `Agent({run_in_background:true})` started at once), each used to call
|
|
39
|
+
// LAZY: defer dynamic import of @earendil-works/pi-coding-agent to its call site.
|
|
39
40
|
* `await import("@earendil-works/pi-coding-agent")` independently. Under the
|
|
40
41
|
* tsx loader (registering load/resolve hooks), concurrent first-imports can
|
|
41
42
|
* each enter the loader and race module-record instantiation, yielding
|
|
@@ -128,6 +128,7 @@ export async function readEnabledModelsPatterns(cwd: string, agentDir?: string):
|
|
|
128
128
|
// SDK. SettingsManager is dynamically imported because the module
|
|
129
129
|
// shape differs across pi versions; the create() factory is the
|
|
130
130
|
// canonical, version-stable entry point.
|
|
131
|
+
// LAZY: defer dynamic import of @earendil-works/pi-coding-agent to its call site.
|
|
131
132
|
const mod = await import("@earendil-works/pi-coding-agent" as string).catch(() => null);
|
|
132
133
|
if (!mod) return [];
|
|
133
134
|
const SettingsManagerCtor = (mod as { SettingsManager?: { create?: (cwd: string, agentDir?: string) => { getEnabledModels?: () => string[] | undefined } } }).SettingsManager;
|
package/src/runtime/peer-dep.ts
CHANGED
|
@@ -239,6 +239,7 @@ export function primePeerDep(): Promise<PeerDepModule> {
|
|
|
239
239
|
if (!resolved) {
|
|
240
240
|
throw new Error(buildMissingMessage());
|
|
241
241
|
}
|
|
242
|
+
// LAZY: defer the dynamic import of the resolved peer-dependency entry point (resolved.mainUrl) to its call site.
|
|
242
243
|
cachedModule = (await import(resolved.mainUrl)) as PeerDepModule;
|
|
243
244
|
return cachedModule;
|
|
244
245
|
})();
|
package/src/runtime/pi-args.ts
CHANGED
|
@@ -243,6 +243,12 @@ export function createSafeTempDir(base: string, prefix: string): string {
|
|
|
243
243
|
}
|
|
244
244
|
|
|
245
245
|
export function buildPiWorkerArgs(input: BuildPiWorkerArgsInput): BuildPiWorkerArgsResult {
|
|
246
|
+
// NOTE: do NOT add an argv flag like `--crew-subagent` here. Pi uses a strict
|
|
247
|
+
// option parser and REJECTS unknown flags with a non-zero exit, which would
|
|
248
|
+
// break every ctx.agent() call. The authoritative sub-agent identity signal
|
|
249
|
+
// is the PI_CREW_KIND=subagent ENV var (set below) — the zombie scanner and
|
|
250
|
+
// doctor --zombies read it from /proc/<pid>/environ. The user's main session
|
|
251
|
+
// never sets it, so it can never be matched as a sub-agent.
|
|
246
252
|
const args = ["--mode", "json", "-p"];
|
|
247
253
|
if (input.sessionEnabled === false) args.push("--no-session");
|
|
248
254
|
|
|
@@ -327,6 +333,11 @@ export function buildPiWorkerArgs(input: BuildPiWorkerArgsInput): BuildPiWorkerA
|
|
|
327
333
|
return {
|
|
328
334
|
args,
|
|
329
335
|
env: {
|
|
336
|
+
// PI_CREW_KIND is the authoritative machine-readable sub-agent marker. It is always
|
|
337
|
+
// present on a child-pi process and NEVER present on a user's interactive main session.
|
|
338
|
+
// doctor --zombies uses it to safely list orphaned sub-agents without ever matching a
|
|
339
|
+
// main session (the lesson from an accidental `kill` of a live main session).
|
|
340
|
+
PI_CREW_KIND: "subagent",
|
|
330
341
|
PI_CREW_INHERIT_PROJECT_CONTEXT: input.agent.inheritProjectContext ? "1" : "0",
|
|
331
342
|
PI_CREW_INHERIT_SKILLS: input.agent.inheritSkills ? "1" : "0",
|
|
332
343
|
PI_CREW_DEPTH: String(parentDepth + 1),
|
|
@@ -133,6 +133,7 @@ export function wrapEditWithResilientReplace(pi: ExtensionAPI, tools?: { edit: T
|
|
|
133
133
|
throw new Error("old_string not found (and resilient retry skipped: missing path/old/new)");
|
|
134
134
|
}
|
|
135
135
|
|
|
136
|
+
// LAZY: defer dynamic import of node:fs/promises to its call site.
|
|
136
137
|
const fs = await import("node:fs/promises");
|
|
137
138
|
let content: string;
|
|
138
139
|
try {
|
|
@@ -1,11 +1,19 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Structured Result Extractor — attempts to extract structured data from worker output.
|
|
3
3
|
* Tries multiple extraction strategies before falling back to raw text.
|
|
4
|
+
*
|
|
5
|
+
* Round-13 P0-3: optional `schema` (TypeBox `TSchema`) — when provided, extracted
|
|
6
|
+
* data is validated against the schema via `Value.Check`. On mismatch, the result
|
|
7
|
+
* is `structured:false` with an explanatory `error`. Backward compatible: when
|
|
8
|
+
* schema is undefined, behavior is identical to the previous regex-based extractor.
|
|
4
9
|
*/
|
|
10
|
+
import type { TSchema } from "@sinclair/typebox";
|
|
11
|
+
import { Value } from "@sinclair/typebox/value";
|
|
12
|
+
|
|
5
13
|
export interface ExtractedResult {
|
|
6
14
|
/** Whether structured data was successfully extracted */
|
|
7
15
|
structured: boolean;
|
|
8
|
-
/** Parsed structured data (if structured=true) */
|
|
16
|
+
/** Parsed structured data (if structured=true AND validated against schema if provided) */
|
|
9
17
|
data: unknown;
|
|
10
18
|
/** Raw text output (always available) */
|
|
11
19
|
rawText: string;
|
|
@@ -15,9 +23,13 @@ export interface ExtractedResult {
|
|
|
15
23
|
|
|
16
24
|
/**
|
|
17
25
|
* Extract structured result from raw worker output text.
|
|
18
|
-
* Tries strategies in order: direct JSON, fenced JSON, key-value markers.
|
|
26
|
+
* Tries strategies in order: direct JSON, fenced JSON, key-value markers, scan.
|
|
27
|
+
*
|
|
28
|
+
* @param raw - the raw text output from a worker
|
|
29
|
+
* @param schema - optional TypeBox schema. When provided, the extracted value is
|
|
30
|
+
* validated; mismatch produces `{structured:false, error:...}`.
|
|
19
31
|
*/
|
|
20
|
-
export function extractStructuredResult(raw: string,
|
|
32
|
+
export function extractStructuredResult(raw: string, schema?: TSchema): ExtractedResult {
|
|
21
33
|
const trimmed = raw.trim();
|
|
22
34
|
if (!trimmed) {
|
|
23
35
|
return { structured: false, data: null, rawText: raw };
|
|
@@ -26,19 +38,19 @@ export function extractStructuredResult(raw: string, _schema?: Record<string, un
|
|
|
26
38
|
// Strategy 1: Direct JSON parse (entire output is JSON)
|
|
27
39
|
const directResult = tryDirectJson(trimmed);
|
|
28
40
|
if (directResult !== undefined) {
|
|
29
|
-
return
|
|
41
|
+
return finalize(directResult, raw, schema);
|
|
30
42
|
}
|
|
31
43
|
|
|
32
44
|
// Strategy 2: Extract from ```json ... ``` fence
|
|
33
45
|
const fencedResult = tryFencedJson(trimmed);
|
|
34
46
|
if (fencedResult !== undefined) {
|
|
35
|
-
return
|
|
47
|
+
return finalize(fencedResult, raw, schema);
|
|
36
48
|
}
|
|
37
49
|
|
|
38
50
|
// Strategy 3: Extract from markers like "RESULT:" or "OUTPUT:"
|
|
39
51
|
const markerResult = tryMarkerExtraction(trimmed);
|
|
40
52
|
if (markerResult !== undefined) {
|
|
41
|
-
return
|
|
53
|
+
return finalize(markerResult, raw, schema);
|
|
42
54
|
}
|
|
43
55
|
|
|
44
56
|
// Strategy 4: Scan for the first JSON object/array anywhere in text.
|
|
@@ -46,12 +58,65 @@ export function extractStructuredResult(raw: string, _schema?: Record<string, un
|
|
|
46
58
|
// around the JSON. This catches JSON embedded in sentences, lists, or prose.
|
|
47
59
|
const scannedResult = tryScanJson(trimmed);
|
|
48
60
|
if (scannedResult !== undefined) {
|
|
49
|
-
return
|
|
61
|
+
return finalize(scannedResult, raw, schema);
|
|
50
62
|
}
|
|
51
63
|
|
|
52
64
|
return { structured: false, data: null, rawText: raw };
|
|
53
65
|
}
|
|
54
66
|
|
|
67
|
+
/**
|
|
68
|
+
* After extracting a candidate object, validate it against the optional TypeBox schema.
|
|
69
|
+
* When no schema is given, behavior is the legacy "structured:true" path.
|
|
70
|
+
* When a schema is given and validation fails, return structured:false with a
|
|
71
|
+
* clear error message (caller can surface this in the AgentResult).
|
|
72
|
+
*
|
|
73
|
+
* NOTE: TypeBox 0.34.49's `Value.Check` returns a boolean and does not expose
|
|
74
|
+
* per-error paths in its public API. We use the boolean + a fallback "type mismatch"
|
|
75
|
+
* description. Scripts that need detailed diagnostics can wrap their own validator.
|
|
76
|
+
*/
|
|
77
|
+
function finalize(candidate: unknown, raw: string, schema: TSchema | undefined): ExtractedResult {
|
|
78
|
+
if (!schema) {
|
|
79
|
+
return { structured: true, data: candidate, rawText: raw };
|
|
80
|
+
}
|
|
81
|
+
const ok = Value.Check(schema, candidate);
|
|
82
|
+
if (ok) {
|
|
83
|
+
return { structured: true, data: candidate, rawText: raw };
|
|
84
|
+
}
|
|
85
|
+
return {
|
|
86
|
+
structured: false,
|
|
87
|
+
data: null,
|
|
88
|
+
rawText: raw,
|
|
89
|
+
error: `structured output does not match schema: expected shape ${describeSchemaShape(schema)}, got ${describeValue(candidate)}`,
|
|
90
|
+
};
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
function describeValue(value: unknown): string {
|
|
94
|
+
try {
|
|
95
|
+
const json = JSON.stringify(value);
|
|
96
|
+
return json.length > 200 ? `${json.slice(0, 200)}…` : json;
|
|
97
|
+
} catch {
|
|
98
|
+
return typeof value;
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
function describeSchemaShape(schema: unknown): string {
|
|
103
|
+
if (!schema || typeof schema !== "object") return "any";
|
|
104
|
+
const obj = schema as Record<string, unknown>;
|
|
105
|
+
const kind = obj.kind as string | undefined;
|
|
106
|
+
const type = obj.type as string | undefined;
|
|
107
|
+
if (kind === "object" || type === "object") {
|
|
108
|
+
const properties = obj.properties;
|
|
109
|
+
if (!properties || typeof properties !== "object") return "object";
|
|
110
|
+
return `object<${Object.keys(properties as Record<string, unknown>).join(",")}>`;
|
|
111
|
+
}
|
|
112
|
+
if (kind === "array" || type === "array") return "array";
|
|
113
|
+
if (type === "string") return "string";
|
|
114
|
+
if (type === "number" || type === "integer") return "number";
|
|
115
|
+
if (type === "boolean") return "boolean";
|
|
116
|
+
if (Array.isArray(obj.anyOf) || Array.isArray(obj.oneOf)) return "union";
|
|
117
|
+
return "any";
|
|
118
|
+
}
|
|
119
|
+
|
|
55
120
|
function tryDirectJson(text: string): unknown | undefined {
|
|
56
121
|
if (!text.startsWith("{") && !text.startsWith("[")) return undefined;
|
|
57
122
|
try {
|
|
@@ -289,6 +289,7 @@ export async function runTeamTask(
|
|
|
289
289
|
// follow it and execute a script outside cwd. Throws on escape.
|
|
290
290
|
resolveRealContainedPath(manifest.cwd, input.step.preStepScript);
|
|
291
291
|
try {
|
|
292
|
+
// LAZY: defer dynamic import of node:child_process to its call site.
|
|
292
293
|
const { execFileSync } = await import("node:child_process");
|
|
293
294
|
preStepOutput = execFileSync(input.step.preStepScript, scriptArgs, {
|
|
294
295
|
timeout: scriptTimeout,
|
|
@@ -63,16 +63,21 @@ builtInRegistry.register(VitePlugin);
|
|
|
63
63
|
* executing. The team-runner has no periodic heartbeat today, so any
|
|
64
64
|
* team run lasting >5min is at risk.
|
|
65
65
|
*/
|
|
66
|
-
function startTeamRunHeartbeat(stateRoot: string, runId: string
|
|
66
|
+
function startTeamRunHeartbeat(stateRoot: string, runId: string): () => void {
|
|
67
67
|
const heartbeatPath = path.join(stateRoot, "heartbeat.json");
|
|
68
68
|
const writeHeartbeat = (): void => {
|
|
69
69
|
try {
|
|
70
|
+
// lastTaskUpdateAt is written fresh on each tick so the heartbeat
|
|
71
|
+
// never carries a stale creation-time timestamp. Previously this
|
|
72
|
+
// captured manifest.updatedAt once at startup, making the value
|
|
73
|
+
// permanently stale throughout the run.
|
|
74
|
+
const now = new Date().toISOString();
|
|
70
75
|
fs.writeFileSync(heartbeatPath, JSON.stringify({
|
|
71
76
|
pid: process.pid,
|
|
72
77
|
at: Date.now(),
|
|
73
78
|
runId,
|
|
74
79
|
kind: "team-runner",
|
|
75
|
-
lastTaskUpdateAt,
|
|
80
|
+
lastTaskUpdateAt: now,
|
|
76
81
|
}), { encoding: "utf-8", mode: 0o600 });
|
|
77
82
|
} catch {
|
|
78
83
|
// best-effort
|
|
@@ -439,7 +444,7 @@ export async function executeTeamRun(input: ExecuteTeamRunInput): Promise<{ mani
|
|
|
439
444
|
// (NO_PID_HEARTBEAT_STALE_MS). Previously only sub-task runners wrote
|
|
440
445
|
// heartbeats; the team-level run had no heartbeat, so any multi-phase
|
|
441
446
|
// workflow lasting >5min was marked stale and cancelled.
|
|
442
|
-
const stopTeamHeartbeat = startTeamRunHeartbeat(manifest.stateRoot, manifest.runId
|
|
447
|
+
const stopTeamHeartbeat = startTeamRunHeartbeat(manifest.stateRoot, manifest.runId);
|
|
443
448
|
|
|
444
449
|
const cleanupUsage = (): void => {
|
|
445
450
|
for (const task of input.tasks) clearTrackedTaskUsage(task.id);
|
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* zombie-scanner.ts — safely detect orphaned pi-crew sub-agent processes.
|
|
3
|
+
*
|
|
4
|
+
* LESSON (learned the hard way): a heuristic like "old `pi` process + high RSS +
|
|
5
|
+
* orphaned (ppid=1/bash)" will match a user's interactive MAIN session just as
|
|
6
|
+
* readily as a real zombie. The result is a live main session being killed by
|
|
7
|
+
* accident. This module replaces that heuristic with an authoritative signal.
|
|
8
|
+
*
|
|
9
|
+
* Authoritative marker (set by buildPiWorkerArgs on every child-pi spawn):
|
|
10
|
+
* - argv: none — no marker flag is passed, because Pi's strict option parser rejects unknown flags
|
|
11
|
+
* - env: `PI_CREW_KIND=subagent` is the machine-readable signal
|
|
12
|
+
*
|
|
13
|
+
* A process is a "pi-crew sub-agent" ONLY IF it carries `PI_CREW_KIND=subagent`
|
|
14
|
+
* in its environment. The user's main `pi` session NEVER has this var, so it can
|
|
15
|
+
* never be matched here — by construction.
|
|
16
|
+
*
|
|
17
|
+
* A sub-agent is a "zombie" ONLY IF its `PI_CREW_PARENT_PID` points at a PID that
|
|
18
|
+
* is no longer alive (parent crashed/exited without reaping the child). A sub-agent
|
|
19
|
+
* whose parent is still running is NOT a zombie — it's a legitimate in-flight task.
|
|
20
|
+
*
|
|
21
|
+
* This module is READ-ONLY. It never kills anything. The caller (doctor --zombies)
|
|
22
|
+
* prints the list and asks for explicit confirmation before any kill.
|
|
23
|
+
*/
|
|
24
|
+
|
|
25
|
+
import * as fs from "node:fs";
|
|
26
|
+
|
|
27
|
+
export interface ZombieSubagent {
|
|
28
|
+
pid: number;
|
|
29
|
+
ppid: number;
|
|
30
|
+
/** PID recorded in PI_CREW_PARENT_PID (may differ from ppid if re-parented to init/bash). */
|
|
31
|
+
crewParentPid: number;
|
|
32
|
+
/** Whether the recorded crew parent PID is still alive. */
|
|
33
|
+
parentAlive: boolean;
|
|
34
|
+
role: string | undefined;
|
|
35
|
+
rssKb: number;
|
|
36
|
+
elapsedSec: number | undefined;
|
|
37
|
+
cmd: string;
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
export interface ZombieScanResult {
|
|
41
|
+
zombies: ZombieSubagent[];
|
|
42
|
+
/** Sub-agents whose parent is still alive — shown for transparency, never killed. */
|
|
43
|
+
live: ZombieSubagent[];
|
|
44
|
+
/** Errors encountered while scanning (per-pid). Never aborts the whole scan. */
|
|
45
|
+
errors: string[];
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
/** Read /proc/<pid>/environ as a key=value record. Returns {} if unreadable. */
|
|
49
|
+
function readProcEnviron(pid: number): Record<string, string> {
|
|
50
|
+
try {
|
|
51
|
+
// /proc/<pid>/environ is NUL-separated key=value pairs.
|
|
52
|
+
const raw = fs.readFileSync(`/proc/${pid}/environ`, "utf-8");
|
|
53
|
+
const out: Record<string, string> = {};
|
|
54
|
+
for (const entry of raw.split("\0")) {
|
|
55
|
+
const eq = entry.indexOf("=");
|
|
56
|
+
if (eq > 0) out[entry.slice(0, eq)] = entry.slice(eq + 1);
|
|
57
|
+
}
|
|
58
|
+
return out;
|
|
59
|
+
} catch {
|
|
60
|
+
return {};
|
|
61
|
+
}
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
/** Read /proc/<pid>/stat to get ppid + elapsed. Returns undefined if unreadable. */
|
|
65
|
+
function readProcStat(pid: number): { ppid: number; elapsedSec: number | undefined } | undefined {
|
|
66
|
+
try {
|
|
67
|
+
const stat = fs.readFileSync(`/proc/${pid}/stat`, "utf-8");
|
|
68
|
+
// stat format: pid (comm) state ppid ... starttime ...
|
|
69
|
+
// comm may contain spaces/parens, so locate the LAST ')' and parse the fields that follow it.
|
|
70
|
+
const closeParen = stat.lastIndexOf(")");
|
|
71
|
+
if (closeParen < 0) return undefined;
|
|
72
|
+
const rest = stat.slice(closeParen + 2).trim().split(/\s+/);
|
|
73
|
+
// rest[0] = state, rest[1] = ppid
|
|
74
|
+
const ppid = Number.parseInt(rest[1] ?? "", 10);
|
|
75
|
+
// starttime (clock ticks since boot) is field 22 in the full stat → index 19 in `rest`
|
|
76
|
+
const starttimeTicksRaw = Number.parseInt(rest[19] ?? "", 10);
|
|
77
|
+
const starttimeTicks = Number.isFinite(starttimeTicksRaw) ? starttimeTicksRaw : undefined;
|
|
78
|
+
const elapsedSec = computeElapsedSec(starttimeTicks);
|
|
79
|
+
return { ppid: Number.isFinite(ppid) ? ppid : 0, elapsedSec };
|
|
80
|
+
} catch {
|
|
81
|
+
return undefined;
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
function computeElapsedSec(starttimeTicks: number | undefined): number | undefined {
|
|
86
|
+
if (starttimeTicks === undefined || !Number.isFinite(starttimeTicks)) return undefined;
|
|
87
|
+
try {
|
|
88
|
+
// Linux CLK_TCK is virtually always 100 (sysconf(_SC_CLK_TCK)). Reading it
|
|
89
|
+
// portably from Node requires a native addon; hardcoding 100 matches every
|
|
90
|
+
// mainstream Linux distro and keeps this dependency-free.
|
|
91
|
+
const ticksPerSec = 100;
|
|
92
|
+
// /proc/uptime: first field is seconds since boot.
|
|
93
|
+
const uptimeRaw = fs.readFileSync("/proc/uptime", "utf-8");
|
|
94
|
+
const uptimeSec = Number.parseFloat(uptimeRaw.split(" ")[0] ?? "");
|
|
95
|
+
if (!Number.isFinite(uptimeSec)) return undefined;
|
|
96
|
+
// starttime (ticks since boot) → process age in seconds = uptime - starttime/ticksPerSec.
|
|
97
|
+
const startAgeSec = starttimeTicks / ticksPerSec;
|
|
98
|
+
return Math.max(0, uptimeSec - startAgeSec);
|
|
99
|
+
} catch {
|
|
100
|
+
return undefined;
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
/**
 * Report whether a process with the given PID currently exists.
 *
 * Uses the signal-0 probe: `process.kill(pid, 0)` delivers no signal and only
 * performs the existence/permission check.
 *
 * @param pid - PID to probe. Non-finite, zero, or negative values are rejected
 *   up front (zero/negative would address process groups, not a single process).
 * @returns `true` when the process exists — including when it exists but is
 *   owned by another user (EPERM); `false` when it does not exist or the PID
 *   is not a valid positive number.
 */
function isPidAlive(pid: number): boolean {
	if (!Number.isFinite(pid) || pid <= 0) return false;
	try {
		process.kill(pid, 0);
		return true;
	} catch (error: unknown) {
		// EPERM means the process EXISTS but we lack permission to signal it.
		// Treating that as "dead" would misreport a sub-agent with a live parent
		// under another uid as an orphan, so only non-EPERM failures (ESRCH,
		// invalid argument, ...) count as "not alive".
		return (
			typeof error === "object" &&
			error !== null &&
			(error as { code?: unknown }).code === "EPERM"
		);
	}
}
|
|
114
|
+
|
|
115
|
+
function readProcCmdline(pid: number): string {
|
|
116
|
+
try {
|
|
117
|
+
// /proc/<pid>/cmdline is NUL-separated argv.
|
|
118
|
+
const raw = fs.readFileSync(`/proc/${pid}/cmdline`, "utf-8");
|
|
119
|
+
return raw.split("\0").filter(Boolean).join(" ").trim() || `pid ${pid}`;
|
|
120
|
+
} catch {
|
|
121
|
+
return `pid ${pid}`;
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
function readProcRssKb(pid: number): number {
|
|
126
|
+
try {
|
|
127
|
+
const status = fs.readFileSync(`/proc/${pid}/status`, "utf-8");
|
|
128
|
+
const match = status.match(/^VmRSS:\s+(\d+)\s+kB/m);
|
|
129
|
+
return match ? Number.parseInt(match[1] ?? "", 10) : 0;
|
|
130
|
+
} catch {
|
|
131
|
+
return 0;
|
|
132
|
+
}
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
/**
|
|
136
|
+
* Enumerate every numeric PID listed in /proc. No uid or PI_CREW_KIND filtering happens here; scanZombieSubagents applies both.
|
|
137
|
+
*
|
|
138
|
+
* Reads /proc directly (Linux only) — no shelling out to pgrep/ps, so the
|
|
139
|
+
* result is deterministic and unaffected by shell quoting or locale. On
|
|
140
|
+
* non-Linux platforms the scanner returns an empty result with a note in
|
|
141
|
+
* `errors` (zombie detection is best-effort; the doctor report still renders).
|
|
142
|
+
*/
|
|
143
|
+
function listCandidatePids(): number[] {
|
|
144
|
+
if (process.platform !== "linux") return [];
|
|
145
|
+
const pids: number[] = [];
|
|
146
|
+
try {
|
|
147
|
+
for (const entry of fs.readdirSync("/proc")) {
|
|
148
|
+
if (/^\d+$/.test(entry)) pids.push(Number.parseInt(entry, 10));
|
|
149
|
+
}
|
|
150
|
+
} catch {
|
|
151
|
+
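// /proc unreadable (e.g. sandboxed): return the PIDs collected so far (usually none). NOTE(review): scanZombieSubagents does not currently add an errors[] entry for this case.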
|
|
152
|
+
}
|
|
153
|
+
return pids;
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
/**
|
|
157
|
+
* Scan for orphaned pi-crew sub-agent processes. READ-ONLY — never kills.
|
|
158
|
+
*
|
|
159
|
+
* Returns the full picture: zombies (parent dead), live (parent alive), and
|
|
160
|
+
* any scan errors. Callers decide what to do with the result; this module
|
|
161
|
+
* has no side effects.
|
|
162
|
+
*/
|
|
163
|
+
export function scanZombieSubagents(): ZombieScanResult {
|
|
164
|
+
const result: ZombieScanResult = { zombies: [], live: [], errors: [] };
|
|
165
|
+
if (process.platform !== "linux") {
|
|
166
|
+
result.errors.push("zombie scan is Linux-only (/proc required); skipping on " + process.platform);
|
|
167
|
+
return result;
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
const myUid = tryGetUid();
|
|
171
|
+
for (const pid of listCandidatePids()) {
|
|
172
|
+
try {
|
|
173
|
+
// Cheap rejection first: only inspect processes we own (avoid scanning system procs).
|
|
174
|
+
if (myUid !== undefined && getProcUid(pid) !== myUid) continue;
|
|
175
|
+
|
|
176
|
+
const environ = readProcEnviron(pid);
|
|
177
|
+
// AUTHORITATIVE GATE: a process is a pi-crew sub-agent ONLY if it carries
|
|
178
|
+
// PI_CREW_KIND=subagent. The user's main session never sets this, so it can
|
|
179
|
+
// never be matched — this is the fix for accidentally killing main sessions.
|
|
180
|
+
if (environ.PI_CREW_KIND !== "subagent") continue;
|
|
181
|
+
|
|
182
|
+
const crewParentPid = Number.parseInt(environ.PI_CREW_PARENT_PID ?? "", 10);
|
|
183
|
+
const stat = readProcStat(pid);
|
|
184
|
+
const entry: ZombieSubagent = {
|
|
185
|
+
pid,
|
|
186
|
+
ppid: stat?.ppid ?? 0,
|
|
187
|
+
crewParentPid: Number.isFinite(crewParentPid) ? crewParentPid : 0,
|
|
188
|
+
parentAlive: Number.isFinite(crewParentPid) && isPidAlive(crewParentPid),
|
|
189
|
+
role: environ.PI_CREW_ROLE,
|
|
190
|
+
rssKb: readProcRssKb(pid),
|
|
191
|
+
elapsedSec: stat?.elapsedSec,
|
|
192
|
+
cmd: readProcCmdline(pid),
|
|
193
|
+
};
|
|
194
|
+
|
|
195
|
+
if (entry.parentAlive) {
|
|
196
|
+
result.live.push(entry);
|
|
197
|
+
} else {
|
|
198
|
+
result.zombies.push(entry);
|
|
199
|
+
}
|
|
200
|
+
} catch (error) {
|
|
201
|
+
// Race: process may have exited between readdir and read. Don't abort the scan.
|
|
202
|
+
result.errors.push(`pid ${pid}: ${error instanceof Error ? error.message : String(error)}`);
|
|
203
|
+
}
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
// Sort: zombies first by descending RSS (biggest leaks first), live by pid.
|
|
207
|
+
result.zombies.sort((a, b) => b.rssKb - a.rssKb);
|
|
208
|
+
result.live.sort((a, b) => a.pid - b.pid);
|
|
209
|
+
return result;
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
function tryGetUid(): number | undefined {
|
|
213
|
+
try {
|
|
214
|
+
return process.getuid?.();
|
|
215
|
+
} catch {
|
|
216
|
+
return undefined;
|
|
217
|
+
}
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
function getProcUid(pid: number): number | undefined {
|
|
221
|
+
try {
|
|
222
|
+
// /proc/<pid>/status has Uid: <real> <eff> <sav> <fs>
|
|
223
|
+
const status = fs.readFileSync(`/proc/${pid}/status`, "utf-8");
|
|
224
|
+
const match = status.match(/^Uid:\s+(\d+)/m);
|
|
225
|
+
return match ? Number.parseInt(match[1] ?? "", 10) : undefined;
|
|
226
|
+
} catch {
|
|
227
|
+
return undefined;
|
|
228
|
+
}
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
/**
|
|
232
|
+
* Render a ZombieScanResult as human-readable text for the doctor report.
|
|
233
|
+
* Explicitly labels main-session safety and never suggests killing live parents.
|
|
234
|
+
*/
|
|
235
|
+
export function formatZombieReport(scan: ZombieScanResult): string {
|
|
236
|
+
const lines: string[] = [];
|
|
237
|
+
lines.push("## Zombie sub-agent scan (read-only — nothing killed)");
|
|
238
|
+
lines.push("");
|
|
239
|
+
lines.push(
|
|
240
|
+
`Sub-agents identified by PI_CREW_KIND=subagent marker. Main sessions (no marker) are never listed.`,
|
|
241
|
+
);
|
|
242
|
+
lines.push("");
|
|
243
|
+
|
|
244
|
+
if (scan.zombies.length === 0 && scan.live.length === 0) {
|
|
245
|
+
lines.push("No pi-crew sub-agent processes found.");
|
|
246
|
+
if (scan.errors.length > 0) {
|
|
247
|
+
lines.push("");
|
|
248
|
+
lines.push(`Scan notes (${scan.errors.length}):`);
|
|
249
|
+
for (const err of scan.errors.slice(0, 5)) lines.push(` - ${err}`);
|
|
250
|
+
}
|
|
251
|
+
return lines.join("\n");
|
|
252
|
+
}
|
|
253
|
+
|
|
254
|
+
if (scan.zombies.length > 0) {
|
|
255
|
+
lines.push(`### Zombies — parent dead (${scan.zombies.length})`);
|
|
256
|
+
lines.push("These sub-agents are orphaned. Safe to kill after review:");
|
|
257
|
+
lines.push("");
|
|
258
|
+
lines.push(" PID PARENT RSS ROLE CMD");
|
|
259
|
+
for (const z of scan.zombies) {
|
|
260
|
+
lines.push(
|
|
261
|
+
` ${String(z.pid).padEnd(9)}${String(z.crewParentPid).padEnd(8)}${formatRss(z.rssKb).padEnd(10)}${(z.role ?? "?").padEnd(14)}${z.cmd.slice(0, 60)}`,
|
|
262
|
+
);
|
|
263
|
+
}
|
|
264
|
+
lines.push("");
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
if (scan.live.length > 0) {
|
|
268
|
+
lines.push(`### Live — parent still running (${scan.live.length})`);
|
|
269
|
+
lines.push("NOT zombies. Do not kill (parent PID is alive and may still reap them).");
|
|
270
|
+
lines.push("");
|
|
271
|
+
lines.push(" PID PARENT RSS ROLE CMD");
|
|
272
|
+
for (const l of scan.live) {
|
|
273
|
+
lines.push(
|
|
274
|
+
` ${String(l.pid).padEnd(9)}${String(l.crewParentPid).padEnd(8)}${formatRss(l.rssKb).padEnd(10)}${(l.role ?? "?").padEnd(14)}${l.cmd.slice(0, 60)}`,
|
|
275
|
+
);
|
|
276
|
+
}
|
|
277
|
+
lines.push("");
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
if (scan.errors.length > 0) {
|
|
281
|
+
lines.push(`Scan errors (${scan.errors.length}, first 5 shown):`);
|
|
282
|
+
for (const err of scan.errors.slice(0, 5)) lines.push(` - ${err}`);
|
|
283
|
+
lines.push("");
|
|
284
|
+
}
|
|
285
|
+
|
|
286
|
+
lines.push("To kill a zombie: `kill <PID>` (the OS will reap it). This tool never kills.");
|
|
287
|
+
return lines.join("\n");
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
/**
 * Format a resident-set size given in kB as a short human-readable string.
 *
 * - below 1024 kB: the raw value with a `K` suffix
 * - below 1024 MB after rounding: whole megabytes with an `M` suffix
 * - otherwise: gigabytes with one decimal place and a `G` suffix
 *
 * The MB value is rounded BEFORE choosing the unit, so sizes just under 1 GiB
 * are promoted to `1.0G` instead of rendering as the out-of-range `1024M`.
 *
 * @param kb - size in kilobytes
 * @returns the formatted size, e.g. `512K`, `200M`, `1.5G`
 */
function formatRss(kb: number): string {
	const kbPerMb = 1024;
	const mbPerGb = 1024;
	// Negated comparison keeps NaN on the raw-K path.
	if (!(kb >= kbPerMb)) return `${kb}K`;
	const roundedMb = Math.round(kb / kbPerMb);
	if (roundedMb < mbPerGb) return `${roundedMb}M`;
	return `${(kb / kbPerMb / mbPerGb).toFixed(1)}G`;
}
|
|
295
|
+
|
|
296
|
+
// Re-export for tests + callers that want to inspect proc helpers in isolation.
|
|
297
|
+
export const __test = { readProcEnviron, isPidAlive, computeElapsedSec };
|
|
@@ -289,6 +289,26 @@ export const TeamToolParams = Type.Object({
|
|
|
289
289
|
},
|
|
290
290
|
),
|
|
291
291
|
),
|
|
292
|
+
tokenBudget: Type.Optional(
|
|
293
|
+
Type.Number({
|
|
294
|
+
description:
|
|
295
|
+
"Per-workflow token budget for dynamic-workflow runs. When set, ctx.agent() auto-rejects with ok:false once exhausted. Accumulated from each agent run's reported usage. Overrides workflow.maxTokenBudget.",
|
|
296
|
+
minimum: 0,
|
|
297
|
+
}),
|
|
298
|
+
),
|
|
299
|
+
args: Type.Optional(
|
|
300
|
+
// round-14 P1-5: typed workflow arguments. Type.Any() generates an empty {} schema
|
|
301
|
+
// (matches any JSON value) which is strict-provider friendly — no array type union.
|
|
302
|
+
// Description lives in the JSDoc / TeamToolParamsValue below to avoid the
|
|
303
|
+
// "description-only schema" strict-provider check.
|
|
304
|
+
Type.Any(),
|
|
305
|
+
),
|
|
306
|
+
focus: Type.Optional(
|
|
307
|
+
Type.String({
|
|
308
|
+
description:
|
|
309
|
+
"Sub-focus for the doctor action. 'zombies' runs a READ-ONLY scan for orphaned pi-crew sub-agent processes (identified by PI_CREW_KIND=subagent); it never kills and never matches the user's interactive main session.",
|
|
310
|
+
}),
|
|
311
|
+
),
|
|
292
312
|
});
|
|
293
313
|
|
|
294
314
|
export interface TeamToolParamsValue {
|
|
@@ -365,6 +385,10 @@ export interface TeamToolParamsValue {
|
|
|
365
385
|
skill?: string | string[] | boolean;
|
|
366
386
|
scope?: "user" | "project" | "both";
|
|
367
387
|
config?: Record<string, unknown>;
|
|
388
|
+
/** Sub-focus for the `doctor` action. `"zombies"` runs a READ-ONLY scan for
|
|
389
|
+
* orphaned pi-crew sub-agent processes (identified by PI_CREW_KIND=subagent);
|
|
390
|
+
* it never kills and never matches the user's interactive main session. */
|
|
391
|
+
focus?: string;
|
|
368
392
|
dryRun?: boolean;
|
|
369
393
|
confirm?: boolean;
|
|
370
394
|
force?: boolean;
|
|
@@ -393,4 +417,8 @@ export interface TeamToolParamsValue {
|
|
|
393
417
|
budgetAbort?: number;
|
|
394
418
|
/** Background dispatch discriminator. Default "team-run". "goal-loop"/"dynamic-workflow" dispatch to their runners (P0/P2). */
|
|
395
419
|
runKind?: "team-run" | "goal-loop" | "dynamic-workflow";
|
|
420
|
+
/** Per-workflow token budget for dynamic-workflow runs (round-14 P1-2). */
|
|
421
|
+
tokenBudget?: number;
|
|
422
|
+
/** Typed workflow arguments for .dwf.ts scripts, accessible via ctx.args<T>() (round-14 P1-5). */
|
|
423
|
+
args?: unknown;
|
|
396
424
|
}
|
package/src/state/contracts.ts
CHANGED
|
@@ -11,7 +11,9 @@ let pathsInstance: typeof import("../utils/paths.js") | null = null;
|
|
|
11
11
|
|
|
12
12
|
async function getStore() {
|
|
13
13
|
if (!storeInstance) {
|
|
14
|
+
// LAZY: defer dynamic import of ./instinct-store.js to its call site.
|
|
14
15
|
const { InstinctStore } = await import("./instinct-store.js");
|
|
16
|
+
// LAZY: defer dynamic import of ../utils/paths.js to its call site.
|
|
15
17
|
const paths = await import("../utils/paths.js");
|
|
16
18
|
storeInstance = new InstinctStore(paths.projectCrewRoot(process.cwd()));
|
|
17
19
|
}
|
|
@@ -20,6 +22,7 @@ async function getStore() {
|
|
|
20
22
|
|
|
21
23
|
async function getPaths() {
|
|
22
24
|
if (!pathsInstance) {
|
|
25
|
+
// LAZY: defer dynamic import of ../utils/paths.js to its call site.
|
|
23
26
|
pathsInstance = await import("../utils/paths.js");
|
|
24
27
|
}
|
|
25
28
|
return pathsInstance;
|
package/src/state/state-store.ts
CHANGED
|
@@ -228,6 +228,8 @@ export function createRunManifest(params: {
|
|
|
228
228
|
workspaceMode?: "single" | "worktree";
|
|
229
229
|
ownerSessionId?: string;
|
|
230
230
|
runKind?: "team-run" | "goal-loop" | "dynamic-workflow";
|
|
231
|
+
/** round-14 P1-5: typed workflow arguments for .dwf.ts scripts (ctx.args<T>()). */
|
|
232
|
+
args?: unknown;
|
|
231
233
|
}): { manifest: TeamRunManifest; tasks: TeamTaskState[]; paths: RunPaths } {
|
|
232
234
|
const paths = createRunPaths(params.cwd);
|
|
233
235
|
const now = new Date().toISOString();
|
|
@@ -251,6 +253,7 @@ export function createRunManifest(params: {
|
|
|
251
253
|
artifacts: [],
|
|
252
254
|
...(params.ownerSessionId ? { ownerSessionId: params.ownerSessionId } : {}),
|
|
253
255
|
runKind: params.runKind ?? "team-run",
|
|
256
|
+
...(params.args !== undefined ? { args: params.args } : {}),
|
|
254
257
|
};
|
|
255
258
|
fs.mkdirSync(paths.stateRoot, { recursive: true });
|
|
256
259
|
fs.mkdirSync(paths.artifactsRoot, { recursive: true });
|
package/src/state/types.ts
CHANGED
|
@@ -116,6 +116,13 @@ export interface WorkerExitStatus {
|
|
|
116
116
|
signal?: string;
|
|
117
117
|
cleanupErrors: string[];
|
|
118
118
|
finalDrainMs: number;
|
|
119
|
+
/** Phase-0 diagnostic (HB-003a): final-drain race state for the exit-null
|
|
120
|
+
* disableTools bug. Optional + read-only — absent when no drain timer was
|
|
121
|
+
* ever armed. Phase 1 will use `finalDrainArmed` to decide whether a
|
|
122
|
+
* signal-death (exitCode=null) should be treated as a forced final drain. */
|
|
123
|
+
finalDrainArmed?: boolean;
|
|
124
|
+
forcedFinalDrain?: boolean;
|
|
125
|
+
finalDrainFiredMonotonicMs?: number;
|
|
119
126
|
}
|
|
120
127
|
|
|
121
128
|
export interface OperationTerminalEvidence {
|
|
@@ -185,6 +192,8 @@ export interface TeamRunManifest {
|
|
|
185
192
|
runConfig?: unknown;
|
|
186
193
|
/** Background dispatch discriminator. Default "team-run" runs executeTeamRun; "goal-loop" / "dynamic-workflow" dispatch to their respective runners. Absent = "team-run" for backward compatibility. */
|
|
187
194
|
runKind?: "team-run" | "goal-loop" | "dynamic-workflow";
|
|
195
|
+
/** round-14 P1-5: typed workflow arguments accessible in .dwf.ts scripts via ctx.args<T>(). Any JSON value; default {} when unset. */
|
|
196
|
+
args?: unknown;
|
|
188
197
|
summary?: string;
|
|
189
198
|
policyDecisions?: PolicyDecision[];
|
|
190
199
|
}
|