npm - pi-crew - Versions diffs - 0.9.4 → 0.9.7 - Mend

pi-crew 0.9.4 → 0.9.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

package/CHANGELOG.md +592 -0
package/README.md +55 -3
package/docs/HARNESS_BACKLOG.md +51 -3
package/docs/dynamic-workflows.md +315 -2
package/docs/fix-plan-disabletools-exit-null.md +219 -0
package/docs/troubleshooting.md +102 -0
package/package.json +8 -2
package/src/extension/command-completions.ts +1 -0
package/src/extension/crew-shortcuts.ts +1 -0
package/src/extension/register.ts +2 -0
package/src/extension/registration/commands.ts +3 -0
package/src/extension/team-tool/doctor.ts +14 -0
package/src/extension/team-tool/goal.ts +1 -0
package/src/extension/team-tool/run.ts +4 -0
package/src/runtime/background-runner.ts +24 -2
package/src/runtime/chain-runner.ts +1 -0
package/src/runtime/child-pi.ts +101 -10
package/src/runtime/crash-recovery.ts +78 -36
package/src/runtime/deterministic-ast.ts +161 -0
package/src/runtime/dwf-state-store.ts +97 -0
package/src/runtime/dynamic-workflow-context.ts +381 -7
package/src/runtime/dynamic-workflow-runner.ts +94 -2
package/src/runtime/goal-loop-runner.ts +2 -0
package/src/runtime/live-session-runtime.ts +1 -0
package/src/runtime/model-scope.ts +1 -0
package/src/runtime/peer-dep.ts +1 -0
package/src/runtime/pi-args.ts +11 -0
package/src/runtime/resilient-edit.ts +1 -0
package/src/runtime/result-extractor.ts +72 -7
package/src/runtime/task-runner.ts +1 -0
package/src/runtime/team-runner.ts +8 -3
package/src/runtime/zombie-scanner.ts +297 -0
package/src/schema/team-tool-schema.ts +28 -0
package/src/state/contracts.ts +1 -0
package/src/state/hook-instinct-bridge.ts +3 -0
package/src/state/state-store.ts +3 -0
package/src/state/types.ts +9 -0
package/src/ui/dashboard-panes/progress-pane.ts +5 -0
package/src/ui/dwf-phase-display.ts +151 -0
package/src/ui/run-snapshot-cache.ts +4 -0
package/src/ui/snapshot-types.ts +3 -0
package/src/utils/bm25-search.ts +2 -0
package/src/workflows/workflow-config.ts +3 -0
package/src/worktree/worktree-manager.ts +94 -0
package/types/dwf.d.ts +187 -0

package/src/runtime/crash-recovery.ts CHANGED Viewed

@@ -1,10 +1,11 @@
 import type { ExtensionContext } from "@earendil-works/pi-coding-agent";
 import * as fs from "node:fs";
+import * as path from "node:path";
 import type { MetricRegistry } from "../observability/metric-registry.ts";
 import { appendEvent, scanSequence } from "../state/event-log.ts";
 import { recordFromTask, upsertCrewAgent } from "./crew-agent-records.ts";
 import { withRunLockSync } from "../state/locks.ts";
-import { loadRunManifestById, saveRunManifest, saveRunTasks, updateRunStatus } from "../state/state-store.ts";
+import { loadRunManifestById, saveRunTasks, updateRunStatus } from "../state/state-store.ts";
 import type { TeamTaskState } from "../state/types.ts";
 import { isWorkerHeartbeatStale } from "./worker-heartbeat.ts";
 import type { ManifestCache } from "./manifest-cache.ts";
@@ -215,6 +216,43 @@ function tryRemoveRunDirectories(entry: { stateRoot: string; cwd: string }): voi
 	// NOTE: artifactsRoot is shared across runs and cleaned up by pruneFinishedRuns/pruneUserLevelRuns — not deleted here.
 }
+/**
+ * Age (ms) of the team-level heartbeat file for a run. The team-runner writes
+ * `<stateRoot>/heartbeat.json` periodically while a workflow is executing
+ * (startTeamHeartbeat), so a fresh heartbeat is strong evidence the run is alive
+ * even when its recorded PID check is inconclusive or its active-run-index
+ * entry's `updatedAt` was frozen at registration. Only the file mtime is read (never its contents), and the result is negative when that mtime is later than `now`. Returns Infinity when the file is absent, cannot be read by statSync, or reports a non-finite mtime.
+ */
+function heartbeatAgeMs(entry: { stateRoot: string }, now: number): number {
+	try {
+		const mtime = fs.statSync(path.join(entry.stateRoot, "heartbeat.json")).mtimeMs;
+		return Number.isFinite(mtime) ? now - mtime : Infinity;
+	} catch { // statSync throws when the file is missing or inaccessible; that is reported as no heartbeat
+		return Infinity;
+	}
+}
+/**
+ * True if there is recent evidence the run is (or was very recently) alive, so
+ * it must NOT be purged. Any one of these signals is sufficient:
+ *   - on-disk `manifest.updatedAt` no older than `staleThresholdMs` (rewritten on
+ *     every task transition / status change), or
+ *   - team-level `heartbeat.json` no older than `staleThresholdMs`.
+ * `entry.updatedAt` is intentionally NOT consulted: it is frozen at
+ * registration and never refreshed during execution, which previously caused
+ * long-running legitimate runs to be falsely purged — destroying their
+ * stateRoot, and because saveRunTasks() silently no-ops once the state dir is
+ * gone, hanging the workflow permanently at the current task with no
+ * recoverable state ("Run not found"). Both comparisons are inclusive (age <= threshold), and a timestamp later than `now` (negative age) also counts as recent.
+ */
+function hasRecentLifeEvidence(entry: { stateRoot: string }, manifestUpdatedAt: string | undefined, now: number, staleThresholdMs: number): boolean {
+	const manifestMs = manifestUpdatedAt ? new Date(manifestUpdatedAt).getTime() : NaN; // NaN when updatedAt is absent or empty; an unparseable date string also yields NaN
+	if (Number.isFinite(manifestMs) && now - manifestMs <= staleThresholdMs) return true; // signal 1: manifest rewritten recently
+	const hbAge = heartbeatAgeMs(entry, now); // Infinity when there is no usable heartbeat file
+	if (Number.isFinite(hbAge) && hbAge <= staleThresholdMs) return true; // signal 2: heartbeat file touched recently
+	return false;
+}
 /**
  * Purge the global active-run-index of entries whose manifest is no longer active.
  *
@@ -244,7 +282,7 @@ export function purgeStaleActiveRunIndex(staleThresholdMs = 300_000, now = Date.
 		}
 		// 3. Read manifest status
-		let manifest: { status?: string; async?: { pid?: number }; ownerSessionId?: string } | undefined;
+		let manifest: { status?: string; updatedAt?: string; async?: { pid?: number }; ownerSessionId?: string } | undefined;
 		try {
 			manifest = JSON.parse(fs.readFileSync(entry.manifestPath, "utf-8"));
 		} catch {
@@ -262,46 +300,52 @@ export function purgeStaleActiveRunIndex(staleThresholdMs = 300_000, now = Date.
 			continue;
 		}
-		// 5. Still "running" — check if worker PID is dead and no heartbeat
+		// 5. Still "running" with an async worker PID — only purge when the worker
+		// is actually dead AND there is no recent evidence of life. We must NOT
+		// rely solely on `entry.updatedAt` (frozen at registration) nor on a single
+		// dead-PID reading: a long-running worker (e.g. a 15-minute explorer)
+		// legitimately keeps the run "running" while periodically rewriting the
+		// on-disk manifest.updatedAt and heartbeat.json. Falsely purging such a run
+		// destroys its stateRoot, and because saveRunTasks() silently no-ops once
+		// the state dir is gone, the workflow then hangs permanently at the
+		// current task with no recoverable state ("Run not found"). When we do mark
+		// a run cancelled here, we KEEP its stateRoot so the run stays queryable/
+		// resumable and its diagnostics survive; the finished-run pruner removes
+		// the directory later on its normal schedule.
 		if (manifest?.status === "running" && manifest.async?.pid !== undefined) {
 			const pidAlive = checkProcessLiveness(manifest.async.pid).alive;
-			if (!pidAlive) {
-				// Check age — if manifest hasn't been updated in > threshold, it's stale
-				const updatedAt = new Date(entry.updatedAt).getTime();
-				if (Number.isFinite(updatedAt) && now - updatedAt > staleThresholdMs) {
-					// Dead PID + stale update → cancel the manifest and unregister
-					try {
-						const fullLoaded = loadRunManifestById(entry.cwd, entry.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
-						if (fullLoaded) {
-							const now_iso = new Date(now).toISOString();
-							const repairedTasks = fullLoaded.tasks.map((task) => {
-								if (task.status === "running" || task.status === "queued" || task.status === "waiting") {
-									return { ...task, status: "cancelled" as const, finishedAt: now_iso, error: "Orphaned run: worker process dead and no recent activity" };
-								}
-								return task;
-							});
-							saveRunTasks(fullLoaded.manifest, repairedTasks);
-							for (const task of repairedTasks) { try { upsertCrewAgent(fullLoaded.manifest, recordFromTask(fullLoaded.manifest, task, "scaffold")); } catch { /* non-critical */ } }
-							updateRunStatus(fullLoaded.manifest, "cancelled", "Orphaned run: worker process dead and no recent activity");
-							saveRunManifest(fullLoaded.manifest);
-							void terminateLiveAgentsForRun(fullLoaded.manifest.runId, "cancelled", appendEvent, fullLoaded.manifest.eventsPath).catch((error) => logInternalError("crash-recovery.pid-dead.terminate", error, `runId=${fullLoaded.manifest.runId}`));
-						}
-					} catch {
-						// Best-effort manifest cleanup
+			if (!pidAlive && !hasRecentLifeEvidence(entry, manifest.updatedAt, now, staleThresholdMs)) {
+				// Dead PID + no recent life evidence → cancel the manifest and unregister
+				try {
+					const fullLoaded = loadRunManifestById(entry.cwd, entry.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
+					if (fullLoaded) {
+						const now_iso = new Date(now).toISOString();
+						const repairedTasks = fullLoaded.tasks.map((task) => {
+							if (task.status === "running" || task.status === "queued" || task.status === "waiting") {
+								return { ...task, status: "cancelled" as const, finishedAt: now_iso, error: "Orphaned run: worker process dead and no recent activity" };
+							}
+							return task;
+						});
+						saveRunTasks(fullLoaded.manifest, repairedTasks);
+						for (const task of repairedTasks) { try { upsertCrewAgent(fullLoaded.manifest, recordFromTask(fullLoaded.manifest, task, "scaffold")); } catch { /* non-critical */ } }
+						updateRunStatus(fullLoaded.manifest, "cancelled", "Orphaned run: worker process dead and no recent activity");
+						void terminateLiveAgentsForRun(fullLoaded.manifest.runId, "cancelled", appendEvent, fullLoaded.manifest.eventsPath).catch((error) => logInternalError("crash-recovery.pid-dead.terminate", error, `runId=${fullLoaded.manifest.runId}`));
 					}
-					unregisterActiveRun(entry.runId);
-					tryRemoveRunDirectories(entry);
-					purged.push(entry.runId);
-					continue;
+				} catch {
+					// Best-effort manifest cleanup
 				}
+				unregisterActiveRun(entry.runId);
+				purged.push(entry.runId);
+				continue;
 			}
 		}
-		// 6. "running" but no async worker PID — possible orphaned run where manifest
-		// was never updated after worker exit. Check updatedAt age.
+		// 6. "running" but no async worker PID — possible orphaned run where the
+		// manifest was never updated to a terminal status after the worker exited.
+		// Uses the same life-evidence corroboration as condition 5; the stateRoot is
+		// kept on cancel so the run stays queryable/resumable with diagnostics.
 		if (manifest?.status === "running" && manifest.async === undefined) {
-			const updatedAt = new Date(entry.updatedAt).getTime();
-			if (Number.isFinite(updatedAt) && now - updatedAt > staleThresholdMs) {
+			if (!hasRecentLifeEvidence(entry, manifest.updatedAt, now, staleThresholdMs)) {
 				try {
 					const fullLoaded = loadRunManifestById(entry.cwd, entry.runId); // NOTE: no withRunLock - best-effort only; concurrent writes may cause inconsistency
 					if (fullLoaded && fullLoaded.manifest.status === "running") {
@@ -315,14 +359,12 @@ export function purgeStaleActiveRunIndex(staleThresholdMs = 300_000, now = Date.
 						saveRunTasks(fullLoaded.manifest, repairedTasks);
 						for (const task of repairedTasks) { try { upsertCrewAgent(fullLoaded.manifest, recordFromTask(fullLoaded.manifest, task, "scaffold")); } catch { /* non-critical */ } }
 						updateRunStatus(fullLoaded.manifest, "cancelled", "Orphaned run: no async worker and no manifest update in over " + Math.round(staleThresholdMs / 60000) + " minutes");
-						saveRunManifest(fullLoaded.manifest);
 						void terminateLiveAgentsForRun(fullLoaded.manifest.runId, "cancelled", appendEvent, fullLoaded.manifest.eventsPath).catch((error) => logInternalError("crash-recovery.pid-dead.terminate", error, `runId=${fullLoaded.manifest.runId}`));
 					}
 				} catch {
 					// Best-effort
 				}
 				unregisterActiveRun(entry.runId);
-				tryRemoveRunDirectories(entry);
 				purged.push(entry.runId);
 				continue;
 			}

package/src/runtime/deterministic-ast.ts ADDED Viewed

@@ -0,0 +1,161 @@
+/**
+ * deterministic-ast.ts — AST-based determinism enforcement for dynamic-workflow scripts (round-13 P0-2).
+ *
+ * Rejects `Date.now()`, `Math.random()`, and `new Date()` at workflow-load time
+ * using a true AST walk (not regex) so that:
+ *   - Prompts mentioning "Date.now()" as string literals are accepted.
+ *   - Comments containing "Date.now()" are accepted.
+ *   - `Date.parse()`, `Date.UTC()`, `Math.floor()`, etc. are accepted (only `Date.now`, `Math.random`, and `new Date(...)` are blocked).
+ *
+ * Adapted from pi-dynamic-workflows/src/workflow.ts (MIT) — see NOTICE.md.
+ *
+ * The walker uses acorn's parse() with permissive flags (allowAwaitOutsideFunction,
+ * allowReturnOutsideFunction) so we don't reject perfectly valid workflow scripts
+ * that contain top-level `await` or `return`.
+ *
+ * On parse error, assertDeterministicScript() returns silently: jiti will surface a clearer
+ * parse error downstream. We don't double-report parse errors.
+ */
+import { parse } from "acorn";
+const NONDETERMINISM_ERROR = // fixed message carried by every DeterminismError; it also names the env-var escape hatch
+	"Workflow scripts must be deterministic: Date.now()/Math.random()/new Date() are unavailable. These introduce non-reproducible behavior across runs. Use ctx.vars for cached state, or pass a fixed seed via ctx.setArgs(). To bypass this check (escape hatch), set PI_CREW_DWF_SKIP_DETERMINISM_CHECK=1.";
+export class DeterminismError extends Error { // thrown by assertDeterministicAst for the first banned construct it meets
+	constructor() { // takes no arguments: the message is always NONDETERMINISM_ERROR
+		super(NONDETERMINISM_ERROR);
+		this.name = "DeterminismError"; // set explicitly so error.name reports this class instead of the inherited Error
+	}
+}
+/**
+ * Parse `script` with acorn (module source type, latest ecmaVersion) and walk the AST looking for non-deterministic calls.
+ * Throws DeterminismError on the first hit. Silently returns on parse error
+ * (jiti will produce a clearer message downstream). This function does not consult isDeterminismCheckEnabled() itself; presumably the caller applies that gate (verify at the call site). NOTE(review): acorn parses JavaScript only, so a script using TypeScript-only syntax would fail to parse here and skip the check without any signal — confirm whether such scripts are possible.
+ */
+export function assertDeterministicScript(script: string): void {
+	let ast: AstNode;
+	try {
+		ast = parse(script, {
+			ecmaVersion: "latest",
+			sourceType: "module",
+			allowAwaitOutsideFunction: true, // accept top-level await
+			allowReturnOutsideFunction: true, // accept top-level return
+			ranges: false,
+		}) as unknown as AstNode; // acorn result is viewed through the minimal local AstNode shape
+	} catch {
+		// Parse errors are handled by jiti downstream — don't double-report.
+		return;
+	}
+	assertDeterministicAst(ast);
+}
+/**
+ * Escape hatch: when PI_CREW_DWF_SKIP_DETERMINISM_CHECK=1 the check is bypassed.
+ * Power users may need this when a workflow legitimately depends on time/random
+ * (e.g. randomized benchmark scripts). Returns false only when the variable is exactly the string 1; unset or any other value keeps the check enabled. The environment is read on every call.
+ */
+export function isDeterminismCheckEnabled(): boolean {
+	return process.env.PI_CREW_DWF_SKIP_DETERMINISM_CHECK !== "1";
+}
+// ---------------------------------------------------------------------------
+// AST walker
+// ---------------------------------------------------------------------------
+interface AstNode { // minimal structural view of a parsed node: only the type tag is relied on
+	type: string; // node kind tag, compared against names such as CallExpression and MemberExpression below
+	[key: string]: unknown; // every other field is untyped and must be narrowed before use
+}
+function asAstNode(value: unknown): AstNode | undefined { // narrows an arbitrary value to AstNode, or undefined when it is not node-like
+	if (!value || typeof value !== "object") return undefined; // rejects null, undefined and all primitives
+	const obj = value as Record<string, unknown>;
+	if (typeof obj.type !== "string") return undefined; // node-like means: an object with a string type field (arrays fail this test)
+	return obj as AstNode;
+}
+function astChildren(node: AstNode): AstNode[] { // collects the direct child nodes of node, in Object.values order
+	const out: AstNode[] = [];
+	for (const value of Object.values(node)) {
+		if (Array.isArray(value)) { // array-valued field: keep each node-like item; nested arrays are not descended into
+			for (const item of value) {
+				const child = asAstNode(item);
+				if (child) out.push(child);
+			}
+		} else { // scalar field: keep it only when it is itself node-like; primitives and plain objects are skipped
+			const child = asAstNode(value);
+			if (child) out.push(child);
+		}
+	}
+	return out;
+}
+function assertDeterministicAst(node: AstNode): void { // pre-order recursive walk; throws DeterminismError at the first banned node, returns normally otherwise
+	if (isDateNowCall(node) || isMathRandomCall(node) || isNewDateExpression(node)) {
+		throw new DeterminismError();
+	}
+	for (const child of astChildren(node)) assertDeterministicAst(child); // recursion depth follows the nesting depth of the AST
+}
+function isDateNowCall(node: AstNode): boolean { // true for a call whose callee is the member expression Date.now (dot form or static-string bracket form)
+	return node.type === "CallExpression" && isMemberExpression(node, "callee", "Date", "now");
+}
+function isMathRandomCall(node: AstNode): boolean { // true for a call whose callee is the member expression Math.random (dot form or static-string bracket form)
+	return node.type === "CallExpression" && isMemberExpression(node, "callee", "Math", "random");
+}
+function isNewDateExpression(node: AstNode): boolean { // true for any construction through the bare identifier Date; arguments are not inspected, so fixed dates such as new Date(0) match too
+	if (node.type !== "NewExpression") return false;
+	const callee = asAstNode(node.callee);
+	return callee?.type === "Identifier" && callee.name === "Date"; // a member callee (for example one reached through globalThis) is not an Identifier and does not match
+}
+/**
+ * Test whether `node[childKey]` is a MemberExpression of shape `objectName.propertyName`,
+ * where the property is either a static Identifier or a resolvable static string.
+ * `childKey` is the property name on `node` (usually "callee" for CallExpression). The object must be the bare Identifier `objectName`, so access through an alias or through `globalThis` is not matched.
+ */
+function isMemberExpression(node: AstNode, childKey: string, objectName: string, propertyName: string): boolean {
+	const child = asAstNode(node[childKey]);
+	if (!child || child.type !== "MemberExpression") return false;
+	const object = asAstNode(child.object);
+	if (!object || object.type !== "Identifier" || object.name !== objectName) return false; // only a direct reference by name qualifies
+	return propertyNameOf(child) === propertyName;
+}
+function propertyNameOf(node: AstNode): string | undefined { // statically known property name of a MemberExpression node, or undefined when it cannot be resolved
+	const computed = node.computed === true; // true for bracket access, false for dot access
+	const property = asAstNode(node.property);
+	if (!property) return undefined;
+	if (!computed && property.type === "Identifier") { // dot access: the identifier text is the property name
+		return property.name as string | undefined;
+	}
+	return staticStringOf(property); // bracket access resolves only for static strings; a variable key yields undefined
+}
+function staticStringOf(node: AstNode | undefined): string | undefined { // constant-folds a node to a string when possible: string literal, interpolation-free template, or + of two foldable operands
+	if (!node) return undefined;
+	if (node.type === "Literal" && typeof node.value === "string") return node.value; // non-string literals (numbers, regex, null) fall through to undefined
+	if (node.type === "TemplateLiteral") {
+		const expressions = node.expressions;
+		if (Array.isArray(expressions) && expressions.length > 0) return undefined; // any interpolation makes the value dynamic
+		const quasis = node.quasis;
+		if (!Array.isArray(quasis)) return undefined;
+		return quasis
+			.map((q) => {
+				const quasi = asAstNode(q);
+				const value = quasi?.value as { cooked?: string; raw?: string } | undefined;
+				return value?.cooked ?? value?.raw ?? ""; // prefer the cooked text, then raw, then empty
+			})
+			.join("");
+	}
+	if (node.type === "BinaryExpression" && node.operator === "+") { // concatenation folds only when both sides fold, recursively
+		const left = staticStringOf(asAstNode(node.left));
+		const right = staticStringOf(asAstNode(node.right));
+		if (left !== undefined && right !== undefined) return left + right;
+	}
+	return undefined; // anything else is treated as not statically resolvable
+}

package/src/runtime/dwf-state-store.ts ADDED Viewed

@@ -0,0 +1,97 @@
+/**
+ * dwf-state-store.ts — Persistent checkpoint state for dynamic-workflow runs (P2-3, round-18).
+ *
+ * Modeled on GoalStore (goal-state-store.ts) and FileCheckpointStore (checkpoint.ts),
+ * but scoped to a single run's stateRoot (which is already <crewRoot>/state/runs/<runId>).
+ *
+ * Stores DwfCheckpointState as atomic JSON at <stateRoot>/dwf-checkpoint.json.
+ * atomicWriteJson (temp + rename + fsync) guarantees either the old or the new file,
+ * never a partial write — safe across crashes.
+ *
+ * Resume semantics (round-18): the runner loads a checkpoint on run start and hydrates
+ * ctx.vars/phases/logs from it; on clean completion the runner deletes it. A missing or
+ * corrupt checkpoint is treated as a fresh run (load() returns undefined). If a crash
+ * happens mid-agent, that agent simply re-runs from scratch on resume — agent results
+ * are expected to be idempotent-ish.
+ */
+import { mkdirSync, existsSync, readFileSync, unlinkSync } from "node:fs";
+import { dirname } from "node:path";
+import { atomicWriteJson } from "../state/atomic-write.ts";
+import { logInternalError } from "../utils/internal-error.ts";
+export interface DwfCheckpointState { // JSON shape stored in dwf-checkpoint.json; DwfStore.load() validates only runId
+	runId: string; // must be a string for load() to accept the file
+	vars: Record<string, unknown>; // presumably the ctx.vars snapshot restored on resume, per the file header — confirm in the runner
+	phases: string[];
+	currentPhase: string | undefined;
+	logs: string[]; // capped copy (≤1000); the events log (dwf.log) is the durable source of truth
+	spent: number; // budget accumulator (round-14 P1-2)
+	agentCount: number;
+	updatedAt: string; // overwritten by DwfStore.save() with the current ISO timestamp
+}
+/**
+ * DwfStore — atomic CRUD for a single run's DWF checkpoint.
+ *
+ * Concurrency: writes are atomic (atomicWriteJson). The DWF runner is the sole
+ * writer during a run; `team resume` loads the checkpoint read-only before the
+ * script re-executes. No file-lock is needed here because only one runner owns a
+ * run's stateRoot at a time (run locks protect manifest transitions elsewhere).
+ *
+ * Note: the constructor takes the run's stateRoot directly (NOT cwd + runId) to
+ * avoid a double-nesting bug — stateRoot is already <crewRoot>/state/runs/<runId>,
+ * so the checkpoint lands at <crewRoot>/state/runs/<runId>/dwf-checkpoint.json.
+ * This mirrors FileCheckpointStore (checkpoint.ts: constructor(stateRoot)).
+ */
+export class DwfStore {
+	private readonly stateRoot: string;
+	constructor(stateRoot: string) {
+		this.stateRoot = stateRoot;
+	}
+	private get path(): string { // checkpoint file location, built with a literal forward slash instead of path.join
+		return `${this.stateRoot}/dwf-checkpoint.json`;
+	}
+	/** Load the checkpoint for this run's stateRoot. Returns undefined if missing or corrupt (fresh run). Only `runId` is validated; all other fields are returned as parsed, unchecked. */
+	load(): DwfCheckpointState | undefined {
+		const path = this.path;
+		try {
+			if (!existsSync(path)) return undefined;
+			const raw = readFileSync(path, "utf-8");
+			const parsed = JSON.parse(raw);
+			// Corrupt-guard: a valid checkpoint must be an object with a string runId
+			// (mirrors GoalStore.load's typeof parsed.goalId !== "string" check).
+			if (!parsed || typeof parsed !== "object" || typeof parsed.runId !== "string") return undefined;
+			return parsed as DwfCheckpointState;
+		} catch { // read or JSON.parse failure is swallowed and reported as no checkpoint
+			return undefined;
+		}
+	}
+	/** Atomically persist a checkpoint state. Stamps `updatedAt` (callers need not set it). The input `state` is not mutated (a stamped copy is written); on failure the error is logged and rethrown. */
+	save(state: DwfCheckpointState): void {
+		const path = this.path;
+		const next = { ...state, updatedAt: new Date().toISOString() };
+		try {
+			mkdirSync(dirname(path), { recursive: true }); // make sure the stateRoot directory exists before writing
+			atomicWriteJson(path, next);
+		} catch (error) {
+			logInternalError("dwf-state-store.save", error, `runId=${state.runId}`);
+			throw error;
+		}
+	}
+	/** Remove the checkpoint file (after a clean completion). Best-effort: a missing file is a no-op and a failed unlink is logged via logInternalError instead of being thrown. */
+	delete(): void {
+		const path = this.path;
+		try {
+			if (!existsSync(path)) return;
+			unlinkSync(path);
+		} catch (error) {
+			logInternalError("dwf-state-store.delete", error);
+		}
+	}
+}
+}