pi-crew 0.8.14 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. package/CHANGELOG.md +366 -0
  2. package/README.md +112 -2
  3. package/docs/FEATURE_INTAKE.md +1 -1
  4. package/docs/HARNESS.md +20 -19
  5. package/docs/PROJECT_REVIEW.md +132 -133
  6. package/docs/PROJECT_REVIEW_FIXES.md +130 -131
  7. package/docs/actions-reference.md +127 -121
  8. package/docs/architecture.md +1 -1
  9. package/docs/code-review-2026-05-11.md +134 -134
  10. package/docs/commands-reference.md +108 -106
  11. package/docs/comparison-pi-subagents-vs-pi-crew.md +105 -105
  12. package/docs/deep-review-report.md +1 -1
  13. package/docs/dynamic-workflows.md +90 -0
  14. package/docs/fixes/BATCH_A_H1_H2.md +17 -17
  15. package/docs/fixes/bug-007-async-notifier-stale-ctx.md +23 -23
  16. package/docs/followup-plan-2026-05-12.md +135 -135
  17. package/docs/followup-review-2026-05-12.md +86 -86
  18. package/docs/followup-review-round3-2026-05-12.md +123 -123
  19. package/docs/goals.md +59 -0
  20. package/docs/implementation-plan-top3.md +4 -4
  21. package/docs/issue-29-analysis.md +2 -2
  22. package/docs/oh-my-pi-research.md +154 -154
  23. package/docs/optimization-plan.md +2 -0
  24. package/docs/perf/baseline-2026-05.md +9 -9
  25. package/docs/perf/final-report-2026-05.md +2 -2
  26. package/docs/perf/sprint-1-report.md +2 -2
  27. package/docs/perf/sprint-2-report.md +1 -1
  28. package/docs/perf/upgrade-plan-2026-05.md +72 -72
  29. package/docs/pi-crew-bugs.md +230 -230
  30. package/docs/pi-crew-investigation-report.md +102 -102
  31. package/docs/pi-crew-test-round5.md +4 -4
  32. package/docs/runtime-analysis-child-vs-live.md +57 -57
  33. package/docs/runtime-migration-in-process-analysis.md +97 -97
  34. package/package.json +2 -4
  35. package/skills/orchestration/SKILL.md +11 -11
  36. package/src/agents/agent-config.ts +4 -0
  37. package/src/config/config.ts +39 -0
  38. package/src/config/types.ts +11 -0
  39. package/src/extension/action-suggestions.ts +2 -1
  40. package/src/extension/async-notifier.ts +10 -0
  41. package/src/extension/help.ts +14 -0
  42. package/src/extension/registration/commands.ts +27 -0
  43. package/src/extension/team-tool/destructive-gate.ts +1 -1
  44. package/src/extension/team-tool/goal-wrap.ts +288 -0
  45. package/src/extension/team-tool/goal.ts +405 -0
  46. package/src/extension/team-tool/run.ts +103 -4
  47. package/src/extension/team-tool/workflow-manage.ts +194 -0
  48. package/src/extension/team-tool.ts +20 -0
  49. package/src/hooks/types.ts +3 -1
  50. package/src/runtime/async-runner.ts +27 -2
  51. package/src/runtime/background-runner.ts +68 -19
  52. package/src/runtime/child-pi.ts +9 -1
  53. package/src/runtime/completion-guard.ts +1 -1
  54. package/src/runtime/dynamic-workflow-context.ts +450 -0
  55. package/src/runtime/dynamic-workflow-runner.ts +180 -0
  56. package/src/runtime/global-worker-cap.ts +96 -0
  57. package/src/runtime/goal-evaluator.ts +294 -0
  58. package/src/runtime/goal-loop-runner.ts +612 -0
  59. package/src/runtime/goal-state-store.ts +209 -0
  60. package/src/runtime/iteration-hooks.ts +2 -1
  61. package/src/runtime/pi-args.ts +10 -2
  62. package/src/runtime/post-checks.ts +2 -1
  63. package/src/runtime/result-extractor.ts +32 -0
  64. package/src/runtime/team-runner.ts +11 -1
  65. package/src/runtime/verification-gates.ts +88 -5
  66. package/src/runtime/verification-integrity.ts +110 -0
  67. package/src/runtime/verification-worktree.ts +136 -0
  68. package/src/runtime/workspace-lock.ts +448 -0
  69. package/src/schema/config-schema.ts +26 -0
  70. package/src/schema/team-tool-schema.ts +39 -4
  71. package/src/state/atomic-write.ts +9 -0
  72. package/src/state/contracts.ts +14 -0
  73. package/src/state/crew-init.ts +18 -5
  74. package/src/state/event-log.ts +7 -1
  75. package/src/state/state-store.ts +2 -0
  76. package/src/state/types.ts +82 -0
  77. package/src/state/worker-atomic-writer.ts +190 -0
  78. package/src/utils/env-allowlist.ts +30 -0
  79. package/src/utils/redaction.ts +104 -24
  80. package/src/utils/safe-paths.ts +55 -14
  81. package/src/workflows/discover-workflows.ts +25 -1
  82. package/src/workflows/workflow-config.ts +13 -0
  83. package/src/worktree/cleanup.ts +2 -1
  84. package/src/worktree/worktree-manager.ts +4 -3
  85. package/teams/parallel-research.team.md +1 -1
  86. package/workflows/examples/hello.dwf.ts +24 -0
@@ -0,0 +1,136 @@
1
+ /**
2
+ * Phase 1.5 #2 — Git-worktree verification sandbox.
3
+ *
4
+ * RFC: research-findings/goal-workflow/16-PHASE1.5-WORKTREE-SANDBOX-RFC.md
5
+ *
6
+ * Closes the two remaining Med-High tampering residuals from RFC 13 §6:
7
+ * (1) Round-trip manifest tamper (MAJ#2): worker edits package.json, runs
8
+ * test, reverts before T_verify_done → snapshot matches → undetected.
9
+ * (2) Invoked-script tampering: worker rewrites a script the verification
10
+ * command invokes; only MANIFEST_FILES are hashed → invisible.
11
+ *
12
+ * Mitigation: run verification commands in a pristine `git worktree` at the
13
+ * T_snap commit SHA. Worker edits in the main workspace are invisible to
14
+ * the verification subprocess. Content-addressed execution: the test passes
15
+ * (or fails) against the original code, regardless of worker edits.
16
+ *
17
+ * Opt-in via PI_CREW_VERIFICATION_WORKTREE=1 (mirrors Phase 1.5 #1 pattern).
18
+ * Auto-fallback to existing behavior when: not a git repo, dirty index,
19
+ * git unavailable, or opt-out explicitly set. NEVER blocks the goal loop.
20
+ *
21
+ * Pure leaf module: depends only on node: built-ins + git CLI. No imports
22
+ * from goal-loop-runner or verification-gates (keeps unit-testable).
23
+ */
24
+ import { execFileSync } from "node:child_process";
25
+ import * as fs from "node:fs";
26
+ import * as os from "node:os";
27
+ import * as path from "node:path";
28
+
29
/**
 * Handle for one prepared verification worktree, as returned by
 * prepareVerificationWorktree.
 */
export interface VerificationWorktree {
  /** Absolute path of the pristine worktree directory. */
  worktreePath: string;
  /** Commit SHA the worktree is checked out at (matches T_snap). */
  commitSha: string;
  /** Removes the worktree and its temp parent dir. Safe to call more than once. */
  cleanup: () => void;
}
37
+
38
/**
 * Reports whether the worktree sandbox has been opted into via environment.
 *
 * PI_CREW_VERIFICATION_WORKTREE is consulted first; PI_TEAMS_VERIFICATION_WORKTREE
 * is only read when the former is unset. Only the exact values "1" and "true"
 * enable the sandbox.
 *
 * @returns true when the effective variable is "1" or "true", false otherwise.
 */
export function isWorktreeSandboxEnabled(): boolean {
  const flag = process.env.PI_CREW_VERIFICATION_WORKTREE ?? process.env.PI_TEAMS_VERIFICATION_WORKTREE;
  switch (flag) {
    case "1":
    case "true":
      return true;
    default:
      return false;
  }
}
43
+
44
/**
 * Decide whether the worktree sandbox can be used at `cwd`.
 *
 * Preconditions, checked in order:
 *   1. the opt-in env var is set (isWorktreeSandboxEnabled),
 *   2. `git rev-parse --show-toplevel` succeeds with non-empty output,
 *   3. `git rev-parse HEAD` yields a non-empty commit SHA,
 *   4. `git status --porcelain` prints nothing.
 *
 * Never throws: any git failure (including a missing git executable) is
 * converted into `{ available: false, reason }`. Callers are expected to fall
 * back to non-sandboxed execution rather than block the goal.
 *
 * @param cwd Directory in which the git commands are run.
 * @returns `{ available: true, commitSha }` when every check passes, otherwise
 *          `{ available: false, reason }` with a human-readable explanation.
 */
export function checkWorktreeSandboxAvailable(cwd: string): { available: true; commitSha: string } | { available: false; reason: string } {
  const unavailable = (reason: string): { available: false; reason: string } => ({ available: false, reason });

  if (!isWorktreeSandboxEnabled()) {
    return unavailable("PI_CREW_VERIFICATION_WORKTREE not set (opt-in)");
  }

  // Runs one git subcommand in `cwd` and returns trimmed stdout.
  // execFileSync throws on a non-zero exit code; the catch below handles that.
  const git = (...args: string[]): string =>
    execFileSync("git", args, { cwd, stdio: ["ignore", "pipe", "pipe"], encoding: "utf-8" }).trim();

  try {
    // Fails (non-zero exit) when cwd is not inside a git repository.
    const toplevel = git("rev-parse", "--show-toplevel");
    if (!toplevel) return unavailable("git rev-parse returned empty toplevel");

    // The commit the sandbox will be pinned to.
    const commitSha = git("rev-parse", "HEAD");
    if (!commitSha) return unavailable("git rev-parse HEAD returned empty SHA");

    // Any porcelain output means uncommitted changes. A worktree at HEAD would
    // not contain them, so verification would run against the wrong code;
    // refuse instead of silently verifying a stale commit.
    const porcelain = git("status", "--porcelain");
    if (porcelain.length > 0) {
      return unavailable(`dirty git index (${porcelain.split("\n").length} changed files); refusing to sandbox — worktree would pin to stale commit`);
    }

    return { available: true, commitSha };
  } catch (error) {
    const detail = error instanceof Error ? error.message : String(error);
    return unavailable(`git precondition check failed: ${detail.slice(0, 200)}`);
  }
}
79
+
80
/**
 * Prepare a pristine git worktree at `commitSha`.
 *
 * Runs `git worktree add --detach <tmp>/wt-<sha8> <sha>` from `cwd`, where
 * `<tmp>` is a fresh directory created under os.tmpdir(). The checkout is a
 * detached HEAD, so no branch is created. It does not contain uncommitted
 * edits from the main workspace.
 *
 * The returned `cleanup` handle is idempotent and best-effort: it never
 * throws. It first tries `git worktree remove --force`, then deletes the temp
 * directory. If the remove failed it finally runs `git worktree prune`. The
 * prune must come AFTER the directory is deleted, because git only prunes
 * worktree entries whose directory is missing. Pruning first would leave a
 * stale entry behind under `.git/worktrees`.
 *
 * @param cwd Directory inside the repository; git commands run from here.
 * @param commitSha Commit to check out in the worktree.
 * @returns The worktree path, the commit SHA, and the cleanup handle.
 * @throws Error when `git worktree add` fails; cleanup has already run by then.
 */
export function prepareVerificationWorktree(cwd: string, commitSha: string): VerificationWorktree {
  // Temp parent dir under os.tmpdir() so leftovers are cleared with the temp dir.
  const tmpRoot = fs.mkdtempSync(path.join(os.tmpdir(), "pi-crew-wt-"));
  const shortSha = commitSha.slice(0, 8);
  const worktreePath = path.join(tmpRoot, `wt-${shortSha}`);
  let cleaned = false;

  const cleanup = (): void => {
    if (cleaned) return;
    cleaned = true;

    // Preferred path: let git remove both the checkout and its metadata
    // (--force = proceed even if the worktree has untracked/modified files).
    let removedByGit = false;
    try {
      execFileSync("git", ["worktree", "remove", "--force", worktreePath], { cwd, stdio: ["ignore", "pipe", "pipe"], timeout: 5000 });
      removedByGit = true;
    } catch {
      /* best-effort — handled by the prune fallback below */
    }

    // Remove the temp parent dir (and the checkout, if git left it behind).
    try { fs.rmSync(tmpRoot, { recursive: true, force: true }); } catch { /* best-effort */ }

    // Fallback: now that the directory is gone, prune can drop the stale entry.
    if (!removedByGit) {
      try { execFileSync("git", ["worktree", "prune"], { cwd, stdio: ["ignore", "pipe", "pipe"], timeout: 5000 }); } catch { /* best-effort */ }
    }
  };

  try {
    execFileSync("git", ["worktree", "add", "--detach", worktreePath, commitSha], { cwd, stdio: ["ignore", "pipe", "pipe"], timeout: 30_000 });
    return { worktreePath, commitSha, cleanup };
  } catch (error) {
    cleanup();
    const msg = error instanceof Error ? error.message : String(error);
    throw new Error(`git worktree add failed (cwd=${cwd}, sha=${shortSha}): ${msg.slice(0, 300)}`);
  }
}
119
+
120
/**
 * Scoped helper: create a verification worktree, hand it to `fn`, and always
 * run the worktree's cleanup afterwards.
 *
 * - If `fn` throws or rejects, cleanup still runs and the original error
 *   propagates to the caller.
 * - If preparing the worktree throws, `fn` is never invoked and the
 *   preparation error propagates; callers typically fall back to
 *   non-sandboxed execution.
 *
 * @param cwd Directory inside the repository.
 * @param commitSha Commit the worktree is checked out at.
 * @param fn Callback receiving the prepared worktree; may be sync or async.
 * @returns Whatever `fn` returns (awaited).
 */
export async function withVerificationWorktree<T>(cwd: string, commitSha: string, fn: (worktree: VerificationWorktree) => Promise<T> | T): Promise<T> {
  const sandbox = prepareVerificationWorktree(cwd, commitSha);
  try {
    const outcome = await fn(sandbox);
    return outcome;
  } finally {
    sandbox.cleanup();
  }
}
@@ -0,0 +1,448 @@
1
+ /**
2
+ * workspace-lock.ts — Per-cwd workspace lock with startTime-safe liveness (P1g).
3
+ *
4
+ * RFC: research-findings/goal-workflow/13-VISION-RFC.md v0.5 §P1g + D10.
5
+ *
6
+ * Closes #8 (multi-goal clobber) and the B-2 PID-recycling gap. Each
7
+ * `workspaceMode:"single"` goal acquires this lock for its entire lifetime,
8
+ * serializing concurrent goals that share a cwd.
9
+ *
10
+ * Lockfile location: `<crewRoot>/state/workspace-locks/<sha256(absCwd)>.lock`
11
+ * Lockfile contents: { pid, startTime, heartbeat, goalId, acquiredAt }
12
+ *
13
+ * ─── LIVENESS = stale-reconciler startTime pattern (D10, B-2 fix) ───
14
+ * A lock is STALE iff EITHER:
15
+ * (a) the recorded pid's CURRENT startTime ≠ the lockfile startTime
16
+ * (the PID was recycled to a different process), OR
17
+ * (b) the heartbeat is older than HEARTBEAT_STALE_MS (default 60s)
18
+ * (the process crashed without exiting / heartbeat stopped).
19
+ *
20
+ * Why NOT child-pi.ts killProcessPid (B-2): killProcessPid uses
21
+ * process.kill(pid, 0) which is PID-only — vulnerable to PID recycling. The
22
+ * startTime + before/after re-verify pattern is TOCTOU-correct.
23
+ *
24
+ * getProcessStartTime is NOT exported from stale-reconciler.ts, so its logic
25
+ * is REPLICATED here (RFC §P1g explicitly permits importing OR replicating).
26
+ * The replication matches stale-reconciler.ts:112 field-for-field.
27
+ *
28
+ * Granularity: per-goal, held for the goal's lifetime (release() on goal end).
29
+ * Contention: default QUEUE (poll until released or stale);
30
+ * opts.failOnWorkspaceBusy:true → THROW instead of queue.
31
+ */
32
+
33
+ import { createHash } from "node:crypto";
34
+ import {
35
+ existsSync,
36
+ mkdirSync,
37
+ readFileSync,
38
+ readdirSync,
39
+ unlinkSync,
40
+ openSync,
41
+ closeSync,
42
+ statSync,
43
+ writeFileSync,
44
+ } from "node:fs";
45
+ import * as path from "node:path";
46
+ import { atomicWriteJson } from "../state/atomic-write.ts";
47
+ import { projectCrewRoot, userCrewRoot } from "../utils/paths.ts";
48
+
49
/**
 * Default age (ms) beyond which a lock's heartbeat counts as stale: 60 seconds,
 * per RFC §P1g.
 */
const DEFAULT_HEARTBEAT_STALE_MS = 60 * 1_000;

/** Default delay (ms) between re-checks while queued behind a held lock. */
const DEFAULT_LOCK_POLL_MS = 500;
54
+
55
/**
 * Maps a pid to a number identifying that process's start time, or undefined
 * when it cannot be determined. Used to detect PID recycling: a recycled pid
 * reports a different start time than the one recorded in the lockfile.
 *
 * Callers (notably tests) may inject their own resolver to simulate PID
 * recycling deterministically without spawning real processes.
 */
export type StartTimeResolver = (pid: number) => number | undefined;

/**
 * Default resolver backed by `/proc/<pid>/stat` (mirrors the pattern in
 * src/runtime/stale-reconciler.ts).
 *
 * Returns undefined when the stat file cannot be read (process gone, or no
 * /proc on this platform) or when the expected field is missing/non-numeric.
 * The absolute value matters less than it being stable for one process and
 * different after the pid is reused.
 */
export const defaultStartTimeResolver: StartTimeResolver = (pid: number): number | undefined => {
  let statLine: string;
  try {
    statLine = readFileSync(`/proc/${pid}/stat`, "utf-8");
  } catch {
    return undefined;
  }
  // The comm field is parenthesised and may itself contain spaces or ')',
  // so split only what follows the LAST closing paren.
  const commEnd = statLine.lastIndexOf(")");
  if (commEnd < 0) return undefined;
  const tail = statLine.slice(commEnd + 1).trim().split(/\s+/);
  // starttime is the 20th field after comm, i.e. index 19.
  const ticks = Number(tail[19]);
  // Clock ticks → ms; the factor 10 presumably assumes CLK_TCK = 100.
  return Number.isFinite(ticks) ? Math.floor(ticks * 10) : undefined;
};
82
+
83
/** Shape of the JSON document persisted in a workspace lockfile. */
export interface WorkspaceLockContents {
  /** Pid of the process that claimed the lock. */
  pid: number;
  /** Start time of `pid` as resolved at acquire; undefined when it could not be resolved. */
  startTime: number | undefined;
  /** Timestamp (ms) of the owner's last sign of life; compared against the staleness threshold. */
  heartbeat: number;
  /** Identifier of the goal holding the lock. */
  goalId: string;
  /** ISO-8601 timestamp of when the lock was acquired. */
  acquiredAt: string;
}
91
+
92
/**
 * Handle returned by acquireWorkspaceLock. Call release() when the goal ends
 * to free the lock. release() does nothing when the lockfile no longer matches
 * this goal's goalId + pid + startTime (i.e. it was reclaimed and re-acquired
 * by someone else).
 */
export interface WorkspaceLockHandle {
  /** The cwd the lock was acquired for, as passed by the caller. */
  readonly cwd: string;
  /** Goal that owns this handle. */
  readonly goalId: string;
  /** Path of the lockfile backing this handle. */
  readonly lockPath: string;
  /** The startTime value written to the lockfile at acquire (release guard). */
  readonly startTime: number | undefined;
  /** Frees the lock if it still belongs to this handle. */
  release(): void;
}
105
+
106
/** Optional knobs for acquireWorkspaceLock. Every field has a default. */
export interface AcquireWorkspaceLockOptions {
  /** When true, throw if the workspace is already held instead of queueing (default: queue). */
  failOnWorkspaceBusy?: boolean;
  /** Heartbeat-staleness threshold in ms (default: DEFAULT_HEARTBEAT_STALE_MS). */
  heartbeatStaleMs?: number;
  /** Polling interval in ms while queued (default: DEFAULT_LOCK_POLL_MS). */
  pollMs?: number;
  /** Test injection: custom process start-time resolution. */
  startTimeResolver?: StartTimeResolver;
  /** Test injection: custom clock returning ms (default: Date.now). */
  now?: () => number;
  /** Test injection: pid to record as the owner (default: process.pid). */
  pid?: number;
  /** Aborting this signal stops the wait and makes the acquire reject. */
  signal?: AbortSignal;
}
122
+
123
/**
 * Compute the lockfile path for `cwd`.
 *
 * The path is `<crewRoot>/state/workspace-locks/<sha256(absCwd)>.lock`, where
 * `absCwd` is `path.resolve(cwd)` and `crewRoot` is `projectCrewRoot(absCwd)`,
 * falling back to `userCrewRoot()` when that returns null/undefined. Hashing
 * keeps the file name free of filesystem-unsafe characters.
 *
 * NOTE(review): path.resolve makes the path absolute and normalizes `.`/`..`
 * segments but does not follow symlinks, so two symlink-equivalent spellings
 * of the same directory hash to different lockfiles — confirm whether callers
 * canonicalize `cwd` first.
 *
 * @param cwd Workspace directory (relative or absolute).
 * @returns Path of the lockfile for that workspace.
 */
export function workspaceLockPath(cwd: string): string {
  const absCwd = path.resolve(cwd);
  const digest = createHash("sha256").update(absCwd).digest("hex");
  const root = projectCrewRoot(absCwd) ?? userCrewRoot();
  return path.join(root, "state", "workspace-locks", `${digest}.lock`);
}
136
+
137
/**
 * Read and parse a lockfile.
 *
 * Returns undefined when the file is missing, unreadable, not valid JSON, or
 * does not have the shape of a lock (integer `pid`, finite `heartbeat`, string
 * `goalId`, and `startTime` either absent or a finite number). Rejecting
 * malformed documents matters: a parseable-but-empty file such as `{}` would
 * otherwise be treated as a live lock whose heartbeat age is NaN, which never
 * compares as stale and would make waiters queue forever.
 *
 * @param lockPath Path of the lockfile.
 * @returns The parsed lock contents, or undefined if missing/corrupt.
 */
function readLock(lockPath: string): WorkspaceLockContents | undefined {
  let parsed: unknown;
  try {
    // No existsSync pre-check: a missing file simply throws ENOENT here.
    parsed = JSON.parse(readFileSync(lockPath, "utf-8"));
  } catch {
    return undefined;
  }
  if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) return undefined;

  const candidate = parsed as Partial<WorkspaceLockContents>;
  const wellFormed =
    Number.isInteger(candidate.pid) &&
    Number.isFinite(candidate.heartbeat) &&
    typeof candidate.goalId === "string" &&
    // JSON.stringify drops undefined, so an unresolved startTime is simply absent.
    (candidate.startTime === undefined || Number.isFinite(candidate.startTime));
  return wellFormed ? (candidate as WorkspaceLockContents) : undefined;
}
148
+
149
/**
 * Overwrite the lockfile with `contents`, creating the parent directory first.
 *
 * Intended for HEARTBEAT refresh only: an owner that has already verified it
 * still holds the lock rewrites its own record, so overwriting is correct.
 * New claims must go through claimLock instead. The write is delegated to
 * atomicWriteJson (presumably temp file + rename + fsync — see that helper).
 *
 * @param lockPath Path of the lockfile.
 * @param contents Lock record to persist.
 */
function writeLock(lockPath: string, contents: WorkspaceLockContents): void {
  const parentDir = path.dirname(lockPath);
  mkdirSync(parentDir, { recursive: true });
  atomicWriteJson(lockPath, contents);
}
158
+
159
/**
 * CLAIM the lockfile atomically via exclusive create (cold-review #3 NIT #N1 fix).
 *
 * A temp+rename write (writeLock) is not a safe way to claim: two goals that
 * both observed a free/stale lock could both write and both believe they own
 * it. Opening with the "wx" flag fails with EEXIST when the file already
 * exists, so only one creator can succeed.
 *
 * Behavior:
 *   - `forceOverwrite` true: the caller has ALREADY established (via
 *     isLockStale) that the existing lock is logically stale, e.g. its PID was
 *     recycled. The file is unlinked (best-effort) and then claimed, skipping
 *     the mtime age check. Without this a stale-by-PID lock with a recent
 *     mtime could never be claimed and the acquire loop would spin forever.
 *   - otherwise: try to create; on EEXIST, if the file's mtime is older than
 *     `staleReclaimMs`, unlink it and try once more.
 *
 * NOTE(review): in the forceOverwrite path two reclaimers that both judged the
 * same lock stale can interleave as unlink(A) → create(A) → unlink(B) →
 * create(B), which removes A's fresh lock — confirm whether this window is
 * acceptable.
 *
 * @param lockPath Path of the lockfile; its parent directory is created if needed.
 * @param contents Lock record written on a successful claim.
 * @param staleReclaimMs Minimum mtime age (ms) at which an existing file may be replaced.
 * @param forceOverwrite Unlink any existing file before claiming (default false).
 * @returns true when this call created the lockfile, false otherwise.
 * @throws Errors other than EEXIST from the first create attempt, and any
 *         create error in the forceOverwrite path, propagate to the caller.
 */
function claimLock(lockPath: string, contents: WorkspaceLockContents, staleReclaimMs: number, forceOverwrite = false): boolean {
  mkdirSync(path.dirname(lockPath), { recursive: true });
  const payload = JSON.stringify(contents);

  // Exclusive create + write. false = someone else's file is already there.
  const createExclusive = (): boolean => {
    try {
      const fd = openSync(lockPath, "wx"); // O_EXCL — throws EEXIST if it exists.
      try {
        writeFileSync(fd, payload);
      } finally {
        closeSync(fd);
      }
      return true;
    } catch (error) {
      if ((error as NodeJS.ErrnoException).code === "EEXIST") return false;
      throw error;
    }
  };

  const unlinkQuietly = (): void => {
    try { unlinkSync(lockPath); } catch { /* best-effort */ }
  };

  if (forceOverwrite) {
    // If a concurrent reclaimer re-creates the file between our unlink and
    // our create, we simply lose the race and report false.
    unlinkQuietly();
    return createExclusive();
  }

  if (createExclusive()) return true;

  // Stale recovery by mtime age; any failure in here just means "not claimed".
  try {
    const { mtimeMs } = statSync(lockPath);
    if (Date.now() - mtimeMs > staleReclaimMs) {
      unlinkQuietly();
      return createExclusive();
    }
  } catch { /* fall through to false */ }
  return false;
}
219
+
220
/**
 * Is the lock STALE? RFC §P1g + D10 dual-check:
 *   (a) startTime mismatch → the recorded pid now belongs to a different
 *       process (PID recycled), OR
 *   (b) heartbeat older than `heartbeatStaleMs` → crash without exit / abandoned.
 *
 * When the resolver returns undefined (process gone, or start times are not
 * available on this platform) check (a) is inconclusive and only the heartbeat
 * decides. A lock whose heartbeat is missing or not a finite number is
 * treated as stale: the age would otherwise be NaN, which never exceeds the
 * threshold and would make the lock immortal.
 *
 * @param lock Parsed lockfile contents.
 * @param resolveStartTime Resolver for the current start time of `lock.pid`.
 * @param heartbeatStaleMs Maximum tolerated heartbeat age in ms (exclusive).
 * @param now Current time in ms.
 * @returns `{ stale: false }`, or `{ stale: true, reason }` where reason is
 *          "pid_recycled" or "heartbeat_stale".
 */
function isLockStale(
  lock: WorkspaceLockContents,
  resolveStartTime: StartTimeResolver,
  heartbeatStaleMs: number,
  now: number,
): { stale: boolean; reason?: string } {
  // (a) startTime mismatch → PID recycled to a different process.
  if (lock.startTime !== undefined) {
    const currentStartTime = resolveStartTime(lock.pid);
    if (currentStartTime !== undefined && currentStartTime !== lock.startTime) {
      return { stale: true, reason: "pid_recycled" };
    }
    // currentStartTime === undefined: inconclusive → fall through to heartbeat.
  }
  // (b) heartbeat unusable or older than threshold → crash without exit / abandoned.
  if (!Number.isFinite(lock.heartbeat) || now - lock.heartbeat > heartbeatStaleMs) {
    return { stale: true, reason: "heartbeat_stale" };
  }
  return { stale: false };
}
251
+
252
/**
 * Acquire the workspace lock for `goalId` at `cwd`.
 *
 * Each iteration reads the lockfile and classifies it as absent, stale (PID
 * recycled or heartbeat expired) or live. Absent and stale locks are claimed
 * via claimLock; a stale lock is claimed with forceOverwrite so that a recent
 * mtime cannot block the reclaim. If the lock is live, or the claim lost a
 * race to another claimer, the default is to QUEUE: sleep `pollMs` and
 * re-check. With `opts.failOnWorkspaceBusy` the call throws instead.
 *
 * The returned handle's release() deletes the lockfile ONLY if it still
 * records this goalId + pid + startTime, so a handle whose lock was reclaimed
 * and re-acquired by another goal cannot clobber the new owner's lock.
 *
 * In-process serialization: the read → stale-check → claim sequence is fully
 * synchronous, so concurrent acquires in the same process cannot interleave
 * inside it.
 *
 * @param cwd Workspace directory to lock.
 * @param goalId Identifier of the acquiring goal.
 * @param opts Optional overrides (thresholds, test injections, abort signal).
 * @returns A handle whose release() frees the lock.
 * @throws Error when `opts.signal` aborts, or — with failOnWorkspaceBusy —
 *         when the workspace is held by someone else.
 */
export async function acquireWorkspaceLock(
  cwd: string,
  goalId: string,
  opts: AcquireWorkspaceLockOptions = {},
): Promise<WorkspaceLockHandle> {
  const lockPath = workspaceLockPath(cwd);
  const resolveStartTime = opts.startTimeResolver ?? defaultStartTimeResolver;
  const heartbeatStaleMs = opts.heartbeatStaleMs ?? DEFAULT_HEARTBEAT_STALE_MS;
  const pollMs = opts.pollMs ?? DEFAULT_LOCK_POLL_MS;
  const now = opts.now ?? Date.now;
  const pid = opts.pid ?? process.pid;
  // Resolved once: this is the value written to the lockfile and later
  // compared by release().
  const writtenStartTime = resolveStartTime(pid);

  while (true) {
    // Poll-loop: re-check the lock each tick until free/stale or aborted.
    if (opts.signal?.aborted) {
      throw new Error(
        `workspace lock acquisition aborted for goal ${goalId} (cwd=${cwd})`,
      );
    }
    const existing = readLock(lockPath);
    const existingKind: "absent" | "stale" | "live" = !existing
      ? "absent"
      : (isLockStale(existing, resolveStartTime, heartbeatStaleMs, now()).stale ? "stale" : "live");
    if (existingKind !== "live") {
      // Claim the lock (covers both no-lock and stale-lock cases). The claim is
      // an exclusive create, so if two processes race past the stale check only
      // one of them wins; the loser falls through to the busy path below.
      const contents: WorkspaceLockContents = {
        pid,
        startTime: writtenStartTime,
        heartbeat: now(),
        goalId,
        acquiredAt: new Date(now()).toISOString(),
      };
      const claimed = claimLock(lockPath, contents, heartbeatStaleMs, existingKind === "stale");
      if (claimed) {
        return {
          cwd,
          goalId,
          lockPath,
          startTime: writtenStartTime,
          release(): void {
            safeRelease(lockPath, goalId, pid, writtenStartTime);
          },
        };
      }
    }
    // Lock is held and live, or we just lost a claim race.
    if (opts.failOnWorkspaceBusy) {
      // After a lost race `existing` is undefined (lock was absent) or describes
      // the old stale owner, so re-read to report whoever holds the lock now.
      // The re-read can itself come back empty, hence the "unknown" fallbacks.
      const holder = existingKind === "live" ? existing : readLock(lockPath);
      throw new Error(
        `workspace busy: cwd=${cwd} held by goalId=${holder?.goalId ?? "unknown"} (pid=${holder?.pid ?? "unknown"})`,
      );
    }
    // Queue: wait for the next poll interval, then re-check.
    await sleepOrAbort(pollMs, opts.signal);
  }
}
335
+
336
/**
 * Remove the lockfile at `lockPath`, but only while it still records the
 * given (goalId, pid, startTime) triple. If the lock has since been reclaimed
 * and re-acquired by another goal, the file is left alone so the new owner's
 * lock survives.
 *
 * Never throws: release runs from cleanup paths, so every failure is swallowed.
 *
 * @param lockPath Path of the lockfile.
 * @param goalId Goal that believes it owns the lock.
 * @param pid Pid recorded at acquire time.
 * @param writtenStartTime startTime recorded at acquire time.
 */
function safeRelease(
  lockPath: string,
  goalId: string,
  pid: number,
  writtenStartTime: number | undefined,
): void {
  try {
    const onDisk = readLock(lockPath);
    if (!onDisk) return;
    const stillOurs =
      onDisk.goalId === goalId &&
      onDisk.pid === pid &&
      onDisk.startTime === writtenStartTime;
    if (stillOurs) unlinkSync(lockPath);
  } catch {
    /* best-effort — release must never throw into a finally block */
  }
}
361
+
362
/**
 * Sleep for `ms`, or reject early if `signal` aborts.
 *
 * - Without a signal this is a plain timer.
 * - If the signal is already aborted the promise rejects immediately, because
 *   the "abort" event will not fire again for a listener added afterwards.
 * - The abort listener is removed when the timer fires. This function is
 *   called once per poll with the same long-lived signal, so leaving the
 *   listener attached would accumulate one listener per poll.
 *
 * @param ms Delay in milliseconds.
 * @param signal Optional abort signal.
 * @returns A promise that resolves after `ms`.
 * @throws Rejects with Error("workspace lock acquisition aborted") on abort.
 */
function sleepOrAbort(ms: number, signal?: AbortSignal): Promise<void> {
  if (!signal) return new Promise<void>((r) => setTimeout(r, ms));
  return new Promise<void>((resolve, reject) => {
    if (signal.aborted) {
      reject(new Error("workspace lock acquisition aborted"));
      return;
    }
    const onAbort = (): void => {
      clearTimeout(timer);
      reject(new Error("workspace lock acquisition aborted"));
    };
    const timer = setTimeout(() => {
      signal.removeEventListener("abort", onAbort);
      resolve();
    }, ms);
    signal.addEventListener("abort", onAbort, { once: true });
  });
}
377
+
378
+ /*
379
+ * reclaimStaleLocks() (defined after isWorkspaceBusy below): reclaim all stale locks
380
+ * under `dir` (the workspace-locks directory). Returns the list of reclaimed lock paths.
381
+ * Stale = PID recycled OR heartbeat older than threshold. Corrupt/unreadable locks are also reclaimed.
382
+ *
383
+ * Useful as a startup or periodic sweep to clear locks left by crashed
384
+ * processes before any goal tries to acquire them.
385
+ */
386
/**
 * Peek at the workspace lock for `cwd` without acquiring it.
 *
 * Used by `goal start` / `goal resume` to fail fast with a clear error BEFORE
 * spawning anything.
 *
 * @param cwd Workspace directory to inspect.
 * @param opts Optional test injections / threshold override (same meaning as
 *             in AcquireWorkspaceLockOptions).
 * @returns The goalId recorded in the lockfile when a non-stale lock exists;
 *          undefined when the lockfile is missing, unreadable, or stale.
 */
export function isWorkspaceBusy(
  cwd: string,
  opts: { startTimeResolver?: StartTimeResolver; heartbeatStaleMs?: number; now?: () => number } = {},
): string | undefined {
  const holder = readLock(workspaceLockPath(cwd));
  if (!holder) return undefined;

  const clock = opts.now ?? Date.now;
  const verdict = isLockStale(
    holder,
    opts.startTimeResolver ?? defaultStartTimeResolver,
    opts.heartbeatStaleMs ?? DEFAULT_HEARTBEAT_STALE_MS,
    clock(),
  );
  if (verdict.stale) return undefined;
  return holder.goalId;
}
404
+
405
/**
 * Sweep `dir` (a workspace-locks directory) and delete every `*.lock` file
 * that is either unreadable/corrupt or stale per isLockStale (PID recycled or
 * heartbeat older than the threshold).
 *
 * Best-effort throughout: a missing or unlistable directory yields an empty
 * result, and a file that cannot be deleted is skipped silently.
 *
 * @param dir Directory containing lockfiles.
 * @param opts Optional threshold override and test injections.
 * @returns Paths of the lockfiles that were actually deleted.
 */
export function reclaimStaleLocks(
  dir: string,
  opts: {
    heartbeatStaleMs?: number;
    startTimeResolver?: StartTimeResolver;
    now?: () => number;
  } = {},
): string[] {
  const resolveStartTime = opts.startTimeResolver ?? defaultStartTimeResolver;
  const heartbeatStaleMs = opts.heartbeatStaleMs ?? DEFAULT_HEARTBEAT_STALE_MS;
  const now = opts.now ?? Date.now;
  const reclaimed: string[] = [];

  if (!existsSync(dir)) return reclaimed;
  let names: string[];
  try {
    names = readdirSync(dir);
  } catch {
    return reclaimed;
  }

  for (const name of names.filter((candidate) => candidate.endsWith(".lock"))) {
    const lockPath = path.join(dir, name);
    const lock = readLock(lockPath);
    // An unparseable file counts as corrupt and is reclaimed unconditionally.
    const shouldReclaim = !lock || isLockStale(lock, resolveStartTime, heartbeatStaleMs, now()).stale;
    if (!shouldReclaim) continue;
    try {
      unlinkSync(lockPath);
      reclaimed.push(lockPath);
    } catch {
      /* best-effort */
    }
  }
  return reclaimed;
}
@@ -59,6 +59,31 @@ export const PiTeamsWorktreeConfigSchema = Type.Object({
59
59
  seedPaths: Type.Optional(Type.Array(Type.String({ minLength: 1 }))),
60
60
  }, { additionalProperties: false });
61
61
 
62
/**
 * Goal-wrap config for a single workflow (RFC v0.5 vision: apply the `goal`
 * completion guarantee to builtin workflows).
 *
 * When enabled, a builtin workflow runs as the WORKER TURN inside a goal loop
 * (worker → judge → feedback → redo until achieved / maxTurns / budget / stuck).
 * Default OFF — opt-in per workflow. Intended only for builtin workflows with
 * a clear 'done' condition (implementation, fast-fix); read-only workflows
 * (review, research) are not goal-wrappable.
 *
 * Every field is optional and unknown properties are rejected.
 */
export const GoalWrapWorkflowConfigSchema = Type.Object(
  {
    // Per-workflow on/off toggle.
    enabled: Type.Optional(Type.Boolean()),
    // Integer in the range 1..50.
    maxTurns: Type.Optional(Type.Integer({ minimum: 1, maximum: 50 })),
    // Non-empty string; presumably a model identifier for the judge step.
    evaluatorModel: Type.Optional(Type.String({ minLength: 1 })),
    // `commands` is required once `verification` is present; `mode`, if given,
    // must be exactly "text-only".
    verification: Type.Optional(
      Type.Object(
        {
          commands: Type.Array(Type.String({ minLength: 1 })),
          mode: Type.Optional(Type.Literal("text-only")),
        },
        { additionalProperties: false },
      ),
    ),
    // Integer, at least 1000.
    budgetTotal: Type.Optional(Type.Integer({ minimum: 1000 })),
    budgetUnlimited: Type.Optional(Type.Boolean()),
  },
  { additionalProperties: false },
);
81
+
82
/**
 * Top-level `goalWrap` config section: a record from non-empty string keys
 * (presumably workflow names) to that workflow's goal-wrap settings.
 */
export const PiTeamsGoalWrapConfigSchema = Type.Record(Type.String({ minLength: 1 }), GoalWrapWorkflowConfigSchema);
86
+
62
87
  export const AgentOverrideSchema = Type.Object({
63
88
  disabled: Type.Optional(Type.Boolean()),
64
89
  model: Type.Optional(Type.Union([Type.String({ minLength: 1 }), Type.Literal(false)])),
@@ -152,6 +177,7 @@ export const PiTeamsConfigSchema = Type.Object({
152
177
  runtime: Type.Optional(PiTeamsRuntimeConfigSchema),
153
178
  control: Type.Optional(PiTeamsControlConfigSchema),
154
179
  worktree: Type.Optional(PiTeamsWorktreeConfigSchema),
180
+ goalWrap: Type.Optional(PiTeamsGoalWrapConfigSchema),
155
181
  agents: Type.Optional(PiTeamsAgentsConfigSchema),
156
182
  tools: Type.Optional(PiTeamsToolsConfigSchema),
157
183
  telemetry: Type.Optional(PiTeamsTelemetryConfigSchema),