pi-crew 0.8.13 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. package/CHANGELOG.md +296 -0
  2. package/README.md +118 -2
  3. package/docs/FEATURE_INTAKE.md +1 -1
  4. package/docs/HARNESS.md +20 -19
  5. package/docs/PROJECT_REVIEW.md +132 -133
  6. package/docs/PROJECT_REVIEW_FIXES.md +130 -131
  7. package/docs/actions-reference.md +127 -121
  8. package/docs/architecture.md +1 -1
  9. package/docs/code-review-2026-05-11.md +134 -134
  10. package/docs/commands-reference.md +108 -106
  11. package/docs/comparison-pi-subagents-vs-pi-crew.md +105 -105
  12. package/docs/deep-review-report.md +1 -1
  13. package/docs/dynamic-workflows.md +90 -0
  14. package/docs/fixes/BATCH_A_H1_H2.md +17 -17
  15. package/docs/fixes/bug-007-async-notifier-stale-ctx.md +23 -23
  16. package/docs/followup-plan-2026-05-12.md +135 -135
  17. package/docs/followup-review-2026-05-12.md +86 -86
  18. package/docs/followup-review-round3-2026-05-12.md +123 -123
  19. package/docs/goals.md +59 -0
  20. package/docs/implementation-plan-top3.md +4 -4
  21. package/docs/issue-29-analysis.md +2 -2
  22. package/docs/oh-my-pi-research.md +154 -154
  23. package/docs/optimization-plan.md +2 -0
  24. package/docs/perf/baseline-2026-05.md +9 -9
  25. package/docs/perf/final-report-2026-05.md +2 -2
  26. package/docs/perf/sprint-1-report.md +2 -2
  27. package/docs/perf/sprint-2-report.md +1 -1
  28. package/docs/perf/upgrade-plan-2026-05.md +72 -72
  29. package/docs/pi-crew-bugs.md +230 -230
  30. package/docs/pi-crew-investigation-report.md +102 -102
  31. package/docs/pi-crew-test-round5.md +4 -4
  32. package/docs/runtime-analysis-child-vs-live.md +57 -57
  33. package/docs/runtime-migration-in-process-analysis.md +97 -97
  34. package/install.mjs +3 -2
  35. package/package.json +2 -4
  36. package/skills/orchestration/SKILL.md +11 -11
  37. package/src/agents/agent-config.ts +4 -0
  38. package/src/config/config.ts +39 -0
  39. package/src/config/types.ts +11 -0
  40. package/src/extension/action-suggestions.ts +2 -1
  41. package/src/extension/async-notifier.ts +10 -0
  42. package/src/extension/help.ts +14 -0
  43. package/src/extension/project-init.ts +7 -20
  44. package/src/extension/registration/commands.ts +27 -0
  45. package/src/extension/team-tool/destructive-gate.ts +1 -1
  46. package/src/extension/team-tool/goal-wrap.ts +288 -0
  47. package/src/extension/team-tool/goal.ts +405 -0
  48. package/src/extension/team-tool/run.ts +103 -4
  49. package/src/extension/team-tool/workflow-manage.ts +194 -0
  50. package/src/extension/team-tool.ts +20 -0
  51. package/src/hooks/types.ts +3 -1
  52. package/src/runtime/async-runner.ts +24 -2
  53. package/src/runtime/background-runner.ts +68 -19
  54. package/src/runtime/child-pi.ts +6 -1
  55. package/src/runtime/completion-guard.ts +1 -1
  56. package/src/runtime/dynamic-workflow-context.ts +450 -0
  57. package/src/runtime/dynamic-workflow-runner.ts +180 -0
  58. package/src/runtime/global-worker-cap.ts +96 -0
  59. package/src/runtime/goal-evaluator.ts +294 -0
  60. package/src/runtime/goal-loop-runner.ts +612 -0
  61. package/src/runtime/goal-state-store.ts +209 -0
  62. package/src/runtime/pi-args.ts +10 -2
  63. package/src/runtime/result-extractor.ts +32 -0
  64. package/src/runtime/team-runner.ts +11 -1
  65. package/src/runtime/verification-gates.ts +85 -5
  66. package/src/runtime/verification-integrity.ts +110 -0
  67. package/src/runtime/verification-worktree.ts +136 -0
  68. package/src/runtime/workspace-lock.ts +448 -0
  69. package/src/schema/config-schema.ts +26 -0
  70. package/src/schema/team-tool-schema.ts +39 -4
  71. package/src/state/atomic-write.ts +9 -0
  72. package/src/state/contracts.ts +14 -0
  73. package/src/state/crew-init.ts +18 -5
  74. package/src/state/event-log.ts +7 -1
  75. package/src/state/state-store.ts +2 -0
  76. package/src/state/types.ts +82 -0
  77. package/src/state/worker-atomic-writer.ts +176 -0
  78. package/src/utils/redaction.ts +104 -24
  79. package/src/workflows/discover-workflows.ts +25 -1
  80. package/src/workflows/workflow-config.ts +13 -0
  81. package/teams/parallel-research.team.md +1 -1
  82. package/workflows/examples/hello.dwf.ts +24 -0
@@ -0,0 +1,448 @@
1
+ /**
2
+ * workspace-lock.ts — Per-cwd workspace lock with startTime-safe liveness (P1g).
3
+ *
4
+ * RFC: research-findings/goal-workflow/13-VISION-RFC.md v0.5 §P1g + D10.
5
+ *
6
+ * Closes #8 (multi-goal clobber) and the B-2 PID-recycling gap. Each
7
+ * `workspaceMode:"single"` goal acquires this lock for its entire lifetime,
8
+ * serializing concurrent goals that share a cwd.
9
+ *
10
+ * Lockfile location: `<crewRoot>/state/workspace-locks/<sha256(absCwd)>.lock`
11
+ * Lockfile contents: { pid, startTime, heartbeat, goalId, acquiredAt }
12
+ *
13
+ * ─── LIVENESS = stale-reconciler startTime pattern (D10, B-2 fix) ───
14
+ * A lock is STALE iff EITHER:
15
+ * (a) the recorded pid's CURRENT startTime ≠ the lockfile startTime
16
+ * (the PID was recycled to a different process), OR
17
+ * (b) the heartbeat is older than HEARTBEAT_STALE_MS (default 60s)
18
+ * (the process crashed without exiting / heartbeat stopped).
19
+ *
20
+ * Why NOT child-pi.ts killProcessPid (B-2): killProcessPid uses
21
+ * process.kill(pid, 0) which is PID-only — vulnerable to PID recycling. The
22
+ * startTime + before/after re-verify pattern is TOCTOU-correct.
23
+ *
24
+ * getProcessStartTime is NOT exported from stale-reconciler.ts, so its logic
25
+ * is REPLICATED here (RFC §P1g explicitly permits importing OR replicating).
26
+ * The replication matches stale-reconciler.ts:112 field-for-field.
27
+ *
28
+ * Granularity: per-goal, held for the goal's lifetime (release() on goal end).
29
+ * Contention: default QUEUE (poll until released or stale);
30
+ * opts.failOnWorkspaceBusy:true → THROW instead of queue.
31
+ */
32
+
33
import { createHash } from "node:crypto";
import {
	closeSync,
	existsSync,
	mkdirSync,
	openSync,
	readFileSync,
	readdirSync,
	realpathSync,
	statSync,
	unlinkSync,
	writeFileSync,
} from "node:fs";
import * as path from "node:path";
import { atomicWriteJson } from "../state/atomic-write.ts";
import { projectCrewRoot, userCrewRoot } from "../utils/paths.ts";
48
+
49
/**
 * Heartbeat staleness threshold (ms). Default 60s per RFC §P1g.
 * Used as the fallback whenever a caller does not pass `heartbeatStaleMs`.
 */
const DEFAULT_HEARTBEAT_STALE_MS = 60_000;

/**
 * Polling interval while queued waiting for a held lock (ms).
 * Used as the fallback whenever a caller does not pass `pollMs`.
 */
const DEFAULT_LOCK_POLL_MS = 500;
54
+
55
/**
 * Maps a pid to a number identifying when that process started, or
 * `undefined` when this cannot be determined (process gone, or no /proc).
 *
 * The value is only ever compared for equality against the value recorded in
 * a lockfile, so its absolute meaning matters less than it being stable for
 * one PID lifecycle and different after the PID is recycled.
 *
 * Callers (especially tests) may inject their own resolver to simulate PID
 * recycling deterministically without spawning real processes.
 */
export type StartTimeResolver = (pid: number) => number | undefined;

/**
 * Default resolver backed by `/proc/<pid>/stat` (same approach as
 * src/runtime/stale-reconciler.ts:112).
 *
 * @param pid Process id to look up.
 * @returns The start time scaled from clock ticks, or `undefined` if the stat
 *          file cannot be read or does not contain a numeric starttime field.
 */
export const defaultStartTimeResolver: StartTimeResolver = (pid: number): number | undefined => {
	let statLine: string;
	try {
		statLine = readFileSync(`/proc/${pid}/stat`, "utf-8");
	} catch {
		// No such process, or no /proc on this platform.
		return undefined;
	}
	// The comm field is wrapped in parentheses and may itself contain spaces or
	// ")" characters, so split only what follows the LAST closing parenthesis.
	const commEnd = statLine.lastIndexOf(")");
	if (commEnd === -1) return undefined;
	const trailingFields = statLine.slice(commEnd + 1).trim().split(/\s+/);
	// starttime is at index 19 (the 20th field after comm) of /proc/<pid>/stat.
	const ticks = Number(trailingFields[19]);
	// Scale ticks by 10 (i.e. assume ~100 ticks per second). Only uniqueness
	// per PID lifecycle matters, not the exact unit.
	return Number.isFinite(ticks) ? Math.floor(ticks * 10) : undefined;
};
82
+
83
/**
 * Lockfile contents (persisted as JSON).
 *
 * Because the value goes through JSON.stringify, a `startTime` of `undefined`
 * is simply absent from the file on disk.
 */
export interface WorkspaceLockContents {
	/** Pid of the process that claimed the lock. */
	pid: number;
	/** Start time resolved for `pid` at acquire; undefined when it could not be resolved. */
	startTime: number | undefined;
	/** Timestamp (ms) compared against the heartbeat-staleness threshold. */
	heartbeat: number;
	/** Id of the goal that owns the lock. */
	goalId: string;
	/** ISO-8601 timestamp recorded when the lock was claimed. */
	acquiredAt: string;
}
91
+
92
/**
 * Opaque handle returned by acquireWorkspaceLock. Call release() to free the
 * lock when the goal ends. release() is a no-op if the lock was already
 * reclaimed/re-acquired by another goal (guarded by goalId + pid + startTime).
 */
export interface WorkspaceLockHandle {
	/** The cwd exactly as passed to acquireWorkspaceLock. */
	readonly cwd: string;
	/** The goal that owns this handle. */
	readonly goalId: string;
	/** Filesystem path of the lockfile backing this handle. */
	readonly lockPath: string;
	/** The startTime value written to the lockfile at acquire (release guard). */
	readonly startTime: number | undefined;
	/** Best-effort release; never throws. */
	release(): void;
}
105
+
106
/** Options accepted by acquireWorkspaceLock. Every field is optional. */
export interface AcquireWorkspaceLockOptions {
	/** Throw instead of queue when the workspace is already held (default: queue). */
	failOnWorkspaceBusy?: boolean;
	/** Override the heartbeat-staleness threshold (ms). Default 60_000. */
	heartbeatStaleMs?: number;
	/** Override the polling interval while queued (ms). Default 500. */
	pollMs?: number;
	/** Test injection: override process start time resolution. */
	startTimeResolver?: StartTimeResolver;
	/** Test injection: override current time (ms). Default Date.now(). */
	now?: () => number;
	/** Test injection: override the current pid. Default process.pid. */
	pid?: number;
	/** Abort waiting when this signal aborts (the pending acquire rejects). */
	signal?: AbortSignal;
}
122
+
123
/**
 * Resolve the lockfile path for a cwd. Lockfiles live under the project's
 * `.crew/state/workspace-locks/` (or user crew-root fallback) and are named by
 * the sha256 of the canonical absolute cwd, which avoids filesystem-unsafe
 * characters in the file name.
 *
 * The cwd is canonicalized with `realpathSync` so that symlink-equivalent
 * paths (two different spellings of the same directory) map to the SAME
 * lockfile. `path.resolve` alone is purely lexical and would give each alias
 * its own lock, defeating the serialization this module exists for. If the
 * directory does not exist or cannot be resolved, the lexical absolute path is
 * used instead.
 *
 * @param cwd Workspace directory (absolute or relative to process.cwd()).
 * @returns Absolute path of the lockfile for that workspace.
 */
export function workspaceLockPath(cwd: string): string {
	const absCwd = path.resolve(cwd);
	let canonicalCwd = absCwd;
	try {
		canonicalCwd = realpathSync(absCwd);
	} catch {
		// Missing/unreadable directory: keep the lexical path so the function
		// stays total (callers may ask about a cwd that is not created yet).
	}
	const crewRoot = projectCrewRoot(canonicalCwd) ?? userCrewRoot();
	const locksDir = path.join(crewRoot, "state", "workspace-locks");
	const hash = createHash("sha256").update(canonicalCwd).digest("hex");
	return path.join(locksDir, `${hash}.lock`);
}
136
+
137
/**
 * Read + parse a lockfile. Returns undefined if missing/corrupt.
 *
 * "Corrupt" includes JSON that parses but does not have the shape the liveness
 * logic depends on. Without this check a file such as `{}` or `[]` would be
 * returned as a lock whose `heartbeat` is undefined; `now - undefined` is NaN,
 * so the heartbeat comparison could never report it stale and the lock would
 * be treated as live forever.
 *
 * Validated fields: `pid` (integer), `heartbeat` (finite number), `goalId`
 * (string), `startTime` (number or absent/null). `acquiredAt` is not used by
 * any liveness decision in this file and is passed through unchecked.
 *
 * @param lockPath Path of the lockfile to read.
 * @returns The parsed contents, or undefined when the file is missing,
 *          unreadable, not valid JSON, or not a well-formed lock record.
 */
function readLock(lockPath: string): WorkspaceLockContents | undefined {
	let parsed: unknown;
	try {
		// Read directly instead of existsSync-then-read: a file deleted between
		// the two calls is handled by the same catch as any other read failure.
		parsed = JSON.parse(readFileSync(lockPath, "utf-8"));
	} catch {
		return undefined;
	}
	if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) return undefined;

	const record = parsed as Record<string, unknown>;
	const { pid, heartbeat, goalId, startTime } = record;
	if (typeof pid !== "number" || !Number.isInteger(pid)) return undefined;
	if (typeof heartbeat !== "number" || !Number.isFinite(heartbeat)) return undefined;
	if (typeof goalId !== "string") return undefined;
	if (startTime !== undefined && startTime !== null && typeof startTime !== "number") return undefined;

	const lock = parsed as WorkspaceLockContents;
	// JSON has no `undefined`; a non-finite startTime serializes as null.
	// Normalize so ownership comparisons against `undefined` keep working.
	if (startTime === null) return { ...lock, startTime: undefined };
	return lock;
}
148
+
149
/**
 * Overwrite the lockfile atomically (temp+rename+fsync via atomicWriteJson),
 * creating the parent directory first if needed.
 *
 * Intended for HEARTBEAT refresh only: an owner that has already verified it
 * holds the lock may overwrite it to bump the timestamp. It must NOT be used
 * to claim a lock — overwrite semantics are not safe for competing claimers
 * (use claimLock for that).
 *
 * NOTE(review): no caller of this helper is visible in this file — confirm
 * where the heartbeat refresh is wired up.
 */
function writeLock(lockPath: string, contents: WorkspaceLockContents): void {
	const locksDir = path.dirname(lockPath);
	mkdirSync(locksDir, { recursive: true });
	atomicWriteJson(lockPath, contents);
}
158
+
159
/**
 * CLAIM the lockfile atomically via O_EXCL (cold-review #3 NIT #N1 fix).
 *
 * The previous claim used temp+rename (writeLock), which is NOT cross-process
 * atomic — two goals that both observed a free/stale lock in the same tick
 * could both writeLock and both believe they own it (the same class of TOCTOU
 * cold-review #2 caught for CAS). O_EXCL (openSync "wx") IS atomic at the OS
 * level: only one process can create the file.
 *
 * Stale recovery, two flavours:
 *  - `forceOverwrite` — the caller has ALREADY verified (via isLockStale) that
 *    the existing lock is logically stale (e.g. PID recycled — a stronger
 *    signal than mtime age). claimLock unlinks then claims, bypassing the
 *    mtime age check. Without this, a stale-by-PID-recycling lock whose mtime
 *    is recent would never be claimed (EEXIST + young mtime = infinite
 *    re-queue).
 *  - mtime age — when the plain create hits EEXIST and the existing file's
 *    mtime is older than `staleReclaimMs`, it is force-deleted and the create
 *    is retried once.
 *
 * If the file is created but its contents cannot be written, the empty file is
 * removed again before the error propagates, so a failed claim does not leave
 * behind an ownerless lockfile that blocks everyone until it ages out.
 *
 * NOTE(review): unlink-then-create is not atomic as a pair. If another
 * reclaimer has already replaced the stale file before our unlink runs, the
 * unlink removes that fresh lock — confirm whether this window is acceptable.
 *
 * @param lockPath        Lockfile to create.
 * @param contents        Record to persist on success.
 * @param staleReclaimMs  mtime age (ms) beyond which an existing file is reclaimed.
 * @param forceOverwrite  Remove any existing file first (caller verified staleness).
 * @returns true on success, false if the file already exists and is not stale
 *          (or a concurrent claimer won the race).
 * @throws Any filesystem error other than EEXIST on create.
 */
function claimLock(lockPath: string, contents: WorkspaceLockContents, staleReclaimMs: number, forceOverwrite = false): boolean {
	mkdirSync(path.dirname(lockPath), { recursive: true });
	const json = JSON.stringify(contents);
	const tryCreate = (): boolean => {
		let fd: number;
		try {
			fd = openSync(lockPath, "wx"); // O_EXCL — throws EEXIST if it exists.
		} catch (error) {
			if ((error as NodeJS.ErrnoException).code !== "EEXIST") throw error;
			return false;
		}
		try {
			writeFileSync(fd, json);
		} catch (error) {
			// We own the freshly created (still empty) file: clean it up so it
			// does not masquerade as a held lock, then surface the failure.
			try { closeSync(fd); } catch { /* best-effort */ }
			try { unlinkSync(lockPath); } catch { /* best-effort */ }
			throw error;
		}
		closeSync(fd);
		return true;
	};
	if (forceOverwrite) {
		// Caller verified the existing lock is logically stale; remove it and claim. A concurrent
		// reclaimer might re-create between our unlink and our open — that's fine, we lose the race
		// and return false, falling through to the queue path.
		try { unlinkSync(lockPath); } catch { /* best-effort */ }
		return tryCreate();
	}
	if (tryCreate()) return true;
	// Stale recovery by mtime age: if the lockfile is older than staleReclaimMs, force-delete + retry.
	try {
		const stat = statSync(lockPath);
		if (Date.now() - stat.mtimeMs > staleReclaimMs) {
			try { unlinkSync(lockPath); } catch { /* fall through */ }
			return tryCreate();
		}
	} catch { /* fall through to false */ }
	return false;
}
219
+
220
/**
 * Decide whether a lock record is STALE. RFC §P1g + D10 dual-check:
 *   (a) the pid's current start time differs from the recorded one → the PID
 *       was recycled to a different process, OR
 *   (b) the heartbeat is older than `heartbeatStaleMs` → the owner crashed
 *       without exiting, or abandoned the lock.
 *
 * When the recorded start time is undefined the resolver is not consulted at
 * all; when the resolver returns undefined (process gone, or /proc unavailable
 * on non-Linux) the decision falls through to the heartbeat check alone. That
 * is weaker PID-reuse detection — a documented platform limitation, matching
 * stale-reconciler.ts.
 *
 * @param lock             Record read from the lockfile.
 * @param resolveStartTime Resolver for the pid's current start time.
 * @param heartbeatStaleMs Maximum tolerated heartbeat age (ms); strictly greater is stale.
 * @param now              Current time (ms).
 * @returns `{ stale: false }`, or `{ stale: true, reason }` with reason
 *          "pid_recycled" or "heartbeat_stale".
 */
function isLockStale(
	lock: WorkspaceLockContents,
	resolveStartTime: StartTimeResolver,
	heartbeatStaleMs: number,
	now: number,
): { stale: boolean; reason?: string } {
	// (a) PID recycling: only decidable when both sides have a start time.
	const recordedStart = lock.startTime;
	if (recordedStart !== undefined) {
		const liveStart = resolveStartTime(lock.pid);
		const recycled = liveStart !== undefined && liveStart !== recordedStart;
		if (recycled) return { stale: true, reason: "pid_recycled" };
	}
	// (b) Heartbeat age.
	return now - lock.heartbeat > heartbeatStaleMs
		? { stale: true, reason: "heartbeat_stale" }
		: { stale: false };
}
251
+
252
/**
 * Acquire the workspace lock for `goalId` at `cwd`. If the lock is held by a
 * live goal, the default behavior is QUEUE (poll until released or the holder
 * goes stale); with opts.failOnWorkspaceBusy:true, throws instead.
 *
 * Stale locks (PID recycled or heartbeat expired) are reclaimed transparently.
 *
 * The returned handle's release() deletes the lockfile ONLY if it still
 * belongs to this goal+pid+startTime — so a stale handle cannot clobber a
 * lock reclaimed and re-acquired by another goal after this goal went stale.
 *
 * In-process serialization: the read→stale-check→claim sequence is
 * synchronous within one event-loop tick, so concurrent in-process acquires
 * cannot interleave between the read and the claim.
 *
 * NOTE(review): nothing in this function refreshes `heartbeat` after the
 * claim. Presumably the goal runner does so elsewhere — confirm; otherwise a
 * holder that runs longer than `heartbeatStaleMs` would be classified stale
 * by other acquirers.
 *
 * @param cwd    Workspace directory to lock.
 * @param goalId Id of the goal requesting the lock.
 * @param opts   See AcquireWorkspaceLockOptions.
 * @returns A handle whose release() frees the lock.
 * @throws Error "workspace busy: …" when failOnWorkspaceBusy is set and the
 *         lock could not be claimed; Error "workspace lock acquisition
 *         aborted …" when opts.signal aborts while waiting.
 */
export async function acquireWorkspaceLock(
	cwd: string,
	goalId: string,
	opts: AcquireWorkspaceLockOptions = {},
): Promise<WorkspaceLockHandle> {
	const lockPath = workspaceLockPath(cwd);
	const resolveStartTime = opts.startTimeResolver ?? defaultStartTimeResolver;
	const heartbeatStaleMs = opts.heartbeatStaleMs ?? DEFAULT_HEARTBEAT_STALE_MS;
	const pollMs = opts.pollMs ?? DEFAULT_LOCK_POLL_MS;
	const now = opts.now ?? Date.now;
	const pid = opts.pid ?? process.pid;
	// Resolved once: this is both what gets written and what release() checks.
	const writtenStartTime = resolveStartTime(pid);

	while (true) {
		// Poll-loop: re-check the lock each tick until free/stale or aborted.
		if (opts.signal?.aborted) {
			throw new Error(
				`workspace lock acquisition aborted for goal ${goalId} (cwd=${cwd})`,
			);
		}
		const existing = readLock(lockPath);
		// Classify the existing lock: "absent" / "stale" (PID recycled or heartbeat dead) / "live".
		const existingKind: "absent" | "stale" | "live" = !existing
			? "absent"
			: (isLockStale(existing, resolveStartTime, heartbeatStaleMs, now()).stale ? "stale" : "live");
		if (existingKind !== "live") {
			// Claim via O_EXCL (claimLock), NOT temp+rename — two processes racing past the
			// stale check must not both end up believing they own the lock. For a stale lock,
			// forceOverwrite makes claimLock unlink it first; otherwise a stale-by-PID lock with
			// a recent mtime would never pass claimLock's mtime age check and we would re-queue
			// forever.
			const contents: WorkspaceLockContents = {
				pid,
				startTime: writtenStartTime,
				heartbeat: now(),
				goalId,
				acquiredAt: new Date(now()).toISOString(),
			};
			const claimed = claimLock(lockPath, contents, heartbeatStaleMs, existingKind === "stale");
			if (claimed) {
				return {
					cwd,
					goalId,
					lockPath,
					startTime: writtenStartTime,
					release(): void {
						safeRelease(lockPath, goalId, pid, writtenStartTime);
					},
				};
			}
			// claimLock lost the race (another process claimed between our check and our
			// claim). Fall through to the busy path (throw or queue) — re-check next tick.
		}
		if (opts.failOnWorkspaceBusy) {
			// When we got here by losing a claim race, `existing` is either undefined (the
			// lock was absent/unreadable when we looked) or the OLD stale record — neither
			// describes the current holder. Re-read so the message names the real owner, and
			// never dereference a missing record (the winner may still be mid-write).
			const holder = existingKind === "live" ? existing : readLock(lockPath);
			const holderGoal = holder?.goalId ?? "unknown";
			const holderPid = holder?.pid ?? "unknown";
			throw new Error(
				`workspace busy: cwd=${cwd} held by goalId=${holderGoal} (pid=${holderPid})`,
			);
		}
		// Queue: wait for the next poll interval, then re-check.
		await sleepOrAbort(pollMs, opts.signal);
	}
}
335
+
336
/**
 * Remove the lockfile at `lockPath`, but only while it still records the
 * exact owner triple (goalId, pid, startTime) that this handle wrote.
 *
 * A handle can outlive its lock: if the lock was judged stale, reclaimed and
 * re-acquired by another goal, the file now belongs to that goal and must be
 * left alone. Any failure is swallowed — release runs from `finally` blocks
 * and must never throw.
 *
 * @param lockPath         Lockfile to release.
 * @param goalId           Goal id written at acquire.
 * @param pid              Pid written at acquire.
 * @param writtenStartTime Start time written at acquire (may be undefined).
 */
function safeRelease(
	lockPath: string,
	goalId: string,
	pid: number,
	writtenStartTime: number | undefined,
): void {
	try {
		const onDisk = readLock(lockPath);
		if (!onDisk) return;
		const stillOurs =
			onDisk.goalId === goalId &&
			onDisk.pid === pid &&
			onDisk.startTime === writtenStartTime;
		if (stillOurs) unlinkSync(lockPath);
	} catch {
		/* best-effort — release must never throw into a finally block */
	}
}
361
+
362
/**
 * Sleep that resolves after `ms`, or rejects early if `signal` aborts.
 *
 * The abort listener is removed again when the timer fires. This helper is
 * called once per poll tick with the SAME long-lived signal, so leaving the
 * listener attached would accumulate one listener per tick for as long as the
 * caller stays queued. A signal that is already aborted rejects immediately
 * instead of waiting out the full interval (its "abort" event has already
 * been dispatched and would never reach a newly added listener).
 *
 * @param ms     Delay in milliseconds.
 * @param signal Optional abort signal.
 * @returns Promise resolved after the delay.
 * @throws (rejects) Error "workspace lock acquisition aborted" on abort.
 */
function sleepOrAbort(ms: number, signal?: AbortSignal): Promise<void> {
	if (!signal) return new Promise<void>((resolve) => setTimeout(resolve, ms));
	return new Promise<void>((resolve, reject) => {
		const abortError = (): Error => new Error("workspace lock acquisition aborted");
		if (signal.aborted) {
			reject(abortError());
			return;
		}
		const onAbort = (): void => {
			clearTimeout(timer);
			reject(abortError());
		};
		const timer = setTimeout(() => {
			// Normal completion: detach so listeners do not pile up on the signal.
			signal.removeEventListener("abort", onAbort);
			resolve();
		}, ms);
		signal.addEventListener("abort", onAbort, { once: true });
	});
}
377
+
378
/**
 * Peek whether the workspace is currently locked by a live owner, without
 * acquiring anything. Used by `goal start` / `goal resume` to fail fast with a
 * clear error BEFORE spawning.
 *
 * @param cwd  Workspace directory to inspect.
 * @param opts Optional overrides for start-time resolution, the heartbeat
 *             staleness threshold, and the clock.
 * @returns The goalId of the current owner if busy; undefined if the lock is
 *          missing, unreadable, or stale.
 */
export function isWorkspaceBusy(
	cwd: string,
	opts: { startTimeResolver?: StartTimeResolver; heartbeatStaleMs?: number; now?: () => number } = {},
): string | undefined {
	const lockPath = workspaceLockPath(cwd);
	const resolver = opts.startTimeResolver ?? defaultStartTimeResolver;
	const staleAfterMs = opts.heartbeatStaleMs ?? DEFAULT_HEARTBEAT_STALE_MS;
	const clock = opts.now ?? Date.now;
	const holder = readLock(lockPath);
	if (!holder) return undefined;
	const verdict = isLockStale(holder, resolver, staleAfterMs, clock());
	return verdict.stale ? undefined : holder.goalId;
}

/**
 * Reclaim all stale locks under `dir` (the workspace-locks directory). Stale =
 * PID recycled OR heartbeat older than threshold. Locks that readLock cannot
 * return (corrupt/unreadable) are also reclaimed. Only entries whose name ends
 * in ".lock" are considered.
 *
 * Useful as a startup or periodic sweep to clear locks left by crashed
 * processes before any goal tries to acquire them.
 *
 * NOTE(review): a lockfile that another process has just created via O_EXCL
 * but not yet written reads as corrupt and would be removed here — confirm
 * that sweeps never run concurrently with acquires.
 *
 * @param dir  Directory containing `*.lock` files.
 * @param opts Optional overrides for the heartbeat threshold, start-time
 *             resolution, and the clock.
 * @returns Paths of the lockfiles that were actually deleted (a failed delete
 *          is skipped silently). Empty if `dir` is missing or unreadable.
 */
export function reclaimStaleLocks(
	dir: string,
	opts: {
		heartbeatStaleMs?: number;
		startTimeResolver?: StartTimeResolver;
		now?: () => number;
	} = {},
): string[] {
	const resolver = opts.startTimeResolver ?? defaultStartTimeResolver;
	const staleAfterMs = opts.heartbeatStaleMs ?? DEFAULT_HEARTBEAT_STALE_MS;
	const clock = opts.now ?? Date.now;
	const reclaimed: string[] = [];
	if (!existsSync(dir)) return reclaimed;

	let names: string[];
	try {
		names = readdirSync(dir);
	} catch {
		return reclaimed;
	}

	// Delete one lockfile, recording it only when the unlink succeeded.
	const remove = (lockPath: string): void => {
		try {
			unlinkSync(lockPath);
			reclaimed.push(lockPath);
		} catch {
			/* best-effort */
		}
	};

	for (const name of names) {
		if (!name.endsWith(".lock")) continue;
		const lockPath = path.join(dir, name);
		const lock = readLock(lockPath);
		// Corrupt/empty files are reclaimed outright; readable ones only when stale.
		const shouldReclaim = !lock || isLockStale(lock, resolver, staleAfterMs, clock()).stale;
		if (shouldReclaim) remove(lockPath);
	}
	return reclaimed;
}
@@ -59,6 +59,31 @@ export const PiTeamsWorktreeConfigSchema = Type.Object({
59
59
  seedPaths: Type.Optional(Type.Array(Type.String({ minLength: 1 }))),
60
60
  }, { additionalProperties: false });
61
61
 
62
/**
 * Goal-wrap config (RFC v0.5 vision: apply `goal` completion-guarantee to builtin workflows).
 * Per-workflow toggle. When enabled, a builtin workflow runs as the WORKER TURN inside a
 * goal loop (worker → judge → feedback → redo until achieved / maxTurns / budget / stuck).
 * Default OFF — opt-in per workflow. Only applies to builtin workflows that have a clear
 * 'done' condition (implementation, fast-fix). Read-only workflows (review, research) are
 * not goal-wrappable.
 *
 * Every field is optional and unknown keys are rejected (additionalProperties: false).
 */
export const GoalWrapWorkflowConfigSchema = Type.Object({
	// Opt-in switch for this workflow.
	enabled: Type.Optional(Type.Boolean()),
	// Integer in [1, 50]; presumably the cap on goal-loop turns — confirm against the runner.
	maxTurns: Type.Optional(Type.Integer({ minimum: 1, maximum: 50 })),
	// Non-empty string; presumably the model id used by the judge/evaluator — confirm.
	evaluatorModel: Type.Optional(Type.String({ minLength: 1 })),
	// When present, `commands` is required (array of non-empty strings); `mode`, if given,
	// must be the literal "text-only". Unknown keys are rejected here as well.
	verification: Type.Optional(Type.Object({
		commands: Type.Array(Type.String({ minLength: 1 })),
		mode: Type.Optional(Type.Literal("text-only")),
	}, { additionalProperties: false })),
	// Integer >= 1000.
	budgetTotal: Type.Optional(Type.Integer({ minimum: 1000 })),
	// Boolean; no cross-field rule with budgetTotal is expressed in this schema.
	budgetUnlimited: Type.Optional(Type.Boolean()),
}, { additionalProperties: false });
81
+
82
/**
 * Map from a non-empty string key to a GoalWrapWorkflowConfigSchema entry.
 * Presumably the key is the workflow name (goal-wrap is a per-workflow
 * toggle) — confirm against the config loader.
 */
export const PiTeamsGoalWrapConfigSchema = Type.Record(
	Type.String({ minLength: 1 }),
	GoalWrapWorkflowConfigSchema,
);
86
+
62
87
  export const AgentOverrideSchema = Type.Object({
63
88
  disabled: Type.Optional(Type.Boolean()),
64
89
  model: Type.Optional(Type.Union([Type.String({ minLength: 1 }), Type.Literal(false)])),
@@ -152,6 +177,7 @@ export const PiTeamsConfigSchema = Type.Object({
152
177
  runtime: Type.Optional(PiTeamsRuntimeConfigSchema),
153
178
  control: Type.Optional(PiTeamsControlConfigSchema),
154
179
  worktree: Type.Optional(PiTeamsWorktreeConfigSchema),
180
+ goalWrap: Type.Optional(PiTeamsGoalWrapConfigSchema),
155
181
  agents: Type.Optional(PiTeamsAgentsConfigSchema),
156
182
  tools: Type.Optional(PiTeamsToolsConfigSchema),
157
183
  telemetry: Type.Optional(PiTeamsTelemetryConfigSchema),
@@ -72,6 +72,12 @@ export const TeamToolParams = Type.Object({
72
72
  Type.Literal("anchor"),
73
73
  Type.Literal("auto-summarize"),
74
74
  Type.Literal("auto_boomerang"),
75
+ Type.Literal("goal"),
76
+ Type.Literal("workflow-create"),
77
+ Type.Literal("workflow-get"),
78
+ Type.Literal("workflow-list"),
79
+ Type.Literal("workflow-save"),
80
+ Type.Literal("workflow-delete"),
75
81
  ],
76
82
  { description: "Team action. Defaults to 'list' when omitted." },
77
83
  ),
@@ -244,8 +250,14 @@ export const TeamToolParams = Type.Object({
244
250
  budgetTotal: Type.Optional(
245
251
  Type.Number({
246
252
  description:
247
- "Total token budget for the run. When set, enables budget tracking with default 80% warning and 95% abort thresholds.",
248
- minimum: 1,
253
+ "Total token budget for the run. When set, enables budget tracking with default 80% warning and 95% abort thresholds. Minimum 1000 — this is a MISCONFIGURATION GUARD (catches typos / silent-abort configs like budgetTotal:1, which would abort on turn 1), NOT a usefulness guarantee; a productive multi-turn goal needs far more than 1000 tokens.",
254
+ minimum: 1000,
255
+ }),
256
+ ),
257
+ budgetUnlimited: Type.Optional(
258
+ Type.Boolean({
259
+ description:
260
+ "When true, skip budget enforcement entirely (explicit opt-out). Goal-start validation requires budgetTotal>=1000 OR budgetUnlimited:true; audit-logged when set. The validation itself is enforced in a later integration task.",
249
261
  }),
250
262
  ),
251
263
  budgetWarning: Type.Optional(
@@ -264,6 +276,19 @@ export const TeamToolParams = Type.Object({
264
276
  maximum: 1,
265
277
  }),
266
278
  ),
279
+ runKind: Type.Optional(
280
+ Type.Union(
281
+ [
282
+ Type.Literal("team-run"),
283
+ Type.Literal("goal-loop"),
284
+ Type.Literal("dynamic-workflow"),
285
+ ],
286
+ {
287
+ description:
288
+ "Background dispatch discriminator. Default \"team-run\" runs the normal executeTeamRun workflow; \"goal-loop\" (P0/P1) and \"dynamic-workflow\" (P2/P3) dispatch to their respective background runners. Absent = \"team-run\" for backward compatibility.",
289
+ },
290
+ ),
291
+ ),
267
292
  });
268
293
 
269
294
  export interface TeamToolParamsValue {
@@ -312,7 +337,13 @@ export interface TeamToolParamsValue {
312
337
  | "search"
313
338
  | "orchestrate"
314
339
  | "schedule"
315
- | "scheduled";
340
+ | "scheduled"
341
+ | "goal"
342
+ | "workflow-create"
343
+ | "workflow-get"
344
+ | "workflow-list"
345
+ | "workflow-save"
346
+ | "workflow-delete";
316
347
  resource?: "agent" | "team" | "workflow";
317
348
  team?: string;
318
349
  workflow?: string;
@@ -352,10 +383,14 @@ export interface TeamToolParamsValue {
352
383
  once?: string | number;
353
384
  /** Mark certain bash commands as excludeFromContext to reduce context tokens (default: false). */
354
385
  excludeContextBash?: boolean;
355
- /** Total token budget for the run. When set, enables budget tracking. */
386
+ /** Total token budget for the run. When set, enables budget tracking (minimum 1000). */
356
387
  budgetTotal?: number;
388
+ /** When true, skip budget enforcement entirely (explicit opt-out). */
389
+ budgetUnlimited?: boolean;
357
390
  /** Budget warning threshold as a fraction (0-1). Default: 0.8. */
358
391
  budgetWarning?: number;
359
392
  /** Budget abort threshold as a fraction (0-1). Default: 0.95. */
360
393
  budgetAbort?: number;
394
+ /** Background dispatch discriminator. Default "team-run". "goal-loop"/"dynamic-workflow" dispatch to their runners (P0/P2). */
395
+ runKind?: "team-run" | "goal-loop" | "dynamic-workflow";
361
396
  }
@@ -3,6 +3,7 @@ import * as fs from "node:fs";
3
3
  import * as os from "node:os";
4
4
  import * as path from "node:path";
5
5
  import { logInternalError } from "../utils/internal-error.ts";
6
+ import { isWorkerAtomicWriterEnabled, atomicWriteFileViaWorker } from "./worker-atomic-writer.ts";
6
7
  import { sleepSync } from "../utils/sleep.ts";
7
8
 
8
9
  function hashContent(content: string): string {
@@ -380,6 +381,14 @@ export function atomicWriteFile(filePath: string, content: string, expectedHash?
380
381
 
381
382
 
382
383
  export async function atomicWriteFileAsync(filePath: string, content: string): Promise<void> {
384
+ // Phase 1.5 (RFC 15): when the worker-thread atomic writer is enabled
385
+ // (PI_CREW_WORKER_ATOMIC_WRITER=1), dispatch to a dedicated worker thread
386
+ // that performs SYNC fs ops with no internal yields. Mitigates the
387
+ // non-deterministic V8/libuv crash during event-loop yields in multi-step
388
+ // goal-wrapped workflows.
389
+ if (isWorkerAtomicWriterEnabled()) {
390
+ return atomicWriteFileViaWorker(filePath, content);
391
+ }
383
392
  if (!isSymlinkSafePath(filePath)) throw new Error(`Refusing to write: target is a symlink or inside untrusted directory: ${filePath}`);
384
393
  await fs.promises.mkdir(path.dirname(filePath), { recursive: true });
385
394
  const tempPath = `${filePath}.${crypto.randomUUID()}.tmp`;
@@ -77,6 +77,20 @@ const TEAM_EVENT_TYPES = [
77
77
  "phase.completed",
78
78
  "phase.skipped",
79
79
  "phase.failed",
80
+ // Goal loop events (P0/P1) — autonomous goal-loop coordinator.
81
+ "goal.loop_start",
82
+ "goal.turn_start",
83
+ "goal.turn_evaluated",
84
+ "goal.budget_warning",
85
+ "goal.loop_end",
86
+ "goal.feedback_steered",
87
+ "goal.state_changed",
88
+ // Dynamic workflow events (P2) — script-driven orchestration.
89
+ "dwf.started",
90
+ "dwf.phase_started",
91
+ "dwf.phase_completed",
92
+ "dwf.completed",
93
+ "dwf.failed",
80
94
  ] as const;
81
95
  export type TeamEventType = typeof TEAM_EVENT_TYPES[number];
82
96
 
@@ -22,8 +22,19 @@ import { updateGitignore } from "./gitignore-manager.ts";
22
22
  // Re-export updateGitignore for backwards compatibility with tests.
23
23
  export { updateGitignore };
24
24
 
25
- /** README content for the .crew directory. */
26
- const CREW_README = `# .crew pi-crew Runtime Directory
25
+ /**
26
+ * README content for the .crew directory.
27
+ *
28
+ * Defined as a function (not a `const`) to avoid the Temporal Dead Zone race
29
+ * documented in issue #28 + RFC 17. When this module is loaded via
30
+ * `jiti.import()` (pi's extension loader) wrapped in an async function, a
31
+ * `const` initializer can be hit in TDZ by functions hoisted above it (the
32
+ * same pattern that bit `crewInitPromise` in team-tool/run.ts — see commit
33
+ * fixing it). A function declaration is hoisted with its body available
34
+ * immediately, so callers always get the fully-built string.
35
+ */
36
+ function buildCrewReadme(): string {
37
+ return `# .crew — pi-crew Runtime Directory
27
38
 
28
39
  This directory contains pi-crew runtime state and artifacts.
29
40
 
@@ -50,7 +61,7 @@ To clear cache:
50
61
  team action='cache' action='clear'
51
62
  \`\`\`
52
63
  `;
53
-
64
+ }
54
65
  /**
55
66
  * Find the project root by walking up from start directory.
56
67
  * Inline implementation to avoid module dependency on paths.ts.
@@ -249,13 +260,15 @@ export async function ensureCrewDirectory(cwd: string): Promise<void> {
249
260
  }
250
261
 
251
262
  // 3. Write README.md (always overwrite to keep it current)
252
- fs.writeFileSync(safeJoin(crewRoot, "README.md"), CREW_README, "utf-8");
263
+ fs.writeFileSync(safeJoin(crewRoot, "README.md"), buildCrewReadme(), "utf-8");
253
264
 
254
265
  // 4. Update .gitignore at project root
255
266
  const repoRoot = findProjectRoot(cwd);
256
267
  if (repoRoot) {
257
268
  const gitignorePath = safeJoin(repoRoot, ".gitignore");
258
- await updateGitignore(gitignorePath);
269
+ // LAZY: dodge the jiti ESM/CJS interop TDZ race on the static `import { updateGitignore }` above (issue #28, RFC 17). At this point the module body has fully evaluated, so the dynamic import resolves to a live binding.
270
+ const { updateGitignore: updateGitignoreFn } = await import("./gitignore-manager.ts");
271
+ await updateGitignoreFn(gitignorePath);
259
272
  }
260
273
  }
261
274
 
@@ -1,6 +1,7 @@
1
1
  import { createHash } from "node:crypto";
2
2
  import * as fs from "node:fs";
3
3
  import * as path from "node:path";
4
+ import { isWorkerAtomicWriterEnabled, appendFileViaWorker } from "./worker-atomic-writer.ts";
4
5
  import { DEFAULT_EVENT_LOG } from "../config/defaults.ts";
5
6
  import { atomicWriteFile } from "./atomic-write.ts";
6
7
  import { errors } from "../errors.ts";
@@ -443,7 +444,12 @@ export async function appendEventAsync(eventsPath: string, event: AppendTeamEven
443
444
 
444
445
  if (!skippedDueToSize) {
445
446
  const line = JSON.stringify(redactSecrets(fullEvent)) + "\n";
446
- await fs.promises.appendFile(eventsPath, line, { encoding: "utf-8", flag: "a" });
447
+ // Phase 1.5: when worker atomic writer is enabled, append via worker.
448
+ if (isWorkerAtomicWriterEnabled()) {
449
+ await appendFileViaWorker(eventsPath, line);
450
+ } else {
451
+ await fs.promises.appendFile(eventsPath, line, { encoding: "utf-8", flag: "a" });
452
+ }
447
453
  // FIX: fsync to ensure event content is flushed to disk before persisting
448
454
  // the sequence number. This closes the crash window between appendFile and
449
455
  // persistSequence where sequence reuse could occur on restart.