pi-crew 0.8.14 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +271 -0
- package/README.md +112 -2
- package/docs/FEATURE_INTAKE.md +1 -1
- package/docs/HARNESS.md +20 -19
- package/docs/PROJECT_REVIEW.md +132 -133
- package/docs/PROJECT_REVIEW_FIXES.md +130 -131
- package/docs/actions-reference.md +127 -121
- package/docs/architecture.md +1 -1
- package/docs/code-review-2026-05-11.md +134 -134
- package/docs/commands-reference.md +108 -106
- package/docs/comparison-pi-subagents-vs-pi-crew.md +105 -105
- package/docs/deep-review-report.md +1 -1
- package/docs/dynamic-workflows.md +90 -0
- package/docs/fixes/BATCH_A_H1_H2.md +17 -17
- package/docs/fixes/bug-007-async-notifier-stale-ctx.md +23 -23
- package/docs/followup-plan-2026-05-12.md +135 -135
- package/docs/followup-review-2026-05-12.md +86 -86
- package/docs/followup-review-round3-2026-05-12.md +123 -123
- package/docs/goals.md +59 -0
- package/docs/implementation-plan-top3.md +4 -4
- package/docs/issue-29-analysis.md +2 -2
- package/docs/oh-my-pi-research.md +154 -154
- package/docs/optimization-plan.md +2 -0
- package/docs/perf/baseline-2026-05.md +9 -9
- package/docs/perf/final-report-2026-05.md +2 -2
- package/docs/perf/sprint-1-report.md +2 -2
- package/docs/perf/sprint-2-report.md +1 -1
- package/docs/perf/upgrade-plan-2026-05.md +72 -72
- package/docs/pi-crew-bugs.md +230 -230
- package/docs/pi-crew-investigation-report.md +102 -102
- package/docs/pi-crew-test-round5.md +4 -4
- package/docs/runtime-analysis-child-vs-live.md +57 -57
- package/docs/runtime-migration-in-process-analysis.md +97 -97
- package/package.json +2 -4
- package/skills/orchestration/SKILL.md +11 -11
- package/src/agents/agent-config.ts +4 -0
- package/src/config/config.ts +39 -0
- package/src/config/types.ts +11 -0
- package/src/extension/action-suggestions.ts +2 -1
- package/src/extension/async-notifier.ts +10 -0
- package/src/extension/help.ts +14 -0
- package/src/extension/registration/commands.ts +27 -0
- package/src/extension/team-tool/destructive-gate.ts +1 -1
- package/src/extension/team-tool/goal-wrap.ts +288 -0
- package/src/extension/team-tool/goal.ts +405 -0
- package/src/extension/team-tool/run.ts +103 -4
- package/src/extension/team-tool/workflow-manage.ts +194 -0
- package/src/extension/team-tool.ts +20 -0
- package/src/hooks/types.ts +3 -1
- package/src/runtime/async-runner.ts +24 -2
- package/src/runtime/background-runner.ts +68 -19
- package/src/runtime/child-pi.ts +6 -1
- package/src/runtime/completion-guard.ts +1 -1
- package/src/runtime/dynamic-workflow-context.ts +450 -0
- package/src/runtime/dynamic-workflow-runner.ts +180 -0
- package/src/runtime/global-worker-cap.ts +96 -0
- package/src/runtime/goal-evaluator.ts +294 -0
- package/src/runtime/goal-loop-runner.ts +612 -0
- package/src/runtime/goal-state-store.ts +209 -0
- package/src/runtime/pi-args.ts +10 -2
- package/src/runtime/result-extractor.ts +32 -0
- package/src/runtime/team-runner.ts +11 -1
- package/src/runtime/verification-gates.ts +85 -5
- package/src/runtime/verification-integrity.ts +110 -0
- package/src/runtime/verification-worktree.ts +136 -0
- package/src/runtime/workspace-lock.ts +448 -0
- package/src/schema/config-schema.ts +26 -0
- package/src/schema/team-tool-schema.ts +39 -4
- package/src/state/atomic-write.ts +9 -0
- package/src/state/contracts.ts +14 -0
- package/src/state/crew-init.ts +18 -5
- package/src/state/event-log.ts +7 -1
- package/src/state/state-store.ts +2 -0
- package/src/state/types.ts +82 -0
- package/src/state/worker-atomic-writer.ts +176 -0
- package/src/utils/redaction.ts +104 -24
- package/src/workflows/discover-workflows.ts +25 -1
- package/src/workflows/workflow-config.ts +13 -0
- package/teams/parallel-research.team.md +1 -1
- package/workflows/examples/hello.dwf.ts +24 -0
|
@@ -0,0 +1,448 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* workspace-lock.ts — Per-cwd workspace lock with startTime-safe liveness (P1g).
|
|
3
|
+
*
|
|
4
|
+
* RFC: research-findings/goal-workflow/13-VISION-RFC.md v0.5 §P1g + D10.
|
|
5
|
+
*
|
|
6
|
+
* Closes #8 (multi-goal clobber) and the B-2 PID-recycling gap. Each
|
|
7
|
+
* `workspaceMode:"single"` goal acquires this lock for its entire lifetime,
|
|
8
|
+
* serializing concurrent goals that share a cwd.
|
|
9
|
+
*
|
|
10
|
+
* Lockfile location: `<crewRoot>/state/workspace-locks/<sha256(absCwd)>.lock`
|
|
11
|
+
* Lockfile contents: { pid, startTime, heartbeat, goalId, acquiredAt }
|
|
12
|
+
*
|
|
13
|
+
* ─── LIVENESS = stale-reconciler startTime pattern (D10, B-2 fix) ───
|
|
14
|
+
* A lock is STALE iff EITHER:
|
|
15
|
+
* (a) the recorded pid's CURRENT startTime ≠ the lockfile startTime
|
|
16
|
+
* (the PID was recycled to a different process), OR
|
|
17
|
+
* (b) the heartbeat is older than HEARTBEAT_STALE_MS (default 60s)
|
|
18
|
+
* (the process crashed without exiting / heartbeat stopped).
|
|
19
|
+
*
|
|
20
|
+
* Why NOT child-pi.ts killProcessPid (B-2): killProcessPid uses
|
|
21
|
+
* process.kill(pid, 0) which is PID-only — vulnerable to PID recycling. The
|
|
22
|
+
* startTime + before/after re-verify pattern is TOCTOU-correct.
|
|
23
|
+
*
|
|
24
|
+
* getProcessStartTime is NOT exported from stale-reconciler.ts, so its logic
|
|
25
|
+
* is REPLICATED here (RFC §P1g explicitly permits importing OR replicating).
|
|
26
|
+
* The replication matches stale-reconciler.ts:112 field-for-field.
|
|
27
|
+
*
|
|
28
|
+
* Granularity: per-goal, held for the goal's lifetime (release() on goal end).
|
|
29
|
+
* Contention: default QUEUE (poll until released or stale);
|
|
30
|
+
* opts.failOnWorkspaceBusy:true → THROW instead of queue.
|
|
31
|
+
*/
|
|
32
|
+
|
|
33
|
+
import { createHash } from "node:crypto";
|
|
34
|
+
import {
|
|
35
|
+
existsSync,
|
|
36
|
+
mkdirSync,
|
|
37
|
+
readFileSync,
|
|
38
|
+
readdirSync,
|
|
39
|
+
unlinkSync,
|
|
40
|
+
openSync,
|
|
41
|
+
closeSync,
|
|
42
|
+
statSync,
|
|
43
|
+
writeFileSync,
|
|
44
|
+
} from "node:fs";
|
|
45
|
+
import * as path from "node:path";
|
|
46
|
+
import { atomicWriteJson } from "../state/atomic-write.ts";
|
|
47
|
+
import { projectCrewRoot, userCrewRoot } from "../utils/paths.ts";
|
|
48
|
+
|
|
49
|
+
/** A lock whose heartbeat is older than this many milliseconds counts as stale (60 s, per RFC §P1g). */
const DEFAULT_HEARTBEAT_STALE_MS = 60 * 1000;

/** Milliseconds to wait between re-checks while queued behind a held lock. */
const DEFAULT_LOCK_POLL_MS = 500;
|
|
54
|
+
|
|
55
|
+
/**
 * Maps a pid to a value identifying that process's start time, or `undefined`
 * when it cannot be determined. The value is only compared for equality to
 * detect PID recycling (a recycled pid reports a different start time than the
 * one recorded in the lockfile), so uniqueness per PID lifecycle matters more
 * than the absolute number.
 *
 * Callers (especially tests) may inject their own resolver to simulate PID
 * recycling deterministically without spawning real processes.
 */
export type StartTimeResolver = (pid: number) => number | undefined;

/**
 * Default resolver: reads `/proc/<pid>/stat` and returns the `starttime` field
 * multiplied by 10 (i.e. clock ticks converted to ms assuming 100 ticks/s),
 * floored to an integer. Mirrors the pattern the file header attributes to
 * src/runtime/stale-reconciler.ts.
 *
 * @param pid Process id to inspect.
 * @returns The derived start-time value, or `undefined` when the stat file
 *   cannot be read (process gone, or no /proc on this platform), has no
 *   closing `)` after the command name, or carries a non-numeric starttime.
 */
export const defaultStartTimeResolver: StartTimeResolver = (pid: number): number | undefined => {
	let statLine: string;
	try {
		statLine = readFileSync(`/proc/${pid}/stat`, "utf-8");
	} catch {
		return undefined;
	}
	// The command name is parenthesised and may itself contain spaces or ')',
	// so anchor on the LAST ')' and only split what follows it.
	const commEnd = statLine.lastIndexOf(")");
	if (commEnd < 0) return undefined;
	const tail = statLine.slice(commEnd + 1).trim().split(/\s+/);
	// starttime is the 20th field after comm → index 19.
	const ticks = Number(tail[19]);
	return Number.isFinite(ticks) ? Math.floor(ticks * 10) : undefined;
};
|
|
82
|
+
|
|
83
|
+
/** Shape of the JSON document persisted in a workspace lockfile. */
export interface WorkspaceLockContents {
	/** Pid of the process that claimed the lock. */
	pid: number;
	/** Start time resolved for `pid` when the lock was claimed; undefined if it could not be resolved. */
	startTime: number | undefined;
	/** Timestamp in ms that the staleness check compares against the current time. */
	heartbeat: number;
	/** Id of the goal that owns the lock. */
	goalId: string;
	/** ISO-8601 timestamp of the moment the lock was claimed. */
	acquiredAt: string;
}
|
|
91
|
+
|
|
92
|
+
/**
 * Handle returned by acquireWorkspaceLock. Call `release()` when the goal ends
 * to free the lock. `release()` does nothing if the lockfile no longer carries
 * this handle's goalId + pid + startTime (i.e. the lock was reclaimed and
 * re-acquired by another goal in the meantime).
 */
export interface WorkspaceLockHandle {
	/** The cwd the lock was acquired for, as passed by the caller. */
	readonly cwd: string;
	/** Id of the goal holding the lock. */
	readonly goalId: string;
	/** Path of the lockfile backing this handle. */
	readonly lockPath: string;
	/** The startTime value written to the lockfile at acquire (used as a release guard). */
	readonly startTime: number | undefined;
	/** Free the lock if this handle still owns it. */
	release(): void;
}
|
|
105
|
+
|
|
106
|
+
/** Tuning knobs and test seams accepted by acquireWorkspaceLock. */
export interface AcquireWorkspaceLockOptions {
	/** When true, throw instead of queueing if the workspace is already held. Default: queue. */
	failOnWorkspaceBusy?: boolean;
	/** Heartbeat age (ms) beyond which a lock counts as stale; overrides the built-in default. */
	heartbeatStaleMs?: number;
	/** Delay (ms) between re-checks while queued; overrides the built-in default. */
	pollMs?: number;
	/** Test seam: replaces process start-time resolution. */
	startTimeResolver?: StartTimeResolver;
	/** Test seam: replaces the clock (ms). Defaults to Date.now. */
	now?: () => number;
	/** Test seam: replaces the current pid. Defaults to process.pid. */
	pid?: number;
	/** Stops waiting for the lock once this signal aborts. */
	signal?: AbortSignal;
}
|
|
122
|
+
|
|
123
|
+
/**
 * Compute the lockfile path for `cwd`.
 *
 * The lockfile lives in `<crewRoot>/state/workspace-locks/`, where crewRoot is
 * the project crew root for the cwd or, when that is nullish, the user crew
 * root. The file is named `<sha256 hex of the absolute cwd>.lock`, which keeps
 * filesystem-unsafe characters out of the name.
 *
 * NOTE: the cwd is made absolute with `path.resolve`, which does not resolve
 * symlinks — two different paths that reach the same directory through a
 * symlink hash to different lockfiles.
 *
 * @param cwd Workspace directory (absolute or relative to process.cwd()).
 * @returns Path of the lockfile for that workspace.
 */
export function workspaceLockPath(cwd: string): string {
	const absoluteCwd = path.resolve(cwd);
	const root = projectCrewRoot(absoluteCwd) ?? userCrewRoot();
	const fileName = `${createHash("sha256").update(absoluteCwd).digest("hex")}.lock`;
	return path.join(root, "state", "workspace-locks", fileName);
}
|
|
136
|
+
|
|
137
|
+
/**
 * Read and parse a lockfile.
 *
 * Returns `undefined` when the file is missing, unreadable, not valid JSON, or
 * valid JSON that does not have the shape the liveness logic relies on
 * (`pid`: finite number, `heartbeat`: finite number, `goalId`: string,
 * `startTime`: number or absent). Rejecting shape-invalid files matters: a
 * lockfile containing e.g. `{}` would otherwise be cast to
 * WorkspaceLockContents, its heartbeat age would evaluate to NaN, and the lock
 * would never be judged stale — acquirers would queue behind it forever.
 * `acquiredAt` is informational and is not validated.
 *
 * @param lockPath Path of the lockfile.
 * @returns The parsed contents, or `undefined` if missing/corrupt.
 */
function readLock(lockPath: string): WorkspaceLockContents | undefined {
	if (!existsSync(lockPath)) return undefined;
	let parsed: unknown;
	try {
		parsed = JSON.parse(readFileSync(lockPath, "utf-8"));
	} catch {
		// Unreadable or not JSON (e.g. a file that was created but not yet written).
		return undefined;
	}
	if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) return undefined;
	const { pid, heartbeat, goalId, startTime } = parsed as Record<string, unknown>;
	if (typeof pid !== "number" || !Number.isFinite(pid)) return undefined;
	if (typeof heartbeat !== "number" || !Number.isFinite(heartbeat)) return undefined;
	if (typeof goalId !== "string") return undefined;
	// JSON.stringify omits undefined values, so an unresolved startTime is simply absent.
	if (startTime !== undefined && typeof startTime !== "number") return undefined;
	return parsed as WorkspaceLockContents;
}
|
|
148
|
+
|
|
149
|
+
/**
 * Overwrite the lockfile with `contents` via atomicWriteJson, creating the
 * parent directory first if needed.
 *
 * Intended for HEARTBEAT refresh only: an owner that has already verified it
 * holds the lock rewrites its own record, so an unconditional overwrite is
 * acceptable. It must not be used to CLAIM a lock — use claimLock for that.
 *
 * NOTE(review): no caller of this helper is visible in this file — confirm
 * where the heartbeat refresh is actually wired up.
 *
 * @param lockPath Path of the lockfile to rewrite.
 * @param contents Full lock record to persist.
 */
function writeLock(lockPath: string, contents: WorkspaceLockContents): void {
	const parentDir = path.dirname(lockPath);
	mkdirSync(parentDir, { recursive: true });
	atomicWriteJson(lockPath, contents);
}
|
|
158
|
+
|
|
159
|
+
/**
 * CLAIM the lockfile atomically via O_EXCL (cold-review #3 NIT #N1 fix).
 *
 * The previous claim used temp+rename (writeLock), which is NOT cross-process
 * atomic — two goals that both observed a free/stale lock in the same tick
 * could both writeLock and both believe they own it (the same class of TOCTOU
 * cold-review #2 caught for CAS). O_EXCL (openSync "wx") IS atomic at the OS
 * level: only one process can create the file.
 *
 * Paths:
 *  - `forceOverwrite: true` — the caller has ALREADY verified (via isLockStale)
 *    that the existing lock is logically stale (e.g. PID recycled — a stronger
 *    signal than mtime age). The file is unlinked and then claimed, bypassing
 *    the mtime age check. (Without this, a stale-by-PID-recycling lock whose
 *    mtime is recent would never be claimed, because the create sees EEXIST and
 *    the mtime age check fails — infinite re-queue = hang.)
 *  - otherwise — try to create; on EEXIST, if the file's mtime is older than
 *    `staleReclaimMs`, force-delete it and retry the create once.
 *
 * If writing the record fails after the exclusive create succeeded, the
 * just-created (empty or partial) file is removed before the error propagates,
 * so a failed claim does not leave an unreadable lockfile blocking other
 * acquirers until it ages out.
 *
 * NOTE(review): unlink-then-create is not a single atomic step. Two reclaimers
 * that both judged the same lock stale can interleave as
 * unlink(A) → create(A) → unlink(B) → create(B), after which both have
 * returned true. Closing that window needs a different primitive.
 *
 * @param lockPath Path of the lockfile.
 * @param contents Lock record to write on success.
 * @param staleReclaimMs mtime age (ms) beyond which an existing file is force-reclaimed.
 * @param forceOverwrite Unlink any existing file before claiming (caller verified staleness).
 * @returns true if this call created the lockfile; false if it already exists and was not reclaimed.
 * @throws Any filesystem error other than EEXIST from the exclusive create or the write.
 */
function claimLock(lockPath: string, contents: WorkspaceLockContents, staleReclaimMs: number, forceOverwrite = false): boolean {
	mkdirSync(path.dirname(lockPath), { recursive: true });
	const json = JSON.stringify(contents);
	const tryCreate = (): boolean => {
		let fd: number;
		try {
			fd = openSync(lockPath, "wx"); // O_EXCL — throws EEXIST if it exists.
		} catch (error) {
			if ((error as NodeJS.ErrnoException).code === "EEXIST") return false;
			throw error;
		}
		let written = false;
		try {
			writeFileSync(fd, json);
			written = true;
		} finally {
			closeSync(fd);
			if (!written) {
				// We created this file exclusively, so it is ours to remove.
				try { unlinkSync(lockPath); } catch { /* best-effort */ }
			}
		}
		return true;
	};
	if (forceOverwrite) {
		// Caller verified the existing lock is logically stale; remove it and claim. If a
		// concurrent reclaimer re-creates the file between our unlink and our open, we lose
		// the race and return false, and the caller falls through to its queue path.
		try { unlinkSync(lockPath); } catch { /* best-effort */ }
		return tryCreate();
	}
	if (tryCreate()) return true;
	// Stale recovery by mtime age: if the lockfile is older than staleReclaimMs, force-delete + retry.
	try {
		const stat = statSync(lockPath);
		if (Date.now() - stat.mtimeMs > staleReclaimMs) {
			try { unlinkSync(lockPath); } catch { /* fall through */ }
			return tryCreate();
		}
	} catch { /* fall through to false */ }
	return false;
}
|
|
219
|
+
|
|
220
|
+
/**
 * Decide whether a lock record is STALE (RFC §P1g + D10 dual-check).
 *
 * A lock is stale when ANY of the following holds:
 *  (a) the recorded startTime is defined and the pid's current startTime
 *      resolves to a different value → the PID was recycled to another
 *      process (reason "pid_recycled");
 *  (b) the recorded heartbeat is not a finite number → the record cannot be
 *      aged, and a NaN age would otherwise compare as "never stale" and block
 *      acquirers forever (reason "heartbeat_invalid");
 *  (c) the heartbeat is more than `heartbeatStaleMs` older than `now` → the
 *      holder crashed without exiting or abandoned the lock (reason
 *      "heartbeat_stale").
 *
 * When the current startTime cannot be resolved (process gone, or no /proc on
 * this platform) check (a) is inconclusive and only the heartbeat decides.
 *
 * NOTE(review): a matching startTime does not exempt a lock from check (c), so
 * a live holder must keep its heartbeat fresh; no refresher is visible in this
 * file — confirm one exists.
 *
 * @param lock Lock record read from disk.
 * @param resolveStartTime Resolver for a pid's current start time.
 * @param heartbeatStaleMs Maximum tolerated heartbeat age in ms.
 * @param now Current time in ms.
 * @returns `{ stale: false }`, or `{ stale: true, reason }`.
 */
function isLockStale(
	lock: WorkspaceLockContents,
	resolveStartTime: StartTimeResolver,
	heartbeatStaleMs: number,
	now: number,
): { stale: boolean; reason?: string } {
	// (a) startTime mismatch → PID recycled to a different process.
	if (lock.startTime !== undefined) {
		const currentStartTime = resolveStartTime(lock.pid);
		if (currentStartTime !== undefined && currentStartTime !== lock.startTime) {
			return { stale: true, reason: "pid_recycled" };
		}
		// currentStartTime === undefined: process gone OR /proc unavailable →
		// fall through to the heartbeat checks.
	}
	// (b) a missing/non-numeric heartbeat cannot be aged — treat as stale rather than live.
	if (!Number.isFinite(lock.heartbeat)) {
		return { stale: true, reason: "heartbeat_invalid" };
	}
	// (c) heartbeat older than threshold → crash without exit / abandoned.
	if (now - lock.heartbeat > heartbeatStaleMs) {
		return { stale: true, reason: "heartbeat_stale" };
	}
	return { stale: false };
}
|
|
251
|
+
|
|
252
|
+
/**
 * Acquire the workspace lock for `goalId` at `cwd`.
 *
 * Each loop iteration reads the lockfile and classifies it as absent, stale
 * (PID recycled or heartbeat expired) or live. Absent and stale locks are
 * claimed via claimLock (O_EXCL create; stale locks are unlinked first). If
 * the lock is live, or the claim lost a race to another claimer, the default
 * behavior is QUEUE: sleep `pollMs` and re-check. With
 * `opts.failOnWorkspaceBusy: true` the call throws instead of queueing.
 *
 * The returned handle's release() deletes the lockfile ONLY if it still
 * carries this goal + pid + startTime, so a handle whose lock was reclaimed
 * and re-acquired by another goal cannot clobber the new owner's lock.
 *
 * In-process serialization: the read → stale-check → claim sequence is
 * synchronous within one event-loop tick, so concurrent in-process acquires
 * cannot interleave between the read and the claim.
 *
 * @param cwd Workspace directory to lock.
 * @param goalId Goal that will own the lock.
 * @param opts Contention behavior, thresholds and test seams.
 * @returns A handle whose release() frees the lock.
 * @throws Error when `opts.signal` aborts before or while waiting, or
 *   ("workspace busy: …") when the workspace is held and
 *   `opts.failOnWorkspaceBusy` is true. Filesystem errors from the claim
 *   propagate.
 */
export async function acquireWorkspaceLock(
	cwd: string,
	goalId: string,
	opts: AcquireWorkspaceLockOptions = {},
): Promise<WorkspaceLockHandle> {
	const lockPath = workspaceLockPath(cwd);
	const resolveStartTime = opts.startTimeResolver ?? defaultStartTimeResolver;
	const heartbeatStaleMs = opts.heartbeatStaleMs ?? DEFAULT_HEARTBEAT_STALE_MS;
	const pollMs = opts.pollMs ?? DEFAULT_LOCK_POLL_MS;
	const now = opts.now ?? Date.now;
	const pid = opts.pid ?? process.pid;
	// Resolved once up front: the same value is written to the lockfile and used as the release guard.
	const writtenStartTime = resolveStartTime(pid);

	while (true) {
		// Poll-loop: re-check the lock each tick until free/stale or aborted.
		if (opts.signal?.aborted) {
			throw new Error(
				`workspace lock acquisition aborted for goal ${goalId} (cwd=${cwd})`,
			);
		}
		const existing = readLock(lockPath);
		// Classify the existing lock: "absent" / "stale" (PID recycled or heartbeat dead) / "live".
		const existingKind: "absent" | "stale" | "live" = !existing
			? "absent"
			: (isLockStale(existing, resolveStartTime, heartbeatStaleMs, now()).stale ? "stale" : "live");
		if (existingKind !== "live") {
			// Claim via O_EXCL (claimLock), NOT temp+rename. For a stale lock pass
			// forceOverwrite so claimLock unlinks it first; otherwise a stale-by-PID lock
			// with a recent mtime would never pass claimLock's mtime age check and this
			// loop would re-queue forever.
			const contents: WorkspaceLockContents = {
				pid,
				startTime: writtenStartTime,
				heartbeat: now(),
				goalId,
				acquiredAt: new Date(now()).toISOString(),
			};
			const claimed = claimLock(lockPath, contents, heartbeatStaleMs, existingKind === "stale");
			if (claimed) {
				return {
					cwd,
					goalId,
					lockPath,
					startTime: writtenStartTime,
					release(): void {
						safeRelease(lockPath, goalId, pid, writtenStartTime);
					},
				};
			}
			// claimLock lost the race (another process claimed between our check and our
			// claim, or an unreadable lockfile is still too young to reclaim). Fall through
			// to the busy path (throw or queue) — re-check next tick.
		}
		// Lock is held: either live, or we just lost the claim race.
		if (opts.failOnWorkspaceBusy) {
			// After a lost race `existing` is undefined or describes the stale previous
			// owner, so re-read to name the current holder; the lockfile may still be
			// unreadable (e.g. created but not yet written), hence the fallback text.
			const holder = existingKind === "live" ? existing : readLock(lockPath);
			const heldBy = holder
				? `goalId=${holder.goalId} (pid=${holder.pid})`
				: "an unidentified owner (lockfile unreadable)";
			throw new Error(`workspace busy: cwd=${cwd} held by ${heldBy}`);
		}
		// Queue: wait for the next poll interval, then re-check.
		await sleepOrAbort(pollMs, opts.signal);
	}
}
|
|
335
|
+
|
|
336
|
+
/**
 * Remove the lockfile at `lockPath`, but only while it still records the same
 * (goalId, pid, startTime) triple this handle wrote. If the lock was reclaimed
 * and re-acquired by another goal, the triple differs and the file is left
 * alone so the new owner's lock survives.
 *
 * Never throws: every failure is swallowed so release() is safe to call from
 * a `finally` block.
 *
 * @param lockPath Path of the lockfile.
 * @param goalId Goal id the handle was acquired with.
 * @param pid Pid the handle was acquired with.
 * @param writtenStartTime startTime value written at acquire.
 */
function safeRelease(
	lockPath: string,
	goalId: string,
	pid: number,
	writtenStartTime: number | undefined,
): void {
	try {
		const onDisk = readLock(lockPath);
		if (!onDisk) return;
		const stillOurs =
			onDisk.goalId === goalId &&
			onDisk.pid === pid &&
			onDisk.startTime === writtenStartTime;
		if (stillOurs) unlinkSync(lockPath);
	} catch {
		/* best-effort — release must never throw into a finally block */
	}
}
|
|
361
|
+
|
|
362
|
+
/**
 * Sleep that resolves after `ms`, or rejects early if `signal` aborts.
 *
 * - Without a signal this is a plain timer.
 * - If the signal is ALREADY aborted the promise rejects immediately (an
 *   already-aborted signal never fires another "abort" event, so waiting for
 *   one would just run the full sleep).
 * - When the timer fires first, the abort listener is removed again. This
 *   function is called once per poll tick with the same long-lived signal, so
 *   leaving the listener attached would accumulate one listener per tick.
 *
 * @param ms Delay in milliseconds.
 * @param signal Optional abort signal.
 * @returns Promise resolved after the delay; rejected with
 *   Error("workspace lock acquisition aborted") on abort.
 */
function sleepOrAbort(ms: number, signal?: AbortSignal): Promise<void> {
	if (!signal) return new Promise<void>((resolve) => setTimeout(resolve, ms));
	return new Promise<void>((resolve, reject) => {
		if (signal.aborted) {
			reject(new Error("workspace lock acquisition aborted"));
			return;
		}
		const onAbort = (): void => {
			clearTimeout(timer);
			reject(new Error("workspace lock acquisition aborted"));
		};
		const timer = setTimeout(() => {
			// Timer won: detach the listener so it does not outlive this sleep.
			signal.removeEventListener("abort", onAbort);
			resolve();
		}, ms);
		signal.addEventListener("abort", onAbort, { once: true });
	});
}
|
|
377
|
+
|
|
378
|
+
/**
|
|
379
|
+
* [Documents reclaimStaleLocks, defined further below, after isWorkspaceBusy.] Reclaim all stale locks under `dir` (the workspace-locks directory). Returns
|
|
380
|
+
* the list of reclaimed lock paths. Stale = PID recycled OR heartbeat older
|
|
381
|
+
* than threshold. Corrupt/unreadable locks are also reclaimed.
|
|
382
|
+
*
|
|
383
|
+
* Useful as a startup or periodic sweep to clear locks left by crashed
|
|
384
|
+
* processes before any goal tries to acquire them.
|
|
385
|
+
*/
|
|
386
|
+
/**
 * Report whether the workspace at `cwd` is currently locked by a live owner,
 * without attempting to acquire the lock. Per the original note this is meant
 * for `goal start` / `goal resume` to fail fast with a clear error BEFORE
 * spawning.
 *
 * @param cwd Workspace directory to inspect.
 * @param opts Optional overrides for start-time resolution, the heartbeat
 *   staleness threshold and the clock.
 * @returns The goalId recorded in the lockfile when the lock is live;
 *   `undefined` when the lockfile is missing/unreadable or the lock is stale.
 */
export function isWorkspaceBusy(
	cwd: string,
	opts: { startTimeResolver?: StartTimeResolver; heartbeatStaleMs?: number; now?: () => number } = {},
): string | undefined {
	const current = readLock(workspaceLockPath(cwd));
	if (!current) return undefined;
	const clock = opts.now ?? Date.now;
	const verdict = isLockStale(
		current,
		opts.startTimeResolver ?? defaultStartTimeResolver,
		opts.heartbeatStaleMs ?? DEFAULT_HEARTBEAT_STALE_MS,
		clock(),
	);
	if (verdict.stale) return undefined;
	return current.goalId;
}
|
|
404
|
+
|
|
405
|
+
/**
 * Sweep `dir` (a workspace-locks directory) and delete every `*.lock` file
 * that is either unreadable/corrupt or stale (PID recycled, or heartbeat older
 * than the threshold). Useful as a startup or periodic sweep to clear locks
 * left by crashed processes before any goal tries to acquire them.
 *
 * Deletion is best-effort: a file that cannot be unlinked is skipped and not
 * reported. A missing or unlistable directory yields an empty result.
 *
 * NOTE(review): the read → decide → unlink sequence is not atomic with respect
 * to other processes; a lockfile that another process has just created but not
 * yet finished writing reads as corrupt here — confirm the sweep cannot run
 * concurrently with acquisitions.
 *
 * @param dir Directory containing the lockfiles.
 * @param opts Optional overrides for the heartbeat staleness threshold,
 *   start-time resolution and the clock.
 * @returns Paths of the lockfiles that were removed.
 */
export function reclaimStaleLocks(
	dir: string,
	opts: {
		heartbeatStaleMs?: number;
		startTimeResolver?: StartTimeResolver;
		now?: () => number;
	} = {},
): string[] {
	const resolveStartTime = opts.startTimeResolver ?? defaultStartTimeResolver;
	const heartbeatStaleMs = opts.heartbeatStaleMs ?? DEFAULT_HEARTBEAT_STALE_MS;
	const now = opts.now ?? Date.now;
	const reclaimed: string[] = [];
	if (!existsSync(dir)) return reclaimed;

	let names: string[];
	try {
		names = readdirSync(dir);
	} catch {
		return reclaimed;
	}

	for (const name of names) {
		if (!name.endsWith(".lock")) continue;
		const lockPath = path.join(dir, name);
		const lock = readLock(lockPath);
		// Unreadable/corrupt files are reclaimed outright; readable ones only when stale.
		const shouldReclaim = !lock || isLockStale(lock, resolveStartTime, heartbeatStaleMs, now()).stale;
		if (!shouldReclaim) continue;
		try {
			unlinkSync(lockPath);
			reclaimed.push(lockPath);
		} catch {
			/* best-effort */
		}
	}
	return reclaimed;
}
|
|
@@ -59,6 +59,31 @@ export const PiTeamsWorktreeConfigSchema = Type.Object({
|
|
|
59
59
|
seedPaths: Type.Optional(Type.Array(Type.String({ minLength: 1 }))),
|
|
60
60
|
}, { additionalProperties: false });
|
|
61
61
|
|
|
62
|
+
/**
|
|
63
|
+
* Goal-wrap config (RFC v0.5 vision: apply `goal` completion-guarantee to builtin workflows).
|
|
64
|
+
* Per-workflow toggle. When enabled, a builtin workflow runs as the WORKER TURN inside a
|
|
65
|
+
* goal loop (worker → judge → feedback → redo until achieved / maxTurns / budget / stuck).
|
|
66
|
+
* Default OFF — opt-in per workflow. Only applies to builtin workflows that have a clear
|
|
67
|
+
* 'done' condition (implementation, fast-fix). Read-only workflows (review, research) are
|
|
68
|
+
* not goal-wrappable.
|
|
69
|
+
*/
|
|
70
|
+
export const GoalWrapWorkflowConfigSchema = Type.Object({
|
|
71
|
+
enabled: Type.Optional(Type.Boolean()),
|
|
72
|
+
maxTurns: Type.Optional(Type.Integer({ minimum: 1, maximum: 50 })),
|
|
73
|
+
evaluatorModel: Type.Optional(Type.String({ minLength: 1 })),
|
|
74
|
+
verification: Type.Optional(Type.Object({
|
|
75
|
+
commands: Type.Array(Type.String({ minLength: 1 })),
|
|
76
|
+
mode: Type.Optional(Type.Literal("text-only")),
|
|
77
|
+
}, { additionalProperties: false })),
|
|
78
|
+
budgetTotal: Type.Optional(Type.Integer({ minimum: 1000 })),
|
|
79
|
+
budgetUnlimited: Type.Optional(Type.Boolean()),
|
|
80
|
+
}, { additionalProperties: false });
|
|
81
|
+
|
|
82
|
+
export const PiTeamsGoalWrapConfigSchema = Type.Record(
|
|
83
|
+
Type.String({ minLength: 1 }),
|
|
84
|
+
GoalWrapWorkflowConfigSchema,
|
|
85
|
+
);
|
|
86
|
+
|
|
62
87
|
export const AgentOverrideSchema = Type.Object({
|
|
63
88
|
disabled: Type.Optional(Type.Boolean()),
|
|
64
89
|
model: Type.Optional(Type.Union([Type.String({ minLength: 1 }), Type.Literal(false)])),
|
|
@@ -152,6 +177,7 @@ export const PiTeamsConfigSchema = Type.Object({
|
|
|
152
177
|
runtime: Type.Optional(PiTeamsRuntimeConfigSchema),
|
|
153
178
|
control: Type.Optional(PiTeamsControlConfigSchema),
|
|
154
179
|
worktree: Type.Optional(PiTeamsWorktreeConfigSchema),
|
|
180
|
+
goalWrap: Type.Optional(PiTeamsGoalWrapConfigSchema),
|
|
155
181
|
agents: Type.Optional(PiTeamsAgentsConfigSchema),
|
|
156
182
|
tools: Type.Optional(PiTeamsToolsConfigSchema),
|
|
157
183
|
telemetry: Type.Optional(PiTeamsTelemetryConfigSchema),
|
|
@@ -72,6 +72,12 @@ export const TeamToolParams = Type.Object({
|
|
|
72
72
|
Type.Literal("anchor"),
|
|
73
73
|
Type.Literal("auto-summarize"),
|
|
74
74
|
Type.Literal("auto_boomerang"),
|
|
75
|
+
Type.Literal("goal"),
|
|
76
|
+
Type.Literal("workflow-create"),
|
|
77
|
+
Type.Literal("workflow-get"),
|
|
78
|
+
Type.Literal("workflow-list"),
|
|
79
|
+
Type.Literal("workflow-save"),
|
|
80
|
+
Type.Literal("workflow-delete"),
|
|
75
81
|
],
|
|
76
82
|
{ description: "Team action. Defaults to 'list' when omitted." },
|
|
77
83
|
),
|
|
@@ -244,8 +250,14 @@ export const TeamToolParams = Type.Object({
|
|
|
244
250
|
budgetTotal: Type.Optional(
|
|
245
251
|
Type.Number({
|
|
246
252
|
description:
|
|
247
|
-
"Total token budget for the run. When set, enables budget tracking with default 80% warning and 95% abort thresholds.",
|
|
248
|
-
minimum:
|
|
253
|
+
"Total token budget for the run. When set, enables budget tracking with default 80% warning and 95% abort thresholds. Minimum 1000 — this is a MISCONFIGURATION GUARD (catches typos / silent-abort configs like budgetTotal:1, which would abort on turn 1), NOT a usefulness guarantee; a productive multi-turn goal needs far more than 1000 tokens.",
|
|
254
|
+
minimum: 1000,
|
|
255
|
+
}),
|
|
256
|
+
),
|
|
257
|
+
budgetUnlimited: Type.Optional(
|
|
258
|
+
Type.Boolean({
|
|
259
|
+
description:
|
|
260
|
+
"When true, skip budget enforcement entirely (explicit opt-out). Goal-start validation requires budgetTotal>=1000 OR budgetUnlimited:true; audit-logged when set. The validation itself is enforced in a later integration task.",
|
|
249
261
|
}),
|
|
250
262
|
),
|
|
251
263
|
budgetWarning: Type.Optional(
|
|
@@ -264,6 +276,19 @@ export const TeamToolParams = Type.Object({
|
|
|
264
276
|
maximum: 1,
|
|
265
277
|
}),
|
|
266
278
|
),
|
|
279
|
+
runKind: Type.Optional(
|
|
280
|
+
Type.Union(
|
|
281
|
+
[
|
|
282
|
+
Type.Literal("team-run"),
|
|
283
|
+
Type.Literal("goal-loop"),
|
|
284
|
+
Type.Literal("dynamic-workflow"),
|
|
285
|
+
],
|
|
286
|
+
{
|
|
287
|
+
description:
|
|
288
|
+
"Background dispatch discriminator. Default \"team-run\" runs the normal executeTeamRun workflow; \"goal-loop\" (P0/P1) and \"dynamic-workflow\" (P2/P3) dispatch to their respective background runners. Absent = \"team-run\" for backward compatibility.",
|
|
289
|
+
},
|
|
290
|
+
),
|
|
291
|
+
),
|
|
267
292
|
});
|
|
268
293
|
|
|
269
294
|
export interface TeamToolParamsValue {
|
|
@@ -312,7 +337,13 @@ export interface TeamToolParamsValue {
|
|
|
312
337
|
| "search"
|
|
313
338
|
| "orchestrate"
|
|
314
339
|
| "schedule"
|
|
315
|
-
| "scheduled"
|
|
340
|
+
| "scheduled"
|
|
341
|
+
| "goal"
|
|
342
|
+
| "workflow-create"
|
|
343
|
+
| "workflow-get"
|
|
344
|
+
| "workflow-list"
|
|
345
|
+
| "workflow-save"
|
|
346
|
+
| "workflow-delete";
|
|
316
347
|
resource?: "agent" | "team" | "workflow";
|
|
317
348
|
team?: string;
|
|
318
349
|
workflow?: string;
|
|
@@ -352,10 +383,14 @@ export interface TeamToolParamsValue {
|
|
|
352
383
|
once?: string | number;
|
|
353
384
|
/** Mark certain bash commands as excludeFromContext to reduce context tokens (default: false). */
|
|
354
385
|
excludeContextBash?: boolean;
|
|
355
|
-
/** Total token budget for the run. When set, enables budget tracking. */
|
|
386
|
+
/** Total token budget for the run. When set, enables budget tracking (minimum 1000). */
|
|
356
387
|
budgetTotal?: number;
|
|
388
|
+
/** When true, skip budget enforcement entirely (explicit opt-out). */
|
|
389
|
+
budgetUnlimited?: boolean;
|
|
357
390
|
/** Budget warning threshold as a fraction (0-1). Default: 0.8. */
|
|
358
391
|
budgetWarning?: number;
|
|
359
392
|
/** Budget abort threshold as a fraction (0-1). Default: 0.95. */
|
|
360
393
|
budgetAbort?: number;
|
|
394
|
+
/** Background dispatch discriminator. Default "team-run". "goal-loop"/"dynamic-workflow" dispatch to their runners (P0/P2). */
|
|
395
|
+
runKind?: "team-run" | "goal-loop" | "dynamic-workflow";
|
|
361
396
|
}
|
|
@@ -3,6 +3,7 @@ import * as fs from "node:fs";
|
|
|
3
3
|
import * as os from "node:os";
|
|
4
4
|
import * as path from "node:path";
|
|
5
5
|
import { logInternalError } from "../utils/internal-error.ts";
|
|
6
|
+
import { isWorkerAtomicWriterEnabled, atomicWriteFileViaWorker } from "./worker-atomic-writer.ts";
|
|
6
7
|
import { sleepSync } from "../utils/sleep.ts";
|
|
7
8
|
|
|
8
9
|
function hashContent(content: string): string {
|
|
@@ -380,6 +381,14 @@ export function atomicWriteFile(filePath: string, content: string, expectedHash?
|
|
|
380
381
|
|
|
381
382
|
|
|
382
383
|
export async function atomicWriteFileAsync(filePath: string, content: string): Promise<void> {
|
|
384
|
+
// Phase 1.5 (RFC 15): when the worker-thread atomic writer is enabled
|
|
385
|
+
// (PI_CREW_WORKER_ATOMIC_WRITER=1), dispatch to a dedicated worker thread
|
|
386
|
+
// that performs SYNC fs ops with no internal yields. Mitigates the
|
|
387
|
+
// non-deterministic V8/libuv crash during event-loop yields in multi-step
|
|
388
|
+
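// goal-wrapped workflows. NOTE(review): this branch returns before the isSymlinkSafePath check below runs — confirm the worker path enforces an equivalent check.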
|
|
389
|
+
if (isWorkerAtomicWriterEnabled()) {
|
|
390
|
+
return atomicWriteFileViaWorker(filePath, content);
|
|
391
|
+
}
|
|
383
392
|
if (!isSymlinkSafePath(filePath)) throw new Error(`Refusing to write: target is a symlink or inside untrusted directory: ${filePath}`);
|
|
384
393
|
await fs.promises.mkdir(path.dirname(filePath), { recursive: true });
|
|
385
394
|
const tempPath = `${filePath}.${crypto.randomUUID()}.tmp`;
|
package/src/state/contracts.ts
CHANGED
|
@@ -77,6 +77,20 @@ const TEAM_EVENT_TYPES = [
|
|
|
77
77
|
"phase.completed",
|
|
78
78
|
"phase.skipped",
|
|
79
79
|
"phase.failed",
|
|
80
|
+
// Goal loop events (P0/P1) — autonomous goal-loop coordinator.
|
|
81
|
+
"goal.loop_start",
|
|
82
|
+
"goal.turn_start",
|
|
83
|
+
"goal.turn_evaluated",
|
|
84
|
+
"goal.budget_warning",
|
|
85
|
+
"goal.loop_end",
|
|
86
|
+
"goal.feedback_steered",
|
|
87
|
+
"goal.state_changed",
|
|
88
|
+
// Dynamic workflow events (P2) — script-driven orchestration.
|
|
89
|
+
"dwf.started",
|
|
90
|
+
"dwf.phase_started",
|
|
91
|
+
"dwf.phase_completed",
|
|
92
|
+
"dwf.completed",
|
|
93
|
+
"dwf.failed",
|
|
80
94
|
] as const;
|
|
81
95
|
export type TeamEventType = typeof TEAM_EVENT_TYPES[number];
|
|
82
96
|
|
package/src/state/crew-init.ts
CHANGED
|
@@ -22,8 +22,19 @@ import { updateGitignore } from "./gitignore-manager.ts";
|
|
|
22
22
|
// Re-export updateGitignore for backwards compatibility with tests.
|
|
23
23
|
export { updateGitignore };
|
|
24
24
|
|
|
25
|
-
/**
|
|
26
|
-
|
|
25
|
+
/**
|
|
26
|
+
* README content for the .crew directory.
|
|
27
|
+
*
|
|
28
|
+
* Defined as a function (not a `const`) to avoid the Temporal Dead Zone race
|
|
29
|
+
* documented in issue #28 + RFC 17. When this module is loaded via
|
|
30
|
+
* `jiti.import()` (pi's extension loader) wrapped in an async function, a
|
|
31
|
+
* `const` binding can still be in its TDZ when functions hoisted above it run (the
|
|
32
|
+
* same pattern that bit `crewInitPromise` in team-tool/run.ts — see commit
|
|
33
|
+
* fixing it). A function declaration is hoisted with its body available
|
|
34
|
+
* immediately, so callers always get the fully-built string.
|
|
35
|
+
*/
|
|
36
|
+
function buildCrewReadme(): string {
|
|
37
|
+
return `# .crew — pi-crew Runtime Directory
|
|
27
38
|
|
|
28
39
|
This directory contains pi-crew runtime state and artifacts.
|
|
29
40
|
|
|
@@ -50,7 +61,7 @@ To clear cache:
|
|
|
50
61
|
team action='cache' action='clear'
|
|
51
62
|
\`\`\`
|
|
52
63
|
`;
|
|
53
|
-
|
|
64
|
+
}
|
|
54
65
|
/**
|
|
55
66
|
* Find the project root by walking up from start directory.
|
|
56
67
|
* Inline implementation to avoid module dependency on paths.ts.
|
|
@@ -249,13 +260,15 @@ export async function ensureCrewDirectory(cwd: string): Promise<void> {
|
|
|
249
260
|
}
|
|
250
261
|
|
|
251
262
|
// 3. Write README.md (always overwrite to keep it current)
|
|
252
|
-
fs.writeFileSync(safeJoin(crewRoot, "README.md"),
|
|
263
|
+
fs.writeFileSync(safeJoin(crewRoot, "README.md"), buildCrewReadme(), "utf-8");
|
|
253
264
|
|
|
254
265
|
// 4. Update .gitignore at project root
|
|
255
266
|
const repoRoot = findProjectRoot(cwd);
|
|
256
267
|
if (repoRoot) {
|
|
257
268
|
const gitignorePath = safeJoin(repoRoot, ".gitignore");
|
|
258
|
-
|
|
269
|
+
// LAZY: dodge the jiti ESM/CJS interop TDZ race on the static `import { updateGitignore }` above (issue #28, RFC 17). At this point the module body has fully evaluated, so the dynamic import resolves to a live binding.
|
|
270
|
+
const { updateGitignore: updateGitignoreFn } = await import("./gitignore-manager.ts");
|
|
271
|
+
await updateGitignoreFn(gitignorePath);
|
|
259
272
|
}
|
|
260
273
|
}
|
|
261
274
|
|
package/src/state/event-log.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import { createHash } from "node:crypto";
|
|
2
2
|
import * as fs from "node:fs";
|
|
3
3
|
import * as path from "node:path";
|
|
4
|
+
import { isWorkerAtomicWriterEnabled, appendFileViaWorker } from "./worker-atomic-writer.ts";
|
|
4
5
|
import { DEFAULT_EVENT_LOG } from "../config/defaults.ts";
|
|
5
6
|
import { atomicWriteFile } from "./atomic-write.ts";
|
|
6
7
|
import { errors } from "../errors.ts";
|
|
@@ -443,7 +444,12 @@ export async function appendEventAsync(eventsPath: string, event: AppendTeamEven
|
|
|
443
444
|
|
|
444
445
|
if (!skippedDueToSize) {
|
|
445
446
|
const line = JSON.stringify(redactSecrets(fullEvent)) + "\n";
|
|
446
|
-
|
|
447
|
+
// Phase 1.5: when worker atomic writer is enabled, append via worker.
|
|
448
|
+
if (isWorkerAtomicWriterEnabled()) {
|
|
449
|
+
await appendFileViaWorker(eventsPath, line);
|
|
450
|
+
} else {
|
|
451
|
+
await fs.promises.appendFile(eventsPath, line, { encoding: "utf-8", flag: "a" });
|
|
452
|
+
}
|
|
447
453
|
// FIX: fsync to ensure event content is flushed to disk before persisting
|
|
448
454
|
// the sequence number. This closes the crash window between appendFile and
|
|
449
455
|
// persistSequence where sequence reuse could occur on restart.
|