pi-crew 0.8.14 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +366 -0
- package/README.md +112 -2
- package/docs/FEATURE_INTAKE.md +1 -1
- package/docs/HARNESS.md +20 -19
- package/docs/PROJECT_REVIEW.md +132 -133
- package/docs/PROJECT_REVIEW_FIXES.md +130 -131
- package/docs/actions-reference.md +127 -121
- package/docs/architecture.md +1 -1
- package/docs/code-review-2026-05-11.md +134 -134
- package/docs/commands-reference.md +108 -106
- package/docs/comparison-pi-subagents-vs-pi-crew.md +105 -105
- package/docs/deep-review-report.md +1 -1
- package/docs/dynamic-workflows.md +90 -0
- package/docs/fixes/BATCH_A_H1_H2.md +17 -17
- package/docs/fixes/bug-007-async-notifier-stale-ctx.md +23 -23
- package/docs/followup-plan-2026-05-12.md +135 -135
- package/docs/followup-review-2026-05-12.md +86 -86
- package/docs/followup-review-round3-2026-05-12.md +123 -123
- package/docs/goals.md +59 -0
- package/docs/implementation-plan-top3.md +4 -4
- package/docs/issue-29-analysis.md +2 -2
- package/docs/oh-my-pi-research.md +154 -154
- package/docs/optimization-plan.md +2 -0
- package/docs/perf/baseline-2026-05.md +9 -9
- package/docs/perf/final-report-2026-05.md +2 -2
- package/docs/perf/sprint-1-report.md +2 -2
- package/docs/perf/sprint-2-report.md +1 -1
- package/docs/perf/upgrade-plan-2026-05.md +72 -72
- package/docs/pi-crew-bugs.md +230 -230
- package/docs/pi-crew-investigation-report.md +102 -102
- package/docs/pi-crew-test-round5.md +4 -4
- package/docs/runtime-analysis-child-vs-live.md +57 -57
- package/docs/runtime-migration-in-process-analysis.md +97 -97
- package/package.json +2 -4
- package/skills/orchestration/SKILL.md +11 -11
- package/src/agents/agent-config.ts +4 -0
- package/src/config/config.ts +39 -0
- package/src/config/types.ts +11 -0
- package/src/extension/action-suggestions.ts +2 -1
- package/src/extension/async-notifier.ts +10 -0
- package/src/extension/help.ts +14 -0
- package/src/extension/registration/commands.ts +27 -0
- package/src/extension/team-tool/destructive-gate.ts +1 -1
- package/src/extension/team-tool/goal-wrap.ts +288 -0
- package/src/extension/team-tool/goal.ts +405 -0
- package/src/extension/team-tool/run.ts +103 -4
- package/src/extension/team-tool/workflow-manage.ts +194 -0
- package/src/extension/team-tool.ts +20 -0
- package/src/hooks/types.ts +3 -1
- package/src/runtime/async-runner.ts +27 -2
- package/src/runtime/background-runner.ts +68 -19
- package/src/runtime/child-pi.ts +9 -1
- package/src/runtime/completion-guard.ts +1 -1
- package/src/runtime/dynamic-workflow-context.ts +450 -0
- package/src/runtime/dynamic-workflow-runner.ts +180 -0
- package/src/runtime/global-worker-cap.ts +96 -0
- package/src/runtime/goal-evaluator.ts +294 -0
- package/src/runtime/goal-loop-runner.ts +612 -0
- package/src/runtime/goal-state-store.ts +209 -0
- package/src/runtime/iteration-hooks.ts +2 -1
- package/src/runtime/pi-args.ts +10 -2
- package/src/runtime/post-checks.ts +2 -1
- package/src/runtime/result-extractor.ts +32 -0
- package/src/runtime/team-runner.ts +11 -1
- package/src/runtime/verification-gates.ts +88 -5
- package/src/runtime/verification-integrity.ts +110 -0
- package/src/runtime/verification-worktree.ts +136 -0
- package/src/runtime/workspace-lock.ts +448 -0
- package/src/schema/config-schema.ts +26 -0
- package/src/schema/team-tool-schema.ts +39 -4
- package/src/state/atomic-write.ts +9 -0
- package/src/state/contracts.ts +14 -0
- package/src/state/crew-init.ts +18 -5
- package/src/state/event-log.ts +7 -1
- package/src/state/state-store.ts +2 -0
- package/src/state/types.ts +82 -0
- package/src/state/worker-atomic-writer.ts +190 -0
- package/src/utils/env-allowlist.ts +30 -0
- package/src/utils/redaction.ts +104 -24
- package/src/utils/safe-paths.ts +55 -14
- package/src/workflows/discover-workflows.ts +25 -1
- package/src/workflows/workflow-config.ts +13 -0
- package/src/worktree/cleanup.ts +2 -1
- package/src/worktree/worktree-manager.ts +4 -3
- package/teams/parallel-research.team.md +1 -1
- package/workflows/examples/hello.dwf.ts +24 -0
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Phase 1.5 #2 — Git-worktree verification sandbox.
|
|
3
|
+
*
|
|
4
|
+
* RFC: research-findings/goal-workflow/16-PHASE1.5-WORKTREE-SANDBOX-RFC.md
|
|
5
|
+
*
|
|
6
|
+
* Closes the two remaining Med-High tampering residuals from RFC 13 §6:
|
|
7
|
+
* (1) Round-trip manifest tamper (MAJ#2): worker edits package.json, runs
|
|
8
|
+
* test, reverts before T_verify_done → snapshot matches → undetected.
|
|
9
|
+
* (2) Invoked-script tampering: worker rewrites a script the verification
|
|
10
|
+
* command invokes; only MANIFEST_FILES are hashed → invisible.
|
|
11
|
+
*
|
|
12
|
+
* Mitigation: run verification commands in a pristine `git worktree` at the
|
|
13
|
+
* T_snap commit SHA. Worker edits in the main workspace are invisible to
|
|
14
|
+
* the verification subprocess. Content-addressed execution: the test passes
|
|
15
|
+
* (or fails) against the original code, regardless of worker edits.
|
|
16
|
+
*
|
|
17
|
+
* Opt-in via PI_CREW_VERIFICATION_WORKTREE=1 (mirrors Phase 1.5 #1 pattern).
|
|
18
|
+
* Auto-fallback to existing behavior when: not a git repo, dirty index,
|
|
19
|
+
* git unavailable, or opt-out explicitly set. NEVER blocks the goal loop.
|
|
20
|
+
*
|
|
21
|
+
* Pure leaf module: depends only on node: built-ins + git CLI. No imports
|
|
22
|
+
* from goal-loop-runner or verification-gates (keeps unit-testable).
|
|
23
|
+
*/
|
|
24
|
+
import { execFileSync } from "node:child_process";
|
|
25
|
+
import * as fs from "node:fs";
|
|
26
|
+
import * as os from "node:os";
|
|
27
|
+
import * as path from "node:path";
|
|
28
|
+
|
|
29
|
+
/** Handle describing a prepared verification worktree. */
export interface VerificationWorktree {
	/** Absolute filesystem path of the pristine worktree checkout. */
	worktreePath: string;
	/** SHA of the commit the worktree is checked out at (the T_snap commit). */
	commitSha: string;
	/** Removes the worktree and its temp directory. Safe to call repeatedly; later calls do nothing. */
	cleanup: () => void;
}
|
|
37
|
+
|
|
38
|
+
/**
 * Report whether the verification-worktree sandbox has been opted into.
 *
 * Reads PI_CREW_VERIFICATION_WORKTREE and falls back to
 * PI_TEAMS_VERIFICATION_WORKTREE only when the former is unset. Only the
 * exact values "1" and "true" enable the sandbox.
 */
export function isWorktreeSandboxEnabled(): boolean {
	const flag = process.env.PI_CREW_VERIFICATION_WORKTREE ?? process.env.PI_TEAMS_VERIFICATION_WORKTREE;
	switch (flag) {
		case "1":
		case "true":
			return true;
		default:
			return false;
	}
}
|
|
43
|
+
|
|
44
|
+
/**
 * Detect whether the worktree sandbox is AVAILABLE at `cwd`. All of these
 * must hold:
 *   - the opt-in env var is set
 *   - the git executable can be run
 *   - cwd is inside a git repo
 *   - `git status --porcelain` is empty (no uncommitted changes; note that
 *     porcelain output also lists untracked files, so those count as dirty)
 *
 * Every git probe runs with a timeout so a wedged git process cannot stall
 * the caller; a timeout is reported like any other failed precondition.
 *
 * @param cwd Directory to probe.
 * @returns `{ available: true, commitSha }` with the current HEAD SHA, or
 *   `{ available: false, reason }` when any precondition fails. Never throws
 *   for git failures. Callers MUST gracefully fall back to non-sandboxed
 *   execution — never block the goal.
 */
export function checkWorktreeSandboxAvailable(cwd: string): { available: true; commitSha: string } | { available: false; reason: string } {
	if (!isWorktreeSandboxEnabled()) {
		return { available: false, reason: "PI_CREW_VERIFICATION_WORKTREE not set (opt-in)" };
	}
	// Shared runner: captured output, trimmed, bounded by a timeout.
	// execFileSync throws on non-zero exit, on spawn failure and on timeout.
	const git = (args: string[]): string =>
		execFileSync("git", args, { cwd, stdio: ["ignore", "pipe", "pipe"], encoding: "utf-8", timeout: 10_000 }).trim();
	try {
		// Is cwd inside a git repo? `git rev-parse --show-toplevel` exits
		// non-zero when it is not.
		const toplevel = git(["rev-parse", "--show-toplevel"]);
		if (!toplevel) return { available: false, reason: "git rev-parse returned empty toplevel" };
		// Current commit SHA (this is what T_snap will pin to).
		const commitSha = git(["rev-parse", "HEAD"]);
		if (!commitSha) return { available: false, reason: "git rev-parse HEAD returned empty SHA" };
		// Refuse to sandbox a dirty workspace: the worktree would NOT contain
		// the in-progress edits (it would pin to a stale commit). Better to
		// fall back than to silently verify against the wrong code.
		const status = git(["status", "--porcelain"]);
		if (status.length > 0) return { available: false, reason: `dirty git index (${status.split("\n").length} changed files); refusing to sandbox — worktree would pin to stale commit` };
		return { available: true, commitSha };
	} catch (error) {
		const msg = error instanceof Error ? error.message : String(error);
		return { available: false, reason: `git precondition check failed: ${msg.slice(0, 200)}` };
	}
}
|
|
79
|
+
|
|
80
|
+
/**
 * Prepare a pristine git worktree at `commitSha`. The worktree is a fresh
 * checkout of the project at that commit — it does NOT contain worker edits
 * from the main workspace.
 *
 * Runs `git worktree add --detach <tmp>/wt-<sha8> <sha>`, which creates a
 * detached-HEAD worktree (no branch is created), inside a fresh temp
 * directory under os.tmpdir().
 *
 * @param cwd Directory inside the repository; git commands run from here.
 * @param commitSha Commit to check out.
 * @returns The worktree path, the commit SHA and a cleanup handle.
 * @throws Error when `git worktree add` fails; the temp directory is
 *   cleaned up before the error is thrown.
 *
 * Cleanup is idempotent (safe to call multiple times) and best-effort
 * (swallows errors so a stuck worktree doesn't propagate into the goal loop).
 */
export function prepareVerificationWorktree(cwd: string, commitSha: string): VerificationWorktree {
	const tmpRoot = fs.mkdtempSync(path.join(os.tmpdir(), "pi-crew-wt-"));
	const shortSha = commitSha.slice(0, 8);
	const worktreePath = path.join(tmpRoot, `wt-${shortSha}`);
	const runGit = (args: string[], timeout: number): void => {
		execFileSync("git", args, { cwd, stdio: ["ignore", "pipe", "pipe"], timeout });
	};
	let cleaned = false;
	const cleanup = (): void => {
		if (cleaned) return;
		cleaned = true;
		// Preferred path: let git remove the worktree and its metadata
		// (force = proceed even if it has untracked or modified files).
		let removedByGit = false;
		try {
			runGit(["worktree", "remove", "--force", worktreePath], 5000);
			removedByGit = true;
		} catch { /* handled by the prune fallback below */ }
		// Remove the temp parent dir (and the checkout, if git left it behind).
		try { fs.rmSync(tmpRoot, { recursive: true, force: true }); } catch { /* best-effort */ }
		if (!removedByGit) {
			// `git worktree prune` only drops metadata for worktrees whose
			// directory is missing, so it has to run AFTER the directory is
			// deleted — otherwise a failed remove leaves a stale entry behind.
			try { runGit(["worktree", "prune"], 5000); } catch { /* best-effort */ }
		}
	};
	try {
		runGit(["worktree", "add", "--detach", worktreePath, commitSha], 30_000);
		return { worktreePath, commitSha, cleanup };
	} catch (error) {
		cleanup();
		const msg = error instanceof Error ? error.message : String(error);
		throw new Error(`git worktree add failed (cwd=${cwd}, sha=${shortSha}): ${msg.slice(0, 300)}`);
	}
}
|
|
119
|
+
|
|
120
|
+
/**
 * Scoped helper: prepare a worktree, hand it to `fn`, and always clean the
 * worktree up afterwards.
 *
 * - `fn` may be sync or async and may throw; cleanup runs either way and the
 *   error from `fn` is what the caller sees.
 * - If preparing the worktree throws, `fn` is never invoked and that error
 *   propagates — the caller decides how to react (e.g. run unsandboxed).
 *
 * @returns Whatever `fn` returns (awaited).
 */
export async function withVerificationWorktree<T>(cwd: string, commitSha: string, fn: (worktree: VerificationWorktree) => Promise<T> | T): Promise<T> {
	const sandbox = prepareVerificationWorktree(cwd, commitSha);
	let outcome: T;
	try {
		outcome = await fn(sandbox);
	} finally {
		sandbox.cleanup();
	}
	return outcome;
}
|
|
@@ -0,0 +1,448 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* workspace-lock.ts — Per-cwd workspace lock with startTime-safe liveness (P1g).
|
|
3
|
+
*
|
|
4
|
+
* RFC: research-findings/goal-workflow/13-VISION-RFC.md v0.5 §P1g + D10.
|
|
5
|
+
*
|
|
6
|
+
* Closes #8 (multi-goal clobber) and the B-2 PID-recycling gap. Each
|
|
7
|
+
* `workspaceMode:"single"` goal acquires this lock for its entire lifetime,
|
|
8
|
+
* serializing concurrent goals that share a cwd.
|
|
9
|
+
*
|
|
10
|
+
* Lockfile location: `<crewRoot>/state/workspace-locks/<sha256(absCwd)>.lock`
|
|
11
|
+
* Lockfile contents: { pid, startTime, heartbeat, goalId, acquiredAt }
|
|
12
|
+
*
|
|
13
|
+
* ─── LIVENESS = stale-reconciler startTime pattern (D10, B-2 fix) ───
|
|
14
|
+
* A lock is STALE iff EITHER:
|
|
15
|
+
* (a) the recorded pid's CURRENT startTime ≠ the lockfile startTime
|
|
16
|
+
* (the PID was recycled to a different process), OR
|
|
17
|
+
* (b) the heartbeat is older than HEARTBEAT_STALE_MS (default 60s)
|
|
18
|
+
* (the process crashed without exiting / heartbeat stopped).
|
|
19
|
+
*
|
|
20
|
+
* Why NOT child-pi.ts killProcessPid (B-2): killProcessPid uses
|
|
21
|
+
* process.kill(pid, 0) which is PID-only — vulnerable to PID recycling. The
|
|
22
|
+
* startTime + before/after re-verify pattern is TOCTOU-correct.
|
|
23
|
+
*
|
|
24
|
+
* getProcessStartTime is NOT exported from stale-reconciler.ts, so its logic
|
|
25
|
+
* is REPLICATED here (RFC §P1g explicitly permits importing OR replicating).
|
|
26
|
+
* The replication matches stale-reconciler.ts:112 field-for-field.
|
|
27
|
+
*
|
|
28
|
+
* Granularity: per-goal, held for the goal's lifetime (release() on goal end).
|
|
29
|
+
* Contention: default QUEUE (poll until released or stale);
|
|
30
|
+
* opts.failOnWorkspaceBusy:true → THROW instead of queue.
|
|
31
|
+
*/
|
|
32
|
+
|
|
33
|
+
import { createHash } from "node:crypto";
|
|
34
|
+
import {
|
|
35
|
+
existsSync,
|
|
36
|
+
mkdirSync,
|
|
37
|
+
readFileSync,
|
|
38
|
+
readdirSync,
|
|
39
|
+
unlinkSync,
|
|
40
|
+
openSync,
|
|
41
|
+
closeSync,
|
|
42
|
+
statSync,
|
|
43
|
+
writeFileSync,
|
|
44
|
+
} from "node:fs";
|
|
45
|
+
import * as path from "node:path";
|
|
46
|
+
import { atomicWriteJson } from "../state/atomic-write.ts";
|
|
47
|
+
import { projectCrewRoot, userCrewRoot } from "../utils/paths.ts";
|
|
48
|
+
|
|
49
|
+
/** Age (ms) beyond which a heartbeat counts as stale: 60 seconds, per RFC §P1g. */
const DEFAULT_HEARTBEAT_STALE_MS = 60 * 1000;

/** Delay (ms) between re-checks while queued behind a held lock. */
const DEFAULT_LOCK_POLL_MS = 500;
|
|
54
|
+
|
|
55
|
+
/**
 * Maps a pid to a number identifying that process's start time, or
 * undefined when it cannot be determined. Comparing the value recorded in a
 * lockfile with the current value for the same pid reveals PID recycling:
 * a different process that inherited the pid has a different start time.
 *
 * Callers (notably tests) can supply their own resolver to simulate PID
 * recycling without spawning real processes.
 */
export type StartTimeResolver = (pid: number) => number | undefined;

/**
 * Default resolver: reads `/proc/<pid>/stat` and extracts the starttime
 * field. Returns undefined when the file cannot be read (process gone, or
 * no /proc on this platform) or cannot be parsed.
 *
 * The comm field may itself contain spaces and parentheses, so parsing
 * starts after the LAST ")" in the line. The tick count is multiplied by 10
 * (presumably assuming 100 clock ticks per second — the absolute value is
 * not important, only that it is stable for one process lifetime).
 */
export const defaultStartTimeResolver: StartTimeResolver = (pid: number): number | undefined => {
	let statLine: string;
	try {
		statLine = readFileSync(`/proc/${pid}/stat`, "utf-8");
	} catch {
		return undefined;
	}
	const commEnd = statLine.lastIndexOf(")");
	if (commEnd === -1) return undefined;
	const fields = statLine.slice(commEnd + 1).trim().split(/\s+/);
	// Index 19 of the fields that follow comm is starttime.
	const ticks = Number(fields[19]);
	return Number.isFinite(ticks) ? Math.floor(ticks * 10) : undefined;
};
|
|
82
|
+
|
|
83
|
+
/** Shape of the JSON document stored in a workspace lockfile. */
export interface WorkspaceLockContents {
	/** Pid of the process that claimed the lock. */
	pid: number;
	/** Start time of that process as reported by the resolver at acquire; undefined when unavailable. */
	startTime: number | undefined;
	/** Timestamp (ms) of the most recent heartbeat; compared against "now" for staleness. */
	heartbeat: number;
	/** Identifier of the goal holding the lock. */
	goalId: string;
	/** ISO-8601 timestamp of when the lock was acquired. */
	acquiredAt: string;
}
|
|
91
|
+
|
|
92
|
+
/**
 * Handle returned by acquireWorkspaceLock. Call release() when the goal ends
 * to free the lock. release() only deletes the lockfile while it still
 * records this goalId + pid + startTime, so it does nothing if the lock was
 * reclaimed and re-acquired by another goal in the meantime.
 */
export interface WorkspaceLockHandle {
	/** The cwd the lock was requested for. */
	readonly cwd: string;
	/** The goal that owns this handle. */
	readonly goalId: string;
	/** Path of the lockfile backing this handle. */
	readonly lockPath: string;
	/** The startTime written into the lockfile at acquire; used as part of the release guard. */
	readonly startTime: number | undefined;
	/** Free the lock if this handle still owns it. */
	release(): void;
}
|
|
105
|
+
|
|
106
|
+
/** Tuning knobs and test seams for acquireWorkspaceLock. */
export interface AcquireWorkspaceLockOptions {
	/** When true, throw if the workspace is already held; when omitted, wait in the queue. */
	failOnWorkspaceBusy?: boolean;
	/** Heartbeat-staleness threshold in ms. Defaults to 60_000. */
	heartbeatStaleMs?: number;
	/** Interval in ms between re-checks while queued. Defaults to 500. */
	pollMs?: number;
	/** Test seam: replaces process start time resolution. */
	startTimeResolver?: StartTimeResolver;
	/** Test seam: replaces the clock (ms). Defaults to Date.now. */
	now?: () => number;
	/** Test seam: replaces the current pid. Defaults to process.pid. */
	pid?: number;
	/** Aborting this signal stops the wait and makes acquisition reject. */
	signal?: AbortSignal;
}
|
|
122
|
+
|
|
123
|
+
/**
 * Compute the lockfile path for a cwd:
 * `<crewRoot>/state/workspace-locks/<sha256(absCwd)>.lock`, where crewRoot
 * is the project crew root for the cwd or, if there is none, the user crew
 * root.
 *
 * Hashing the absolute path keeps filesystem-unsafe characters out of the
 * file name. The cwd is made absolute with path.resolve, which normalizes
 * `.`/`..` segments but does not resolve symlinks — two different symlinked
 * spellings of the same directory hash to different lockfiles.
 */
export function workspaceLockPath(cwd: string): string {
	const absCwd = path.resolve(cwd);
	const digest = createHash("sha256").update(absCwd).digest("hex");
	const root = projectCrewRoot(absCwd) ?? userCrewRoot();
	return path.join(root, "state", "workspace-locks", `${digest}.lock`);
}
|
|
136
|
+
|
|
137
|
+
/**
 * Read and parse a lockfile.
 *
 * Returns undefined when the file is missing, unreadable, not valid JSON, or
 * does not have the lock shape (numeric `pid` and `heartbeat`, string
 * `goalId`, and `startTime` either absent or numeric). Shape validation
 * matters: an object without a numeric heartbeat would make every staleness
 * age computation NaN, so such a lock would never be judged stale.
 *
 * @param lockPath Path of the lockfile to read.
 * @returns The parsed contents, or undefined if missing/corrupt.
 */
function readLock(lockPath: string): WorkspaceLockContents | undefined {
	let parsed: unknown;
	try {
		// A missing file simply throws ENOENT here; no separate existence check needed.
		parsed = JSON.parse(readFileSync(lockPath, "utf-8"));
	} catch {
		return undefined;
	}
	if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) return undefined;
	const candidate = parsed as Partial<WorkspaceLockContents>;
	if (typeof candidate.pid !== "number") return undefined;
	if (typeof candidate.heartbeat !== "number") return undefined;
	if (typeof candidate.goalId !== "string") return undefined;
	// startTime is omitted from the JSON when it was undefined at write time.
	if (candidate.startTime !== undefined && typeof candidate.startTime !== "number") return undefined;
	return candidate as WorkspaceLockContents;
}
|
|
148
|
+
|
|
149
|
+
/**
 * Overwrite the lockfile with `contents` through atomicWriteJson, creating
 * the parent directory first if needed.
 *
 * Intended for HEARTBEAT refresh only: an owner that has already verified
 * it holds the lock rewrites the file with a fresh timestamp, so overwriting
 * is correct. It must not be used to CLAIM a lock.
 * NOTE(review): atomicWriteJson is presumably temp-file + rename + fsync —
 * confirm in ../state/atomic-write.ts.
 */
function writeLock(lockPath: string, contents: WorkspaceLockContents): void {
	const parentDir = path.dirname(lockPath);
	mkdirSync(parentDir, { recursive: true });
	atomicWriteJson(lockPath, contents);
}
|
|
158
|
+
|
|
159
|
+
/**
 * CLAIM the lockfile via exclusive create (openSync with the "wx" flag),
 * which fails with EEXIST when the file already exists, so only one process
 * can create it. A temp+rename write is not suitable for claiming: two goals
 * that both observed a free/stale lock could both write and both believe
 * they own it.
 *
 * Three paths:
 *   - `forceOverwrite` true: the caller has ALREADY verified (via
 *     isLockStale) that the existing lock is logically stale, e.g. PID
 *     recycled. The file is unlinked and then claimed, bypassing the mtime
 *     age check. Without this a stale-by-PID lock with a recent mtime could
 *     never be claimed and the caller would re-queue forever.
 *   - Otherwise try to create the file; on success the lock is ours.
 *   - If it exists and its mtime is older than `staleReclaimMs`, unlink it
 *     and try once more.
 *
 * NOTE(review): the unlink-then-create sequences are not atomic as a pair.
 * Two reclaimers acting on the same stale lock can interleave so that the
 * second unlinks the file the first has just created. The exclusive create
 * only guarantees that a single *create* wins.
 *
 * If writing the contents fails after the file was created, the file is
 * removed again so that an empty lockfile does not block other acquirers.
 *
 * @param lockPath Lockfile to create.
 * @param contents JSON-serializable lock contents to write.
 * @param staleReclaimMs mtime age (ms, measured with Date.now()) beyond
 *   which an existing file is force-deleted and the claim retried once.
 * @param forceOverwrite Unlink any existing file before claiming.
 * @returns true when this call created the lockfile, false when it exists
 *   and was not reclaimed (or another process won the race).
 * @throws Any filesystem error other than EEXIST from the create or write.
 */
function claimLock(lockPath: string, contents: WorkspaceLockContents, staleReclaimMs: number, forceOverwrite = false): boolean {
	mkdirSync(path.dirname(lockPath), { recursive: true });
	const json = JSON.stringify(contents);
	const tryCreate = (): boolean => {
		let fd: number;
		try {
			fd = openSync(lockPath, "wx"); // exclusive create — throws EEXIST if it exists.
		} catch (error) {
			if ((error as NodeJS.ErrnoException).code !== "EEXIST") throw error;
			return false;
		}
		try {
			writeFileSync(fd, json);
		} catch (error) {
			// We created the file but could not fill it: undo the create so
			// an empty lockfile is not left behind, then surface the error.
			try { closeSync(fd); } catch { /* best-effort */ }
			try { unlinkSync(lockPath); } catch { /* best-effort */ }
			throw error;
		}
		closeSync(fd);
		return true;
	};
	if (forceOverwrite) {
		// Caller verified the existing lock is logically stale; remove it and
		// claim. If another process re-creates the file between our unlink
		// and our create, tryCreate returns false and the caller queues.
		try { unlinkSync(lockPath); } catch { /* best-effort */ }
		return tryCreate();
	}
	if (tryCreate()) return true;
	// Stale recovery by mtime age: force-delete and retry once.
	try {
		const stat = statSync(lockPath);
		if (Date.now() - stat.mtimeMs > staleReclaimMs) {
			try { unlinkSync(lockPath); } catch { /* fall through */ }
			return tryCreate();
		}
	} catch { /* fall through to false */ }
	return false;
}
|
|
219
|
+
|
|
220
|
+
/**
 * Is the lock STALE? RFC §P1g + D10 dual-check:
 *   (a) startTime mismatch → the pid now belongs to a different process, OR
 *   (b) heartbeat older than heartbeatStaleMs → crash w/o exit / abandoned.
 *
 * When the current start time cannot be resolved (process gone, or no
 * start-time source on this platform), only the heartbeat check applies.
 *
 * A lock whose heartbeat is missing or not a finite number is reported
 * stale: otherwise the age computation yields NaN, every comparison is
 * false, and the lock would look live forever.
 *
 * @param lock Parsed lockfile contents.
 * @param resolveStartTime Resolver for the current start time of `lock.pid`.
 * @param heartbeatStaleMs Maximum tolerated heartbeat age in ms.
 * @param now Current time in ms.
 * @returns `{ stale: false }`, or `{ stale: true, reason }` where reason is
 *   "pid_recycled", "heartbeat_invalid" or "heartbeat_stale".
 */
function isLockStale(
	lock: WorkspaceLockContents,
	resolveStartTime: StartTimeResolver,
	heartbeatStaleMs: number,
	now: number,
): { stale: boolean; reason?: string } {
	// (a) startTime mismatch → PID recycled to a different process.
	if (lock.startTime !== undefined) {
		const currentStartTime = resolveStartTime(lock.pid);
		if (currentStartTime !== undefined && currentStartTime !== lock.startTime) {
			return { stale: true, reason: "pid_recycled" };
		}
		// currentStartTime === undefined: process gone OR start time
		// unavailable → fall through to the heartbeat check.
	}
	// A heartbeat that is not a finite number cannot prove liveness.
	if (typeof lock.heartbeat !== "number" || !Number.isFinite(lock.heartbeat)) {
		return { stale: true, reason: "heartbeat_invalid" };
	}
	// (b) heartbeat older than threshold → crash without exit / abandoned.
	const heartbeatAge = now - lock.heartbeat;
	if (heartbeatAge > heartbeatStaleMs) {
		return { stale: true, reason: "heartbeat_stale" };
	}
	return { stale: false };
}
|
|
251
|
+
|
|
252
|
+
/**
 * Acquire the workspace lock for `goalId` at `cwd`.
 *
 * Each iteration reads the lockfile and classifies it as absent, stale
 * (PID recycled or heartbeat expired) or live. Absent and stale locks are
 * claimed through claimLock (exclusive create); stale ones are claimed with
 * forceOverwrite so a recent mtime cannot keep a logically dead lock alive.
 * If the lock is live, or the claim loses a race, the default is to QUEUE:
 * sleep `pollMs` and re-check. With opts.failOnWorkspaceBusy the function
 * throws instead.
 *
 * The read → stale-check → claim sequence contains no await, so two
 * acquires in the same process cannot interleave inside it.
 *
 * NOTE(review): nothing in this function refreshes the heartbeat after the
 * claim; presumably the lock owner does so elsewhere — confirm, otherwise a
 * lock held longer than the staleness threshold becomes reclaimable.
 *
 * @param cwd Workspace directory to lock.
 * @param goalId Goal requesting the lock.
 * @param opts Thresholds, busy behavior, abort signal and test seams.
 * @returns A handle whose release() deletes the lockfile ONLY if it still
 *   records this goal + pid + startTime.
 * @throws Error when the signal aborts, or — with failOnWorkspaceBusy —
 *   when the workspace is held by someone else.
 */
export async function acquireWorkspaceLock(
	cwd: string,
	goalId: string,
	opts: AcquireWorkspaceLockOptions = {},
): Promise<WorkspaceLockHandle> {
	const lockPath = workspaceLockPath(cwd);
	const resolveStartTime = opts.startTimeResolver ?? defaultStartTimeResolver;
	const heartbeatStaleMs = opts.heartbeatStaleMs ?? DEFAULT_HEARTBEAT_STALE_MS;
	const pollMs = opts.pollMs ?? DEFAULT_LOCK_POLL_MS;
	const now = opts.now ?? Date.now;
	const pid = opts.pid ?? process.pid;
	const writtenStartTime = resolveStartTime(pid);

	while (true) {
		// Poll-loop: re-check the lock each tick until free/stale or aborted.
		if (opts.signal?.aborted) {
			throw new Error(
				`workspace lock acquisition aborted for goal ${goalId} (cwd=${cwd})`,
			);
		}
		const existing = readLock(lockPath);
		const existingKind: "absent" | "stale" | "live" = !existing
			? "absent"
			: (isLockStale(existing, resolveStartTime, heartbeatStaleMs, now()).stale ? "stale" : "live");
		if (existingKind !== "live") {
			// Claim the lock (covers both no-lock and stale-lock cases).
			const contents: WorkspaceLockContents = {
				pid,
				startTime: writtenStartTime,
				heartbeat: now(),
				goalId,
				acquiredAt: new Date(now()).toISOString(),
			};
			const claimed = claimLock(lockPath, contents, heartbeatStaleMs, existingKind === "stale");
			if (claimed) {
				return {
					cwd,
					goalId,
					lockPath,
					startTime: writtenStartTime,
					release(): void {
						safeRelease(lockPath, goalId, pid, writtenStartTime);
					},
				};
			}
			// Lost the race: another process claimed between our check and
			// our claim. Fall through to the busy path (throw or queue).
		}
		// Lock is held by someone else.
		if (opts.failOnWorkspaceBusy) {
			// `existing` may be undefined (lock was absent or unreadable when
			// read) or may describe the stale holder we failed to replace, so
			// re-read to report whoever holds the lock now, and never
			// dereference a missing record.
			const holder = readLock(lockPath) ?? existing;
			throw new Error(
				`workspace busy: cwd=${cwd} held by goalId=${holder?.goalId ?? "unknown"} (pid=${holder?.pid ?? "unknown"})`,
			);
		}
		// Queue: wait for the next poll interval, then re-check.
		await sleepOrAbort(pollMs, opts.signal);
	}
}
|
|
335
|
+
|
|
336
|
+
/**
 * Remove the lockfile at `lockPath`, but only while it still records the
 * same (goalId, pid, startTime) triple that this owner wrote. If the lock
 * was reclaimed and re-acquired by another goal, the file now belongs to
 * that goal and is left untouched.
 *
 * Never throws: any error while reading or unlinking is swallowed, so the
 * function is safe to call from a finally block.
 */
function safeRelease(
	lockPath: string,
	goalId: string,
	pid: number,
	writtenStartTime: number | undefined,
): void {
	try {
		const onDisk = readLock(lockPath);
		if (!onDisk) return;
		const stillOurs =
			onDisk.goalId === goalId &&
			onDisk.pid === pid &&
			onDisk.startTime === writtenStartTime;
		if (stillOurs) unlinkSync(lockPath);
	} catch {
		/* best-effort — release must never throw into a finally block */
	}
}
|
|
361
|
+
|
|
362
|
+
/**
 * Sleep for `ms`, or reject early if `signal` aborts.
 *
 * - Without a signal this is a plain timer-based delay.
 * - If the signal is already aborted on entry, the promise rejects without
 *   waiting (an "abort" listener added after the fact would never fire).
 * - When the timer wins, the abort listener is removed again. This function
 *   is called once per poll tick with the same signal, so leaving the
 *   listener attached would accumulate one listener per tick.
 *
 * @param ms Delay in milliseconds.
 * @param signal Optional abort signal.
 * @returns A promise that resolves after `ms`, or rejects with
 *   Error("workspace lock acquisition aborted") on abort.
 */
function sleepOrAbort(ms: number, signal?: AbortSignal): Promise<void> {
	if (!signal) return new Promise<void>((r) => setTimeout(r, ms));
	return new Promise<void>((resolve, reject) => {
		if (signal.aborted) {
			reject(new Error("workspace lock acquisition aborted"));
			return;
		}
		const onAbort = (): void => {
			clearTimeout(timer);
			reject(new Error("workspace lock acquisition aborted"));
		};
		const timer = setTimeout(() => {
			signal.removeEventListener("abort", onAbort);
			resolve();
		}, ms);
		signal.addEventListener("abort", onAbort, { once: true });
	});
}
|
|
377
|
+
|
|
378
|
+
/**
|
|
379
|
+
* reclaimStaleLocks (declared after isWorkspaceBusy): reclaim all stale locks under `dir` (the workspace-locks directory). Returns
|
|
380
|
+
* the list of reclaimed lock paths. Stale = PID recycled OR heartbeat older
|
|
381
|
+
* than threshold. Corrupt/unreadable locks are also reclaimed.
|
|
382
|
+
*
|
|
383
|
+
* Useful as a startup or periodic sweep to clear locks left by crashed
|
|
384
|
+
* processes before any goal tries to acquire them.
|
|
385
|
+
*/
|
|
386
|
+
/**
 * Check, without acquiring anything, whether the workspace at `cwd` is
 * currently locked by a live owner. Meant for fail-fast checks (e.g.
 * `goal start` / `goal resume`) that want a clear error BEFORE spawning.
 *
 * @param cwd Workspace directory to inspect.
 * @param opts Optional overrides for start-time resolution, the heartbeat
 *   staleness threshold and the clock.
 * @returns The goalId of the current owner when the lock exists and is not
 *   stale; undefined when the lock is missing, unreadable or stale.
 */
export function isWorkspaceBusy(
	cwd: string,
	opts: { startTimeResolver?: StartTimeResolver; heartbeatStaleMs?: number; now?: () => number } = {},
): string | undefined {
	const holder = readLock(workspaceLockPath(cwd));
	if (!holder) return undefined;
	const resolveStartTime = opts.startTimeResolver ?? defaultStartTimeResolver;
	const staleAfterMs = opts.heartbeatStaleMs ?? DEFAULT_HEARTBEAT_STALE_MS;
	const clock = opts.now ?? Date.now;
	const verdict = isLockStale(holder, resolveStartTime, staleAfterMs, clock());
	if (verdict.stale) return undefined;
	return holder.goalId;
}
|
|
404
|
+
|
|
405
|
+
/**
 * Sweep `dir` and delete every `*.lock` file that is either unreadable
 * (`readLock` yields nothing) or reported stale by `isLockStale`.
 *
 * Deletion is best-effort: a failed unlink is ignored and that path is simply
 * not reported. A missing or unlistable directory yields an empty result.
 *
 * @param dir - Directory containing the lock files.
 * @param opts - Optional overrides for the heartbeat staleness threshold (ms),
 *   the start-time resolver, and the clock.
 * @returns Paths of the lock files that were actually removed.
 */
export function reclaimStaleLocks(
	dir: string,
	opts: {
		heartbeatStaleMs?: number;
		startTimeResolver?: StartTimeResolver;
		now?: () => number;
	} = {},
): string[] {
	const startTimeOf = opts.startTimeResolver ?? defaultStartTimeResolver;
	const staleAfterMs = opts.heartbeatStaleMs ?? DEFAULT_HEARTBEAT_STALE_MS;
	const clock = opts.now ?? Date.now;
	const removed: string[] = [];
	if (!existsSync(dir)) return removed;

	let names: string[];
	try {
		names = readdirSync(dir);
	} catch {
		return removed;
	}

	for (const name of names) {
		if (!name.endsWith(".lock")) continue;
		const candidate = path.join(dir, name);
		const parsed = readLock(candidate);
		// Corrupt/empty locks are reclaimed outright; readable ones only when stale.
		// The clock is sampled per readable lock, never for unreadable ones.
		const reclaimable = !parsed || isLockStale(parsed, startTimeOf, staleAfterMs, clock()).stale;
		if (!reclaimable) continue;
		try {
			unlinkSync(candidate);
			removed.push(candidate);
		} catch {
			/* best-effort */
		}
	}
	return removed;
}
|
|
@@ -59,6 +59,31 @@ export const PiTeamsWorktreeConfigSchema = Type.Object({
|
|
|
59
59
|
seedPaths: Type.Optional(Type.Array(Type.String({ minLength: 1 }))),
|
|
60
60
|
}, { additionalProperties: false });
|
|
61
61
|
|
|
62
|
+
/** Shape of the optional `verification` entry in a goal-wrap workflow config. */
const GoalWrapVerificationSchema = Type.Object(
	{
		// Required list of non-empty strings.
		commands: Type.Array(Type.String({ minLength: 1 })),
		// When present, the only accepted value is the literal "text-only".
		mode: Type.Optional(Type.Literal("text-only")),
	},
	{ additionalProperties: false },
);

/**
 * Goal-wrap config (RFC v0.5 vision: apply the `goal` completion guarantee to
 * builtin workflows).
 *
 * This is a per-workflow toggle. When enabled, a builtin workflow runs as the
 * WORKER TURN inside a goal loop (worker → judge → feedback → redo until
 * achieved / maxTurns / budget / stuck). It is OFF by default and opt-in per
 * workflow. It only applies to builtin workflows with a clear 'done' condition
 * (implementation, fast-fix); read-only workflows (review, research) are not
 * goal-wrappable.
 *
 * Every field is optional and unknown keys are rejected.
 */
export const GoalWrapWorkflowConfigSchema = Type.Object(
	{
		enabled: Type.Optional(Type.Boolean()),
		// Integer in the inclusive range 1..50.
		maxTurns: Type.Optional(Type.Integer({ minimum: 1, maximum: 50 })),
		// Non-empty string.
		evaluatorModel: Type.Optional(Type.String({ minLength: 1 })),
		verification: Type.Optional(GoalWrapVerificationSchema),
		// Integer, at least 1000.
		budgetTotal: Type.Optional(Type.Integer({ minimum: 1000 })),
		budgetUnlimited: Type.Optional(Type.Boolean()),
	},
	{ additionalProperties: false },
);
|
|
81
|
+
|
|
82
|
+
/**
 * Map from a non-empty string key to a `GoalWrapWorkflowConfigSchema` entry.
 * NOTE(review): keys are presumably builtin workflow names — confirm against the loader.
 */
export const PiTeamsGoalWrapConfigSchema = Type.Record(Type.String({ minLength: 1 }), GoalWrapWorkflowConfigSchema);
|
|
86
|
+
|
|
62
87
|
export const AgentOverrideSchema = Type.Object({
|
|
63
88
|
disabled: Type.Optional(Type.Boolean()),
|
|
64
89
|
model: Type.Optional(Type.Union([Type.String({ minLength: 1 }), Type.Literal(false)])),
|
|
@@ -152,6 +177,7 @@ export const PiTeamsConfigSchema = Type.Object({
|
|
|
152
177
|
runtime: Type.Optional(PiTeamsRuntimeConfigSchema),
|
|
153
178
|
control: Type.Optional(PiTeamsControlConfigSchema),
|
|
154
179
|
worktree: Type.Optional(PiTeamsWorktreeConfigSchema),
|
|
180
|
+
goalWrap: Type.Optional(PiTeamsGoalWrapConfigSchema),
|
|
155
181
|
agents: Type.Optional(PiTeamsAgentsConfigSchema),
|
|
156
182
|
tools: Type.Optional(PiTeamsToolsConfigSchema),
|
|
157
183
|
telemetry: Type.Optional(PiTeamsTelemetryConfigSchema),
|