pi-crew 0.7.5 → 0.7.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +71 -0
- package/README.md +11 -11
- package/docs/commands-reference.md +14 -10
- package/docs/troubleshooting.md +131 -0
- package/docs/usage.md +9 -4
- package/package.json +1 -1
- package/src/config/config.ts +11 -4
- package/src/extension/action-suggestions.ts +71 -0
- package/src/extension/context-status-injection.ts +32 -1
- package/src/extension/register.ts +71 -65
- package/src/extension/team-tool/api.ts +3 -2
- package/src/extension/team-tool/cancel.ts +5 -4
- package/src/extension/team-tool/explain.ts +2 -1
- package/src/extension/team-tool/failure-patterns.ts +124 -0
- package/src/extension/team-tool/inspect.ts +10 -6
- package/src/extension/team-tool/lifecycle-actions.ts +5 -4
- package/src/extension/team-tool/respond.ts +4 -3
- package/src/extension/team-tool/run-not-found.ts +54 -0
- package/src/extension/team-tool/run.ts +26 -4
- package/src/extension/team-tool/status.ts +58 -4
- package/src/extension/team-tool.ts +5 -3
- package/src/runtime/async-runner.ts +7 -0
- package/src/runtime/background-runner.ts +7 -1
- package/src/runtime/chain-parser.ts +13 -5
- package/src/runtime/checkpoint.ts +13 -1
- package/src/runtime/child-pi.ts +9 -1
- package/src/runtime/crash-recovery.ts +21 -1
- package/src/runtime/live-session-runtime.ts +15 -1
- package/src/runtime/parent-guard.ts +2 -2
- package/src/runtime/pi-spawn.ts +66 -0
- package/src/runtime/stale-reconciler.ts +38 -3
- package/src/runtime/task-runner.ts +10 -1
- package/src/runtime/team-runner.ts +19 -2
- package/src/runtime/verification-gates.ts +21 -1
- package/src/schema/team-tool-schema.ts +9 -0
- package/src/state/blob-store.ts +12 -10
- package/src/state/event-log-rotation.ts +114 -93
- package/src/state/event-log.ts +79 -20
- package/src/state/health-store.ts +6 -1
- package/src/state/locks.ts +66 -16
- package/src/state/state-store.ts +14 -1
- package/src/ui/card-colors.ts +7 -3
- package/src/ui/dashboard-panes/agents-pane.ts +15 -2
- package/src/ui/live-duration.ts +58 -0
- package/src/ui/tool-render.ts +7 -11
- package/src/ui/tool-renderers/index.ts +6 -3
- package/src/ui/widget/widget-formatters.ts +2 -13
- package/src/utils/fs-watch.ts +11 -60
- package/src/utils/run-watcher-registry.ts +164 -0
- package/src/workflows/discover-workflows.ts +2 -1
- package/src/workflows/workflow-config.ts +5 -0
- package/src/runtime/dynamic-script-runner.ts +0 -497
- package/src/runtime/sandbox.ts +0 -335
|
@@ -135,6 +135,13 @@ export const TeamToolParams = Type.Object({
|
|
|
135
135
|
description: "Run in background when execution support is enabled.",
|
|
136
136
|
}),
|
|
137
137
|
),
|
|
138
|
+
details: Type.Optional(
|
|
139
|
+
Type.Boolean({
|
|
140
|
+
default: true,
|
|
141
|
+
description:
|
|
142
|
+
"(status) Output detail level. true (default) = full status (task graph, agents, effectiveness, events). false = compact summary (status, goal, task counts, and only failed/attention task errors) for quick checks.",
|
|
143
|
+
}),
|
|
144
|
+
),
|
|
138
145
|
workspaceMode: Type.Optional(
|
|
139
146
|
Type.Union([Type.Literal("single"), Type.Literal("worktree")], {
|
|
140
147
|
description:
|
|
@@ -318,6 +325,8 @@ export interface TeamToolParamsValue {
|
|
|
318
325
|
taskId?: string;
|
|
319
326
|
message?: string;
|
|
320
327
|
async?: boolean;
|
|
328
|
+
/** (status) Output detail level. false = compact summary. Default: true (full). */
|
|
329
|
+
details?: boolean;
|
|
321
330
|
workspaceMode?: "single" | "worktree";
|
|
322
331
|
context?: "fresh" | "fork";
|
|
323
332
|
cwd?: string;
|
package/src/state/blob-store.ts
CHANGED
|
@@ -190,16 +190,18 @@ export function writeBlob(artifactsRoot: string, input: {
|
|
|
190
190
|
metadataWritten = true;
|
|
191
191
|
});
|
|
192
192
|
} catch (error) {
|
|
193
|
-
//
|
|
194
|
-
//
|
|
195
|
-
//
|
|
196
|
-
//
|
|
197
|
-
//
|
|
198
|
-
// the
|
|
199
|
-
//
|
|
200
|
-
//
|
|
201
|
-
//
|
|
202
|
-
//
|
|
193
|
+
// Round 24 (BUG 4 note): the catch block previously checked
|
|
194
|
+
// `if (!blobContentWritten)` — the WRONG variable (the local comment said
|
|
195
|
+
// `metadataWritten === false`). For a CONTENT-ADDRESSED store the blob path
|
|
196
|
+
// is the content hash, so the blob may be referenced by another process's
|
|
197
|
+
// metadata even when OUR metadata write failed (e.g. a concurrent conflict
|
|
198
|
+
// where the peer already wrote metadata for the same hash). Deleting it
|
|
199
|
+
// would orphan their metadata. The safe behavior is therefore to NEVER
|
|
200
|
+
// delete on a metadata write failure and let the periodic
|
|
201
|
+
// cleanupOrphanedBlobs() reclaim genuinely-orphaned blobs. The guard below
|
|
202
|
+
// only removes a blob when its CONTENT was never written (a stray/partial
|
|
203
|
+
// file from a failed content write) — which is the only unambiguously-safe
|
|
204
|
+
// case to clean up here.
|
|
203
205
|
if (!blobContentWritten) {
|
|
204
206
|
try { fs.rmSync(blobPath, { force: true }); } catch { /* best-effort */ }
|
|
205
207
|
}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import * as fs from "node:fs";
|
|
2
|
-
import { readEvents } from "./event-log.ts";
|
|
2
|
+
import { readEvents, type TeamEvent } from "./event-log.ts";
|
|
3
3
|
import { atomicWriteFile } from "./atomic-write.ts";
|
|
4
4
|
import { logInternalError } from "../utils/internal-error.ts";
|
|
5
5
|
import { withEventLogLockSync } from "./event-log.ts";
|
|
@@ -65,6 +65,25 @@ export interface CompactionResult {
|
|
|
65
65
|
* 6. Return compaction stats
|
|
66
66
|
*/
|
|
67
67
|
export function compactEventLog(eventsPath: string, config?: Partial<RotationConfig>): CompactionResult | undefined {
	// Cheap lock-free pre-check (read-only): when there is nothing to compact
	// (missing file, or event count already within the configured bound) return
	// without touching the lock at all.
	if (!prepareCompaction(eventsPath, config)) return undefined;
	// FIX: the snapshot that is WRITTEN must be taken while the lock is held.
	// Previously the snapshot was prepared before acquiring the lock and then
	// written inside it, so any event appended between the pre-read and the
	// lock acquisition was overwritten by the stale snapshot and lost silently.
	// Re-running prepareCompaction inside the lock closes that window; it is
	// safe to call with the lock held (appendEventInsideLock does the same).
	//
	// NOTE (Round 24 BUG 1): callers ALREADY holding the event-log lock (e.g.
	// appendEventInsideLock in event-log.ts) must call applyCompactionUnlocked
	// directly — calling compactEventLog from inside the lock deadlocks (the
	// mkdir lock is not re-entrant → 5s timeout → compaction never ran → the
	// log grew unbounded until events were silently dropped past 50MB).
	return withEventLogLockSync(eventsPath, () => {
		const prepared = prepareCompaction(eventsPath, config);
		// The log may have been compacted/rotated by another holder while we
		// waited for the lock — in that case there is nothing left to do.
		return prepared ? applyCompactionUnlocked(eventsPath, prepared) : undefined;
	});
}
|
|
81
|
+
|
|
82
|
+
/** Round 24 (BUG 1): the lock-free pre-read for compaction. Safe to run
|
|
83
|
+
* outside the lock (read-only). Returns the compacted lines + stats needed
|
|
84
|
+
* for the write phase. */
|
|
85
|
+
export function prepareCompaction(eventsPath: string, config?: Partial<RotationConfig>):
|
|
86
|
+
{ lines: string; originalSize: number; originalCount: number; kept: TeamEvent[] } | undefined {
|
|
68
87
|
if (!fs.existsSync(eventsPath)) return undefined;
|
|
69
88
|
const cfg = resolveConfig(config);
|
|
70
89
|
let originalSize: number;
|
|
@@ -74,79 +93,73 @@ export function compactEventLog(eventsPath: string, config?: Partial<RotationCon
|
|
|
74
93
|
if (originalCount <= cfg.compactToCount) return undefined;
|
|
75
94
|
const kept = allEvents.slice(-cfg.compactToCount);
|
|
76
95
|
const lines = kept.map((e) => JSON.stringify(e)).join("\n") + "\n";
|
|
96
|
+
return { lines, originalSize, originalCount, kept };
|
|
97
|
+
}
|
|
77
98
|
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
compactedSize: fs.statSync(eventsPath).size,
|
|
105
|
-
eventsRemoved: originalCount - kept.length,
|
|
106
|
-
eventsKept: kept.length + Math.max(0, afterWrite.length - kept.length),
|
|
107
|
-
};
|
|
108
|
-
}
|
|
109
|
-
// afterWrite.length < kept.length — events were lost during compaction window.
|
|
110
|
-
// Find missing events and re-append them.
|
|
111
|
-
// FIX: Use sequence numbers for comparison instead of JSON.stringify.
|
|
112
|
-
const afterSeqs = new Set(afterWrite.map((e) => e.metadata?.seq).filter((s): s is number => s !== undefined));
|
|
113
|
-
const missingEvents = kept.filter((e) => e.metadata?.seq === undefined || !afterSeqs.has(e.metadata.seq));
|
|
114
|
-
let recoveredCount = 0;
|
|
115
|
-
let recoveryFailed = false;
|
|
116
|
-
if (missingEvents.length > 0) {
|
|
117
|
-
// BUGFIX (Round 12 C2): the previous loop called atomicWriteFile PER event,
|
|
118
|
-
// which REPLACES the entire file each iteration — destroying the
|
|
119
|
-
// compacted log and all previously-recovered events, leaving only the
|
|
120
|
-
// LAST missing event. FIX: accumulate all missing events into one
|
|
121
|
-
// string and append in a single write (appendFileSync appends without
|
|
122
|
-
// destroying existing content).
|
|
123
|
-
const recoveryLines = missingEvents.map((e) => JSON.stringify(e) + "\n").join("");
|
|
124
|
-
try {
|
|
125
|
-
fs.appendFileSync(eventsPath, recoveryLines);
|
|
126
|
-
recoveredCount = missingEvents.length;
|
|
127
|
-
} catch (err) {
|
|
128
|
-
recoveryFailed = true;
|
|
129
|
-
logInternalError("event-log-rotation.recovery", err, `eventsPath=${eventsPath} lostEvents=${missingEvents.length}`);
|
|
130
|
-
}
|
|
131
|
-
}
|
|
99
|
+
/**
 * Round 24 (BUG 1): the write+recover phase of compaction. Assumes the
 * caller ALREADY holds the event-log lock (or accepts the unlocked race).
 *
 * Steps:
 *  1. Atomically replace the log with the pre-computed compacted content.
 *  2. Re-read the log and compare it against the events we intended to keep.
 *  3. If some kept events are absent, re-append them in ONE append call.
 *
 * @param eventsPath - Path of the event log being compacted.
 * @param prepared - Output of prepareCompaction: serialized compacted `lines`,
 *   the pre-compaction `originalSize` / `originalCount`, and the `kept` events.
 * @returns Compaction stats, or `undefined` when the atomic write failed (the
 *   failure is logged and compaction is skipped for this cycle).
 */
export function applyCompactionUnlocked(
	eventsPath: string,
	prepared: { lines: string; originalSize: number; originalCount: number; kept: TeamEvent[] },
): CompactionResult | undefined {
	const { lines, originalSize, originalCount, kept } = prepared;
	try {
		atomicWriteFile(eventsPath, lines);
	} catch (err) {
		// Concurrent write conflict — skip compaction this cycle
		logInternalError("event-log-rotation.compact", err, `eventsPath=${eventsPath}`);
		return undefined;
	}
	// C2: Re-read to verify the write and account for any events appended
	// during the compaction window.
	try {
		const afterWrite = readEvents(eventsPath);
		// Nothing was lost when the file holds at least as many events as we
		// wrote; any surplus are events appended during the window, and they
		// are counted as kept.
		if (afterWrite.length >= kept.length) {
			return {
				originalSize,
				compactedSize: fs.statSync(eventsPath).size,
				eventsRemoved: originalCount - kept.length,
				// Equivalent to kept.length + max(0, afterWrite.length - kept.length).
				eventsKept: afterWrite.length,
			};
		}
		// afterWrite.length < kept.length — events were lost during compaction window.
		// Compare by sequence number (not JSON.stringify, which is fragile due to
		// key ordering and floating point differences). Events WITHOUT a sequence
		// number cannot be matched and are always treated as missing.
		const afterSeqs = new Set(afterWrite.map((e) => e.metadata?.seq).filter((s): s is number => s !== undefined));
		const missingEvents = kept.filter((e) => e.metadata?.seq === undefined || !afterSeqs.has(e.metadata.seq));
		let recoveredCount = 0;
		let recoveryFailed = false;
		if (missingEvents.length > 0) {
			// Single append for all missing events: appendFileSync adds to the
			// existing content instead of replacing the file.
			const recoveryLines = missingEvents.map((e) => JSON.stringify(e) + "\n").join("");
			try {
				fs.appendFileSync(eventsPath, recoveryLines);
				recoveredCount = missingEvents.length;
			} catch (err) {
				recoveryFailed = true;
				logInternalError("event-log-rotation.recovery", err, `eventsPath=${eventsPath} lostEvents=${missingEvents.length}`);
			}
		}
		return {
			originalSize,
			compactedSize: fs.statSync(eventsPath).size,
			eventsRemoved: originalCount - kept.length,
			// FIX: report what is actually in the file — the events found after
			// the write plus the ones re-appended. The previous
			// `kept.length + recoveredCount` over-counted, because this branch is
			// only reached when FEWER than kept.length events survived the write.
			eventsKept: afterWrite.length + recoveredCount,
			recoveryFailed,
		};
	} catch {
		// Post-write verification failed — compaction likely succeeded.
		return {
			originalSize,
			compactedSize: fs.statSync(eventsPath).size,
			eventsRemoved: originalCount - kept.length,
			eventsKept: kept.length,
		};
	}
}
|
|
151
164
|
|
|
152
165
|
/**
|
|
@@ -161,33 +174,41 @@ export function rotateEventLog(eventsPath: string): boolean {
|
|
|
161
174
|
// FIX: Wrap rotation in lock to prevent race conditions with concurrent readers.
|
|
162
175
|
// Order of operations: (1) create new empty file, (2) rename old file to archive.
|
|
163
176
|
// This ensures eventsPath always exists — a reader never sees a missing file.
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
fs.writeFileSync(eventsPath, "", "utf-8");
|
|
185
|
-
return true;
|
|
186
|
-
} catch (error) {
|
|
187
|
-
logInternalError("event-log.rotate", error, `eventsPath=${eventsPath}`);
|
|
188
|
-
return false;
|
|
177
|
+
//
|
|
178
|
+
// NOTE (Round 24 BUG 1): callers ALREADY holding the lock must call
|
|
179
|
+
// rotateEventLogUnlocked directly — this locked variant is NOT re-entrant.
|
|
180
|
+
return withEventLogLockSync(eventsPath, () => rotateEventLogUnlocked(eventsPath));
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
/**
 * Round 24 (BUG 1): lock-free core of event-log rotation. The caller is
 * expected to already hold the event-log lock (or to accept the unlocked race).
 *
 * Copies the current log to a timestamped `*.archive.jsonl` sibling and then
 * truncates the log in place, so `eventsPath` exists at every moment.
 *
 * @param eventsPath - Path of the live event log.
 * @returns `true` when the log was archived and truncated; `false` when the
 *   log does not exist or any filesystem step failed (the failure is logged).
 */
export function rotateEventLogUnlocked(eventsPath: string): boolean {
	if (!fs.existsSync(eventsPath)) return false;
	try {
		const stamp = new Date().toISOString().replace(/[:.]/g, "-");
		let archivePath = `${eventsPath}.${stamp}.archive.jsonl`;
		// Round 12: two rotations within the same millisecond would produce the
		// same archive name and copyFileSync would silently overwrite the first
		// archive — probe numbered suffixes until an unused path is found.
		for (let suffix = 1; fs.existsSync(archivePath); suffix++) {
			archivePath = `${eventsPath}.${stamp}.${suffix}.archive.jsonl`;
		}
		// BUGFIX (Round 12 C1): order matters. Populate the archive FIRST while
		// the original is still intact, and only then empty the original in
		// place. (Emptying first and renaming afterwards archived an EMPTY file
		// and destroyed every event.)
		fs.copyFileSync(eventsPath, archivePath);
		fs.writeFileSync(eventsPath, "", "utf-8");
		return true;
	} catch (error) {
		logInternalError("event-log.rotate", error, `eventsPath=${eventsPath}`);
		return false;
	}
}
|
|
192
213
|
|
|
193
214
|
export interface EventLogStats {
|
package/src/state/event-log.ts
CHANGED
|
@@ -9,7 +9,7 @@ import { logInternalError } from "../utils/internal-error.ts";
|
|
|
9
9
|
import { readJsonlSince, type IncrementalReadState } from "../utils/incremental-reader.ts";
|
|
10
10
|
import { redactSecrets } from "../utils/redaction.ts";
|
|
11
11
|
import { sleepSync } from "../utils/sleep.ts";
|
|
12
|
-
import { needsRotation, compactEventLog, rotateEventLog } from "./event-log-rotation.ts";
|
|
12
|
+
import { needsRotation, compactEventLog, rotateEventLog, applyCompactionUnlocked, prepareCompaction, rotateEventLogUnlocked } from "./event-log-rotation.ts";
|
|
13
13
|
|
|
14
14
|
export type TeamEventProvenance = "live_worker" | "test" | "healthcheck" | "replay" | "api" | "background" | "team_runner";
|
|
15
15
|
export type TeamWatcherAction = "act" | "observe" | "ignore";
|
|
@@ -76,7 +76,7 @@ let overflowCounter = 0;
|
|
|
76
76
|
* `flushOneEventLogBuffer`, and `state/mailbox.ts`. Prefer the async alternative
|
|
77
77
|
* (`appendEventAsync`) for all new code.
|
|
78
78
|
*/
|
|
79
|
-
export function withEventLogLockSync<T>(eventsPath: string, fn: () => T): T {
|
|
79
|
+
export function withEventLogLockSync<T>(eventsPath: string, fn: () => T, options?: { timeoutMs?: number; staleMs?: number }): T {
|
|
80
80
|
// Ensure parent directory exists before attempting lock
|
|
81
81
|
fs.mkdirSync(path.dirname(eventsPath), { recursive: true });
|
|
82
82
|
const lockDir = `${eventsPath}.lock`;
|
|
@@ -86,8 +86,8 @@ export function withEventLogLockSync<T>(eventsPath: string, fn: () => T): T {
|
|
|
86
86
|
// event loop indefinitely. 500 retries × 10ms = 5s max. After timeout, we
|
|
87
87
|
// throw a clear error instead of blocking forever. This ensures AbortSignal
|
|
88
88
|
// handlers, SIGTERM, and graceful shutdown can fire within seconds.
|
|
89
|
-
const timeout = 5000;
|
|
90
|
-
const staleMs = 10000;
|
|
89
|
+
const timeout = options?.timeoutMs ?? 5000;
|
|
90
|
+
const staleMs = options?.staleMs ?? 10000;
|
|
91
91
|
let acquired = false;
|
|
92
92
|
while (true) {
|
|
93
93
|
try {
|
|
@@ -110,24 +110,35 @@ export function withEventLogLockSync<T>(eventsPath: string, fn: () => T): T {
|
|
|
110
110
|
// to check for orphaned .lock dirs / stale processes.
|
|
111
111
|
throw errors.eventLogLockTimeout(eventsPath, timeout);
|
|
112
112
|
}
|
|
113
|
-
//
|
|
113
|
+
// Round 26 (BUG 3): mtime-based stale check INDEPENDENT of pidFile.
|
|
114
|
+
// If the holder crashed between mkdir and writing pidFile, there is no
|
|
115
|
+
// pidFile to read — the old code just slept until the 5s timeout, then
|
|
116
|
+
// threw, leaving the dir orphaned FOREVER (every retry repeats the
|
|
117
|
+
// timeout). Now: if the lock dir's mtime exceeds staleMs, reclaim it.
|
|
118
|
+
try {
|
|
119
|
+
const dirStat = fs.statSync(lockDir);
|
|
120
|
+
if (Date.now() - dirStat.mtimeMs > staleMs) {
|
|
121
|
+
fs.rmSync(lockDir, { recursive: true, force: true });
|
|
122
|
+
continue;
|
|
123
|
+
}
|
|
124
|
+
} catch { /* dir vanished — let loop retry */ }
|
|
125
|
+
// Round 26 (BUG 4): the mtime check was previously NESTED inside
|
|
126
|
+
// `if (!alive)`, so a recycled PID (crashed holder's PID reused by an
|
|
127
|
+
// unrelated live process) kept `alive=true` and the mtime check NEVER
|
|
128
|
+
// fired → permanent wedge. mtime is now checked FIRST (above) for ALL
|
|
129
|
+
// holders. The PID check below is a secondary fast-path: if the holder
|
|
130
|
+
// PID is provably dead AND the lock isn't stale yet, we still wait
|
|
131
|
+
// (don't steal a fresh lock just because the pid lookup raced).
|
|
114
132
|
try {
|
|
115
133
|
const raw = fs.readFileSync(pidFile, "utf-8").trim();
|
|
116
134
|
const ownerPid = Number.parseInt(raw, 10);
|
|
117
135
|
if (!Number.isNaN(ownerPid) && ownerPid !== process.pid) {
|
|
118
136
|
let alive = false;
|
|
119
137
|
try { process.kill(ownerPid, 0); alive = true; } catch { /* dead */ }
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
const stat = fs.statSync(lockDir);
|
|
123
|
-
if (Date.now() - stat.mtimeMs > staleMs) {
|
|
124
|
-
fs.rmSync(lockDir, { recursive: true, force: true });
|
|
125
|
-
continue;
|
|
126
|
-
}
|
|
127
|
-
} catch { /* race — let loop sleep */ }
|
|
128
|
-
}
|
|
138
|
+
// (mtime already handled above; nothing to do here for dead-but-fresh.)
|
|
139
|
+
void alive;
|
|
129
140
|
}
|
|
130
|
-
} catch { /* no pid file —
|
|
141
|
+
} catch { /* no pid file — mtime check above already handles it */ }
|
|
131
142
|
sleepSync(10);
|
|
132
143
|
}
|
|
133
144
|
}
|
|
@@ -135,7 +146,19 @@ export function withEventLogLockSync<T>(eventsPath: string, fn: () => T): T {
|
|
|
135
146
|
return fn();
|
|
136
147
|
} finally {
|
|
137
148
|
if (acquired) {
|
|
138
|
-
|
|
149
|
+
// Round 26 (BUG 5): token/PID-guarded release. Previously the release
|
|
150
|
+
// was an UNCONDITIONAL rmSync. If our fn exceeded staleMs, another
|
|
151
|
+
// process could steal our lock (rm our dir, make its own); when our fn
|
|
152
|
+
// finished our finally block would then DELETE THE STEALER's dir → both
|
|
153
|
+
// in the critical section + lost lock. Verify the pidFile still records
|
|
154
|
+
// OUR pid before removing; if it doesn't, the lock was stolen and the
|
|
155
|
+
// current holder owns the dir.
|
|
156
|
+
try {
|
|
157
|
+
const currentPid = fs.readFileSync(pidFile, "utf-8").trim();
|
|
158
|
+
if (currentPid === String(process.pid)) {
|
|
159
|
+
fs.rmSync(lockDir, { recursive: true, force: true });
|
|
160
|
+
}
|
|
161
|
+
} catch { /* lock stolen or already gone — do not touch */ }
|
|
139
162
|
}
|
|
140
163
|
}
|
|
141
164
|
}
|
|
@@ -152,6 +175,29 @@ function evictOldestSequenceCacheEntries(): void {
|
|
|
152
175
|
}
|
|
153
176
|
}
|
|
154
177
|
|
|
178
|
+
/** @internal — exported for sequence-cache LRU testing (Round 19). */
|
|
179
|
+
export function __test__sequenceCacheSize(): number {
|
|
180
|
+
return sequenceCache.size;
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
/** @internal — seed an entry into the sequence cache for testing. */
|
|
184
|
+
export function __test__seedSequenceCache(eventsPath: string, lastAccessMs: number): void {
|
|
185
|
+
sequenceCache.set(eventsPath, { size: 1, mtimeMs: 0, seq: 0, lastAccessMs });
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
/** @internal — expose eviction for testing. */
|
|
189
|
+
export function __test__evictOldestSequenceCacheEntries(): void {
|
|
190
|
+
evictOldestSequenceCacheEntries();
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
/** @internal — clear the sequence cache. */
|
|
194
|
+
export function __test__clearSequenceCache(): void {
|
|
195
|
+
sequenceCache.clear();
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
/** @internal — the max sequence cache entries bound. */
|
|
199
|
+
export const MAX_SEQUENCE_CACHE_ENTRIES_VALUE = MAX_SEQUENCE_CACHE_ENTRIES;
|
|
200
|
+
|
|
155
201
|
export function sequencePath(eventsPath: string): string {
|
|
156
202
|
return `${eventsPath}.seq`;
|
|
157
203
|
}
|
|
@@ -497,9 +543,14 @@ function appendEventInsideLock(eventsPath: string, event: AppendTeamEvent): Team
|
|
|
497
543
|
if (!isTerminal && fs.existsSync(eventsPath)) {
|
|
498
544
|
const stat = fs.statSync(eventsPath);
|
|
499
545
|
if (stat.size > MAX_EVENTS_BYTES) {
|
|
500
|
-
// Try immediate compact (not waiting for counter % 100)
|
|
546
|
+
// Try immediate compact (not waiting for counter % 100).
|
|
547
|
+
// Round 24 (BUG 1): we are INSIDE withEventLogLockSync. Use the unlocked
|
|
548
|
+
// apply/rotate cores — the locked variants would deadlock (mkdir lock
|
|
549
|
+
// is not re-entrant → 5s timeout → compaction/rotation never ran →
|
|
550
|
+
// unbounded log growth → events silently dropped past 50MB).
|
|
501
551
|
try {
|
|
502
|
-
|
|
552
|
+
const prepared = prepareCompaction(eventsPath);
|
|
553
|
+
if (prepared) applyCompactionUnlocked(eventsPath, prepared);
|
|
503
554
|
} catch (error) {
|
|
504
555
|
logInternalError("event-log.immediate-compact", error, `eventsPath=${eventsPath}`);
|
|
505
556
|
}
|
|
@@ -507,7 +558,7 @@ function appendEventInsideLock(eventsPath: string, event: AppendTeamEvent): Team
|
|
|
507
558
|
if (fs.existsSync(eventsPath)) {
|
|
508
559
|
const afterCompact = fs.statSync(eventsPath);
|
|
509
560
|
if (afterCompact.size > MAX_EVENTS_BYTES) {
|
|
510
|
-
|
|
561
|
+
rotateEventLogUnlocked(eventsPath);
|
|
511
562
|
}
|
|
512
563
|
}
|
|
513
564
|
}
|
|
@@ -555,7 +606,15 @@ function appendEventInsideLock(eventsPath: string, event: AppendTeamEvent): Team
|
|
|
555
606
|
}
|
|
556
607
|
appendCounter++;
|
|
557
608
|
if (appendCounter % 100 === 0 && needsRotation(eventsPath)) {
|
|
558
|
-
|
|
609
|
+
// Round 24 (BUG 1): we are INSIDE withEventLogLockSync here (called via
|
|
610
|
+
// appendEventInsideLock). The mkdir lock is NOT re-entrant, so calling the
|
|
611
|
+
// locked compactEventLog would deadlock → 5s timeout → compaction never
|
|
612
|
+
// ran → unbounded log growth → events silently dropped past 50MB. Use the
|
|
613
|
+
// unlocked apply path instead (lock already held).
|
|
614
|
+
try {
|
|
615
|
+
const prepared = prepareCompaction(eventsPath);
|
|
616
|
+
if (prepared) applyCompactionUnlocked(eventsPath, prepared);
|
|
617
|
+
} catch (error) { logInternalError("event-log.rotation", error, `eventsPath=${eventsPath}`); }
|
|
559
618
|
}
|
|
560
619
|
try { emitFromTeamEvent(fullEvent); } catch (error) { logInternalError("event-log.emit", error); }
|
|
561
620
|
return fullEvent;
|
|
@@ -4,7 +4,12 @@ import type { RunHealth } from "../runtime/task-health.ts";
|
|
|
4
4
|
import { computeRunHealth } from "../runtime/task-health.ts";
|
|
5
5
|
import type { ManifestSummary } from "../runtime/task-health.ts";
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
// Relative to the crew root (`<cwd>/.crew`). BUG A fix (pts/2 hang
|
|
8
|
+
// investigation 2026-06-16): this was `.crew/state/health`, which double-joined
|
|
9
|
+
// to `<crewRoot>/state/.crew/state/health` because the caller passed the state
|
|
10
|
+
// dir (not the crew root). Now the caller passes the real crew root, so this is
|
|
11
|
+
// a plain `state/health` suffix.
|
|
12
|
+
const HEALTH_DIR = "state/health";
|
|
8
13
|
|
|
9
14
|
export interface HealthSnapshot {
|
|
10
15
|
runId: string;
|
package/src/state/locks.ts
CHANGED
|
@@ -66,6 +66,57 @@ function isLockHolderAlive(filePath: string): boolean {
|
|
|
66
66
|
}
|
|
67
67
|
}
|
|
68
68
|
|
|
69
|
+
/**
 * Round 26 (BUG 1): read the lock file ONCE and evaluate staleness + holder
 * liveness from that single snapshot.
 *
 * Previously `acquireLockWithRetry` called `isLockStale()` and
 * `isLockHolderAlive()` separately, each performing its own `readFileSync`.
 * Between those two reads the lock could transition stale→fresh (old holder
 * released, new holder acquired): isLockStale saw the OLD createdAt → stale,
 * isLockHolderAlive saw the NEW pid → alive, yielding `!stale && alive` =
 * false → we forcibly rm the NEW holder's freshly-acquired lock and take it
 * ourselves → BOTH in the critical section. Reading once closes the window.
 *
 * @param filePath - Path of the lock file to inspect.
 * @param staleMs - Age in milliseconds beyond which the lock counts as stale.
 * @returns `{ canSteal: true }` if the lock is stale OR the holder is dead
 *   (safe to forcibly remove); `{ canSteal: false }` if it is fresh AND held
 *   by a live (or unidentifiable) process, or if the lock file could not be
 *   read at all (nothing to steal — the caller's loop retries the create).
 */
function readLockSnapshot(filePath: string, staleMs: number): { canSteal: boolean } {
	let stat: fs.Stats | undefined;
	let raw: string | undefined;
	try {
		stat = fs.statSync(filePath);
		raw = fs.readFileSync(filePath, "utf-8");
	} catch {
		// File vanished between writeLockFile's EEXIST and now (holder released).
		// Loop will retry the create; safe to signal "nothing to steal".
		return { canSteal: false };
	}
	// Staleness: prefer the creation time recorded in the payload and fall back
	// to the file's mtime when the payload carries none.
	let createdAt = parseCreatedAtFromLock(raw);
	if (createdAt === undefined) createdAt = stat.mtimeMs;
	const isStale = Date.now() - createdAt > staleMs;
	// Holder liveness from the SAME payload. An unknown holder (no numeric pid,
	// or a malformed payload) is assumed alive to be safe.
	let isAlive = true;
	try {
		const parsed = JSON.parse(raw) as { pid?: unknown };
		if (typeof parsed.pid === "number") {
			try {
				// Signal 0 performs the existence/permission check without
				// delivering a signal.
				process.kill(parsed.pid, 0);
			} catch {
				// Any failure (EPERM or ESRCH) is treated as not-alive, i.e.
				// stealable — kept as in the original code.
				// NOTE(review): EPERM normally means the process EXISTS but may
				// not be signalled by us; confirm that stealing in that case
				// is intended (and consistent with isLockHolderAlive).
				isAlive = false;
			}
		}
	} catch { /* malformed payload — keep isAlive=true */ }
	// Steal if stale OR holder dead — matches the original intent.
	return { canSteal: isStale || !isAlive };
}
|
|
119
|
+
|
|
69
120
|
/**
|
|
70
121
|
* Lock file kinds. Discriminator written to the lock file payload so that:
|
|
71
122
|
* - Debugging tools (e.g. a future `pi-crew locks` command) can identify
|
|
@@ -180,9 +231,10 @@ function acquireLockWithRetry(filePath: string, staleMs: number, kind: LockKind
|
|
|
180
231
|
if (Date.now() > deadline) {
|
|
181
232
|
throw new Error(`Run '${path.basename(filePath)}' is locked by another operation.`);
|
|
182
233
|
}
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
234
|
+
// Round 26 (BUG 1): single-snapshot read closes the TOCTOU window between
|
|
235
|
+
// separate stale + alive reads (which could race stale→fresh).
|
|
236
|
+
const { canSteal } = readLockSnapshot(filePath, staleMs);
|
|
237
|
+
if (!canSteal) {
|
|
186
238
|
throw new Error(`Run '${path.basename(filePath)}' is locked by another operation.`);
|
|
187
239
|
}
|
|
188
240
|
// Stale or dead holder — forcibly remove the lock.
|
|
@@ -213,9 +265,9 @@ async function acquireLockWithRetryAsync(filePath: string, staleMs: number, kind
|
|
|
213
265
|
if (Date.now() > deadline) {
|
|
214
266
|
throw new Error(`Run '${path.basename(filePath)}' is locked by another operation.`);
|
|
215
267
|
}
|
|
216
|
-
|
|
217
|
-
const
|
|
218
|
-
if (!
|
|
268
|
+
// Round 26 (BUG 1): single-snapshot read (see sync variant).
|
|
269
|
+
const { canSteal } = readLockSnapshot(filePath, staleMs);
|
|
270
|
+
if (!canSteal) {
|
|
219
271
|
throw new Error(`Run '${path.basename(filePath)}' is locked by another operation.`);
|
|
220
272
|
}
|
|
221
273
|
// Stale or dead holder — forcibly remove the lock.
|
|
@@ -244,16 +296,14 @@ export function withFileLockSync<T>(filePath: string, fn: () => T, options: RunL
|
|
|
244
296
|
// Between mkdir and lock acquisition, an attacker could plant a symlink.
|
|
245
297
|
if (!isSymlinkSafePath(path.dirname(lockFile))) throw new Error("Refusing: parent of lock directory is a symlink");
|
|
246
298
|
fs.mkdirSync(path.dirname(lockFile), { recursive: true });
|
|
247
|
-
//
|
|
248
|
-
//
|
|
249
|
-
//
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
try { fs.rmSync(lockFile, { force: true }); } catch { /* ignore */ }
|
|
256
|
-
}
|
|
299
|
+
// Round 26 (BUG 2): REMOVED the pre-acquisition target-file-existence check.
|
|
300
|
+
// It was racy — between statSync(target) and acquire, a concurrent process
|
|
301
|
+
// could acquire the lock to CREATE the target, and we'd delete its active
|
|
302
|
+
// lock. It was also actively wrong for callers that pass a path already
|
|
303
|
+
// ending in `.lock` (config.ts: the checked "target" never exists, so the
|
|
304
|
+
// cleanup ALWAYS fired, deleting a fresh concurrent holder's lock). Genuine
|
|
305
|
+
// orphan locks (crashed holder) are reclaimed by acquireLockWithRetry's
|
|
306
|
+
// staleMs-based steal logic after at most `staleMs`.
|
|
257
307
|
// FIX (TOCTOU): Re-validate symlink safety before each lock acquisition
|
|
258
308
|
// attempt. Between our initial check and the acquisition (and between
|
|
259
309
|
// acquireLockWithRetry's internal retries), an attacker could plant a
|
package/src/state/state-store.ts
CHANGED
|
@@ -57,7 +57,7 @@ export interface RunPaths {
|
|
|
57
57
|
eventsPath: string;
|
|
58
58
|
}
|
|
59
59
|
|
|
60
|
-
interface ManifestCacheEntry {
|
|
60
|
+
export interface ManifestCacheEntry {
|
|
61
61
|
manifest: TeamRunManifest;
|
|
62
62
|
tasks: TeamTaskState[];
|
|
63
63
|
manifestMtimeMs: number;
|
|
@@ -76,6 +76,19 @@ const MANIFEST_CACHE_TTL_MS = 15 * 1000; // 15 seconds (FIX: increased from 5s f
|
|
|
76
76
|
const LOAD_MANIFEST_RETRY_LIMIT = 5; // Configurable retry limit for mtime/size stability checks under contention
|
|
77
77
|
const manifestCache = new Map<string, ManifestCacheEntry>();
|
|
78
78
|
|
|
79
|
+
/** @internal — exported for TTL-eviction unit testing (Round 19). */
|
|
80
|
+
export function __test__setManifestCache(stateRoot: string, entry: ManifestCacheEntry): void {
|
|
81
|
+
setManifestCache(stateRoot, entry);
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
/** @internal — exported for TTL-eviction unit testing (Round 19). */
|
|
85
|
+
export function __test__getManifestCacheEntry(stateRoot: string): ManifestCacheEntry | undefined {
|
|
86
|
+
return manifestCache.get(stateRoot);
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
/** @internal — the TTL in ms used for manifest cache eviction. */
|
|
90
|
+
export const MANIFEST_CACHE_TTL_MS_VALUE = MANIFEST_CACHE_TTL_MS;
|
|
91
|
+
|
|
79
92
|
function setManifestCache(stateRoot: string, entry: ManifestCacheEntry): void {
|
|
80
93
|
if (manifestCache.has(stateRoot)) manifestCache.delete(stateRoot);
|
|
81
94
|
entry.cachedAt = Date.now();
|