gsd-pi 2.78.1-dev.d8826a445 → 2.78.1-dev.eccf86e27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -7
- package/dist/help-text.js +1 -1
- package/dist/resource-loader.js +6 -1
- package/dist/resources/.managed-resources-content-hash +1 -1
- package/dist/resources/extensions/gsd/auto/detect-stuck.js +41 -5
- package/dist/resources/extensions/gsd/auto/loop.js +235 -36
- package/dist/resources/extensions/gsd/auto/phases.js +7 -5
- package/dist/resources/extensions/gsd/auto/session.js +33 -0
- package/dist/resources/extensions/gsd/auto-dispatch.js +46 -2
- package/dist/resources/extensions/gsd/auto-post-unit.js +19 -11
- package/dist/resources/extensions/gsd/auto-worktree.js +26 -187
- package/dist/resources/extensions/gsd/auto.js +79 -50
- package/dist/resources/extensions/gsd/bootstrap/register-hooks.js +9 -4
- package/dist/resources/extensions/gsd/crash-recovery.js +160 -47
- package/dist/resources/extensions/gsd/db/auto-workers.js +227 -0
- package/dist/resources/extensions/gsd/db/command-queue.js +105 -0
- package/dist/resources/extensions/gsd/db/milestone-leases.js +210 -0
- package/dist/resources/extensions/gsd/db/runtime-kv.js +91 -0
- package/dist/resources/extensions/gsd/db/unit-dispatches.js +322 -0
- package/dist/resources/extensions/gsd/docs/COORDINATION.md +42 -0
- package/dist/resources/extensions/gsd/doctor-proactive.js +4 -0
- package/dist/resources/extensions/gsd/doctor-runtime-checks.js +22 -6
- package/dist/resources/extensions/gsd/doctor.js +12 -2
- package/dist/resources/extensions/gsd/gsd-db.js +161 -3
- package/dist/resources/extensions/gsd/guided-flow.js +6 -2
- package/dist/resources/extensions/gsd/interrupted-session.js +18 -15
- package/dist/resources/extensions/gsd/state.js +21 -6
- package/dist/resources/extensions/gsd/worktree-resolver.js +64 -0
- package/dist/tsconfig.extensions.tsbuildinfo +1 -1
- package/dist/web/standalone/.next/BUILD_ID +1 -1
- package/dist/web/standalone/.next/app-path-routes-manifest.json +12 -12
- package/dist/web/standalone/.next/build-manifest.json +2 -2
- package/dist/web/standalone/.next/prerender-manifest.json +3 -3
- package/dist/web/standalone/.next/server/app/_global-error.html +1 -1
- package/dist/web/standalone/.next/server/app/_global-error.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_global-error.segments/_full.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_global-error.segments/_global-error/__PAGE__.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_global-error.segments/_global-error.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_global-error.segments/_head.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_global-error.segments/_index.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_global-error.segments/_tree.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_not-found.html +1 -1
- package/dist/web/standalone/.next/server/app/_not-found.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_not-found.segments/_full.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_not-found.segments/_head.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_not-found.segments/_index.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_not-found.segments/_not-found/__PAGE__.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_not-found.segments/_not-found.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_not-found.segments/_tree.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/index.html +1 -1
- package/dist/web/standalone/.next/server/app/index.rsc +1 -1
- package/dist/web/standalone/.next/server/app/index.segments/__PAGE__.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/index.segments/_full.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/index.segments/_head.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/index.segments/_index.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/index.segments/_tree.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app-paths-manifest.json +12 -12
- package/dist/web/standalone/.next/server/middleware-build-manifest.js +1 -1
- package/dist/web/standalone/.next/server/pages/404.html +1 -1
- package/dist/web/standalone/.next/server/pages/500.html +1 -1
- package/dist/web/standalone/.next/server/server-reference-manifest.json +1 -1
- package/package.json +1 -1
- package/src/resources/extensions/gsd/auto/detect-stuck.ts +37 -5
- package/src/resources/extensions/gsd/auto/loop.ts +263 -41
- package/src/resources/extensions/gsd/auto/phases.ts +7 -5
- package/src/resources/extensions/gsd/auto/session.ts +36 -0
- package/src/resources/extensions/gsd/auto-dispatch.ts +53 -2
- package/src/resources/extensions/gsd/auto-post-unit.ts +19 -11
- package/src/resources/extensions/gsd/auto-worktree.ts +26 -211
- package/src/resources/extensions/gsd/auto.ts +89 -44
- package/src/resources/extensions/gsd/bootstrap/register-hooks.ts +9 -4
- package/src/resources/extensions/gsd/crash-recovery.ts +177 -43
- package/src/resources/extensions/gsd/db/auto-workers.ts +273 -0
- package/src/resources/extensions/gsd/db/command-queue.ts +149 -0
- package/src/resources/extensions/gsd/db/milestone-leases.ts +274 -0
- package/src/resources/extensions/gsd/db/runtime-kv.ts +127 -0
- package/src/resources/extensions/gsd/db/unit-dispatches.ts +446 -0
- package/src/resources/extensions/gsd/docs/COORDINATION.md +42 -0
- package/src/resources/extensions/gsd/doctor-proactive.ts +4 -0
- package/src/resources/extensions/gsd/doctor-runtime-checks.ts +24 -6
- package/src/resources/extensions/gsd/doctor.ts +10 -2
- package/src/resources/extensions/gsd/gsd-db.ts +170 -3
- package/src/resources/extensions/gsd/guided-flow.ts +6 -2
- package/src/resources/extensions/gsd/interrupted-session.ts +19 -12
- package/src/resources/extensions/gsd/state.ts +44 -6
- package/src/resources/extensions/gsd/tests/auto-loop-no-copy-artifacts.test.ts +72 -0
- package/src/resources/extensions/gsd/tests/auto-loop-symlink-worktree.test.ts +190 -0
- package/src/resources/extensions/gsd/tests/auto-workers.test.ts +105 -0
- package/src/resources/extensions/gsd/tests/command-queue.test.ts +141 -0
- package/src/resources/extensions/gsd/tests/crash-recovery-via-db.test.ts +203 -0
- package/src/resources/extensions/gsd/tests/crash-recovery.test.ts +169 -59
- package/src/resources/extensions/gsd/tests/detect-stuck-respects-retry.test.ts +173 -0
- package/src/resources/extensions/gsd/tests/integration/auto-worktree.test.ts +22 -12
- package/src/resources/extensions/gsd/tests/integration/doctor-proactive.test.ts +24 -10
- package/src/resources/extensions/gsd/tests/integration/doctor-runtime.test.ts +35 -23
- package/src/resources/extensions/gsd/tests/integration/workspace-collapse-integration.test.ts +3 -5
- package/src/resources/extensions/gsd/tests/interrupted-session-auto.test.ts +72 -25
- package/src/resources/extensions/gsd/tests/interrupted-session-ui.test.ts +72 -25
- package/src/resources/extensions/gsd/tests/memory-pressure-stuck-state.test.ts +9 -6
- package/src/resources/extensions/gsd/tests/milestone-leases.test.ts +152 -0
- package/src/resources/extensions/gsd/tests/parallel-milestone-isolation.test.ts +106 -0
- package/src/resources/extensions/gsd/tests/paused-session-via-db.test.ts +119 -0
- package/src/resources/extensions/gsd/tests/pipeline-variant-dispatch.test.ts +58 -0
- package/src/resources/extensions/gsd/tests/preferences-worktree-sync.test.ts +3 -17
- package/src/resources/extensions/gsd/tests/register-hooks-depth-verification.test.ts +110 -0
- package/src/resources/extensions/gsd/tests/runtime-kv.test.ts +120 -0
- package/src/resources/extensions/gsd/tests/skipped-validation-completion.test.ts +133 -28
- package/src/resources/extensions/gsd/tests/skipped-validation-db-atomicity.test.ts +17 -0
- package/src/resources/extensions/gsd/tests/stuck-state-via-db.test.ts +134 -0
- package/src/resources/extensions/gsd/tests/sync-layer-scope.test.ts +7 -26
- package/src/resources/extensions/gsd/tests/teardown-cleanup-parity.test.ts +4 -8
- package/src/resources/extensions/gsd/tests/unit-dispatches.test.ts +247 -0
- package/src/resources/extensions/gsd/tests/validate-milestone.test.ts +41 -1
- package/src/resources/extensions/gsd/tests/workspace.test.ts +15 -9
- package/src/resources/extensions/gsd/tests/write-gate.test.ts +31 -23
- package/src/resources/extensions/gsd/worktree-resolver.ts +62 -0
- package/src/resources/extensions/gsd/tests/auto-lock-creation.test.ts +0 -213
- package/src/resources/extensions/gsd/tests/auto-stale-lock-self-kill.test.ts +0 -87
- package/src/resources/extensions/gsd/tests/stop-auto-remote.test.ts +0 -159
- /package/dist/web/standalone/.next/static/{AT5qi39nKXkdmQIOIoh0f → Y5UeGFkXTYM9WIQOWHkot}/_buildManifest.js +0 -0
- /package/dist/web/standalone/.next/static/{AT5qi39nKXkdmQIOIoh0f → Y5UeGFkXTYM9WIQOWHkot}/_ssgManifest.js +0 -0
|
@@ -1,21 +1,43 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* GSD Crash Recovery
|
|
2
|
+
* GSD Crash Recovery (Phase C pt 2 — DB-backed)
|
|
3
3
|
*
|
|
4
|
-
* Detects interrupted auto-mode sessions via
|
|
5
|
-
*
|
|
6
|
-
*
|
|
4
|
+
* Detects interrupted auto-mode sessions via the DB-backed workers +
|
|
5
|
+
* unit_dispatches + runtime_kv tables. The auto.lock file is gone; the
|
|
6
|
+
* `LockData` shape is preserved for backward compatibility with callers
|
|
7
|
+
* (auto.ts, doctor checks, interrupted-session.ts), but the contents are
|
|
8
|
+
* now synthesized from:
|
|
7
9
|
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
10
|
+
* - workers.pid / .started_at / .last_heartbeat_at → liveness + age
|
|
11
|
+
* - unit_dispatches.unit_type / .unit_id / .started_at → what was running
|
|
12
|
+
* - runtime_kv("worker", workerId, "session_file") → pi session JSONL path
|
|
13
|
+
*
|
|
14
|
+
* "Crashed" is detected via workers.status='active' + heartbeat past TTL,
|
|
15
|
+
* cross-checked with the OS PID via isLockProcessAlive(). When the DB is
|
|
16
|
+
 * unavailable (fresh project before init), readers fall back to the legacy
 * lock file and only the DB-backed session_file bookkeeping is skipped —
 * with no lock file either, the historical "no lock means no prior crash"
 * semantics still apply.
|
|
19
|
+
*
|
|
20
|
+
* The journal-based emitCrashRecoveredUnitEnd is unchanged from the file
|
|
21
|
+
* era — it queries the journal independently of the lock mechanism.
|
|
11
22
|
*/
|
|
12
23
|
|
|
24
|
+
import {
|
|
25
|
+
emitJournalEvent,
|
|
26
|
+
queryJournal,
|
|
27
|
+
} from "./journal.js";
|
|
13
28
|
import { readFileSync, unlinkSync, existsSync } from "node:fs";
|
|
14
29
|
import { join } from "node:path";
|
|
15
|
-
import {
|
|
30
|
+
import {
|
|
31
|
+
findStaleWorkerForProject,
|
|
32
|
+
getAllAutoWorkers,
|
|
33
|
+
type AutoWorkerRow,
|
|
34
|
+
} from "./db/auto-workers.js";
|
|
35
|
+
import { getLatestForUnit, type DispatchStatus } from "./db/unit-dispatches.js";
|
|
36
|
+
import { getRuntimeKv, setRuntimeKv, deleteRuntimeKv } from "./db/runtime-kv.js";
|
|
37
|
+
import { _getAdapter, isDbAvailable } from "./gsd-db.js";
|
|
38
|
+
import { gsdRoot, normalizeRealPath } from "./paths.js";
|
|
16
39
|
import { atomicWriteSync } from "./atomic-write.js";
|
|
17
40
|
import { effectiveLockFile } from "./session-lock.js";
|
|
18
|
-
import { emitJournalEvent, queryJournal } from "./journal.js";
|
|
19
41
|
|
|
20
42
|
export interface LockData {
|
|
21
43
|
pid: number;
|
|
@@ -27,11 +49,91 @@ export interface LockData {
|
|
|
27
49
|
sessionFile?: string;
|
|
28
50
|
}
|
|
29
51
|
|
|
52
|
+
// runtime_kv key (used with the "worker" scope and a worker_id) under which
// the pi session JSONL path is stored for a worker.
const SESSION_FILE_KV_KEY = "session_file";
|
|
53
|
+
|
|
30
54
|
/**
 * Build the path of the lock file for a project.
 *
 * @param basePath Project base path handed to gsdRoot().
 * @returns gsdRoot(basePath) joined with the name from effectiveLockFile().
 */
function lockPath(basePath: string): string {
  const root = gsdRoot(basePath);
  const lockFileName = effectiveLockFile();
  return join(root, lockFileName);
}
|
|
33
57
|
|
|
34
|
-
|
|
58
|
+
/**
 * Read the file-based lock for a project, if one exists.
 *
 * @param basePath Project base path used to locate the lock file.
 * @returns The parsed lock, or null when the file is missing, cannot be read,
 *   is not valid JSON, or does not contain a JSON object.
 */
function readLegacyLock(basePath: string): LockData | null {
  try {
    const p = lockPath(basePath);
    if (!existsSync(p)) return null;
    const parsed: unknown = JSON.parse(readFileSync(p, "utf-8"));
    // A corrupt file can still parse to a primitive or an array. Returning
    // that as LockData makes callers crash on field access (e.g.
    // lock.unitType.includes), so only plain objects are accepted.
    if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
      return null;
    }
    return parsed as LockData;
  } catch {
    // Best-effort: any I/O or parse failure means "no usable lock".
    return null;
  }
}
|
|
67
|
+
|
|
68
|
+
/**
 * Find the workers row that belongs to this process for the given project.
 *
 * A process can own several rows for the same project root (each
 * registration generates a fresh worker_id), so the first match is not
 * necessarily the current session. The newest row with status 'active' is
 * preferred; when none is active, the newest matching row of any status is
 * returned so cleanup can still find a row that has already left 'active'.
 *
 * @param projectRootRealpath Project root, compared verbatim against
 *   workers.project_root_realpath.
 * @returns The matching worker row, or null when the DB is unavailable or no
 *   row matches this pid + project root.
 */
function findActiveWorkerForCurrentProcess(
  projectRootRealpath: string,
): AutoWorkerRow | null {
  if (!isDbAvailable()) return null;

  let newestActive: AutoWorkerRow | null = null;
  let newestAny: AutoWorkerRow | null = null;

  for (const worker of getAllAutoWorkers()) {
    if (
      worker.pid !== process.pid
      || worker.project_root_realpath !== projectRootRealpath
    ) {
      continue;
    }
    // started_at is an ISO-8601 UTC string, so string order is time order.
    if (newestAny === null || worker.started_at >= newestAny.started_at) {
      newestAny = worker;
    }
    if (
      worker.status === "active"
      && (newestActive === null || worker.started_at >= newestActive.started_at)
    ) {
      newestActive = worker;
    }
  }

  return newestActive ?? newestAny;
}
|
|
83
|
+
|
|
84
|
+
/**
 * Fetch the newest unit_dispatches row (highest id) recorded for a worker,
 * whatever its status.
 *
 * @param workerId worker_id to look up.
 * @returns The unit type, unit id, start time and status of that dispatch, or
 *   null when the DB is unavailable or the worker has no dispatch rows (for
 *   example it crashed during bootstrap before claiming a unit).
 */
function getLatestDispatchForWorker(workerId: string):
  | { unit_type: string; unit_id: string; started_at: string; status: DispatchStatus }
  | null {
  if (!isDbAvailable()) return null;

  const statement = _getAdapter()!.prepare(
    `SELECT unit_type, unit_id, started_at, status
     FROM unit_dispatches
     WHERE worker_id = :worker_id
     ORDER BY id DESC
     LIMIT 1`,
  );
  const latest = statement.get({ ":worker_id": workerId }) as
    | { unit_type: string; unit_id: string; started_at: string; status: DispatchStatus }
    | undefined;

  return latest ?? null;
}
|
|
105
|
+
|
|
106
|
+
/**
 * Build a LockData record for a worker from its workers row, its most recent
 * dispatch and its stored session-file pointer.
 *
 * When the worker has no dispatch row, the unit is reported as
 * unitType "starting" / unitId "bootstrap" and the unit start time falls back
 * to the worker's own start time.
 *
 * @param worker Row from the workers table.
 * @returns The synthesized lock data.
 */
function workerToLockData(worker: AutoWorkerRow): LockData {
  const { worker_id: workerId, pid, started_at: workerStartedAt } = worker;

  const latestDispatch = getLatestDispatchForWorker(workerId);
  const storedSessionFile = getRuntimeKv<string>("worker", workerId, SESSION_FILE_KV_KEY);

  const lock: LockData = {
    pid,
    startedAt: workerStartedAt,
    unitType: latestDispatch?.unit_type ?? "starting",
    unitId: latestDispatch?.unit_id ?? "bootstrap",
    unitStartedAt: latestDispatch?.started_at ?? workerStartedAt,
    // A missing KV entry is normalised to undefined.
    sessionFile: storedSessionFile ?? undefined,
  };
  return lock;
}
|
|
122
|
+
|
|
123
|
+
/**
|
|
124
|
+
* Write or update the lock state for the current auto-mode session.
|
|
125
|
+
*
|
|
126
|
+
* Phase C pt 2: the only persistent state this function adds beyond what
|
|
127
|
+
* the workers + unit_dispatches tables already track is the pi session
|
|
128
|
+
* JSONL path, which lands in runtime_kv (worker scope, key
|
|
129
|
+
* "session_file"). The pid/startedAt/unitType/unitId/unitStartedAt are
|
|
130
|
+
* recorded by registerAutoWorker / heartbeatAutoWorker / recordDispatchClaim
|
|
131
|
+
* already.
|
|
132
|
+
*
|
|
133
|
+
 * basePath locates the lock file and is normalized to look up the worker row
 * (signature kept for the 15+ call sites) — the worker is identified by
|
|
135
|
+
* pid + project_root_realpath in the workers table.
|
|
136
|
+
*/
|
|
35
137
|
export function writeLock(
|
|
36
138
|
basePath: string,
|
|
37
139
|
unitType: string,
|
|
@@ -47,51 +149,84 @@ export function writeLock(
|
|
|
47
149
|
unitStartedAt: new Date().toISOString(),
|
|
48
150
|
sessionFile,
|
|
49
151
|
};
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
152
|
+
atomicWriteSync(lockPath(basePath), JSON.stringify(data, null, 2));
|
|
153
|
+
} catch {
|
|
154
|
+
// Best-effort — never throw from the lock writer.
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
if (!isDbAvailable() || !sessionFile) return;
|
|
158
|
+
try {
|
|
159
|
+
const projectRoot = normalizeRealPath(basePath);
|
|
160
|
+
const worker = findActiveWorkerForCurrentProcess(projectRoot);
|
|
161
|
+
if (!worker) return;
|
|
162
|
+
setRuntimeKv("worker", worker.worker_id, SESSION_FILE_KV_KEY, sessionFile);
|
|
163
|
+
} catch {
|
|
164
|
+
// Best-effort — never throw from the lock writer.
|
|
165
|
+
}
|
|
53
166
|
}
|
|
54
167
|
|
|
55
|
-
/**
|
|
168
|
+
/**
 * Clear the lock state for the current auto-mode session.
 *
 * Two independent best-effort steps, neither of which ever throws:
 *   1. delete the lock file for basePath if it exists;
 *   2. when the DB is available, delete the "session_file" runtime_kv entry
 *      of this process's worker row so later crash detection does not pick
 *      up a stale session-file pointer.
 *
 * @param basePath Project base path.
 */
export function clearLock(basePath: string): void {
  try {
    const lockFile = lockPath(basePath);
    if (existsSync(lockFile)) unlinkSync(lockFile);
  } catch {
    // Best-effort.
  }

  if (!isDbAvailable()) return;

  try {
    const current = findActiveWorkerForCurrentProcess(normalizeRealPath(basePath));
    if (current) {
      deleteRuntimeKv("worker", current.worker_id, SESSION_FILE_KV_KEY);
    }
  } catch {
    // Best-effort.
  }
}
|
|
62
193
|
|
|
63
|
-
/**
|
|
194
|
+
/**
 * Detect a previous crashed auto-mode session for a project.
 *
 * With the DB available, a stale worker for the normalized project root is
 * looked up and converted to LockData. If no stale worker is found, the
 * lookup throws, or the DB is unavailable, the lock file is read instead.
 *
 * @param basePath Project base path.
 * @returns Lock data describing the crashed session, or null when neither
 *   the DB nor the lock file reports one.
 */
export function readCrashLock(basePath: string): LockData | null {
  if (!isDbAvailable()) return readLegacyLock(basePath);

  try {
    const staleWorker = findStaleWorkerForProject(normalizeRealPath(basePath));
    if (staleWorker) return workerToLockData(staleWorker);
  } catch {
    // Fall through to the legacy lock-file compatibility path.
  }

  return readLegacyLock(basePath);
}
|
|
75
214
|
|
|
76
215
|
/**
|
|
77
216
|
* Check whether the process that wrote the lock is still running.
|
|
78
217
|
* Uses `process.kill(pid, 0)` which sends no signal but checks liveness.
|
|
79
218
|
* Returns true if the PID matches our own — we are the lock holder (#2470).
|
|
219
|
+
*
|
|
220
|
+
* Unchanged from the file-based era — pure stateless OS check.
|
|
80
221
|
*/
|
|
81
222
|
export function isLockProcessAlive(lock: LockData): boolean {
|
|
82
223
|
const pid = lock.pid;
|
|
83
224
|
if (!Number.isInteger(pid) || pid <= 0) return false;
|
|
84
|
-
// Our own PID means WE hold this lock — we are alive. (#2470)
|
|
85
|
-
// Callers that need to distinguish "our lock" from "someone else's lock"
|
|
86
|
-
// (e.g. startAuto checking for a prior crashed session with a recycled PID)
|
|
87
|
-
// already guard with `crashLock.pid !== process.pid` before calling us.
|
|
88
225
|
if (pid === process.pid) return true;
|
|
89
226
|
try {
|
|
90
227
|
process.kill(pid, 0);
|
|
91
228
|
return true;
|
|
92
229
|
} catch (err) {
|
|
93
|
-
// EPERM means the process exists but we lack permission — treat as alive.
|
|
94
|
-
// ESRCH means the process does not exist — treat as dead (stale lock).
|
|
95
230
|
if ((err as NodeJS.ErrnoException).code === "EPERM") return true;
|
|
96
231
|
return false;
|
|
97
232
|
}
|
|
@@ -106,7 +241,6 @@ export function formatCrashInfo(lock: LockData): string {
|
|
|
106
241
|
` PID: ${lock.pid}`,
|
|
107
242
|
];
|
|
108
243
|
|
|
109
|
-
// Add recovery guidance based on what was happening when it crashed
|
|
110
244
|
if (lock.unitType === "starting" && lock.unitId === "bootstrap") {
|
|
111
245
|
lines.push(`No work was lost. Run /gsd auto to restart.`);
|
|
112
246
|
} else if (lock.unitType.includes("research") || lock.unitType.includes("plan")) {
|
|
@@ -122,22 +256,14 @@ export function formatCrashInfo(lock: LockData): string {
|
|
|
122
256
|
|
|
123
257
|
/**
|
|
124
258
|
* Emit a synthetic unit-end event for a unit that crashed without emitting its own.
|
|
125
|
-
*
|
|
126
|
-
* Queries the journal to find the most recent unit-start for the crashed unit.
|
|
127
|
-
* If a matching unit-end already exists (e.g. the hard timeout fired), this is a
|
|
128
|
-
* no-op. Called during crash recovery, before clearing the stale lock.
|
|
129
|
-
*
|
|
130
|
-
* Addresses the gap reported in #3348 where `unit-start` was emitted but no
|
|
131
|
-
* `unit-end` followed — side effects landed but the worker died before closeout.
|
|
259
|
+
* Unchanged from the file era — operates on the journal, not the lock.
|
|
132
260
|
*/
|
|
133
261
|
export function emitCrashRecoveredUnitEnd(basePath: string, lock: LockData): void {
|
|
134
|
-
// Skip bootstrap / starting pseudo-units — they have no meaningful unit-start event.
|
|
135
262
|
if (!lock.unitType || !lock.unitId || lock.unitType === "starting") return;
|
|
136
263
|
|
|
137
264
|
try {
|
|
138
265
|
const all = queryJournal(basePath);
|
|
139
266
|
|
|
140
|
-
// Find the most recent unit-start for this unitId
|
|
141
267
|
const starts = all.filter(
|
|
142
268
|
(e) => e.eventType === "unit-start" && e.data?.unitId === lock.unitId,
|
|
143
269
|
);
|
|
@@ -145,7 +271,6 @@ export function emitCrashRecoveredUnitEnd(basePath: string, lock: LockData): voi
|
|
|
145
271
|
|
|
146
272
|
const lastStart = starts[starts.length - 1];
|
|
147
273
|
|
|
148
|
-
// Check if a unit-end was already emitted (e.g. hard timeout fired after the crash)
|
|
149
274
|
const alreadyClosed = all.some(
|
|
150
275
|
(e) =>
|
|
151
276
|
e.eventType === "unit-end" &&
|
|
@@ -155,7 +280,6 @@ export function emitCrashRecoveredUnitEnd(basePath: string, lock: LockData): voi
|
|
|
155
280
|
);
|
|
156
281
|
if (alreadyClosed) return;
|
|
157
282
|
|
|
158
|
-
// Find the highest seq in this flow for monotonic ordering
|
|
159
283
|
const maxSeq = all
|
|
160
284
|
.filter((e) => e.flowId === lastStart.flowId)
|
|
161
285
|
.reduce((max, e) => Math.max(max, e.seq), lastStart.seq);
|
|
@@ -174,6 +298,16 @@ export function emitCrashRecoveredUnitEnd(basePath: string, lock: LockData): voi
|
|
|
174
298
|
causedBy: { flowId: lastStart.flowId, seq: lastStart.seq },
|
|
175
299
|
});
|
|
176
300
|
} catch {
|
|
177
|
-
// Never throw from crash recovery path
|
|
301
|
+
// Never throw from crash recovery path.
|
|
178
302
|
}
|
|
179
303
|
}
|
|
304
|
+
|
|
305
|
+
/**
 * Report the stale auto-mode worker, if any, for one project.
 *
 * Thin alias of readCrashLock(basePath), exported for diagnostics callers.
 *
 * @param basePath Project base path.
 * @returns Whatever readCrashLock returns for that path.
 */
export function findStaleAutoWorker(basePath: string): LockData | null {
  const staleLock = readCrashLock(basePath);
  return staleLock;
}
|
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
// gsd-2 + Auto-mode worker process registry (DB-backed coordination, Phase B)
|
|
2
|
+
//
|
|
3
|
+
// IMPORTANT — naming clarification (codex review LOW N1):
|
|
4
|
+
// This module is the AUTO-MODE PROCESS REGISTRY. It tracks long-running
|
|
5
|
+
// `gsd auto` worker processes for cross-process coordination via the shared
|
|
6
|
+
// SQLite WAL. It is NOT the in-process subagent registry, which lives at
|
|
7
|
+
// `src/resources/extensions/subagent/worker-registry.ts` and tracks dispatched
|
|
8
|
+
// subagent threads within a single process.
|
|
9
|
+
//
|
|
10
|
+
// Both modules use the word "worker" but they are unrelated:
|
|
11
|
+
// - subagent/worker-registry.ts → ephemeral in-process subagent threads
|
|
12
|
+
// - db/auto-workers.ts → durable cross-process auto-mode sessions
|
|
13
|
+
//
|
|
14
|
+
// Single-host invariant: SQLite WAL coordination only works on local disk.
|
|
15
|
+
// NFS / network filesystems break heartbeat semantics. Multi-host execution
|
|
16
|
+
// needs a real coordinator (etcd, Postgres) — out of scope for Phase B.
|
|
17
|
+
|
|
18
|
+
import { randomUUID } from "node:crypto";
|
|
19
|
+
import { hostname } from "node:os";
|
|
20
|
+
|
|
21
|
+
import {
|
|
22
|
+
_getAdapter,
|
|
23
|
+
isDbAvailable,
|
|
24
|
+
transaction,
|
|
25
|
+
insertAuditEvent,
|
|
26
|
+
} from "../gsd-db.js";
|
|
27
|
+
import { normalizeRealPath } from "../paths.js";
|
|
28
|
+
|
|
29
|
+
/**
 * Heartbeat time-to-live in seconds. An 'active' worker whose
 * last_heartbeat_at is older than this is left out of the active set and
 * becomes a candidate for stale-worker detection.
 */
const HEARTBEAT_TTL_SECONDS = 60;

/**
 * Diagnostic version label, written to workers.version and to the
 * worker-registered audit payload. Maintained by hand on protocol changes
 * rather than read from package.json, to avoid filesystem I/O at module load.
 */
const WORKER_REGISTRY_VERSION = "1";

/** Lifecycle states stored in workers.status. */
export type WorkerStatus = "active" | "stopping" | "crashed";

/** One row of the workers table. */
export interface AutoWorkerRow {
  /** Generated at registration: `auto-<host>-<pid>-<8 uuid chars>`. */
  worker_id: string;
  /** os.hostname() of the registering process. */
  host: string;
  /** process.pid of the registering process. */
  pid: number;
  /** ISO-8601 timestamp taken at registration. */
  started_at: string;
  /** Registry version label recorded at registration. */
  version: string;
  /** ISO-8601 timestamp of the latest heartbeat. */
  last_heartbeat_at: string;
  /** Current lifecycle state. */
  status: WorkerStatus;
  /** Project root path supplied at registration. */
  project_root_realpath: string;
}
|
|
47
|
+
|
|
48
|
+
/**
 * Insert a workers row for this process and record a "worker-registered"
 * audit event.
 *
 * The row starts with status 'active' and last_heartbeat_at equal to
 * started_at; heartbeatAutoWorker() must be called periodically afterwards
 * to keep the heartbeat inside the TTL.
 *
 * @param opts.projectRootRealpath Project root stored on the row.
 * @returns The generated worker_id.
 * @throws Error when the DB is unavailable.
 */
export function registerAutoWorker(opts: {
  projectRootRealpath: string;
}): string {
  if (!isDbAvailable()) {
    throw new Error("registerAutoWorker: DB unavailable");
  }

  const host = hostname();
  const pid = process.pid;
  const { projectRootRealpath } = opts;
  const workerId = `auto-${host}-${pid}-${randomUUID().slice(0, 8)}`;
  const registeredAt = new Date().toISOString();

  transaction(() => {
    const insert = _getAdapter()!.prepare(
      `INSERT INTO workers (
         worker_id, host, pid, started_at, version,
         last_heartbeat_at, status, project_root_realpath
       ) VALUES (
         :worker_id, :host, :pid, :started_at, :version,
         :last_heartbeat_at, 'active', :project_root_realpath
       )`,
    );
    insert.run({
      ":worker_id": workerId,
      ":host": host,
      ":pid": pid,
      ":started_at": registeredAt,
      ":version": WORKER_REGISTRY_VERSION,
      ":last_heartbeat_at": registeredAt,
      ":project_root_realpath": projectRootRealpath,
    });
  });

  // The audit event is written after the insert transaction has returned.
  insertAuditEvent({
    eventId: randomUUID(),
    traceId: workerId,
    category: "orchestration",
    type: "worker-registered",
    ts: registeredAt,
    payload: {
      workerId,
      host,
      pid,
      version: WORKER_REGISTRY_VERSION,
      projectRootRealpath,
    },
  });

  return workerId;
}
|
|
103
|
+
|
|
104
|
+
/**
 * Stamp the worker's last_heartbeat_at with the current time.
 *
 * Only rows still in status 'active' are updated. If the row is missing or
 * in another status, the UPDATE matches nothing and the call is a silent
 * no-op, as it is when the DB is unavailable.
 *
 * @param workerId worker_id returned by registerAutoWorker.
 */
export function heartbeatAutoWorker(workerId: string): void {
  if (!isDbAvailable()) return;

  const params = {
    ":now": new Date().toISOString(),
    ":worker_id": workerId,
  };
  _getAdapter()!
    .prepare(
      `UPDATE workers SET last_heartbeat_at = :now WHERE worker_id = :worker_id AND status = 'active'`,
    )
    .run(params);
}
|
|
117
|
+
|
|
118
|
+
/**
 * Flip an 'active' worker to 'crashed' and, if a row actually changed,
 * record a "worker-crashed" audit event.
 *
 * No-op when the DB is unavailable or the worker is not currently 'active'.
 *
 * @param workerId worker_id of the worker to mark.
 */
export function markWorkerCrashed(workerId: string): void {
  if (!isDbAvailable()) return;

  const db = _getAdapter()!;
  let rowsUpdated = 0;

  transaction(() => {
    const outcome = db
      .prepare(
        `UPDATE workers SET status = 'crashed' WHERE worker_id = :worker_id AND status = 'active'`,
      )
      .run({ ":worker_id": workerId }) as { changes?: unknown };
    // A result without a numeric change count is treated as "nothing changed".
    rowsUpdated = typeof outcome.changes === "number" ? outcome.changes : 0;
  });

  if (rowsUpdated < 1) return;

  insertAuditEvent({
    eventId: randomUUID(),
    traceId: workerId,
    category: "orchestration",
    type: "worker-crashed",
    ts: new Date().toISOString(),
    payload: { workerId },
  });
}
|
|
145
|
+
|
|
146
|
+
/**
 * Set a worker's status to 'stopping', whatever its current status.
 *
 * No-op when the DB is unavailable. No audit event is written.
 *
 * @param workerId worker_id of the worker that is shutting down.
 */
export function markWorkerStopping(workerId: string): void {
  if (!isDbAvailable()) return;

  const db = _getAdapter()!;
  const flipToStopping = (): void => {
    db.prepare(
      `UPDATE workers SET status = 'stopping' WHERE worker_id = :worker_id`,
    ).run({ ":worker_id": workerId });
  };
  transaction(flipToStopping);
}
|
|
159
|
+
|
|
160
|
+
/**
 * List workers that are 'active' and whose heartbeat is no older than the
 * TTL, ordered by started_at.
 *
 * Workers past the TTL are only filtered out of the result; this function
 * does not change their status.
 *
 * @returns Matching rows, or an empty array when the DB is unavailable.
 */
export function getActiveAutoWorkers(): readonly AutoWorkerRow[] {
  if (!isDbAvailable()) return [];

  const db = _getAdapter()!;
  const ttlMs = HEARTBEAT_TTL_SECONDS * 1000;
  const oldestFreshHeartbeat = new Date(Date.now() - ttlMs).toISOString();

  return db.prepare(
    `SELECT worker_id, host, pid, started_at, version,
            last_heartbeat_at, status, project_root_realpath
     FROM workers
     WHERE status = 'active' AND last_heartbeat_at >= :cutoff
     ORDER BY started_at`,
  ).all({ ":cutoff": oldestFreshHeartbeat }) as unknown as AutoWorkerRow[];
}
|
|
180
|
+
|
|
181
|
+
/**
 * List every row of the workers table, ordered by started_at, with no filter
 * on status or heartbeat age.
 *
 * @returns All rows, or an empty array when the DB is unavailable.
 */
export function getAllAutoWorkers(): readonly AutoWorkerRow[] {
  if (!isDbAvailable()) return [];

  const selectAll = _getAdapter()!.prepare(
    `SELECT worker_id, host, pid, started_at, version,
            last_heartbeat_at, status, project_root_realpath
     FROM workers
     ORDER BY started_at`,
  );
  return selectAll.all() as unknown as AutoWorkerRow[];
}
|
|
193
|
+
|
|
194
|
+
/**
 * Fetch one workers row by its id.
 *
 * @param workerId worker_id to look up.
 * @returns The row, or null when the DB is unavailable or no row has that id.
 */
export function getAutoWorker(workerId: string): AutoWorkerRow | null {
  if (!isDbAvailable()) return null;

  const selectOne = _getAdapter()!.prepare(
    `SELECT worker_id, host, pid, started_at, version,
            last_heartbeat_at, status, project_root_realpath
     FROM workers WHERE worker_id = :worker_id`,
  );
  const found = selectOne.get({ ":worker_id": workerId }) as AutoWorkerRow | undefined;
  return found ?? null;
}
|
|
207
|
+
|
|
208
|
+
/**
 * Expose the heartbeat TTL so tests and janitors can compute expirations.
 *
 * @returns The TTL in seconds.
 */
export function autoWorkerHeartbeatTtlSeconds(): number {
  const ttlSeconds = HEARTBEAT_TTL_SECONDS;
  return ttlSeconds;
}
|
|
212
|
+
|
|
213
|
+
/**
 * Decide whether the process behind a worker row is still running on this
 * machine.
 *
 * @param candidate host and pid taken from a workers row.
 * @returns false for a non-integer or non-positive pid, or for a row recorded
 *   under a different hostname; true when the pid is this process; otherwise
 *   the outcome of a signal-0 probe, where EPERM counts as alive and any
 *   other error counts as dead.
 */
function isWorkerProcessAlive(candidate: Pick<AutoWorkerRow, "host" | "pid">): boolean {
  const { host, pid } = candidate;

  const pidIsUsable = Number.isInteger(pid) && pid > 0;
  if (!pidIsUsable || host !== hostname()) return false;
  if (pid === process.pid) return true;

  try {
    // Signal 0 delivers nothing; it only checks that the pid can be signalled.
    process.kill(pid, 0);
  } catch (err) {
    return (err as NodeJS.ErrnoException).code === "EPERM";
  }
  return true;
}
|
|
226
|
+
|
|
227
|
+
/**
 * Find a worker for a project root that looks like a crashed previous
 * session: status 'active', heartbeat older than the TTL, and a process that
 * the liveness probe reports as not alive.
 *
 * The lookup has two stages:
 *   1. the newest lapsed row whose project_root_realpath equals the argument
 *      verbatim — returned if its process is dead;
 *   2. otherwise every lapsed row, newest first, matched after passing both
 *      paths through normalizeRealPath. This catches rows stored under a
 *      different spelling of the same root (e.g. /var/... vs
 *      /private/var/... on macOS).
 *
 * @param projectRootRealpath Project root to search for.
 * @returns The stale worker row, or null when the DB is unavailable or no
 *   such worker exists.
 */
export function findStaleWorkerForProject(
  projectRootRealpath: string,
): AutoWorkerRow | null {
  if (!isDbAvailable()) return null;

  const db = _getAdapter()!;
  const cutoff = new Date(Date.now() - HEARTBEAT_TTL_SECONDS * 1000).toISOString();

  const exactMatch = db.prepare(
    `SELECT worker_id, host, pid, started_at, version,
            last_heartbeat_at, status, project_root_realpath
     FROM workers
     WHERE project_root_realpath = :project_root
       AND status = 'active'
       AND last_heartbeat_at < :cutoff
     ORDER BY started_at DESC
     LIMIT 1`,
  ).get({ ":project_root": projectRootRealpath, ":cutoff": cutoff }) as AutoWorkerRow | undefined;
  if (exactMatch && !isWorkerProcessAlive(exactMatch)) return exactMatch;

  // Older rows and external fixtures may have captured a non-realpath spelling
  // of the same project root, e.g. /var/... vs /private/var/... on macOS.
  const canonicalRoot = normalizeRealPath(projectRootRealpath);
  const lapsedWorkers = db.prepare(
    `SELECT worker_id, host, pid, started_at, version,
            last_heartbeat_at, status, project_root_realpath
     FROM workers
     WHERE status = 'active'
       AND last_heartbeat_at < :cutoff
     ORDER BY started_at DESC`,
  ).all({ ":cutoff": cutoff }) as unknown as AutoWorkerRow[];

  for (const candidate of lapsedWorkers) {
    if (normalizeRealPath(candidate.project_root_realpath) !== canonicalRoot) continue;
    if (!isWorkerProcessAlive(candidate)) return candidate;
  }
  return null;
}
|