gsd-pi 2.78.1-dev.d8826a445 → 2.78.1-dev.eccf86e27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. package/README.md +5 -7
  2. package/dist/help-text.js +1 -1
  3. package/dist/resource-loader.js +6 -1
  4. package/dist/resources/.managed-resources-content-hash +1 -1
  5. package/dist/resources/extensions/gsd/auto/detect-stuck.js +41 -5
  6. package/dist/resources/extensions/gsd/auto/loop.js +235 -36
  7. package/dist/resources/extensions/gsd/auto/phases.js +7 -5
  8. package/dist/resources/extensions/gsd/auto/session.js +33 -0
  9. package/dist/resources/extensions/gsd/auto-dispatch.js +46 -2
  10. package/dist/resources/extensions/gsd/auto-post-unit.js +19 -11
  11. package/dist/resources/extensions/gsd/auto-worktree.js +26 -187
  12. package/dist/resources/extensions/gsd/auto.js +79 -50
  13. package/dist/resources/extensions/gsd/bootstrap/register-hooks.js +9 -4
  14. package/dist/resources/extensions/gsd/crash-recovery.js +160 -47
  15. package/dist/resources/extensions/gsd/db/auto-workers.js +227 -0
  16. package/dist/resources/extensions/gsd/db/command-queue.js +105 -0
  17. package/dist/resources/extensions/gsd/db/milestone-leases.js +210 -0
  18. package/dist/resources/extensions/gsd/db/runtime-kv.js +91 -0
  19. package/dist/resources/extensions/gsd/db/unit-dispatches.js +322 -0
  20. package/dist/resources/extensions/gsd/docs/COORDINATION.md +42 -0
  21. package/dist/resources/extensions/gsd/doctor-proactive.js +4 -0
  22. package/dist/resources/extensions/gsd/doctor-runtime-checks.js +22 -6
  23. package/dist/resources/extensions/gsd/doctor.js +12 -2
  24. package/dist/resources/extensions/gsd/gsd-db.js +161 -3
  25. package/dist/resources/extensions/gsd/guided-flow.js +6 -2
  26. package/dist/resources/extensions/gsd/interrupted-session.js +18 -15
  27. package/dist/resources/extensions/gsd/state.js +21 -6
  28. package/dist/resources/extensions/gsd/worktree-resolver.js +64 -0
  29. package/dist/tsconfig.extensions.tsbuildinfo +1 -1
  30. package/dist/web/standalone/.next/BUILD_ID +1 -1
  31. package/dist/web/standalone/.next/app-path-routes-manifest.json +12 -12
  32. package/dist/web/standalone/.next/build-manifest.json +2 -2
  33. package/dist/web/standalone/.next/prerender-manifest.json +3 -3
  34. package/dist/web/standalone/.next/server/app/_global-error.html +1 -1
  35. package/dist/web/standalone/.next/server/app/_global-error.rsc +1 -1
  36. package/dist/web/standalone/.next/server/app/_global-error.segments/_full.segment.rsc +1 -1
  37. package/dist/web/standalone/.next/server/app/_global-error.segments/_global-error/__PAGE__.segment.rsc +1 -1
  38. package/dist/web/standalone/.next/server/app/_global-error.segments/_global-error.segment.rsc +1 -1
  39. package/dist/web/standalone/.next/server/app/_global-error.segments/_head.segment.rsc +1 -1
  40. package/dist/web/standalone/.next/server/app/_global-error.segments/_index.segment.rsc +1 -1
  41. package/dist/web/standalone/.next/server/app/_global-error.segments/_tree.segment.rsc +1 -1
  42. package/dist/web/standalone/.next/server/app/_not-found.html +1 -1
  43. package/dist/web/standalone/.next/server/app/_not-found.rsc +1 -1
  44. package/dist/web/standalone/.next/server/app/_not-found.segments/_full.segment.rsc +1 -1
  45. package/dist/web/standalone/.next/server/app/_not-found.segments/_head.segment.rsc +1 -1
  46. package/dist/web/standalone/.next/server/app/_not-found.segments/_index.segment.rsc +1 -1
  47. package/dist/web/standalone/.next/server/app/_not-found.segments/_not-found/__PAGE__.segment.rsc +1 -1
  48. package/dist/web/standalone/.next/server/app/_not-found.segments/_not-found.segment.rsc +1 -1
  49. package/dist/web/standalone/.next/server/app/_not-found.segments/_tree.segment.rsc +1 -1
  50. package/dist/web/standalone/.next/server/app/index.html +1 -1
  51. package/dist/web/standalone/.next/server/app/index.rsc +1 -1
  52. package/dist/web/standalone/.next/server/app/index.segments/__PAGE__.segment.rsc +1 -1
  53. package/dist/web/standalone/.next/server/app/index.segments/_full.segment.rsc +1 -1
  54. package/dist/web/standalone/.next/server/app/index.segments/_head.segment.rsc +1 -1
  55. package/dist/web/standalone/.next/server/app/index.segments/_index.segment.rsc +1 -1
  56. package/dist/web/standalone/.next/server/app/index.segments/_tree.segment.rsc +1 -1
  57. package/dist/web/standalone/.next/server/app-paths-manifest.json +12 -12
  58. package/dist/web/standalone/.next/server/middleware-build-manifest.js +1 -1
  59. package/dist/web/standalone/.next/server/pages/404.html +1 -1
  60. package/dist/web/standalone/.next/server/pages/500.html +1 -1
  61. package/dist/web/standalone/.next/server/server-reference-manifest.json +1 -1
  62. package/package.json +1 -1
  63. package/src/resources/extensions/gsd/auto/detect-stuck.ts +37 -5
  64. package/src/resources/extensions/gsd/auto/loop.ts +263 -41
  65. package/src/resources/extensions/gsd/auto/phases.ts +7 -5
  66. package/src/resources/extensions/gsd/auto/session.ts +36 -0
  67. package/src/resources/extensions/gsd/auto-dispatch.ts +53 -2
  68. package/src/resources/extensions/gsd/auto-post-unit.ts +19 -11
  69. package/src/resources/extensions/gsd/auto-worktree.ts +26 -211
  70. package/src/resources/extensions/gsd/auto.ts +89 -44
  71. package/src/resources/extensions/gsd/bootstrap/register-hooks.ts +9 -4
  72. package/src/resources/extensions/gsd/crash-recovery.ts +177 -43
  73. package/src/resources/extensions/gsd/db/auto-workers.ts +273 -0
  74. package/src/resources/extensions/gsd/db/command-queue.ts +149 -0
  75. package/src/resources/extensions/gsd/db/milestone-leases.ts +274 -0
  76. package/src/resources/extensions/gsd/db/runtime-kv.ts +127 -0
  77. package/src/resources/extensions/gsd/db/unit-dispatches.ts +446 -0
  78. package/src/resources/extensions/gsd/docs/COORDINATION.md +42 -0
  79. package/src/resources/extensions/gsd/doctor-proactive.ts +4 -0
  80. package/src/resources/extensions/gsd/doctor-runtime-checks.ts +24 -6
  81. package/src/resources/extensions/gsd/doctor.ts +10 -2
  82. package/src/resources/extensions/gsd/gsd-db.ts +170 -3
  83. package/src/resources/extensions/gsd/guided-flow.ts +6 -2
  84. package/src/resources/extensions/gsd/interrupted-session.ts +19 -12
  85. package/src/resources/extensions/gsd/state.ts +44 -6
  86. package/src/resources/extensions/gsd/tests/auto-loop-no-copy-artifacts.test.ts +72 -0
  87. package/src/resources/extensions/gsd/tests/auto-loop-symlink-worktree.test.ts +190 -0
  88. package/src/resources/extensions/gsd/tests/auto-workers.test.ts +105 -0
  89. package/src/resources/extensions/gsd/tests/command-queue.test.ts +141 -0
  90. package/src/resources/extensions/gsd/tests/crash-recovery-via-db.test.ts +203 -0
  91. package/src/resources/extensions/gsd/tests/crash-recovery.test.ts +169 -59
  92. package/src/resources/extensions/gsd/tests/detect-stuck-respects-retry.test.ts +173 -0
  93. package/src/resources/extensions/gsd/tests/integration/auto-worktree.test.ts +22 -12
  94. package/src/resources/extensions/gsd/tests/integration/doctor-proactive.test.ts +24 -10
  95. package/src/resources/extensions/gsd/tests/integration/doctor-runtime.test.ts +35 -23
  96. package/src/resources/extensions/gsd/tests/integration/workspace-collapse-integration.test.ts +3 -5
  97. package/src/resources/extensions/gsd/tests/interrupted-session-auto.test.ts +72 -25
  98. package/src/resources/extensions/gsd/tests/interrupted-session-ui.test.ts +72 -25
  99. package/src/resources/extensions/gsd/tests/memory-pressure-stuck-state.test.ts +9 -6
  100. package/src/resources/extensions/gsd/tests/milestone-leases.test.ts +152 -0
  101. package/src/resources/extensions/gsd/tests/parallel-milestone-isolation.test.ts +106 -0
  102. package/src/resources/extensions/gsd/tests/paused-session-via-db.test.ts +119 -0
  103. package/src/resources/extensions/gsd/tests/pipeline-variant-dispatch.test.ts +58 -0
  104. package/src/resources/extensions/gsd/tests/preferences-worktree-sync.test.ts +3 -17
  105. package/src/resources/extensions/gsd/tests/register-hooks-depth-verification.test.ts +110 -0
  106. package/src/resources/extensions/gsd/tests/runtime-kv.test.ts +120 -0
  107. package/src/resources/extensions/gsd/tests/skipped-validation-completion.test.ts +133 -28
  108. package/src/resources/extensions/gsd/tests/skipped-validation-db-atomicity.test.ts +17 -0
  109. package/src/resources/extensions/gsd/tests/stuck-state-via-db.test.ts +134 -0
  110. package/src/resources/extensions/gsd/tests/sync-layer-scope.test.ts +7 -26
  111. package/src/resources/extensions/gsd/tests/teardown-cleanup-parity.test.ts +4 -8
  112. package/src/resources/extensions/gsd/tests/unit-dispatches.test.ts +247 -0
  113. package/src/resources/extensions/gsd/tests/validate-milestone.test.ts +41 -1
  114. package/src/resources/extensions/gsd/tests/workspace.test.ts +15 -9
  115. package/src/resources/extensions/gsd/tests/write-gate.test.ts +31 -23
  116. package/src/resources/extensions/gsd/worktree-resolver.ts +62 -0
  117. package/src/resources/extensions/gsd/tests/auto-lock-creation.test.ts +0 -213
  118. package/src/resources/extensions/gsd/tests/auto-stale-lock-self-kill.test.ts +0 -87
  119. package/src/resources/extensions/gsd/tests/stop-auto-remote.test.ts +0 -159
  120. /package/dist/web/standalone/.next/static/{AT5qi39nKXkdmQIOIoh0f → Y5UeGFkXTYM9WIQOWHkot}/_buildManifest.js +0 -0
  121. /package/dist/web/standalone/.next/static/{AT5qi39nKXkdmQIOIoh0f → Y5UeGFkXTYM9WIQOWHkot}/_ssgManifest.js +0 -0
@@ -1,21 +1,43 @@
1
1
  /**
2
- * GSD Crash Recovery
2
+ * GSD Crash Recovery (Phase C pt 2 — DB-backed)
3
3
  *
4
- * Detects interrupted auto-mode sessions via a lock file.
5
- * Written on auto-start, updated on each unit dispatch, deleted on clean stop.
6
- * If the lock file exists on next startup, the previous session crashed.
4
+ * Detects interrupted auto-mode sessions via the DB-backed workers +
5
+ * unit_dispatches + runtime_kv tables. The auto.lock file is gone; the
6
+ * `LockData` shape is preserved for backward compatibility with callers
7
+ * (auto.ts, doctor checks, interrupted-session.ts), but the contents are
8
+ * now synthesized from:
7
9
  *
8
- * The lock records the pi session file path so crash recovery can read the
9
- * surviving JSONL (pi appends entries incrementally via appendFileSync,
10
- * so the file on disk reflects every tool call up to the crash point).
10
+ * - workers.pid / .started_at / .last_heartbeat_at → liveness + age
11
+ * - unit_dispatches.unit_type / .unit_id / .started_at → what was running
12
+ * - runtime_kv("worker", workerId, "session_file") → pi session JSONL path
13
+ *
14
+ * "Crashed" is detected via workers.status='active' + heartbeat past TTL,
15
+ * cross-checked with the OS PID via isLockProcessAlive(). When the DB is
16
+ * unavailable (fresh project before init), all readers return null and
17
+ * writers no-op — preserving the historical "no lock means no prior
18
+ * crash" semantics.
19
+ *
20
+ * The journal-based emitCrashRecoveredUnitEnd is unchanged from the file
21
+ * era — it queries the journal independently of the lock mechanism.
11
22
  */
12
23
 
24
+ import {
25
+ emitJournalEvent,
26
+ queryJournal,
27
+ } from "./journal.js";
13
28
  import { readFileSync, unlinkSync, existsSync } from "node:fs";
14
29
  import { join } from "node:path";
15
- import { gsdRoot } from "./paths.js";
30
+ import {
31
+ findStaleWorkerForProject,
32
+ getAllAutoWorkers,
33
+ type AutoWorkerRow,
34
+ } from "./db/auto-workers.js";
35
+ import { getLatestForUnit, type DispatchStatus } from "./db/unit-dispatches.js";
36
+ import { getRuntimeKv, setRuntimeKv, deleteRuntimeKv } from "./db/runtime-kv.js";
37
+ import { _getAdapter, isDbAvailable } from "./gsd-db.js";
38
+ import { gsdRoot, normalizeRealPath } from "./paths.js";
16
39
  import { atomicWriteSync } from "./atomic-write.js";
17
40
  import { effectiveLockFile } from "./session-lock.js";
18
- import { emitJournalEvent, queryJournal } from "./journal.js";
19
41
 
20
42
  export interface LockData {
21
43
  pid: number;
@@ -27,11 +49,91 @@ export interface LockData {
27
49
  sessionFile?: string;
28
50
  }
29
51
 
52
+ const SESSION_FILE_KV_KEY = "session_file";
53
+
30
54
  function lockPath(basePath: string): string {
31
55
  return join(gsdRoot(basePath), effectiveLockFile());
32
56
  }
33
57
 
34
- /** Write or update the lock file with current auto-mode state. */
58
+ function readLegacyLock(basePath: string): LockData | null {
59
+ try {
60
+ const p = lockPath(basePath);
61
+ if (!existsSync(p)) return null;
62
+ return JSON.parse(readFileSync(p, "utf-8")) as LockData;
63
+ } catch {
64
+ return null;
65
+ }
66
+ }
67
+
68
+ function findActiveWorkerForCurrentProcess(
69
+ projectRootRealpath: string,
70
+ ): AutoWorkerRow | null {
71
+ if (!isDbAvailable()) return null;
72
+ const workers = getAllAutoWorkers();
73
+ for (const worker of workers) {
74
+ if (
75
+ worker.pid === process.pid
76
+ && worker.project_root_realpath === projectRootRealpath
77
+ ) {
78
+ return worker;
79
+ }
80
+ }
81
+ return null;
82
+ }
83
+
84
+ /**
85
+ * Look up the most recent dispatch row for a worker, regardless of status.
86
+ * Returns null if the worker has no dispatch history yet (e.g. crashed
87
+ * during bootstrap before claiming the first unit).
88
+ */
89
+ function getLatestDispatchForWorker(workerId: string):
90
+ | { unit_type: string; unit_id: string; started_at: string; status: DispatchStatus }
91
+ | null {
92
+ if (!isDbAvailable()) return null;
93
+ const db = _getAdapter()!;
94
+ const row = db.prepare(
95
+ `SELECT unit_type, unit_id, started_at, status
96
+ FROM unit_dispatches
97
+ WHERE worker_id = :worker_id
98
+ ORDER BY id DESC
99
+ LIMIT 1`,
100
+ ).get({ ":worker_id": workerId }) as
101
+ | { unit_type: string; unit_id: string; started_at: string; status: DispatchStatus }
102
+ | undefined;
103
+ return row ?? null;
104
+ }
105
+
106
+ function workerToLockData(worker: AutoWorkerRow): LockData {
107
+ const dispatch = getLatestDispatchForWorker(worker.worker_id);
108
+ const sessionFile =
109
+ getRuntimeKv<string>("worker", worker.worker_id, SESSION_FILE_KV_KEY) ?? undefined;
110
+ return {
111
+ pid: worker.pid,
112
+ startedAt: worker.started_at,
113
+ // Pre-Phase-C-pt-2 default: when no dispatch row exists yet (bootstrap
114
+ // crash), report unitType="starting", unitId="bootstrap" — same shape
115
+ // the file-based writer used to produce.
116
+ unitType: dispatch?.unit_type ?? "starting",
117
+ unitId: dispatch?.unit_id ?? "bootstrap",
118
+ unitStartedAt: dispatch?.started_at ?? worker.started_at,
119
+ sessionFile,
120
+ };
121
+ }
122
+
123
+ /**
124
+ * Write or update the lock state for the current auto-mode session.
125
+ *
126
+ * Phase C pt 2: the only persistent state this function adds beyond what
127
+ * the workers + unit_dispatches tables already track is the pi session
128
+ * JSONL path, which lands in runtime_kv (worker scope, key
129
+ * "session_file"). The pid/startedAt/unitType/unitId/unitStartedAt are
130
+ * recorded by registerAutoWorker / heartbeatAutoWorker / recordDispatchClaim
131
+ * already.
132
+ *
133
+ * basePath is unused by the new implementation (kept as a parameter for
134
+ * back-compat with the 15+ call sites) — the worker is identified by
135
+ * pid + project_root_realpath in the workers table.
136
+ */
35
137
  export function writeLock(
36
138
  basePath: string,
37
139
  unitType: string,
@@ -47,51 +149,84 @@ export function writeLock(
47
149
  unitStartedAt: new Date().toISOString(),
48
150
  sessionFile,
49
151
  };
50
- const lp = lockPath(basePath);
51
- atomicWriteSync(lp, JSON.stringify(data, null, 2));
52
- } catch (e) { /* non-fatal: lock write failure */ void e; }
152
+ atomicWriteSync(lockPath(basePath), JSON.stringify(data, null, 2));
153
+ } catch {
154
+ // Best-effort never throw from the lock writer.
155
+ }
156
+
157
+ if (!isDbAvailable() || !sessionFile) return;
158
+ try {
159
+ const projectRoot = normalizeRealPath(basePath);
160
+ const worker = findActiveWorkerForCurrentProcess(projectRoot);
161
+ if (!worker) return;
162
+ setRuntimeKv("worker", worker.worker_id, SESSION_FILE_KV_KEY, sessionFile);
163
+ } catch {
164
+ // Best-effort — never throw from the lock writer.
165
+ }
53
166
  }
54
167
 
55
- /** Remove the lock file on clean stop. */
168
+ /**
169
+ * Phase C pt 2: clearLock no longer deletes a file. The cleanup path
170
+ * (markWorkerStopping in stopAuto) flips the workers row to 'stopping'.
171
+ * This function additionally drops the session_file runtime_kv row for
172
+ * the current worker so a follow-up crash detection doesn't pick up a
173
+ * stale session-file pointer.
174
+ */
56
175
  export function clearLock(basePath: string): void {
57
176
  try {
58
177
  const p = lockPath(basePath);
59
178
  if (existsSync(p)) unlinkSync(p);
60
- } catch (e) { /* non-fatal: lock clear failure */ void e; }
179
+ } catch {
180
+ // Best-effort.
181
+ }
182
+
183
+ if (!isDbAvailable()) return;
184
+ try {
185
+ const projectRoot = normalizeRealPath(basePath);
186
+ const worker = findActiveWorkerForCurrentProcess(projectRoot);
187
+ if (!worker) return;
188
+ deleteRuntimeKv("worker", worker.worker_id, SESSION_FILE_KV_KEY);
189
+ } catch {
190
+ // Best-effort.
191
+ }
61
192
  }
62
193
 
63
- /** Check if a crash lock exists and return its data. */
194
+ /**
195
+ * Detect a previous crashed auto-mode session.
196
+ *
197
+ * Phase C pt 2: synthesized from workers (status='active' + lapsed
198
+ * heartbeat) + unit_dispatches (most recent for that worker) +
199
+ * runtime_kv (session_file). Returns null when no stale worker exists
200
+ * or the DB is unavailable.
201
+ */
64
202
  export function readCrashLock(basePath: string): LockData | null {
65
- try {
66
- const p = lockPath(basePath);
67
- if (!existsSync(p)) return null;
68
- const raw = readFileSync(p, "utf-8");
69
- return JSON.parse(raw) as LockData;
70
- } catch (e) {
71
- /* non-fatal: corrupt or unreadable lock file */ void e;
72
- return null;
203
+ if (isDbAvailable()) {
204
+ try {
205
+ const projectRoot = normalizeRealPath(basePath);
206
+ const stale = findStaleWorkerForProject(projectRoot);
207
+ if (stale) return workerToLockData(stale);
208
+ } catch {
209
+ // Fall through to the legacy lock-file compatibility path.
210
+ }
73
211
  }
212
+ return readLegacyLock(basePath);
74
213
  }
75
214
 
76
215
  /**
77
216
  * Check whether the process that wrote the lock is still running.
78
217
  * Uses `process.kill(pid, 0)` which sends no signal but checks liveness.
79
218
  * Returns true if the PID matches our own — we are the lock holder (#2470).
219
+ *
220
+ * Unchanged from the file-based era — pure stateless OS check.
80
221
  */
81
222
  export function isLockProcessAlive(lock: LockData): boolean {
82
223
  const pid = lock.pid;
83
224
  if (!Number.isInteger(pid) || pid <= 0) return false;
84
- // Our own PID means WE hold this lock — we are alive. (#2470)
85
- // Callers that need to distinguish "our lock" from "someone else's lock"
86
- // (e.g. startAuto checking for a prior crashed session with a recycled PID)
87
- // already guard with `crashLock.pid !== process.pid` before calling us.
88
225
  if (pid === process.pid) return true;
89
226
  try {
90
227
  process.kill(pid, 0);
91
228
  return true;
92
229
  } catch (err) {
93
- // EPERM means the process exists but we lack permission — treat as alive.
94
- // ESRCH means the process does not exist — treat as dead (stale lock).
95
230
  if ((err as NodeJS.ErrnoException).code === "EPERM") return true;
96
231
  return false;
97
232
  }
@@ -106,7 +241,6 @@ export function formatCrashInfo(lock: LockData): string {
106
241
  ` PID: ${lock.pid}`,
107
242
  ];
108
243
 
109
- // Add recovery guidance based on what was happening when it crashed
110
244
  if (lock.unitType === "starting" && lock.unitId === "bootstrap") {
111
245
  lines.push(`No work was lost. Run /gsd auto to restart.`);
112
246
  } else if (lock.unitType.includes("research") || lock.unitType.includes("plan")) {
@@ -122,22 +256,14 @@ export function formatCrashInfo(lock: LockData): string {
122
256
 
123
257
  /**
124
258
  * Emit a synthetic unit-end event for a unit that crashed without emitting its own.
125
- *
126
- * Queries the journal to find the most recent unit-start for the crashed unit.
127
- * If a matching unit-end already exists (e.g. the hard timeout fired), this is a
128
- * no-op. Called during crash recovery, before clearing the stale lock.
129
- *
130
- * Addresses the gap reported in #3348 where `unit-start` was emitted but no
131
- * `unit-end` followed — side effects landed but the worker died before closeout.
259
+ * Unchanged from the file era — operates on the journal, not the lock.
132
260
  */
133
261
  export function emitCrashRecoveredUnitEnd(basePath: string, lock: LockData): void {
134
- // Skip bootstrap / starting pseudo-units — they have no meaningful unit-start event.
135
262
  if (!lock.unitType || !lock.unitId || lock.unitType === "starting") return;
136
263
 
137
264
  try {
138
265
  const all = queryJournal(basePath);
139
266
 
140
- // Find the most recent unit-start for this unitId
141
267
  const starts = all.filter(
142
268
  (e) => e.eventType === "unit-start" && e.data?.unitId === lock.unitId,
143
269
  );
@@ -145,7 +271,6 @@ export function emitCrashRecoveredUnitEnd(basePath: string, lock: LockData): voi
145
271
 
146
272
  const lastStart = starts[starts.length - 1];
147
273
 
148
- // Check if a unit-end was already emitted (e.g. hard timeout fired after the crash)
149
274
  const alreadyClosed = all.some(
150
275
  (e) =>
151
276
  e.eventType === "unit-end" &&
@@ -155,7 +280,6 @@ export function emitCrashRecoveredUnitEnd(basePath: string, lock: LockData): voi
155
280
  );
156
281
  if (alreadyClosed) return;
157
282
 
158
- // Find the highest seq in this flow for monotonic ordering
159
283
  const maxSeq = all
160
284
  .filter((e) => e.flowId === lastStart.flowId)
161
285
  .reduce((max, e) => Math.max(max, e.seq), lastStart.seq);
@@ -174,6 +298,16 @@ export function emitCrashRecoveredUnitEnd(basePath: string, lock: LockData): voi
174
298
  causedBy: { flowId: lastStart.flowId, seq: lastStart.seq },
175
299
  });
176
300
  } catch {
177
- // Never throw from crash recovery path — journal failure must not block recovery
301
+ // Never throw from crash recovery path.
178
302
  }
179
303
  }
304
+
305
+ /**
306
+ * Used by the doctor checks (doctor-runtime-checks.ts, doctor-proactive.ts)
307
+ * to enumerate stale workers across all projects this DB knows about.
308
+ * Phase C pt 2 export — surface for the same diagnostics that previously
309
+ * iterated `auto.lock` files.
310
+ */
311
+ export function findStaleAutoWorker(basePath: string): LockData | null {
312
+ return readCrashLock(basePath);
313
+ }
@@ -0,0 +1,273 @@
1
+ // gsd-2 + Auto-mode worker process registry (DB-backed coordination, Phase B)
2
+ //
3
+ // IMPORTANT — naming clarification (codex review LOW N1):
4
+ // This module is the AUTO-MODE PROCESS REGISTRY. It tracks long-running
5
+ // `gsd auto` worker processes for cross-process coordination via the shared
6
+ // SQLite WAL. It is NOT the in-process subagent registry, which lives at
7
+ // `src/resources/extensions/subagent/worker-registry.ts` and tracks dispatched
8
+ // subagent threads within a single process.
9
+ //
10
+ // Both modules use the word "worker" but they are unrelated:
11
+ // - subagent/worker-registry.ts → ephemeral in-process subagent threads
12
+ // - db/auto-workers.ts → durable cross-process auto-mode sessions
13
+ //
14
+ // Single-host invariant: SQLite WAL coordination only works on local disk.
15
+ // NFS / network filesystems break heartbeat semantics. Multi-host execution
16
+ // needs a real coordinator (etcd, Postgres) — out of scope for Phase B.
17
+
18
+ import { randomUUID } from "node:crypto";
19
+ import { hostname } from "node:os";
20
+
21
+ import {
22
+ _getAdapter,
23
+ isDbAvailable,
24
+ transaction,
25
+ insertAuditEvent,
26
+ } from "../gsd-db.js";
27
+ import { normalizeRealPath } from "../paths.js";
28
+
29
+ const HEARTBEAT_TTL_SECONDS = 60;
30
+ // Version label is for diagnostics only — embedded in audit_events and
31
+ // workers.version. Bumping this manually on protocol changes is fine; we
32
+ // don't pull it from package.json to avoid module-load filesystem I/O.
33
+ const WORKER_REGISTRY_VERSION = "1";
34
+
35
+ export type WorkerStatus = "active" | "stopping" | "crashed";
36
+
37
+ export interface AutoWorkerRow {
38
+ worker_id: string;
39
+ host: string;
40
+ pid: number;
41
+ started_at: string;
42
+ version: string;
43
+ last_heartbeat_at: string;
44
+ status: WorkerStatus;
45
+ project_root_realpath: string;
46
+ }
47
+
48
+ /**
49
+ * Register a new auto-mode worker process. Returns the generated worker_id
50
+ * for the session to store on its AutoSession.
51
+ *
52
+ * The worker is created with `status='active'` and an initial heartbeat
53
+ * stamp; callers must invoke heartbeatAutoWorker() periodically (e.g. once
54
+ * per loop iteration) to refresh the TTL.
55
+ */
56
+ export function registerAutoWorker(opts: {
57
+ projectRootRealpath: string;
58
+ }): string {
59
+ if (!isDbAvailable()) {
60
+ throw new Error("registerAutoWorker: DB unavailable");
61
+ }
62
+ const workerId = `auto-${hostname()}-${process.pid}-${randomUUID().slice(0, 8)}`;
63
+ const now = new Date().toISOString();
64
+
65
+ transaction(() => {
66
+ const db = _getAdapter()!;
67
+ db.prepare(
68
+ `INSERT INTO workers (
69
+ worker_id, host, pid, started_at, version,
70
+ last_heartbeat_at, status, project_root_realpath
71
+ ) VALUES (
72
+ :worker_id, :host, :pid, :started_at, :version,
73
+ :last_heartbeat_at, 'active', :project_root_realpath
74
+ )`,
75
+ ).run({
76
+ ":worker_id": workerId,
77
+ ":host": hostname(),
78
+ ":pid": process.pid,
79
+ ":started_at": now,
80
+ ":version": WORKER_REGISTRY_VERSION,
81
+ ":last_heartbeat_at": now,
82
+ ":project_root_realpath": opts.projectRootRealpath,
83
+ });
84
+ });
85
+
86
+ insertAuditEvent({
87
+ eventId: randomUUID(),
88
+ traceId: workerId,
89
+ category: "orchestration",
90
+ type: "worker-registered",
91
+ ts: now,
92
+ payload: {
93
+ workerId,
94
+ host: hostname(),
95
+ pid: process.pid,
96
+ version: WORKER_REGISTRY_VERSION,
97
+ projectRootRealpath: opts.projectRootRealpath,
98
+ },
99
+ });
100
+
101
+ return workerId;
102
+ }
103
+
104
+ /**
105
+ * Refresh the worker's heartbeat. Call once per auto-loop iteration.
106
+ * Idempotent — silently no-ops if the worker no longer exists (e.g. row was
107
+ * cleaned up by a janitor).
108
+ */
109
+ export function heartbeatAutoWorker(workerId: string): void {
110
+ if (!isDbAvailable()) return;
111
+ const now = new Date().toISOString();
112
+ const db = _getAdapter()!;
113
+ db.prepare(
114
+ `UPDATE workers SET last_heartbeat_at = :now WHERE worker_id = :worker_id AND status = 'active'`,
115
+ ).run({ ":now": now, ":worker_id": workerId });
116
+ }
117
+
118
+ /**
119
+ * Mark the worker as crashed. Used by janitors / doctor commands when a
120
+ * worker's heartbeat has expired beyond the TTL window.
121
+ */
122
+ export function markWorkerCrashed(workerId: string): void {
123
+ if (!isDbAvailable()) return;
124
+ const db = _getAdapter()!;
125
+ let changes = 0;
126
+ transaction(() => {
127
+ const result = db.prepare(
128
+ `UPDATE workers SET status = 'crashed' WHERE worker_id = :worker_id AND status = 'active'`,
129
+ ).run({ ":worker_id": workerId });
130
+ changes =
131
+ typeof (result as { changes?: unknown }).changes === "number"
132
+ ? (result as { changes: number }).changes
133
+ : 0;
134
+ });
135
+ if (changes < 1) return;
136
+ insertAuditEvent({
137
+ eventId: randomUUID(),
138
+ traceId: workerId,
139
+ category: "orchestration",
140
+ type: "worker-crashed",
141
+ ts: new Date().toISOString(),
142
+ payload: { workerId },
143
+ });
144
+ }
145
+
146
+ /**
147
+ * Mark the worker as stopping. Called from the stopAuto path when the user
148
+ * cleanly shuts down auto-mode.
149
+ */
150
+ export function markWorkerStopping(workerId: string): void {
151
+ if (!isDbAvailable()) return;
152
+ const db = _getAdapter()!;
153
+ transaction(() => {
154
+ db.prepare(
155
+ `UPDATE workers SET status = 'stopping' WHERE worker_id = :worker_id`,
156
+ ).run({ ":worker_id": workerId });
157
+ });
158
+ }
159
+
160
+ /**
161
+ * Return all workers whose status is 'active' AND whose heartbeat is within
162
+ * the TTL window. Workers older than the TTL are NOT auto-marked crashed
163
+ * here — that's a separate janitor responsibility — but they are filtered
164
+ * out of the active set so callers see a fresh view.
165
+ */
166
+ export function getActiveAutoWorkers(): readonly AutoWorkerRow[] {
167
+ if (!isDbAvailable()) return [];
168
+ const db = _getAdapter()!;
169
+ const cutoffMs = Date.now() - HEARTBEAT_TTL_SECONDS * 1000;
170
+ const cutoffIso = new Date(cutoffMs).toISOString();
171
+ const rows = db.prepare(
172
+ `SELECT worker_id, host, pid, started_at, version,
173
+ last_heartbeat_at, status, project_root_realpath
174
+ FROM workers
175
+ WHERE status = 'active' AND last_heartbeat_at >= :cutoff
176
+ ORDER BY started_at`,
177
+ ).all({ ":cutoff": cutoffIso }) as unknown as AutoWorkerRow[];
178
+ return rows;
179
+ }
180
+
181
+ /** Return all worker rows regardless of status or TTL. */
182
+ export function getAllAutoWorkers(): readonly AutoWorkerRow[] {
183
+ if (!isDbAvailable()) return [];
184
+ const db = _getAdapter()!;
185
+ const rows = db.prepare(
186
+ `SELECT worker_id, host, pid, started_at, version,
187
+ last_heartbeat_at, status, project_root_realpath
188
+ FROM workers
189
+ ORDER BY started_at`,
190
+ ).all() as unknown as AutoWorkerRow[];
191
+ return rows;
192
+ }
193
+
194
+ /**
195
+ * Look up a single worker row. Returns null if no row exists.
196
+ */
197
+ export function getAutoWorker(workerId: string): AutoWorkerRow | null {
198
+ if (!isDbAvailable()) return null;
199
+ const db = _getAdapter()!;
200
+ const row = db.prepare(
201
+ `SELECT worker_id, host, pid, started_at, version,
202
+ last_heartbeat_at, status, project_root_realpath
203
+ FROM workers WHERE worker_id = :worker_id`,
204
+ ).get({ ":worker_id": workerId }) as AutoWorkerRow | undefined;
205
+ return row ?? null;
206
+ }
207
+
208
+ /** Test/janitor helper: TTL constant exported for callers to compute expirations. */
209
+ export function autoWorkerHeartbeatTtlSeconds(): number {
210
+ return HEARTBEAT_TTL_SECONDS;
211
+ }
212
+
213
+ function isWorkerProcessAlive(candidate: Pick<AutoWorkerRow, "host" | "pid">): boolean {
214
+ const pid = candidate.pid;
215
+ if (!Number.isInteger(pid) || pid <= 0) return false;
216
+ if (candidate.host !== hostname()) return false;
217
+ if (pid === process.pid) return true;
218
+ try {
219
+ process.kill(pid, 0);
220
+ return true;
221
+ } catch (err) {
222
+ if ((err as NodeJS.ErrnoException).code === "EPERM") return true;
223
+ return false;
224
+ }
225
+ }
226
+
227
+ /**
228
+ * Phase C pt 2 — find the most recently active worker for a project root
229
+ * whose heartbeat has lapsed (the "previous crashed session" indicator).
230
+ *
231
+ * Used by crash-recovery.ts:readCrashLock to detect when a prior auto-mode
232
+ * session ended without cleanup. Workers are only treated as stale after
233
+ * their heartbeat has lapsed and the OS PID liveness check says the process
234
+ * is no longer alive.
235
+ *
236
+ * Returns null if no stale worker exists for this project root.
237
+ */
238
+ export function findStaleWorkerForProject(
239
+ projectRootRealpath: string,
240
+ ): AutoWorkerRow | null {
241
+ if (!isDbAvailable()) return null;
242
+ const db = _getAdapter()!;
243
+ const cutoffMs = Date.now() - HEARTBEAT_TTL_SECONDS * 1000;
244
+ const cutoffIso = new Date(cutoffMs).toISOString();
245
+ const row = db.prepare(
246
+ `SELECT worker_id, host, pid, started_at, version,
247
+ last_heartbeat_at, status, project_root_realpath
248
+ FROM workers
249
+ WHERE project_root_realpath = :project_root
250
+ AND status = 'active'
251
+ AND last_heartbeat_at < :cutoff
252
+ ORDER BY started_at DESC
253
+ LIMIT 1`,
254
+ ).get({ ":project_root": projectRootRealpath, ":cutoff": cutoffIso }) as AutoWorkerRow | undefined;
255
+ if (row && !isWorkerProcessAlive(row)) return row;
256
+
257
+ // Older rows and external fixtures may have captured a non-realpath spelling
258
+ // of the same project root, e.g. /var/... vs /private/var/... on macOS.
259
+ const canonicalProjectRoot = normalizeRealPath(projectRootRealpath);
260
+ const staleRows = db.prepare(
261
+ `SELECT worker_id, host, pid, started_at, version,
262
+ last_heartbeat_at, status, project_root_realpath
263
+ FROM workers
264
+ WHERE status = 'active'
265
+ AND last_heartbeat_at < :cutoff
266
+ ORDER BY started_at DESC`,
267
+ ).all({ ":cutoff": cutoffIso }) as unknown as AutoWorkerRow[];
268
+ return staleRows.find(
269
+ (candidate) =>
270
+ normalizeRealPath(candidate.project_root_realpath) === canonicalProjectRoot
271
+ && !isWorkerProcessAlive(candidate),
272
+ ) ?? null;
273
+ }