gsd-pi 2.78.1-dev.d8826a445 → 2.78.1-dev.eccf86e27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. package/README.md +5 -7
  2. package/dist/help-text.js +1 -1
  3. package/dist/resource-loader.js +6 -1
  4. package/dist/resources/.managed-resources-content-hash +1 -1
  5. package/dist/resources/extensions/gsd/auto/detect-stuck.js +41 -5
  6. package/dist/resources/extensions/gsd/auto/loop.js +235 -36
  7. package/dist/resources/extensions/gsd/auto/phases.js +7 -5
  8. package/dist/resources/extensions/gsd/auto/session.js +33 -0
  9. package/dist/resources/extensions/gsd/auto-dispatch.js +46 -2
  10. package/dist/resources/extensions/gsd/auto-post-unit.js +19 -11
  11. package/dist/resources/extensions/gsd/auto-worktree.js +26 -187
  12. package/dist/resources/extensions/gsd/auto.js +79 -50
  13. package/dist/resources/extensions/gsd/bootstrap/register-hooks.js +9 -4
  14. package/dist/resources/extensions/gsd/crash-recovery.js +160 -47
  15. package/dist/resources/extensions/gsd/db/auto-workers.js +227 -0
  16. package/dist/resources/extensions/gsd/db/command-queue.js +105 -0
  17. package/dist/resources/extensions/gsd/db/milestone-leases.js +210 -0
  18. package/dist/resources/extensions/gsd/db/runtime-kv.js +91 -0
  19. package/dist/resources/extensions/gsd/db/unit-dispatches.js +322 -0
  20. package/dist/resources/extensions/gsd/docs/COORDINATION.md +42 -0
  21. package/dist/resources/extensions/gsd/doctor-proactive.js +4 -0
  22. package/dist/resources/extensions/gsd/doctor-runtime-checks.js +22 -6
  23. package/dist/resources/extensions/gsd/doctor.js +12 -2
  24. package/dist/resources/extensions/gsd/gsd-db.js +161 -3
  25. package/dist/resources/extensions/gsd/guided-flow.js +6 -2
  26. package/dist/resources/extensions/gsd/interrupted-session.js +18 -15
  27. package/dist/resources/extensions/gsd/state.js +21 -6
  28. package/dist/resources/extensions/gsd/worktree-resolver.js +64 -0
  29. package/dist/tsconfig.extensions.tsbuildinfo +1 -1
  30. package/dist/web/standalone/.next/BUILD_ID +1 -1
  31. package/dist/web/standalone/.next/app-path-routes-manifest.json +12 -12
  32. package/dist/web/standalone/.next/build-manifest.json +2 -2
  33. package/dist/web/standalone/.next/prerender-manifest.json +3 -3
  34. package/dist/web/standalone/.next/server/app/_global-error.html +1 -1
  35. package/dist/web/standalone/.next/server/app/_global-error.rsc +1 -1
  36. package/dist/web/standalone/.next/server/app/_global-error.segments/_full.segment.rsc +1 -1
  37. package/dist/web/standalone/.next/server/app/_global-error.segments/_global-error/__PAGE__.segment.rsc +1 -1
  38. package/dist/web/standalone/.next/server/app/_global-error.segments/_global-error.segment.rsc +1 -1
  39. package/dist/web/standalone/.next/server/app/_global-error.segments/_head.segment.rsc +1 -1
  40. package/dist/web/standalone/.next/server/app/_global-error.segments/_index.segment.rsc +1 -1
  41. package/dist/web/standalone/.next/server/app/_global-error.segments/_tree.segment.rsc +1 -1
  42. package/dist/web/standalone/.next/server/app/_not-found.html +1 -1
  43. package/dist/web/standalone/.next/server/app/_not-found.rsc +1 -1
  44. package/dist/web/standalone/.next/server/app/_not-found.segments/_full.segment.rsc +1 -1
  45. package/dist/web/standalone/.next/server/app/_not-found.segments/_head.segment.rsc +1 -1
  46. package/dist/web/standalone/.next/server/app/_not-found.segments/_index.segment.rsc +1 -1
  47. package/dist/web/standalone/.next/server/app/_not-found.segments/_not-found/__PAGE__.segment.rsc +1 -1
  48. package/dist/web/standalone/.next/server/app/_not-found.segments/_not-found.segment.rsc +1 -1
  49. package/dist/web/standalone/.next/server/app/_not-found.segments/_tree.segment.rsc +1 -1
  50. package/dist/web/standalone/.next/server/app/index.html +1 -1
  51. package/dist/web/standalone/.next/server/app/index.rsc +1 -1
  52. package/dist/web/standalone/.next/server/app/index.segments/__PAGE__.segment.rsc +1 -1
  53. package/dist/web/standalone/.next/server/app/index.segments/_full.segment.rsc +1 -1
  54. package/dist/web/standalone/.next/server/app/index.segments/_head.segment.rsc +1 -1
  55. package/dist/web/standalone/.next/server/app/index.segments/_index.segment.rsc +1 -1
  56. package/dist/web/standalone/.next/server/app/index.segments/_tree.segment.rsc +1 -1
  57. package/dist/web/standalone/.next/server/app-paths-manifest.json +12 -12
  58. package/dist/web/standalone/.next/server/middleware-build-manifest.js +1 -1
  59. package/dist/web/standalone/.next/server/pages/404.html +1 -1
  60. package/dist/web/standalone/.next/server/pages/500.html +1 -1
  61. package/dist/web/standalone/.next/server/server-reference-manifest.json +1 -1
  62. package/package.json +1 -1
  63. package/src/resources/extensions/gsd/auto/detect-stuck.ts +37 -5
  64. package/src/resources/extensions/gsd/auto/loop.ts +263 -41
  65. package/src/resources/extensions/gsd/auto/phases.ts +7 -5
  66. package/src/resources/extensions/gsd/auto/session.ts +36 -0
  67. package/src/resources/extensions/gsd/auto-dispatch.ts +53 -2
  68. package/src/resources/extensions/gsd/auto-post-unit.ts +19 -11
  69. package/src/resources/extensions/gsd/auto-worktree.ts +26 -211
  70. package/src/resources/extensions/gsd/auto.ts +89 -44
  71. package/src/resources/extensions/gsd/bootstrap/register-hooks.ts +9 -4
  72. package/src/resources/extensions/gsd/crash-recovery.ts +177 -43
  73. package/src/resources/extensions/gsd/db/auto-workers.ts +273 -0
  74. package/src/resources/extensions/gsd/db/command-queue.ts +149 -0
  75. package/src/resources/extensions/gsd/db/milestone-leases.ts +274 -0
  76. package/src/resources/extensions/gsd/db/runtime-kv.ts +127 -0
  77. package/src/resources/extensions/gsd/db/unit-dispatches.ts +446 -0
  78. package/src/resources/extensions/gsd/docs/COORDINATION.md +42 -0
  79. package/src/resources/extensions/gsd/doctor-proactive.ts +4 -0
  80. package/src/resources/extensions/gsd/doctor-runtime-checks.ts +24 -6
  81. package/src/resources/extensions/gsd/doctor.ts +10 -2
  82. package/src/resources/extensions/gsd/gsd-db.ts +170 -3
  83. package/src/resources/extensions/gsd/guided-flow.ts +6 -2
  84. package/src/resources/extensions/gsd/interrupted-session.ts +19 -12
  85. package/src/resources/extensions/gsd/state.ts +44 -6
  86. package/src/resources/extensions/gsd/tests/auto-loop-no-copy-artifacts.test.ts +72 -0
  87. package/src/resources/extensions/gsd/tests/auto-loop-symlink-worktree.test.ts +190 -0
  88. package/src/resources/extensions/gsd/tests/auto-workers.test.ts +105 -0
  89. package/src/resources/extensions/gsd/tests/command-queue.test.ts +141 -0
  90. package/src/resources/extensions/gsd/tests/crash-recovery-via-db.test.ts +203 -0
  91. package/src/resources/extensions/gsd/tests/crash-recovery.test.ts +169 -59
  92. package/src/resources/extensions/gsd/tests/detect-stuck-respects-retry.test.ts +173 -0
  93. package/src/resources/extensions/gsd/tests/integration/auto-worktree.test.ts +22 -12
  94. package/src/resources/extensions/gsd/tests/integration/doctor-proactive.test.ts +24 -10
  95. package/src/resources/extensions/gsd/tests/integration/doctor-runtime.test.ts +35 -23
  96. package/src/resources/extensions/gsd/tests/integration/workspace-collapse-integration.test.ts +3 -5
  97. package/src/resources/extensions/gsd/tests/interrupted-session-auto.test.ts +72 -25
  98. package/src/resources/extensions/gsd/tests/interrupted-session-ui.test.ts +72 -25
  99. package/src/resources/extensions/gsd/tests/memory-pressure-stuck-state.test.ts +9 -6
  100. package/src/resources/extensions/gsd/tests/milestone-leases.test.ts +152 -0
  101. package/src/resources/extensions/gsd/tests/parallel-milestone-isolation.test.ts +106 -0
  102. package/src/resources/extensions/gsd/tests/paused-session-via-db.test.ts +119 -0
  103. package/src/resources/extensions/gsd/tests/pipeline-variant-dispatch.test.ts +58 -0
  104. package/src/resources/extensions/gsd/tests/preferences-worktree-sync.test.ts +3 -17
  105. package/src/resources/extensions/gsd/tests/register-hooks-depth-verification.test.ts +110 -0
  106. package/src/resources/extensions/gsd/tests/runtime-kv.test.ts +120 -0
  107. package/src/resources/extensions/gsd/tests/skipped-validation-completion.test.ts +133 -28
  108. package/src/resources/extensions/gsd/tests/skipped-validation-db-atomicity.test.ts +17 -0
  109. package/src/resources/extensions/gsd/tests/stuck-state-via-db.test.ts +134 -0
  110. package/src/resources/extensions/gsd/tests/sync-layer-scope.test.ts +7 -26
  111. package/src/resources/extensions/gsd/tests/teardown-cleanup-parity.test.ts +4 -8
  112. package/src/resources/extensions/gsd/tests/unit-dispatches.test.ts +247 -0
  113. package/src/resources/extensions/gsd/tests/validate-milestone.test.ts +41 -1
  114. package/src/resources/extensions/gsd/tests/workspace.test.ts +15 -9
  115. package/src/resources/extensions/gsd/tests/write-gate.test.ts +31 -23
  116. package/src/resources/extensions/gsd/worktree-resolver.ts +62 -0
  117. package/src/resources/extensions/gsd/tests/auto-lock-creation.test.ts +0 -213
  118. package/src/resources/extensions/gsd/tests/auto-stale-lock-self-kill.test.ts +0 -87
  119. package/src/resources/extensions/gsd/tests/stop-auto-remote.test.ts +0 -159
  120. /package/dist/web/standalone/.next/static/{AT5qi39nKXkdmQIOIoh0f → Y5UeGFkXTYM9WIQOWHkot}/_buildManifest.js +0 -0
  121. /package/dist/web/standalone/.next/static/{AT5qi39nKXkdmQIOIoh0f → Y5UeGFkXTYM9WIQOWHkot}/_ssgManifest.js +0 -0
@@ -11,7 +11,8 @@
11
11
  */
12
12
  import { deriveState } from "./state.js";
13
13
  import { parseUnitId } from "./unit-id.js";
14
- import { assessInterruptedSession, readPausedSessionMetadata, } from "./interrupted-session.js";
14
+ import { assessInterruptedSession, readPausedSessionMetadata, PAUSED_SESSION_KV_KEY, } from "./interrupted-session.js";
15
+ import { setRuntimeKv, deleteRuntimeKv, } from "./db/runtime-kv.js";
15
16
  import { getManifestStatus } from "./files.js";
16
17
  export { inlinePriorMilestoneSummary } from "./files.js";
17
18
  import { collectSecretsFromManifest } from "../get-secrets-from-user.js";
@@ -40,7 +41,7 @@ import { setLogBasePath, logWarning } from "./workflow-logger.js";
40
41
  import { preflightCleanRoot, postflightPopStash } from "./clean-root-preflight.js";
41
42
  import { isAbsolute, join } from "node:path";
42
43
  import { pathToFileURL } from "node:url";
43
- import { readFileSync, existsSync, mkdirSync, unlinkSync } from "node:fs";
44
+ import { readFileSync, existsSync, mkdirSync } from "node:fs";
44
45
  import { atomicWriteSync } from "./atomic-write.js";
45
46
  import { autoCommitCurrentBranch, captureIntegrationBranch, detectWorktreeName, getCurrentBranch, getMainBranch, setActiveMilestoneId, } from "./worktree.js";
46
47
  import { GitServiceImpl } from "./git-service.js";
@@ -87,6 +88,9 @@ export { STUB_RECOVERY_THRESHOLD, NEW_SESSION_TIMEOUT_MS, } from "./auto/session
87
88
  import { autoSession as s } from "./auto-runtime-state.js";
88
89
  import { gsdHome } from "./gsd-home.js";
89
90
  import { createWorkspace, scopeMilestone } from "./workspace.js";
91
+ import { registerAutoWorker, markWorkerStopping } from "./db/auto-workers.js";
92
+ import { releaseMilestoneLease } from "./db/milestone-leases.js";
93
+ import { normalizeRealPath } from "./paths.js";
90
94
  // ── ENCAPSULATION INVARIANT ─────────────────────────────────────────────────
91
95
  // ALL mutable auto-mode state lives in the AutoSession class (auto/session.ts).
92
96
  // This file must NOT declare module-level `let` or `var` variables for state.
@@ -101,6 +105,27 @@ import { createWorkspace, scopeMilestone } from "./workspace.js";
101
105
  // ─────────────────────────────────────────────────────────────────────────────
102
106
  /** Throttle STATE.md rebuilds — at most once per 30 seconds */
103
107
  const STATE_REBUILD_MIN_INTERVAL_MS = 30_000;
108
+ /**
109
+ * Phase B — register this auto-mode process in the workers table so other
110
+ * workers and janitors can detect liveness via heartbeat. Best-effort: if
111
+ * the DB is unavailable (e.g. fresh project before init) we skip registration
112
+ * silently rather than blocking session start.
113
+ */
114
+ function registerAutoWorkerForSession(session) {
115
+ if (session.workerId)
116
+ return; // already registered (e.g. resume re-runs)
117
+ try {
118
+ const projectRootRealpath = normalizeRealPath(session.scope?.workspace.projectRoot
119
+ ?? (session.originalBasePath || session.basePath));
120
+ session.workerId = registerAutoWorker({ projectRootRealpath });
121
+ }
122
+ catch (err) {
123
+ debugLog("autoLoop", {
124
+ phase: "register-worker-failed",
125
+ error: err instanceof Error ? err.message : String(err),
126
+ });
127
+ }
128
+ }
104
129
  function captureProjectRootEnv(projectRoot) {
105
130
  if (!s.projectRootEnvCaptured) {
106
131
  s.hadProjectRootEnv = Object.prototype.hasOwnProperty.call(process.env, "GSD_PROJECT_ROOT");
@@ -621,6 +646,21 @@ export async function stopAuto(ctx, pi, reason) {
621
646
  catch (e) {
622
647
  debugLog("stop-cleanup-locks", { error: e instanceof Error ? e.message : String(e) });
623
648
  }
649
+ // ── Step 1b: Coordination cleanup (Phase B) ──
650
+ // Release any active milestone lease so other workers don't have to
651
+ // wait for TTL expiry, then mark this worker as stopping. Best-effort:
652
+ // DB unavailability or stale state must not block shutdown.
653
+ try {
654
+ if (s.workerId && s.currentMilestoneId && s.milestoneLeaseToken) {
655
+ releaseMilestoneLease(s.workerId, s.currentMilestoneId, s.milestoneLeaseToken);
656
+ }
657
+ if (s.workerId) {
658
+ markWorkerStopping(s.workerId);
659
+ }
660
+ }
661
+ catch (e) {
662
+ debugLog("stop-cleanup-coordination", { error: e instanceof Error ? e.message : String(e) });
663
+ }
624
664
  // ── Step 1b: Flush queued follow-up messages (#3512) ──
625
665
  // Late async notifications (async_job_result, gsd-auto-wrapup) can trigger
626
666
  // extra LLM turns after stop. Flush them the same way run-unit.ts does.
@@ -799,13 +839,12 @@ export async function stopAuto(ctx, pi, reason) {
799
839
  debugLog("stop-cleanup-metrics", { error: e instanceof Error ? e.message : String(e) });
800
840
  }
801
841
  // ── Step 12: Remove paused-session metadata (#1383) ──
842
+ // Phase C pt 2: deleteRuntimeKv replaces unlinkSync(paused-session.json).
802
843
  try {
803
- const pausedPath = join(gsdRoot(s.originalBasePath || s.basePath), "runtime", "paused-session.json");
804
- if (existsSync(pausedPath))
805
- unlinkSync(pausedPath);
844
+ deleteRuntimeKv("global", "", PAUSED_SESSION_KV_KEY);
806
845
  }
807
846
  catch (err) { /* non-fatal */
808
- logWarning("engine", `file unlink failed: ${err instanceof Error ? err.message : String(err)}`, { file: "auto.ts" });
847
+ logWarning("engine", `paused-session DB delete failed: ${err instanceof Error ? err.message : String(err)}`, { file: "auto.ts" });
809
848
  }
810
849
  // ── Step 13: Restore original model + thinking (before reset clears IDs) ──
811
850
  try {
@@ -897,10 +936,12 @@ export async function pauseAuto(ctx, _pi, _errorContext) {
897
936
  resolveAgentEndCancelled(_errorContext);
898
937
  s.pausedSessionFile = normalizeSessionFilePath(ctx?.sessionManager?.getSessionFile() ?? null);
899
938
  // Persist paused-session metadata so resume survives /exit (#1383).
900
- // The fresh-start bootstrap checks for this file and restores worktree context.
939
+ // Phase C pt 2: persisted to runtime_kv (global scope, key
940
+ // PAUSED_SESSION_KV_KEY) instead of runtime/paused-session.json. The
941
+ // fresh-start bootstrap below reads from the same key.
901
942
  try {
902
943
  const pausedMeta = {
903
- milestoneId: s.currentMilestoneId,
944
+ milestoneId: s.currentMilestoneId ?? undefined,
904
945
  worktreePath: isInAutoWorktree(s.basePath) ? s.basePath : null,
905
946
  originalBasePath: s.originalBasePath,
906
947
  stepMode: s.stepMode,
@@ -908,17 +949,16 @@ export async function pauseAuto(ctx, _pi, _errorContext) {
908
949
  sessionFile: s.pausedSessionFile,
909
950
  unitType: s.currentUnit?.type ?? undefined,
910
951
  unitId: s.currentUnit?.id ?? undefined,
911
- activeEngineId: s.activeEngineId,
952
+ activeEngineId: s.activeEngineId ?? undefined,
912
953
  activeRunDir: s.activeRunDir,
913
954
  autoStartTime: s.autoStartTime,
914
955
  milestoneLock: s.sessionMilestoneLock ?? undefined,
915
956
  };
916
- const runtimeDir = join(gsdRoot(s.originalBasePath || s.basePath), "runtime");
917
- atomicWriteSync(join(runtimeDir, "paused-session.json"), JSON.stringify(pausedMeta, null, 2), "utf-8");
957
+ setRuntimeKv("global", "", PAUSED_SESSION_KV_KEY, pausedMeta);
918
958
  }
919
959
  catch (err) {
920
960
  // Non-fatal — resume will still work via full bootstrap, just without worktree context
921
- logWarning("engine", `paused-session file write failed: ${err instanceof Error ? err.message : String(err)}`, { file: "auto.ts" });
961
+ logWarning("engine", `paused-session DB write failed: ${err instanceof Error ? err.message : String(err)}`, { file: "auto.ts" });
922
962
  }
923
963
  // Close out the current unit so its runtime record doesn't stay at "dispatched"
924
964
  if (s.currentUnit && ctx) {
@@ -1154,8 +1194,10 @@ export async function startAuto(ctx, pi, base, verboseMode, options) {
1154
1194
  if (recoverFailedMigration(base)) {
1155
1195
  ctx.ui.notify("Recovered unfinished migration (.gsd.migrating → .gsd).", "info");
1156
1196
  }
1157
- const freshStartAssessment = interruptedAssessment
1158
- ?? await assessInterruptedSession(base);
1197
+ const freshStartAssessment = await (interruptedAssessment
1198
+ ?? (() => {
1199
+ return ensureDbOpen(base).then(() => assessInterruptedSession(base));
1200
+ })());
1159
1201
  if (freshStartAssessment.classification === "running") {
1160
1202
  const pid = freshStartAssessment.lock?.pid;
1161
1203
  ctx.ui.notify(pid
@@ -1165,10 +1207,20 @@ export async function startAuto(ctx, pi, base, verboseMode, options) {
1165
1207
  }
1166
1208
  // If resuming from paused state, just re-activate and dispatch next unit.
1167
1209
  // Check persisted paused-session first (#1383) — survives /exit.
1210
+ // Phase C pt 2: persisted in runtime_kv (global scope) instead of
1211
+ // runtime/paused-session.json. The `clearPausedSession` helper
1212
+ // replaces every prior unlinkSync(pausedPath) call.
1213
+ const clearPausedSession = (logTag) => {
1214
+ try {
1215
+ deleteRuntimeKv("global", "", PAUSED_SESSION_KV_KEY);
1216
+ }
1217
+ catch (err) {
1218
+ logWarning("session", `${logTag}: ${err instanceof Error ? err.message : String(err)}`, { file: "auto.ts" });
1219
+ }
1220
+ };
1168
1221
  if (!s.paused) {
1169
1222
  try {
1170
1223
  const meta = freshStartAssessment.pausedSession ?? readPausedSessionMetadata(base);
1171
- const pausedPath = join(gsdRoot(base), "runtime", "paused-session.json");
1172
1224
  if (meta?.activeEngineId && meta.activeEngineId !== "dev") {
1173
1225
  // Custom workflow resume — restore engine state
1174
1226
  s.activeEngineId = meta.activeEngineId;
@@ -1178,14 +1230,6 @@ export async function startAuto(ctx, pi, base, verboseMode, options) {
1178
1230
  s.autoStartTime = meta.autoStartTime || Date.now();
1179
1231
  s.sessionMilestoneLock = meta.milestoneLock ?? null;
1180
1232
  s.paused = true;
1181
- try {
1182
- unlinkSync(pausedPath);
1183
- }
1184
- catch (e) {
1185
- if (e.code !== "ENOENT") {
1186
- logWarning("session", `pause file cleanup failed: ${e instanceof Error ? e.message : String(e)}`, { file: "auto.ts" });
1187
- }
1188
- }
1189
1233
  ctx.ui.notify(`Resuming paused custom workflow${meta.activeRunDir ? ` (${meta.activeRunDir})` : ""}.`, "info");
1190
1234
  }
1191
1235
  else if (meta?.milestoneId) {
@@ -1223,14 +1267,7 @@ export async function startAuto(ctx, pi, base, verboseMode, options) {
1223
1267
  }
1224
1268
  }
1225
1269
  if (!mDir || summaryIsTerminal) {
1226
- try {
1227
- unlinkSync(pausedPath);
1228
- }
1229
- catch (err) {
1230
- if (err.code !== "ENOENT") {
1231
- logWarning("session", `pause file cleanup failed: ${err instanceof Error ? err.message : String(err)}`, { file: "auto.ts" });
1232
- }
1233
- }
1270
+ clearPausedSession("paused-session DB cleanup failed (milestone gone/complete)");
1234
1271
  ctx.ui.notify(`Paused milestone ${meta.milestoneId} is ${!mDir ? "missing" : "already complete"}. Starting fresh.`, "info");
1235
1272
  }
1236
1273
  else {
@@ -1255,26 +1292,13 @@ export async function startAuto(ctx, pi, base, verboseMode, options) {
1255
1292
  : (s.originalBasePath || base);
1256
1293
  rebuildScope(rawForScope, s.currentMilestoneId);
1257
1294
  }
1258
- try {
1259
- unlinkSync(pausedPath);
1260
- }
1261
- catch (e) {
1262
- if (e.code !== "ENOENT") {
1263
- logWarning("session", `pause file cleanup failed: ${e instanceof Error ? e.message : String(e)}`, { file: "auto.ts" });
1264
- }
1265
- }
1266
1295
  ctx.ui.notify(`Resuming paused session for ${meta.milestoneId}${meta.worktreePath && existsSync(meta.worktreePath) ? ` (worktree)` : ""}.`, "info");
1267
1296
  }
1268
1297
  }
1269
- else if (existsSync(pausedPath)) {
1270
- try {
1271
- unlinkSync(pausedPath);
1272
- }
1273
- catch (e) {
1274
- if (e.code !== "ENOENT") {
1275
- logWarning("session", `stale pause file cleanup failed: ${e instanceof Error ? e.message : String(e)}`, { file: "auto.ts" });
1276
- }
1277
- }
1298
+ else if (meta) {
1299
+ // Stale paused-session metadata that the assessment chose not to
1300
+ // resume — clean it up so the next bootstrap starts fresh.
1301
+ clearPausedSession("stale paused-session DB cleanup failed");
1278
1302
  }
1279
1303
  }
1280
1304
  }
@@ -1425,10 +1449,14 @@ export async function startAuto(ctx, pi, base, verboseMode, options) {
1425
1449
  }
1426
1450
  s.pausedSessionFile = null;
1427
1451
  }
1452
+ captureProjectRootEnv(s.originalBasePath || s.basePath);
1453
+ registerAutoWorkerForSession(s);
1428
1454
  updateSessionLock(lockBase(), "resuming", s.currentMilestoneId ?? "unknown");
1429
- writeLock(lockBase(), "resuming", s.currentMilestoneId ?? "unknown");
1455
+ if (s.workerId) {
1456
+ writeLock(lockBase(), "resuming", s.currentMilestoneId ?? "unknown");
1457
+ clearPausedSession("paused-session DB cleanup failed (resume activation)");
1458
+ }
1430
1459
  pi.events.emit(CMUX_CHANNELS.LOG, { preferences: loadEffectiveGSDPreferences(s.basePath || undefined)?.preferences, message: s.stepMode ? "Step-mode resumed." : "Auto-mode resumed.", level: "progress" });
1431
- captureProjectRootEnv(s.originalBasePath || s.basePath);
1432
1460
  startAutoCommandPolling(s.basePath);
1433
1461
  await runAutoLoopWithUok({
1434
1462
  ctx,
@@ -1455,6 +1483,7 @@ export async function startAuto(ctx, pi, base, verboseMode, options) {
1455
1483
  // s.currentMilestoneId (including worktree setup inside bootstrapAutoSession).
1456
1484
  rebuildScope(s.basePath, s.currentMilestoneId);
1457
1485
  captureProjectRootEnv(s.originalBasePath || s.basePath);
1486
+ registerAutoWorkerForSession(s);
1458
1487
  try {
1459
1488
  pi.events.emit(CMUX_CHANNELS.SIDEBAR, { action: "sync", preferences: loadEffectiveGSDPreferences(s.basePath || undefined)?.preferences, state: await deriveState(s.basePath) });
1460
1489
  }
@@ -515,8 +515,13 @@ export function registerHooks(pi, ecosystemHandlers) {
515
515
  const currentPendingGate = getPendingGate();
516
516
  if (currentPendingGate) {
517
517
  if (details?.cancelled || !details?.response) {
518
- // Gate stays pending. Return a hard instruction as the tool result so
519
- // the model cannot reinterpret a cancelled prompt as prior approval.
518
+ // Gate stays pending. Direct the agent to the most reliable recovery
519
+ // path — re-calling ask_user_questions with the same gate id — without
520
+ // misrepresenting the plain-text path. The plain-text path also works
521
+ // (isExplicitApprovalResponse on the next before_agent_start clears
522
+ // the gate when the user replies with an approval keyword), but the
523
+ // structured re-ask is more deterministic and gives the user a clear UI.
524
+ resetToolCallLoopGuard();
520
525
  return {
521
526
  content: [{
522
527
  type: "text",
@@ -524,8 +529,8 @@ export function registerHooks(pi, ecosystemHandlers) {
524
529
  `HARD BLOCK: approval gate "${currentPendingGate}" is still pending.`,
525
530
  "No user response was received for the confirmation question.",
526
531
  "Do not infer approval from earlier or prior messages.",
527
- "Do not proceed, write files, save artifacts, or call more tools.",
528
- "Ask the user to confirm in plain chat, then stop and wait for their next message.",
532
+ "Do not proceed, write files, save artifacts, or call other tools.",
533
+ `Re-call ask_user_questions with the same gate question id ("${currentPendingGate}") and wait for the user's response.`,
529
534
  ].join(" "),
530
535
  }],
531
536
  };
@@ -1,24 +1,106 @@
1
1
  /**
2
- * GSD Crash Recovery
2
+ * GSD Crash Recovery (Phase C pt 2 — DB-backed)
3
3
  *
4
- * Detects interrupted auto-mode sessions via a lock file.
5
- * Written on auto-start, updated on each unit dispatch, deleted on clean stop.
6
- * If the lock file exists on next startup, the previous session crashed.
4
+ * Detects interrupted auto-mode sessions via the DB-backed workers +
5
+ * unit_dispatches + runtime_kv tables. The lock file is still written best-effort by writeLock; the
6
+ * `LockData` shape is preserved for backward compatibility with callers
7
+ * (auto.ts, doctor checks, interrupted-session.ts), but the contents are
8
+ * now synthesized from:
7
9
  *
8
- * The lock records the pi session file path so crash recovery can read the
9
- * surviving JSONL (pi appends entries incrementally via appendFileSync,
10
- * so the file on disk reflects every tool call up to the crash point).
10
+ * - workers.pid / .started_at / .last_heartbeat_at → liveness + age
11
+ * - unit_dispatches.unit_type / .unit_id / .started_at → what was running
12
+ * - runtime_kv("worker", workerId, "session_file") → pi session JSONL path
13
+ *
14
+ * "Crashed" is detected via workers.status='active' + heartbeat past TTL,
15
+ * cross-checked with the OS PID via isLockProcessAlive(). When the DB is
16
+ * unavailable (fresh project before init), all readers return null and
17
+ * writers no-op — preserving the historical "no lock means no prior
18
+ * crash" semantics.
19
+ *
20
+ * The journal-based emitCrashRecoveredUnitEnd is unchanged from the file
21
+ * era — it queries the journal independently of the lock mechanism.
11
22
  */
23
+ import { emitJournalEvent, queryJournal, } from "./journal.js";
12
24
  import { readFileSync, unlinkSync, existsSync } from "node:fs";
13
25
  import { join } from "node:path";
14
- import { gsdRoot } from "./paths.js";
26
+ import { findStaleWorkerForProject, getAllAutoWorkers, } from "./db/auto-workers.js";
27
+ import { getRuntimeKv, setRuntimeKv, deleteRuntimeKv } from "./db/runtime-kv.js";
28
+ import { _getAdapter, isDbAvailable } from "./gsd-db.js";
29
+ import { gsdRoot, normalizeRealPath } from "./paths.js";
15
30
  import { atomicWriteSync } from "./atomic-write.js";
16
31
  import { effectiveLockFile } from "./session-lock.js";
17
- import { emitJournalEvent, queryJournal } from "./journal.js";
32
+ const SESSION_FILE_KV_KEY = "session_file";
18
33
  function lockPath(basePath) {
19
34
  return join(gsdRoot(basePath), effectiveLockFile());
20
35
  }
21
- /** Write or update the lock file with current auto-mode state. */
36
+ function readLegacyLock(basePath) {
37
+ try {
38
+ const p = lockPath(basePath);
39
+ if (!existsSync(p))
40
+ return null;
41
+ return JSON.parse(readFileSync(p, "utf-8"));
42
+ }
43
+ catch {
44
+ return null;
45
+ }
46
+ }
47
+ function findActiveWorkerForCurrentProcess(projectRootRealpath) {
48
+ if (!isDbAvailable())
49
+ return null;
50
+ const workers = getAllAutoWorkers();
51
+ for (const worker of workers) {
52
+ if (worker.pid === process.pid
53
+ && worker.project_root_realpath === projectRootRealpath) {
54
+ return worker;
55
+ }
56
+ }
57
+ return null;
58
+ }
59
+ /**
60
+ * Look up the most recent dispatch row for a worker, regardless of status.
61
+ * Returns null if the worker has no dispatch history yet (e.g. crashed
62
+ * during bootstrap before claiming the first unit).
63
+ */
64
+ function getLatestDispatchForWorker(workerId) {
65
+ if (!isDbAvailable())
66
+ return null;
67
+ const db = _getAdapter();
68
+ const row = db.prepare(`SELECT unit_type, unit_id, started_at, status
69
+ FROM unit_dispatches
70
+ WHERE worker_id = :worker_id
71
+ ORDER BY id DESC
72
+ LIMIT 1`).get({ ":worker_id": workerId });
73
+ return row ?? null;
74
+ }
75
+ function workerToLockData(worker) {
76
+ const dispatch = getLatestDispatchForWorker(worker.worker_id);
77
+ const sessionFile = getRuntimeKv("worker", worker.worker_id, SESSION_FILE_KV_KEY) ?? undefined;
78
+ return {
79
+ pid: worker.pid,
80
+ startedAt: worker.started_at,
81
+ // Pre-Phase-C-pt-2 default: when no dispatch row exists yet (bootstrap
82
+ // crash), report unitType="starting", unitId="bootstrap" — same shape
83
+ // the file-based writer used to produce.
84
+ unitType: dispatch?.unit_type ?? "starting",
85
+ unitId: dispatch?.unit_id ?? "bootstrap",
86
+ unitStartedAt: dispatch?.started_at ?? worker.started_at,
87
+ sessionFile,
88
+ };
89
+ }
90
+ /**
91
+ * Write or update the lock state for the current auto-mode session.
92
+ *
93
+ * Phase C pt 2: the only persistent state this function adds beyond what
94
+ * the workers + unit_dispatches tables already track is the pi session
95
+ * JSONL path, which lands in runtime_kv (worker scope, key
96
+ * "session_file"). The pid/startedAt/unitType/unitId/unitStartedAt are
97
+ * recorded by registerAutoWorker / heartbeatAutoWorker / recordDispatchClaim
98
+ * already.
99
+ *
100
+ * basePath (kept for back-compat with the 15+ call sites) is still read: it
101
+ * locates the lock file and resolves the project root — the worker is identified by
102
+ * pid + project_root_realpath in the workers table.
103
+ */
22
104
  export function writeLock(basePath, unitType, unitId, sessionFile) {
23
105
  try {
24
106
  const data = {
@@ -29,51 +111,86 @@ export function writeLock(basePath, unitType, unitId, sessionFile) {
29
111
  unitStartedAt: new Date().toISOString(),
30
112
  sessionFile,
31
113
  };
32
- const lp = lockPath(basePath);
33
- atomicWriteSync(lp, JSON.stringify(data, null, 2));
114
+ atomicWriteSync(lockPath(basePath), JSON.stringify(data, null, 2));
115
+ }
116
+ catch {
117
+ // Best-effort — never throw from the lock writer.
118
+ }
119
+ if (!isDbAvailable() || !sessionFile)
120
+ return;
121
+ try {
122
+ const projectRoot = normalizeRealPath(basePath);
123
+ const worker = findActiveWorkerForCurrentProcess(projectRoot);
124
+ if (!worker)
125
+ return;
126
+ setRuntimeKv("worker", worker.worker_id, SESSION_FILE_KV_KEY, sessionFile);
34
127
  }
35
- catch (e) { /* non-fatal: lock write failure */
36
- void e;
128
+ catch {
129
+ // Best-effort — never throw from the lock writer.
37
130
  }
38
131
  }
39
- /** Remove the lock file on clean stop. */
132
+ /**
133
+ * Phase C pt 2: clearLock still unlinks the legacy lock file if present. The cleanup path
134
+ * (markWorkerStopping in stopAuto) flips the workers row to 'stopping'.
135
+ * This function additionally drops the session_file runtime_kv row for
136
+ * the current worker so a follow-up crash detection doesn't pick up a
137
+ * stale session-file pointer.
138
+ */
40
139
  export function clearLock(basePath) {
41
140
  try {
42
141
  const p = lockPath(basePath);
43
142
  if (existsSync(p))
44
143
  unlinkSync(p);
45
144
  }
46
- catch (e) { /* non-fatal: lock clear failure */
47
- void e;
145
+ catch {
146
+ // Best-effort.
48
147
  }
49
- }
50
- /** Check if a crash lock exists and return its data. */
51
- export function readCrashLock(basePath) {
148
+ if (!isDbAvailable())
149
+ return;
52
150
  try {
53
- const p = lockPath(basePath);
54
- if (!existsSync(p))
55
- return null;
56
- const raw = readFileSync(p, "utf-8");
57
- return JSON.parse(raw);
151
+ const projectRoot = normalizeRealPath(basePath);
152
+ const worker = findActiveWorkerForCurrentProcess(projectRoot);
153
+ if (!worker)
154
+ return;
155
+ deleteRuntimeKv("worker", worker.worker_id, SESSION_FILE_KV_KEY);
58
156
  }
59
- catch (e) {
60
- /* non-fatal: corrupt or unreadable lock file */ void e;
61
- return null;
157
+ catch {
158
+ // Best-effort.
159
+ }
160
+ }
161
+ /**
162
+ * Detect a previous crashed auto-mode session.
163
+ *
164
+ * Phase C pt 2: synthesized from workers (status='active' + lapsed
165
+ * heartbeat) + unit_dispatches (most recent for that worker) +
166
+ * runtime_kv (session_file). Returns null when no stale worker exists
167
+ * or the DB is unavailable.
168
+ */
169
+ export function readCrashLock(basePath) {
170
+ if (isDbAvailable()) {
171
+ try {
172
+ const projectRoot = normalizeRealPath(basePath);
173
+ const stale = findStaleWorkerForProject(projectRoot);
174
+ if (stale)
175
+ return workerToLockData(stale);
176
+ }
177
+ catch {
178
+ // Fall through to the legacy lock-file compatibility path.
179
+ }
62
180
  }
181
+ return readLegacyLock(basePath);
63
182
  }
64
183
  /**
65
184
  * Check whether the process that wrote the lock is still running.
66
185
  * Uses `process.kill(pid, 0)` which sends no signal but checks liveness.
67
186
  * Returns true if the PID matches our own — we are the lock holder (#2470).
187
+ *
188
+ * Unchanged from the file-based era — pure stateless OS check.
68
189
  */
69
190
  export function isLockProcessAlive(lock) {
70
191
  const pid = lock.pid;
71
192
  if (!Number.isInteger(pid) || pid <= 0)
72
193
  return false;
73
- // Our own PID means WE hold this lock — we are alive. (#2470)
74
- // Callers that need to distinguish "our lock" from "someone else's lock"
75
- // (e.g. startAuto checking for a prior crashed session with a recycled PID)
76
- // already guard with `crashLock.pid !== process.pid` before calling us.
77
194
  if (pid === process.pid)
78
195
  return true;
79
196
  try {
@@ -81,8 +198,6 @@ export function isLockProcessAlive(lock) {
81
198
  return true;
82
199
  }
83
200
  catch (err) {
84
- // EPERM means the process exists but we lack permission — treat as alive.
85
- // ESRCH means the process does not exist — treat as dead (stale lock).
86
201
  if (err.code === "EPERM")
87
202
  return true;
88
203
  return false;
@@ -96,7 +211,6 @@ export function formatCrashInfo(lock) {
96
211
  ` Started at: ${lock.unitStartedAt}`,
97
212
  ` PID: ${lock.pid}`,
98
213
  ];
99
- // Add recovery guidance based on what was happening when it crashed
100
214
  if (lock.unitType === "starting" && lock.unitId === "bootstrap") {
101
215
  lines.push(`No work was lost. Run /gsd auto to restart.`);
102
216
  }
@@ -113,33 +227,23 @@ export function formatCrashInfo(lock) {
113
227
  }
114
228
  /**
115
229
  * Emit a synthetic unit-end event for a unit that crashed without emitting its own.
116
- *
117
- * Queries the journal to find the most recent unit-start for the crashed unit.
118
- * If a matching unit-end already exists (e.g. the hard timeout fired), this is a
119
- * no-op. Called during crash recovery, before clearing the stale lock.
120
- *
121
- * Addresses the gap reported in #3348 where `unit-start` was emitted but no
122
- * `unit-end` followed — side effects landed but the worker died before closeout.
230
+ * Unchanged from the file era — operates on the journal, not the lock.
123
231
  */
124
232
  export function emitCrashRecoveredUnitEnd(basePath, lock) {
125
- // Skip bootstrap / starting pseudo-units — they have no meaningful unit-start event.
126
233
  if (!lock.unitType || !lock.unitId || lock.unitType === "starting")
127
234
  return;
128
235
  try {
129
236
  const all = queryJournal(basePath);
130
- // Find the most recent unit-start for this unitId
131
237
  const starts = all.filter((e) => e.eventType === "unit-start" && e.data?.unitId === lock.unitId);
132
238
  if (starts.length === 0)
133
239
  return;
134
240
  const lastStart = starts[starts.length - 1];
135
- // Check if a unit-end was already emitted (e.g. hard timeout fired after the crash)
136
241
  const alreadyClosed = all.some((e) => e.eventType === "unit-end" &&
137
242
  e.data?.unitId === lock.unitId &&
138
243
  e.causedBy?.flowId === lastStart.flowId &&
139
244
  e.causedBy?.seq === lastStart.seq);
140
245
  if (alreadyClosed)
141
246
  return;
142
- // Find the highest seq in this flow for monotonic ordering
143
247
  const maxSeq = all
144
248
  .filter((e) => e.flowId === lastStart.flowId)
145
249
  .reduce((max, e) => Math.max(max, e.seq), lastStart.seq);
@@ -158,6 +262,15 @@ export function emitCrashRecoveredUnitEnd(basePath, lock) {
158
262
  });
159
263
  }
160
264
  catch {
161
- // Never throw from crash recovery path — journal failure must not block recovery
265
+ // Never throw from crash recovery path.
162
266
  }
163
267
  }
268
+ /**
269
+ * Used by the doctor checks (doctor-runtime-checks.ts, doctor-proactive.ts)
270
+ * to look up the stale worker (if any) for the project at basePath; thin alias of readCrashLock.
271
+ * Phase C pt 2 export — surface for the same diagnostics that previously
272
+ * iterated `auto.lock` files.
273
+ */
274
+ export function findStaleAutoWorker(basePath) {
275
+ return readCrashLock(basePath);
276
+ }