gsd-pi 2.78.1-dev.d8826a445 → 2.78.1-dev.eccf86e27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. package/README.md +5 -7
  2. package/dist/help-text.js +1 -1
  3. package/dist/resource-loader.js +6 -1
  4. package/dist/resources/.managed-resources-content-hash +1 -1
  5. package/dist/resources/extensions/gsd/auto/detect-stuck.js +41 -5
  6. package/dist/resources/extensions/gsd/auto/loop.js +235 -36
  7. package/dist/resources/extensions/gsd/auto/phases.js +7 -5
  8. package/dist/resources/extensions/gsd/auto/session.js +33 -0
  9. package/dist/resources/extensions/gsd/auto-dispatch.js +46 -2
  10. package/dist/resources/extensions/gsd/auto-post-unit.js +19 -11
  11. package/dist/resources/extensions/gsd/auto-worktree.js +26 -187
  12. package/dist/resources/extensions/gsd/auto.js +79 -50
  13. package/dist/resources/extensions/gsd/bootstrap/register-hooks.js +9 -4
  14. package/dist/resources/extensions/gsd/crash-recovery.js +160 -47
  15. package/dist/resources/extensions/gsd/db/auto-workers.js +227 -0
  16. package/dist/resources/extensions/gsd/db/command-queue.js +105 -0
  17. package/dist/resources/extensions/gsd/db/milestone-leases.js +210 -0
  18. package/dist/resources/extensions/gsd/db/runtime-kv.js +91 -0
  19. package/dist/resources/extensions/gsd/db/unit-dispatches.js +322 -0
  20. package/dist/resources/extensions/gsd/docs/COORDINATION.md +42 -0
  21. package/dist/resources/extensions/gsd/doctor-proactive.js +4 -0
  22. package/dist/resources/extensions/gsd/doctor-runtime-checks.js +22 -6
  23. package/dist/resources/extensions/gsd/doctor.js +12 -2
  24. package/dist/resources/extensions/gsd/gsd-db.js +161 -3
  25. package/dist/resources/extensions/gsd/guided-flow.js +6 -2
  26. package/dist/resources/extensions/gsd/interrupted-session.js +18 -15
  27. package/dist/resources/extensions/gsd/state.js +21 -6
  28. package/dist/resources/extensions/gsd/worktree-resolver.js +64 -0
  29. package/dist/tsconfig.extensions.tsbuildinfo +1 -1
  30. package/dist/web/standalone/.next/BUILD_ID +1 -1
  31. package/dist/web/standalone/.next/app-path-routes-manifest.json +12 -12
  32. package/dist/web/standalone/.next/build-manifest.json +2 -2
  33. package/dist/web/standalone/.next/prerender-manifest.json +3 -3
  34. package/dist/web/standalone/.next/server/app/_global-error.html +1 -1
  35. package/dist/web/standalone/.next/server/app/_global-error.rsc +1 -1
  36. package/dist/web/standalone/.next/server/app/_global-error.segments/_full.segment.rsc +1 -1
  37. package/dist/web/standalone/.next/server/app/_global-error.segments/_global-error/__PAGE__.segment.rsc +1 -1
  38. package/dist/web/standalone/.next/server/app/_global-error.segments/_global-error.segment.rsc +1 -1
  39. package/dist/web/standalone/.next/server/app/_global-error.segments/_head.segment.rsc +1 -1
  40. package/dist/web/standalone/.next/server/app/_global-error.segments/_index.segment.rsc +1 -1
  41. package/dist/web/standalone/.next/server/app/_global-error.segments/_tree.segment.rsc +1 -1
  42. package/dist/web/standalone/.next/server/app/_not-found.html +1 -1
  43. package/dist/web/standalone/.next/server/app/_not-found.rsc +1 -1
  44. package/dist/web/standalone/.next/server/app/_not-found.segments/_full.segment.rsc +1 -1
  45. package/dist/web/standalone/.next/server/app/_not-found.segments/_head.segment.rsc +1 -1
  46. package/dist/web/standalone/.next/server/app/_not-found.segments/_index.segment.rsc +1 -1
  47. package/dist/web/standalone/.next/server/app/_not-found.segments/_not-found/__PAGE__.segment.rsc +1 -1
  48. package/dist/web/standalone/.next/server/app/_not-found.segments/_not-found.segment.rsc +1 -1
  49. package/dist/web/standalone/.next/server/app/_not-found.segments/_tree.segment.rsc +1 -1
  50. package/dist/web/standalone/.next/server/app/index.html +1 -1
  51. package/dist/web/standalone/.next/server/app/index.rsc +1 -1
  52. package/dist/web/standalone/.next/server/app/index.segments/__PAGE__.segment.rsc +1 -1
  53. package/dist/web/standalone/.next/server/app/index.segments/_full.segment.rsc +1 -1
  54. package/dist/web/standalone/.next/server/app/index.segments/_head.segment.rsc +1 -1
  55. package/dist/web/standalone/.next/server/app/index.segments/_index.segment.rsc +1 -1
  56. package/dist/web/standalone/.next/server/app/index.segments/_tree.segment.rsc +1 -1
  57. package/dist/web/standalone/.next/server/app-paths-manifest.json +12 -12
  58. package/dist/web/standalone/.next/server/middleware-build-manifest.js +1 -1
  59. package/dist/web/standalone/.next/server/pages/404.html +1 -1
  60. package/dist/web/standalone/.next/server/pages/500.html +1 -1
  61. package/dist/web/standalone/.next/server/server-reference-manifest.json +1 -1
  62. package/package.json +1 -1
  63. package/src/resources/extensions/gsd/auto/detect-stuck.ts +37 -5
  64. package/src/resources/extensions/gsd/auto/loop.ts +263 -41
  65. package/src/resources/extensions/gsd/auto/phases.ts +7 -5
  66. package/src/resources/extensions/gsd/auto/session.ts +36 -0
  67. package/src/resources/extensions/gsd/auto-dispatch.ts +53 -2
  68. package/src/resources/extensions/gsd/auto-post-unit.ts +19 -11
  69. package/src/resources/extensions/gsd/auto-worktree.ts +26 -211
  70. package/src/resources/extensions/gsd/auto.ts +89 -44
  71. package/src/resources/extensions/gsd/bootstrap/register-hooks.ts +9 -4
  72. package/src/resources/extensions/gsd/crash-recovery.ts +177 -43
  73. package/src/resources/extensions/gsd/db/auto-workers.ts +273 -0
  74. package/src/resources/extensions/gsd/db/command-queue.ts +149 -0
  75. package/src/resources/extensions/gsd/db/milestone-leases.ts +274 -0
  76. package/src/resources/extensions/gsd/db/runtime-kv.ts +127 -0
  77. package/src/resources/extensions/gsd/db/unit-dispatches.ts +446 -0
  78. package/src/resources/extensions/gsd/docs/COORDINATION.md +42 -0
  79. package/src/resources/extensions/gsd/doctor-proactive.ts +4 -0
  80. package/src/resources/extensions/gsd/doctor-runtime-checks.ts +24 -6
  81. package/src/resources/extensions/gsd/doctor.ts +10 -2
  82. package/src/resources/extensions/gsd/gsd-db.ts +170 -3
  83. package/src/resources/extensions/gsd/guided-flow.ts +6 -2
  84. package/src/resources/extensions/gsd/interrupted-session.ts +19 -12
  85. package/src/resources/extensions/gsd/state.ts +44 -6
  86. package/src/resources/extensions/gsd/tests/auto-loop-no-copy-artifacts.test.ts +72 -0
  87. package/src/resources/extensions/gsd/tests/auto-loop-symlink-worktree.test.ts +190 -0
  88. package/src/resources/extensions/gsd/tests/auto-workers.test.ts +105 -0
  89. package/src/resources/extensions/gsd/tests/command-queue.test.ts +141 -0
  90. package/src/resources/extensions/gsd/tests/crash-recovery-via-db.test.ts +203 -0
  91. package/src/resources/extensions/gsd/tests/crash-recovery.test.ts +169 -59
  92. package/src/resources/extensions/gsd/tests/detect-stuck-respects-retry.test.ts +173 -0
  93. package/src/resources/extensions/gsd/tests/integration/auto-worktree.test.ts +22 -12
  94. package/src/resources/extensions/gsd/tests/integration/doctor-proactive.test.ts +24 -10
  95. package/src/resources/extensions/gsd/tests/integration/doctor-runtime.test.ts +35 -23
  96. package/src/resources/extensions/gsd/tests/integration/workspace-collapse-integration.test.ts +3 -5
  97. package/src/resources/extensions/gsd/tests/interrupted-session-auto.test.ts +72 -25
  98. package/src/resources/extensions/gsd/tests/interrupted-session-ui.test.ts +72 -25
  99. package/src/resources/extensions/gsd/tests/memory-pressure-stuck-state.test.ts +9 -6
  100. package/src/resources/extensions/gsd/tests/milestone-leases.test.ts +152 -0
  101. package/src/resources/extensions/gsd/tests/parallel-milestone-isolation.test.ts +106 -0
  102. package/src/resources/extensions/gsd/tests/paused-session-via-db.test.ts +119 -0
  103. package/src/resources/extensions/gsd/tests/pipeline-variant-dispatch.test.ts +58 -0
  104. package/src/resources/extensions/gsd/tests/preferences-worktree-sync.test.ts +3 -17
  105. package/src/resources/extensions/gsd/tests/register-hooks-depth-verification.test.ts +110 -0
  106. package/src/resources/extensions/gsd/tests/runtime-kv.test.ts +120 -0
  107. package/src/resources/extensions/gsd/tests/skipped-validation-completion.test.ts +133 -28
  108. package/src/resources/extensions/gsd/tests/skipped-validation-db-atomicity.test.ts +17 -0
  109. package/src/resources/extensions/gsd/tests/stuck-state-via-db.test.ts +134 -0
  110. package/src/resources/extensions/gsd/tests/sync-layer-scope.test.ts +7 -26
  111. package/src/resources/extensions/gsd/tests/teardown-cleanup-parity.test.ts +4 -8
  112. package/src/resources/extensions/gsd/tests/unit-dispatches.test.ts +247 -0
  113. package/src/resources/extensions/gsd/tests/validate-milestone.test.ts +41 -1
  114. package/src/resources/extensions/gsd/tests/workspace.test.ts +15 -9
  115. package/src/resources/extensions/gsd/tests/write-gate.test.ts +31 -23
  116. package/src/resources/extensions/gsd/worktree-resolver.ts +62 -0
  117. package/src/resources/extensions/gsd/tests/auto-lock-creation.test.ts +0 -213
  118. package/src/resources/extensions/gsd/tests/auto-stale-lock-self-kill.test.ts +0 -87
  119. package/src/resources/extensions/gsd/tests/stop-auto-remote.test.ts +0 -159
  120. /package/dist/web/standalone/.next/static/{AT5qi39nKXkdmQIOIoh0f → Y5UeGFkXTYM9WIQOWHkot}/_buildManifest.js +0 -0
  121. /package/dist/web/standalone/.next/static/{AT5qi39nKXkdmQIOIoh0f → Y5UeGFkXTYM9WIQOWHkot}/_ssgManifest.js +0 -0
package/README.md CHANGED
@@ -322,7 +322,7 @@ The database is authoritative for milestones, slices, tasks, requirements, decis
322
322
 
323
323
  3. **Git isolation** — When `git.isolation` is set to `worktree` or `branch`, each milestone runs on its own `milestone/<MID>` branch (in a worktree or in-place). All slice work commits sequentially — no branch switching, no merge conflicts. When the milestone completes, it's squash-merged to main as one clean commit. The default is `none` (work on the current branch), configurable via preferences. If `worktree` is configured in a repo with no committed `HEAD`, GSD temporarily behaves as `none` until the first commit exists because git worktrees need a committed start point.
324
324
 
325
- 4. **Crash recovery** — A lock file tracks the current unit. If the session dies, the next `/gsd auto` reads the surviving session file, synthesizes a recovery briefing from every tool call that made it to disk, and resumes with full context. Parallel orchestrator state is persisted to disk with PID liveness detection, so multi-worker sessions survive crashes too. In headless mode, crashes trigger automatic restart with exponential backoff (default 3 attempts).
325
+ 4. **Crash recovery** — Auto mode persists worker state, unit-dispatch state, and paused-session metadata in the project-root SQLite database. If the session dies, the next `/gsd auto` reconstructs the interrupted unit from DB-backed runtime state, reads the surviving session file, synthesizes a recovery briefing from every tool call that made it to disk, and resumes with full context. Parallel orchestrator IPC still lives under `.gsd/parallel/`, so multi-worker sessions survive crashes too. In headless mode, crashes trigger automatic restart with exponential backoff (default 3 attempts).
326
326
 
327
327
  5. **Provider error recovery** — Transient provider errors (rate limits, 500/503 server errors, overloaded) auto-resume after a delay. Permanent errors (auth, billing) pause for manual review. The model fallback chain retries transient network errors before switching models.
328
328
 
@@ -414,7 +414,7 @@ gsd
414
414
  /gsd queue # queue the next milestone
415
415
  ```
416
416
 
417
- Both terminals read and write the same `.gsd/` files on disk. Your decisions in terminal 2 are picked up automatically at the next phase boundary — no need to stop auto mode.
417
+ Both terminals coordinate through the same project-root GSD runtime on local disk. The SQLite database is authoritative, `.gsd/` markdown is refreshed from it, and your decisions in terminal 2 are picked up at the next phase boundary without stopping auto mode.
418
418
 
419
419
  ### Headless mode — CI and scripts
420
420
 
@@ -439,7 +439,7 @@ gsd headless dispatch plan
439
439
 
440
440
  Headless auto-responds to interactive prompts, detects completion, and exits with structured codes: `0` complete, `1` error/timeout, `2` blocked. Auto-restarts on crash with exponential backoff. Use `gsd headless query` for instant, machine-readable state inspection — returns phase, next dispatch preview, and parallel worker costs as a single JSON object without spawning an LLM session. Pair with [remote questions](./docs/user-docs/remote-questions.md) to route decisions to Slack or Discord when human input is needed.
441
441
 
442
- **Multi-session orchestration** — headless mode supports file-based IPC in `.gsd/parallel/` for coordinating multiple GSD workers across milestones. Build orchestrators that spawn, monitor, and budget-cap a fleet of GSD workers.
442
+ **Multi-session orchestration** — headless mode supports DB-backed coordination across multiple GSD workers on the same machine. Worker registration, milestone leases, unit dispatch tracking, and command delivery live in `.gsd/gsd.db`, while `.gsd/parallel/` remains a local runtime area for per-milestone locks and isolation artifacts.
443
443
 
444
444
  ### First launch
445
445
 
@@ -705,8 +705,6 @@ The best practice for working in teams is to ensure unique milestone names acros
705
705
 
706
706
  ```bash
707
707
  # ── GSD: Runtime / Ephemeral (per-developer, per-session) ──────────────────
708
- # Crash detection sentinel — PID lock, written per auto-mode session
709
- .gsd/auto.lock
710
708
  # Auto-mode dispatch tracker — prevents re-running completed units (includes archived per-milestone files)
711
709
  .gsd/completed-units*.json
712
710
  # State manifest — workflow state for recovery
@@ -717,11 +715,11 @@ The best practice for working in teams is to ensure unique milestone names acros
717
715
  .gsd/metrics.json
718
716
  # Raw JSONL session dumps — crash recovery forensics, auto-pruned
719
717
  .gsd/activity/
720
- # Unit execution records — dispatch phase, timeouts, recovery tracking
718
+ # Unit execution records — dispatch phase, timeouts, and recovery tracking
721
719
  .gsd/runtime/
722
720
  # Git worktree working copies
723
721
  .gsd/worktrees/
724
- # Parallel orchestration IPC and worker status
722
+ # Parallel runtime locks and per-milestone isolation artifacts
725
723
  .gsd/parallel/
726
724
  # SQLite database and WAL sidecars — authoritative runtime state, local only
727
725
  .gsd/gsd.db*
package/dist/help-text.js CHANGED
@@ -156,7 +156,7 @@ const SUBCOMMAND_HELP = {
156
156
  ' gsd headless --answers answers.json auto With pre-supplied answers',
157
157
  ' gsd headless --events agent_end,extension_ui_request auto Filtered event stream',
158
158
  ' gsd headless query Instant JSON state snapshot',
159
- ' gsd headless recover Rebuild DB hierarchy from markdown (mutating)',
159
+ ' gsd headless recover Reset hierarchy + validation/gates, then rebuild from markdown',
160
160
  '',
161
161
  'Exit codes: 0 = success, 1 = error/timeout, 10 = blocked, 11 = cancelled',
162
162
  ].join('\n'),
@@ -29,6 +29,9 @@ export function getExtensionKey(entryPath, extensionsDir) {
29
29
  const relPath = relative(extensionsDir, entryPath);
30
30
  return relPath.split(/[\\/]/)[0].replace(/\.(?:ts|js)$/, '');
31
31
  }
32
+ function stripSemverBuildMetadata(version) {
33
+ return version.trim().replace(/^v/, '').split(/[+-]/, 1)[0] || '0.0.0';
34
+ }
32
35
  function getManagedResourceManifestPath(agentDir) {
33
36
  return join(agentDir, resourceVersionManifestName);
34
37
  }
@@ -166,7 +169,9 @@ export function getNewerManagedResourceVersion(agentDir, currentVersion) {
166
169
  if (!managedVersion) {
167
170
  return null;
168
171
  }
169
- return compareSemver(managedVersion, currentVersion) > 0 ? managedVersion : null;
172
+ // Managed resources stamped from the same release line should remain usable
173
+ // against local dev binaries like 2.78.1-dev.<sha>.
174
+ return compareSemver(stripSemverBuildMetadata(managedVersion), stripSemverBuildMetadata(currentVersion)) > 0 ? managedVersion : null;
170
175
  }
171
176
  /**
172
177
  * Recursively makes all files and directories under dirPath owner-writable.
@@ -1 +1 @@
1
- 3cb2810818585c65
1
+ 36cc9805e706289c
@@ -4,19 +4,53 @@
4
4
  * Leaf node in the import DAG.
5
5
  */
6
6
  import { summarizeLogs } from "../workflow-logger.js";
7
+ import { getLatestForUnit } from "../db/unit-dispatches.js";
7
8
  /**
8
9
  * Pattern matching ENOENT errors with a file path.
9
10
  * Matches: "ENOENT: no such file or directory, access '/path/to/file'"
10
11
  * and similar Node.js filesystem error messages.
11
12
  */
12
13
  const ENOENT_PATH_RE = /ENOENT[^']*'([^']+)'/;
14
+ /**
15
+ * Phase B / codex review MEDIUM B3 — retry coupling.
16
+ *
17
+ * If unit_dispatches has a recent failed dispatch for `unitKey` whose
18
+ * retry budget is not yet exhausted (attempt_n < max_attempts) AND whose
19
+ * scheduled next_run_at is still in the future, the loop is legitimately
20
+ * waiting on its own backoff. Suppress the stuck verdict in that case so
21
+ * the retry budget can fully drain before we declare stuck.
22
+ *
23
+ * Returns true if the dispatch ledger says we should suppress the stuck
24
+ * signal; false (no suppression) when the ledger is unavailable or has
25
+ * no opinion.
26
+ */
27
+ function retryBudgetSuppresses(unitKey) {
28
+ try {
29
+ const latest = getLatestForUnit(unitKey);
30
+ if (!latest)
31
+ return false;
32
+ if (latest.attempt_n >= latest.max_attempts)
33
+ return false;
34
+ if (!latest.next_run_at)
35
+ return false;
36
+ const nextRun = Date.parse(latest.next_run_at);
37
+ if (!Number.isFinite(nextRun))
38
+ return false;
39
+ return nextRun > Date.now();
40
+ }
41
+ catch {
42
+ return false;
43
+ }
44
+ }
13
45
  /**
14
46
  * Analyze a sliding window of recent unit dispatches for stuck patterns.
15
47
  * Returns a signal with reason if stuck, null otherwise.
16
48
  *
17
49
  * Rule 1: Same error string twice in a row → stuck immediately.
18
50
  * Rule 2: Same unit key 3+ consecutive times → stuck (prior behavior), UNLESS unit_dispatches says we're inside the retry-backoff window (same suppression as Rule 2b).
19
- * Rule 2b: Same unit key appears 3+ times anywhere in the active window → stuck.
51
+ * Rule 2b: Same unit key appears 3+ times anywhere in the active window → stuck,
52
+ * UNLESS unit_dispatches says we're inside the retry-backoff window
53
+ * (codex review MEDIUM B3 — Phase B retry coupling).
20
54
  * Rule 3: Oscillation A→B→A→B in last 4 entries → stuck.
21
55
  * Rule 4: Same ENOENT path in any 2 entries within the window → stuck (#3575).
22
56
  * Missing files don't self-heal between retries — retrying wastes budget.
@@ -39,19 +73,21 @@ export function detectStuck(window) {
39
73
  reason: `Same error repeated: ${last.error.slice(0, 200)}${suffix}`,
40
74
  };
41
75
  }
42
- // Rule 2: Same unit 3+ consecutive times
76
+ // Rule 2: Same unit 3+ consecutive times — suppressed if unit_dispatches
77
+ // says we're inside the retry-backoff window (codex MEDIUM B3).
43
78
  if (window.length >= 3) {
44
79
  const lastThree = window.slice(-3);
45
- if (lastThree.every((u) => u.key === last.key)) {
80
+ if (lastThree.every((u) => u.key === last.key) && !retryBudgetSuppresses(last.key)) {
46
81
  return {
47
82
  stuck: true,
48
83
  reason: `${last.key} derived 3 consecutive times without progress${suffix}`,
49
84
  };
50
85
  }
51
86
  }
52
- // Rule 2b: Same unit key 3+ times anywhere in the active window
87
+ // Rule 2b: Same unit key 3+ times anywhere in the active window — same
88
+ // retry-budget suppression as Rule 2.
53
89
  const countInWindow = window.filter((entry) => entry.key === last.key).length;
54
- if (countInWindow >= 3) {
90
+ if (countInWindow >= 3 && !retryBudgetSuppresses(last.key)) {
55
91
  return {
56
92
  stuck: true,
57
93
  reason: `${last.key} derived ${countInWindow} times in last ${window.length} attempts without progress${suffix}`,
@@ -16,49 +16,55 @@ import { ModelPolicyDispatchBlockedError } from "../auto-model-selection.js";
16
16
  import { resolveEngine } from "../engine-resolver.js";
17
17
  import { logWarning } from "../workflow-logger.js";
18
18
  import { gsdRoot } from "../paths.js";
19
+ import { heartbeatAutoWorker } from "../db/auto-workers.js";
20
+ import { recordDispatchClaim, markRunning as markDispatchRunning, markCompleted as markDispatchCompleted, markFailed as markDispatchFailed, getRecentForUnit as getRecentDispatchesForUnit, getRecentUnitKeysForProjectRoot, } from "../db/unit-dispatches.js";
21
+ import { refreshMilestoneLease } from "../db/milestone-leases.js";
22
+ import { getRuntimeKv, setRuntimeKv } from "../db/runtime-kv.js";
19
23
  import { atomicWriteSync } from "../atomic-write.js";
20
24
  import { resolveUokFlags } from "../uok/flags.js";
21
25
  import { scheduleSidecarQueue } from "../uok/execution-graph.js";
22
26
  import { ExecutionGraphScheduler } from "../uok/execution-graph.js";
23
- import { readFileSync, writeFileSync, mkdirSync, unlinkSync } from "node:fs";
27
+ import { readFileSync, mkdirSync, unlinkSync } from "node:fs";
24
28
  import { join } from "node:path";
29
+ import { normalizeRealPath } from "../paths.js";
25
30
  // ── Stuck detection persistence (#3704) ──────────────────────────────────
26
- // Persist stuck detection state to disk so it survives session restarts.
27
- // Without this, restarting auto-mode resets all counters, allowing the
28
- // same blocked unit to burn a full retry budget each session.
29
- function stuckStatePath(basePath) {
30
- return join(gsdRoot(basePath), "runtime", "stuck-state.json");
31
+ // Phase C migration: stuck-state.json deleted in favor of DB-backed
32
+ // equivalents. recentUnits is rebuilt from unit_dispatches (Phase B
33
+ // ledger) on session start; stuckRecoveryAttempts persists in runtime_kv
34
+ // under a stable project scope (soft state per the runtime_kv invariant). Single-host
35
+ // SQLite WAL only — multi-host would need a real coordinator.
36
+ //
37
+ // When no worker is registered (DB unavailable, fresh project), both
38
+ // helpers degrade to the empty-state fallback that #3704 already
39
+ // tolerates — same behavior as a fresh session.
40
+ const STUCK_RECOVERY_ATTEMPTS_KEY = "stuck_recovery_attempts";
41
+ const RECENT_UNIT_KEYS_LIMIT = 20;
42
+ function stableStuckStateScopeId(s) {
43
+ return normalizeRealPath(s.scope?.workspace.projectRoot ?? (s.originalBasePath || s.basePath));
31
44
  }
32
- function loadStuckState(basePath) {
45
+ function loadStuckState(s) {
46
+ const scopeId = stableStuckStateScopeId(s);
47
+ if (!scopeId)
48
+ return { recentUnits: [], stuckRecoveryAttempts: 0 };
33
49
  try {
34
- const data = JSON.parse(readFileSync(stuckStatePath(basePath), "utf-8"));
35
- // Only load state written by a DIFFERENT process (real session restart).
36
- // If the PID matches the current process, this state was written by an earlier
37
- // autoLoop call in the same process (e.g., a test that completed before this
38
- // one), not by a crashed session — skip it to prevent test state pollution.
39
- if (data.pid === process.pid) {
40
- return { recentUnits: [], stuckRecoveryAttempts: 0 };
41
- }
42
- return {
43
- recentUnits: Array.isArray(data.recentUnits) ? data.recentUnits : [],
44
- stuckRecoveryAttempts: typeof data.stuckRecoveryAttempts === "number" ? data.stuckRecoveryAttempts : 0,
45
- };
50
+ const recentUnits = getRecentUnitKeysForProjectRoot(scopeId, RECENT_UNIT_KEYS_LIMIT);
51
+ const stuckRecoveryAttempts = getRuntimeKv("global", scopeId, STUCK_RECOVERY_ATTEMPTS_KEY) ?? 0;
52
+ return { recentUnits, stuckRecoveryAttempts };
46
53
  }
47
54
  catch (err) {
48
55
  debugLog("autoLoop", { phase: "load-stuck-state-failed", error: err instanceof Error ? err.message : String(err) });
49
56
  return { recentUnits: [], stuckRecoveryAttempts: 0 };
50
57
  }
51
58
  }
52
- function saveStuckState(basePath, state) {
59
+ function saveStuckState(s, state) {
60
+ const scopeId = stableStuckStateScopeId(s);
61
+ if (!scopeId)
62
+ return;
63
+ // recentUnits is automatically derived from unit_dispatches by the
64
+ // dispatch ledger writes in openDispatchClaim — no separate persistence
65
+ // needed. Only the soft retry counter needs a runtime_kv row.
53
66
  try {
54
- const filePath = stuckStatePath(basePath);
55
- mkdirSync(join(gsdRoot(basePath), "runtime"), { recursive: true });
56
- writeFileSync(filePath, JSON.stringify({
57
- pid: process.pid,
58
- recentUnits: state.recentUnits.slice(-20), // keep last 20 entries
59
- stuckRecoveryAttempts: state.stuckRecoveryAttempts,
60
- updatedAt: new Date().toISOString(),
61
- }) + "\n");
67
+ setRuntimeKv("global", scopeId, STUCK_RECOVERY_ATTEMPTS_KEY, state.stuckRecoveryAttempts);
62
68
  }
63
69
  catch (err) {
64
70
  debugLog("autoLoop", { phase: "save-stuck-state-failed", error: err instanceof Error ? err.message : String(err) });
@@ -115,6 +121,57 @@ function saveCustomVerifyRetryCounts(s) {
115
121
  }
116
122
  }
117
123
  }
124
+ function openDispatchClaim(s, flowId, turnId, iterData) {
125
+ if (!s.workerId || s.milestoneLeaseToken === null)
126
+ return { kind: "degraded" };
127
+ const mid = iterData.mid;
128
+ if (!mid)
129
+ return { kind: "degraded" };
130
+ const recent = getRecentDispatchesForUnit(iterData.unitId, 1);
131
+ const attemptN = (recent[0]?.attempt_n ?? 0) + 1;
132
+ let claim;
133
+ try {
134
+ claim = recordDispatchClaim({
135
+ traceId: flowId,
136
+ turnId,
137
+ workerId: s.workerId,
138
+ milestoneLeaseToken: s.milestoneLeaseToken,
139
+ milestoneId: mid,
140
+ sliceId: iterData.state.activeSlice?.id ?? null,
141
+ taskId: iterData.state.activeTask?.id ?? null,
142
+ unitType: iterData.unitType,
143
+ unitId: iterData.unitId,
144
+ attemptN,
145
+ });
146
+ if (!claim.ok) {
147
+ debugLog("autoLoop", {
148
+ phase: "dispatch-claim-rejected",
149
+ unitId: iterData.unitId,
150
+ reason: claim.error,
151
+ existingId: "existingId" in claim ? claim.existingId : undefined,
152
+ existingWorker: "existingWorker" in claim ? claim.existingWorker : undefined,
153
+ });
154
+ if (claim.error === "already_active") {
155
+ return {
156
+ kind: "skip",
157
+ reason: "already-active",
158
+ existingId: claim.existingId,
159
+ existingWorker: claim.existingWorker,
160
+ };
161
+ }
162
+ return { kind: "skip", reason: "stale-lease" };
163
+ }
164
+ markDispatchRunning(claim.dispatchId);
165
+ return { kind: "opened", dispatchId: claim.dispatchId };
166
+ }
167
+ catch (err) {
168
+ debugLog("autoLoop", {
169
+ phase: "dispatch-claim-failed",
170
+ error: err instanceof Error ? err.message : String(err),
171
+ });
172
+ return { kind: "degraded" };
173
+ }
174
+ }
118
175
  // ── Memory pressure monitoring (#3331) ──────────────────────────────────
119
176
  // Check heap usage every N iterations and trigger graceful shutdown before
120
177
  // the OS OOM killer sends SIGKILL. The threshold is 90% of the V8 heap
@@ -220,7 +277,7 @@ export async function autoLoop(ctx, pi, s, deps, options) {
220
277
  let iteration = 0;
221
278
  const dispatchContract = options?.dispatchContract ?? "legacy-direct";
222
279
  // Load persisted stuck state so counters survive session restarts (#3704)
223
- const persisted = loadStuckState(s.basePath);
280
+ const persisted = loadStuckState(s);
224
281
  const loopState = {
225
282
  recentUnits: persisted.recentUnits,
226
283
  stuckRecoveryAttempts: persisted.stuckRecoveryAttempts,
@@ -232,6 +289,23 @@ export async function autoLoop(ctx, pi, s, deps, options) {
232
289
  while (s.active) {
233
290
  iteration++;
234
291
  debugLog("autoLoop", { phase: "loop-top", iteration });
292
+ // Phase B: heartbeat the worker registry + active milestone lease so
293
+ // janitors and concurrent workers see a live process. Best-effort —
294
+ // DB unavailability or stale state must not stop the loop.
295
+ if (s.workerId) {
296
+ try {
297
+ heartbeatAutoWorker(s.workerId);
298
+ if (s.currentMilestoneId && s.milestoneLeaseToken) {
299
+ refreshMilestoneLease(s.workerId, s.currentMilestoneId, s.milestoneLeaseToken);
300
+ }
301
+ }
302
+ catch (err) {
303
+ debugLog("autoLoop", {
304
+ phase: "heartbeat-failed",
305
+ error: err instanceof Error ? err.message : String(err),
306
+ });
307
+ }
308
+ }
235
309
  // ── Journal: per-iteration flow grouping ──
236
310
  const flowId = randomUUID();
237
311
  let seqCounter = 0;
@@ -299,6 +373,8 @@ export async function autoLoop(ctx, pi, s, deps, options) {
299
373
  finishTurn("stopped", "manual-attention", "missing-command-context");
300
374
  break;
301
375
  }
376
+ let dispatchId = null;
377
+ let dispatchSettled = false;
302
378
  try {
303
379
  // ── Blanket try/catch: one bad iteration must not kill the session
304
380
  const prefs = deps.loadEffectiveGSDPreferences()?.preferences;
@@ -359,7 +435,17 @@ export async function autoLoop(ctx, pi, s, deps, options) {
359
435
  activeEngineId: s.activeEngineId,
360
436
  activeRunDir: s.activeRunDir,
361
437
  });
362
- const engineState = await engine.deriveState(s.basePath);
438
+ const engineState = await engine.deriveState(s.canonicalProjectRoot);
439
+ debugLog("autoLoop", {
440
+ phase: "post-derive",
441
+ site: "custom-engine-derive",
442
+ basePath: s.basePath,
443
+ originalBasePath: s.originalBasePath,
444
+ scopeProjectRoot: s.scope?.workspace.projectRoot,
445
+ canonicalProjectRoot: s.canonicalProjectRoot,
446
+ derivedPhase: engineState.phase,
447
+ isComplete: engineState.isComplete,
448
+ });
363
449
  if (engineState.isComplete) {
364
450
  await deps.stopAuto(ctx, pi, "Workflow complete");
365
451
  break;
@@ -375,7 +461,15 @@ export async function autoLoop(ctx, pi, s, deps, options) {
375
461
  }
376
462
  // dispatch.action === "dispatch"
377
463
  const step = dispatch.step;
378
- const gsdState = await deps.deriveState(s.basePath);
464
+ const gsdState = await deps.deriveState(s.canonicalProjectRoot);
465
+ debugLog("autoLoop", {
466
+ phase: "post-derive",
467
+ site: "custom-engine-gsd-state",
468
+ basePath: s.basePath,
469
+ canonicalProjectRoot: s.canonicalProjectRoot,
470
+ derivedPhase: gsdState.phase,
471
+ activeUnit: gsdState.activeTask?.id ?? gsdState.activeSlice?.id ?? gsdState.activeMilestone?.id,
472
+ });
379
473
  iterData = {
380
474
  unitType: step.unitType,
381
475
  unitId: step.unitId,
@@ -478,7 +572,7 @@ export async function autoLoop(ctx, pi, s, deps, options) {
478
572
  consecutiveCooldowns = 0;
479
573
  recentErrorMessages.length = 0;
480
574
  deps.emitJournalEvent({ ts: new Date().toISOString(), flowId, seq: nextSeq(), eventType: "iteration-end", data: { iteration } });
481
- saveStuckState(s.basePath, loopState); // persist across session restarts (#3704)
575
+ saveStuckState(s, loopState); // persist across session restarts (#3704)
482
576
  debugLog("autoLoop", { phase: "iteration-complete", iteration });
483
577
  if (reconcileResult.outcome === "milestone-complete") {
484
578
  await deps.stopAuto(ctx, pi, "Workflow complete");
@@ -552,7 +646,15 @@ export async function autoLoop(ctx, pi, s, deps, options) {
552
646
  }
553
647
  else {
554
648
  // ── Sidecar path: use values from the sidecar item directly ──
555
- const sidecarState = await deps.deriveState(s.basePath);
649
+ const sidecarState = await deps.deriveState(s.canonicalProjectRoot);
650
+ debugLog("autoLoop", {
651
+ phase: "post-derive",
652
+ site: "sidecar",
653
+ basePath: s.basePath,
654
+ canonicalProjectRoot: s.canonicalProjectRoot,
655
+ derivedPhase: sidecarState.phase,
656
+ activeUnit: sidecarState.activeTask?.id ?? sidecarState.activeSlice?.id ?? sidecarState.activeMilestone?.id,
657
+ });
556
658
  iterData = {
557
659
  unitType: sidecarItem.unitType,
558
660
  unitId: sidecarItem.unitId,
@@ -573,7 +675,39 @@ export async function autoLoop(ctx, pi, s, deps, options) {
573
675
  });
574
676
  }
575
677
  await enforceMinRequestInterval(s, prefs);
576
- const unitPhaseResult = await runUnitPhaseViaContract(dispatchContract, ic, iterData, loopState, sidecarItem);
678
+ // Phase B: claim a unit_dispatches row before invoking the unit. The
679
+ // partial unique index idx_unit_dispatches_active_per_unit prevents
680
+ // a second worker from claiming the same unit concurrently. dispatchId stays
681
+ // null when DB unavailable, no worker registered, or no active lease
682
+ // — those degraded paths fall through to the existing single-worker
683
+ // semantics with no ledger entry, preserving back-compat.
684
+ const dispatchClaim = openDispatchClaim(s, flowId, turnId, iterData);
685
+ if (dispatchClaim.kind === "skip") {
686
+ finishTurn("skipped", "execution", dispatchClaim.reason);
687
+ continue;
688
+ }
689
+ dispatchId = dispatchClaim.kind === "opened" ? dispatchClaim.dispatchId : null;
690
+ let unitPhaseResult;
691
+ try {
692
+ unitPhaseResult = await runUnitPhaseViaContract(dispatchContract, ic, iterData, loopState, sidecarItem);
693
+ }
694
+ catch (err) {
695
+ if (err instanceof ModelPolicyDispatchBlockedError) {
696
+ throw err;
697
+ }
698
+ if (dispatchId !== null) {
699
+ try {
700
+ markDispatchFailed(dispatchId, {
701
+ errorSummary: `exception:${err instanceof Error ? err.message : String(err)}`,
702
+ });
703
+ dispatchSettled = true;
704
+ }
705
+ catch (ledgerErr) {
706
+ debugLog("autoLoop", { phase: "dispatch-ledger-write-failed", error: ledgerErr instanceof Error ? ledgerErr.message : String(ledgerErr) });
707
+ }
708
+ }
709
+ throw err;
710
+ }
577
711
  if (unitPhaseResult.action === "next") {
578
712
  const requestTimestamp = unitPhaseResult.data.requestDispatchedAt ?? unitPhaseResult.data.unitStartedAt;
579
713
  if (typeof requestTimestamp === "number")
@@ -584,11 +718,37 @@ export async function autoLoop(ctx, pi, s, deps, options) {
584
718
  unitId: iterData.unitId,
585
719
  });
586
720
  if (unitPhaseResult.action === "break") {
721
+ if (dispatchId !== null) {
722
+ try {
723
+ markDispatchFailed(dispatchId, { errorSummary: "unit-break" });
724
+ dispatchSettled = true;
725
+ }
726
+ catch (err) {
727
+ debugLog("autoLoop", { phase: "dispatch-ledger-write-failed", error: err instanceof Error ? err.message : String(err) });
728
+ }
729
+ }
587
730
  finishTurn("stopped", "execution", "unit-break");
588
731
  break;
589
732
  }
590
733
  // ── Phase 5: Finalize ───────────────────────────────────────────────
591
- const finalizeResult = await runFinalize(ic, iterData, loopState, sidecarItem);
734
+ let finalizeResult;
735
+ try {
736
+ finalizeResult = await runFinalize(ic, iterData, loopState, sidecarItem);
737
+ }
738
+ catch (err) {
739
+ if (dispatchId !== null) {
740
+ try {
741
+ markDispatchFailed(dispatchId, {
742
+ errorSummary: `exception:${err instanceof Error ? err.message : String(err)}`,
743
+ });
744
+ dispatchSettled = true;
745
+ }
746
+ catch (ledgerErr) {
747
+ debugLog("autoLoop", { phase: "dispatch-ledger-write-failed", error: ledgerErr instanceof Error ? ledgerErr.message : String(ledgerErr) });
748
+ }
749
+ }
750
+ throw err;
751
+ }
592
752
  deps.uokObserver?.onPhaseResult("finalize", finalizeResult.action, {
593
753
  unitType: iterData.unitType,
594
754
  unitId: iterData.unitId,
@@ -597,24 +757,63 @@ export async function autoLoop(ctx, pi, s, deps, options) {
597
757
  const finalizeFailureClass = finalizeResult.reason === "git-closeout-failure"
598
758
  ? "git"
599
759
  : "closeout";
760
+ if (dispatchId !== null) {
761
+ try {
762
+ markDispatchFailed(dispatchId, { errorSummary: `finalize-break:${finalizeResult.reason ?? "unknown"}` });
763
+ dispatchSettled = true;
764
+ }
765
+ catch (err) {
766
+ debugLog("autoLoop", { phase: "dispatch-ledger-write-failed", error: err instanceof Error ? err.message : String(err) });
767
+ }
768
+ }
600
769
  finishTurn("stopped", finalizeFailureClass, "finalize-break");
601
770
  break;
602
771
  }
603
772
  if (finalizeResult.action === "continue") {
773
+ if (dispatchId !== null) {
774
+ try {
775
+ markDispatchFailed(dispatchId, { errorSummary: "finalize-retry" });
776
+ dispatchSettled = true;
777
+ }
778
+ catch (err) {
779
+ debugLog("autoLoop", { phase: "dispatch-ledger-write-failed", error: err instanceof Error ? err.message : String(err) });
780
+ }
781
+ }
604
782
  finishTurn("retry");
605
783
  continue;
606
784
  }
785
+ if (dispatchId !== null) {
786
+ try {
787
+ markDispatchCompleted(dispatchId);
788
+ dispatchSettled = true;
789
+ }
790
+ catch (err) {
791
+ debugLog("autoLoop", { phase: "dispatch-ledger-write-failed", error: err instanceof Error ? err.message : String(err) });
792
+ }
793
+ }
607
794
  consecutiveErrors = 0; // Iteration completed successfully
608
795
  consecutiveCooldowns = 0;
609
796
  recentErrorMessages.length = 0;
610
797
  deps.emitJournalEvent({ ts: new Date().toISOString(), flowId, seq: nextSeq(), eventType: "iteration-end", data: { iteration } });
611
- saveStuckState(s.basePath, loopState); // persist across session restarts (#4382)
798
+ saveStuckState(s, loopState); // persist across session restarts (#4382)
612
799
  debugLog("autoLoop", { phase: "iteration-complete", iteration });
613
800
  finishTurn("completed");
614
801
  }
615
802
  catch (loopErr) {
616
803
  // ── Blanket catch: absorb unexpected exceptions, apply graduated recovery ──
617
804
  const msg = loopErr instanceof Error ? loopErr.message : String(loopErr);
805
+ if (dispatchId !== null && !dispatchSettled && !(loopErr instanceof ModelPolicyDispatchBlockedError)) {
806
+ try {
807
+ markDispatchFailed(dispatchId, { errorSummary: `unhandled-error:${msg.slice(0, 200)}` });
808
+ dispatchSettled = true;
809
+ }
810
+ catch (err) {
811
+ debugLog("autoLoop", {
812
+ phase: "dispatch-ledger-write-failed",
813
+ error: err instanceof Error ? err.message : String(err),
814
+ });
815
+ }
816
+ }
618
817
  // Always emit iteration-end on error so the journal records iteration
619
818
  // completion even on failure (#2344). Without this, errors in
620
819
  // runFinalize leave the journal incomplete, making diagnosis harder.
@@ -289,8 +289,10 @@ export async function runPreDispatch(ic, loopState) {
289
289
  s.currentMilestoneId) {
290
290
  deps.syncProjectRootToWorktree(s.originalBasePath, s.basePath, s.currentMilestoneId);
291
291
  }
292
- // Derive state
293
- let state = await deps.deriveState(s.basePath);
292
+ // Derive state — use canonical project root so the cache key is stable
293
+ // across worktree↔project-root path-form alternation. See PR #5236
294
+ // (workspace handle infrastructure) and the Phase A pt 2 plan.
295
+ let state = await deps.deriveState(s.canonicalProjectRoot);
294
296
  const { getDeepStageGate } = await import("../auto-dispatch.js");
295
297
  const deepStageGate = getDeepStageGate(prefs, s.basePath);
296
298
  const canRunDeepSetupGate = state.phase === "pre-planning" ||
@@ -324,7 +326,7 @@ export async function runPreDispatch(ic, loopState) {
324
326
  let compiled = ensurePlanV2Graph(s.basePath, state);
325
327
  if (isEmptyPlanV2GraphResult(compiled)) {
326
328
  deps.invalidateAllCaches();
327
- state = await deps.deriveState(s.basePath);
329
+ state = await deps.deriveState(s.canonicalProjectRoot);
328
330
  compiled = shouldRunPlanV2Gate(state.phase)
329
331
  ? ensurePlanV2Graph(s.basePath, state)
330
332
  : {
@@ -477,7 +479,7 @@ export async function runPreDispatch(ic, loopState) {
477
479
  }
478
480
  // PR creation (auto_pr) is handled inside mergeMilestoneToMain (#2302)
479
481
  deps.invalidateAllCaches();
480
- state = await deps.deriveState(s.basePath);
482
+ state = await deps.deriveState(s.canonicalProjectRoot);
481
483
  mid = state.activeMilestone?.id;
482
484
  midTitle = state.activeMilestone?.title;
483
485
  if (mid) {
@@ -596,7 +598,7 @@ export async function runPreDispatch(ic, loopState) {
596
598
  }
597
599
  if (mergeReconcileResult === "reconciled") {
598
600
  deps.invalidateAllCaches();
599
- state = await deps.deriveState(s.basePath);
601
+ state = await deps.deriveState(s.canonicalProjectRoot);
600
602
  mid = state.activeMilestone?.id;
601
603
  midTitle = state.activeMilestone?.title;
602
604
  }
@@ -16,6 +16,7 @@
16
16
  * `let` or `var` declarations.
17
17
  */
18
18
  import { resolveWorktreeProjectRoot } from "../worktree-root.js";
19
+ import { normalizeRealPath } from "../paths.js";
19
20
  // ─── Constants ───────────────────────────────────────────────────────────────
20
21
  export const STUB_RECOVERY_THRESHOLD = 2;
21
22
  export const NEW_SESSION_TIMEOUT_MS = 120_000;
@@ -34,6 +35,20 @@ export class AutoSession {
34
35
  originalBasePath = "";
35
36
  // TODO(C8): remove basePath/originalBasePath once all readers use s.scope
36
37
  scope = null;
38
+ // ── Coordination identity (Phase B — DB-backed coordination) ────────────
39
+ /**
40
+ * Worker registry ID set by registerAutoWorker() at session start. Used by
41
+ * heartbeatAutoWorker() each loop iteration and by recordDispatchClaim()
42
+ * to fence dispatch ledger writes against stale workers.
43
+ */
44
+ workerId = null;
45
+ /**
46
+ * Active milestone lease fencing token, set by claimMilestoneLease() inside
47
+ * worktree-resolver.enterMilestone(). Threaded into recordDispatchClaim()
48
+ * as milestone_lease_token so out-of-band dispatches by a stale worker
49
+ * are detectable.
50
+ */
51
+ milestoneLeaseToken = null;
37
52
  previousProjectRootEnv = null;
38
53
  hadProjectRootEnv = false;
39
54
  projectRootEnvCaptured = false;
@@ -162,6 +177,22 @@ export class AutoSession {
162
177
  get lockBasePath() {
163
178
  return resolveWorktreeProjectRoot(this.basePath, this.originalBasePath);
164
179
  }
180
+ /**
181
+ * Canonical project root for state-derivation reads AND writer paths.
182
+ *
183
+ * Prefers the realpath-normalized projectRoot from the MilestoneScope
184
+ * (introduced by PR #5236), falling back to resolveWorktreeProjectRoot
185
+ * during early lifecycle / engine-bypass paths where scope may be null.
186
+ *
187
+ * Always realpath-normalized so cache keys (e.g. deriveState's _stateCache)
188
+ * cannot drift across worktree↔project-root path-string variants for the
189
+ * same filesystem location.
190
+ */
191
+ get canonicalProjectRoot() {
192
+ const root = this.scope?.workspace.projectRoot
193
+ ?? resolveWorktreeProjectRoot(this.basePath, this.originalBasePath);
194
+ return normalizeRealPath(root);
195
+ }
165
196
  reset() {
166
197
  this.clearTimers();
167
198
  // Lifecycle
@@ -176,6 +207,8 @@ export class AutoSession {
176
207
  this.basePath = "";
177
208
  this.originalBasePath = "";
178
209
  this.scope = null;
210
+ this.workerId = null;
211
+ this.milestoneLeaseToken = null;
179
212
  this.previousProjectRootEnv = null;
180
213
  this.hadProjectRootEnv = false;
181
214
  this.projectRootEnvCaptured = false;