gsd-pi 2.78.1-dev.d8826a445 → 2.78.1-dev.eccf86e27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -7
- package/dist/help-text.js +1 -1
- package/dist/resource-loader.js +6 -1
- package/dist/resources/.managed-resources-content-hash +1 -1
- package/dist/resources/extensions/gsd/auto/detect-stuck.js +41 -5
- package/dist/resources/extensions/gsd/auto/loop.js +235 -36
- package/dist/resources/extensions/gsd/auto/phases.js +7 -5
- package/dist/resources/extensions/gsd/auto/session.js +33 -0
- package/dist/resources/extensions/gsd/auto-dispatch.js +46 -2
- package/dist/resources/extensions/gsd/auto-post-unit.js +19 -11
- package/dist/resources/extensions/gsd/auto-worktree.js +26 -187
- package/dist/resources/extensions/gsd/auto.js +79 -50
- package/dist/resources/extensions/gsd/bootstrap/register-hooks.js +9 -4
- package/dist/resources/extensions/gsd/crash-recovery.js +160 -47
- package/dist/resources/extensions/gsd/db/auto-workers.js +227 -0
- package/dist/resources/extensions/gsd/db/command-queue.js +105 -0
- package/dist/resources/extensions/gsd/db/milestone-leases.js +210 -0
- package/dist/resources/extensions/gsd/db/runtime-kv.js +91 -0
- package/dist/resources/extensions/gsd/db/unit-dispatches.js +322 -0
- package/dist/resources/extensions/gsd/docs/COORDINATION.md +42 -0
- package/dist/resources/extensions/gsd/doctor-proactive.js +4 -0
- package/dist/resources/extensions/gsd/doctor-runtime-checks.js +22 -6
- package/dist/resources/extensions/gsd/doctor.js +12 -2
- package/dist/resources/extensions/gsd/gsd-db.js +161 -3
- package/dist/resources/extensions/gsd/guided-flow.js +6 -2
- package/dist/resources/extensions/gsd/interrupted-session.js +18 -15
- package/dist/resources/extensions/gsd/state.js +21 -6
- package/dist/resources/extensions/gsd/worktree-resolver.js +64 -0
- package/dist/tsconfig.extensions.tsbuildinfo +1 -1
- package/dist/web/standalone/.next/BUILD_ID +1 -1
- package/dist/web/standalone/.next/app-path-routes-manifest.json +12 -12
- package/dist/web/standalone/.next/build-manifest.json +2 -2
- package/dist/web/standalone/.next/prerender-manifest.json +3 -3
- package/dist/web/standalone/.next/server/app/_global-error.html +1 -1
- package/dist/web/standalone/.next/server/app/_global-error.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_global-error.segments/_full.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_global-error.segments/_global-error/__PAGE__.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_global-error.segments/_global-error.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_global-error.segments/_head.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_global-error.segments/_index.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_global-error.segments/_tree.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_not-found.html +1 -1
- package/dist/web/standalone/.next/server/app/_not-found.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_not-found.segments/_full.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_not-found.segments/_head.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_not-found.segments/_index.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_not-found.segments/_not-found/__PAGE__.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_not-found.segments/_not-found.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_not-found.segments/_tree.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/index.html +1 -1
- package/dist/web/standalone/.next/server/app/index.rsc +1 -1
- package/dist/web/standalone/.next/server/app/index.segments/__PAGE__.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/index.segments/_full.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/index.segments/_head.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/index.segments/_index.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/index.segments/_tree.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app-paths-manifest.json +12 -12
- package/dist/web/standalone/.next/server/middleware-build-manifest.js +1 -1
- package/dist/web/standalone/.next/server/pages/404.html +1 -1
- package/dist/web/standalone/.next/server/pages/500.html +1 -1
- package/dist/web/standalone/.next/server/server-reference-manifest.json +1 -1
- package/package.json +1 -1
- package/src/resources/extensions/gsd/auto/detect-stuck.ts +37 -5
- package/src/resources/extensions/gsd/auto/loop.ts +263 -41
- package/src/resources/extensions/gsd/auto/phases.ts +7 -5
- package/src/resources/extensions/gsd/auto/session.ts +36 -0
- package/src/resources/extensions/gsd/auto-dispatch.ts +53 -2
- package/src/resources/extensions/gsd/auto-post-unit.ts +19 -11
- package/src/resources/extensions/gsd/auto-worktree.ts +26 -211
- package/src/resources/extensions/gsd/auto.ts +89 -44
- package/src/resources/extensions/gsd/bootstrap/register-hooks.ts +9 -4
- package/src/resources/extensions/gsd/crash-recovery.ts +177 -43
- package/src/resources/extensions/gsd/db/auto-workers.ts +273 -0
- package/src/resources/extensions/gsd/db/command-queue.ts +149 -0
- package/src/resources/extensions/gsd/db/milestone-leases.ts +274 -0
- package/src/resources/extensions/gsd/db/runtime-kv.ts +127 -0
- package/src/resources/extensions/gsd/db/unit-dispatches.ts +446 -0
- package/src/resources/extensions/gsd/docs/COORDINATION.md +42 -0
- package/src/resources/extensions/gsd/doctor-proactive.ts +4 -0
- package/src/resources/extensions/gsd/doctor-runtime-checks.ts +24 -6
- package/src/resources/extensions/gsd/doctor.ts +10 -2
- package/src/resources/extensions/gsd/gsd-db.ts +170 -3
- package/src/resources/extensions/gsd/guided-flow.ts +6 -2
- package/src/resources/extensions/gsd/interrupted-session.ts +19 -12
- package/src/resources/extensions/gsd/state.ts +44 -6
- package/src/resources/extensions/gsd/tests/auto-loop-no-copy-artifacts.test.ts +72 -0
- package/src/resources/extensions/gsd/tests/auto-loop-symlink-worktree.test.ts +190 -0
- package/src/resources/extensions/gsd/tests/auto-workers.test.ts +105 -0
- package/src/resources/extensions/gsd/tests/command-queue.test.ts +141 -0
- package/src/resources/extensions/gsd/tests/crash-recovery-via-db.test.ts +203 -0
- package/src/resources/extensions/gsd/tests/crash-recovery.test.ts +169 -59
- package/src/resources/extensions/gsd/tests/detect-stuck-respects-retry.test.ts +173 -0
- package/src/resources/extensions/gsd/tests/integration/auto-worktree.test.ts +22 -12
- package/src/resources/extensions/gsd/tests/integration/doctor-proactive.test.ts +24 -10
- package/src/resources/extensions/gsd/tests/integration/doctor-runtime.test.ts +35 -23
- package/src/resources/extensions/gsd/tests/integration/workspace-collapse-integration.test.ts +3 -5
- package/src/resources/extensions/gsd/tests/interrupted-session-auto.test.ts +72 -25
- package/src/resources/extensions/gsd/tests/interrupted-session-ui.test.ts +72 -25
- package/src/resources/extensions/gsd/tests/memory-pressure-stuck-state.test.ts +9 -6
- package/src/resources/extensions/gsd/tests/milestone-leases.test.ts +152 -0
- package/src/resources/extensions/gsd/tests/parallel-milestone-isolation.test.ts +106 -0
- package/src/resources/extensions/gsd/tests/paused-session-via-db.test.ts +119 -0
- package/src/resources/extensions/gsd/tests/pipeline-variant-dispatch.test.ts +58 -0
- package/src/resources/extensions/gsd/tests/preferences-worktree-sync.test.ts +3 -17
- package/src/resources/extensions/gsd/tests/register-hooks-depth-verification.test.ts +110 -0
- package/src/resources/extensions/gsd/tests/runtime-kv.test.ts +120 -0
- package/src/resources/extensions/gsd/tests/skipped-validation-completion.test.ts +133 -28
- package/src/resources/extensions/gsd/tests/skipped-validation-db-atomicity.test.ts +17 -0
- package/src/resources/extensions/gsd/tests/stuck-state-via-db.test.ts +134 -0
- package/src/resources/extensions/gsd/tests/sync-layer-scope.test.ts +7 -26
- package/src/resources/extensions/gsd/tests/teardown-cleanup-parity.test.ts +4 -8
- package/src/resources/extensions/gsd/tests/unit-dispatches.test.ts +247 -0
- package/src/resources/extensions/gsd/tests/validate-milestone.test.ts +41 -1
- package/src/resources/extensions/gsd/tests/workspace.test.ts +15 -9
- package/src/resources/extensions/gsd/tests/write-gate.test.ts +31 -23
- package/src/resources/extensions/gsd/worktree-resolver.ts +62 -0
- package/src/resources/extensions/gsd/tests/auto-lock-creation.test.ts +0 -213
- package/src/resources/extensions/gsd/tests/auto-stale-lock-self-kill.test.ts +0 -87
- package/src/resources/extensions/gsd/tests/stop-auto-remote.test.ts +0 -159
- /package/dist/web/standalone/.next/static/{AT5qi39nKXkdmQIOIoh0f → Y5UeGFkXTYM9WIQOWHkot}/_buildManifest.js +0 -0
- /package/dist/web/standalone/.next/static/{AT5qi39nKXkdmQIOIoh0f → Y5UeGFkXTYM9WIQOWHkot}/_ssgManifest.js +0 -0
|
@@ -11,7 +11,8 @@
|
|
|
11
11
|
*/
|
|
12
12
|
import { deriveState } from "./state.js";
|
|
13
13
|
import { parseUnitId } from "./unit-id.js";
|
|
14
|
-
import { assessInterruptedSession, readPausedSessionMetadata, } from "./interrupted-session.js";
|
|
14
|
+
import { assessInterruptedSession, readPausedSessionMetadata, PAUSED_SESSION_KV_KEY, } from "./interrupted-session.js";
|
|
15
|
+
import { setRuntimeKv, deleteRuntimeKv, } from "./db/runtime-kv.js";
|
|
15
16
|
import { getManifestStatus } from "./files.js";
|
|
16
17
|
export { inlinePriorMilestoneSummary } from "./files.js";
|
|
17
18
|
import { collectSecretsFromManifest } from "../get-secrets-from-user.js";
|
|
@@ -40,7 +41,7 @@ import { setLogBasePath, logWarning } from "./workflow-logger.js";
|
|
|
40
41
|
import { preflightCleanRoot, postflightPopStash } from "./clean-root-preflight.js";
|
|
41
42
|
import { isAbsolute, join } from "node:path";
|
|
42
43
|
import { pathToFileURL } from "node:url";
|
|
43
|
-
import { readFileSync, existsSync, mkdirSync
|
|
44
|
+
import { readFileSync, existsSync, mkdirSync } from "node:fs";
|
|
44
45
|
import { atomicWriteSync } from "./atomic-write.js";
|
|
45
46
|
import { autoCommitCurrentBranch, captureIntegrationBranch, detectWorktreeName, getCurrentBranch, getMainBranch, setActiveMilestoneId, } from "./worktree.js";
|
|
46
47
|
import { GitServiceImpl } from "./git-service.js";
|
|
@@ -87,6 +88,9 @@ export { STUB_RECOVERY_THRESHOLD, NEW_SESSION_TIMEOUT_MS, } from "./auto/session
|
|
|
87
88
|
import { autoSession as s } from "./auto-runtime-state.js";
|
|
88
89
|
import { gsdHome } from "./gsd-home.js";
|
|
89
90
|
import { createWorkspace, scopeMilestone } from "./workspace.js";
|
|
91
|
+
import { registerAutoWorker, markWorkerStopping } from "./db/auto-workers.js";
|
|
92
|
+
import { releaseMilestoneLease } from "./db/milestone-leases.js";
|
|
93
|
+
import { normalizeRealPath } from "./paths.js";
|
|
90
94
|
// ── ENCAPSULATION INVARIANT ─────────────────────────────────────────────────
|
|
91
95
|
// ALL mutable auto-mode state lives in the AutoSession class (auto/session.ts).
|
|
92
96
|
// This file must NOT declare module-level `let` or `var` variables for state.
|
|
@@ -101,6 +105,27 @@ import { createWorkspace, scopeMilestone } from "./workspace.js";
|
|
|
101
105
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
102
106
|
/** Throttle STATE.md rebuilds — at most once per 30 seconds */
|
|
103
107
|
const STATE_REBUILD_MIN_INTERVAL_MS = 30_000;
|
|
108
|
+
/**
|
|
109
|
+
* Phase B — register this auto-mode process in the workers table so other
|
|
110
|
+
* workers and janitors can detect liveness via heartbeat. Best-effort: if
|
|
111
|
+
* the DB is unavailable (e.g. fresh project before init) we skip registration
|
|
112
|
+
* silently rather than blocking session start.
|
|
113
|
+
*/
|
|
114
|
+
function registerAutoWorkerForSession(session) {
|
|
115
|
+
if (session.workerId)
|
|
116
|
+
return; // already registered (e.g. resume re-runs)
|
|
117
|
+
try {
|
|
118
|
+
const projectRootRealpath = normalizeRealPath(session.scope?.workspace.projectRoot
|
|
119
|
+
?? (session.originalBasePath || session.basePath));
|
|
120
|
+
session.workerId = registerAutoWorker({ projectRootRealpath });
|
|
121
|
+
}
|
|
122
|
+
catch (err) {
|
|
123
|
+
debugLog("autoLoop", {
|
|
124
|
+
phase: "register-worker-failed",
|
|
125
|
+
error: err instanceof Error ? err.message : String(err),
|
|
126
|
+
});
|
|
127
|
+
}
|
|
128
|
+
}
|
|
104
129
|
function captureProjectRootEnv(projectRoot) {
|
|
105
130
|
if (!s.projectRootEnvCaptured) {
|
|
106
131
|
s.hadProjectRootEnv = Object.prototype.hasOwnProperty.call(process.env, "GSD_PROJECT_ROOT");
|
|
@@ -621,6 +646,21 @@ export async function stopAuto(ctx, pi, reason) {
|
|
|
621
646
|
catch (e) {
|
|
622
647
|
debugLog("stop-cleanup-locks", { error: e instanceof Error ? e.message : String(e) });
|
|
623
648
|
}
|
|
649
|
+
// ── Step 1b: Coordination cleanup (Phase B) ──
|
|
650
|
+
// Release any active milestone lease so other workers don't have to
|
|
651
|
+
// wait for TTL expiry, then mark this worker as stopping. Best-effort:
|
|
652
|
+
// DB unavailability or stale state must not block shutdown.
|
|
653
|
+
try {
|
|
654
|
+
if (s.workerId && s.currentMilestoneId && s.milestoneLeaseToken) {
|
|
655
|
+
releaseMilestoneLease(s.workerId, s.currentMilestoneId, s.milestoneLeaseToken);
|
|
656
|
+
}
|
|
657
|
+
if (s.workerId) {
|
|
658
|
+
markWorkerStopping(s.workerId);
|
|
659
|
+
}
|
|
660
|
+
}
|
|
661
|
+
catch (e) {
|
|
662
|
+
debugLog("stop-cleanup-coordination", { error: e instanceof Error ? e.message : String(e) });
|
|
663
|
+
}
|
|
624
664
|
// ── Step 1b: Flush queued follow-up messages (#3512) ──
|
|
625
665
|
// Late async notifications (async_job_result, gsd-auto-wrapup) can trigger
|
|
626
666
|
// extra LLM turns after stop. Flush them the same way run-unit.ts does.
|
|
@@ -799,13 +839,12 @@ export async function stopAuto(ctx, pi, reason) {
|
|
|
799
839
|
debugLog("stop-cleanup-metrics", { error: e instanceof Error ? e.message : String(e) });
|
|
800
840
|
}
|
|
801
841
|
// ── Step 12: Remove paused-session metadata (#1383) ──
|
|
842
|
+
// Phase C pt 2: deleteRuntimeKv replaces unlinkSync(paused-session.json).
|
|
802
843
|
try {
|
|
803
|
-
|
|
804
|
-
if (existsSync(pausedPath))
|
|
805
|
-
unlinkSync(pausedPath);
|
|
844
|
+
deleteRuntimeKv("global", "", PAUSED_SESSION_KV_KEY);
|
|
806
845
|
}
|
|
807
846
|
catch (err) { /* non-fatal */
|
|
808
|
-
logWarning("engine", `
|
|
847
|
+
logWarning("engine", `paused-session DB delete failed: ${err instanceof Error ? err.message : String(err)}`, { file: "auto.ts" });
|
|
809
848
|
}
|
|
810
849
|
// ── Step 13: Restore original model + thinking (before reset clears IDs) ──
|
|
811
850
|
try {
|
|
@@ -897,10 +936,12 @@ export async function pauseAuto(ctx, _pi, _errorContext) {
|
|
|
897
936
|
resolveAgentEndCancelled(_errorContext);
|
|
898
937
|
s.pausedSessionFile = normalizeSessionFilePath(ctx?.sessionManager?.getSessionFile() ?? null);
|
|
899
938
|
// Persist paused-session metadata so resume survives /exit (#1383).
|
|
900
|
-
//
|
|
939
|
+
// Phase C pt 2: persisted to runtime_kv (global scope, key
|
|
940
|
+
// PAUSED_SESSION_KV_KEY) instead of runtime/paused-session.json. The
|
|
941
|
+
// fresh-start bootstrap below reads from the same key.
|
|
901
942
|
try {
|
|
902
943
|
const pausedMeta = {
|
|
903
|
-
milestoneId: s.currentMilestoneId,
|
|
944
|
+
milestoneId: s.currentMilestoneId ?? undefined,
|
|
904
945
|
worktreePath: isInAutoWorktree(s.basePath) ? s.basePath : null,
|
|
905
946
|
originalBasePath: s.originalBasePath,
|
|
906
947
|
stepMode: s.stepMode,
|
|
@@ -908,17 +949,16 @@ export async function pauseAuto(ctx, _pi, _errorContext) {
|
|
|
908
949
|
sessionFile: s.pausedSessionFile,
|
|
909
950
|
unitType: s.currentUnit?.type ?? undefined,
|
|
910
951
|
unitId: s.currentUnit?.id ?? undefined,
|
|
911
|
-
activeEngineId: s.activeEngineId,
|
|
952
|
+
activeEngineId: s.activeEngineId ?? undefined,
|
|
912
953
|
activeRunDir: s.activeRunDir,
|
|
913
954
|
autoStartTime: s.autoStartTime,
|
|
914
955
|
milestoneLock: s.sessionMilestoneLock ?? undefined,
|
|
915
956
|
};
|
|
916
|
-
|
|
917
|
-
atomicWriteSync(join(runtimeDir, "paused-session.json"), JSON.stringify(pausedMeta, null, 2), "utf-8");
|
|
957
|
+
setRuntimeKv("global", "", PAUSED_SESSION_KV_KEY, pausedMeta);
|
|
918
958
|
}
|
|
919
959
|
catch (err) {
|
|
920
960
|
// Non-fatal — resume will still work via full bootstrap, just without worktree context
|
|
921
|
-
logWarning("engine", `paused-session
|
|
961
|
+
logWarning("engine", `paused-session DB write failed: ${err instanceof Error ? err.message : String(err)}`, { file: "auto.ts" });
|
|
922
962
|
}
|
|
923
963
|
// Close out the current unit so its runtime record doesn't stay at "dispatched"
|
|
924
964
|
if (s.currentUnit && ctx) {
|
|
@@ -1154,8 +1194,10 @@ export async function startAuto(ctx, pi, base, verboseMode, options) {
|
|
|
1154
1194
|
if (recoverFailedMigration(base)) {
|
|
1155
1195
|
ctx.ui.notify("Recovered unfinished migration (.gsd.migrating → .gsd).", "info");
|
|
1156
1196
|
}
|
|
1157
|
-
const freshStartAssessment = interruptedAssessment
|
|
1158
|
-
??
|
|
1197
|
+
const freshStartAssessment = await (interruptedAssessment
|
|
1198
|
+
?? (() => {
|
|
1199
|
+
return ensureDbOpen(base).then(() => assessInterruptedSession(base));
|
|
1200
|
+
})());
|
|
1159
1201
|
if (freshStartAssessment.classification === "running") {
|
|
1160
1202
|
const pid = freshStartAssessment.lock?.pid;
|
|
1161
1203
|
ctx.ui.notify(pid
|
|
@@ -1165,10 +1207,20 @@ export async function startAuto(ctx, pi, base, verboseMode, options) {
|
|
|
1165
1207
|
}
|
|
1166
1208
|
// If resuming from paused state, just re-activate and dispatch next unit.
|
|
1167
1209
|
// Check persisted paused-session first (#1383) — survives /exit.
|
|
1210
|
+
// Phase C pt 2: persisted in runtime_kv (global scope) instead of
|
|
1211
|
+
// runtime/paused-session.json. The `clearPausedSession` helper
|
|
1212
|
+
// replaces every prior unlinkSync(pausedPath) call.
|
|
1213
|
+
const clearPausedSession = (logTag) => {
|
|
1214
|
+
try {
|
|
1215
|
+
deleteRuntimeKv("global", "", PAUSED_SESSION_KV_KEY);
|
|
1216
|
+
}
|
|
1217
|
+
catch (err) {
|
|
1218
|
+
logWarning("session", `${logTag}: ${err instanceof Error ? err.message : String(err)}`, { file: "auto.ts" });
|
|
1219
|
+
}
|
|
1220
|
+
};
|
|
1168
1221
|
if (!s.paused) {
|
|
1169
1222
|
try {
|
|
1170
1223
|
const meta = freshStartAssessment.pausedSession ?? readPausedSessionMetadata(base);
|
|
1171
|
-
const pausedPath = join(gsdRoot(base), "runtime", "paused-session.json");
|
|
1172
1224
|
if (meta?.activeEngineId && meta.activeEngineId !== "dev") {
|
|
1173
1225
|
// Custom workflow resume — restore engine state
|
|
1174
1226
|
s.activeEngineId = meta.activeEngineId;
|
|
@@ -1178,14 +1230,6 @@ export async function startAuto(ctx, pi, base, verboseMode, options) {
|
|
|
1178
1230
|
s.autoStartTime = meta.autoStartTime || Date.now();
|
|
1179
1231
|
s.sessionMilestoneLock = meta.milestoneLock ?? null;
|
|
1180
1232
|
s.paused = true;
|
|
1181
|
-
try {
|
|
1182
|
-
unlinkSync(pausedPath);
|
|
1183
|
-
}
|
|
1184
|
-
catch (e) {
|
|
1185
|
-
if (e.code !== "ENOENT") {
|
|
1186
|
-
logWarning("session", `pause file cleanup failed: ${e instanceof Error ? e.message : String(e)}`, { file: "auto.ts" });
|
|
1187
|
-
}
|
|
1188
|
-
}
|
|
1189
1233
|
ctx.ui.notify(`Resuming paused custom workflow${meta.activeRunDir ? ` (${meta.activeRunDir})` : ""}.`, "info");
|
|
1190
1234
|
}
|
|
1191
1235
|
else if (meta?.milestoneId) {
|
|
@@ -1223,14 +1267,7 @@ export async function startAuto(ctx, pi, base, verboseMode, options) {
|
|
|
1223
1267
|
}
|
|
1224
1268
|
}
|
|
1225
1269
|
if (!mDir || summaryIsTerminal) {
|
|
1226
|
-
|
|
1227
|
-
unlinkSync(pausedPath);
|
|
1228
|
-
}
|
|
1229
|
-
catch (err) {
|
|
1230
|
-
if (err.code !== "ENOENT") {
|
|
1231
|
-
logWarning("session", `pause file cleanup failed: ${err instanceof Error ? err.message : String(err)}`, { file: "auto.ts" });
|
|
1232
|
-
}
|
|
1233
|
-
}
|
|
1270
|
+
clearPausedSession("paused-session DB cleanup failed (milestone gone/complete)");
|
|
1234
1271
|
ctx.ui.notify(`Paused milestone ${meta.milestoneId} is ${!mDir ? "missing" : "already complete"}. Starting fresh.`, "info");
|
|
1235
1272
|
}
|
|
1236
1273
|
else {
|
|
@@ -1255,26 +1292,13 @@ export async function startAuto(ctx, pi, base, verboseMode, options) {
|
|
|
1255
1292
|
: (s.originalBasePath || base);
|
|
1256
1293
|
rebuildScope(rawForScope, s.currentMilestoneId);
|
|
1257
1294
|
}
|
|
1258
|
-
try {
|
|
1259
|
-
unlinkSync(pausedPath);
|
|
1260
|
-
}
|
|
1261
|
-
catch (e) {
|
|
1262
|
-
if (e.code !== "ENOENT") {
|
|
1263
|
-
logWarning("session", `pause file cleanup failed: ${e instanceof Error ? e.message : String(e)}`, { file: "auto.ts" });
|
|
1264
|
-
}
|
|
1265
|
-
}
|
|
1266
1295
|
ctx.ui.notify(`Resuming paused session for ${meta.milestoneId}${meta.worktreePath && existsSync(meta.worktreePath) ? ` (worktree)` : ""}.`, "info");
|
|
1267
1296
|
}
|
|
1268
1297
|
}
|
|
1269
|
-
else if (
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
catch (e) {
|
|
1274
|
-
if (e.code !== "ENOENT") {
|
|
1275
|
-
logWarning("session", `stale pause file cleanup failed: ${e instanceof Error ? e.message : String(e)}`, { file: "auto.ts" });
|
|
1276
|
-
}
|
|
1277
|
-
}
|
|
1298
|
+
else if (meta) {
|
|
1299
|
+
// Stale paused-session metadata that the assessment chose not to
|
|
1300
|
+
// resume — clean it up so the next bootstrap starts fresh.
|
|
1301
|
+
clearPausedSession("stale paused-session DB cleanup failed");
|
|
1278
1302
|
}
|
|
1279
1303
|
}
|
|
1280
1304
|
}
|
|
@@ -1425,10 +1449,14 @@ export async function startAuto(ctx, pi, base, verboseMode, options) {
|
|
|
1425
1449
|
}
|
|
1426
1450
|
s.pausedSessionFile = null;
|
|
1427
1451
|
}
|
|
1452
|
+
captureProjectRootEnv(s.originalBasePath || s.basePath);
|
|
1453
|
+
registerAutoWorkerForSession(s);
|
|
1428
1454
|
updateSessionLock(lockBase(), "resuming", s.currentMilestoneId ?? "unknown");
|
|
1429
|
-
|
|
1455
|
+
if (s.workerId) {
|
|
1456
|
+
writeLock(lockBase(), "resuming", s.currentMilestoneId ?? "unknown");
|
|
1457
|
+
clearPausedSession("paused-session DB cleanup failed (resume activation)");
|
|
1458
|
+
}
|
|
1430
1459
|
pi.events.emit(CMUX_CHANNELS.LOG, { preferences: loadEffectiveGSDPreferences(s.basePath || undefined)?.preferences, message: s.stepMode ? "Step-mode resumed." : "Auto-mode resumed.", level: "progress" });
|
|
1431
|
-
captureProjectRootEnv(s.originalBasePath || s.basePath);
|
|
1432
1460
|
startAutoCommandPolling(s.basePath);
|
|
1433
1461
|
await runAutoLoopWithUok({
|
|
1434
1462
|
ctx,
|
|
@@ -1455,6 +1483,7 @@ export async function startAuto(ctx, pi, base, verboseMode, options) {
|
|
|
1455
1483
|
// s.currentMilestoneId (including worktree setup inside bootstrapAutoSession).
|
|
1456
1484
|
rebuildScope(s.basePath, s.currentMilestoneId);
|
|
1457
1485
|
captureProjectRootEnv(s.originalBasePath || s.basePath);
|
|
1486
|
+
registerAutoWorkerForSession(s);
|
|
1458
1487
|
try {
|
|
1459
1488
|
pi.events.emit(CMUX_CHANNELS.SIDEBAR, { action: "sync", preferences: loadEffectiveGSDPreferences(s.basePath || undefined)?.preferences, state: await deriveState(s.basePath) });
|
|
1460
1489
|
}
|
|
@@ -515,8 +515,13 @@ export function registerHooks(pi, ecosystemHandlers) {
|
|
|
515
515
|
const currentPendingGate = getPendingGate();
|
|
516
516
|
if (currentPendingGate) {
|
|
517
517
|
if (details?.cancelled || !details?.response) {
|
|
518
|
-
// Gate stays pending.
|
|
519
|
-
//
|
|
518
|
+
// Gate stays pending. Direct the agent to the most reliable recovery
|
|
519
|
+
// path — re-calling ask_user_questions with the same gate id — without
|
|
520
|
+
// misrepresenting the plain-text path. The plain-text path also works
|
|
521
|
+
// (isExplicitApprovalResponse on the next before_agent_start clears
|
|
522
|
+
// the gate when the user replies with an approval keyword), but the
|
|
523
|
+
// structured re-ask is more deterministic and gives the user a clear UI.
|
|
524
|
+
resetToolCallLoopGuard();
|
|
520
525
|
return {
|
|
521
526
|
content: [{
|
|
522
527
|
type: "text",
|
|
@@ -524,8 +529,8 @@ export function registerHooks(pi, ecosystemHandlers) {
|
|
|
524
529
|
`HARD BLOCK: approval gate "${currentPendingGate}" is still pending.`,
|
|
525
530
|
"No user response was received for the confirmation question.",
|
|
526
531
|
"Do not infer approval from earlier or prior messages.",
|
|
527
|
-
"Do not proceed, write files, save artifacts, or call
|
|
528
|
-
|
|
532
|
+
"Do not proceed, write files, save artifacts, or call other tools.",
|
|
533
|
+
`Re-call ask_user_questions with the same gate question id ("${currentPendingGate}") and wait for the user's response.`,
|
|
529
534
|
].join(" "),
|
|
530
535
|
}],
|
|
531
536
|
};
|
|
@@ -1,24 +1,106 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* GSD Crash Recovery
|
|
2
|
+
* GSD Crash Recovery (Phase C pt 2 — DB-backed)
|
|
3
3
|
*
|
|
4
|
-
* Detects interrupted auto-mode sessions via
|
|
5
|
-
*
|
|
6
|
-
*
|
|
4
|
+
* Detects interrupted auto-mode sessions via the DB-backed workers +
|
|
5
|
+
* unit_dispatches + runtime_kv tables. The auto.lock file is gone; the
|
|
6
|
+
* `LockData` shape is preserved for backward compatibility with callers
|
|
7
|
+
* (auto.ts, doctor checks, interrupted-session.ts), but the contents are
|
|
8
|
+
* now synthesized from:
|
|
7
9
|
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
10
|
+
* - workers.pid / .started_at / .last_heartbeat_at → liveness + age
|
|
11
|
+
* - unit_dispatches.unit_type / .unit_id / .started_at → what was running
|
|
12
|
+
* - runtime_kv("worker", workerId, "session_file") → pi session JSONL path
|
|
13
|
+
*
|
|
14
|
+
* "Crashed" is detected via workers.status='active' + heartbeat past TTL,
|
|
15
|
+
* cross-checked with the OS PID via isLockProcessAlive(). When the DB is
|
|
16
|
+
* unavailable (fresh project before init), all readers return null and
|
|
17
|
+
* writers no-op — preserving the historical "no lock means no prior
|
|
18
|
+
* crash" semantics.
|
|
19
|
+
*
|
|
20
|
+
* The journal-based emitCrashRecoveredUnitEnd is unchanged from the file
|
|
21
|
+
* era — it queries the journal independently of the lock mechanism.
|
|
11
22
|
*/
|
|
23
|
+
import { emitJournalEvent, queryJournal, } from "./journal.js";
|
|
12
24
|
import { readFileSync, unlinkSync, existsSync } from "node:fs";
|
|
13
25
|
import { join } from "node:path";
|
|
14
|
-
import {
|
|
26
|
+
import { findStaleWorkerForProject, getAllAutoWorkers, } from "./db/auto-workers.js";
|
|
27
|
+
import { getRuntimeKv, setRuntimeKv, deleteRuntimeKv } from "./db/runtime-kv.js";
|
|
28
|
+
import { _getAdapter, isDbAvailable } from "./gsd-db.js";
|
|
29
|
+
import { gsdRoot, normalizeRealPath } from "./paths.js";
|
|
15
30
|
import { atomicWriteSync } from "./atomic-write.js";
|
|
16
31
|
import { effectiveLockFile } from "./session-lock.js";
|
|
17
|
-
|
|
32
|
+
const SESSION_FILE_KV_KEY = "session_file";
|
|
18
33
|
function lockPath(basePath) {
|
|
19
34
|
return join(gsdRoot(basePath), effectiveLockFile());
|
|
20
35
|
}
|
|
21
|
-
|
|
36
|
+
function readLegacyLock(basePath) {
|
|
37
|
+
try {
|
|
38
|
+
const p = lockPath(basePath);
|
|
39
|
+
if (!existsSync(p))
|
|
40
|
+
return null;
|
|
41
|
+
return JSON.parse(readFileSync(p, "utf-8"));
|
|
42
|
+
}
|
|
43
|
+
catch {
|
|
44
|
+
return null;
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
function findActiveWorkerForCurrentProcess(projectRootRealpath) {
|
|
48
|
+
if (!isDbAvailable())
|
|
49
|
+
return null;
|
|
50
|
+
const workers = getAllAutoWorkers();
|
|
51
|
+
for (const worker of workers) {
|
|
52
|
+
if (worker.pid === process.pid
|
|
53
|
+
&& worker.project_root_realpath === projectRootRealpath) {
|
|
54
|
+
return worker;
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
return null;
|
|
58
|
+
}
|
|
59
|
+
/**
|
|
60
|
+
* Look up the most recent dispatch row for a worker, regardless of status.
|
|
61
|
+
* Returns null if the worker has no dispatch history yet (e.g. crashed
|
|
62
|
+
* during bootstrap before claiming the first unit).
|
|
63
|
+
*/
|
|
64
|
+
function getLatestDispatchForWorker(workerId) {
|
|
65
|
+
if (!isDbAvailable())
|
|
66
|
+
return null;
|
|
67
|
+
const db = _getAdapter();
|
|
68
|
+
const row = db.prepare(`SELECT unit_type, unit_id, started_at, status
|
|
69
|
+
FROM unit_dispatches
|
|
70
|
+
WHERE worker_id = :worker_id
|
|
71
|
+
ORDER BY id DESC
|
|
72
|
+
LIMIT 1`).get({ ":worker_id": workerId });
|
|
73
|
+
return row ?? null;
|
|
74
|
+
}
|
|
75
|
+
function workerToLockData(worker) {
|
|
76
|
+
const dispatch = getLatestDispatchForWorker(worker.worker_id);
|
|
77
|
+
const sessionFile = getRuntimeKv("worker", worker.worker_id, SESSION_FILE_KV_KEY) ?? undefined;
|
|
78
|
+
return {
|
|
79
|
+
pid: worker.pid,
|
|
80
|
+
startedAt: worker.started_at,
|
|
81
|
+
// Pre-Phase-C-pt-2 default: when no dispatch row exists yet (bootstrap
|
|
82
|
+
// crash), report unitType="starting", unitId="bootstrap" — same shape
|
|
83
|
+
// the file-based writer used to produce.
|
|
84
|
+
unitType: dispatch?.unit_type ?? "starting",
|
|
85
|
+
unitId: dispatch?.unit_id ?? "bootstrap",
|
|
86
|
+
unitStartedAt: dispatch?.started_at ?? worker.started_at,
|
|
87
|
+
sessionFile,
|
|
88
|
+
};
|
|
89
|
+
}
|
|
90
|
+
/**
|
|
91
|
+
* Write or update the lock state for the current auto-mode session.
|
|
92
|
+
*
|
|
93
|
+
* Phase C pt 2: the only persistent state this function adds beyond what
|
|
94
|
+
* the workers + unit_dispatches tables already track is the pi session
|
|
95
|
+
* JSONL path, which lands in runtime_kv (worker scope, key
|
|
96
|
+
* "session_file"). The pid/startedAt/unitType/unitId/unitStartedAt are
|
|
97
|
+
* recorded by registerAutoWorker / heartbeatAutoWorker / recordDispatchClaim
|
|
98
|
+
* already.
|
|
99
|
+
*
|
|
100
|
+
* basePath is unused by the new implementation (kept as a parameter for
|
|
101
|
+
* back-compat with the 15+ call sites) — the worker is identified by
|
|
102
|
+
* pid + project_root_realpath in the workers table.
|
|
103
|
+
*/
|
|
22
104
|
export function writeLock(basePath, unitType, unitId, sessionFile) {
|
|
23
105
|
try {
|
|
24
106
|
const data = {
|
|
@@ -29,51 +111,86 @@ export function writeLock(basePath, unitType, unitId, sessionFile) {
|
|
|
29
111
|
unitStartedAt: new Date().toISOString(),
|
|
30
112
|
sessionFile,
|
|
31
113
|
};
|
|
32
|
-
|
|
33
|
-
|
|
114
|
+
atomicWriteSync(lockPath(basePath), JSON.stringify(data, null, 2));
|
|
115
|
+
}
|
|
116
|
+
catch {
|
|
117
|
+
// Best-effort — never throw from the lock writer.
|
|
118
|
+
}
|
|
119
|
+
if (!isDbAvailable() || !sessionFile)
|
|
120
|
+
return;
|
|
121
|
+
try {
|
|
122
|
+
const projectRoot = normalizeRealPath(basePath);
|
|
123
|
+
const worker = findActiveWorkerForCurrentProcess(projectRoot);
|
|
124
|
+
if (!worker)
|
|
125
|
+
return;
|
|
126
|
+
setRuntimeKv("worker", worker.worker_id, SESSION_FILE_KV_KEY, sessionFile);
|
|
34
127
|
}
|
|
35
|
-
catch
|
|
36
|
-
|
|
128
|
+
catch {
|
|
129
|
+
// Best-effort — never throw from the lock writer.
|
|
37
130
|
}
|
|
38
131
|
}
|
|
39
|
-
/**
|
|
132
|
+
/**
|
|
133
|
+
* Phase C pt 2: clearLock no longer deletes a file. The cleanup path
|
|
134
|
+
* (markWorkerStopping in stopAuto) flips the workers row to 'stopping'.
|
|
135
|
+
* This function additionally drops the session_file runtime_kv row for
|
|
136
|
+
* the current worker so a follow-up crash detection doesn't pick up a
|
|
137
|
+
* stale session-file pointer.
|
|
138
|
+
*/
|
|
40
139
|
export function clearLock(basePath) {
|
|
41
140
|
try {
|
|
42
141
|
const p = lockPath(basePath);
|
|
43
142
|
if (existsSync(p))
|
|
44
143
|
unlinkSync(p);
|
|
45
144
|
}
|
|
46
|
-
catch
|
|
47
|
-
|
|
145
|
+
catch {
|
|
146
|
+
// Best-effort.
|
|
48
147
|
}
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
export function readCrashLock(basePath) {
|
|
148
|
+
if (!isDbAvailable())
|
|
149
|
+
return;
|
|
52
150
|
try {
|
|
53
|
-
const
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
151
|
+
const projectRoot = normalizeRealPath(basePath);
|
|
152
|
+
const worker = findActiveWorkerForCurrentProcess(projectRoot);
|
|
153
|
+
if (!worker)
|
|
154
|
+
return;
|
|
155
|
+
deleteRuntimeKv("worker", worker.worker_id, SESSION_FILE_KV_KEY);
|
|
58
156
|
}
|
|
59
|
-
catch
|
|
60
|
-
|
|
61
|
-
|
|
157
|
+
catch {
|
|
158
|
+
// Best-effort.
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
/**
|
|
162
|
+
* Detect a previous crashed auto-mode session.
|
|
163
|
+
*
|
|
164
|
+
* Phase C pt 2: synthesized from workers (status='active' + lapsed
|
|
165
|
+
* heartbeat) + unit_dispatches (most recent for that worker) +
|
|
166
|
+
* runtime_kv (session_file). When no stale worker exists, the DB is
|
|
167
|
+
* unavailable, or the lookup throws, falls back to readLegacyLock(basePath).
|
|
168
|
+
*/
|
|
169
|
+
export function readCrashLock(basePath) {
|
|
170
|
+
if (isDbAvailable()) {
|
|
171
|
+
try {
|
|
172
|
+
const projectRoot = normalizeRealPath(basePath);
|
|
173
|
+
const stale = findStaleWorkerForProject(projectRoot);
|
|
174
|
+
if (stale)
|
|
175
|
+
return workerToLockData(stale);
|
|
176
|
+
}
|
|
177
|
+
catch {
|
|
178
|
+
// Fall through to the legacy lock-file compatibility path.
|
|
179
|
+
}
|
|
62
180
|
}
|
|
181
|
+
return readLegacyLock(basePath);
|
|
63
182
|
}
|
|
64
183
|
/**
|
|
65
184
|
* Check whether the process that wrote the lock is still running.
|
|
66
185
|
* Uses `process.kill(pid, 0)` which sends no signal but checks liveness.
|
|
67
186
|
* Returns true if the PID matches our own — we are the lock holder (#2470).
|
|
187
|
+
*
|
|
188
|
+
* Unchanged from the file-based era — pure stateless OS check.
|
|
68
189
|
*/
|
|
69
190
|
export function isLockProcessAlive(lock) {
|
|
70
191
|
const pid = lock.pid;
|
|
71
192
|
if (!Number.isInteger(pid) || pid <= 0)
|
|
72
193
|
return false;
|
|
73
|
-
// Our own PID means WE hold this lock — we are alive. (#2470)
|
|
74
|
-
// Callers that need to distinguish "our lock" from "someone else's lock"
|
|
75
|
-
// (e.g. startAuto checking for a prior crashed session with a recycled PID)
|
|
76
|
-
// already guard with `crashLock.pid !== process.pid` before calling us.
|
|
77
194
|
if (pid === process.pid)
|
|
78
195
|
return true;
|
|
79
196
|
try {
|
|
@@ -81,8 +198,6 @@ export function isLockProcessAlive(lock) {
|
|
|
81
198
|
return true;
|
|
82
199
|
}
|
|
83
200
|
catch (err) {
|
|
84
|
-
// EPERM means the process exists but we lack permission — treat as alive.
|
|
85
|
-
// ESRCH means the process does not exist — treat as dead (stale lock).
|
|
86
201
|
if (err.code === "EPERM")
|
|
87
202
|
return true;
|
|
88
203
|
return false;
|
|
@@ -96,7 +211,6 @@ export function formatCrashInfo(lock) {
|
|
|
96
211
|
` Started at: ${lock.unitStartedAt}`,
|
|
97
212
|
` PID: ${lock.pid}`,
|
|
98
213
|
];
|
|
99
|
-
// Add recovery guidance based on what was happening when it crashed
|
|
100
214
|
if (lock.unitType === "starting" && lock.unitId === "bootstrap") {
|
|
101
215
|
lines.push(`No work was lost. Run /gsd auto to restart.`);
|
|
102
216
|
}
|
|
@@ -113,33 +227,23 @@ export function formatCrashInfo(lock) {
|
|
|
113
227
|
}
|
|
114
228
|
/**
|
|
115
229
|
* Emit a synthetic unit-end event for a unit that crashed without emitting its own.
|
|
116
|
-
*
|
|
117
|
-
* Queries the journal to find the most recent unit-start for the crashed unit.
|
|
118
|
-
* If a matching unit-end already exists (e.g. the hard timeout fired), this is a
|
|
119
|
-
* no-op. Called during crash recovery, before clearing the stale lock.
|
|
120
|
-
*
|
|
121
|
-
* Addresses the gap reported in #3348 where `unit-start` was emitted but no
|
|
122
|
-
* `unit-end` followed — side effects landed but the worker died before closeout.
|
|
230
|
+
* Unchanged from the file era — operates on the journal, not the lock file.
|
|
123
231
|
*/
|
|
124
232
|
export function emitCrashRecoveredUnitEnd(basePath, lock) {
|
|
125
|
-
// Skip bootstrap / starting pseudo-units — they have no meaningful unit-start event.
|
|
126
233
|
if (!lock.unitType || !lock.unitId || lock.unitType === "starting")
|
|
127
234
|
return;
|
|
128
235
|
try {
|
|
129
236
|
const all = queryJournal(basePath);
|
|
130
|
-
// Find the most recent unit-start for this unitId
|
|
131
237
|
const starts = all.filter((e) => e.eventType === "unit-start" && e.data?.unitId === lock.unitId);
|
|
132
238
|
if (starts.length === 0)
|
|
133
239
|
return;
|
|
134
240
|
const lastStart = starts[starts.length - 1];
|
|
135
|
-
// Check if a unit-end was already emitted (e.g. hard timeout fired after the crash)
|
|
136
241
|
const alreadyClosed = all.some((e) => e.eventType === "unit-end" &&
|
|
137
242
|
e.data?.unitId === lock.unitId &&
|
|
138
243
|
e.causedBy?.flowId === lastStart.flowId &&
|
|
139
244
|
e.causedBy?.seq === lastStart.seq);
|
|
140
245
|
if (alreadyClosed)
|
|
141
246
|
return;
|
|
142
|
-
// Find the highest seq in this flow for monotonic ordering
|
|
143
247
|
const maxSeq = all
|
|
144
248
|
.filter((e) => e.flowId === lastStart.flowId)
|
|
145
249
|
.reduce((max, e) => Math.max(max, e.seq), lastStart.seq);
|
|
@@ -158,6 +262,15 @@ export function emitCrashRecoveredUnitEnd(basePath, lock) {
|
|
|
158
262
|
});
|
|
159
263
|
}
|
|
160
264
|
catch {
|
|
161
|
-
// Never throw from crash recovery path
|
|
265
|
+
// Never throw from crash recovery path.
|
|
162
266
|
}
|
|
163
267
|
}
|
|
268
|
+
/**
|
|
269
|
+
* Used by the doctor checks (doctor-runtime-checks.ts, doctor-proactive.ts)
|
|
270
|
+
* to look up the stale worker, if any, for the project at basePath.
|
|
271
|
+
* Phase C pt 2 export — surface for the same diagnostics that previously
|
|
272
|
+
* iterated `auto.lock` files.
|
|
273
|
+
*/
|
|
274
|
+
export function findStaleAutoWorker(basePath) {
|
|
275
|
+
return readCrashLock(basePath);
|
|
276
|
+
}
|