gsd-pi 2.78.1-dev.d8826a445 → 2.78.1-dev.eccf86e27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -7
- package/dist/help-text.js +1 -1
- package/dist/resource-loader.js +6 -1
- package/dist/resources/.managed-resources-content-hash +1 -1
- package/dist/resources/extensions/gsd/auto/detect-stuck.js +41 -5
- package/dist/resources/extensions/gsd/auto/loop.js +235 -36
- package/dist/resources/extensions/gsd/auto/phases.js +7 -5
- package/dist/resources/extensions/gsd/auto/session.js +33 -0
- package/dist/resources/extensions/gsd/auto-dispatch.js +46 -2
- package/dist/resources/extensions/gsd/auto-post-unit.js +19 -11
- package/dist/resources/extensions/gsd/auto-worktree.js +26 -187
- package/dist/resources/extensions/gsd/auto.js +79 -50
- package/dist/resources/extensions/gsd/bootstrap/register-hooks.js +9 -4
- package/dist/resources/extensions/gsd/crash-recovery.js +160 -47
- package/dist/resources/extensions/gsd/db/auto-workers.js +227 -0
- package/dist/resources/extensions/gsd/db/command-queue.js +105 -0
- package/dist/resources/extensions/gsd/db/milestone-leases.js +210 -0
- package/dist/resources/extensions/gsd/db/runtime-kv.js +91 -0
- package/dist/resources/extensions/gsd/db/unit-dispatches.js +322 -0
- package/dist/resources/extensions/gsd/docs/COORDINATION.md +42 -0
- package/dist/resources/extensions/gsd/doctor-proactive.js +4 -0
- package/dist/resources/extensions/gsd/doctor-runtime-checks.js +22 -6
- package/dist/resources/extensions/gsd/doctor.js +12 -2
- package/dist/resources/extensions/gsd/gsd-db.js +161 -3
- package/dist/resources/extensions/gsd/guided-flow.js +6 -2
- package/dist/resources/extensions/gsd/interrupted-session.js +18 -15
- package/dist/resources/extensions/gsd/state.js +21 -6
- package/dist/resources/extensions/gsd/worktree-resolver.js +64 -0
- package/dist/tsconfig.extensions.tsbuildinfo +1 -1
- package/dist/web/standalone/.next/BUILD_ID +1 -1
- package/dist/web/standalone/.next/app-path-routes-manifest.json +12 -12
- package/dist/web/standalone/.next/build-manifest.json +2 -2
- package/dist/web/standalone/.next/prerender-manifest.json +3 -3
- package/dist/web/standalone/.next/server/app/_global-error.html +1 -1
- package/dist/web/standalone/.next/server/app/_global-error.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_global-error.segments/_full.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_global-error.segments/_global-error/__PAGE__.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_global-error.segments/_global-error.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_global-error.segments/_head.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_global-error.segments/_index.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_global-error.segments/_tree.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_not-found.html +1 -1
- package/dist/web/standalone/.next/server/app/_not-found.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_not-found.segments/_full.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_not-found.segments/_head.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_not-found.segments/_index.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_not-found.segments/_not-found/__PAGE__.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_not-found.segments/_not-found.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/_not-found.segments/_tree.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/index.html +1 -1
- package/dist/web/standalone/.next/server/app/index.rsc +1 -1
- package/dist/web/standalone/.next/server/app/index.segments/__PAGE__.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/index.segments/_full.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/index.segments/_head.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/index.segments/_index.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app/index.segments/_tree.segment.rsc +1 -1
- package/dist/web/standalone/.next/server/app-paths-manifest.json +12 -12
- package/dist/web/standalone/.next/server/middleware-build-manifest.js +1 -1
- package/dist/web/standalone/.next/server/pages/404.html +1 -1
- package/dist/web/standalone/.next/server/pages/500.html +1 -1
- package/dist/web/standalone/.next/server/server-reference-manifest.json +1 -1
- package/package.json +1 -1
- package/src/resources/extensions/gsd/auto/detect-stuck.ts +37 -5
- package/src/resources/extensions/gsd/auto/loop.ts +263 -41
- package/src/resources/extensions/gsd/auto/phases.ts +7 -5
- package/src/resources/extensions/gsd/auto/session.ts +36 -0
- package/src/resources/extensions/gsd/auto-dispatch.ts +53 -2
- package/src/resources/extensions/gsd/auto-post-unit.ts +19 -11
- package/src/resources/extensions/gsd/auto-worktree.ts +26 -211
- package/src/resources/extensions/gsd/auto.ts +89 -44
- package/src/resources/extensions/gsd/bootstrap/register-hooks.ts +9 -4
- package/src/resources/extensions/gsd/crash-recovery.ts +177 -43
- package/src/resources/extensions/gsd/db/auto-workers.ts +273 -0
- package/src/resources/extensions/gsd/db/command-queue.ts +149 -0
- package/src/resources/extensions/gsd/db/milestone-leases.ts +274 -0
- package/src/resources/extensions/gsd/db/runtime-kv.ts +127 -0
- package/src/resources/extensions/gsd/db/unit-dispatches.ts +446 -0
- package/src/resources/extensions/gsd/docs/COORDINATION.md +42 -0
- package/src/resources/extensions/gsd/doctor-proactive.ts +4 -0
- package/src/resources/extensions/gsd/doctor-runtime-checks.ts +24 -6
- package/src/resources/extensions/gsd/doctor.ts +10 -2
- package/src/resources/extensions/gsd/gsd-db.ts +170 -3
- package/src/resources/extensions/gsd/guided-flow.ts +6 -2
- package/src/resources/extensions/gsd/interrupted-session.ts +19 -12
- package/src/resources/extensions/gsd/state.ts +44 -6
- package/src/resources/extensions/gsd/tests/auto-loop-no-copy-artifacts.test.ts +72 -0
- package/src/resources/extensions/gsd/tests/auto-loop-symlink-worktree.test.ts +190 -0
- package/src/resources/extensions/gsd/tests/auto-workers.test.ts +105 -0
- package/src/resources/extensions/gsd/tests/command-queue.test.ts +141 -0
- package/src/resources/extensions/gsd/tests/crash-recovery-via-db.test.ts +203 -0
- package/src/resources/extensions/gsd/tests/crash-recovery.test.ts +169 -59
- package/src/resources/extensions/gsd/tests/detect-stuck-respects-retry.test.ts +173 -0
- package/src/resources/extensions/gsd/tests/integration/auto-worktree.test.ts +22 -12
- package/src/resources/extensions/gsd/tests/integration/doctor-proactive.test.ts +24 -10
- package/src/resources/extensions/gsd/tests/integration/doctor-runtime.test.ts +35 -23
- package/src/resources/extensions/gsd/tests/integration/workspace-collapse-integration.test.ts +3 -5
- package/src/resources/extensions/gsd/tests/interrupted-session-auto.test.ts +72 -25
- package/src/resources/extensions/gsd/tests/interrupted-session-ui.test.ts +72 -25
- package/src/resources/extensions/gsd/tests/memory-pressure-stuck-state.test.ts +9 -6
- package/src/resources/extensions/gsd/tests/milestone-leases.test.ts +152 -0
- package/src/resources/extensions/gsd/tests/parallel-milestone-isolation.test.ts +106 -0
- package/src/resources/extensions/gsd/tests/paused-session-via-db.test.ts +119 -0
- package/src/resources/extensions/gsd/tests/pipeline-variant-dispatch.test.ts +58 -0
- package/src/resources/extensions/gsd/tests/preferences-worktree-sync.test.ts +3 -17
- package/src/resources/extensions/gsd/tests/register-hooks-depth-verification.test.ts +110 -0
- package/src/resources/extensions/gsd/tests/runtime-kv.test.ts +120 -0
- package/src/resources/extensions/gsd/tests/skipped-validation-completion.test.ts +133 -28
- package/src/resources/extensions/gsd/tests/skipped-validation-db-atomicity.test.ts +17 -0
- package/src/resources/extensions/gsd/tests/stuck-state-via-db.test.ts +134 -0
- package/src/resources/extensions/gsd/tests/sync-layer-scope.test.ts +7 -26
- package/src/resources/extensions/gsd/tests/teardown-cleanup-parity.test.ts +4 -8
- package/src/resources/extensions/gsd/tests/unit-dispatches.test.ts +247 -0
- package/src/resources/extensions/gsd/tests/validate-milestone.test.ts +41 -1
- package/src/resources/extensions/gsd/tests/workspace.test.ts +15 -9
- package/src/resources/extensions/gsd/tests/write-gate.test.ts +31 -23
- package/src/resources/extensions/gsd/worktree-resolver.ts +62 -0
- package/src/resources/extensions/gsd/tests/auto-lock-creation.test.ts +0 -213
- package/src/resources/extensions/gsd/tests/auto-stale-lock-self-kill.test.ts +0 -87
- package/src/resources/extensions/gsd/tests/stop-auto-remote.test.ts +0 -159
- /package/dist/web/standalone/.next/static/{AT5qi39nKXkdmQIOIoh0f → Y5UeGFkXTYM9WIQOWHkot}/_buildManifest.js +0 -0
- /package/dist/web/standalone/.next/static/{AT5qi39nKXkdmQIOIoh0f → Y5UeGFkXTYM9WIQOWHkot}/_ssgManifest.js +0 -0
|
@@ -0,0 +1,446 @@
|
|
|
1
|
+
// gsd-2 + Unit dispatch ledger (DB-backed coordination, Phase B)
|
|
2
|
+
//
|
|
3
|
+
// Records every auto-mode unit dispatch (plan-slice, run-task, summarize, …)
|
|
4
|
+
// with worker_id, fencing token, status lifecycle, and retry metadata. The
|
|
5
|
+
// ledger is the substrate Phase C will consume to migrate stuck-state.json
|
|
6
|
+
// and paused-session.json out of the runtime/ directory.
|
|
7
|
+
//
|
|
8
|
+
// Codex review MEDIUM B2: partial unique index
|
|
9
|
+
// idx_unit_dispatches_active_per_unit ON unit_dispatches(unit_id)
|
|
10
|
+
// WHERE status IN ('claimed','running')
|
|
11
|
+
// enforces that two workers cannot simultaneously claim the same unit.
|
|
12
|
+
// recordDispatchClaim relies on the index to fail fast at INSERT time
|
|
13
|
+
// rather than racing in application code.
|
|
14
|
+
|
|
15
|
+
import { randomUUID } from "node:crypto";
|
|
16
|
+
|
|
17
|
+
import {
|
|
18
|
+
_getAdapter,
|
|
19
|
+
isDbAvailable,
|
|
20
|
+
transaction,
|
|
21
|
+
insertAuditEvent,
|
|
22
|
+
} from "../gsd-db.js";
|
|
23
|
+
|
|
24
|
+
/**
 * Every value the `status` column of a `unit_dispatches` row can take.
 *
 * `claimed` and `running` are the two "active" states: they are the ones the
 * partial unique index covers and the only states the mark* transitions in
 * this module will move a row out of (markCanceled additionally accepts
 * `pending`).
 */
export type DispatchStatus =
  // not yet finished
  | "pending"
  | "claimed"
  | "running"
  // finished / parked
  | "completed"
  | "failed"
  | "stuck"
  | "canceled"
  | "paused";
|
|
33
|
+
|
|
34
|
+
/**
 * Shape of one row read back from the `unit_dispatches` table (the query
 * helpers in this module cast `SELECT *` results to this type).
 */
export interface UnitDispatchRow {
  // ── identity / correlation ──
  id: number;
  trace_id: string;
  turn_id: string | null;
  worker_id: string;
  milestone_lease_token: number;

  // ── what was dispatched ──
  milestone_id: string;
  slice_id: string | null;
  task_id: string | null;
  unit_type: string;
  unit_id: string;

  // ── lifecycle ──
  status: DispatchStatus;
  attempt_n: number;
  started_at: string;
  /** Set by the mark* transitions that end a dispatch; null until then. */
  ended_at: string | null;
  exit_reason: string | null;
  error_summary: string | null;
  verification_evidence_id: number | null;

  // ── retry metadata ──
  next_run_at: string | null;
  retry_after_ms: number | null;
  max_attempts: number;
  last_error_code: string | null;
  last_error_at: string | null;
}
|
|
58
|
+
|
|
59
|
+
/** Arguments accepted by recordDispatchClaim(). */
export interface RecordClaimInput {
  // Correlation ids copied onto the row and the audit event.
  traceId: string;
  turnId?: string | null;

  // Claiming worker and the milestone lease it must currently hold.
  workerId: string;
  milestoneLeaseToken: number;
  milestoneId: string;

  // The unit being dispatched.
  sliceId?: string | null;
  taskId?: string | null;
  unitType: string;
  unitId: string;

  /**
   * Attempt number for this unit. Callers should compute this from the
   * most recent prior dispatch for the same unit_id (use
   * getRecentForUnit() then add 1). Defaults to 1 for fresh claims.
   */
  attemptN?: number;

  /** Per-attempt cap; defaults to 3. */
  maxAttempts?: number;
}
|
|
78
|
+
|
|
79
|
+
/**
 * Outcome of recordDispatchClaim(), discriminated first on `ok` and then on
 * `error`:
 *  - success carries the id of the inserted dispatch row;
 *  - `already_active` describes the dispatch that already holds the unit;
 *  - `stale_lease` echoes the lease coordinates that failed the lease check.
 */
export type RecordClaimResult =
  | { ok: true; dispatchId: number }
  | {
      ok: false;
      error: "already_active";
      existingId: number;
      existingStatus: DispatchStatus;
      existingWorker: string;
    }
  | {
      ok: false;
      error: "stale_lease";
      milestoneId: string;
      workerId: string;
      milestoneLeaseToken: number;
    };
|
|
83
|
+
|
|
84
|
+
/**
 * Decide whether an error thrown by the `unit_dispatches` INSERT means a
 * uniqueness constraint rejected the row, i.e. the unit is already actively
 * dispatched.
 *
 * SQLite reports NOT NULL, CHECK and FOREIGN KEY violations with the same
 * "constraint failed" wording, and some drivers report them under the generic
 * SQLITE_CONSTRAINT code. Those are schema/input errors, not double-claims,
 * so they are excluded by message before the code is consulted.
 *
 * @param err - Whatever was caught; may be an Error, a plain object carrying
 *   `code`/`message`, or any other thrown value.
 * @returns true only when the error looks like a uniqueness violation.
 */
function isAlreadyActiveConstraintError(err: unknown): boolean {
  const fields = (err !== null && typeof err === "object" ? err : {}) as {
    code?: unknown;
    message?: unknown;
  };
  const code = String(fields.code ?? "");
  // Read `message` structurally so error-like plain objects are not
  // stringified to "[object Object]".
  const msg = typeof fields.message === "string" ? fields.message : String(err);

  // Other constraint kinds must never be mistaken for an active-dispatch clash.
  if (/\bFOREIGN KEY\b/i.test(msg) || /\b(?:NOT NULL|CHECK) constraint failed\b/i.test(msg)) {
    return false;
  }

  if (code === "SQLITE_CONSTRAINT" || code === "SQLITE_CONSTRAINT_UNIQUE") {
    return true;
  }

  return /\bUNIQUE\b|\bconstraint failed\b/i.test(msg);
}
|
|
100
|
+
|
|
101
|
+
/**
 * Insert a new dispatch row in `claimed` state. Atomic guard against
 * double-claim (B2): the partial unique index
 * idx_unit_dispatches_active_per_unit refuses the INSERT if any row for
 * the same unit_id already has status IN ('claimed','running').
 *
 * The lease check, the INSERT and the audit event all run inside one
 * transaction() callback.
 *
 * @param input - Claim coordinates; `attemptN` defaults to 1 and
 *   `maxAttempts` to 3.
 * @returns `{ ok: true, dispatchId }` on success; `stale_lease` when no
 *   `held` lease row matches (milestone, worker, token); `already_active`
 *   describing the dispatch that currently holds the unit.
 * @throws Error when the DB is unavailable, and rethrows any INSERT error
 *   that is not explained by an existing active dispatch.
 */
export function recordDispatchClaim(input: RecordClaimInput): RecordClaimResult {
  if (!isDbAvailable()) {
    throw new Error("recordDispatchClaim: DB unavailable");
  }
  const now = new Date().toISOString();
  const attemptN = input.attemptN ?? 1;

  return transaction((): RecordClaimResult => {
    const db = _getAdapter()!;

    // Fencing: the caller must still hold the exact lease it claims to hold.
    const lease = db.prepare(
      `SELECT fencing_token
       FROM milestone_leases
       WHERE milestone_id = :milestone_id
         AND worker_id = :worker_id
         AND fencing_token = :token
         AND status = 'held'`,
    ).get({
      ":milestone_id": input.milestoneId,
      ":worker_id": input.workerId,
      ":token": input.milestoneLeaseToken,
    }) as { fencing_token: number } | undefined;
    if (!lease) {
      return {
        ok: false,
        error: "stale_lease",
        milestoneId: input.milestoneId,
        workerId: input.workerId,
        milestoneLeaseToken: input.milestoneLeaseToken,
      };
    }

    // Only the INSERT is guarded: a constraint error raised later (e.g. by the
    // audit write) must not be misreported as a double-claim of this unit.
    let id: number;
    try {
      const result = db.prepare(
        `INSERT INTO unit_dispatches (
           trace_id, turn_id, worker_id, milestone_lease_token,
           milestone_id, slice_id, task_id,
           unit_type, unit_id, status, attempt_n,
           started_at, max_attempts
         ) VALUES (
           :trace_id, :turn_id, :worker_id, :milestone_lease_token,
           :milestone_id, :slice_id, :task_id,
           :unit_type, :unit_id, 'claimed', :attempt_n,
           :started_at, :max_attempts
         )`,
      ).run({
        ":trace_id": input.traceId,
        ":turn_id": input.turnId ?? null,
        ":worker_id": input.workerId,
        ":milestone_lease_token": input.milestoneLeaseToken,
        ":milestone_id": input.milestoneId,
        ":slice_id": input.sliceId ?? null,
        ":task_id": input.taskId ?? null,
        ":unit_type": input.unitType,
        ":unit_id": input.unitId,
        ":attempt_n": attemptN,
        ":started_at": now,
        ":max_attempts": input.maxAttempts ?? 3,
      });
      id = Number((result as { lastInsertRowid?: number | bigint }).lastInsertRowid ?? 0);
    } catch (err) {
      if (!isAlreadyActiveConstraintError(err)) throw err;

      // Partial unique index rejected the INSERT — surface the existing
      // active dispatch so callers can decide what to do.
      const existing = db.prepare(
        `SELECT id, status, worker_id FROM unit_dispatches
         WHERE unit_id = :unit_id AND status IN ('claimed','running')
         ORDER BY id DESC LIMIT 1`,
      ).get({ ":unit_id": input.unitId }) as { id: number; status: DispatchStatus; worker_id: string } | undefined;

      // No active row means the constraint failure had another cause;
      // rethrow instead of inventing a placeholder "existing" dispatch.
      if (!existing) throw err;

      return {
        ok: false,
        error: "already_active",
        existingId: existing.id,
        existingStatus: existing.status,
        existingWorker: existing.worker_id,
      };
    }

    insertAuditEvent({
      eventId: randomUUID(),
      traceId: input.traceId,
      turnId: input.turnId ?? undefined,
      category: "orchestration",
      type: "dispatch-claimed",
      ts: now,
      payload: {
        dispatchId: id,
        unitId: input.unitId,
        unitType: input.unitType,
        workerId: input.workerId,
        attemptN,
      },
    });

    return { ok: true, dispatchId: id };
  });
}
|
|
205
|
+
|
|
206
|
+
/**
 * Promote a dispatch from `claimed` to `running`.
 *
 * Does nothing when the DB is unavailable; the UPDATE only matches a row
 * whose current status is `claimed`.
 */
export function markRunning(dispatchId: number): void {
  if (!isDbAvailable()) {
    return;
  }
  const statement = _getAdapter()!.prepare(
    `UPDATE unit_dispatches SET status = 'running'
     WHERE id = :id AND status = 'claimed'`,
  );
  statement.run({ ":id": dispatchId });
}
|
|
215
|
+
|
|
216
|
+
/** Optional details recorded by markCompleted(); each is stored as NULL when omitted. */
export interface CompleteOpts {
  /** Written to the row's `exit_reason` column. */
  exitReason?: string;
  /** Written to the row's `verification_evidence_id` column. */
  verificationEvidenceId?: number | null;
}
|
|
220
|
+
|
|
221
|
+
/**
 * Move an active (`claimed` or `running`) dispatch to `completed`, stamping
 * `ended_at` and the optional exit reason / verification evidence id.
 *
 * No-op when the DB is unavailable. A `dispatch-completed` audit event is
 * emitted only if the UPDATE reported at least one changed row; the event
 * uses the dispatch id (as a string) for its trace id.
 */
export function markCompleted(dispatchId: number, opts?: CompleteOpts): void {
  if (!isDbAvailable()) return;

  const endedAt = new Date().toISOString();
  const adapter = _getAdapter()!;
  let affected = 0;

  transaction(() => {
    const outcome = adapter.prepare(
      `UPDATE unit_dispatches
       SET status = 'completed', ended_at = :ended_at,
           exit_reason = :exit_reason,
           verification_evidence_id = :evidence_id
       WHERE id = :id
         AND status IN ('claimed','running')`,
    ).run({
      ":id": dispatchId,
      ":ended_at": endedAt,
      ":exit_reason": opts?.exitReason ?? null,
      ":evidence_id": opts?.verificationEvidenceId ?? null,
    }) as { changes?: unknown };
    // Treat a missing or non-numeric `changes` as "nothing updated".
    affected = typeof outcome.changes === "number" ? outcome.changes : 0;
  });

  if (affected < 1) return;

  insertAuditEvent({
    eventId: randomUUID(),
    traceId: dispatchId.toString(),
    category: "orchestration",
    type: "dispatch-completed",
    ts: endedAt,
    payload: { dispatchId },
  });
}
|
|
256
|
+
|
|
257
|
+
/** Failure details recorded by markFailed(). */
export interface FailureOpts {
  /** Stored in `error_summary` and echoed in the audit event payload. */
  errorSummary: string;
  /** Stored in `last_error_code`; NULL when omitted. */
  errorCode?: string;
  /** Backoff before next attempt (used by stuck-detector retry suppression). */
  retryAfterMs?: number;
}
|
|
263
|
+
|
|
264
|
+
/**
 * Move an active (`claimed` or `running`) dispatch to `failed`, recording the
 * error summary/code and, when a backoff is supplied, the time of the next
 * allowed run.
 *
 * `next_run_at` is derived only when `opts.retryAfterMs` is truthy, so an
 * omitted or zero backoff leaves it NULL. No-op when the DB is unavailable.
 * A `dispatch-failed` audit event is emitted only if the UPDATE reported at
 * least one changed row.
 */
export function markFailed(dispatchId: number, opts: FailureOpts): void {
  if (!isDbAvailable()) return;

  const failedAt = new Date();
  const failedAtIso = failedAt.toISOString();
  let nextRunAt: string | null = null;
  if (opts.retryAfterMs) {
    nextRunAt = new Date(failedAt.getTime() + opts.retryAfterMs).toISOString();
  }

  const adapter = _getAdapter()!;
  let affected = 0;

  transaction(() => {
    const outcome = adapter.prepare(
      `UPDATE unit_dispatches
       SET status = 'failed', ended_at = :ended_at,
           error_summary = :error_summary,
           last_error_code = :last_error_code,
           last_error_at = :last_error_at,
           retry_after_ms = :retry_after_ms,
           next_run_at = :next_run_at
       WHERE id = :id
         AND status IN ('claimed','running')`,
    ).run({
      ":id": dispatchId,
      ":ended_at": failedAtIso,
      ":error_summary": opts.errorSummary,
      ":last_error_code": opts.errorCode ?? null,
      ":last_error_at": failedAtIso,
      ":retry_after_ms": opts.retryAfterMs ?? null,
      ":next_run_at": nextRunAt,
    }) as { changes?: unknown };
    // Treat a missing or non-numeric `changes` as "nothing updated".
    affected = typeof outcome.changes === "number" ? outcome.changes : 0;
  });

  if (affected < 1) return;

  insertAuditEvent({
    eventId: randomUUID(),
    traceId: dispatchId.toString(),
    category: "orchestration",
    type: "dispatch-failed",
    ts: failedAtIso,
    payload: {
      dispatchId,
      errorSummary: opts.errorSummary,
      retryAfterMs: opts.retryAfterMs ?? null,
    },
  });
}
|
|
309
|
+
|
|
310
|
+
/**
 * Move an active (`claimed` or `running`) dispatch to `stuck`, stamping
 * `ended_at` and storing `reason` as the exit reason.
 *
 * No-op when the DB is unavailable. A `dispatch-stuck` audit event is
 * emitted only if the UPDATE reported a positive change count.
 */
export function markStuck(dispatchId: number, reason: string): void {
  if (!isDbAvailable()) return;

  const endedAt = new Date().toISOString();
  const adapter = _getAdapter()!;

  // The UPDATE's run() result is passed back out through transaction().
  const outcome = transaction(() =>
    adapter.prepare(
      `UPDATE unit_dispatches
       SET status = 'stuck', ended_at = :ended_at, exit_reason = :reason
       WHERE id = :id
         AND status IN ('claimed','running')`,
    ).run({ ":id": dispatchId, ":ended_at": endedAt, ":reason": reason }),
  ) as { changes?: unknown };

  const affected = typeof outcome.changes === "number" ? outcome.changes : 0;
  if (affected <= 0) return;

  insertAuditEvent({
    eventId: randomUUID(),
    traceId: dispatchId.toString(),
    category: "orchestration",
    type: "dispatch-stuck",
    ts: endedAt,
    payload: { dispatchId, reason },
  });
}
|
|
337
|
+
|
|
338
|
+
/**
 * Move an active (`claimed` or `running`) dispatch to `paused` and stamp
 * `ended_at` with the current time.
 *
 * No-op when the DB is unavailable.
 */
export function markPaused(dispatchId: number): void {
  if (!isDbAvailable()) {
    return;
  }
  const pausedAt = new Date().toISOString();
  const statement = _getAdapter()!.prepare(
    `UPDATE unit_dispatches
     SET status = 'paused', ended_at = :ended_at
     WHERE id = :id AND status IN ('claimed','running')`,
  );
  statement.run({ ":id": dispatchId, ":ended_at": pausedAt });
}
|
|
349
|
+
|
|
350
|
+
/**
 * Move a dispatch to `canceled`, stamping `ended_at` and storing `reason` as
 * the exit reason. Unlike the other transitions, `pending` rows are eligible
 * as well as `claimed` and `running` ones.
 *
 * No-op when the DB is unavailable.
 */
export function markCanceled(dispatchId: number, reason: string): void {
  if (!isDbAvailable()) {
    return;
  }
  const canceledAt = new Date().toISOString();
  const statement = _getAdapter()!.prepare(
    `UPDATE unit_dispatches
     SET status = 'canceled', ended_at = :ended_at, exit_reason = :reason
     WHERE id = :id AND status IN ('pending','claimed','running')`,
  );
  statement.run({ ":id": dispatchId, ":ended_at": canceledAt, ":reason": reason });
}
|
|
361
|
+
|
|
362
|
+
/**
 * Fetch the most recent N dispatches for a unit, newest first (descending
 * id). Used by recordDispatchClaim callers to compute attempt_n and by
 * detect-stuck.ts (B3) to consult retry budget before tripping the stuck
 * verdict.
 *
 * @param unitId - unit_id to look up.
 * @param limit - Maximum number of rows to return (default 10).
 * @returns Matching rows, or an empty array when the DB is unavailable.
 */
export function getRecentForUnit(unitId: string, limit = 10): UnitDispatchRow[] {
  if (!isDbAvailable()) {
    return [];
  }
  const statement = _getAdapter()!.prepare(
    `SELECT * FROM unit_dispatches WHERE unit_id = :unit_id ORDER BY id DESC LIMIT :limit`,
  );
  const rows = statement.all({ ":unit_id": unitId, ":limit": limit });
  return rows as unknown as UnitDispatchRow[];
}
|
|
374
|
+
|
|
375
|
+
/**
 * Fetch the latest dispatch (highest id) for a unit, whatever its status.
 *
 * @returns The row, or null when the unit has never been dispatched or the
 *   DB is unavailable.
 */
export function getLatestForUnit(unitId: string): UnitDispatchRow | null {
  if (!isDbAvailable()) {
    return null;
  }
  const statement = _getAdapter()!.prepare(
    `SELECT * FROM unit_dispatches WHERE unit_id = :unit_id ORDER BY id DESC LIMIT 1`,
  );
  const latest = statement.get({ ":unit_id": unitId }) as UnitDispatchRow | undefined;
  return latest ?? null;
}
|
|
387
|
+
|
|
388
|
+
/**
 * Phase C — return the most recent unit_id values for a worker, oldest-first.
 *
 * Drop-in replacement for the persistence side of stuck-state.json's
 * `recentUnits` field. The auto-loop uses this to seed loopState.recentUnits
 * on session start so the stuck-detector window survives a session restart
 * (#3704). The query selects newest-first (started_at, then id) so LIMIT
 * keeps the latest entries; the result is then flipped to oldest-first to
 * match the in-memory window shape that detect-stuck.ts expects.
 *
 * @param workerId - worker_id whose dispatches are read.
 * @param limit - Maximum number of entries (default 20).
 * @returns `{ key }` entries, or an empty array when the DB is unavailable.
 */
export function getRecentUnitKeysForWorker(
  workerId: string,
  limit = 20,
): Array<{ key: string }> {
  if (!isDbAvailable()) {
    return [];
  }
  const statement = _getAdapter()!.prepare(
    `SELECT unit_id FROM unit_dispatches
     WHERE worker_id = :worker_id
     ORDER BY started_at DESC, id DESC
     LIMIT :limit`,
  );
  const newestFirst = statement.all({
    ":worker_id": workerId,
    ":limit": limit,
  }) as Array<{ unit_id: string }>;

  // Walk from the tail so callers consume oldest-first (sliding-window semantics).
  return newestFirst.reduceRight<Array<{ key: string }>>((window, row) => {
    window.push({ key: row.unit_id });
    return window;
  }, []);
}
|
|
412
|
+
|
|
413
|
+
/**
 * Return the most recent unit_id values dispatched by any worker registered
 * for the given project root, oldest-first.
 *
 * Dispatches are joined to the `workers` table on worker_id and filtered by
 * `project_root_realpath`. The query selects newest-first (started_at, then
 * id) so LIMIT keeps the latest entries; the result is then flipped.
 *
 * @param projectRootRealpath - Value matched against workers.project_root_realpath.
 * @param limit - Maximum number of entries (default 20).
 * @returns `{ key }` entries, or an empty array when the DB is unavailable.
 */
export function getRecentUnitKeysForProjectRoot(
  projectRootRealpath: string,
  limit = 20,
): Array<{ key: string }> {
  if (!isDbAvailable()) {
    return [];
  }
  const statement = _getAdapter()!.prepare(
    `SELECT ud.unit_id
     FROM unit_dispatches ud
     INNER JOIN workers w ON w.worker_id = ud.worker_id
     WHERE w.project_root_realpath = :project_root_realpath
     ORDER BY ud.started_at DESC, ud.id DESC
     LIMIT :limit`,
  );
  const newestFirst = statement.all({
    ":project_root_realpath": projectRootRealpath,
    ":limit": limit,
  }) as Array<{ unit_id: string }>;

  // Walk from the tail to produce oldest-first order.
  return newestFirst.reduceRight<Array<{ key: string }>>((window, row) => {
    window.push({ key: row.unit_id });
    return window;
  }, []);
}
|
|
432
|
+
|
|
433
|
+
/**
 * Fetch a milestone's dispatches that are in the given status, ordered by
 * ascending id. Useful for janitors + dashboards.
 *
 * @param milestoneId - milestone_id to filter on.
 * @param status - Exact status to match.
 * @returns Matching rows, or an empty array when the DB is unavailable.
 */
export function getDispatchesByStatus(
  milestoneId: string,
  status: DispatchStatus,
): UnitDispatchRow[] {
  if (!isDbAvailable()) {
    return [];
  }
  const statement = _getAdapter()!.prepare(
    `SELECT * FROM unit_dispatches WHERE milestone_id = :mid AND status = :status ORDER BY id`,
  );
  const rows = statement.all({ ":mid": milestoneId, ":status": status });
  return rows as unknown as UnitDispatchRow[];
}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# Auto-mode coordination is single-host
|
|
2
|
+
|
|
3
|
+
The DB-backed coordination tables introduced by Phase B (`workers`,
|
|
4
|
+
`milestone_leases`, `unit_dispatches`, `cancellation_requests`,
|
|
5
|
+
`command_queue`) and the supporting `runtime_kv` table from Phase C all
|
|
6
|
+
rely on **shared SQLite WAL on local disk**. They do not work across
|
|
7
|
+
machines.
|
|
8
|
+
|
|
9
|
+
## Why single-host only
|
|
10
|
+
|
|
11
|
+
- SQLite WAL coordination — the locking primitives that make
|
|
12
|
+
`claimMilestoneLease`, `recordDispatchClaim`, and `claimNextCommand`
|
|
13
|
+
atomic — is local-disk only. Network filesystems (NFS, SMB, S3FS) and
|
|
14
|
+
fuse mounts break the lock semantics that the WAL relies on.
|
|
15
|
+
- Heartbeat TTL (`workers.last_heartbeat_at`) compares timestamps written
|
|
16
|
+
with SQLite wall-clock time (`datetime('now')`). Across machines without
|
|
17
|
+
wall-clock synchronization (for example NTP/chrony), TTL filtering can
|
|
18
|
+
produce phantom-active or premature-crashed verdicts. Monotonic clocks
|
|
19
|
+
are not used for these comparisons.
|
|
20
|
+
- Fencing tokens (`milestone_leases.fencing_token`) are monotonically
|
|
21
|
+
ordered by SQL within a single transaction. Cross-host races could
|
|
22
|
+
produce duplicate tokens if two SQLite processes opened the same DB
|
|
23
|
+
on a network mount.
|
|
24
|
+
|
|
25
|
+
## What does work
|
|
26
|
+
|
|
27
|
+
- Multiple `gsd auto` worker processes on the **same machine**, sharing
|
|
28
|
+
the project's SQLite DB via WAL. The lease check refuses concurrent
|
|
29
|
+
claims on the same milestone; the dispatch ledger's partial unique
|
|
30
|
+
index refuses double-claims of the same unit.
|
|
31
|
+
- A single `gsd auto` worker plus arbitrary read-only consumers
|
|
32
|
+
(dashboards, doctors) on the same machine.
|
|
33
|
+
- Worktree-based parallelism on the same machine, where each worker
|
|
34
|
+
holds a different milestone lease.
|
|
35
|
+
|
|
36
|
+
## Multi-host alternatives
|
|
37
|
+
|
|
38
|
+
If you need to coordinate `gsd auto` workers across machines, you need
|
|
39
|
+
a real coordinator: Postgres for the ledger + a leader-election service
|
|
40
|
+
(etcd, Consul) for the leases. That's out of scope for these phases.
|
|
41
|
+
The schema and module shapes here would need a non-trivial backend
|
|
42
|
+
swap before they could ride on top of either.
|
|
@@ -25,6 +25,7 @@ import { resolveMilestoneIntegrationBranch } from "./git-service.js";
|
|
|
25
25
|
import { nativeIsRepo, nativeHasChanges, nativeLastCommitEpoch, nativeGetCurrentBranch, nativeAddTracked, nativeCommit } from "./native-git-bridge.js";
|
|
26
26
|
import { loadEffectiveGSDPreferences } from "./preferences.js";
|
|
27
27
|
import { runEnvironmentChecks } from "./doctor-environment.js";
|
|
28
|
+
import { ensureDbOpen } from "./bootstrap/dynamic-tools.js";
|
|
28
29
|
|
|
29
30
|
// ── Health Score Tracking ──────────────────────────────────────────────────
|
|
30
31
|
|
|
@@ -219,6 +220,9 @@ export async function preDispatchHealthGate(basePath: string): Promise<PreDispat
|
|
|
219
220
|
// If a stale lock exists, the crash recovery path should handle it,
|
|
220
221
|
// not a new dispatch. This prevents double-dispatch after crashes.
|
|
221
222
|
try {
|
|
223
|
+
if (existsSync(join(gsdRoot(basePath), "gsd.db"))) {
|
|
224
|
+
await ensureDbOpen(basePath);
|
|
225
|
+
}
|
|
222
226
|
const lock = readCrashLock(basePath);
|
|
223
227
|
if (lock && !isLockProcessAlive(lock)) {
|
|
224
228
|
// Auto-clear it since we're about to dispatch anyway
|
|
@@ -8,6 +8,8 @@ import { deriveState, isGhostMilestone, isReusableGhostMilestone } from "./state
|
|
|
8
8
|
import { saveFile } from "./files.js";
|
|
9
9
|
import { nativeIsRepo, nativeForEachRef, nativeUpdateRef } from "./native-git-bridge.js";
|
|
10
10
|
import { readCrashLock, isLockProcessAlive, clearLock } from "./crash-recovery.js";
|
|
11
|
+
import { getActiveAutoWorkers } from "./db/auto-workers.js";
|
|
12
|
+
import { normalizeRealPath } from "./paths.js";
|
|
11
13
|
import { ensureGitignore, isGsdGitignored } from "./gitignore.js";
|
|
12
14
|
import { readAllSessionStatuses, isSessionStale, removeSessionStatus } from "./session-status-io.js";
|
|
13
15
|
import { recoverFailedMigration } from "./migrate-external.js";
|
|
@@ -35,6 +37,9 @@ export async function checkRuntimeHealth(
|
|
|
35
37
|
const root = gsdRoot(basePath);
|
|
36
38
|
|
|
37
39
|
// ── Stale crash lock ──────────────────────────────────────────────────
|
|
40
|
+
// Phase C pt 2: the lock state lives in the workers + unit_dispatches
|
|
41
|
+
// tables now, not auto.lock. readCrashLock synthesizes a LockData from
|
|
42
|
+
// the DB; isLockProcessAlive is a pure OS PID check.
|
|
38
43
|
try {
|
|
39
44
|
const lock = readCrashLock(basePath);
|
|
40
45
|
if (lock) {
|
|
@@ -45,14 +50,14 @@ export async function checkRuntimeHealth(
|
|
|
45
50
|
code: "stale_crash_lock",
|
|
46
51
|
scope: "project",
|
|
47
52
|
unitId: "project",
|
|
48
|
-
message: `Stale auto
|
|
49
|
-
file: "
|
|
53
|
+
message: `Stale auto-mode worker (PID ${lock.pid}, started ${lock.startedAt}, was executing ${lock.unitType} ${lock.unitId}) — process is no longer running`,
|
|
54
|
+
file: "<workers table>",
|
|
50
55
|
fixable: true,
|
|
51
56
|
});
|
|
52
57
|
|
|
53
58
|
if (shouldFix("stale_crash_lock")) {
|
|
54
59
|
clearLock(basePath);
|
|
55
|
-
fixesApplied.push("cleared stale auto
|
|
60
|
+
fixesApplied.push("cleared stale auto-mode worker state");
|
|
56
61
|
}
|
|
57
62
|
}
|
|
58
63
|
}
|
|
@@ -70,9 +75,22 @@ export async function checkRuntimeHealth(
|
|
|
70
75
|
if (existsSync(lockDir)) {
|
|
71
76
|
const statRes = statSync(lockDir);
|
|
72
77
|
if (statRes.isDirectory()) {
|
|
73
|
-
//
|
|
74
|
-
|
|
75
|
-
|
|
78
|
+
// Phase C pt 2: "any live process holds the lock?" check now means
|
|
79
|
+
// "is any worker registered with status='active' AND a fresh
|
|
80
|
+
// heartbeat for this project?" — readCrashLock returns null for
|
|
81
|
+
// healthy live workers (it surfaces stale ones only), so we must
|
|
82
|
+
// consult getActiveAutoWorkers directly.
|
|
83
|
+
const projectRoot = normalizeRealPath(basePath);
|
|
84
|
+
const activeWorkers = getActiveAutoWorkers().filter(
|
|
85
|
+
(w) => w.project_root_realpath === projectRoot && isLockProcessAlive({
|
|
86
|
+
pid: w.pid,
|
|
87
|
+
startedAt: w.started_at,
|
|
88
|
+
unitType: "starting",
|
|
89
|
+
unitId: "bootstrap",
|
|
90
|
+
unitStartedAt: w.started_at,
|
|
91
|
+
}),
|
|
92
|
+
);
|
|
93
|
+
const lockHolderAlive = activeWorkers.length > 0;
|
|
76
94
|
if (!lockHolderAlive) {
|
|
77
95
|
issues.push({
|
|
78
96
|
severity: "error",
|
|
@@ -3,8 +3,8 @@ import { join } from "node:path";
|
|
|
3
3
|
|
|
4
4
|
import { loadFile, parseSummary, saveFile, parseTaskPlanMustHaves, countMustHavesMentionedInSummary } from "./files.js";
|
|
5
5
|
import { parseRoadmap as parseLegacyRoadmap, parsePlan as parseLegacyPlan } from "./parsers-legacy.js";
|
|
6
|
-
import { isDbAvailable, getMilestoneSlices, getSliceTasks } from "./gsd-db.js";
|
|
7
|
-
import { resolveMilestoneFile, resolveMilestonePath, resolveSliceFile, resolveSlicePath, resolveTaskFile, resolveTasksDir, milestonesDir, gsdRoot, relMilestoneFile, relSliceFile, relTaskFile, relSlicePath, relGsdRootFile, resolveGsdRootFile, relMilestonePath } from "./paths.js";
|
|
6
|
+
import { isDbAvailable, openDatabase, getMilestoneSlices, getSliceTasks } from "./gsd-db.js";
|
|
7
|
+
import { resolveMilestoneFile, resolveMilestonePath, resolveSliceFile, resolveSlicePath, resolveTaskFile, resolveTasksDir, milestonesDir, gsdRoot, relMilestoneFile, relSliceFile, relTaskFile, relSlicePath, relGsdRootFile, resolveGsdRootFile, relMilestonePath, resolveGsdPathContract } from "./paths.js";
|
|
8
8
|
import { deriveState, isMilestoneComplete } from "./state.js";
|
|
9
9
|
import { invalidateAllCaches } from "./cache.js";
|
|
10
10
|
import { loadEffectiveGSDPreferences, type GSDPreferences } from "./preferences.js";
|
|
@@ -336,6 +336,14 @@ export async function runGSDDoctor(basePath: string, options?: { fix?: boolean;
|
|
|
336
336
|
const dryRun = options?.dryRun === true;
|
|
337
337
|
const fixLevel = options?.fixLevel ?? "all";
|
|
338
338
|
|
|
339
|
+
// CLI doctor can run before any tool handler has opened the DB. Runtime
|
|
340
|
+
// health checks need the existing project DB to surface DB-backed crash
|
|
341
|
+
// locks, paused sessions, and coordination rows.
|
|
342
|
+
const dbPath = resolveGsdPathContract(basePath).projectDb;
|
|
343
|
+
if (existsSync(dbPath)) {
|
|
344
|
+
try { openDatabase(dbPath); } catch { /* surfaced later as db_unavailable */ }
|
|
345
|
+
}
|
|
346
|
+
|
|
339
347
|
// Issue codes that represent completion state transitions — creating summary
|
|
340
348
|
// stubs, marking slices/milestones done in the roadmap. These belong to the
|
|
341
349
|
// dispatch lifecycle (complete-slice, complete-milestone units), not to
|