git-watchtower 1.12.2 → 1.12.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/git-watchtower.js +118 -24
- package/package.json +1 -1
- package/src/server/coordinator.js +83 -9
package/bin/git-watchtower.js
CHANGED
|
@@ -101,7 +101,7 @@ const store = new Store();
|
|
|
101
101
|
|
|
102
102
|
// Web dashboard server
|
|
103
103
|
const { WebDashboardServer } = require('../src/server/web');
|
|
104
|
-
const { Coordinator, Worker, generateProjectId, getActiveCoordinator,
|
|
104
|
+
const { Coordinator, Worker, generateProjectId, getActiveCoordinator, tryAcquireLock, finalizeLock, removeLock, removeSocket, isProcessAlive } = require('../src/server/coordinator');
|
|
105
105
|
|
|
106
106
|
const PROJECT_ROOT = process.cwd();
|
|
107
107
|
|
|
@@ -802,7 +802,7 @@ const { ansi, box, truncate, sparkline: uiSparkline, visibleLength, stripAnsi, p
|
|
|
802
802
|
|
|
803
803
|
// Error detection utilities imported from src/utils/errors.js
|
|
804
804
|
const { ErrorHandler, isAuthError, isMergeConflict, isNetworkError } = require('../src/utils/errors');
|
|
805
|
-
const { Mutex } = require('../src/utils/async');
|
|
805
|
+
const { Mutex, sleep } = require('../src/utils/async');
|
|
806
806
|
|
|
807
807
|
// Keyboard handling utilities imported from src/ui/keybindings.js
|
|
808
808
|
const { filterBranches } = require('../src/ui/keybindings');
|
|
@@ -3081,6 +3081,50 @@ async function handleWebAction(action, payload) {
|
|
|
3081
3081
|
}
|
|
3082
3082
|
}
|
|
3083
3083
|
|
|
3084
|
+
/**
 * Upper bound on how many times we try to attach to an already-running
 * coordinator as a worker before reporting failure to the caller.
 */
const WORKER_CONNECT_MAX_ATTEMPTS = 3;

/**
 * First backoff delay (ms) between worker-connect attempts. The delay doubles
 * after every failed attempt, so with three attempts the waits are 200ms and
 * 400ms — roughly 600ms of added latency in the worst case.
 */
const WORKER_CONNECT_BASE_DELAY_MS = 200;

/**
 * Try to join an existing coordinator as a worker, retrying with a bounded,
 * doubling backoff.
 *
 * A fresh Worker is built for every attempt and wired to forward coordinator
 * commands to handleWebAction(). The first Worker whose connect() resolves is
 * returned. Retrying stops early, returning null, as soon as the coordinator's
 * process is found to be dead, so the caller can decide to reclaim the lock.
 *
 * NOTE(review): Workers from failed attempts are simply dropped here; confirm
 * that a Worker whose connect() rejected holds no resources needing teardown.
 *
 * @param {{pid: number, port: number, socketPath: string}} existing - Coordinator lock info
 * @param {string} projectIdArg - Project ID used when registering the worker
 * @returns {Promise<Worker|null>} The connected worker, or null if every attempt failed
 */
async function connectWorkerWithRetry(existing, projectIdArg) {
  let attemptsLeft = WORKER_CONNECT_MAX_ATTEMPTS;
  let backoffMs = WORKER_CONNECT_BASE_DELAY_MS;

  while (attemptsLeft > 0) {
    attemptsLeft -= 1;

    try {
      const candidate = new Worker({
        id: projectIdArg,
        projectPath: PROJECT_ROOT,
        projectName: path.basename(PROJECT_ROOT),
        socketPath: existing.socketPath,
      });
      candidate.onCommand = (action, payload) => handleWebAction(action, payload);
      await candidate.connect();
      return candidate;
    } catch (connectError) {
      // Give up when the attempt budget is spent, or when the coordinator
      // process has exited (the caller handles reclaiming the lock).
      const outOfAttempts = attemptsLeft === 0;
      if (outOfAttempts || !isProcessAlive(existing.pid)) {
        return null;
      }

      await sleep(backoffMs);
      backoffMs *= 2;
    }
  }

  return null;
}
|
|
3127
|
+
|
|
3084
3128
|
/**
|
|
3085
3129
|
* Create and start the web dashboard, with coordinator support.
|
|
3086
3130
|
* @param {boolean} openBrowser - Whether to auto-open the browser
|
|
@@ -3104,20 +3148,42 @@ async function startWebDashboard(openBrowser) {
|
|
|
3104
3148
|
if (url) webDashboard.setRepoWebUrl(url);
|
|
3105
3149
|
}).catch(() => {});
|
|
3106
3150
|
|
|
3107
|
-
//
|
|
3108
|
-
|
|
3151
|
+
// Atomically try to claim the coordinator role. If another live instance
|
|
3152
|
+
// already owns the lock, connect as a worker instead. This prevents a
|
|
3153
|
+
// TOCTOU race where two instances both pass a "no coordinator" check and
|
|
3154
|
+
// then clobber each other's socket in Coordinator.start().
|
|
3155
|
+
//
|
|
3156
|
+
// The outer loop runs at most twice so we can reclaim the coordinator
|
|
3157
|
+
// role if the existing coordinator dies while we're retrying the worker
|
|
3158
|
+
// handshake (e.g. it crashed just before we attached). Without this, a
|
|
3159
|
+
// transient connect failure (peer not yet accepting, EPIPE, slow fork)
|
|
3160
|
+
// against a coordinator that later crashes would leave us with no web
|
|
3161
|
+
// dashboard even though we could safely take over.
|
|
3162
|
+
let acquired = false;
|
|
3163
|
+
let existing = null;
|
|
3164
|
+
for (let outer = 0; outer < 2 && !acquired; outer++) {
|
|
3165
|
+
const lockResult = tryAcquireLock(process.pid);
|
|
3166
|
+
if (lockResult.acquired) {
|
|
3167
|
+
acquired = true;
|
|
3168
|
+
break;
|
|
3169
|
+
}
|
|
3109
3170
|
|
|
3110
|
-
|
|
3111
|
-
|
|
3112
|
-
|
|
3113
|
-
|
|
3114
|
-
|
|
3115
|
-
|
|
3116
|
-
|
|
3117
|
-
|
|
3118
|
-
|
|
3119
|
-
|
|
3120
|
-
|
|
3171
|
+
existing = lockResult.existing || getActiveCoordinator();
|
|
3172
|
+
if (!existing) {
|
|
3173
|
+
// Lock exists but we couldn't claim it and couldn't read the owner.
|
|
3174
|
+
// Bail out rather than race a concurrent startup.
|
|
3175
|
+
addLog('Web dashboard unavailable: could not acquire coordinator lock', 'error');
|
|
3176
|
+
webDashboard = null;
|
|
3177
|
+
render();
|
|
3178
|
+
return;
|
|
3179
|
+
}
|
|
3180
|
+
|
|
3181
|
+
// Try to connect as a worker with bounded retry + exponential backoff.
|
|
3182
|
+
// The coordinator may still be finishing its bind after finalizeLock()
|
|
3183
|
+
// writes the real socket path, or temporarily unresponsive.
|
|
3184
|
+
const connectedWorker = await connectWorkerWithRetry(existing, projectId);
|
|
3185
|
+
if (connectedWorker) {
|
|
3186
|
+
worker = connectedWorker;
|
|
3121
3187
|
addLog(`Joined web dashboard at ${localhostUrl(existing.port)} (tab)`, 'success');
|
|
3122
3188
|
|
|
3123
3189
|
// Push state periodically
|
|
@@ -3135,13 +3201,35 @@ async function startWebDashboard(openBrowser) {
|
|
|
3135
3201
|
WEB_PORT = existing.port;
|
|
3136
3202
|
render();
|
|
3137
3203
|
return;
|
|
3138
|
-
} catch (err) {
|
|
3139
|
-
// Couldn't connect — become coordinator instead
|
|
3140
|
-
worker = null;
|
|
3141
3204
|
}
|
|
3205
|
+
|
|
3206
|
+
// Every connect attempt failed. If the coordinator process died while
|
|
3207
|
+
// we were retrying, clean up the stale lock/socket and loop once to
|
|
3208
|
+
// claim the coordinator role ourselves. Otherwise abort — do NOT take
|
|
3209
|
+
// over a live coordinator's socket.
|
|
3210
|
+
if (!isProcessAlive(existing.pid)) {
|
|
3211
|
+
removeLock();
|
|
3212
|
+
removeSocket();
|
|
3213
|
+
continue;
|
|
3214
|
+
}
|
|
3215
|
+
|
|
3216
|
+
addLog(`Could not join web dashboard at ${localhostUrl(existing.port)}: coordinator unreachable`, 'error');
|
|
3217
|
+
webDashboard = null;
|
|
3218
|
+
render();
|
|
3219
|
+
return;
|
|
3142
3220
|
}
|
|
3143
3221
|
|
|
3144
|
-
|
|
3222
|
+
if (!acquired) {
|
|
3223
|
+
addLog('Web dashboard unavailable: could not acquire coordinator lock after retry', 'error');
|
|
3224
|
+
webDashboard = null;
|
|
3225
|
+
render();
|
|
3226
|
+
return;
|
|
3227
|
+
}
|
|
3228
|
+
|
|
3229
|
+
// We hold the lock — it is now safe to remove any leftover socket and
|
|
3230
|
+
// start listening. The lock file contains a placeholder pid-only entry
|
|
3231
|
+
// until finalizeLock() writes the real port/socketPath after a successful
|
|
3232
|
+
// bind.
|
|
3145
3233
|
try {
|
|
3146
3234
|
coordinator = new Coordinator();
|
|
3147
3235
|
coordinator.onProjectsChanged = (projects) => {
|
|
@@ -3155,7 +3243,12 @@ async function startWebDashboard(openBrowser) {
|
|
|
3155
3243
|
await coordinator.start();
|
|
3156
3244
|
coordinator.registerLocal(projectId, PROJECT_ROOT, path.basename(PROJECT_ROOT), webDashboard.getSerializableState());
|
|
3157
3245
|
|
|
3158
|
-
|
|
3246
|
+
const { port } = await webDashboard.start();
|
|
3247
|
+
WEB_PORT = port;
|
|
3248
|
+
finalizeLock(process.pid, port, coordinator.socketPath);
|
|
3249
|
+
|
|
3250
|
+
// Update coordinator with our latest state periodically. Started only
|
|
3251
|
+
// after a successful bind so a failed start doesn't leak an interval.
|
|
3159
3252
|
webStateInterval = setInterval(() => {
|
|
3160
3253
|
if (coordinator && webDashboard) {
|
|
3161
3254
|
coordinator.updateLocal(projectId, webDashboard.getSerializableState());
|
|
@@ -3165,15 +3258,16 @@ async function startWebDashboard(openBrowser) {
|
|
|
3165
3258
|
}
|
|
3166
3259
|
}, 500);
|
|
3167
3260
|
|
|
3168
|
-
const { port } = await webDashboard.start();
|
|
3169
|
-
WEB_PORT = port;
|
|
3170
|
-
writeLock(process.pid, port, coordinator.socketPath);
|
|
3171
|
-
|
|
3172
3261
|
addLog(`Web dashboard: ${localhostUrl(port)}`, 'success');
|
|
3173
3262
|
if (openBrowser) openInBrowser(localhostUrl(port));
|
|
3174
3263
|
render();
|
|
3175
3264
|
} catch (err) {
|
|
3176
3265
|
addLog(`Web dashboard failed: ${err.message}`, 'error');
|
|
3266
|
+
if (coordinator) {
|
|
3267
|
+
try { coordinator.stop(); } catch (_) { /* ignore */ }
|
|
3268
|
+
}
|
|
3269
|
+
removeLock();
|
|
3270
|
+
removeSocket();
|
|
3177
3271
|
webDashboard = null;
|
|
3178
3272
|
coordinator = null;
|
|
3179
3273
|
render();
|
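package/package.json
CHANGED
[package.json hunk missing from this extract; the summary above lists it as +1 -1]
package/src/server/coordinator.js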
CHANGED
|
@@ -68,13 +68,18 @@ function isProcessAlive(pid) {
|
|
|
68
68
|
|
|
69
69
|
/**
|
|
70
70
|
* Read the lock file.
|
|
71
|
-
*
|
|
71
|
+
*
|
|
72
|
+
* A lock may be a placeholder (pid only, no port/socketPath) while a new
|
|
73
|
+
* coordinator is still binding its socket. Callers that need a connectable
|
|
74
|
+
* coordinator should use getActiveCoordinator(), which rejects placeholders.
|
|
75
|
+
*
|
|
76
|
+
* @returns {{ pid: number, port?: number, socketPath?: string, pending?: boolean } | null}
|
|
72
77
|
*/
|
|
73
78
|
function readLock() {
|
|
74
79
|
try {
|
|
75
80
|
if (!fs.existsSync(LOCK_FILE)) return null;
|
|
76
81
|
const data = JSON.parse(fs.readFileSync(LOCK_FILE, 'utf8'));
|
|
77
|
-
if (!data || !data.pid
|
|
82
|
+
if (!data || !data.pid) return null;
|
|
78
83
|
return data;
|
|
79
84
|
} catch (e) {
|
|
80
85
|
return null;
|
|
@@ -92,6 +97,66 @@ function writeLock(pid, port, socketPath) {
|
|
|
92
97
|
fs.writeFileSync(LOCK_FILE, JSON.stringify({ pid, port, socketPath }, null, 2) + '\n', 'utf8');
|
|
93
98
|
}
|
|
94
99
|
|
|
100
|
+
/**
 * Atomically reserve the coordinator lock.
 *
 * Uses `fs.openSync(..., 'wx')` to create the lock file exclusively, so two
 * instances racing to become coordinator cannot both succeed. A placeholder
 * entry ({ pid, pending: true }) is written immediately so that any process
 * reading the lock while we bind our socket still sees a valid owning PID.
 *
 * If the lock already exists but the owning process is dead, the stale lock
 * (and socket) are cleaned up and the acquisition is retried once.
 *
 * There is a short window in which a peer has created the lock file but not
 * yet written its placeholder; during it the file is empty and readLock()
 * returns null. An unreadable lock that was modified very recently is
 * therefore treated as "owned by a peer that is still writing" rather than as
 * stale — deleting it would let both processes believe they hold the lock.
 *
 * If writing our own placeholder fails, the just-created lock file is removed
 * before the error is rethrown so that an empty, ownerless lock is not left
 * behind.
 *
 * @param {number} pid - PID of the acquiring process
 * @returns {{acquired: true} | {acquired: false, existing: {pid: number, port?: number, socketPath?: string, pending?: boolean} | null}}
 * @throws {Error} Any filesystem error other than EEXIST from creating or writing the lock
 */
function tryAcquireLock(pid) {
  ensureDir();

  // How long an unreadable lock file is presumed to belong to a peer that is
  // between its exclusive create and its placeholder write. That window is
  // normally far below a millisecond; the margin is deliberately generous.
  const UNWRITTEN_LOCK_GRACE_MS = 2000;

  // One retry after stale-lock cleanup; avoids looping if another process
  // keeps recreating the lock faster than we can clean it up.
  for (let attempt = 0; attempt < 2; attempt++) {
    let fd;
    try {
      fd = fs.openSync(LOCK_FILE, 'wx');
    } catch (err) {
      if (err.code !== 'EEXIST') throw err;

      // Lock file exists — check if the owner is alive.
      const existing = readLock();
      if (existing && isProcessAlive(existing.pid)) {
        return { acquired: false, existing };
      }

      if (!existing) {
        // Unreadable lock: only treat it as stale once it is old enough that
        // no peer can still be in the middle of writing its placeholder.
        let ageMs = Infinity;
        try {
          ageMs = Date.now() - fs.statSync(LOCK_FILE).mtimeMs;
        } catch (_) {
          // The file vanished or cannot be stat'ed; fall through to the
          // stale-lock cleanup below and retry the exclusive create.
        }
        if (ageMs < UNWRITTEN_LOCK_GRACE_MS) {
          return { acquired: false, existing: null };
        }
      }

      // Stale or unreadable — clean up and retry the exclusive create.
      removeLock();
      removeSocket();
      continue;
    }

    // We created the file, so we own it: write the placeholder entry.
    let placeholderWritten = false;
    try {
      fs.writeSync(fd, JSON.stringify({ pid, pending: true }) + '\n');
      placeholderWritten = true;
    } finally {
      fs.closeSync(fd);
      if (!placeholderWritten) {
        // Best effort: never mask the original write error.
        try { removeLock(); } catch (_) { /* ignore */ }
      }
    }
    return { acquired: true };
  }

  // Another process raced us to re-create the lock. Treat it as active.
  const existing = readLock();
  return { acquired: false, existing: existing || null };
}
|
|
146
|
+
|
|
147
|
+
/**
 * Promote the lock from its placeholder form to a complete entry.
 *
 * Intended to be called once the coordinator's IPC socket is bound and the
 * web server is listening, by the process that already owns the lock via
 * tryAcquireLock(). Delegates to writeLock(), which rewrites the lock file
 * with the given pid, port and socket path.
 *
 * @param {number} pid - PID of the coordinator process that owns the lock
 * @param {number} port - Port the web dashboard is listening on
 * @param {string} socketPath - Path of the coordinator's IPC socket
 */
function finalizeLock(pid, port, socketPath) {
  // Same file, full contents: the pid-only placeholder is overwritten.
  writeLock(pid, port, socketPath);
}
|
|
159
|
+
|
|
95
160
|
/**
|
|
96
161
|
* Remove the lock file.
|
|
97
162
|
*/
|
|
@@ -107,18 +172,25 @@ function removeSocket() {
|
|
|
107
172
|
}
|
|
108
173
|
|
|
109
174
|
/**
 * Look up a coordinator that is both running and ready to accept workers.
 *
 * Three outcomes yield null:
 *  - there is no readable lock;
 *  - the lock's owning process is dead (the stale lock and socket are removed
 *    as a side effect);
 *  - the lock is still a placeholder without port/socketPath, meaning the
 *    coordinator has not finished binding and cannot be connected to yet.
 *
 * @returns {{ pid: number, port: number, socketPath: string } | null}
 */
function getActiveCoordinator() {
  const entry = readLock();
  if (!entry) {
    return null;
  }

  const ownerIsGone = !isProcessAlive(entry.pid);
  if (ownerIsGone) {
    // Stale lock: clear it and its socket so a new coordinator can start.
    removeLock();
    removeSocket();
    return null;
  }

  // A placeholder (pending) entry lacks the details needed to connect.
  const isListening = Boolean(entry.port && entry.socketPath);
  return isListening
    ? /** @type {{pid:number,port:number,socketPath:string}} */ (entry)
    : null;
}
|
|
123
195
|
|
|
124
196
|
// ─── Coordinator (first instance) ────────────────────────────────
|
|
@@ -533,6 +605,8 @@ module.exports = {
|
|
|
533
605
|
getActiveCoordinator,
|
|
534
606
|
readLock,
|
|
535
607
|
writeLock,
|
|
608
|
+
tryAcquireLock,
|
|
609
|
+
finalizeLock,
|
|
536
610
|
removeLock,
|
|
537
611
|
removeSocket,
|
|
538
612
|
isProcessAlive,
|