git-watchtower 1.12.2 → 1.12.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -101,7 +101,7 @@ const store = new Store();
101
101
 
102
102
  // Web dashboard server
103
103
  const { WebDashboardServer } = require('../src/server/web');
104
- const { Coordinator, Worker, generateProjectId, getActiveCoordinator, writeLock, removeLock } = require('../src/server/coordinator');
104
+ const { Coordinator, Worker, generateProjectId, getActiveCoordinator, tryAcquireLock, finalizeLock, removeLock, removeSocket, isProcessAlive } = require('../src/server/coordinator');
105
105
 
106
106
  const PROJECT_ROOT = process.cwd();
107
107
 
@@ -802,7 +802,7 @@ const { ansi, box, truncate, sparkline: uiSparkline, visibleLength, stripAnsi, p
802
802
 
803
803
  // Error detection utilities imported from src/utils/errors.js
804
804
  const { ErrorHandler, isAuthError, isMergeConflict, isNetworkError } = require('../src/utils/errors');
805
- const { Mutex } = require('../src/utils/async');
805
+ const { Mutex, sleep } = require('../src/utils/async');
806
806
 
807
807
  // Keyboard handling utilities imported from src/ui/keybindings.js
808
808
  const { filterBranches } = require('../src/ui/keybindings');
@@ -3081,6 +3081,50 @@ async function handleWebAction(action, payload) {
3081
3081
  }
3082
3082
  }
3083
3083
 
3084
+ /**
3085
+ * Maximum attempts to connect to an existing coordinator as a worker
3086
+ * before giving up (or reclaiming the lock if the coordinator is dead).
3087
+ */
3088
+ const WORKER_CONNECT_MAX_ATTEMPTS = 3;
3089
+
3090
+ /**
3091
+ * Base delay for exponential backoff between worker-connect attempts (ms).
3092
+ * Delays are 200ms, 400ms — total added latency ~600ms in the worst case.
3093
+ */
3094
+ const WORKER_CONNECT_BASE_DELAY_MS = 200;
3095
+
3096
+ /**
3097
+ * Attempt to connect to an existing coordinator as a worker, with bounded
3098
+ * exponential backoff. Returns the connected Worker on success, or null if
3099
+ * every attempt failed. Between attempts, if the coordinator's process is
3100
+ * no longer alive, we stop retrying so the caller can reclaim the lock.
3101
+ *
3102
+ * @param {{pid: number, port: number, socketPath: string}} existing - Coordinator lock info
3103
+ * @param {string} projectIdArg - Project ID for worker registration
3104
+ * @returns {Promise<Worker|null>}
3105
+ */
3106
+ async function connectWorkerWithRetry(existing, projectIdArg) {
3107
+ for (let attempt = 1; attempt <= WORKER_CONNECT_MAX_ATTEMPTS; attempt++) {
3108
+ try {
3109
+ const w = new Worker({
3110
+ id: projectIdArg,
3111
+ projectPath: PROJECT_ROOT,
3112
+ projectName: path.basename(PROJECT_ROOT),
3113
+ socketPath: existing.socketPath,
3114
+ });
3115
+ w.onCommand = (action, payload) => handleWebAction(action, payload);
3116
+ await w.connect();
3117
+ return w;
3118
+ } catch (err) {
3119
+ if (attempt >= WORKER_CONNECT_MAX_ATTEMPTS) return null;
3120
+ // Stop early if the coordinator has exited — caller will reclaim.
3121
+ if (!isProcessAlive(existing.pid)) return null;
3122
+ await sleep(WORKER_CONNECT_BASE_DELAY_MS * Math.pow(2, attempt - 1));
3123
+ }
3124
+ }
3125
+ return null;
3126
+ }
3127
+
3084
3128
  /**
3085
3129
  * Create and start the web dashboard, with coordinator support.
3086
3130
  * @param {boolean} openBrowser - Whether to auto-open the browser
@@ -3104,20 +3148,42 @@ async function startWebDashboard(openBrowser) {
3104
3148
  if (url) webDashboard.setRepoWebUrl(url);
3105
3149
  }).catch(() => {});
3106
3150
 
3107
- // Check if a coordinator is already running
3108
- const existing = getActiveCoordinator();
3151
+ // Atomically try to claim the coordinator role. If another live instance
3152
+ // already owns the lock, connect as a worker instead. This prevents a
3153
+ // TOCTOU race where two instances both pass a "no coordinator" check and
3154
+ // then clobber each other's socket in Coordinator.start().
3155
+ //
3156
+ // The outer loop runs at most twice so we can reclaim the coordinator
3157
+ // role if the existing coordinator dies while we're retrying the worker
3158
+ // handshake (e.g. it crashed just before we attached). Without this, a
3159
+ // transient connect failure (peer not yet accepting, EPIPE, slow fork)
3160
+ // against a coordinator that later crashes would leave us with no web
3161
+ // dashboard even though we could safely take over.
3162
+ let acquired = false;
3163
+ let existing = null;
3164
+ for (let outer = 0; outer < 2 && !acquired; outer++) {
3165
+ const lockResult = tryAcquireLock(process.pid);
3166
+ if (lockResult.acquired) {
3167
+ acquired = true;
3168
+ break;
3169
+ }
3109
3170
 
3110
- if (existing) {
3111
- // Connect as a worker to the existing coordinator
3112
- try {
3113
- worker = new Worker({
3114
- id: projectId,
3115
- projectPath: PROJECT_ROOT,
3116
- projectName: path.basename(PROJECT_ROOT),
3117
- socketPath: existing.socketPath,
3118
- });
3119
- worker.onCommand = (action, payload) => handleWebAction(action, payload);
3120
- await worker.connect();
3171
+ existing = lockResult.existing || getActiveCoordinator();
3172
+ if (!existing) {
3173
+ // Lock exists but we couldn't claim it and couldn't read the owner.
3174
+ // Bail out rather than race a concurrent startup.
3175
+ addLog('Web dashboard unavailable: could not acquire coordinator lock', 'error');
3176
+ webDashboard = null;
3177
+ render();
3178
+ return;
3179
+ }
3180
+
3181
+ // Try to connect as a worker with bounded retry + exponential backoff.
3182
+ // The coordinator may still hold only a placeholder lock (finalizeLock()
3183
+ // has not yet written the real socket path), or be temporarily unresponsive.
3184
+ const connectedWorker = await connectWorkerWithRetry(existing, projectId);
3185
+ if (connectedWorker) {
3186
+ worker = connectedWorker;
3121
3187
  addLog(`Joined web dashboard at ${localhostUrl(existing.port)} (tab)`, 'success');
3122
3188
 
3123
3189
  // Push state periodically
@@ -3135,13 +3201,35 @@ async function startWebDashboard(openBrowser) {
3135
3201
  WEB_PORT = existing.port;
3136
3202
  render();
3137
3203
  return;
3138
- } catch (err) {
3139
- // Couldn't connect — become coordinator instead
3140
- worker = null;
3141
3204
  }
3205
+
3206
+ // Every connect attempt failed. If the coordinator process died while
3207
+ // we were retrying, clean up the stale lock/socket and loop once to
3208
+ // claim the coordinator role ourselves. Otherwise abort — do NOT take
3209
+ // over a live coordinator's socket.
3210
+ if (!isProcessAlive(existing.pid)) {
3211
+ removeLock();
3212
+ removeSocket();
3213
+ continue;
3214
+ }
3215
+
3216
+ addLog(`Could not join web dashboard at ${localhostUrl(existing.port)}: coordinator unreachable`, 'error');
3217
+ webDashboard = null;
3218
+ render();
3219
+ return;
3142
3220
  }
3143
3221
 
3144
- // We are the coordinator
3222
+ if (!acquired) {
3223
+ addLog('Web dashboard unavailable: could not acquire coordinator lock after retry', 'error');
3224
+ webDashboard = null;
3225
+ render();
3226
+ return;
3227
+ }
3228
+
3229
+ // We hold the lock — it is now safe to remove any leftover socket and
3230
+ // start listening. The lock file contains a placeholder pid-only entry
3231
+ // until finalizeLock() writes the real port/socketPath after a successful
3232
+ // bind.
3145
3233
  try {
3146
3234
  coordinator = new Coordinator();
3147
3235
  coordinator.onProjectsChanged = (projects) => {
@@ -3155,7 +3243,12 @@ async function startWebDashboard(openBrowser) {
3155
3243
  await coordinator.start();
3156
3244
  coordinator.registerLocal(projectId, PROJECT_ROOT, path.basename(PROJECT_ROOT), webDashboard.getSerializableState());
3157
3245
 
3158
- // Update coordinator with our latest state periodically
3246
+ const { port } = await webDashboard.start();
3247
+ WEB_PORT = port;
3248
+ finalizeLock(process.pid, port, coordinator.socketPath);
3249
+
3250
+ // Update coordinator with our latest state periodically. Started only
3251
+ // after a successful bind so a failed start doesn't leak an interval.
3159
3252
  webStateInterval = setInterval(() => {
3160
3253
  if (coordinator && webDashboard) {
3161
3254
  coordinator.updateLocal(projectId, webDashboard.getSerializableState());
@@ -3165,15 +3258,16 @@ async function startWebDashboard(openBrowser) {
3165
3258
  }
3166
3259
  }, 500);
3167
3260
 
3168
- const { port } = await webDashboard.start();
3169
- WEB_PORT = port;
3170
- writeLock(process.pid, port, coordinator.socketPath);
3171
-
3172
3261
  addLog(`Web dashboard: ${localhostUrl(port)}`, 'success');
3173
3262
  if (openBrowser) openInBrowser(localhostUrl(port));
3174
3263
  render();
3175
3264
  } catch (err) {
3176
3265
  addLog(`Web dashboard failed: ${err.message}`, 'error');
3266
+ if (coordinator) {
3267
+ try { coordinator.stop(); } catch (_) { /* ignore */ }
3268
+ }
3269
+ removeLock();
3270
+ removeSocket();
3177
3271
  webDashboard = null;
3178
3272
  coordinator = null;
3179
3273
  render();
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "git-watchtower",
3
- "version": "1.12.2",
3
+ "version": "1.12.4",
4
4
  "description": "Terminal-based Git branch monitor with activity sparklines and optional dev server with live reload",
5
5
  "main": "bin/git-watchtower.js",
6
6
  "bin": {
@@ -68,13 +68,18 @@ function isProcessAlive(pid) {
68
68
 
69
69
  /**
70
70
  * Read the lock file.
71
- * @returns {{ pid: number, port: number, socketPath: string } | null}
71
+ *
72
+ * A lock may be a placeholder (pid only, no port/socketPath) while a new
73
+ * coordinator is still binding its socket. Callers that need a connectable
74
+ * coordinator should use getActiveCoordinator(), which rejects placeholders.
75
+ *
76
+ * @returns {{ pid: number, port?: number, socketPath?: string, pending?: boolean } | null}
72
77
  */
73
78
  function readLock() {
74
79
  try {
75
80
  if (!fs.existsSync(LOCK_FILE)) return null;
76
81
  const data = JSON.parse(fs.readFileSync(LOCK_FILE, 'utf8'));
77
- if (!data || !data.pid || !data.port) return null;
82
+ if (!data || !data.pid) return null;
78
83
  return data;
79
84
  } catch (e) {
80
85
  return null;
@@ -92,6 +97,66 @@ function writeLock(pid, port, socketPath) {
92
97
  fs.writeFileSync(LOCK_FILE, JSON.stringify({ pid, port, socketPath }, null, 2) + '\n', 'utf8');
93
98
  }
94
99
 
100
+ /**
101
+ * Atomically reserve the coordinator lock.
102
+ *
103
+ * Uses `fs.openSync(..., 'wx')` to create the lock file exclusively, so two
104
+ * instances racing to become coordinator cannot both succeed. A placeholder
105
+ * entry ({ pid, pending: true }) is written immediately so that any process
106
+ * reading the lock while we bind our socket still sees a valid owning PID.
107
+ *
108
+ * If the lock already exists but the owning process is dead, the stale lock
109
+ * (and socket) are cleaned up and the acquisition is retried once.
110
+ *
111
+ * @param {number} pid - PID of the acquiring process
112
+ * @returns {{acquired: true} | {acquired: false, existing: {pid: number, port?: number, socketPath?: string, pending?: boolean} | null}}
113
+ */
114
+ function tryAcquireLock(pid) {
115
+ ensureDir();
116
+
117
+ // One retry after stale-lock cleanup; avoids looping if another process
118
+ // keeps recreating the lock faster than we can clean it up.
119
+ for (let attempt = 0; attempt < 2; attempt++) {
120
+ try {
121
+ const fd = fs.openSync(LOCK_FILE, 'wx');
122
+ try {
123
+ fs.writeSync(fd, JSON.stringify({ pid, pending: true }) + '\n');
124
+ } finally {
125
+ fs.closeSync(fd);
126
+ }
127
+ return { acquired: true };
128
+ } catch (err) {
129
+ if (err.code !== 'EEXIST') throw err;
130
+
131
+ // Lock file exists — check if the owner is alive.
132
+ const existing = readLock();
133
+ if (existing && isProcessAlive(existing.pid)) {
134
+ return { acquired: false, existing };
135
+ }
136
+ // Stale or unreadable — clean up and retry the exclusive create.
137
+ removeLock();
138
+ removeSocket();
139
+ }
140
+ }
141
+
142
+ // Another process raced us to re-create the lock. Treat it as active.
143
+ const existing = readLock();
144
+ return { acquired: false, existing: existing || null };
145
+ }
146
+
147
+ /**
148
+ * Replace the placeholder lock with the final port/socket details after the
149
+ * coordinator has successfully bound its IPC socket and the web server has
150
+ * started listening. Caller must already own the lock via tryAcquireLock().
151
+ *
152
+ * @param {number} pid
153
+ * @param {number} port
154
+ * @param {string} socketPath
155
+ */
156
+ function finalizeLock(pid, port, socketPath) {
157
+ writeLock(pid, port, socketPath);
158
+ }
159
+
95
160
  /**
96
161
  * Remove the lock file.
97
162
  */
@@ -107,18 +172,25 @@ function removeSocket() {
107
172
  }
108
173
 
109
174
  /**
110
- * Check if a coordinator is already running.
111
- * Cleans up stale lock if the process is dead.
175
+ * Check if a coordinator is already running and reachable.
176
+ *
177
+ * Returns null for stale locks (cleans them up) and for placeholder locks
178
+ * whose owning coordinator hasn't finished binding yet — callers shouldn't
179
+ * try to connect to a coordinator that isn't listening.
180
+ *
112
181
  * @returns {{ pid: number, port: number, socketPath: string } | null}
113
182
  */
114
183
  function getActiveCoordinator() {
115
184
  const lock = readLock();
116
185
  if (!lock) return null;
117
- if (isProcessAlive(lock.pid)) return lock;
118
- // Stale lock — clean up
119
- removeLock();
120
- removeSocket();
121
- return null;
186
+ if (!isProcessAlive(lock.pid)) {
187
+ removeLock();
188
+ removeSocket();
189
+ return null;
190
+ }
191
+ // Placeholder (pending) — coordinator is still binding.
192
+ if (!lock.port || !lock.socketPath) return null;
193
+ return /** @type {{pid:number,port:number,socketPath:string}} */ (lock);
122
194
  }
123
195
 
124
196
  // ─── Coordinator (first instance) ────────────────────────────────
@@ -533,6 +605,8 @@ module.exports = {
533
605
  getActiveCoordinator,
534
606
  readLock,
535
607
  writeLock,
608
+ tryAcquireLock,
609
+ finalizeLock,
536
610
  removeLock,
537
611
  removeSocket,
538
612
  isProcessAlive,