@mndrk/agx 1.4.23 → 1.4.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/storage/locks.js +179 -24
- package/package.json +2 -1
package/lib/storage/locks.js
CHANGED
|
@@ -2,7 +2,12 @@
|
|
|
2
2
|
* Task-level file locking for agx local state storage.
|
|
3
3
|
*
|
|
4
4
|
* Prevents concurrent execution of the same task.
|
|
5
|
-
* Lock file contains: { pid, at, host }
|
|
5
|
+
* Lock file contains: { pid, at, host, startedAt }
|
|
6
|
+
*
|
|
7
|
+
* Lock validity checks:
|
|
8
|
+
* 1. Process must be alive (kill(pid, 0) succeeds)
|
|
9
|
+
* 2. Process start time must match (guards against PID reuse)
|
|
10
|
+
* 3. Lock must not be stale (default: 5 minutes)
|
|
6
11
|
*/
|
|
7
12
|
|
|
8
13
|
const fs = require('fs');
|
|
@@ -21,6 +26,9 @@ const LOCK_STALE_MS = process.env.AGX_LOCK_STALE_MS
|
|
|
21
26
|
? parseInt(process.env.AGX_LOCK_STALE_MS, 10)
|
|
22
27
|
: DEFAULT_LOCK_STALE_MS;
|
|
23
28
|
|
|
29
|
+
// Time this module was loaded (an approximation of process start) - written into each lock to detect PID reuse
|
|
30
|
+
const PROCESS_STARTED_AT = Date.now();
|
|
31
|
+
|
|
24
32
|
// ============================================================
|
|
25
33
|
// Lock Management
|
|
26
34
|
// ============================================================
|
|
@@ -30,6 +38,7 @@ const LOCK_STALE_MS = process.env.AGX_LOCK_STALE_MS
|
|
|
30
38
|
* @property {string} lockPath - Path to the lock file
|
|
31
39
|
* @property {number} pid - Process ID that holds the lock
|
|
32
40
|
* @property {string} at - ISO timestamp when lock was acquired
|
|
41
|
+
* @property {number} startedAt - Process start timestamp (ms since epoch)
|
|
33
42
|
* @property {boolean} released - Whether the lock has been released
|
|
34
43
|
*/
|
|
35
44
|
|
|
@@ -38,8 +47,27 @@ const LOCK_STALE_MS = process.env.AGX_LOCK_STALE_MS
|
|
|
38
47
|
* @property {number} pid
|
|
39
48
|
* @property {string} at
|
|
40
49
|
* @property {string} host
|
|
50
|
+
* @property {number} [startedAt] - Process start timestamp (ms since epoch)
|
|
41
51
|
*/
|
|
42
52
|
|
|
53
|
+
/**
 * Check whether a lock was written by this exact process instance.
 *
 * A matching PID alone is not sufficient: the operating system may hand the
 * PID of a dead process to a new one. The lock's `startedAt` must therefore
 * also equal this module's PROCESS_STARTED_AT.
 *
 * @param {LockPayload|null|undefined} lock - Parsed lock file contents
 * @returns {boolean} true only when both `pid` and `startedAt` match this process
 */
function isCurrentProcessLock(lock) {
  // A missing payload cannot belong to anyone; avoid a TypeError on `lock.pid`.
  if (lock == null) {
    return false;
  }

  if (lock.pid !== process.pid) {
    return false;
  }

  // Legacy locks have no numeric startedAt - be conservative and assume
  // the lock is not ours.
  if (typeof lock.startedAt !== 'number') {
    return false;
  }

  // Same PID: it is ours only if it was written during this process's lifetime.
  return lock.startedAt === PROCESS_STARTED_AT;
}
|
|
70
|
+
|
|
43
71
|
/**
|
|
44
72
|
* Attempt to acquire a task lock.
|
|
45
73
|
* @param {string} taskRootPath - Path to the task directory
|
|
@@ -56,18 +84,44 @@ async function acquireTaskLock(taskRootPath, options = {}) {
|
|
|
56
84
|
const existingLock = await readJsonSafe(lockPath);
|
|
57
85
|
|
|
58
86
|
if (existingLock && !force) {
|
|
59
|
-
//
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
87
|
+
// First check: is this our own lock from a previous run that wasn't released?
|
|
88
|
+
// This handles the case where the same process crashed/restarted mid-task.
|
|
89
|
+
if (existingLock.pid === process.pid) {
|
|
90
|
+
// Same PID - check if it's truly from this process instance
|
|
91
|
+
if (!isCurrentProcessLock(existingLock)) {
|
|
92
|
+
// Lock is from a previous instance of this PID (PID was reused)
|
|
93
|
+
// or from before this process started - safe to take over
|
|
94
|
+
// This is the key fix: we clean up our own stale locks
|
|
95
|
+
} else {
|
|
96
|
+
// Lock is from current process instance - we already hold it
|
|
97
|
+
// This shouldn't happen in normal operation, but return the existing lock
|
|
98
|
+
// Actually, this would be a programming error, so let's throw
|
|
99
|
+
throw new Error(
|
|
100
|
+
`Lock already held by this process instance. ` +
|
|
101
|
+
`This indicates a logic error - lock should be released before re-acquiring.`
|
|
102
|
+
);
|
|
103
|
+
}
|
|
104
|
+
} else {
|
|
105
|
+
// Different PID - check if the process is still alive and lock is fresh
|
|
106
|
+
const isAlive = isProcessAlive(existingLock.pid);
|
|
107
|
+
const isStale = isLockStale(existingLock);
|
|
108
|
+
|
|
109
|
+
if (isAlive && !isStale) {
|
|
110
|
+
// Also check process start time if available
|
|
111
|
+
const startTimeValid = !existingLock.startedAt ||
|
|
112
|
+
isProcessStartTimeValid(existingLock.pid, existingLock.startedAt);
|
|
113
|
+
|
|
114
|
+
if (startTimeValid) {
|
|
115
|
+
throw new Error(
|
|
116
|
+
`Task is locked by process ${existingLock.pid} since ${existingLock.at}. ` +
|
|
117
|
+
`Use --force to override if you're sure the lock is stale.`
|
|
118
|
+
);
|
|
119
|
+
}
|
|
120
|
+
// Process has same PID but different start time (PID reuse) - lock is stale
|
|
121
|
+
}
|
|
69
122
|
|
|
70
|
-
|
|
123
|
+
// Lock is stale or process is dead - we can take it
|
|
124
|
+
}
|
|
71
125
|
}
|
|
72
126
|
|
|
73
127
|
const now = new Date().toISOString();
|
|
@@ -78,6 +132,7 @@ async function acquireTaskLock(taskRootPath, options = {}) {
|
|
|
78
132
|
pid,
|
|
79
133
|
at: now,
|
|
80
134
|
host: os.hostname(),
|
|
135
|
+
startedAt: PROCESS_STARTED_AT,
|
|
81
136
|
};
|
|
82
137
|
|
|
83
138
|
// Ensure task directory exists
|
|
@@ -97,6 +152,7 @@ async function acquireTaskLock(taskRootPath, options = {}) {
|
|
|
97
152
|
lockPath,
|
|
98
153
|
pid,
|
|
99
154
|
at: now,
|
|
155
|
+
startedAt: PROCESS_STARTED_AT,
|
|
100
156
|
released: false,
|
|
101
157
|
};
|
|
102
158
|
|
|
@@ -114,23 +170,51 @@ async function releaseTaskLock(handle) {
|
|
|
114
170
|
return;
|
|
115
171
|
}
|
|
116
172
|
|
|
173
|
+
let deleted = false;
|
|
174
|
+
let ownershipVerified = false;
|
|
175
|
+
|
|
117
176
|
try {
|
|
118
177
|
// Verify we still own the lock before removing
|
|
119
178
|
const current = await readJsonSafe(handle.lockPath);
|
|
120
179
|
|
|
121
180
|
if (current && current.pid === handle.pid && current.at === handle.at) {
|
|
181
|
+
ownershipVerified = true;
|
|
122
182
|
await fs.promises.unlink(handle.lockPath);
|
|
183
|
+
deleted = true;
|
|
184
|
+
} else if (!current) {
|
|
185
|
+
// Lock file doesn't exist - already released or never created
|
|
186
|
+
deleted = true;
|
|
123
187
|
}
|
|
124
|
-
// If someone else took the lock, don't remove it
|
|
188
|
+
// If someone else took the lock, don't remove it (deleted stays false)
|
|
125
189
|
|
|
126
190
|
} catch (err) {
|
|
127
|
-
if (err.code
|
|
128
|
-
//
|
|
191
|
+
if (err.code === 'ENOENT') {
|
|
192
|
+
// File doesn't exist - that's fine
|
|
193
|
+
deleted = true;
|
|
194
|
+
} else {
|
|
195
|
+
// Log the error but continue - we'll try a forceful cleanup below
|
|
129
196
|
console.error(`Warning: Failed to release lock ${handle.lockPath}:`, err.message);
|
|
197
|
+
|
|
198
|
+
// If we verified ownership but failed to delete, try once more
|
|
199
|
+
if (ownershipVerified) {
|
|
200
|
+
try {
|
|
201
|
+
await fs.promises.unlink(handle.lockPath);
|
|
202
|
+
deleted = true;
|
|
203
|
+
} catch (retryErr) {
|
|
204
|
+
if (retryErr.code === 'ENOENT') {
|
|
205
|
+
deleted = true; // Succeeded (file was removed between attempts)
|
|
206
|
+
} else {
|
|
207
|
+
console.error(`Warning: Retry failed for lock ${handle.lockPath}:`, retryErr.message);
|
|
208
|
+
}
|
|
209
|
+
}
|
|
210
|
+
}
|
|
130
211
|
}
|
|
131
212
|
}
|
|
132
213
|
|
|
133
214
|
handle.released = true;
|
|
215
|
+
|
|
216
|
+
// Return status for debugging (callers typically ignore this)
|
|
217
|
+
return { deleted, ownershipVerified };
|
|
134
218
|
}
|
|
135
219
|
|
|
136
220
|
/**
|
|
@@ -146,12 +230,10 @@ async function checkTaskLock(taskRootPath) {
|
|
|
146
230
|
return null;
|
|
147
231
|
}
|
|
148
232
|
|
|
149
|
-
// Check if
|
|
150
|
-
const
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
if (!isAlive || isStale) {
|
|
154
|
-
return null; // Lock is stale
|
|
233
|
+
// Check if lock is valid
|
|
234
|
+
const { valid } = isLockValid(lock);
|
|
235
|
+
if (!valid) {
|
|
236
|
+
return null; // Lock is stale or invalid
|
|
155
237
|
}
|
|
156
238
|
|
|
157
239
|
return lock;
|
|
@@ -170,10 +252,9 @@ async function cleanStaleLock(taskRootPath) {
|
|
|
170
252
|
return false;
|
|
171
253
|
}
|
|
172
254
|
|
|
173
|
-
const
|
|
174
|
-
const isStale = isLockStale(lock);
|
|
255
|
+
const { valid, reason } = isLockValid(lock);
|
|
175
256
|
|
|
176
|
-
if (!
|
|
257
|
+
if (!valid) {
|
|
177
258
|
try {
|
|
178
259
|
await fs.promises.unlink(lockPath);
|
|
179
260
|
return true;
|
|
@@ -181,6 +262,8 @@ async function cleanStaleLock(taskRootPath) {
|
|
|
181
262
|
if (err.code !== 'ENOENT') {
|
|
182
263
|
throw err;
|
|
183
264
|
}
|
|
265
|
+
// File already deleted - that's fine
|
|
266
|
+
return true;
|
|
184
267
|
}
|
|
185
268
|
}
|
|
186
269
|
|
|
@@ -206,6 +289,40 @@ function isProcessAlive(pid) {
|
|
|
206
289
|
}
|
|
207
290
|
}
|
|
208
291
|
|
|
292
|
+
/**
 * Sanity-check the process start time recorded in a lock.
 *
 * This is a heuristic, not proof of identity. The start time of another
 * process cannot be queried here, so for foreign PIDs only implausible
 * values are rejected:
 *   - a non-finite, zero or negative timestamp (corrupt lock data)
 *   - a timestamp more than one minute in the future (beyond clock skew)
 * For our own PID the value must equal PROCESS_STARTED_AT exactly.
 *
 * @param {number} pid - PID recorded in the lock
 * @param {number} startedAt - Process start time from lock (ms since epoch)
 * @returns {boolean} false when the start time shows the lock cannot belong
 *   to the process currently holding `pid`; true otherwise
 */
function isProcessStartTimeValid(pid, startedAt) {
  // Tolerated difference between the clock that wrote the lock and ours.
  const CLOCK_SKEW_ALLOWANCE_MS = 60000;

  if (typeof startedAt !== 'number') {
    // No start time recorded - be conservative and assume valid
    return true;
  }

  // NaN, Infinity, zero and negative values can never be a real start time.
  if (!Number.isFinite(startedAt) || startedAt <= 0) {
    return false;
  }

  // A start time in the future (beyond clock skew) is suspicious.
  if (startedAt > Date.now() + CLOCK_SKEW_ALLOWANCE_MS) {
    return false;
  }

  // If the lock holder is our own PID, the start time must match ours exactly.
  if (pid === process.pid) {
    return startedAt === PROCESS_STARTED_AT;
  }

  // For other processes the start time cannot be verified from here;
  // anything that passed the plausibility checks above is accepted.
  return true;
}
|
|
325
|
+
|
|
209
326
|
/**
|
|
210
327
|
* Check if a lock is stale based on timestamp.
|
|
211
328
|
* @param {LockPayload} lock
|
|
@@ -221,14 +338,52 @@ function isLockStale(lock) {
|
|
|
221
338
|
}
|
|
222
339
|
}
|
|
223
340
|
|
|
341
|
+
/**
 * Determine if a lock is valid (should block acquisition).
 *
 * Checks run in order and the first failure decides the reason:
 *   'no lock'        - no lock payload was supplied
 *   'invalid pid'    - recorded pid is not a positive integer (corrupt lock)
 *   'process dead'   - isProcessAlive() reports the holder is gone
 *   'lock stale'     - isLockStale() reports the lock is too old
 *   'pid reused'     - recorded start time fails isProcessStartTimeValid()
 *   'own stale lock' - lock carries our PID but is not from this instance
 *   'valid'          - every check passed
 *
 * @param {LockPayload} lock
 * @returns {{ valid: boolean, reason: string }}
 */
function isLockValid(lock) {
  if (!lock) {
    return { valid: false, reason: 'no lock' };
  }

  const { pid, startedAt } = lock;

  // Reject corrupt pids before probing liveness. Per this module's header the
  // probe is kill(pid, 0); on POSIX a pid of 0 or a negative pid targets a
  // process group rather than one process, so it could wrongly look "alive".
  if (!Number.isInteger(pid) || pid <= 0) {
    return { valid: false, reason: 'invalid pid' };
  }

  // The holder must still be running.
  if (!isProcessAlive(pid)) {
    return { valid: false, reason: 'process dead' };
  }

  // The lock must not have outlived the staleness window.
  if (isLockStale(lock)) {
    return { valid: false, reason: 'lock stale' };
  }

  // Check the start time only when one was recorded as a number, matching
  // the typeof checks used by the other helpers in this module.
  if (typeof startedAt === 'number' && !isProcessStartTimeValid(pid, startedAt)) {
    return { valid: false, reason: 'pid reused' };
  }

  // Our PID but not this process instance: left over from a previous run.
  if (pid === process.pid && !isCurrentProcessLock(lock)) {
    return { valid: false, reason: 'own stale lock' };
  }

  return { valid: true, reason: 'valid' };
}
|
|
374
|
+
|
|
224
375
|
// ============================================================
|
|
225
376
|
// Exports
|
|
226
377
|
// ============================================================
|
|
227
378
|
|
|
228
379
|
module.exports = {
|
|
229
380
|
LOCK_STALE_MS,
|
|
381
|
+
PROCESS_STARTED_AT,
|
|
230
382
|
acquireTaskLock,
|
|
231
383
|
releaseTaskLock,
|
|
232
384
|
checkTaskLock,
|
|
233
385
|
cleanStaleLock,
|
|
386
|
+
isProcessAlive,
|
|
387
|
+
isLockStale,
|
|
388
|
+
isLockValid,
|
|
234
389
|
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mndrk/agx",
|
|
3
|
-
"version": "1.4.
|
|
3
|
+
"version": "1.4.25",
|
|
4
4
|
"description": "Autonomous AI Agent Orchestrator for Claude, Gemini, and Ollama",
|
|
5
5
|
"main": "lib/index.js",
|
|
6
6
|
"exports": {
|
|
@@ -19,6 +19,7 @@
|
|
|
19
19
|
"cloud-runtime/**"
|
|
20
20
|
],
|
|
21
21
|
"scripts": {
|
|
22
|
+
"postinstall": "node ./scripts/postinstall.js || true",
|
|
22
23
|
"prepack": "node ./scripts/package-board-runtime.js",
|
|
23
24
|
"board:bundle": "node ./scripts/package-board-runtime.js",
|
|
24
25
|
"test": "node --localstorage-file=/tmp/agx-jest-localstorage ./node_modules/jest/bin/jest.js --forceExit",
|