@mndrk/agx 1.4.24 → 1.4.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.js CHANGED
@@ -6378,18 +6378,21 @@ async function checkOnboarding() {
6378
6378
  process.exit(0);
6379
6379
  }
6380
6380
 
6381
- // agx retry <taskId> [--task <id>] [--swarm]
6381
+ // agx retry <taskId> [--task <id>] [--swarm] [--async]
6382
6382
  if (cmd === 'retry' || (cmd === 'task' && args[1] === 'retry')) {
6383
6383
  const runArgs = cmd === 'task' ? args.slice(1) : args;
6384
6384
  retryFlowActive = true;
6385
6385
  logExecutionFlow('retry command', 'input', `cmd=${cmd}, args=${runArgs.slice(1).join(' ')}`);
6386
6386
  let taskId = null;
6387
6387
  let forceSwarm = false;
6388
+ let asyncMode = false;
6388
6389
  for (let i = 1; i < runArgs.length; i++) {
6389
6390
  if (runArgs[i] === '--task' || runArgs[i] === '-t') {
6390
6391
  taskId = runArgs[++i];
6391
6392
  } else if (runArgs[i] === '--swarm') {
6392
6393
  forceSwarm = true;
6394
+ } else if (runArgs[i] === '--async' || runArgs[i] === '-a') {
6395
+ asyncMode = true;
6393
6396
  }
6394
6397
  }
6395
6398
  if (!taskId) {
@@ -6397,12 +6400,26 @@ async function checkOnboarding() {
6397
6400
  }
6398
6401
  if (!taskId) {
6399
6402
  logExecutionFlow('retry command', 'output', 'missing task id');
6400
- console.log(`${c.yellow}Usage:${c.reset} agx retry <taskId> [--task <id>] [--swarm]`);
6401
- console.log(`${c.dim} or:${c.reset} agx task retry <taskId> [--task <id>] [--swarm]`);
6403
+ console.log(`${c.yellow}Usage:${c.reset} agx retry <taskId> [--task <id>] [--swarm] [--async]`);
6404
+ console.log(`${c.dim} or:${c.reset} agx task retry <taskId> [--task <id>] [--swarm] [--async]`);
6405
+ console.log(`${c.dim}--async: Reset status and let daemon handle (non-blocking)${c.reset}`);
6402
6406
  process.exit(1);
6403
6407
  }
6404
6408
 
6405
6409
  try {
6410
+ // Async mode: just reset task status, daemon will pick it up
6411
+ if (asyncMode) {
6412
+ const resolvedId = await resolveTaskId(taskId);
6413
+ await cloudRequest('PATCH', `/api/tasks/${resolvedId}`, {
6414
+ status: 'queued',
6415
+ started_at: null,
6416
+ completed_at: null,
6417
+ });
6418
+ console.log(`${c.green}✓${c.reset} Task ${resolvedId.slice(0, 8)} queued for retry`);
6419
+ console.log(`${c.dim}Daemon will pick it up shortly${c.reset}`);
6420
+ process.exit(0);
6421
+ }
6422
+
6406
6423
  const exitCode = await runTaskInline(taskId, { resetFirst: true, forceSwarm });
6407
6424
  process.exit(exitCode);
6408
6425
  } catch (err) {
@@ -7421,7 +7438,7 @@ PROVIDERS:
7421
7438
  CLOUD:
7422
7439
  agx new "<task>" Create task in cloud
7423
7440
  agx run <id|slug|#> Claim and run a task
7424
- agx retry <id|slug|#> Reset + retry a task
7441
+ agx retry <id|slug|#> Reset + retry a task (--async for non-blocking)
7425
7442
  agx status Show cloud status
7426
7443
  agx complete <taskId> Mark task stage complete
7427
7444
  agx project assign <project> --task <task> Assign task to project
package/lib/cli/runCli.js CHANGED
@@ -1485,18 +1485,21 @@ async function checkOnboarding() {
1485
1485
  process.exit(0);
1486
1486
  }
1487
1487
 
1488
- // agx retry <taskId> [--task <id>] [--swarm]
1488
+ // agx retry <taskId> [--task <id>] [--swarm] [--async]
1489
1489
  if (cmd === 'retry' || (cmd === 'task' && args[1] === 'retry')) {
1490
1490
  const runArgs = cmd === 'task' ? args.slice(1) : args;
1491
1491
  retryFlowActive = true;
1492
1492
  logExecutionFlow('retry command', 'input', `cmd=${cmd}, args=${runArgs.slice(1).join(' ')}`);
1493
1493
  let taskId = null;
1494
1494
  let forceSwarm = false;
1495
+ let asyncMode = false;
1495
1496
  for (let i = 1; i < runArgs.length; i++) {
1496
1497
  if (runArgs[i] === '--task' || runArgs[i] === '-t') {
1497
1498
  taskId = runArgs[++i];
1498
1499
  } else if (runArgs[i] === '--swarm') {
1499
1500
  forceSwarm = true;
1501
+ } else if (runArgs[i] === '--async' || runArgs[i] === '-a') {
1502
+ asyncMode = true;
1500
1503
  }
1501
1504
  }
1502
1505
  if (!taskId) {
@@ -1504,12 +1507,27 @@ async function checkOnboarding() {
1504
1507
  }
1505
1508
  if (!taskId) {
1506
1509
  logExecutionFlow('retry command', 'output', 'missing task id');
1507
- console.log(`${c.yellow}Usage:${c.reset} agx retry <taskId> [--task <id>] [--swarm]`);
1508
- console.log(`${c.dim} or:${c.reset} agx task retry <taskId> [--task <id>] [--swarm]`);
1510
+ console.log(`${c.yellow}Usage:${c.reset} agx retry <taskId> [--task <id>] [--swarm] [--async]`);
1511
+ console.log(`${c.dim} or:${c.reset} agx task retry <taskId> [--task <id>] [--swarm] [--async]`);
1512
+ console.log(`${c.dim}--async: Reset status and let daemon handle (non-blocking)${c.reset}`);
1509
1513
  process.exit(1);
1510
1514
  }
1511
1515
 
1512
1516
  try {
1517
+ // Async mode: just reset task status, daemon will pick it up
1518
+ logExecutionFlow('retry command', 'processing', `asyncMode=${asyncMode}, taskId=${taskId}`);
1519
+ if (asyncMode) {
1520
+ const resolvedId = await resolveTaskId(taskId);
1521
+ await cloudRequest('PATCH', `/api/tasks/${resolvedId}`, {
1522
+ status: 'queued',
1523
+ started_at: null,
1524
+ completed_at: null,
1525
+ });
1526
+ console.log(`${c.green}✓${c.reset} Task ${resolvedId.slice(0, 8)} queued for retry`);
1527
+ console.log(`${c.dim}Daemon will pick it up shortly${c.reset}`);
1528
+ process.exit(0);
1529
+ }
1530
+
1513
1531
  const exitCode = await runTaskInline(taskId, { resetFirst: true, forceSwarm });
1514
1532
  process.exit(exitCode);
1515
1533
  } catch (err) {
@@ -2500,7 +2518,7 @@ PROVIDERS:
2500
2518
  CLOUD:
2501
2519
  agx new "<task>" Create task in cloud
2502
2520
  agx run <id|slug|#> Claim and run a task
2503
- agx retry <id|slug|#> Reset + retry a task
2521
+ agx retry <id|slug|#> Reset + retry a task (--async for non-blocking)
2504
2522
  agx status Show cloud status
2505
2523
  agx complete <taskId> Mark task stage complete
2506
2524
  agx project assign <project> --task <task> Assign task to project
@@ -2,7 +2,12 @@
2
2
  * Task-level file locking for agx local state storage.
3
3
  *
4
4
  * Prevents concurrent execution of the same task.
5
- * Lock file contains: { pid, at, host }
5
+ * Lock file contains: { pid, at, host, startedAt }
6
+ *
7
+ * Lock validity checks:
8
+ * 1. Process must be alive (kill(pid, 0) succeeds)
9
+ * 2. Recorded process start time must be plausible; an exact match is only enforced for locks carrying this process's own PID (guards against PID reuse)
10
+ * 3. Lock must not be stale (default: 5 minutes)
6
11
  */
7
12
 
8
13
  const fs = require('fs');
@@ -21,6 +26,9 @@ const LOCK_STALE_MS = process.env.AGX_LOCK_STALE_MS
21
26
  ? parseInt(process.env.AGX_LOCK_STALE_MS, 10)
22
27
  : DEFAULT_LOCK_STALE_MS;
23
28
 
29
+ // Timestamp taken when this module is evaluated; stands in for the process start time and is used to detect PID reuse
30
+ const PROCESS_STARTED_AT = Date.now();
31
+
24
32
  // ============================================================
25
33
  // Lock Management
26
34
  // ============================================================
@@ -30,6 +38,7 @@ const LOCK_STALE_MS = process.env.AGX_LOCK_STALE_MS
30
38
  * @property {string} lockPath - Path to the lock file
31
39
  * @property {number} pid - Process ID that holds the lock
32
40
  * @property {string} at - ISO timestamp when lock was acquired
41
+ * @property {number} startedAt - Process start timestamp (ms since epoch)
33
42
  * @property {boolean} released - Whether the lock has been released
34
43
  */
35
44
 
@@ -38,8 +47,27 @@ const LOCK_STALE_MS = process.env.AGX_LOCK_STALE_MS
38
47
  * @property {number} pid
39
48
  * @property {string} at
40
49
  * @property {string} host
50
+ * @property {number} [startedAt] - Process start timestamp (ms since epoch)
41
51
  */
42
52
 
53
+ /**
54
+ * Check if a lock was written by this running process instance.
55
+ * True only when lock.pid equals process.pid AND lock.startedAt is a number equal to PROCESS_STARTED_AT (a Date.now() value), which guards against PID reuse.
56
+ * @param {LockPayload} lock - Parsed lock payload; must be non-null (lock.pid is read unconditionally)
57
+ * @returns {boolean} true if this process instance wrote the lock, false otherwise
58
+ */
59
+ function isCurrentProcessLock(lock) {
60
+ if (lock.pid !== process.pid) {
61
+ return false;
62
+ }
63
+ // A numeric startedAt must be exactly the timestamp this process recorded
64
+ if (typeof lock.startedAt === 'number') {
65
+ return lock.startedAt === PROCESS_STARTED_AT;
66
+ }
67
+ // Lock without a numeric startedAt (legacy format) - be conservative, assume it is not ours
68
+ return false;
69
+ }
70
+
43
71
  /**
44
72
  * Attempt to acquire a task lock.
45
73
  * @param {string} taskRootPath - Path to the task directory
@@ -56,18 +84,44 @@ async function acquireTaskLock(taskRootPath, options = {}) {
56
84
  const existingLock = await readJsonSafe(lockPath);
57
85
 
58
86
  if (existingLock && !force) {
59
- // Check if the process is still alive
60
- const isAlive = isProcessAlive(existingLock.pid);
61
- const isStale = isLockStale(existingLock);
62
-
63
- if (isAlive && !isStale) {
64
- throw new Error(
65
- `Task is locked by process ${existingLock.pid} since ${existingLock.at}. ` +
66
- `Use --force to override if you're sure the lock is stale.`
67
- );
68
- }
87
+ // First check: is this our own lock from a previous run that wasn't released?
88
+ // This handles the case where the same process crashed/restarted mid-task.
89
+ if (existingLock.pid === process.pid) {
90
+ // Same PID - check if it's truly from this process instance
91
+ if (!isCurrentProcessLock(existingLock)) {
92
+ // Lock is from a previous instance of this PID (PID was reused)
93
+ // or from before this process started - safe to take over
94
+ // This is the key fix: we clean up our own stale locks
95
+ } else {
96
+ // Lock is from current process instance - we already hold it
97
+ // Re-acquiring without releasing first is treated as a programming
98
+ // error, so throw instead of handing back the existing lock
99
+ throw new Error(
100
+ `Lock already held by this process instance. ` +
101
+ `This indicates a logic error - lock should be released before re-acquiring.`
102
+ );
103
+ }
104
+ } else {
105
+ // Different PID - check if the process is still alive and lock is fresh
106
+ const isAlive = isProcessAlive(existingLock.pid);
107
+ const isStale = isLockStale(existingLock);
108
+
109
+ if (isAlive && !isStale) {
110
+ // Also check process start time if available
111
+ const startTimeValid = !existingLock.startedAt ||
112
+ isProcessStartTimeValid(existingLock.pid, existingLock.startedAt);
113
+
114
+ if (startTimeValid) {
115
+ throw new Error(
116
+ `Task is locked by process ${existingLock.pid} since ${existingLock.at}. ` +
117
+ `Use --force to override if you're sure the lock is stale.`
118
+ );
119
+ }
120
+ // Process has same PID but different start time (PID reuse) - lock is stale
121
+ }
69
122
 
70
- // Lock is stale or process is dead - we can take it
123
+ // Lock is stale or process is dead - we can take it
124
+ }
71
125
  }
72
126
 
73
127
  const now = new Date().toISOString();
@@ -78,6 +132,7 @@ async function acquireTaskLock(taskRootPath, options = {}) {
78
132
  pid,
79
133
  at: now,
80
134
  host: os.hostname(),
135
+ startedAt: PROCESS_STARTED_AT,
81
136
  };
82
137
 
83
138
  // Ensure task directory exists
@@ -97,6 +152,7 @@ async function acquireTaskLock(taskRootPath, options = {}) {
97
152
  lockPath,
98
153
  pid,
99
154
  at: now,
155
+ startedAt: PROCESS_STARTED_AT,
100
156
  released: false,
101
157
  };
102
158
 
@@ -114,23 +170,51 @@ async function releaseTaskLock(handle) {
114
170
  return;
115
171
  }
116
172
 
173
+ let deleted = false;
174
+ let ownershipVerified = false;
175
+
117
176
  try {
118
177
  // Verify we still own the lock before removing
119
178
  const current = await readJsonSafe(handle.lockPath);
120
179
 
121
180
  if (current && current.pid === handle.pid && current.at === handle.at) {
181
+ ownershipVerified = true;
122
182
  await fs.promises.unlink(handle.lockPath);
183
+ deleted = true;
184
+ } else if (!current) {
185
+ // Lock file doesn't exist - already released or never created
186
+ deleted = true;
123
187
  }
124
- // If someone else took the lock, don't remove it
188
+ // If someone else took the lock, don't remove it (deleted stays false)
125
189
 
126
190
  } catch (err) {
127
- if (err.code !== 'ENOENT') {
128
- // Log but don't throw - lock release is best-effort
191
+ if (err.code === 'ENOENT') {
192
+ // File doesn't exist - that's fine
193
+ deleted = true;
194
+ } else {
195
+ // Log the error but continue - we'll try a forceful cleanup below
129
196
  console.error(`Warning: Failed to release lock ${handle.lockPath}:`, err.message);
197
+
198
+ // If we verified ownership but failed to delete, try once more
199
+ if (ownershipVerified) {
200
+ try {
201
+ await fs.promises.unlink(handle.lockPath);
202
+ deleted = true;
203
+ } catch (retryErr) {
204
+ if (retryErr.code === 'ENOENT') {
205
+ deleted = true; // Succeeded (file was removed between attempts)
206
+ } else {
207
+ console.error(`Warning: Retry failed for lock ${handle.lockPath}:`, retryErr.message);
208
+ }
209
+ }
210
+ }
130
211
  }
131
212
  }
132
213
 
133
214
  handle.released = true;
215
+
216
+ // Return status for debugging (callers typically ignore this)
217
+ return { deleted, ownershipVerified };
134
218
  }
135
219
 
136
220
  /**
@@ -146,12 +230,10 @@ async function checkTaskLock(taskRootPath) {
146
230
  return null;
147
231
  }
148
232
 
149
- // Check if still valid
150
- const isAlive = isProcessAlive(lock.pid);
151
- const isStale = isLockStale(lock);
152
-
153
- if (!isAlive || isStale) {
154
- return null; // Lock is stale
233
+ // Check if lock is valid
234
+ const { valid } = isLockValid(lock);
235
+ if (!valid) {
236
+ return null; // Lock is stale or invalid
155
237
  }
156
238
 
157
239
  return lock;
@@ -170,10 +252,9 @@ async function cleanStaleLock(taskRootPath) {
170
252
  return false;
171
253
  }
172
254
 
173
- const isAlive = isProcessAlive(lock.pid);
174
- const isStale = isLockStale(lock);
255
+ const { valid, reason } = isLockValid(lock);
175
256
 
176
- if (!isAlive || isStale) {
257
+ if (!valid) {
177
258
  try {
178
259
  await fs.promises.unlink(lockPath);
179
260
  return true;
@@ -181,6 +262,8 @@ async function cleanStaleLock(taskRootPath) {
181
262
  if (err.code !== 'ENOENT') {
182
263
  throw err;
183
264
  }
265
+ // File already deleted - that's fine
266
+ return true;
184
267
  }
185
268
  }
186
269
 
@@ -206,6 +289,40 @@ function isProcessAlive(pid) {
206
289
  }
207
290
  }
208
291
 
292
+ /**
293
+ * Plausibility check for the process start time recorded in a lock.
294
+ * It cannot prove that another live process is the one that wrote the lock.
295
+ *
296
+ * Returns false when startedAt is more than 60 seconds in the future, or when
297
+ * pid is this process and startedAt differs from PROCESS_STARTED_AT; otherwise true.
298
+ *
299
+ * @param {number} pid - PID recorded in the lock
300
+ * @param {number} startedAt - Process start time from lock (ms since epoch)
301
+ * @returns {boolean} false only when the recorded start time is implausible
302
+ */
303
+ function isProcessStartTimeValid(pid, startedAt) {
304
+ if (typeof startedAt !== 'number') {
305
+ // No numeric start time recorded - be conservative and assume valid
306
+ return true;
307
+ }
308
+
309
+ const now = Date.now();
310
+
311
+ // A start time in the future cannot belong to a running process
312
+ if (startedAt > now + 60000) { // Allow 1 minute clock skew
313
+ return false;
314
+ }
315
+
316
+ // If the lock holder is our own process, startedAt must match exactly
317
+ if (pid === process.pid) {
318
+ return startedAt === PROCESS_STARTED_AT;
319
+ }
320
+
321
+ // For other processes the start time cannot be verified from here,
322
+ // so no further check is made and the lock is treated as valid
323
+ return true;
324
+ }
325
+
209
326
  /**
210
327
  * Check if a lock is stale based on timestamp.
211
328
  * @param {LockPayload} lock
@@ -221,14 +338,52 @@ function isLockStale(lock) {
221
338
  }
222
339
  }
223
340
 
341
+ /**
342
+ * Determine if a lock is valid (should block acquisition). Checks run in order: lock present, process alive, not stale, start time plausible, not a leftover lock with our own PID.
343
+ * @param {LockPayload} lock - Parsed lock payload; may be null or undefined
344
+ * @returns {{ valid: boolean, reason: string }} reason is one of: no lock, process dead, lock stale, pid reused, own stale lock, valid
345
+ */
346
+ function isLockValid(lock) {
347
+ if (!lock) {
348
+ return { valid: false, reason: 'no lock' };
349
+ }
350
+
351
+ // A lock whose owning PID is no longer running cannot block anyone
352
+ const alive = isProcessAlive(lock.pid);
353
+ if (!alive) {
354
+ return { valid: false, reason: 'process dead' };
355
+ }
356
+
357
+ // Check lock age via isLockStale
358
+ if (isLockStale(lock)) {
359
+ return { valid: false, reason: 'lock stale' };
360
+ }
361
+
362
+ // Only consulted when startedAt is truthy; locks without it skip this check
363
+ if (lock.startedAt && !isProcessStartTimeValid(lock.pid, lock.startedAt)) {
364
+ return { valid: false, reason: 'pid reused' };
365
+ }
366
+
367
+ // Same PID as us but no numeric startedAt (a numeric mismatch was already rejected above)
368
+ if (lock.pid === process.pid && !isCurrentProcessLock(lock)) {
369
+ return { valid: false, reason: 'own stale lock' };
370
+ }
371
+
372
+ return { valid: true, reason: 'valid' };
373
+ }
374
+
224
375
  // ============================================================
225
376
  // Exports
226
377
  // ============================================================
227
378
 
228
379
  module.exports = {
229
380
  LOCK_STALE_MS,
381
+ PROCESS_STARTED_AT,
230
382
  acquireTaskLock,
231
383
  releaseTaskLock,
232
384
  checkTaskLock,
233
385
  cleanStaleLock,
386
+ isProcessAlive,
387
+ isLockStale,
388
+ isLockValid,
234
389
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@mndrk/agx",
3
- "version": "1.4.24",
3
+ "version": "1.4.26",
4
4
  "description": "Autonomous AI Agent Orchestrator for Claude, Gemini, and Ollama",
5
5
  "main": "lib/index.js",
6
6
  "exports": {