openclaw-scheduler 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. package/AGENTS.md +302 -0
  2. package/BEST-PRACTICES.md +506 -0
  3. package/CHANGELOG.md +82 -0
  4. package/CODE_OF_CONDUCT.md +22 -0
  5. package/CONTEXT.md +26 -0
  6. package/CONTRIBUTING.md +73 -0
  7. package/IMPLEMENTATION_SPEC.md +170 -0
  8. package/INSTALL-ADDITIONAL-HOST.md +333 -0
  9. package/INSTALL-LINUX.md +419 -0
  10. package/INSTALL-WINDOWS.md +305 -0
  11. package/INSTALL.md +364 -0
  12. package/JOB-QUICK-REF.md +222 -0
  13. package/LICENSE +21 -0
  14. package/QUICK-START.md +256 -0
  15. package/README.md +2170 -0
  16. package/SECURITY.md +34 -0
  17. package/UNINSTALL.md +129 -0
  18. package/UPGRADING.md +436 -0
  19. package/agents.js +67 -0
  20. package/approval.js +107 -0
  21. package/backup.js +390 -0
  22. package/bin/openclaw-scheduler.js +138 -0
  23. package/cli.js +1083 -0
  24. package/db.js +122 -0
  25. package/dispatch/529-recovery.mjs +204 -0
  26. package/dispatch/README.md +372 -0
  27. package/dispatch/config.example.json +24 -0
  28. package/dispatch/deliver-watcher.sh +57 -0
  29. package/dispatch/hooks.mjs +171 -0
  30. package/dispatch/index.mjs +1836 -0
  31. package/dispatch/watcher.mjs +1396 -0
  32. package/dispatch-queue.js +112 -0
  33. package/dispatcher-approvals.js +96 -0
  34. package/dispatcher-delivery.js +43 -0
  35. package/dispatcher-maintenance.js +242 -0
  36. package/dispatcher-shell.js +29 -0
  37. package/dispatcher-strategies.js +1280 -0
  38. package/dispatcher-utils.js +81 -0
  39. package/dispatcher.js +855 -0
  40. package/docs/adr-schedule-ownership.md +73 -0
  41. package/docs/gateway-contract.md +904 -0
  42. package/docs/plans/2026-03-09-fix-typescript-types.md +91 -0
  43. package/docs/plans/2026-03-09-test-coverage-gaps.md +83 -0
  44. package/docs/plans/2026-03-10-dispatcher-refactor.md +801 -0
  45. package/docs/trust-architecture.md +266 -0
  46. package/gateway.js +473 -0
  47. package/idempotency.js +119 -0
  48. package/index.d.ts +864 -0
  49. package/index.js +17 -0
  50. package/jobs.js +1224 -0
  51. package/messages.js +357 -0
  52. package/migrate-consolidate.js +694 -0
  53. package/migrate.js +125 -0
  54. package/package.json +130 -0
  55. package/paths.js +79 -0
  56. package/prompt-context.js +94 -0
  57. package/retrieval.js +176 -0
  58. package/runs.js +270 -0
  59. package/scheduler-schema.js +101 -0
  60. package/schema.sql +480 -0
  61. package/scripts/dispatch-cli-utils.mjs +65 -0
  62. package/scripts/inbox-consumer.mjs +288 -0
  63. package/scripts/stuck-detector.sh +18 -0
  64. package/scripts/stuck-run-detector.mjs +333 -0
  65. package/scripts/telegram-webhook-check.mjs +238 -0
  66. package/setup.mjs +724 -0
  67. package/shell-result.js +214 -0
  68. package/task-tracker.js +300 -0
  69. package/team-adapter.js +335 -0
  70. package/v02-runtime.js +599 -0
@@ -0,0 +1,1396 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * dispatch watcher -- polls a session until done, outputs the result.
4
+ *
5
+ * Used by scheduler shell jobs for async delivery with retry + audit trail.
6
+ * The scheduler runs this as a shell job with delivery_mode='announce-always',
7
+ * so stdout is delivered via handleDelivery (retry, alias, audit).
8
+ *
9
+ * Detection strategy:
10
+ * 1. Check `status --label` -- if auto-resolved to 'done', use it
11
+ * 2. If status says 'running' but session is idle (no activity for >60s),
12
+ * also check `result --label` for a lastReply -- if found, session completed
13
+ * but status hasn't caught up yet (auto-resolve has 10min threshold)
14
+ *
15
+ * 529/Overload auto-retry:
16
+ * When a session errors with a 529/FailoverError/overload pattern, the watcher
17
+ * will automatically retry up to MAX_529_RETRIES times with linearly increasing backoff
18
+ * (30s * retryCount). It respawns via `dispatch send` (falling back to a fresh `enqueue`) to continue
19
+ * the same session, and tracks retryCount in labels.json.
20
+ *
21
+ * Usage: node watcher.mjs --label <label> [--timeout <seconds>] [--poll-interval <seconds>]
22
+ *
23
+ * Exit codes:
24
+ * 0 -- session completed, result on stdout
25
+ * 1 -- timeout or error
26
+ * 2 -- argument error
27
+ */
28
+
29
+ import { execFileSync, execSync } from 'child_process';
30
+ import { readFileSync, writeFileSync, renameSync, statSync } from 'fs';
31
+ import { dirname, join } from 'path';
32
+ import { homedir } from 'os';
33
+ import { fileURLToPath } from 'url';
34
+ import { sendMessage } from '../messages.js';
35
+
36
// Locations of the sibling dispatch CLI and the labels file. Both can be
// redirected through environment variables.
const __dirname = dirname(fileURLToPath(import.meta.url));
const INDEX_PATH = process.env.DISPATCH_INDEX_PATH || join(__dirname, 'index.mjs');
const LABELS_PATH = process.env.DISPATCH_LABELS_PATH || join(__dirname, 'labels.json');
const HOME_DIR = process.env.HOME || homedir();

// In-memory copy of labels.json and the file signature it was read under.
let labelsCache = null;
let labelsCacheSignature = null;

// 529/overload recovery: retry budget and the base unit of the retry delay.
const MAX_529_RETRIES = 3;
const RETRY_BASE_DELAY_MS = 30_000; // 30 seconds

// Max retries for gateway-restart-kill recovery.
const MAX_GW_RESTART_RETRIES = 2;

const FLAT_WINDOW_MS = 3 * 60_000; // 3 min flat = genuinely stuck
const ACTIVITY_POLL_MS = 30_000;

/**
 * How often the watcher writes lastPing to labels.json (heartbeat signal).
 * The watchdog guard in index.mjs treats pings older than 3x this as stale,
 * so PING_INTERVAL_MS must stay well below PING_STALE_MS (3 * 60_000).
 */
const PING_INTERVAL_MS = 60 * 1000; // 60 seconds
55
+
56
/**
 * Resolve the gateway auth token.
 *
 * The OPENCLAW_GATEWAY_TOKEN environment variable wins when set; otherwise the
 * token is read from `gateway.auth.token` in ~/.openclaw/openclaw.json.
 *
 * @returns {string|null} the token, or null when neither source provides one
 */
function getGatewayToken() {
  const fromEnv = process.env.OPENCLAW_GATEWAY_TOKEN;
  if (fromEnv) return fromEnv;

  let config;
  try {
    const raw = readFileSync(join(HOME_DIR, '.openclaw', 'openclaw.json'), 'utf-8');
    config = JSON.parse(raw);
  } catch {
    // Missing or unparsable config simply means "no token available".
    return null;
  }
  return config?.gateway?.auth?.token || null;
}

const GW_TOKEN = getGatewayToken();
68
+
69
+ // -- Gateway RPC (sync, matches index.mjs pattern) -----------
70
+
71
/**
 * Synchronous gateway RPC through the `openclaw gateway call` CLI.
 *
 * @param {string} method - RPC method name passed to the CLI
 * @param {Object} [params] - serialized to JSON and passed via --params
 * @param {{timeout?: number}} [opts] - timeout in ms (defaults to 15000)
 * @returns {*} parsed JSON from the CLI's stdout, or null when the call fails
 *   and no parsable stdout was captured
 */
function gatewayCall(method, params = {}, opts = {}) {
  const timeoutMs = opts.timeout || 15000;
  const cliArgs = [
    'gateway', 'call', method, '--json',
    '--params', JSON.stringify(params),
    '--timeout', String(timeoutMs),
  ];
  // Forward the resolved token to the child when we have one.
  const env = GW_TOKEN
    ? { ...process.env, OPENCLAW_GATEWAY_TOKEN: GW_TOKEN }
    : process.env;

  try {
    const output = execFileSync('openclaw', cliArgs, {
      encoding: 'utf-8',
      // Give the child a little longer than the RPC timeout it was asked to honor.
      timeout: timeoutMs + 5000,
      stdio: ['pipe', 'pipe', 'pipe'],
      env,
    });
    return JSON.parse(output.trim());
  } catch (err) {
    // A failing child may still have printed a JSON payload; prefer that over null.
    const captured = err.stdout?.trim() || '';
    if (!captured) return null;
    try {
      return JSON.parse(captured);
    } catch {
      return null;
    }
  }
}
96
+
97
/**
 * Look up the current totalTokens for a session.
 *
 * The on-disk sessions.json store is consulted first; if it has no numeric
 * totalTokens for the key, the gateway `sessions.list` RPC is used as a fallback
 * (which may not see dispatcher-spawned sessions).
 *
 * @param {string} sessionKey - session key; its second ':'-separated segment
 *   selects the agent store (defaults to 'main')
 * @returns {number|null} token count, or null if unavailable
 */
function getSessionTokens(sessionKey) {
  const agentId = sessionKey ? (sessionKey.split(':')[1] || 'main') : 'main';
  const sessions = readSessionsStore(agentId);

  const fromDisk = (sessions && sessionKey in sessions)
    ? sessions[sessionKey]?.totalTokens
    : undefined;
  if (typeof fromDisk === 'number') return fromDisk;

  const listing = gatewayCall('sessions.list', { activeMinutes: 1440 }, { timeout: 8000 });
  const match = listing?.sessions?.find((s) => s.key === sessionKey);
  return match?.totalTokens ?? null;
}
115
+
116
/**
 * Fetch a session's entry from the on-disk sessions.json store.
 *
 * @param {string} sessionKey - session key; its second ':'-separated segment
 *   selects the agent store (defaults to 'main')
 * @returns {Object|null} the stored entry, or null when the key is empty, the
 *   store is unreadable, or the key is absent
 */
function getSessionStoreEntry(sessionKey) {
  if (!sessionKey) return null;

  const [, agentId] = sessionKey.split(':');
  const sessions = readSessionsStore(agentId || 'main');
  if (!sessions || !(sessionKey in sessions)) return null;
  return sessions[sessionKey];
}
123
+
124
/**
 * Parse `--flag value` / `--flag=value` pairs from an argv array.
 *
 * - `--flag=value` stores everything after the first `=` as the value.
 * - `--flag value` consumes the next token as the value unless that token is
 *   itself a `--flag`. An explicit empty string (`--flag ""`) is kept as the
 *   value rather than being mistaken for "no value".
 * - A flag with no value is stored as boolean `true`.
 * - Tokens that do not start with `--` and are not consumed as a value are ignored.
 *
 * @param {string[]} argv - arguments (typically process.argv.slice(2))
 * @returns {Object<string, string|boolean>} map of flag name to value
 */
function parseFlags(argv) {
  const flags = {};
  for (let i = 0; i < argv.length; i++) {
    const arg = argv[i];
    if (!arg.startsWith('--')) continue;

    const eqIdx = arg.indexOf('=');
    if (eqIdx > 0) {
      flags[arg.slice(2, eqIdx)] = arg.slice(eqIdx + 1);
      continue;
    }

    const next = argv[i + 1];
    // Compare by type, not truthiness: '' is a legitimate (empty) value.
    if (typeof next === 'string' && !next.startsWith('--')) {
      flags[arg.slice(2)] = next;
      i++;
    } else {
      flags[arg.slice(2)] = true;
    }
  }
  return flags;
}
144
+
145
/**
 * Invoke a dispatch CLI subcommand (index.mjs) with the current Node binary.
 *
 * @param {string} subcmd - subcommand name
 * @param {string[]} args - additional CLI arguments
 * @returns {*} the subcommand's stdout parsed as JSON, or null on any failure
 *   (spawn error, non-zero exit, timeout, or unparsable output)
 */
function dispatch(subcmd, args) {
  try {
    const stdout = execFileSync(
      process.execPath,
      [INDEX_PATH, subcmd, ...args],
      { encoding: 'utf-8', timeout: 30000, stdio: ['pipe', 'pipe', 'pipe'] },
    );
    const payload = stdout.trim();
    return JSON.parse(payload);
  } catch {
    return null;
  }
}
160
+
161
/**
 * Promise-based delay.
 *
 * @param {number} ms - milliseconds to wait
 * @returns {Promise<void>} resolves after the timer fires
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
164
+
165
+ // -- 529/Overload Detection & Retry --------------------------
166
+
167
/**
 * Regex patterns that indicate a 529/overload error.
 *
 * The status code is only matched when it is not part of a longer digit run,
 * so unrelated numbers that happen to contain "529" (ids, token counts,
 * timestamps) do not trigger an overload retry.
 */
const OVERLOAD_PATTERNS = [
  /(?<!\d)529(?!\d)/,
  /failover\s*error/i,
  /overload/i,
  /temporarily\s+overloaded/i,
  /service.*overloaded/i,
  /rate.limit/i,
  /too.many.requests/i,
];

/**
 * Check if an error message matches a 529/overload pattern.
 *
 * @param {*} errorMsg - candidate error text
 * @returns {boolean} true when errorMsg is a non-empty string matching any
 *   entry of OVERLOAD_PATTERNS; false for empty or non-string input
 */
function is529Error(errorMsg) {
  if (!errorMsg || typeof errorMsg !== 'string') return false;
  return OVERLOAD_PATTERNS.some((pattern) => pattern.test(errorMsg));
}
185
+
186
/**
 * Summary texts that signal "session missing from the sessions store".
 *
 * This is the signature of a gateway-restart-kill: the gateway restarted and
 * wiped in-flight sessions, after which the status command auto-resolved the
 * label as 'done' because its sessionKey vanished from sessions.json.
 */
const GW_KILL_PATTERNS = [
  /session not found in sessions store/i,
  /session not found in gateway store/i,
  /session never found/i,
  /Auto-resolved.*session not found/i,
  /Auto-resolved.*never found/i,
];

/**
 * Decide whether a status summary points at a gateway-restart kill.
 *
 * @param {*} summary - status summary text
 * @returns {boolean} true when summary is a non-empty string matching any
 *   GW_KILL_PATTERNS entry
 */
function isGatewayRestartKill(summary) {
  if (typeof summary !== 'string' || summary === '') return false;
  for (const pattern of GW_KILL_PATTERNS) {
    if (pattern.test(summary)) return true;
  }
  return false;
}
207
+
208
/**
 * Compute a cheap change signature for labels.json from its stat data.
 *
 * @returns {string} `<mtimeMs>:<size>` for the file, or the literal 'missing'
 *   when it cannot be stat'ed
 */
function getLabelsSignature() {
  let info;
  try {
    info = statSync(LABELS_PATH);
  } catch {
    return 'missing';
  }
  return `${info.mtimeMs}:${info.size}`;
}
219
+
220
/**
 * Load labels.json directly (no CLI round-trip), reusing the in-memory copy
 * while the file's signature is unchanged.
 *
 * @returns {Object} parsed labels; an empty object (cached under the 'missing'
 *   signature) when the file cannot be read or parsed
 */
function loadLabels() {
  const currentSignature = getLabelsSignature();
  const cacheIsCurrent = labelsCache && labelsCacheSignature === currentSignature;
  if (cacheIsCurrent) return labelsCache;

  let parsed;
  try {
    parsed = JSON.parse(readFileSync(LABELS_PATH, 'utf-8'));
  } catch {
    labelsCache = {};
    labelsCacheSignature = 'missing';
    return labelsCache;
  }

  labelsCache = parsed;
  labelsCacheSignature = currentSignature;
  return parsed;
}
236
+
237
/**
 * Persist the labels object to labels.json.
 *
 * The JSON is written to a pid-suffixed temp file and then renamed over the
 * real path; afterwards the in-memory cache and its signature are refreshed.
 *
 * @param {Object} labels - full labels map to write
 */
function saveLabels(labels) {
  const tmpPath = `${LABELS_PATH}.tmp.${process.pid}`;
  const serialized = `${JSON.stringify(labels, null, 2)}\n`;

  writeFileSync(tmpPath, serialized);
  renameSync(tmpPath, LABELS_PATH);

  labelsCache = labels;
  labelsCacheSignature = getLabelsSignature();
}
247
+
248
/**
 * Load labels, let `mutator` edit them in place, and save the result.
 *
 * @param {(labels: Object) => (boolean|void)} mutator - returns exactly `false`
 *   to signal "nothing changed, skip the write"; any other return saves
 * @returns {Object} the (possibly mutated) labels object
 */
function mutateLabels(mutator) {
  const current = loadLabels();
  const outcome = mutator(current);
  const shouldPersist = outcome !== false;
  if (shouldPersist) saveLabels(current);
  return current;
}
256
+
257
/**
 * Apply `mutator` to one label's entry, but only if that label already exists.
 *
 * When the mutator runs and does not return `false`, the entry's `updatedAt`
 * is stamped with the current ISO time and labels.json is saved.
 *
 * @param {string} label - label key in labels.json
 * @param {(entry: Object, labels: Object) => (boolean|void)} mutator
 * @returns {Object} the labels object
 */
function updateExistingLabel(label, mutator) {
  return mutateLabels((allLabels) => {
    const entry = allLabels[label];
    if (!entry) return false;
    if (mutator(entry, allLabels) === false) return false;

    allLabels[label].updatedAt = new Date().toISOString();
    return true;
  });
}
266
+
267
/**
 * Read the persisted 529 retry counter for a label.
 *
 * @param {string} label - label key in labels.json
 * @returns {number} stored retryCount, or 0 when absent/falsy
 */
function getRetryCount(label) {
  const entry = loadLabels()[label];
  return entry?.retryCount || 0;
}
274
+
275
/**
 * Persist the 529 retry counter for a label (no-op if the label is unknown).
 *
 * @param {string} label - label key in labels.json
 * @param {number} count - new retryCount value
 */
function setRetryCount(label, count) {
  const applyCount = (entry) => {
    entry.retryCount = count;
  };
  updateExistingLabel(label, applyCount);
}
283
+
284
/**
 * Read the persisted gateway-restart retry counter for a label.
 *
 * @param {string} label - label key in labels.json
 * @returns {number} stored gwRestartRetryCount, or 0 when absent/falsy
 */
function getGwRestartRetryCount(label) {
  const entry = loadLabels()[label];
  return entry?.gwRestartRetryCount || 0;
}
291
+
292
/**
 * Persist the gateway-restart retry counter for a label (no-op if the label
 * is unknown).
 *
 * @param {string} label - label key in labels.json
 * @param {number} count - new gwRestartRetryCount value
 */
function setGwRestartRetryCount(label, count) {
  const applyCount = (entry) => {
    entry.gwRestartRetryCount = count;
  };
  updateExistingLabel(label, applyCount);
}
300
+
301
/**
 * Post a text notification from 'dispatch' to 'main' via sendMessage.
 *
 * When OPENCLAW_SCHEDULER_NOTIFY_DISABLED is '1' the message is only echoed to
 * stderr. Failures from sendMessage are logged to stderr and never rethrown.
 *
 * @param {string} message - notification body
 */
function notify(message) {
  const suppressed = process.env.OPENCLAW_SCHEDULER_NOTIFY_DISABLED === '1';
  if (suppressed) {
    process.stderr.write(`[watcher] notify suppressed (test mode): ${message}\n`);
    return;
  }

  const envelope = {
    from_agent: 'dispatch',
    to_agent: 'main',
    body: message,
    kind: 'text',
  };
  try {
    sendMessage(envelope);
  } catch (err) {
    process.stderr.write(`[watcher] notify failed: ${err.message}\n`);
  }
}
320
+
321
/**
 * Decide whether a 529'd session gets another attempt and record that decision.
 *
 * This function does not sleep or respawn anything itself:
 * - If the retry budget is spent, the label is marked 'error', a notification
 *   is sent, and `{ retry: false }` is returned.
 * - Otherwise the incremented retryCount is persisted, a notification is sent,
 *   and the delay to wait is returned.
 *
 * @param {string} label - label being watched
 * @param {number} retryCount - retries already performed
 * @param {string} errorMsg - error text that triggered the retry decision
 * @returns {{retry: false}|{retry: true, delayMs: number, newRetryCount: number}}
 */
function attempt529Retry(label, retryCount, errorMsg) {
  const budgetExhausted = retryCount >= MAX_529_RETRIES;
  if (budgetExhausted) {
    updateExistingLabel(label, (entry) => {
      entry.status = 'error';
      entry.error = `max_retries_exceeded (${retryCount}x 529): ${errorMsg}`;
    });
    notify(`🌶️ Dispatch: [${label}] hit max retries (${MAX_529_RETRIES}x 529 overload) -- giving up`);
    return { retry: false };
  }

  const newRetryCount = retryCount + 1;
  // Delay grows with the attempt number: base * attempt.
  const delayMs = RETRY_BASE_DELAY_MS * newRetryCount;
  const delaySeconds = delayMs / 1000;
  const attemptTag = `${newRetryCount}/${MAX_529_RETRIES}`;

  process.stderr.write(
    `[watcher] 529 detected for [${label}] (attempt ${attemptTag}). Waiting ${delaySeconds}s before retry...\n`
  );
  notify(`🌶️ Dispatch: [${label}] hit 529 overload -- retry ${attemptTag} in ${delaySeconds}s`);

  // Persist the new count before the caller waits, so the intent is recorded.
  setRetryCount(label, newRetryCount);

  return { retry: true, delayMs, newRetryCount };
}
352
+
353
/**
 * Restart work for a label after a 529/overload failure.
 *
 * Primary path: `dispatch send --label <label>` with a continuation message,
 * which reuses the existing session; on success the label is reset to
 * status 'running' with its error cleared.
 *
 * Fallback path (when send fails, e.g. the session is dead): a fresh
 * `dispatch enqueue --mode fresh` built from the label's stored model,
 * thinking, origin and delivery settings. The stale error is cleared here too,
 * matching respawnAfterGwRestart, so the previous overload text does not
 * linger on the re-enqueued label.
 *
 * @param {string} label - label to respawn
 * @returns {boolean} true if either path dispatched successfully, false if both failed
 */
function respawnSession(label) {
  const childOpts = {
    encoding: 'utf-8',
    timeout: 30000,
    stdio: ['pipe', 'pipe', 'pipe'],
  };

  try {
    // The label must exist before we try to continue its session.
    if (!loadLabels()[label]) throw new Error(`label "${label}" not found`);

    const continuationMsg = `[Auto-retry after 529 overload] Please continue your previous task. Pick up where you left off.`;

    execFileSync(process.execPath, [
      INDEX_PATH, 'send',
      '--label', label,
      '--message', continuationMsg,
    ], childOpts);

    // updateExistingLabel re-reads labels.json (the child may have modified it)
    // before resetting the status.
    updateExistingLabel(label, (entry) => {
      entry.status = 'running';
      entry.error = null;
    });

    process.stderr.write(`[watcher] respawned [${label}] via send (reuse session)\n`);
    return true;
  } catch (err) {
    process.stderr.write(`[watcher] respawn via send failed: ${err.message}\n`);

    // Fallback: try fresh enqueue if send fails (session may be dead)
    try {
      const entry = loadLabels()[label];
      const continuationMsg = `[Auto-retry after 529 overload] This is a retry of a previous task that failed due to API overload. Please continue the task from the beginning.`;

      // Build enqueue args from original label data
      const enqueueArgs = [
        INDEX_PATH, 'enqueue',
        '--label', label,
        '--message', continuationMsg,
        '--mode', 'fresh',
      ];
      if (entry?.model) enqueueArgs.push('--model', entry.model);
      if (entry?.thinking) enqueueArgs.push('--thinking', entry.thinking);
      if (entry?.origin) enqueueArgs.push('--origin', entry.origin);
      if (entry?.deliverTo) {
        enqueueArgs.push('--deliver-to', entry.deliverTo);
        if (entry?.deliveryMode) enqueueArgs.push('--delivery-mode', entry.deliveryMode);
        if (entry?.deliverChannel) enqueueArgs.push('--deliver-channel', entry.deliverChannel);
      }

      execFileSync(process.execPath, enqueueArgs, childOpts);

      // Drop the stale overload error from the re-enqueued label
      // (same post-enqueue cleanup as respawnAfterGwRestart).
      updateExistingLabel(label, (fresh) => {
        fresh.error = null;
      });

      process.stderr.write(`[watcher] respawned [${label}] via fresh enqueue (fallback)\n`);
      return true;
    } catch (err2) {
      process.stderr.write(`[watcher] respawn fallback also failed: ${err2.message}\n`);
      return false;
    }
  }
}
425
+
426
/**
 * Re-enqueue a label whose session was killed by a gateway restart.
 *
 * Always enqueues in fresh mode, carrying over the label's stored model,
 * thinking, origin and delivery settings. After a successful enqueue the
 * label's error field is cleared.
 *
 * @param {string} label - label to respawn
 * @returns {boolean} true when the enqueue succeeded, false otherwise
 */
function respawnAfterGwRestart(label) {
  try {
    const stored = loadLabels()[label];
    if (!stored) throw new Error(`label "${label}" not found`);

    const continuationMsg =
      `[Auto-retry after gateway restart] Previous run was killed by gateway restart. ` +
      `Resume from the beginning.`;

    const enqueueArgs = [
      INDEX_PATH, 'enqueue',
      '--label', label,
      '--message', continuationMsg,
      '--mode', 'fresh',
    ];

    // Optional flags are forwarded only when the label carries a value for them.
    const optionalFlags = [
      ['--model', stored.model],
      ['--thinking', stored.thinking],
      ['--origin', stored.origin],
    ];
    // Delivery mode/channel are only meaningful alongside a delivery target.
    if (stored.deliverTo) {
      optionalFlags.push(
        ['--deliver-to', stored.deliverTo],
        ['--delivery-mode', stored.deliveryMode],
        ['--deliver-channel', stored.deliverChannel],
      );
    }
    for (const [flag, value] of optionalFlags) {
      if (value) enqueueArgs.push(flag, value);
    }

    execFileSync(process.execPath, enqueueArgs, {
      encoding: 'utf-8',
      timeout: 30000,
      stdio: ['pipe', 'pipe', 'pipe'],
    });

    // enqueue sets the label to 'running' with a new sessionKey -- also reset error field
    updateExistingLabel(label, (fresh) => {
      fresh.error = null;
    });

    process.stderr.write(`[watcher] respawned [${label}] via fresh enqueue after gateway restart\n`);
    return true;
  } catch (err) {
    process.stderr.write(`[watcher] respawn after gw restart failed: ${err.message}\n`);
    return false;
  }
}
475
+
476
+ // -- Gateway Steer & Kill -------------------------------------
477
+
478
/**
 * Inject a steering message into a session through the gateway `agent` RPC (sync).
 *
 * NOTE(review): gatewayCall reports failure by returning null rather than
 * throwing, so a failed RPC still yields `true` here -- confirm whether callers
 * rely on that.
 *
 * @param {string} sessionKey - target session
 * @param {string} message - text to inject
 * @returns {boolean} false when no gateway token is available or the call
 *   throws; true otherwise
 */
function steerSession(sessionKey, message) {
  if (!GW_TOKEN) {
    process.stderr.write(`[watcher] steer skipped: no gateway token\n`);
    return false;
  }

  const payload = {
    message,
    sessionKey,
    deliver: false,
    lane: 'nested',
  };
  try {
    gatewayCall('agent', payload, { timeout: 15000 });
  } catch (err) {
    process.stderr.write(`[watcher] steer failed: ${err.message}\n`);
    return false;
  }
  return true;
}
499
+
500
/**
 * Ask the gateway to kill a session via the `subagents.kill` RPC (sync).
 * Skipped (with a stderr note) when no gateway token is available; errors are
 * logged to stderr and not rethrown.
 *
 * @param {string} sessionKey - session to kill
 */
function killSession(sessionKey) {
  const haveToken = Boolean(GW_TOKEN);
  if (!haveToken) {
    process.stderr.write(`[watcher] kill skipped: no gateway token\n`);
    return;
  }

  try {
    const target = { target: sessionKey };
    gatewayCall('subagents.kill', target, { timeout: 10000 });
  } catch (err) {
    process.stderr.write(`[watcher] kill failed: ${err.message}\n`);
  }
}
514
+
515
/**
 * Load an agent's sessions.json straight from disk
 * (~/.openclaw/agents/<agent>/sessions/sessions.json).
 *
 * This file is treated as the primary source of session state: sessions
 * spawned via the dispatcher HTTP agent endpoint appear here but NOT in
 * sessions_list API results.
 *
 * @param {string} agent - Agent ID (default: 'main')
 * @returns {Object|null} - parsed store, or null when it cannot be read or parsed
 */
function readSessionsStore(agent = 'main') {
  try {
    const storePath = join(HOME_DIR, '.openclaw', 'agents', agent, 'sessions', 'sessions.json');
    const raw = readFileSync(storePath, 'utf-8');
    return JSON.parse(raw);
  } catch {
    return null;
  }
}
531
+
532
/**
 * Return the modification time (ms) of a session's JSONL transcript at
 * ~/.openclaw/agents/<agentDir>/sessions/<sessionId>.jsonl.
 *
 * Why this signal: sessions.json is NOT flushed during active turns, so for
 * spawned subagent sessions totalTokens and updatedAt can stay flat while the
 * session is busy. The JSONL file is written continuously (tool calls, model
 * replies, streaming chunks), so its mtime is a more reliable liveness
 * indicator. Without it the watcher can hit FLAT_WINDOW_MS mid-turn and mark
 * the session done prematurely, leaving zombie sessions with no delivery.
 *
 * @param {string} sessionId - Internal session UUID (entry.sessionId from sessions.json)
 * @param {string} agentDir - Agent directory (default: 'main')
 * @returns {number|null} mtimeMs if the file exists, null otherwise
 */
function getSessionJsonlMtime(sessionId, agentDir = 'main') {
  if (!sessionId) return null;

  try {
    const transcriptPath = join(
      HOME_DIR, '.openclaw', 'agents', agentDir, 'sessions', `${sessionId}.jsonl`,
    );
    const { mtimeMs } = statSync(transcriptPath);
    return mtimeMs;
  } catch {
    return null;
  }
}
560
+
561
+
562
/**
 * Parse the last N non-blank lines of a session's JSONL transcript.
 *
 * Lines that fail to parse, or that parse to a falsy value, are dropped, so
 * the result may hold fewer than N items.
 *
 * @param {string} sessionId - Internal session UUID
 * @param {string} agentDir - Agent directory (default: 'main')
 * @param {number} n - Number of lines to take from the end (default: 3)
 * @returns {Array|null} parsed objects, or null if sessionId is empty or the
 *   file cannot be read
 */
function readJsonlLastLines(sessionId, agentDir = 'main', n = 3) {
  if (!sessionId) return null;

  try {
    const transcriptPath = join(
      HOME_DIR, '.openclaw', 'agents', agentDir, 'sessions', `${sessionId}.jsonl`,
    );
    const nonBlank = readFileSync(transcriptPath, 'utf-8')
      .split('\n')
      .filter((line) => line.trim());
    const tail = nonBlank.slice(-n);

    const parsed = [];
    for (const line of tail) {
      try {
        const value = JSON.parse(line);
        if (value) parsed.push(value);
      } catch {
        // Skip lines that are not valid JSON (e.g. a partially written tail).
      }
    }
    return parsed;
  } catch {
    return null;
  }
}
586
+
587
/**
 * Inspect the tail of a session's JSONL transcript to tell whether the session
 * is in the middle of a turn.
 *
 * Reported as mid-turn (a reason string is returned) when the transcript was
 * modified within FLAT_WINDOW_MS and its last entry is one of:
 *   - role=assistant whose content array contains a type=tool_use item
 *     (tool call dispatched, result not yet back)
 *   - role=assistant with top-level type=tool_use
 *   - role=user whose content array contains a type=tool_result item
 *     (tool finished, assistant has not replied yet)
 *   - any entry with top-level type=tool_result
 *
 * Treated as safe (null) when sessionId is empty, the file is missing, the
 * file has been untouched for longer than FLAT_WINDOW_MS, no entries could be
 * parsed, or the last entry matches none of the shapes above.
 *
 * @param {string} sessionId - Internal session UUID
 * @param {string} agentDir - Agent directory (default: 'main')
 * @returns {string|null} reason string if mid-turn, null if safe to proceed
 */
function getJsonlMidTurnReason(sessionId, agentDir = 'main') {
  if (!sessionId) return null;

  const transcriptPath = join(HOME_DIR, '.openclaw', 'agents', agentDir, 'sessions', `${sessionId}.jsonl`);
  let lastModifiedMs;
  try {
    ({ mtimeMs: lastModifiedMs } = statSync(transcriptPath));
  } catch {
    // No transcript on disk -- nothing indicates an active turn.
    return null;
  }

  // A transcript untouched for longer than the flat window is not actively running.
  const idleForMs = Date.now() - lastModifiedMs;
  if (idleForMs > FLAT_WINDOW_MS) return null;

  const tail = readJsonlLastLines(sessionId, agentDir, 3);
  if (!tail || tail.length === 0) return null;

  const last = tail[tail.length - 1];
  const items = Array.isArray(last.content) ? last.content : [];
  const TOOL_RESULT_REASON = 'last entry is tool_result (tool executed, awaiting assistant reply)';

  if (last.role === 'assistant') {
    // Content-array format: an assistant message carrying a tool_use item.
    const toolCall = items.find((item) => item?.type === 'tool_use');
    if (toolCall) {
      return `last assistant entry has tool_use (${toolCall.name || 'unknown'}) -- awaiting tool result`;
    }
    // Top-level type=tool_use (non-array content format)
    if (last.type === 'tool_use') {
      return `last entry is tool_use (${last.name || 'unknown'}) -- awaiting tool result`;
    }
  }

  // Content-array format: a user message carrying a tool_result item.
  if (last.role === 'user' && items.some((item) => item?.type === 'tool_result')) {
    return TOOL_RESULT_REASON;
  }

  // Top-level type=tool_result (alternative format)
  if (last.type === 'tool_result') {
    return TOOL_RESULT_REASON;
  }

  // Nothing in the last entry suggests a pending tool round-trip.
  return null;
}
658
+
659
/**
 * Best-effort: flip the watched label to status 'done' in labels.json.
 *
 * Leaves entries that are already 'done' untouched. The stored summary becomes
 * the given summary, else the existing one, else null. Any failure is logged
 * to stderr instead of being thrown.
 *
 * @param {string} label - label key in labels.json
 * @param {string|null} summary - summary text to record
 */
function markLabelDone(label, summary) {
  const applyDone = (entry) => {
    if (entry.status === 'done') return false;
    entry.status = 'done';
    entry.summary = summary || entry.summary || null;
    return undefined;
  };

  try {
    updateExistingLabel(label, applyDone);
  } catch (e) {
    process.stderr.write(`[watcher] markLabelDone failed: ${e.message}\n`);
  }
}
674
+
675
/**
 * Best-effort: flip the watched label to status 'error' in labels.json.
 *
 * Meant for sessions that did NOT complete successfully (gateway-restart-kill,
 * timeout with no result, spawn failure). Entries already marked 'done' are
 * left alone. Any failure is logged to stderr instead of being thrown.
 *
 * @param {string} label - label key in labels.json
 * @param {string} errorSummary - stored as the entry's summary; defaults to
 *   'failed without result' when empty
 */
function markLabelError(label, errorSummary) {
  const applyError = (entry) => {
    if (entry.status === 'done') return false;
    entry.status = 'error';
    entry.summary = errorSummary || 'failed without result';
    return undefined;
  };

  try {
    updateExistingLabel(label, applyError);
  } catch (e) {
    process.stderr.write(`[watcher] markLabelError failed: ${e.message}\n`);
  }
}
692
+
693
/**
 * Emit the delivery message on stdout and terminate the process.
 *
 * Sequence:
 *  1. If the label has a stored verifyCmd, run it through the shell (60s timeout).
 *     A failing command marks the label 'error', prints a VERIFICATION FAILED
 *     notice to stdout and exits with code 1.
 *  2. Otherwise mark the label 'done' (summary = fallbackSummary, else the
 *     first 500 chars of lastReply, else null).
 *  3. Print the reply (truncated to 3500 chars) or a "no reply captured"
 *     notice, then exit with code 0.
 *
 * This function never returns to its caller.
 *
 * @param {string} label - label being delivered
 * @param {string|null} lastReply - final reply text of the session, if any
 * @param {string|null} fallbackSummary - summary used when provided
 */
function deliverResult(label, lastReply, fallbackSummary) {
  // -- verify-cmd check -----------------------------------------------------
  try {
    const record = loadLabels()[label];
    const verifyCmd = record?.verifyCmd;
    if (verifyCmd) {
      process.stderr.write(`[watcher] running verify-cmd for ${label}: ${verifyCmd}\n`);
      try {
        execSync(verifyCmd, { stdio: 'pipe', timeout: 60000, shell: true });
        process.stderr.write(`[watcher] verify-cmd passed for ${label}\n`);
      } catch (verifyErr) {
        let detail;
        if (verifyErr.stderr) {
          detail = verifyErr.stderr.toString().trim();
        } else {
          detail = verifyErr.message;
        }
        const errMsg = `verify-cmd failed: ${detail || 'exit code ' + (verifyErr.status ?? 1)}`;
        process.stderr.write(`[watcher] ${errMsg}\n`);
        markLabelError(label, errMsg);

        // Failure notice goes to stdout so the scheduler delivers it to the target.
        const notice =
          `🌶️ *dispatch* [${label}] ⚠️ VERIFICATION FAILED\n\n` +
          `The agent session completed but the post-completion verify-cmd exited non-zero.\n\n` +
          `**Verify command:** \`${verifyCmd}\`\n` +
          `**Error:** ${detail || 'non-zero exit'}\n\n` +
          `Job marked as \`error\`. The agent may have reported done without completing the actual work.\n`;
        process.stdout.write(notice);
        process.exit(1);
      }
    }
  } catch (loadErr) {
    // Non-fatal -- skip the verify step and deliver normally.
    process.stderr.write(`[watcher] verify-cmd check skipped (labels load error): ${loadErr.message}\n`);
  }

  // Reconcile labels.json before exiting -- prevents stuck detector false positives
  let summary = fallbackSummary;
  if (!summary) {
    summary = lastReply ? lastReply.slice(0, 500) : null;
  }
  markLabelDone(label, summary);

  if (!lastReply) {
    process.stdout.write(
      `🌶️ *dispatch* [${label}] completed (no reply captured)\n` +
      `Summary: ${fallbackSummary || 'none'}\n`
    );
    process.exit(0);
  }

  const maxLen = 3500;
  let body = lastReply;
  if (lastReply.length > maxLen) {
    body = lastReply.slice(0, maxLen) + '\n\n..[truncated]';
  }
  process.stdout.write(`🌶️ *dispatch* [${label}] completed:\n\n${body}\n`);
  process.exit(0);
}
752
+
753
// -- Watcher heartbeat interval ref --------------------------------------
// Holds the heartbeat timer once the label has been validated in the main
// body; null until then and again after exit. The heartbeat writes lastPing to
// labels.json so the watchdog guard in index.mjs can see this watcher is alive.
let _pingInterval = null;

// -- Sync on Exit --------------------------------------------
// On every exit: stop the heartbeat, then run `index.mjs sync` (best-effort)
// so stale 'running' entries in labels.json are reconciled promptly and the
// stuck detector does not raise false positives.
function syncLabelsOnExit() {
  if (_pingInterval !== null) {
    clearInterval(_pingInterval);
    _pingInterval = null;
  }

  const syncOptions = {
    encoding: 'utf-8',
    timeout: 15000,
    stdio: ['pipe', 'pipe', 'pipe'],
  };
  try {
    execFileSync(process.execPath, [INDEX_PATH, 'sync'], syncOptions);
  } catch {
    // Best-effort -- never block exit
  }
}

process.on('exit', syncLabelsOnExit);
778
+
// -- Main ----------------------------------------------------

// CLI flags: --label (required), --timeout (seconds, default 600) and
// --poll-interval (seconds, default 20).
const flags = parseFlags(process.argv.slice(2));
const label = flags.label;
const timeoutS = parseInt(flags.timeout || '600', 10);
const pollS = parseInt(flags['poll-interval'] || '20', 10);

// How long a session must be idle before we proactively check result
const IDLE_RESULT_CHECK_MS = 60000;

if (!label) {
  process.stderr.write('[watcher] --label is required\n');
  process.exit(2);
}

// Reject durations that did not parse. A NaN timeout would make the deadline
// NaN, so `Date.now() < deadline` is false on the first check and the poll
// loop would be skipped entirely, reporting a bogus "timed out after NaNs".
// A NaN poll interval would likewise be handed to sleep() as NaN milliseconds.
for (const [flagName, seconds] of [['--timeout', timeoutS], ['--poll-interval', pollS]]) {
  if (!Number.isFinite(seconds) || seconds < 0) {
    process.stderr.write(`[watcher] ${flagName} must be a non-negative number of seconds\n`);
    process.exit(2);
  }
}
793
+
// -- Start heartbeat -----------------------------------------------------
// Every PING_INTERVAL_MS, stamp lastPing on this label's entry in labels.json
// for as long as the entry is still 'running'. Per the original author's note,
// the watchdog guard in index.mjs reads lastPing to know this watcher is
// alive, which prevents premature auto-resolve during slow tool calls, docker
// builds, long pytest runs, etc.
// The timer is cleared by the process 'exit' handler.
//
// Race-condition note (from the original author; the caching itself lives
// outside this block): labels.json is cached by file mtime/size to avoid
// reparsing on every tick, but each tick re-validates the on-disk signature
// before patching lastPing. Worst case a concurrent writer wins one tick and
// the next tick repairs it.
_pingInterval = setInterval(function writeHeartbeat() {
  try {
    updateExistingLabel(label, (record) => {
      if (record.status === 'running') {
        record.lastPing = new Date().toISOString();
        return undefined;
      }
      // Entry is no longer running: hand back false instead of stamping it.
      return false;
    });
  } catch {
    // Best-effort -- never crash the watcher over a ping failure
  }
}, PING_INTERVAL_MS);
// Unref'd so the timer alone never keeps the Node.js process alive.
_pingInterval.unref();
815
+
// -- Watcher state --------------------------------------------
const spawnTime = Date.now();                 // when this watcher started
let deadline = spawnTime + timeoutS * 1000;   // absolute ms timestamp at which polling stops
let consecutiveFailures = 0;                  // status calls that failed back-to-back
const MAX_CONSECUTIVE_FAILURES = 10;
let recoverySessionKey = null; // captured during polling for steer/kill

// Module-level state accessible by SIGTERM handler
let lastKnownReply = null;

// -- SIGTERM handler (scheduler kills watcher with SIGTERM before SIGKILL) --
// Ensures labels.json is updated and a delivery attempt is made even when killed.
process.on('SIGTERM', () => {
  process.stderr.write(`[watcher] SIGTERM received for ${label} -- marking as interrupted\n`);
  // Try to fetch the latest result before dying. Still best-effort, but the
  // failure reason is now logged instead of being swallowed silently.
  try {
    const result = dispatch('result', ['--label', label]);
    if (result?.lastReply) lastKnownReply = result.lastReply;
  } catch (fetchErr) {
    process.stderr.write(
      `[watcher] SIGTERM result fetch failed for ${label}: ${fetchErr?.message ?? fetchErr}\n`
    );
  }
  // deliverResult calls process.exit(0) internally
  deliverResult(label, lastKnownReply, 'interrupted by watcher timeout');
});
837
+
// -- Rolling deadline vars ------------------------------------
let lastTokens = null;             // token count seen on the previous poll cycle
let preDeadlineJsonlMtime = null;  // JSONL mtime sampled each poll cycle for subagent activity signal
let preDeadlineSessionId = null;   // reset on respawn to avoid cross-session mtime comparison
const ROLLING_EXTEND_MS = 5 * 60 * 1000;           // extend by 5min when active
const MAX_DEADLINE_EXTENSION = 4 * 60 * 60 * 1000; // absolute hard ceiling for any deadline extension

/**
 * Try to move the watcher deadline to "now + ROLLING_EXTEND_MS".
 *
 * The candidate is clamped to spawnTime + min(timeoutS * 1000,
 * MAX_DEADLINE_EXTENSION), so activity-based extensions can never carry the
 * watcher past its own timeout boundary or past the 4h hard ceiling. The
 * deadline is only ever moved forward; a candidate that is not later than the
 * current deadline is ignored.
 *
 * NOTE(review): `deadline` is initialised to spawnTime + timeoutS * 1000,
 * which is already >= this cap. Unless `deadline` is lowered somewhere else,
 * the clamped candidate can never exceed it and this function always returns
 * false -- confirm that is the intended behaviour.
 *
 * @param {string} reason - Human-readable reason for the log line
 * @returns {boolean} true if the deadline was actually moved
 */
function tryExtendDeadline(reason) {
  const hardCap = spawnTime + Math.min(timeoutS * 1000, MAX_DEADLINE_EXTENSION);
  const candidate = Math.min(Date.now() + ROLLING_EXTEND_MS, hardCap);

  if (candidate > deadline) {
    deadline = candidate;
    const minutesLeft = Math.round((deadline - Date.now()) / 60000);
    process.stderr.write(
      `[watcher] [${label}] ${reason}, deadline extended to +${minutesLeft}min\n`
    );
    return true;
  }
  return false;
}
864
+
865
+ // Track whether the session has EVER appeared in the gateway sessions list.
866
+ // Used to distinguish spawn failures (session never appeared) from normal
867
+ // completions (session appeared, ran, then cleaned up).
868
+ let sessionEverFound = false;
869
+
870
+ while (Date.now() < deadline) { // main poll loop: one status check per iteration until the deadline passes
871
+ const status = dispatch('status', ['--label', label]);
872
+
873
+ if (!status?.ok) { // no usable status payload this cycle
874
+ consecutiveFailures++;
875
+ if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) {
876
+ process.stdout.write(`⚠️ dispatch [${label}] watcher: gave up after ${MAX_CONSECUTIVE_FAILURES} consecutive status failures\n`);
877
+ process.exit(1);
878
+ }
879
+ await sleep(pollS * 1000);
880
+ continue;
881
+ }
882
+
883
+ consecutiveFailures = 0;
884
+
885
+ // Capture sessionKey for recovery steer/kill
886
+ if (status.sessionKey) recoverySessionKey = status.sessionKey;
887
+
888
+ // -- Rolling deadline: extend when session shows token activity --
889
+ const currentTokens = status?.liveness?.tokens ?? null;
890
+ if (currentTokens !== null && lastTokens !== null && currentTokens > lastTokens) {
891
+ tryExtendDeadline(`activity detected (${lastTokens}->${currentTokens} tokens)`);
892
+ }
893
+ if (currentTokens !== null) lastTokens = currentTokens;
894
+
895
+ // -- Rolling deadline: extend on JSONL mtime advance (subagent sessions) --
896
+ // Subagent sessions never populate totalTokens in sessions.json, so the token
897
+ // signal above is always null for them. Use JSONL file mtime as an alternative
898
+ // activity signal to prevent killing working subagent sessions mid-task.
899
+ if (status.sessionKey) {
900
+ const storeEntry = getSessionStoreEntry(status.sessionKey);
901
+ const sessionId = storeEntry?.sessionId || null;
902
+ const sessionAgent = status.sessionKey.split(':')[1] || 'main'; // agent = second ':'-separated segment of the key
903
+
904
+ // Reset mtime baseline when the tracked session changes (e.g. after respawn)
905
+ if (sessionId && preDeadlineSessionId !== null && preDeadlineSessionId !== sessionId) {
906
+ preDeadlineJsonlMtime = null;
907
+ }
908
+ if (sessionId) preDeadlineSessionId = sessionId;
909
+
910
+ const curMtime = sessionId ? getSessionJsonlMtime(sessionId, sessionAgent) : null;
911
+ if (curMtime !== null) {
912
+ if (preDeadlineJsonlMtime !== null && curMtime > preDeadlineJsonlMtime + 1000) { // advanced by more than 1000 since the last sample
913
+ tryExtendDeadline('JSONL mtime advanced (subagent active)');
914
+ }
915
+ preDeadlineJsonlMtime = curMtime;
916
+ }
917
+ }
918
+
919
+ // Track session presence -- two independent signals, either is sufficient.
920
+ // 1. Sessions.json store (primary ground truth for dispatcher-spawned sessions)
921
+ // 2. Liveness field from dispatch status (secondary; also built from sessions.json
922
+ // in production, but test mocks may provide it directly)
923
+ if (!sessionEverFound && status.sessionKey) {
924
+ const sessionAgent = status.agent || 'main'; // NOTE(review): taken from status.agent here but parsed from the sessionKey in the JSONL check above -- confirm both agree
925
+ const watcherStore = readSessionsStore(sessionAgent);
926
+ if (watcherStore !== null && status.sessionKey in watcherStore) {
927
+ // Found in sessions.json -- authoritative
928
+ sessionEverFound = true;
929
+ } else if (status.liveness && !status.liveness.error) {
930
+ // Not in sessions.json (or store unavailable) but liveness signal says alive --
931
+ // session may still be initializing. Trust liveness as a secondary signal.
932
+ sessionEverFound = true;
933
+ }
934
+ }
935
+
936
+ // -- Path 0a: agent-side done signal (push-based) ----------
937
+ // If the agent ran `dispatch done --label <label>`, status is 'done' immediately.
938
+ // This is the fast path -- no need to poll for idle timeout.
939
+ // (Handled by Path 1 below since cmdDone sets status='done' in labels.json)
940
+
941
+ // -- Path 0b: 529/overload auto-retry ----------------------
942
+ if (status.status === 'error') {
943
+ const errorMsg = status.error || status.summary || '';
944
+ if (is529Error(errorMsg)) {
945
+ const retryCount = getRetryCount(label);
946
+ const retryResult = attempt529Retry(label, retryCount, errorMsg);
947
+
948
+ if (!retryResult.retry) {
949
+ // Max retries exceeded -- deliver error
950
+ process.stdout.write(
951
+ `🌶️ *dispatch* [${label}] failed after ${MAX_529_RETRIES} retries (529 overload)\n` +
952
+ `Error: ${errorMsg}\n`
953
+ );
954
+ process.exit(1);
955
+ }
956
+
957
+ // Wait with backoff then respawn
958
+ await sleep(retryResult.delayMs);
959
+
960
+ if (respawnSession(label)) {
961
+ // Session respawned -- reset consecutive failures for the fresh session
962
+ consecutiveFailures = 0;
963
+ process.stderr.write(`[watcher] [${label}] retry ${retryResult.newRetryCount} dispatched, continuing poll...\n`);
964
+ await sleep(pollS * 1000);
965
+ continue;
966
+ } else {
967
+ // Respawn failed -- deliver error
968
+ process.stdout.write(
969
+ `🌶️ *dispatch* [${label}] 529 retry failed -- could not respawn session\n` +
970
+ `Error: ${errorMsg}\n`
971
+ );
972
+ process.exit(1);
973
+ }
974
+ }
975
+ }
976
+
977
+ // -- Path 1: status is anything other than 'running' -------
978
+ if (status.status !== 'running') {
979
+ // -- Spawn failure detection -----------------------------------------
980
+ // If the status resolved to 'done', 'spawn-warning' or 'error' but the session was
981
+ // never seen in the gateway, it never ran -- this is a spawn failure.
982
+ // Causes: auth timeout, quota exhaustion, gateway error at spawn time.
983
+ if (!sessionEverFound && (status.status === 'done' || status.status === 'spawn-warning' || status.status === 'error')) {
984
+ const spawnErrMsg =
985
+ `[dispatch] SPAWN FAILURE: session ${status.sessionKey || '(unknown)'} never appeared ` +
986
+ `in gateway -- spawn likely failed (auth timeout, quota, or gateway error). Label: ${label}`;
987
+ process.stderr.write(spawnErrMsg + '\n');
988
+ markLabelError(label, `spawn-failure: session never appeared in gateway`);
989
+ process.stdout.write(
990
+ `🌶️ *dispatch* [${label}] SPAWN FAILURE: session never appeared in gateway -- ` +
991
+ `spawn likely failed (auth timeout, quota, or gateway error)\n`
992
+ );
993
+ process.exit(1);
994
+ }
995
+
996
+ // -- Gateway-restart-kill detection ----------------------------------
997
+ // When a gateway restart kills an in-flight session, the session disappears
998
+ // from sessions.json and the status command auto-resolves it as 'done' with
999
+ // a "session not found in sessions store" summary. This is NOT a real
1000
+ // completion -- the task was interrupted mid-run. Detect this pattern and
1001
+ // re-dispatch up to MAX_GW_RESTART_RETRIES times.
1002
+ //
1003
+ // Key distinction vs spawn failure:
1004
+ // spawn failure: sessionEverFound=false (session never appeared)
1005
+ // gateway-restart-kill: sessionEverFound=true (session ran, then was killed)
1006
+ //
1007
+ // If the session DID produce a lastReply before being killed, deliver it normally.
1008
+ if (sessionEverFound && isGatewayRestartKill(status.summary)) {
1009
+ const gwCheckResult = dispatch('result', ['--label', label]);
1010
+ if (!gwCheckResult?.lastReply) {
1011
+ // No result captured -- session was killed before completing
1012
+ const retryCount = getGwRestartRetryCount(label);
1013
+ if (retryCount >= MAX_GW_RESTART_RETRIES) {
1014
+ markLabelError(label,
1015
+ `gateway-restart-kill: max retries exceeded (${retryCount}x -- ${status.summary})`);
1016
+ notify(`🌶️ Dispatch: [${label}] gateway-restart-kill: max retries exceeded (${MAX_GW_RESTART_RETRIES}x)`);
1017
+ process.stdout.write(
1018
+ `🌶️ *dispatch* [${label}] failed: session killed by gateway restart, ` +
1019
+ `max retries (${MAX_GW_RESTART_RETRIES}) exceeded\n` +
1020
+ `Summary: ${status.summary}\n`
1021
+ );
1022
+ process.exit(1);
1023
+ }
1024
+ const newRetryCount = retryCount + 1;
1025
+ process.stderr.write(
1026
+ `[watcher] gateway-restart-kill detected for [${label}] -- ` +
1027
+ `attempt ${newRetryCount}/${MAX_GW_RESTART_RETRIES}\n`
1028
+ );
1029
+ notify(
1030
+ `🌶️ Dispatch: [${label}] session killed by gateway restart -- ` +
1031
+ `re-dispatching (${newRetryCount}/${MAX_GW_RESTART_RETRIES})`
1032
+ );
1033
+ setGwRestartRetryCount(label, newRetryCount); // persist the attempt count before respawning
1034
+ if (respawnAfterGwRestart(label)) {
1035
+ process.stderr.write(
1036
+ `[watcher] [${label}] gw-restart retry ${newRetryCount} dispatched, continuing poll...\n`
1037
+ );
1038
+ await sleep(pollS * 1000);
1039
+ continue;
1040
+ } else {
1041
+ markLabelError(label,
1042
+ `gateway-restart-kill: respawn failed (attempt ${newRetryCount})`);
1043
+ process.stdout.write(
1044
+ `🌶️ *dispatch* [${label}] failed: session killed by gateway restart, respawn failed\n`
1045
+ );
1046
+ process.exit(1);
1047
+ }
1048
+ }
1049
+ // lastReply present -- session completed before/during kill; fall through to normal delivery
1050
+ }
1051
+
1052
+ // Reset gw-restart retry count once the run reaches a terminal (non-retry) status
1053
+ const gwRetryCount = getGwRestartRetryCount(label);
1054
+ if (gwRetryCount > 0) {
1055
+ setGwRestartRetryCount(label, 0);
1056
+ process.stderr.write(
1057
+ `[watcher] [${label}] completed after ${gwRetryCount} gw-restart retry(ies), reset gwRestartRetryCount\n`
1058
+ );
1059
+ }
1060
+
1061
+ // -- Interrupted: session auto-resolved as incomplete ------------------
1062
+ // When cmdStatus auto-resolves a session as 'interrupted' (idle without
1063
+ // calling done), write a warning notice to stdout and exit non-zero
1064
+ // so the scheduler run is marked as error, not success.
1065
+ //
1066
+ // NOTE: Always resolve as 'interrupted', never 'done'. Only agent-side cmdDone may set status=done.
1067
+ if (status.status === 'interrupted') {
1068
+ process.stderr.write(`[watcher] [${label}] session auto-resolved as interrupted -- work may be incomplete\n`);
1069
+ process.stdout.write(
1070
+ `⚠️ dispatch [${label}] session went idle before completing -- work may be incomplete\n`
1071
+ );
1072
+ markLabelError(label, status.summary || 'interrupted: session went idle without calling done');
1073
+ process.exit(1);
1074
+ }
1075
+
1076
+ // Reset 529 retryCount on successful completion
1077
+ if (status.status === 'done') {
1078
+ const currentRetryCount = getRetryCount(label);
1079
+ if (currentRetryCount > 0) {
1080
+ setRetryCount(label, 0);
1081
+ process.stderr.write(`[watcher] [${label}] completed after ${currentRetryCount} retry(ies), reset retryCount\n`);
1082
+ }
1083
+ }
1084
+ const result = dispatch('result', ['--label', label]); // NOTE(review): a non-529 'error' status for a previously-seen session also reaches this delivery -- confirm intended
1085
+ deliverResult(label, result?.lastReply, status.summary); // deliverResult calls process.exit internally
1086
+ }
1087
+
1088
+ // -- Path 2: status says 'running' but session may be idle -
1089
+ // If the session has no recent activity, proactively check for a result.
1090
+ // This catches the gap where the session completed but status hasn't
1091
+ // auto-resolved yet. The watchdog guard in index.mjs defers auto-resolve
1092
+ // while this watcher's lastPing heartbeat is fresh (written every PING_INTERVAL_MS);
1093
+ // this path handles normal completion before the ping goes stale.
1094
+ const ageMs = status.liveness?.ageMs;
1095
+ if (ageMs != null && ageMs >= IDLE_RESULT_CHECK_MS) {
1096
+ const result = dispatch('result', ['--label', label]);
1097
+ if (result?.lastReply) {
1098
+ deliverResult(label, result.lastReply, null); // deliverResult calls process.exit internally
1099
+ }
1100
+ }
1101
+
1102
+
1103
+ await sleep(pollS * 1000);
1104
+ }
1105
+
// Poll loop ended without delivering -- take one last look at the result and
// the status before falling into the post-deadline activity watch.
const finalResult = dispatch('result', ['--label', label]);
const finalStatus = dispatch('status', ['--label', label]);

if (finalResult?.lastReply) {
  // A reply exists after all: clear any leftover 529 retry counter and deliver.
  if (getRetryCount(label) > 0) setRetryCount(label, 0);
  // deliverResult calls process.exit internally
  deliverResult(label, finalResult.lastReply, finalStatus?.summary || null);
}

switch (finalStatus?.status) {
  case 'done':
    // Explicitly done: exit cleanly even though no lastReply was captured.
    markDoneSync(finalStatus.summary || 'completed');
    process.stdout.write(`✅ dispatch [${label}] completed (status=done, no lastReply captured)\n`);
    process.exit(0);
    break;

  case 'interrupted':
    // Auto-resolved as incomplete: report it and exit non-zero.
    process.stderr.write(`[watcher] [${label}] final status=interrupted -- session idle without completion\n`);
    process.stdout.write(
      `⚠️ dispatch [${label}] session went idle before completing -- work may be incomplete\n`
    );
    markLabelError(label, finalStatus.summary || 'interrupted: session went idle without calling done');
    process.exit(1);
    break;

  default:
    // Any other status: continue to the token-activity watch below.
    break;
}
1129
+
// -- Token-based activity check before steering ----------------------------
// Only steer if tokens have been flat for 3+ minutes post-deadline.
// If the session is still making model calls (tokens growing), stay silent.

/**
 * Best-effort lookup of the session's current token count.
 *
 * Asks getSessionTokens() first when a session key is available; if that does
 * not yield a number, falls back to the liveness.tokens field of
 * `dispatch status` for this watcher's label.
 *
 * @param {string|null} sessionKey - Session key to query, or a falsy value to skip straight to the fallback
 * @returns {number|null} The token count, or null when neither source provides a number
 */
function getTokenCount(sessionKey) {
  if (sessionKey) {
    const fromGateway = getSessionTokens(sessionKey);
    if (typeof fromGateway === 'number') return fromGateway;
  }

  try {
    // sessions.list via gateway would be better but dispatch status has liveness
    const liveTokens = dispatch('status', ['--label', label])?.liveness?.tokens;
    if (typeof liveTokens === 'number') return liveTokens;
    return null;
  } catch {
    return null;
  }
}
1143
+
/**
 * Set this watcher's label entry to status 'done' with the given summary.
 * Best-effort: a failure is logged to stderr and never thrown to the caller.
 *
 * @param {string} summary - Summary text stored on the label entry
 */
function markDoneSync(summary) {
  const applyDone = (record) => {
    record.status = 'done';
    record.summary = summary;
  };

  try {
    updateExistingLabel(label, applyDone);
  } catch (err) {
    process.stderr.write(`[watcher] markDoneSync failed: ${err.message}\n`);
  }
}
1154
+
// -- Baseline capture at the deadline ---------------------------------------
// Snapshot the status, the session key to track, the token baseline and the
// start of the "flat" (no activity) window.
const statusAtDeadline = dispatch('status', ['--label', label]);
let tokenSessionKey = statusAtDeadline?.sessionKey || recoverySessionKey || null;
let baselineTokens = getTokenCount(tokenSessionKey);
let flatSince = Date.now();

// The internal sessionId (UUID) from sessions.json is the JSONL filename and
// is distinct from the sessionKey (agent:main:subagent:UUID). Per the original
// author's note the JSONL is updated continuously during active turns, which
// makes its mtime a usable activity signal when sessions.json
// totalTokens/updatedAt are stale.
const _deadlineEntry = getSessionStoreEntry(tokenSessionKey);
const sessionInternalId = _deadlineEntry?.sessionId || null;
const sessionAgent = (tokenSessionKey?.split(':')[1]) || 'main';
let lastJsonlMtime = getSessionJsonlMtime(sessionInternalId, sessionAgent);

process.stderr.write(`[watcher] deadline hit for ${label} -- watching token activity (baseline: ${baselineTokens})\n`);
if (sessionInternalId) {
  process.stderr.write(`[watcher] ${label} JSONL tracking: sessionId=${sessionInternalId} mtime=${lastJsonlMtime}\n`);
}

// Early exits: the session is already done, or no token count is available
// at all (the original author attributes null tokens to the gateway having
// pruned the session).
if (statusAtDeadline?.status === 'done' || baselineTokens === null) {
  const doneAtDeadline = statusAtDeadline?.status === 'done';

  const deadlineResult = dispatch('result', ['--label', label]);
  if (deadlineResult?.lastReply) {
    // deliverResult calls process.exit(0) internally
    deliverResult(label, deadlineResult.lastReply, statusAtDeadline?.summary || null);
  }

  if (doneAtDeadline) {
    // Status is explicitly done -- exit cleanly, no timeout noise
    markDoneSync(statusAtDeadline?.summary || 'completed');
    process.stdout.write(`✅ dispatch [${label}] completed (status=done at deadline)\n`);
    process.exit(0);
  }

  if (baselineTokens === null) {
    // No result and no tokens: decide based on whether the session is still
    // present in the store (it may just be mid-tool-call with no tokens yet).
    const storeEntry = getSessionStoreEntry(tokenSessionKey);
    if (!storeEntry) {
      // Session truly not found -- telemetry unavailable, exit
      process.stderr.write(`[watcher] token telemetry unavailable for ${label}; session not in store\n`);
      markLabelError(label, `timed out after ${timeoutS}s -- token telemetry unavailable`);
      process.stdout.write(`⏱ dispatch [${label}] timed out after ${timeoutS}s -- token telemetry unavailable; no steer/kill attempted\n`);
      process.exit(1);
    }
    // Session IS in store but has no tokens -- continue into the activity
    // window, using updatedAt as the activity signal instead of tokens.
    process.stderr.write(`[watcher] ${label} in store but no tokens (mid-tool-call?) -- using updatedAt as activity signal\n`);
    baselineTokens = -1; // sentinel: token-free mode
  }
}
1204
+
// -- Post-deadline flat-window watch ----------------------------------------
// Keep waiting while the session shows activity. Each iteration:
//   1. delivers and exits if the label is done or a lastReply exists,
//   2. resets the flat timer on token growth, or -- when no token count is
//      available -- on an advancing updatedAt in the session store,
//   3. resets the flat timer when the session's JSONL mtime advanced.
// The loop ends once FLAT_WINDOW_MS passes without any reset.
//
// Step 3 runs on every iteration, including when the token count is null.
// Previously the null-token branch ended with `continue`, which skipped the
// JSONL check for exactly the sessions without token telemetry that the JSONL
// signal exists for.
while (Date.now() - flatSince < FLAT_WINDOW_MS) {
  await sleep(ACTIVITY_POLL_MS);

  // Delivered?
  const st = dispatch('status', ['--label', label]);
  if (st?.sessionKey && !tokenSessionKey) tokenSessionKey = st.sessionKey;
  if (st?.status === 'done') {
    const r = dispatch('result', ['--label', label]);
    // deliverResult calls process.exit(0) internally
    deliverResult(label, r?.lastReply, st.summary);
  }
  const r2 = dispatch('result', ['--label', label]);
  if (r2?.lastReply) {
    // deliverResult calls process.exit(0) internally
    deliverResult(label, r2.lastReply, null);
  }

  // Token growth?
  const cur = getTokenCount(tokenSessionKey);
  if (cur === null) {
    // No token count: fall back to the store entry's updatedAt.
    const entry = getSessionStoreEntry(tokenSessionKey);
    if (!entry) {
      process.stderr.write(`[watcher] token telemetry lost for ${label}; session gone from store\n`);
      markLabelError(label, `timed out after ${timeoutS}s -- token telemetry lost`);
      process.stdout.write(`⏱ dispatch [${label}] timed out after ${timeoutS}s -- token telemetry lost; no steer/kill attempted\n`);
      process.exit(1);
    }
    // Still in store -- check if updatedAt advanced (tool call still running)
    // Normalize: updatedAt may be seconds or milliseconds depending on agent framework version
    const rawUpdatedAt = entry.updatedAt;
    const updatedAt = (typeof rawUpdatedAt === 'number' && rawUpdatedAt < 1e12)
      ? rawUpdatedAt * 1000 // seconds -> milliseconds
      : rawUpdatedAt;
    if (typeof updatedAt === 'number' && updatedAt > flatSince) {
      process.stderr.write(`[watcher] ${label} no tokens but updatedAt advanced -- tool call active, resetting flat timer\n`);
      flatSince = Date.now();
    } else {
      process.stderr.write(`[watcher] ${label} no tokens, updatedAt not advancing -- may be stuck\n`);
    }
    // Don't exit -- let FLAT_WINDOW_MS timeout handle the stuck case normally.
    // Execution continues to the JSONL mtime check below.
  } else if (baselineTokens !== -1 && cur > baselineTokens) {
    // Normal token comparison (skipped while in token-free sentinel mode)
    process.stderr.write(`[watcher] ${label} still active (${baselineTokens}->${cur} tokens), resetting flat timer\n`);
    baselineTokens = cur;
    flatSince = Date.now();
  } else if (baselineTokens === -1 && cur > 0) {
    // Tokens appeared for the first time -- switch from sentinel to real token tracking
    process.stderr.write(`[watcher] ${label} tokens now available (${cur}), switching to token tracking\n`);
    baselineTokens = cur;
    flatSince = Date.now();
  }

  // -- JSONL mtime check -----------------------------------------------------
  // Per the original author's note this is the most reliable activity signal
  // for spawned subagent sessions: OpenClaw does NOT flush totalTokens or
  // updatedAt in sessions.json during active turns, but the JSONL file IS
  // written continuously. If the mtime advanced by more than 1000 since the
  // last recorded value, treat the session as active and reset the flat timer.
  const curJsonlMtime = getSessionJsonlMtime(sessionInternalId, sessionAgent);
  if (curJsonlMtime !== null) {
    if (lastJsonlMtime !== null && curJsonlMtime > lastJsonlMtime + 1000) {
      process.stderr.write(
        `[watcher] ${label} JSONL mtime advanced (${lastJsonlMtime}->${curJsonlMtime}ms), ` +
        `session active -- resetting flat timer\n`
      );
      lastJsonlMtime = curJsonlMtime;
      flatSince = Date.now();
    } else if (lastJsonlMtime === null) {
      // First observation -- just record, don't reset yet
      process.stderr.write(`[watcher] ${label} JSONL mtime first observation: ${curJsonlMtime}\n`);
      lastJsonlMtime = curJsonlMtime;
    }
  }
}
1281
+
// -- Pre-steer JSONL sanity check ------------------------------------------
// Last guard before steering: ask getJsonlMidTurnReason() whether the session
// looks mid-turn. Per the original author's note, a mid-turn session has an
// in-flight tool call (last JSONL entry is tool_use or tool_result), and
// steering it or declaring it done would interrupt active work and produce a
// partial/zombie result.
//
// When a mid-turn reason is reported, the flat window is restarted once. While
// waiting, completion is still delivered immediately, and JSONL or token
// activity keeps resetting the timer. Once the window finally expires, the
// watcher proceeds to steer regardless.
if (sessionInternalId) {
  const midTurn = getJsonlMidTurnReason(sessionInternalId, sessionAgent);
  if (midTurn) {
    process.stderr.write(
      `[watcher] ${label} pre-steer sanity check: ${midTurn} -- ` +
      `session is mid-turn, extending flat window once\n`
    );
    notify(`🌶️ Dispatch: [${label}] pre-steer: mid-turn detected (${midTurn}), extending wait`);
    flatSince = Date.now();

    while (Date.now() - flatSince < FLAT_WINDOW_MS) {
      await sleep(ACTIVITY_POLL_MS);

      // Completion check -- deliverResult calls process.exit(0) internally
      const extStatus = dispatch('status', ['--label', label]);
      if (extStatus?.status === 'done') {
        const doneResult = dispatch('result', ['--label', label]);
        deliverResult(label, doneResult?.lastReply, extStatus.summary);
      }
      const latestResult = dispatch('result', ['--label', label]);
      if (latestResult?.lastReply) {
        deliverResult(label, latestResult.lastReply, null);
      }

      // JSONL activity: always remember the newest mtime, and restart the
      // flat timer when it moved by more than 1000 since the last sample.
      const mtimeNow = getSessionJsonlMtime(sessionInternalId, sessionAgent);
      if (mtimeNow !== null) {
        const advanced = lastJsonlMtime !== null && mtimeNow > lastJsonlMtime + 1000;
        if (advanced) {
          process.stderr.write(
            `[watcher] ${label} JSONL mtime advanced during extended wait (${lastJsonlMtime}->${mtimeNow}ms), resetting flat timer\n`
          );
        }
        lastJsonlMtime = mtimeNow;
        if (advanced) {
          flatSince = Date.now();
        }
      }

      // Token activity (ignored while in the -1 token-free sentinel mode)
      const tokensNow = getTokenCount(tokenSessionKey);
      const tokensGrew = tokensNow !== null && baselineTokens !== -1 && tokensNow > baselineTokens;
      if (tokensGrew) {
        process.stderr.write(`[watcher] ${label} tokens advanced during extended wait, resetting flat timer\n`);
        baselineTokens = tokensNow;
        flatSince = Date.now();
      }
    }

    // Extended window expired -- proceed to steer regardless
    process.stderr.write(`[watcher] ${label} extended mid-turn wait expired -- proceeding to steer\n`);
  }
}
1340
+
// -- Steer / kill escalation -------------------------------------------------
// The flat window expired without activity -- now steer.
process.stderr.write(`[watcher] ${label} inactive 3min post-deadline -- entering steer\n`);

// Session key for steer/kill. Prefer the freshest status; if that call returns
// no key (e.g. the status command failed), fall back to the key tracked for
// token activity and then to the key captured during polling. Without the
// fallback a missing key silently skipped every steer message and the kill.
const statusForSteer = dispatch('status', ['--label', label]);
const steerSessionKey =
  statusForSteer?.sessionKey || tokenSessionKey || recoverySessionKey || null;

// Two nudges followed by a kill round (msg === null). waitMs is how long to
// wait after each round's action before re-checking for completion.
const steerRounds = [
  { waitMs: 30_000, msg: "Watcher check: if you're done, please send your final reply now. If still working, continue and ignore this." },
  { waitMs: 60_000, msg: "Watcher final check: please send your final reply now, or the session will be terminated in 2 minutes." },
  { waitMs: 120_000, msg: null }, // kill round
];

for (const round of steerRounds) {
  if (round.msg && steerSessionKey) {
    process.stderr.write(`[watcher] steering ${label}: "${round.msg.slice(0, 60)}..."\n`);
    await steerSession(steerSessionKey, round.msg);
  }
  await sleep(round.waitMs);

  // Did the session finish or produce a reply while we waited?
  const st2 = dispatch('status', ['--label', label]);
  if (st2?.status === 'done') {
    const doneResult = dispatch('result', ['--label', label]);
    // deliverResult calls process.exit(0) internally
    deliverResult(label, doneResult?.lastReply, st2.summary);
  }
  const r3 = dispatch('result', ['--label', label]);
  if (r3?.lastReply) {
    // deliverResult calls process.exit(0) internally
    deliverResult(label, r3.lastReply, null);
  }

  if (!round.msg && steerSessionKey) {
    process.stderr.write(`[watcher] killing stuck session ${label}\n`);
    await killSession(steerSessionKey);
    // Wait up to 30s for confirmation
    for (let i = 0; i < 6; i++) {
      await sleep(5000);
      const st3 = dispatch('status', ['--label', label]);
      if (st3?.status === 'done') {
        // Check if a result was captured before marking as error
        const r4 = dispatch('result', ['--label', label]);
        if (r4?.lastReply) {
          deliverResult(label, r4.lastReply, st3.summary); // deliverResult calls process.exit(0)
        }
        markLabelError(label, 'timed out -- killed after steer attempts (no result captured)');
        process.stdout.write(`⏱ dispatch [${label}] killed after steer attempts -- no result captured\n`);
        process.exit(1);
      }
    }
  }
}

// Every round ran without a delivery or a confirmed kill: record the timeout.
markLabelError(label, `timed out after ${timeoutS}s -- killed after steer attempts`);
process.stdout.write(`⏱ dispatch [${label}] timed out after ${timeoutS}s -- session killed after steer attempts\n`);
process.exit(1);