openclaw-scheduler 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. package/AGENTS.md +302 -0
  2. package/BEST-PRACTICES.md +506 -0
  3. package/CHANGELOG.md +82 -0
  4. package/CODE_OF_CONDUCT.md +22 -0
  5. package/CONTEXT.md +26 -0
  6. package/CONTRIBUTING.md +73 -0
  7. package/IMPLEMENTATION_SPEC.md +170 -0
  8. package/INSTALL-ADDITIONAL-HOST.md +333 -0
  9. package/INSTALL-LINUX.md +419 -0
  10. package/INSTALL-WINDOWS.md +305 -0
  11. package/INSTALL.md +364 -0
  12. package/JOB-QUICK-REF.md +222 -0
  13. package/LICENSE +21 -0
  14. package/QUICK-START.md +256 -0
  15. package/README.md +2170 -0
  16. package/SECURITY.md +34 -0
  17. package/UNINSTALL.md +129 -0
  18. package/UPGRADING.md +436 -0
  19. package/agents.js +67 -0
  20. package/approval.js +107 -0
  21. package/backup.js +390 -0
  22. package/bin/openclaw-scheduler.js +138 -0
  23. package/cli.js +1083 -0
  24. package/db.js +122 -0
  25. package/dispatch/529-recovery.mjs +204 -0
  26. package/dispatch/README.md +372 -0
  27. package/dispatch/config.example.json +24 -0
  28. package/dispatch/deliver-watcher.sh +57 -0
  29. package/dispatch/hooks.mjs +171 -0
  30. package/dispatch/index.mjs +1836 -0
  31. package/dispatch/watcher.mjs +1396 -0
  32. package/dispatch-queue.js +112 -0
  33. package/dispatcher-approvals.js +96 -0
  34. package/dispatcher-delivery.js +43 -0
  35. package/dispatcher-maintenance.js +242 -0
  36. package/dispatcher-shell.js +29 -0
  37. package/dispatcher-strategies.js +1280 -0
  38. package/dispatcher-utils.js +81 -0
  39. package/dispatcher.js +855 -0
  40. package/docs/adr-schedule-ownership.md +73 -0
  41. package/docs/gateway-contract.md +904 -0
  42. package/docs/plans/2026-03-09-fix-typescript-types.md +91 -0
  43. package/docs/plans/2026-03-09-test-coverage-gaps.md +83 -0
  44. package/docs/plans/2026-03-10-dispatcher-refactor.md +801 -0
  45. package/docs/trust-architecture.md +266 -0
  46. package/gateway.js +473 -0
  47. package/idempotency.js +119 -0
  48. package/index.d.ts +864 -0
  49. package/index.js +17 -0
  50. package/jobs.js +1224 -0
  51. package/messages.js +357 -0
  52. package/migrate-consolidate.js +694 -0
  53. package/migrate.js +125 -0
  54. package/package.json +130 -0
  55. package/paths.js +79 -0
  56. package/prompt-context.js +94 -0
  57. package/retrieval.js +176 -0
  58. package/runs.js +270 -0
  59. package/scheduler-schema.js +101 -0
  60. package/schema.sql +480 -0
  61. package/scripts/dispatch-cli-utils.mjs +65 -0
  62. package/scripts/inbox-consumer.mjs +288 -0
  63. package/scripts/stuck-detector.sh +18 -0
  64. package/scripts/stuck-run-detector.mjs +333 -0
  65. package/scripts/telegram-webhook-check.mjs +238 -0
  66. package/setup.mjs +724 -0
  67. package/shell-result.js +214 -0
  68. package/task-tracker.js +300 -0
  69. package/team-adapter.js +335 -0
  70. package/v02-runtime.js +599 -0
@@ -0,0 +1,288 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * inbox-consumer.mjs
4
+ *
5
+ * Drains pending scheduler messages for an agent and delivers them to a channel target.
6
+ * Intended for signal-only queue patterns where scripts enqueue actionable messages.
7
+ *
8
+ * Usage:
9
+ * node scripts/inbox-consumer.mjs --to <target-id> [--channel telegram] [--agent main] [--limit 50]
10
+ * node scripts/inbox-consumer.mjs --to <target-id> --watch
11
+ *
12
+ * Env fallbacks:
13
+ * INBOX_DELIVERY_TO
14
+ * INBOX_DELIVERY_CHANNEL (default: telegram)
15
+ * INBOX_AGENT (default: main)
16
+ * INBOX_LIMIT (default: 50)
17
+ */
18
+
19
+ import { dirname, basename, join, resolve } from 'path';
20
+ import { fileURLToPath } from 'url';
21
+ import { watch } from 'fs';
22
+ import { getDb } from '../db.js';
23
+ import { resolveSchedulerDbPath } from '../paths.js';
24
+ import { deliverMessage } from '../gateway.js';
25
+ import { ackMessage, recordMessageAttempt } from '../messages.js';
26
+
27
+ const __dirname = dirname(fileURLToPath(import.meta.url));
28
+
29
/**
 * Parse `--key value` style CLI arguments into a plain object.
 *
 * - `--watch` is always a boolean switch and never consumes a value.
 * - `--key value` stores the value as a string.
 * - `--key` followed by nothing (or by another `--flag`) stores `true`.
 * - Tokens that do not start with `--` and were not consumed as a value are ignored.
 *
 * @param {string[]} argv - arguments after the script name
 * @returns {Record<string, string | boolean>} parsed options (always has `watch`)
 */
function parseArgs(argv) {
  const parsed = { watch: false };
  let idx = 0;
  while (idx < argv.length) {
    const token = argv[idx];
    idx += 1;
    if (token === '--watch') {
      parsed.watch = true;
    } else if (token.startsWith('--')) {
      const name = token.slice(2);
      const next = argv[idx];
      const hasValue = Boolean(next) && !next.startsWith('--');
      parsed[name] = hasValue ? next : true;
      if (hasValue) idx += 1; // the value belongs to this flag; step over it
    }
  }
  return parsed;
}
49
+
50
/**
 * Parse a base-10 integer and accept it only when it is strictly positive.
 *
 * @param {unknown} input - raw value (nullish is treated as an empty string)
 * @param {number} fallback - returned when the input is not a positive integer
 * @returns {number}
 */
function parsePositiveInt(input, fallback) {
  const parsed = Number.parseInt(String(input ?? ''), 10);
  if (!Number.isFinite(parsed) || parsed <= 0) {
    return fallback;
  }
  return parsed;
}
54
+
55
/**
 * Render how long ago a timestamp occurred as a short label ("42s ago", "3h ago").
 *
 * Accepts ISO-8601 strings, SQLite-style "YYYY-MM-DD HH:MM:SS" strings and Date
 * objects. Strings without zone information are interpreted as UTC. Strings
 * that already carry a zone designator (`Z` or a numeric offset such as
 * `+00:00`) are parsed as-is; appending `Z` to those would make them invalid.
 *
 * @param {string | Date | null | undefined} dateStr - timestamp to describe
 * @returns {string} relative age, or "unknown" when the value is missing or unparseable
 */
function timeAgo(dateStr) {
  if (!dateStr) return 'unknown';

  let ts;
  if (dateStr instanceof Date) {
    ts = dateStr.getTime();
  } else {
    const raw = String(dateStr).trim();
    // SQLite separates date and time with a space; ISO-8601 wants a "T".
    const normalized = raw.includes('T') ? raw : raw.replace(' ', 'T');
    const hasZone = normalized.endsWith('Z') || /T[\d:.]+[+-]\d{2}:?\d{2}$/.test(normalized);
    ts = new Date(hasZone ? normalized : `${normalized}Z`).getTime();
  }
  if (Number.isNaN(ts)) return 'unknown';

  // Clamp to zero so timestamps slightly in the future never yield negative ages.
  const sec = Math.max(0, Math.floor((Date.now() - ts) / 1000));
  if (sec < 60) return `${sec}s ago`;
  const min = Math.floor(sec / 60);
  if (min < 60) return `${min}m ago`;
  const hr = Math.floor(min / 60);
  if (hr < 24) return `${hr}h ago`;
  return `${Math.floor(hr / 24)}d ago`;
}
68
+
69
/** Control tokens that must never be shown in user-facing delivery text. */
const DELIVERY_SENTINELS = [
  'HEARTBEAT_OK',
  'NO_FLUSH',
  'IDEMPOTENT_SKIP',
];
71
+
72
/**
 * Remove shell-runner noise from text that is about to be delivered.
 *
 * Two things are dropped:
 *   - a leading "stdout:" / "stderr:" prefix (case-insensitive), and
 *   - blank lines and timestamped log lines such as
 *     "[2026-03-31 00:21:03] INFO === Auto-Settle starting ===" (INFO, DEBUG, WARN only).
 * Every other line is kept verbatim; the joined result is trimmed.
 *
 * @param {string} text - raw output
 * @returns {string} the remaining lines, possibly an empty string
 */
function cleanShellOutput(text) {
  const logLine = /^\[\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}\]\s+(INFO|DEBUG|WARN)\s/;
  const withoutPrefix = text.replace(/^stdout:\s*/i, '').replace(/^stderr:\s*/i, '');

  const kept = [];
  for (const line of withoutPrefix.split('\n')) {
    const candidate = line.trim();
    if (candidate === '' || logLine.test(candidate)) continue;
    kept.push(line); // keep the untrimmed line so inner formatting survives
  }
  return kept.join('\n').trim();
}
93
+
94
/**
 * Format a single message for user-facing delivery.
 *
 * Steps, in order:
 *   1. Trim the body and strip every trailing sentinel token. Stripping repeats
 *      until no sentinel is left at the end, so stacked sentinels are removed
 *      regardless of the order in which they appear.
 *   2. Remove shell output noise via cleanShellOutput().
 *   3. Prepend a "<brand> | <subject> | <age>" header.
 *   4. Cap the result at 4000 characters.
 *
 * @param {{ body?: string, subject?: string, created_at?: string }} msg - message row
 * @param {{ brand?: string }} [options] - `brand` is the header display name (default "Scheduler")
 * @returns {string | null} text to deliver, or null when nothing is left after cleaning
 */
function formatMessageForDelivery(msg, { brand = 'Scheduler' } = {}) {
  let body = (msg.body || '').trim();

  // A single pass is order-dependent ("... HEARTBEAT_OK NO_FLUSH" would keep
  // HEARTBEAT_OK), so loop until a full pass strips nothing. The body shrinks on
  // every strip, which guarantees termination.
  let strippedAny = true;
  while (strippedAny) {
    strippedAny = false;
    for (const sentinel of DELIVERY_SENTINELS) {
      if (body.endsWith(sentinel)) {
        body = body.slice(0, -sentinel.length).trim();
        strippedAny = true;
      }
    }
  }

  body = cleanShellOutput(body);
  if (!body) return null;

  const age = timeAgo(msg.created_at);
  const subject = msg.subject || 'Notification';
  const header = `${brand} | ${subject} | ${age}`;

  return `${header}\n\n${body}`.slice(0, 4000);
}
123
+
124
/**
 * Legacy debug rendering of a batch of messages (described as the --verbose format).
 *
 * Produces a header line with the agent id and message count, then one section
 * per message: a blank line, a metadata line, an optional subject line, the
 * trimmed body capped at 1200 characters, and a `---` separator.
 *
 * @param {object[]} msgs - message rows
 * @param {string} agentId - inbox owner shown in the header
 * @returns {string}
 */
function _formatMessagesDebug(msgs, agentId) {
  const header = `Inbox for ${agentId}: ${msgs.length} message(s)`;
  const sections = msgs.flatMap((msg) => {
    const section = [
      '',
      `[${msg.kind}] from=${msg.from_agent} age=${timeAgo(msg.created_at)} priority=${msg.priority}`,
    ];
    if (msg.subject) section.push(`subject: ${msg.subject}`);
    if (msg.body) section.push(msg.body.trim().slice(0, 1200));
    section.push('---');
    return section;
  });
  return [header, ...sections].join('\n').trim();
}
138
+
139
/**
 * Fetch messages addressed to an agent (or to 'broadcast') whose status is
 * 'pending' or 'delivered'.
 *
 * Rows are ordered by kind (constraint, decision, fact, task, preference, then
 * anything else), then by descending priority, then oldest first.
 *
 * @param {object} db - database handle exposing `prepare(sql).all(...params)`
 * @param {string} agentId - recipient agent id
 * @param {number} limit - maximum number of rows to return
 * @returns {object[]} matching message rows
 */
function selectPendingMessages(db, agentId, limit) {
  const query = `
    SELECT id, from_agent, to_agent, subject, body, kind, created_at, priority,
           delivery_to, channel
    FROM messages
    WHERE (to_agent = ? OR to_agent = 'broadcast')
      AND status IN ('pending', 'delivered')
    ORDER BY
      CASE kind
        WHEN 'constraint' THEN 0
        WHEN 'decision' THEN 1
        WHEN 'fact' THEN 2
        WHEN 'task' THEN 3
        WHEN 'preference' THEN 4
        ELSE 5
      END ASC,
      priority DESC,
      created_at ASC
    LIMIT ?
  `;
  const statement = db.prepare(query);
  return statement.all(agentId, limit);
}
160
+
161
/**
 * Deliver one batch of pending messages for an agent.
 *
 * Each message is formatted and sent on its own, so one failing delivery does
 * not block the rest. A per-message `delivery_to` / `channel` overrides the
 * batch-level `to` / `channel`. Messages that format to nothing are acked
 * without being sent and still count towards the returned total.
 *
 * @param {object} db - database handle passed to selectPendingMessages
 * @param {{ to: string, channel: string, agentId: string, limit: number, brand: string }} options
 * @returns {Promise<number>} number of messages delivered or suppressed
 * @throws {Error} after the whole batch is processed, if any delivery failed
 */
async function drainOnce(db, { to, channel, agentId, limit, brand }) {
  const pending = selectPendingMessages(db, agentId, limit);
  if (pending.length === 0) return 0;

  const actor = 'inbox-consumer';
  const failures = [];
  let delivered = 0;

  for (const msg of pending) {
    const target = msg.delivery_to || to;
    const via = msg.channel || channel;
    const text = formatMessageForDelivery(msg, { brand });

    if (!text) {
      // Nothing user-visible is left: ack so the message is not picked up again.
      ackMessage(msg.id, actor, 'Suppressed (empty after sentinel strip)');
      delivered += 1;
      continue;
    }

    try {
      // Space deliveries out to avoid gateway rate/concurrency issues.
      if (delivered > 0) {
        await new Promise((done) => {
          setTimeout(done, 1500);
        });
      }
      await deliverMessage(via, target, text);
      recordMessageAttempt(msg.id, { ok: true, actor });
      ackMessage(msg.id, actor, `Delivered to ${via}:${target}`);
      delivered += 1;
    } catch (err) {
      recordMessageAttempt(msg.id, {
        ok: false,
        actor,
        error: err.message || 'delivery failed',
      });
      failures.push(err);
    }
  }

  if (delivered > 0) {
    process.stdout.write(`[inbox-consumer] delivered ${delivered} message(s)\n`);
  }
  if (failures.length > 0) {
    const detail = failures.map((e) => e.message).join('; ');
    throw new Error(`Delivery failed for ${failures.length} message(s): ${detail}`);
  }
  return delivered;
}
209
+
210
// -- CLI / environment configuration --------------------------

const args = parseArgs(process.argv.slice(2));

/**
 * parseArgs stores a flag given without a value (e.g. a bare `--to`) as the
 * boolean `true`. Only string values are meaningful for the options below, so
 * anything else counts as "not provided" and the env/default fallback applies.
 * Without this, a bare `--to` would pass the missing-target check and `true`
 * would be used as the delivery target.
 */
const stringFlag = (value) => (typeof value === 'string' ? value : '');

const deliveryTo = stringFlag(args.to) || process.env.INBOX_DELIVERY_TO || '';
const channel = stringFlag(args.channel) || process.env.INBOX_DELIVERY_CHANNEL || 'telegram';
const agentId = stringFlag(args.agent) || process.env.INBOX_AGENT || 'main';
const limit = parsePositiveInt(stringFlag(args.limit) || process.env.INBOX_LIMIT, 50);

// Brand resolution: --brand flag > INBOX_BRAND env > dispatch config brand/name > "Scheduler"
let brand = stringFlag(args.brand) || process.env.INBOX_BRAND || '';
if (!brand) {
  try {
    // The dispatch config directory defaults to "dispatch" next to the scheduler DB file.
    const configDir = process.env.DISPATCH_CONFIG_DIR || join(resolve(resolveSchedulerDbPath({ env: process.env }), '..'), 'dispatch');
    const { readFileSync } = await import('node:fs');
    const config = JSON.parse(readFileSync(join(configDir, 'config.json'), 'utf8'));
    brand = config.brand || config.name || '';
  } catch (_e) { /* no dispatch config -- use default */ }
}
if (!brand) brand = 'Scheduler';
const watchMode = Boolean(args.watch);

if (!deliveryTo) {
  process.stderr.write('[inbox-consumer] missing delivery target; pass --to or set INBOX_DELIVERY_TO\n');
  process.exit(1);
}

// Watch mode observes the directory holding the DB and reacts to its "-wal" file.
const dbPath = resolve(resolveSchedulerDbPath({ env: process.env }));
const watchDir = dirname(dbPath);
const walFile = `${basename(dbPath)}-wal`;

try {
  const db = getDb();

  if (!watchMode) {
    // One-shot mode: a delivery failure propagates to the outer catch (exit 1).
    await drainOnce(db, { to: deliveryTo, channel, agentId, limit, brand });
    process.exit(0);
  }

  process.stdout.write(`[inbox-consumer] watching ${join(watchDir, walFile)}\n`);
  try {
    await drainOnce(db, { to: deliveryTo, channel, agentId, limit, brand });
  } catch (err) {
    // In watch mode a failed drain is logged and the watcher keeps running.
    process.stderr.write(`[inbox-consumer] initial drain error: ${err.message}\n`);
  }

  let timer = null;
  let draining = false;

  // Only one drain runs at a time; a trigger that fires while a drain is in
  // flight is ignored.
  const runDebouncedDrain = async () => {
    if (draining) return;
    draining = true;
    try {
      await drainOnce(db, { to: deliveryTo, channel, agentId, limit, brand });
    } catch (err) {
      process.stderr.write(`[inbox-consumer] drain error: ${err.message}\n`);
    } finally {
      draining = false;
    }
  };

  const watcher = watch(watchDir, (_eventType, filename) => {
    // A null filename means the changed file is unknown; treat it as a possible WAL change.
    if (filename !== null && filename !== walFile) return;
    // Debounce: collapse bursts of events into one drain 250ms after the last event.
    if (timer) clearTimeout(timer);
    timer = setTimeout(() => {
      timer = null;
      runDebouncedDrain();
    }, 250);
  });

  const shutdown = (signal) => {
    if (timer) clearTimeout(timer);
    watcher.close();
    process.stdout.write(`[inbox-consumer] ${signal}; exiting\n`);
    process.exit(0);
  };

  process.on('SIGINT', () => shutdown('SIGINT'));
  process.on('SIGTERM', () => shutdown('SIGTERM'));
} catch (err) {
  process.stderr.write(`[inbox-consumer] error: ${err.stack || err.message}\n`);
  process.exit(1);
}
@@ -0,0 +1,18 @@
1
#!/bin/bash
# Wrapper for stuck-run-detector.mjs that ensures proper PATH.
#
# Environment:
#   NODE_BIN             path to the node binary (default: first `node` on PATH)
#   STUCK_THRESHOLD_MIN  value passed as --threshold-min (default: 30)
#
# Any arguments given to this wrapper are forwarded to the detector after the
# default --threshold-min, so e.g. `stuck-detector.sh --limit 5` now works.
# The detector's argument parser keeps the last value for a repeated flag, so an
# explicit --threshold-min on the command line overrides the env default.

# Ensure common Node.js install locations are in PATH
case "$(uname -s)" in
  Darwin) export PATH="/opt/homebrew/bin:/usr/local/bin:$PATH" ;;
  *)      export PATH="/usr/local/bin:$PATH" ;;
esac

NODE_BIN="${NODE_BIN:-$(command -v node 2>/dev/null)}"
if [ -z "$NODE_BIN" ]; then
  echo "[stuck-detector] node not found in PATH" >&2
  exit 1
fi
THRESHOLD_MIN="${STUCK_THRESHOLD_MIN:-30}"
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

# exec replaces this shell so the detector's exit code becomes the wrapper's.
exec "$NODE_BIN" "$SCRIPT_DIR/stuck-run-detector.mjs" --threshold-min "$THRESHOLD_MIN" "$@"
@@ -0,0 +1,333 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * stuck-run-detector.mjs
4
+ *
5
+ * Three-phase stuck detection: liveness gating -> steer -> alert.
6
+ *
7
+ * Phase 1 (Liveness gate): For dispatch-tracked runs, check actual session
8
+ * activity via `dispatch status`. Skip if tokens recently active or status=done.
9
+ * Phase 2 (Steer): Send a nudge into the session before alerting. Give it
10
+ * one more cycle to respond.
11
+ * Phase 3 (Alert): If steer was ignored (tokens flat), alert as genuinely stuck.
12
+ *
13
+ * Non-dispatch scheduler jobs skip phases 1-2 and alert immediately (old behavior).
14
+ *
15
+ * State persisted in <os.tmpdir()>/stuck-detector-state.json by default, or at STUCK_STATE_PATH when set (volatile, OK to lose on reboot).
16
+ *
17
+ * Usage:
18
+ * node scripts/stuck-run-detector.mjs [--threshold-min 45] [--limit 20]
19
+ *
20
+ * Exit codes:
21
+ * 0: no stuck runs (includes steered-but-not-yet-stuck)
22
+ * 1: genuinely stuck runs found or hard error
23
+ */
24
+
25
+ import { getDb } from '../db.js';
26
+ import { readFileSync, writeFileSync } from 'fs';
27
+ import { execFileSync } from 'child_process';
28
+ import { join, dirname } from 'path';
29
+ import { fileURLToPath } from 'url';
30
+ import { tmpdir } from 'os';
31
+ import { resolveDispatchCliPath, resolveDispatchLabel } from './dispatch-cli-utils.mjs';
32
+
33
+ const __dirname = dirname(fileURLToPath(import.meta.url));
34
+
35
+ // -- Paths ----------------------------------------------------
36
+
37
// Dispatch label registry; defaults to ../dispatch/labels.json relative to this script.
const LABELS_PATH = process.env.DISPATCH_LABELS_PATH
  || join(__dirname, '..', 'dispatch', 'labels.json');
// Steer/alert bookkeeping between runs; defaults to a file in the OS temp directory.
const STATE_PATH = process.env.STUCK_STATE_PATH
  || join(tmpdir(), 'stuck-detector-state.json');
const DISPATCH_CLI = resolveDispatchCliPath(process.env);
// No path separator means a bare command name: it is executed directly rather
// than run as a script through the current node binary.
const DISPATCH_IS_BIN = !['/', '\\'].some((separator) => DISPATCH_CLI.includes(separator));

// -- Constants ------------------------------------------------

// 3 min -- tokens active within this window = alive
const LIVENESS_THRESHOLD_MS = 3 * 60 * 1000;
45
+
46
+ // -- Arg Parsing ----------------------------------------------
47
+
48
/**
 * Parse `--key value` style CLI arguments into a plain object.
 *
 * `--key value` stores the string value; `--key` followed by nothing or by
 * another `--flag` stores `true`. Tokens that do not start with `--` are
 * ignored unless they were consumed as a value.
 *
 * @param {string[]} argv - arguments after the script name
 * @returns {Record<string, string | boolean>}
 */
function parseArgs(argv) {
  const options = {};
  for (const [pos, token] of argv.entries()) {
    // A consumed value never starts with "--", so this guard also steps over it.
    if (!token.startsWith('--')) continue;
    const following = argv[pos + 1];
    const hasValue = Boolean(following) && !following.startsWith('--');
    options[token.slice(2)] = hasValue ? following : true;
  }
  return options;
}
64
+
65
/**
 * Parse a base-10 integer, accepting only strictly positive results.
 *
 * @param {unknown} input - raw value (nullish is treated as an empty string)
 * @param {number} fallback - returned when parsing fails or the result is <= 0
 * @returns {number}
 */
function parsePositiveInt(input, fallback) {
  const candidate = Number.parseInt(String(input ?? ''), 10);
  const usable = Number.isFinite(candidate) && candidate > 0;
  return usable ? candidate : fallback;
}
69
+
70
+ // -- Labels & State -------------------------------------------
71
+
72
/**
 * Load the dispatch label registry from LABELS_PATH.
 *
 * Best-effort: a missing, unreadable or malformed file yields an empty object.
 * Valid JSON that is not a plain object (e.g. `null` or an array) is also
 * treated as empty, because callers index the result by label and would
 * otherwise throw on `null`.
 *
 * @returns {Record<string, any>} label registry keyed by label, possibly empty
 */
function loadLabels() {
  try {
    const parsed = JSON.parse(readFileSync(LABELS_PATH, 'utf-8'));
    const isPlainObject = parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
    return isPlainObject ? parsed : {};
  } catch {
    return {};
  }
}
79
+
80
/**
 * Load the detector's persisted steer/alert state from STATE_PATH.
 *
 * Best-effort: a missing, unreadable or malformed file yields an empty state.
 * Valid JSON that is not a plain object (e.g. `null` or an array) is also
 * treated as empty, because the main loop reads and writes `state[label]` and
 * would otherwise throw on `null` or lose keys when an array is re-serialized.
 *
 * @returns {Record<string, any>} state keyed by dispatch label, possibly empty
 */
function loadState() {
  try {
    const parsed = JSON.parse(readFileSync(STATE_PATH, 'utf-8'));
    const isPlainObject = parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
    return isPlainObject ? parsed : {};
  } catch {
    return {};
  }
}
87
+
88
/**
 * Persist the detector state to STATE_PATH as pretty-printed JSON
 * (2-space indent, trailing newline).
 *
 * @param {object} state - state object to serialize
 */
function saveState(state) {
  const serialized = `${JSON.stringify(state, null, 2)}\n`;
  writeFileSync(STATE_PATH, serialized);
}
91
+
92
+ // -- Dispatch Integration ------------------------------------
93
+
94
/**
 * Ask the dispatch CLI for a session's liveness (`status --label <label>`).
 *
 * The CLI is run directly when DISPATCH_IS_BIN is true, otherwise as a script
 * through the current node binary. Its stdout must be a single JSON document.
 *
 * @param {string} label - dispatch label to query
 * @returns {{ ageMs: any, tokens: any, status: any, updatedAt: any } | null}
 *   fields missing from the CLI output are null; the whole result is null when
 *   the CLI fails, times out (15s) or prints something that is not JSON
 */
function getDispatchLiveness(label) {
  try {
    const subcommand = ['status', '--label', label];
    const [command, commandArgs] = DISPATCH_IS_BIN
      ? [DISPATCH_CLI, subcommand]
      : [process.execPath, [DISPATCH_CLI, ...subcommand]];

    const stdout = execFileSync(command, commandArgs, {
      encoding: 'utf-8',
      timeout: 15_000,
      stdio: ['pipe', 'pipe', 'pipe'],
    });

    const report = JSON.parse(stdout.trim());
    const liveness = report?.liveness;
    return {
      ageMs: liveness?.ageMs ?? null,
      tokens: liveness?.tokens ?? null,
      status: report?.status ?? null,
      updatedAt: liveness?.updatedAt ?? null,
    };
  } catch (err) {
    process.stderr.write(
      `[stuck-detector] dispatch status for "${label}" failed: ${err.message}\n`
    );
    return null;
  }
}
122
+
123
/**
 * Nudge a dispatch session by sending it an auto-steer message
 * (`send --label <label> --message <text>`).
 *
 * The CLI is run directly when DISPATCH_IS_BIN is true, otherwise as a script
 * through the current node binary, with a 15s timeout.
 *
 * @param {string} label - dispatch label of the session to steer
 * @param {number} staleMins - minutes of silence, quoted in the message
 * @returns {boolean} true when the CLI exits successfully, false otherwise
 */
function steerSession(label, staleMins) {
  const sentences = [
    `[Auto-steer] You have been silent for ${staleMins} minutes.`,
    `Please reply with a brief progress update, then continue your task.`,
    `If you are done, call the done signal as instructed in your prompt.`,
  ];
  const msg = sentences.join(' ');

  try {
    const subcommand = ['send', '--label', label, '--message', msg];
    const [command, commandArgs] = DISPATCH_IS_BIN
      ? [DISPATCH_CLI, subcommand]
      : [process.execPath, [DISPATCH_CLI, ...subcommand]];

    execFileSync(command, commandArgs, {
      encoding: 'utf-8',
      timeout: 15_000,
      stdio: ['pipe', 'pipe', 'pipe'],
    });
    return true;
  } catch (err) {
    process.stderr.write(`[stuck-detector] steer for "${label}" failed: ${err.message}\n`);
    return false;
  }
}
148
+
149
+ // -- Main -----------------------------------------------------
150
+
151
// Script entry point: find stale 'running' runs, gate them on dispatch liveness,
// steer on first detection, and alert (exit 1) when a run is still stuck.

const args = parseArgs(process.argv.slice(2));
const thresholdMin = parsePositiveInt(args['threshold-min'], 45); // coding tasks regularly take 30m+
const thresholdS = thresholdMin * 60;
const limit = parsePositiveInt(args.limit, 20);

try {
  // Staleness is measured from the last heartbeat, or from the start time when
  // no heartbeat was recorded.
  const staleRuns = getDb().prepare(`
    SELECT
      r.id,
      r.job_id,
      r.started_at,
      r.last_heartbeat,
      r.run_timeout_ms,
      j.name AS job_name,
      CAST((julianday('now') - julianday(COALESCE(r.last_heartbeat, r.started_at))) * 86400 AS INTEGER) AS stale_s
    FROM runs r
    JOIN jobs j ON j.id = r.job_id
    WHERE r.status = 'running'
      AND COALESCE(r.last_heartbeat, r.started_at) < datetime('now', '-' || ? || ' seconds')
    ORDER BY stale_s DESC
    LIMIT ?
  `).all(thresholdS, limit);

  if (staleRuns.length === 0) {
    process.stdout.write(`No stale runs older than ${thresholdMin} minute(s).\n`);
    process.exit(0);
  }

  const labels = loadLabels();
  const state = loadState();
  const stuck = [];   // Phase 3: genuinely stuck
  const steered = []; // Phase 2: nudged, awaiting response
  const skipped = []; // Phase 1: alive or done
  // Labels seen as stale this cycle; state for any other label is dropped below.
  const seenLabels = new Set();
  let dirty = false;

  for (const run of staleRuns) {
    const label = resolveDispatchLabel(run.job_name, labels);

    // Non-dispatch job: no liveness gate or steer is possible, alert immediately.
    if (!label) {
      stuck.push(run);
      continue;
    }

    seenLabels.add(label);

    // -- Phase 1: liveness gate --------------------------------

    // Cheap check first: labels.json already says done (DB lag), so no CLI call is needed.
    if (labels[label]?.status === 'done') {
      skipped.push({ ...run, reason: 'labels.json status=done (DB lag)', label });
      continue;
    }

    const liveness = getDispatchLiveness(label);

    if (liveness?.status === 'done') {
      skipped.push({ ...run, reason: 'dispatch status=done', label });
      continue;
    }

    // Tokens recently active: the session is working, it just sends no heartbeat to the DB.
    const recentlyActive = liveness
      && typeof liveness.ageMs === 'number'
      && liveness.ageMs < LIVENESS_THRESHOLD_MS;
    if (recentlyActive) {
      skipped.push({
        ...run,
        reason: `active (ageMs=${liveness.ageMs}ms < ${LIVENESS_THRESHOLD_MS}ms)`,
        label,
      });
      continue;
    }

    // -- Phase 2 / Phase 3: steer or alert ---------------------

    const prior = state[label];

    // Already alerted: neither re-steer nor re-alert.
    if (prior?.alertedAt) continue;

    if (!prior) {
      // Phase 2: first detection, so nudge the session before alerting.
      const staleMins = Math.round(run.stale_s / 60);
      if (!steerSession(label, staleMins)) {
        // The steer call itself failed: alert anyway and note the failure.
        stuck.push({ ...run, steerNote: 'steer=failed' });
        continue;
      }

      state[label] = {
        steeredAt: new Date().toISOString(),
        tokensAtSteer: liveness?.tokens ?? 0,
        staleS: run.stale_s,
      };
      dirty = true;
      steered.push({
        ...run,
        label,
        ageMs: liveness?.ageMs ?? null,
        tokens: liveness?.tokens ?? null,
      });
      continue;
    }

    // Phase 3: steered on an earlier cycle; token growth since then means it worked.
    const prevTokens = prior.tokensAtSteer;
    const curTokens = liveness?.tokens ?? 0;

    if (curTokens > prevTokens) {
      delete state[label];
      dirty = true;
      skipped.push({
        ...run,
        reason: `steer worked (tokens ${prevTokens}->${curTokens})`,
        label,
      });
    } else {
      // Tokens flat since the steer: genuinely stuck.
      stuck.push({ ...run, steerNote: 'steer=ignored' });
      state[label] = { alertedAt: new Date().toISOString() };
      dirty = true;
    }
  }

  // Forget labels that no longer show up as stale.
  for (const key of Object.keys(state)) {
    if (seenLabels.has(key)) continue;
    delete state[key];
    dirty = true;
  }

  if (dirty) saveState(state);

  // -- Output -------------------------------------------------

  for (const s of skipped) {
    process.stdout.write(
      `Skipped: job="${s.job_name}" label="${s.label}" -- ${s.reason}\n`
    );
  }

  for (const s of steered) {
    process.stdout.write(
      `Steered: label=${s.label} ageMs=${s.ageMs ?? '?'} tokens=${s.tokens ?? '?'} (awaiting response)\n`
    );
  }

  if (stuck.length === 0) {
    const summary = steered.length > 0
      ? `No stuck runs -- ${steered.length} steered, awaiting response.\n`
      : `No stale runs older than ${thresholdMin} minute(s).\n`;
    process.stdout.write(summary);
    process.exit(0);
  }

  process.stdout.write(
    `Detected ${stuck.length} stale run(s) older than ${thresholdMin} minute(s):\n`
  );
  for (const r of stuck) {
    const suffix = r.steerNote ? ` ${r.steerNote}` : '';
    process.stdout.write(
      `- run=${r.id} job="${r.job_name}" job_id=${r.job_id} started=${r.started_at} last_heartbeat=${r.last_heartbeat} stale_s=${r.stale_s} timeout_ms=${r.run_timeout_ms}${suffix}\n`
    );
  }
  process.exit(1);
} catch (err) {
  process.stderr.write(`[stuck-run-detector] error: ${err.stack || err.message}\n`);
  process.exit(1);
}