openclaw-scheduler 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +302 -0
- package/BEST-PRACTICES.md +506 -0
- package/CHANGELOG.md +82 -0
- package/CODE_OF_CONDUCT.md +22 -0
- package/CONTEXT.md +26 -0
- package/CONTRIBUTING.md +73 -0
- package/IMPLEMENTATION_SPEC.md +170 -0
- package/INSTALL-ADDITIONAL-HOST.md +333 -0
- package/INSTALL-LINUX.md +419 -0
- package/INSTALL-WINDOWS.md +305 -0
- package/INSTALL.md +364 -0
- package/JOB-QUICK-REF.md +222 -0
- package/LICENSE +21 -0
- package/QUICK-START.md +256 -0
- package/README.md +2170 -0
- package/SECURITY.md +34 -0
- package/UNINSTALL.md +129 -0
- package/UPGRADING.md +436 -0
- package/agents.js +67 -0
- package/approval.js +107 -0
- package/backup.js +390 -0
- package/bin/openclaw-scheduler.js +138 -0
- package/cli.js +1083 -0
- package/db.js +122 -0
- package/dispatch/529-recovery.mjs +204 -0
- package/dispatch/README.md +372 -0
- package/dispatch/config.example.json +24 -0
- package/dispatch/deliver-watcher.sh +57 -0
- package/dispatch/hooks.mjs +171 -0
- package/dispatch/index.mjs +1836 -0
- package/dispatch/watcher.mjs +1396 -0
- package/dispatch-queue.js +112 -0
- package/dispatcher-approvals.js +96 -0
- package/dispatcher-delivery.js +43 -0
- package/dispatcher-maintenance.js +242 -0
- package/dispatcher-shell.js +29 -0
- package/dispatcher-strategies.js +1280 -0
- package/dispatcher-utils.js +81 -0
- package/dispatcher.js +855 -0
- package/docs/adr-schedule-ownership.md +73 -0
- package/docs/gateway-contract.md +904 -0
- package/docs/plans/2026-03-09-fix-typescript-types.md +91 -0
- package/docs/plans/2026-03-09-test-coverage-gaps.md +83 -0
- package/docs/plans/2026-03-10-dispatcher-refactor.md +801 -0
- package/docs/trust-architecture.md +266 -0
- package/gateway.js +473 -0
- package/idempotency.js +119 -0
- package/index.d.ts +864 -0
- package/index.js +17 -0
- package/jobs.js +1224 -0
- package/messages.js +357 -0
- package/migrate-consolidate.js +694 -0
- package/migrate.js +125 -0
- package/package.json +130 -0
- package/paths.js +79 -0
- package/prompt-context.js +94 -0
- package/retrieval.js +176 -0
- package/runs.js +270 -0
- package/scheduler-schema.js +101 -0
- package/schema.sql +480 -0
- package/scripts/dispatch-cli-utils.mjs +65 -0
- package/scripts/inbox-consumer.mjs +288 -0
- package/scripts/stuck-detector.sh +18 -0
- package/scripts/stuck-run-detector.mjs +333 -0
- package/scripts/telegram-webhook-check.mjs +238 -0
- package/setup.mjs +724 -0
- package/shell-result.js +214 -0
- package/task-tracker.js +300 -0
- package/team-adapter.js +335 -0
- package/v02-runtime.js +599 -0
|
@@ -0,0 +1,288 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* inbox-consumer.mjs
|
|
4
|
+
*
|
|
5
|
+
* Drains pending scheduler messages for an agent and delivers them to a channel target.
|
|
6
|
+
* Intended for signal-only queue patterns where scripts enqueue actionable messages.
|
|
7
|
+
*
|
|
8
|
+
* Usage:
|
|
9
|
+
* node scripts/inbox-consumer.mjs --to <target-id> [--channel telegram] [--agent main] [--limit 50]
|
|
10
|
+
* node scripts/inbox-consumer.mjs --to <target-id> --watch
|
|
11
|
+
*
|
|
12
|
+
* Env fallbacks:
|
|
13
|
+
* INBOX_DELIVERY_TO
|
|
14
|
+
* INBOX_DELIVERY_CHANNEL (default: telegram)
|
|
15
|
+
* INBOX_AGENT (default: main)
|
|
16
|
+
* INBOX_LIMIT (default: 50)
* INBOX_BRAND (default: "brand"/"name" from the dispatch config.json, else "Scheduler")
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
import { dirname, basename, join, resolve } from 'path';
|
|
20
|
+
import { fileURLToPath } from 'url';
|
|
21
|
+
import { watch } from 'fs';
|
|
22
|
+
import { getDb } from '../db.js';
|
|
23
|
+
import { resolveSchedulerDbPath } from '../paths.js';
|
|
24
|
+
import { deliverMessage } from '../gateway.js';
|
|
25
|
+
import { ackMessage, recordMessageAttempt } from '../messages.js';
|
|
26
|
+
|
|
27
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
28
|
+
|
|
29
|
+
/**
 * Turn a raw argv slice into a flag map.
 *
 * `--watch` is a pure switch and never consumes a value. Any other `--name`
 * takes the following token as its value unless that token is missing, empty,
 * or itself starts with `--`, in which case the flag is stored as `true`.
 * Tokens that do not start with `--` and were not consumed as a value are ignored.
 *
 * @param {string[]} argv - Arguments after the script path.
 * @returns {Object} Map of flag name (without the leading dashes) to string value or `true`;
 *   always contains a boolean `watch` key.
 */
function parseArgs(argv) {
  const parsed = { watch: false };
  let idx = 0;
  while (idx < argv.length) {
    const token = argv[idx];
    idx += 1;
    if (token === '--watch') {
      parsed.watch = true;
    } else if (token.startsWith('--')) {
      // idx already points at the token after the flag.
      const next = argv[idx];
      const hasValue = Boolean(next) && !next.startsWith('--');
      parsed[token.slice(2)] = hasValue ? next : true;
      if (hasValue) idx += 1;
    }
  }
  return parsed;
}
|
|
49
|
+
|
|
50
|
+
/**
 * Parse a base-10 integer and accept it only when it is finite and greater than zero.
 *
 * @param {*} input - Value to parse; null/undefined are treated as an empty string.
 * @param {*} fallback - Returned unchanged when the parsed value is rejected.
 * @returns {*} The parsed positive integer, or `fallback`.
 */
function parsePositiveInt(input, fallback) {
  const parsed = Number.parseInt(String(input ?? ''), 10);
  if (!Number.isFinite(parsed) || parsed <= 0) {
    return fallback;
  }
  return parsed;
}
|
|
54
|
+
|
|
55
|
+
/**
 * Render the age of a timestamp as a short label ("42s ago", "5m ago", "3h ago", "2d ago").
 *
 * Accepts "YYYY-MM-DD HH:MM:SS" (the first space is replaced with "T") or an
 * ISO-style string. A string without a zone designator is interpreted as UTC by
 * appending "Z"; a string that already ends in "Z" or a numeric offset
 * ("+02:00", "-0800") is parsed as written instead of being corrupted with an
 * extra "Z".
 *
 * @param {string} dateStr - Timestamp text.
 * @returns {string} Relative age, or "unknown" when the input is empty, not a
 *   string, or not parseable. Timestamps in the future clamp to "0s ago".
 */
function timeAgo(dateStr) {
  if (typeof dateStr !== 'string' || dateStr === '') return 'unknown';

  const normalized = dateStr.includes('T') ? dateStr : dateStr.replace(' ', 'T');
  // Only append "Z" when no zone information is present; a date-only value such as
  // "2026-03-31" does not match the offset pattern because an offset needs four digits.
  const hasZone = /(?:Z|[+-]\d{2}:?\d{2})$/.test(normalized);
  const ts = new Date(hasZone ? normalized : `${normalized}Z`).getTime();
  if (Number.isNaN(ts)) return 'unknown';

  const sec = Math.max(0, Math.floor((Date.now() - ts) / 1000));
  if (sec < 60) return `${sec}s ago`;
  const min = Math.floor(sec / 60);
  if (min < 60) return `${min}m ago`;
  const hr = Math.floor(min / 60);
  if (hr < 24) return `${hr}h ago`;
  return `${Math.floor(hr / 24)}d ago`;
}
|
|
68
|
+
|
|
69
|
+
/**
 * Sentinel tokens that should never appear in user-facing delivery.
 * Frozen so the shared list cannot be modified at runtime.
 */
const DELIVERY_SENTINELS = Object.freeze(['HEARTBEAT_OK', 'NO_FLUSH', 'IDEMPOTENT_SKIP']);
|
|
71
|
+
|
|
72
|
+
/**
 * Remove shell-output noise from a message body before delivery.
 *
 * - Drops a leading "stdout:" label and then a leading "stderr:" label
 *   (case-insensitive, together with any whitespace that follows).
 * - Drops blank lines.
 * - Drops lines that begin with a "[YYYY-MM-DD HH:MM:SS]" timestamp followed by
 *   INFO, DEBUG or WARN and further text.
 * Every other line is kept in its original order.
 *
 * @param {string} text - Raw body text.
 * @returns {string} The kept lines joined with "\n" and trimmed.
 */
function cleanShellOutput(text) {
  const logLinePattern = /^\[\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}\]\s+(INFO|DEBUG|WARN)\s/;
  const unlabeled = text.replace(/^stdout:\s*/i, '').replace(/^stderr:\s*/i, '');

  const kept = [];
  for (const rawLine of unlabeled.split('\n')) {
    const candidate = rawLine.trim();
    if (candidate === '' || logLinePattern.test(candidate)) continue;
    kept.push(rawLine);
  }
  return kept.join('\n').trim();
}
|
|
93
|
+
|
|
94
|
+
/**
 * Format a single message for user-facing delivery.
 *
 * Trims the body, removes trailing sentinel tokens, cleans shell noise, and
 * prefixes a "brand | subject | age" header. The result is capped at 4000
 * characters.
 *
 * Sentinel stripping repeats until no sentinel remains at the end of the body,
 * so stacked sentinels are removed regardless of the order in which they were
 * appended (a single pass would leave "HEARTBEAT_OK" behind for a body ending
 * in "HEARTBEAT_OK NO_FLUSH").
 *
 * @param {Object} msg - Message row; reads `body`, `subject` and `created_at`.
 * @param {Object} [options]
 * @param {string} [options.brand='Scheduler'] - Display name used in the header.
 * @returns {string|null} Text to deliver, or null when nothing is left after cleaning.
 */
function formatMessageForDelivery(msg, { brand = 'Scheduler' } = {}) {
  let body = (msg.body || '').trim();

  // Every successful strip shortens the body, so this loop always terminates.
  let strippedAny = true;
  while (strippedAny) {
    strippedAny = false;
    for (const sentinel of DELIVERY_SENTINELS) {
      if (body.endsWith(sentinel)) {
        body = body.slice(0, -sentinel.length).trim();
        strippedAny = true;
      }
    }
  }

  // Clean shell output noise
  body = cleanShellOutput(body);

  if (!body) return null;

  // Header: brand + subject + age
  const age = timeAgo(msg.created_at);
  const subject = msg.subject || 'Notification';
  const header = `${brand} | ${subject} | ${age}`;

  return `${header}\n\n${body}`.slice(0, 4000);
}
|
|
123
|
+
|
|
124
|
+
/**
 * Legacy debug rendering of a batch of messages as one text blob.
 *
 * Output starts with an "Inbox for <agent>: <n> message(s)" line; each message
 * then contributes a blank line, a metadata line (kind, sender, age, priority),
 * an optional subject line, the trimmed body capped at 1200 characters, and a
 * "---" separator.
 *
 * NOTE(review): no caller is visible in this file; the original comment
 * associates it with a --verbose mode.
 *
 * @param {Object[]} msgs - Message rows.
 * @param {string} agentId - Agent whose inbox is being shown.
 * @returns {string} The combined, trimmed dump.
 */
function _formatMessagesDebug(msgs, agentId) {
  const sections = msgs.map((entry) => {
    const parts = [
      '',
      `[${entry.kind}] from=${entry.from_agent} age=${timeAgo(entry.created_at)} priority=${entry.priority}`,
    ];
    if (entry.subject) parts.push(`subject: ${entry.subject}`);
    if (entry.body) parts.push(entry.body.trim().slice(0, 1200));
    parts.push('---');
    return parts.join('\n');
  });

  return [`Inbox for ${agentId}: ${msgs.length} message(s)`, ...sections].join('\n').trim();
}
|
|
138
|
+
|
|
139
|
+
/**
 * Read the next batch of undelivered messages for an agent.
 *
 * Selects rows addressed to `agentId` or to 'broadcast' whose status is
 * 'pending' or 'delivered'. Rows are ordered by kind (constraint, decision,
 * fact, task, preference, then anything else), then by priority descending,
 * then by creation time ascending.
 *
 * @param {Object} db - Database handle exposing `prepare(sql).all(...params)`.
 * @param {string} agentId - Recipient agent id.
 * @param {number} limit - Maximum number of rows to return.
 * @returns {Object[]} Matching message rows.
 */
function selectPendingMessages(db, agentId, limit) {
  const pendingQuery = `
    SELECT id, from_agent, to_agent, subject, body, kind, created_at, priority,
           delivery_to, channel
    FROM messages
    WHERE (to_agent = ? OR to_agent = 'broadcast')
      AND status IN ('pending', 'delivered')
    ORDER BY
      CASE kind
        WHEN 'constraint' THEN 0
        WHEN 'decision' THEN 1
        WHEN 'fact' THEN 2
        WHEN 'task' THEN 3
        WHEN 'preference' THEN 4
        ELSE 5
      END ASC,
      priority DESC,
      created_at ASC
    LIMIT ?
  `;
  const statement = db.prepare(pendingQuery);
  return statement.all(agentId, limit);
}
|
|
160
|
+
|
|
161
|
+
/**
 * Deliver one batch of pending messages for an agent.
 *
 * Each message is formatted and sent on its own. A message whose text is empty
 * after cleaning is acknowledged without being sent and still counts toward the
 * returned total. A per-message `delivery_to` / `channel` overrides the defaults.
 * After at least one message has been handled, a 1.5 s pause precedes each
 * further send.
 *
 * A failed delivery is recorded and collected; the remaining messages are still
 * attempted. Failure bookkeeping is itself guarded so that a bookkeeping error,
 * or a thrown value that is not an Error, cannot abort the rest of the batch.
 *
 * @param {Object} db - Database handle passed to selectPendingMessages.
 * @param {Object} options
 * @param {string} options.to - Default delivery target.
 * @param {string} options.channel - Default delivery channel.
 * @param {string} options.agentId - Agent whose inbox is drained.
 * @param {number} options.limit - Maximum messages per batch.
 * @param {string} options.brand - Header brand passed to the formatter.
 * @returns {Promise<number>} Number of messages delivered or suppressed.
 * @throws {Error} After the batch completes, when one or more deliveries failed.
 */
async function drainOnce(db, { to, channel, agentId, limit, brand }) {
  const msgs = selectPendingMessages(db, agentId, limit);
  if (msgs.length === 0) {
    return 0;
  }

  let delivered = 0;
  const deliveryErrors = [];

  // Deliver each message individually with clean user-facing formatting.
  // Messages are sent one at a time so a failure on one doesn't block others.
  for (const msg of msgs) {
    const msgTarget = msg.delivery_to || to;
    const msgChannel = msg.channel || channel;
    const text = formatMessageForDelivery(msg, { brand });

    if (!text) {
      // Empty after stripping sentinels -- ack without delivering
      ackMessage(msg.id, 'inbox-consumer', 'Suppressed (empty after sentinel strip)');
      delivered += 1;
      continue;
    }

    try {
      // Small delay between deliveries to avoid gateway rate/concurrency issues
      if (delivered > 0) await new Promise(r => setTimeout(r, 1500));
      await deliverMessage(msgChannel, msgTarget, text);
      recordMessageAttempt(msg.id, { ok: true, actor: 'inbox-consumer' });
      ackMessage(msg.id, 'inbox-consumer', `Delivered to ${msgChannel}:${msgTarget}`);
      delivered += 1;
    } catch (err) {
      // Normalize so `.message` is always readable, even for `throw 'text'` or `throw null`.
      const failure = err instanceof Error ? err : new Error(String(err ?? 'delivery failed'));
      try {
        recordMessageAttempt(msg.id, {
          ok: false,
          actor: 'inbox-consumer',
          error: failure.message || 'delivery failed',
        });
      } catch (recordErr) {
        // Bookkeeping must not prevent the remaining messages from being attempted.
        process.stderr.write(
          `[inbox-consumer] could not record failed attempt for message ${msg.id}: ${recordErr?.message ?? recordErr}\n`
        );
      }
      deliveryErrors.push(failure);
    }
  }

  if (delivered > 0) {
    process.stdout.write(`[inbox-consumer] delivered ${delivered} message(s)\n`);
  }
  if (deliveryErrors.length > 0) {
    throw new Error(`Delivery failed for ${deliveryErrors.length} message(s): ${deliveryErrors.map(e => e.message).join('; ')}`);
  }
  return delivered;
}
|
|
209
|
+
|
|
210
|
+
/**
 * Return `value` when it is a string, otherwise ''.
 * parseArgs stores a flag that was given without a value as boolean `true`;
 * this keeps such a flag from being used as a real target, channel, agent or brand.
 */
function stringOption(value) {
  return typeof value === 'string' ? value : '';
}

// Option resolution order: CLI flag > environment variable > built-in default.
const args = parseArgs(process.argv.slice(2));
const deliveryTo = stringOption(args.to) || process.env.INBOX_DELIVERY_TO || '';
const channel = stringOption(args.channel) || process.env.INBOX_DELIVERY_CHANNEL || 'telegram';
const agentId = stringOption(args.agent) || process.env.INBOX_AGENT || 'main';
const limit = parsePositiveInt(stringOption(args.limit) || process.env.INBOX_LIMIT, 50);
// Brand resolution: --brand flag > INBOX_BRAND env > dispatch config brand > "Scheduler"
let brand = stringOption(args.brand) || process.env.INBOX_BRAND || '';
if (!brand) {
  try {
    // Default config location is a "dispatch" directory next to the scheduler database.
    const configDir = process.env.DISPATCH_CONFIG_DIR || join(resolve(resolveSchedulerDbPath({ env: process.env }), '..'), 'dispatch');
    const { readFileSync } = await import('node:fs');
    const config = JSON.parse(readFileSync(join(configDir, 'config.json'), 'utf8'));
    brand = config.brand || config.name || '';
  } catch (_e) { /* no dispatch config -- use default */ }
}
if (!brand) brand = 'Scheduler';
const watchMode = Boolean(args.watch);

if (!deliveryTo) {
  process.stderr.write('[inbox-consumer] missing delivery target; pass --to or set INBOX_DELIVERY_TO\n');
  process.exit(1);
}

// Watch mode observes the database directory for changes to the "<db>-wal" file.
const dbPath = resolve(resolveSchedulerDbPath({ env: process.env }));
const watchDir = dirname(dbPath);
const walFile = `${basename(dbPath)}-wal`;
|
|
236
|
+
|
|
237
|
+
// Entry point: drain once and exit, or (with --watch) drain now and again after
// every debounced change to the database's WAL file until SIGINT/SIGTERM.
try {
  const db = getDb();
  const drainOptions = { to: deliveryTo, channel, agentId, limit, brand };
  const drain = () => drainOnce(db, drainOptions);

  if (!watchMode) {
    await drain();
    process.exit(0);
  }

  process.stdout.write(`[inbox-consumer] watching ${join(watchDir, walFile)}\n`);
  // A failing first drain is reported but does not stop the watcher from starting.
  await drain().catch((err) => {
    process.stderr.write(`[inbox-consumer] initial drain error: ${err.message}\n`);
  });

  let debounceTimer = null;
  let busy = false;

  // Runs at most one drain at a time; a trigger that arrives mid-drain is dropped.
  const runDebouncedDrain = async () => {
    if (busy) return;
    busy = true;
    try {
      await drain();
    } catch (err) {
      process.stderr.write(`[inbox-consumer] drain error: ${err.message}\n`);
    } finally {
      busy = false;
    }
  };

  // Collapse bursts of file events into one drain, 250 ms after the last event.
  const scheduleDrain = () => {
    if (debounceTimer) clearTimeout(debounceTimer);
    debounceTimer = setTimeout(() => {
      debounceTimer = null;
      runDebouncedDrain();
    }, 250);
  };

  const watcher = watch(watchDir, (_eventType, filename) => {
    // React to events for the WAL file, and to events that carry no filename.
    const relevant = filename === null || filename === walFile;
    if (relevant) scheduleDrain();
  });

  const shutdown = (signal) => {
    if (debounceTimer) clearTimeout(debounceTimer);
    watcher.close();
    process.stdout.write(`[inbox-consumer] ${signal}; exiting\n`);
    process.exit(0);
  };

  for (const signal of ['SIGINT', 'SIGTERM']) {
    process.on(signal, () => shutdown(signal));
  }
} catch (err) {
  process.stderr.write(`[inbox-consumer] error: ${err.stack || err.message}\n`);
  process.exit(1);
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
#!/bin/bash
# Launcher for stuck-run-detector.mjs.
# Extends PATH with common Node.js install locations, locates node (or uses
# $NODE_BIN when set), and replaces this shell with the detector.
# The threshold comes from $STUCK_THRESHOLD_MIN and defaults to 30 minutes.

# Prepend the usual Node.js install directories for this platform.
if [ "$(uname -s)" = "Darwin" ]; then
  export PATH="/opt/homebrew/bin:/usr/local/bin:$PATH"
else
  export PATH="/usr/local/bin:$PATH"
fi

# An explicit NODE_BIN wins; otherwise take whatever node is first on PATH.
NODE_BIN="${NODE_BIN:-$(command -v node 2>/dev/null)}"
[ -n "$NODE_BIN" ] || {
  echo "[stuck-detector] node not found in PATH" >&2
  exit 1
}

THRESHOLD_MIN="${STUCK_THRESHOLD_MIN:-30}"
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

exec "$NODE_BIN" "$SCRIPT_DIR/stuck-run-detector.mjs" --threshold-min "$THRESHOLD_MIN"
|
|
@@ -0,0 +1,333 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* stuck-run-detector.mjs
|
|
4
|
+
*
|
|
5
|
+
* Three-phase stuck detection: liveness gating -> steer -> alert.
|
|
6
|
+
*
|
|
7
|
+
* Phase 1 (Liveness gate): For dispatch-tracked runs, check actual session
|
|
8
|
+
* activity via `dispatch status`. Skip if tokens recently active or status=done.
|
|
9
|
+
* Phase 2 (Steer): Send a nudge into the session before alerting. Give it
|
|
10
|
+
* one more cycle to respond.
|
|
11
|
+
* Phase 3 (Alert): If steer was ignored (tokens flat), alert as genuinely stuck.
|
|
12
|
+
*
|
|
13
|
+
* Non-dispatch scheduler jobs skip phases 1-2 and alert immediately (old behavior).
|
|
14
|
+
*
|
|
15
|
+
* State persisted in <os.tmpdir()>/stuck-detector-state.json unless STUCK_STATE_PATH is set (volatile, OK to lose on reboot).
|
|
16
|
+
*
|
|
17
|
+
* Usage:
|
|
18
|
+
* node scripts/stuck-run-detector.mjs [--threshold-min 45] [--limit 20]
|
|
19
|
+
*
|
|
20
|
+
* Exit codes:
|
|
21
|
+
* 0: no stuck runs (includes steered-but-not-yet-stuck)
|
|
22
|
+
* 1: genuinely stuck runs found or hard error
|
|
23
|
+
*/
|
|
24
|
+
|
|
25
|
+
import { getDb } from '../db.js';
|
|
26
|
+
import { readFileSync, writeFileSync } from 'fs';
|
|
27
|
+
import { execFileSync } from 'child_process';
|
|
28
|
+
import { join, dirname } from 'path';
|
|
29
|
+
import { fileURLToPath } from 'url';
|
|
30
|
+
import { tmpdir } from 'os';
|
|
31
|
+
import { resolveDispatchCliPath, resolveDispatchLabel } from './dispatch-cli-utils.mjs';
|
|
32
|
+
|
|
33
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
34
|
+
|
|
35
|
+
// -- Paths ----------------------------------------------------
|
|
36
|
+
|
|
37
|
+
// Dispatch label registry. DISPATCH_LABELS_PATH overrides the default of
// ../dispatch/labels.json relative to this script's directory.
const LABELS_PATH = process.env.DISPATCH_LABELS_PATH || join(__dirname, '..', 'dispatch', 'labels.json');
// Steer/alert bookkeeping between runs. STUCK_STATE_PATH overrides the default
// file in the OS temp directory.
const STATE_PATH = process.env.STUCK_STATE_PATH || join(tmpdir(), 'stuck-detector-state.json');
const DISPATCH_CLI = resolveDispatchCliPath(process.env);
// A value with no path separator is executed directly as a command; anything
// else is treated as a script and run through process.execPath.
const DISPATCH_IS_BIN = !DISPATCH_CLI.includes('/') && !DISPATCH_CLI.includes('\\');

// -- Constants ------------------------------------------------

const LIVENESS_THRESHOLD_MS = 180_000; // 3 min -- tokens active within this = alive
|
|
45
|
+
|
|
46
|
+
// -- Arg Parsing ----------------------------------------------
|
|
47
|
+
|
|
48
|
+
/**
 * Collect `--name value` pairs and bare `--name` switches from an argv slice.
 *
 * A flag takes the next token as its value unless that token is missing, empty,
 * or starts with `--`; in that case the flag is stored as `true`. Tokens that
 * do not start with `--` are otherwise ignored.
 *
 * @param {string[]} argv - Arguments after the script path.
 * @returns {Object} Map of flag name (without the leading dashes) to string value or `true`.
 */
function parseArgs(argv) {
  const flags = {};
  for (const [index, token] of argv.entries()) {
    // A consumed value never starts with "--", so it is skipped here as well.
    if (!token.startsWith('--')) continue;
    const following = argv[index + 1];
    const takesValue = Boolean(following) && !following.startsWith('--');
    flags[token.slice(2)] = takesValue ? following : true;
  }
  return flags;
}
|
|
64
|
+
|
|
65
|
+
/**
 * Read a strictly positive base-10 integer from loosely typed input.
 *
 * @param {*} input - Value to parse; null/undefined are treated as an empty string.
 * @param {*} fallback - Returned unchanged when parsing does not yield a finite value > 0.
 * @returns {*} The parsed integer, or `fallback`.
 */
function parsePositiveInt(input, fallback) {
  const text = String(input ?? '');
  const value = Number.parseInt(text, 10);
  const usable = Number.isFinite(value) && value > 0;
  return usable ? value : fallback;
}
|
|
69
|
+
|
|
70
|
+
// -- Labels & State -------------------------------------------
|
|
71
|
+
|
|
72
|
+
/**
 * Load the dispatch label registry from LABELS_PATH.
 *
 * Best-effort: any failure yields an empty registry. A missing file is treated
 * as normal and stays silent; any other read or parse failure is reported on
 * stderr so a corrupt registry is not mistaken for "no labels". A payload that
 * is not an object (for example `null`) is also replaced by an empty registry,
 * because callers index the result by label.
 *
 * @returns {Object} Parsed registry, or {} when unavailable.
 */
function loadLabels() {
  let parsed;
  try {
    parsed = JSON.parse(readFileSync(LABELS_PATH, 'utf-8'));
  } catch (err) {
    if (err?.code !== 'ENOENT') {
      process.stderr.write(
        `[stuck-detector] could not load labels from ${LABELS_PATH}: ${err?.message ?? err}\n`
      );
    }
    return {};
  }
  return parsed !== null && typeof parsed === 'object' ? parsed : {};
}
|
|
79
|
+
|
|
80
|
+
/**
 * Load the persisted steer/alert state from STATE_PATH.
 *
 * Best-effort: any failure yields an empty state. A missing file is the normal
 * first-run case and stays silent; any other read or parse failure is reported
 * on stderr. A payload that is not a plain keyed object (`null`, an array, a
 * primitive) is discarded, because the state is read and written by label key
 * and string-keyed entries on an array would be lost when serialized.
 *
 * @returns {Object} State keyed by dispatch label, or {} when unavailable.
 */
function loadState() {
  let parsed;
  try {
    parsed = JSON.parse(readFileSync(STATE_PATH, 'utf-8'));
  } catch (err) {
    if (err?.code !== 'ENOENT') {
      process.stderr.write(
        `[stuck-detector] could not load state from ${STATE_PATH}: ${err?.message ?? err}\n`
      );
    }
    return {};
  }
  const isKeyedObject = parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
  return isKeyedObject ? parsed : {};
}
|
|
87
|
+
|
|
88
|
+
/**
 * Write the steer/alert state to STATE_PATH as 2-space-indented JSON with a
 * trailing newline, replacing the previous contents.
 *
 * @param {Object} state - State to persist.
 */
function saveState(state) {
  const serialized = `${JSON.stringify(state, null, 2)}\n`;
  writeFileSync(STATE_PATH, serialized);
}
|
|
91
|
+
|
|
92
|
+
// -- Dispatch Integration ------------------------------------
|
|
93
|
+
|
|
94
|
+
/**
 * Query `dispatch status --label <label>` and extract its liveness fields.
 *
 * The dispatch CLI is run directly when DISPATCH_IS_BIN is set, otherwise
 * through the current Node executable. The call is synchronous with a 15 s
 * timeout, and its stdout is parsed as JSON.
 *
 * @param {string} label - Dispatch label to look up.
 * @returns {{ageMs: *, tokens: *, status: *, updatedAt: *}|null} Fields taken
 *   from the parsed output (each null when absent), or null when the command
 *   or the JSON parse fails; the failure is reported on stderr.
 */
function getDispatchLiveness(label) {
  try {
    const statusArgs = ['status', '--label', label];
    const command = DISPATCH_IS_BIN ? DISPATCH_CLI : process.execPath;
    const commandArgs = DISPATCH_IS_BIN ? statusArgs : [DISPATCH_CLI, ...statusArgs];

    const raw = execFileSync(command, commandArgs, {
      encoding: 'utf-8',
      timeout: 15_000,
      stdio: ['pipe', 'pipe', 'pipe'],
    });

    const report = JSON.parse(raw.trim());
    const live = report?.liveness;
    return {
      ageMs: live?.ageMs ?? null,
      tokens: live?.tokens ?? null,
      status: report?.status ?? null,
      updatedAt: live?.updatedAt ?? null,
    };
  } catch (failure) {
    process.stderr.write(
      `[stuck-detector] dispatch status for "${label}" failed: ${failure.message}\n`
    );
    return null;
  }
}
|
|
122
|
+
|
|
123
|
+
/**
 * Nudge a dispatch session by sending it an auto-steer message through
 * `dispatch send --label <label> --message <text>`.
 *
 * The CLI is run directly when DISPATCH_IS_BIN is set, otherwise through the
 * current Node executable; the call is synchronous with a 15 s timeout.
 *
 * @param {string} label - Dispatch label of the session to steer.
 * @param {number} staleMins - Minutes of silence quoted in the message.
 * @returns {boolean} true when the command completed, false when it failed
 *   (the failure is reported on stderr).
 */
function steerSession(label, staleMins) {
  const sentences = [
    `[Auto-steer] You have been silent for ${staleMins} minutes.`,
    `Please reply with a brief progress update, then continue your task.`,
    `If you are done, call the done signal as instructed in your prompt.`,
  ];
  const msg = sentences.join(' ');

  try {
    const sendArgs = ['send', '--label', label, '--message', msg];
    const command = DISPATCH_IS_BIN ? DISPATCH_CLI : process.execPath;
    const commandArgs = DISPATCH_IS_BIN ? sendArgs : [DISPATCH_CLI, ...sendArgs];

    execFileSync(command, commandArgs, {
      encoding: 'utf-8',
      timeout: 15_000,
      stdio: ['pipe', 'pipe', 'pipe'],
    });
  } catch (failure) {
    process.stderr.write(`[stuck-detector] steer for "${label}" failed: ${failure.message}\n`);
    return false;
  }
  return true;
}
|
|
148
|
+
|
|
149
|
+
// -- Main -----------------------------------------------------
|
|
150
|
+
|
|
151
|
+
// Entry point. Finds runs whose last heartbeat (or start) is older than the
// threshold, then for each one: alerts immediately when it has no dispatch
// label, skips it when dispatch reports it done or recently active, steers it
// on first detection, and alerts when an earlier steer produced no new tokens.
// Exits 0 when nothing needs an alert, 1 when alerts were printed or on error.
const args = parseArgs(process.argv.slice(2));
const thresholdMin = parsePositiveInt(args['threshold-min'], 45); // coding tasks regularly take 30m+
const thresholdS = thresholdMin * 60;
const limit = parsePositiveInt(args.limit, 20);

try {
  const db = getDb();
  const rows = db.prepare(`
    SELECT
      r.id,
      r.job_id,
      r.started_at,
      r.last_heartbeat,
      r.run_timeout_ms,
      j.name AS job_name,
      CAST((julianday('now') - julianday(COALESCE(r.last_heartbeat, r.started_at))) * 86400 AS INTEGER) AS stale_s
    FROM runs r
    JOIN jobs j ON j.id = r.job_id
    WHERE r.status = 'running'
      AND COALESCE(r.last_heartbeat, r.started_at) < datetime('now', '-' || ? || ' seconds')
    ORDER BY stale_s DESC
    LIMIT ?
  `).all(thresholdS, limit);

  if (rows.length === 0) {
    // Nothing is stale, so every persisted steer/alert marker is obsolete. Clear
    // them here as well: otherwise a leftover "alertedAt" entry would silently
    // suppress the steer and the alert the next time the same label goes stale,
    // and a leftover "steeredAt" entry would be compared against unrelated tokens.
    try {
      const persisted = loadState();
      const alreadyEmpty =
        persisted !== null &&
        typeof persisted === 'object' &&
        Object.keys(persisted).length === 0;
      if (!alreadyEmpty) saveState({});
    } catch (cleanupErr) {
      // Cleanup is best-effort; it must not turn a healthy result into a failure exit.
      process.stderr.write(
        `[stuck-detector] could not clear stale state: ${cleanupErr?.message ?? cleanupErr}\n`
      );
    }
    process.stdout.write(`No stale runs older than ${thresholdMin} minute(s).\n`);
    process.exit(0);
  }

  const labels = loadLabels();
  const state = loadState();
  const alertRuns = [];    // Phase 3: genuinely stuck
  const steeredRuns = [];  // Phase 2: nudged, awaiting response
  const skippedRuns = [];  // Phase 1: alive or done
  let stateChanged = false;

  // Track which labels appeared as stale this cycle (for state cleanup)
  const staleLabelsThisCycle = new Set();

  for (const r of rows) {
    const label = resolveDispatchLabel(r.job_name, labels);

    // -- Non-dispatch job: alert immediately (old behavior) --
    if (!label) {
      alertRuns.push(r);
      continue;
    }

    staleLabelsThisCycle.add(label);

    // -- Phase 1: Liveness gate ------------------------------

    // Quick check: if labels.json already says done, skip (DB lag)
    if (labels[label]?.status === 'done') {
      skippedRuns.push({ ...r, reason: 'labels.json status=done (DB lag)', label });
      continue;
    }

    const liveness = getDispatchLiveness(label);

    // dispatch status returned done -> skip
    if (liveness?.status === 'done') {
      skippedRuns.push({ ...r, reason: 'dispatch status=done', label });
      continue;
    }

    // Tokens recently active -> skip (not stuck, just no heartbeat to DB)
    if (
      liveness &&
      typeof liveness.ageMs === 'number' &&
      liveness.ageMs < LIVENESS_THRESHOLD_MS
    ) {
      skippedRuns.push({
        ...r,
        reason: `active (ageMs=${liveness.ageMs}ms < ${LIVENESS_THRESHOLD_MS}ms)`,
        label,
      });
      continue;
    }

    // -- Phase 2 / Phase 3: Steer or Alert -------------------

    if (state[label]?.alertedAt) {
      // Already alerted -- don't re-steer or re-alert
      continue;
    }

    if (!state[label]) {
      // Phase 2: First detection -- steer before alerting
      const staleMins = Math.round(r.stale_s / 60);
      const ok = steerSession(label, staleMins);

      if (ok) {
        // Remember the token count at steer time so the next cycle can tell
        // whether the session produced anything in response.
        state[label] = {
          steeredAt: new Date().toISOString(),
          tokensAtSteer: liveness?.tokens ?? 0,
          staleS: r.stale_s,
        };
        stateChanged = true;
        steeredRuns.push({
          ...r,
          label,
          ageMs: liveness?.ageMs ?? null,
          tokens: liveness?.tokens ?? null,
        });
      } else {
        // Steer call failed -- still alert, note the failure
        alertRuns.push({ ...r, steerNote: 'steer=failed' });
      }
    } else {
      // Phase 3: Already steered -- did it help?
      const prevTokens = state[label].tokensAtSteer;
      const curTokens = liveness?.tokens ?? 0;

      if (curTokens > prevTokens) {
        // Tokens grew since steer -> it worked, clear state, skip alert
        delete state[label];
        stateChanged = true;
        skippedRuns.push({
          ...r,
          reason: `steer worked (tokens ${prevTokens}->${curTokens})`,
          label,
        });
      } else {
        // Tokens flat since steer -> genuinely stuck -> alert
        alertRuns.push({ ...r, steerNote: 'steer=ignored' });
        state[label] = { alertedAt: new Date().toISOString() };
        stateChanged = true;
      }
    }
  }

  // -- Clean state entries for labels no longer appearing as stale --
  for (const key of Object.keys(state)) {
    if (!staleLabelsThisCycle.has(key)) {
      delete state[key];
      stateChanged = true;
    }
  }

  if (stateChanged) {
    saveState(state);
  }

  // -- Output -------------------------------------------------

  for (const s of skippedRuns) {
    process.stdout.write(
      `Skipped: job="${s.job_name}" label="${s.label}" -- ${s.reason}\n`
    );
  }

  for (const s of steeredRuns) {
    process.stdout.write(
      `Steered: label=${s.label} ageMs=${s.ageMs ?? '?'} tokens=${s.tokens ?? '?'} (awaiting response)\n`
    );
  }

  if (alertRuns.length === 0) {
    if (steeredRuns.length > 0) {
      process.stdout.write(
        `No stuck runs -- ${steeredRuns.length} steered, awaiting response.\n`
      );
    } else {
      process.stdout.write(`No stale runs older than ${thresholdMin} minute(s).\n`);
    }
    process.exit(0);
  }

  process.stdout.write(
    `Detected ${alertRuns.length} stale run(s) older than ${thresholdMin} minute(s):\n`
  );
  for (const r of alertRuns) {
    const suffix = r.steerNote ? ` ${r.steerNote}` : '';
    process.stdout.write(
      `- run=${r.id} job="${r.job_name}" job_id=${r.job_id} started=${r.started_at} last_heartbeat=${r.last_heartbeat} stale_s=${r.stale_s} timeout_ms=${r.run_timeout_ms}${suffix}\n`
    );
  }
  process.exit(1);
} catch (err) {
  process.stderr.write(`[stuck-run-detector] error: ${err.stack || err.message}\n`);
  process.exit(1);
}
|