openclaw-scheduler 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +302 -0
- package/BEST-PRACTICES.md +506 -0
- package/CHANGELOG.md +82 -0
- package/CODE_OF_CONDUCT.md +22 -0
- package/CONTEXT.md +26 -0
- package/CONTRIBUTING.md +73 -0
- package/IMPLEMENTATION_SPEC.md +170 -0
- package/INSTALL-ADDITIONAL-HOST.md +333 -0
- package/INSTALL-LINUX.md +419 -0
- package/INSTALL-WINDOWS.md +305 -0
- package/INSTALL.md +364 -0
- package/JOB-QUICK-REF.md +222 -0
- package/LICENSE +21 -0
- package/QUICK-START.md +256 -0
- package/README.md +2170 -0
- package/SECURITY.md +34 -0
- package/UNINSTALL.md +129 -0
- package/UPGRADING.md +436 -0
- package/agents.js +67 -0
- package/approval.js +107 -0
- package/backup.js +390 -0
- package/bin/openclaw-scheduler.js +138 -0
- package/cli.js +1083 -0
- package/db.js +122 -0
- package/dispatch/529-recovery.mjs +204 -0
- package/dispatch/README.md +372 -0
- package/dispatch/config.example.json +24 -0
- package/dispatch/deliver-watcher.sh +57 -0
- package/dispatch/hooks.mjs +171 -0
- package/dispatch/index.mjs +1836 -0
- package/dispatch/watcher.mjs +1396 -0
- package/dispatch-queue.js +112 -0
- package/dispatcher-approvals.js +96 -0
- package/dispatcher-delivery.js +43 -0
- package/dispatcher-maintenance.js +242 -0
- package/dispatcher-shell.js +29 -0
- package/dispatcher-strategies.js +1280 -0
- package/dispatcher-utils.js +81 -0
- package/dispatcher.js +855 -0
- package/docs/adr-schedule-ownership.md +73 -0
- package/docs/gateway-contract.md +904 -0
- package/docs/plans/2026-03-09-fix-typescript-types.md +91 -0
- package/docs/plans/2026-03-09-test-coverage-gaps.md +83 -0
- package/docs/plans/2026-03-10-dispatcher-refactor.md +801 -0
- package/docs/trust-architecture.md +266 -0
- package/gateway.js +473 -0
- package/idempotency.js +119 -0
- package/index.d.ts +864 -0
- package/index.js +17 -0
- package/jobs.js +1224 -0
- package/messages.js +357 -0
- package/migrate-consolidate.js +694 -0
- package/migrate.js +125 -0
- package/package.json +130 -0
- package/paths.js +79 -0
- package/prompt-context.js +94 -0
- package/retrieval.js +176 -0
- package/runs.js +270 -0
- package/scheduler-schema.js +101 -0
- package/schema.sql +480 -0
- package/scripts/dispatch-cli-utils.mjs +65 -0
- package/scripts/inbox-consumer.mjs +288 -0
- package/scripts/stuck-detector.sh +18 -0
- package/scripts/stuck-run-detector.mjs +333 -0
- package/scripts/telegram-webhook-check.mjs +238 -0
- package/setup.mjs +724 -0
- package/shell-result.js +214 -0
- package/task-tracker.js +300 -0
- package/team-adapter.js +335 -0
- package/v02-runtime.js +599 -0
|
@@ -0,0 +1,1396 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* dispatch watcher -- polls a session until done, outputs the result.
|
|
4
|
+
*
|
|
5
|
+
* Used by scheduler shell jobs for async delivery with retry + audit trail.
|
|
6
|
+
* The scheduler runs this as a shell job with delivery_mode='announce-always',
|
|
7
|
+
* so stdout is delivered via handleDelivery (retry, alias, audit).
|
|
8
|
+
*
|
|
9
|
+
* Detection strategy:
|
|
10
|
+
* 1. Check `status --label` -- if auto-resolved to 'done', use it
|
|
11
|
+
* 2. If status says 'running' but session is idle (no activity for >60s),
|
|
12
|
+
* also check `result --label` for a lastReply -- if found, session completed
|
|
13
|
+
* but status hasn't caught up yet (auto-resolve has 10min threshold)
|
|
14
|
+
*
|
|
15
|
+
* 529/Overload auto-retry:
|
|
16
|
+
* When a session errors with a 529/FailoverError/overload pattern, the watcher
|
|
17
|
+
* will automatically retry up to MAX_529_RETRIES times with linearly increasing backoff
|
|
18
|
+
* (30s * retryCount). It respawns via `dispatch send` (falling back to a fresh enqueue) to continue
|
|
19
|
+
* the same session, and tracks retryCount in labels.json.
|
|
20
|
+
*
|
|
21
|
+
* Usage: node watcher.mjs --label <label> [--timeout <seconds>] [--poll-interval <seconds>]
|
|
22
|
+
*
|
|
23
|
+
* Exit codes:
|
|
24
|
+
* 0 -- session completed, result on stdout
|
|
25
|
+
* 1 -- timeout or error
|
|
26
|
+
* 2 -- argument error
|
|
27
|
+
*/
|
|
28
|
+
|
|
29
|
+
import { execFileSync, execSync } from 'child_process';
|
|
30
|
+
import { readFileSync, writeFileSync, renameSync, statSync } from 'fs';
|
|
31
|
+
import { dirname, join } from 'path';
|
|
32
|
+
import { homedir } from 'os';
|
|
33
|
+
import { fileURLToPath } from 'url';
|
|
34
|
+
import { sendMessage } from '../messages.js';
|
|
35
|
+
|
|
36
|
+
// Directory containing this module (ESM has no built-in __dirname).
const __dirname = dirname(fileURLToPath(import.meta.url));

// Paths to the dispatch CLI entry point and the labels file; each can be
// overridden through its environment variable.
const INDEX_PATH = process.env.DISPATCH_INDEX_PATH || join(__dirname, 'index.mjs');
const LABELS_PATH = process.env.DISPATCH_LABELS_PATH || join(__dirname, 'labels.json');

// Home directory used to locate the ~/.openclaw tree.
const HOME_DIR = process.env.HOME || homedir();

// In-memory copy of labels.json plus the signature it was read under.
let labelsCache = null;
let labelsCacheSignature = null;

// 529/overload retry policy: attempt cap and per-attempt delay unit.
const MAX_529_RETRIES = 3;
const RETRY_BASE_DELAY_MS = 30_000; // 30 seconds

// Max retries for gateway-restart-kill recovery.
const MAX_GW_RESTART_RETRIES = 2;

// 3 min flat = genuinely stuck.
const FLAT_WINDOW_MS = 180_000;
const ACTIVITY_POLL_MS = 30_000;

/**
 * How often the watcher writes lastPing to labels.json (heartbeat signal).
 * The watchdog guard in index.mjs treats pings older than 3x this as stale,
 * so PING_INTERVAL_MS must stay well below PING_STALE_MS (3 * 60_000).
 */
const PING_INTERVAL_MS = 60_000; // 60 seconds
|
|
55
|
+
|
|
56
|
+
/**
 * Resolve the gateway auth token.
 *
 * The OPENCLAW_GATEWAY_TOKEN environment variable wins when set; otherwise
 * the token is read from `gateway.auth.token` in ~/.openclaw/openclaw.json.
 *
 * @returns {string|null} the token, or null when none is configured or the
 *   config file cannot be read or parsed
 */
function getGatewayToken() {
  const fromEnv = process.env.OPENCLAW_GATEWAY_TOKEN;
  if (fromEnv) return fromEnv;

  let config;
  try {
    const raw = readFileSync(join(HOME_DIR, '.openclaw', 'openclaw.json'), 'utf-8');
    config = JSON.parse(raw);
  } catch {
    // A missing or malformed config simply means "no token".
    return null;
  }
  return config?.gateway?.auth?.token || null;
}

// Resolved once at module load and reused for every gateway call.
const GW_TOKEN = getGatewayToken();
|
|
68
|
+
|
|
69
|
+
// -- Gateway RPC (sync, matches index.mjs pattern) -----------
|
|
70
|
+
|
|
71
|
+
/**
 * Synchronous gateway RPC through the `openclaw gateway call` CLI.
 *
 * @param {string} method - RPC method name passed to the CLI
 * @param {Object} [params] - JSON-serialised and passed via --params
 * @param {Object} [opts]
 * @param {number} [opts.timeout] - RPC timeout in ms (default 15000); the
 *   child process itself is given 5 s more than this
 * @returns {*} the parsed JSON printed by the CLI, or null on failure
 */
function gatewayCall(method, params = {}, opts = {}) {
  const timeoutMs = opts.timeout || 15000;
  const cliArgs = [
    'gateway', 'call', method, '--json',
    '--params', JSON.stringify(params),
    '--timeout', String(timeoutMs),
  ];
  // Hand the resolved token to the child only when we actually have one.
  const env = GW_TOKEN
    ? { ...process.env, OPENCLAW_GATEWAY_TOKEN: GW_TOKEN }
    : process.env;

  try {
    const output = execFileSync('openclaw', cliArgs, {
      encoding: 'utf-8',
      timeout: timeoutMs + 5000,
      stdio: ['pipe', 'pipe', 'pipe'],
      env,
    });
    return JSON.parse(output.trim());
  } catch (err) {
    // If the failed call still captured stdout, try to parse that as JSON.
    const captured = err.stdout?.trim() || '';
    if (!captured) return null;
    try {
      return JSON.parse(captured);
    } catch {
      return null;
    }
  }
}
|
|
96
|
+
|
|
97
|
+
/**
 * Look up the current totalTokens of a session.
 *
 * The agent's sessions.json is consulted first; if it has no numeric
 * totalTokens for the key, the gateway `sessions.list` RPC is used instead.
 *
 * @param {string} sessionKey - session key; the agent id is taken from its
 *   second ':'-separated segment, defaulting to 'main'
 * @returns {number|null} token count, or null if unavailable
 */
function getSessionTokens(sessionKey) {
  let agentId = 'main';
  if (sessionKey) {
    agentId = sessionKey.split(':')[1] || 'main';
  }

  // Primary: direct read of sessions.json.
  const sessions = readSessionsStore(agentId);
  const stored = sessions && sessionKey in sessions
    ? sessions[sessionKey]?.totalTokens
    : undefined;
  if (typeof stored === 'number') return stored;

  // Fallback: gateway sessions.list API (may not see dispatcher-spawned sessions).
  const listing = gatewayCall('sessions.list', { activeMinutes: 1440 }, { timeout: 8000 });
  const match = listing?.sessions?.find((candidate) => candidate.key === sessionKey);
  return match?.totalTokens ?? null;
}
|
|
115
|
+
|
|
116
|
+
/**
 * Fetch a session's entry from the owning agent's sessions.json.
 *
 * @param {string} sessionKey - session key; agent id is its second
 *   ':'-separated segment, defaulting to 'main'
 * @returns {Object|null} the stored entry, or null when the key is empty,
 *   the store is unreadable, or the key is absent
 */
function getSessionStoreEntry(sessionKey) {
  if (!sessionKey) return null;

  const [, agentPart] = sessionKey.split(':');
  const sessions = readSessionsStore(agentPart || 'main');
  if (!sessions) return null;

  return sessionKey in sessions ? sessions[sessionKey] : null;
}
|
|
123
|
+
|
|
124
|
+
/**
 * Parse `--flag value` / `--flag=value` pairs out of an argv array.
 *
 * A `--flag` followed by another `--flag`, by an empty string, or by nothing
 * is recorded as boolean `true`. Tokens that do not start with `--` and are
 * not consumed as a value are ignored.
 *
 * @param {string[]} argv - raw argument tokens
 * @returns {Object<string, string|true>} flag name (without `--`) to value
 */
function parseFlags(argv) {
  const flags = {};
  let idx = 0;

  while (idx < argv.length) {
    const token = argv[idx];
    idx += 1;
    if (!token.startsWith('--')) continue;

    // --name=value form: everything after the first '=' is the value.
    const eqAt = token.indexOf('=');
    if (eqAt > 0) {
      flags[token.slice(2, eqAt)] = token.slice(eqAt + 1);
      continue;
    }

    // --name value form: consume the next token unless it is itself a flag.
    const candidate = argv[idx];
    if (candidate && !candidate.startsWith('--')) {
      flags[token.slice(2)] = candidate;
      idx += 1;
    } else {
      flags[token.slice(2)] = true;
    }
  }

  return flags;
}
|
|
144
|
+
|
|
145
|
+
/**
 * Run a dispatch subcommand in a child Node process and parse its stdout.
 *
 * @param {string} subcmd - subcommand passed to index.mjs
 * @param {string[]} args - extra CLI arguments for the subcommand
 * @returns {*} parsed JSON output, or null if the child fails, times out
 *   (30 s), or prints something that is not valid JSON
 */
function dispatch(subcmd, args) {
  const spawnOptions = {
    encoding: 'utf-8',
    timeout: 30000,
    stdio: ['pipe', 'pipe', 'pipe'],
  };

  try {
    const raw = execFileSync(process.execPath, [INDEX_PATH, subcmd, ...args], spawnOptions);
    return JSON.parse(raw.trim());
  } catch {
    return null;
  }
}
|
|
160
|
+
|
|
161
|
+
/**
 * Promise-based delay.
 *
 * @param {number} ms - milliseconds to wait
 * @returns {Promise<void>} resolves (with no value) after the timer fires
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
|
164
|
+
|
|
165
|
+
// -- 529/Overload Detection & Retry --------------------------
|
|
166
|
+
|
|
167
|
+
/**
 * Regex patterns that indicate a 529/overload error.
 *
 * The status code is matched only as a standalone token (not flanked by
 * letters or digits), so that numbers and IDs which merely contain "529"
 * (e.g. "15290 tokens", "req_01AB5290CD") do not trigger overload retries.
 */
const OVERLOAD_PATTERNS = [
  /(?<![A-Za-z0-9])529(?![A-Za-z0-9])/,
  /failover\s*error/i,
  /overload/i,
  /temporarily\s+overloaded/i,
  /service.*overloaded/i,
  /rate.limit/i,
  /too.many.requests/i,
];

/**
 * Check if an error message matches a 529/overload pattern.
 *
 * @param {*} errorMsg - error text; anything that is not a non-empty string
 *   is treated as "not an overload error"
 * @returns {boolean} true when any overload pattern matches
 */
function is529Error(errorMsg) {
  if (!errorMsg || typeof errorMsg !== 'string') return false;
  return OVERLOAD_PATTERNS.some((pattern) => pattern.test(errorMsg));
}
|
|
185
|
+
|
|
186
|
+
/**
 * Summary patterns meaning "the session is no longer in the sessions store".
 *
 * That wording is the signature of a gateway-restart kill: the gateway
 * restarted, in-flight sessions were wiped, and the status command
 * auto-resolved the label as 'done' because its sessionKey vanished from
 * sessions.json.
 */
const GW_KILL_PATTERNS = [
  /session not found in sessions store/i,
  /session not found in gateway store/i,
  /session never found/i,
  /Auto-resolved.*session not found/i,
  /Auto-resolved.*never found/i,
];

/**
 * Decide whether a status summary describes a gateway-restart kill.
 *
 * @param {*} summary - status summary text; non-strings and empty strings
 *   never match
 * @returns {boolean} true when any GW_KILL_PATTERNS entry matches
 */
function isGatewayRestartKill(summary) {
  if (typeof summary !== 'string' || summary === '') return false;

  for (const pattern of GW_KILL_PATTERNS) {
    if (pattern.test(summary)) return true;
  }
  return false;
}
|
|
207
|
+
|
|
208
|
+
/**
 * Compute a cheap change-detection signature for labels.json.
 *
 * @returns {string} `<mtimeMs>:<size>` of the file, or 'missing' when it
 *   cannot be stat'ed
 */
function getLabelsSignature() {
  let info;
  try {
    info = statSync(LABELS_PATH);
  } catch {
    return 'missing';
  }
  return `${info.mtimeMs}:${info.size}`;
}
|
|
219
|
+
|
|
220
|
+
/**
 * Load labels.json directly from disk (no CLI round-trip), with caching.
 *
 * The cached object is returned as long as the file's signature is unchanged.
 * On a read or parse failure the cache is reset to an empty object under
 * the 'missing' signature.
 *
 * @returns {Object} the labels map (shared cached instance)
 */
function loadLabels() {
  const currentSignature = getLabelsSignature();
  const cacheIsFresh = Boolean(labelsCache) && labelsCacheSignature === currentSignature;
  if (cacheIsFresh) return labelsCache;

  let loaded;
  let loadedSignature;
  try {
    loaded = JSON.parse(readFileSync(LABELS_PATH, 'utf-8'));
    loadedSignature = currentSignature;
  } catch {
    loaded = {};
    loadedSignature = 'missing';
  }

  labelsCache = loaded;
  labelsCacheSignature = loadedSignature;
  return labelsCache;
}
|
|
236
|
+
|
|
237
|
+
/**
 * Persist the labels map to labels.json and refresh the in-memory cache.
 *
 * The JSON is written to a per-process temp file and then renamed over the
 * real file, so other readers do not see a partially written labels.json.
 *
 * @param {Object} labels - full labels map to write
 */
function saveLabels(labels) {
  const tempPath = `${LABELS_PATH}.tmp.${process.pid}`;
  const serialized = `${JSON.stringify(labels, null, 2)}\n`;

  writeFileSync(tempPath, serialized);
  renameSync(tempPath, LABELS_PATH);

  // Keep the cache in step with what is now on disk.
  labelsCache = labels;
  labelsCacheSignature = getLabelsSignature();
}
|
|
247
|
+
|
|
248
|
+
/**
 * Load the labels map, let `mutator` edit it in place, and save the result.
 *
 * @param {function(Object): (boolean|void)} mutator - receives the labels
 *   map; returning exactly `false` skips the save
 * @returns {Object} the (possibly mutated) labels map
 */
function mutateLabels(mutator) {
  const labels = loadLabels();
  if (mutator(labels) === false) {
    return labels;
  }
  saveLabels(labels);
  return labels;
}
|
|
256
|
+
|
|
257
|
+
/**
 * Apply `mutator` to one existing label entry and stamp its `updatedAt`.
 *
 * Nothing is written when the label does not exist or when the mutator
 * returns exactly `false`.
 *
 * @param {string} label - key of the entry in labels.json
 * @param {function(Object, Object): (boolean|void)} mutator - called with
 *   (entry, allLabels)
 * @returns {Object} the labels map
 */
function updateExistingLabel(label, mutator) {
  const applyToEntry = (labels) => {
    const entry = labels[label];
    if (!entry) return false;
    if (mutator(entry, labels) === false) return false;

    labels[label].updatedAt = new Date().toISOString();
    return true;
  };
  return mutateLabels(applyToEntry);
}
|
|
266
|
+
|
|
267
|
+
/**
 * Read the stored 529 retry counter of a label.
 *
 * @param {string} label - label key
 * @returns {number} the entry's retryCount, or 0 when unset or absent
 */
function getRetryCount(label) {
  const entry = loadLabels()[label];
  return entry?.retryCount || 0;
}
|
|
274
|
+
|
|
275
|
+
/**
 * Store the 529 retry counter on a label (no-op if the label is absent).
 *
 * @param {string} label - label key
 * @param {number} count - new retryCount value
 */
function setRetryCount(label, count) {
  const applyCount = (entry) => {
    entry.retryCount = count;
  };
  updateExistingLabel(label, applyCount);
}
|
|
283
|
+
|
|
284
|
+
/**
 * Read the stored gateway-restart retry counter of a label.
 *
 * @param {string} label - label key
 * @returns {number} the entry's gwRestartRetryCount, or 0 when unset or absent
 */
function getGwRestartRetryCount(label) {
  const entry = loadLabels()[label];
  return entry?.gwRestartRetryCount || 0;
}
|
|
291
|
+
|
|
292
|
+
/**
 * Store the gateway-restart retry counter on a label (no-op if absent).
 *
 * @param {string} label - label key
 * @param {number} count - new gwRestartRetryCount value
 */
function setGwRestartRetryCount(label, count) {
  const applyCount = (entry) => {
    entry.gwRestartRetryCount = count;
  };
  updateExistingLabel(label, applyCount);
}
|
|
300
|
+
|
|
301
|
+
/**
 * Post a text notification from 'dispatch' to 'main' via sendMessage.
 *
 * Best-effort: a failure is logged to stderr and never thrown. When
 * OPENCLAW_SCHEDULER_NOTIFY_DISABLED is '1' the message is only echoed to
 * stderr.
 *
 * @param {string} message - notification body
 */
function notify(message) {
  const suppressed = process.env.OPENCLAW_SCHEDULER_NOTIFY_DISABLED === '1';
  if (suppressed) {
    process.stderr.write(`[watcher] notify suppressed (test mode): ${message}\n`);
    return;
  }

  const payload = {
    from_agent: 'dispatch',
    to_agent: 'main',
    body: message,
    kind: 'text',
  };
  try {
    sendMessage(payload);
  } catch (err) {
    process.stderr.write(`[watcher] notify failed: ${err.message}\n`);
  }
}
|
|
320
|
+
|
|
321
|
+
/**
 * Decide whether a 529'd session gets another retry and record the decision.
 *
 * This function does not sleep or respawn. When retries remain it bumps the
 * persisted retryCount and returns the delay to use. When they are exhausted
 * it marks the label as 'error'. A notification is sent in both cases.
 *
 * @param {string} label - label being watched
 * @param {number} retryCount - retries already performed
 * @param {string} errorMsg - the overload error text (stored on give-up)
 * @returns {{retry: false} | {retry: true, delayMs: number, newRetryCount: number}}
 */
function attempt529Retry(label, retryCount, errorMsg) {
  const retriesLeft = retryCount < MAX_529_RETRIES;

  if (retriesLeft) {
    const newRetryCount = retryCount + 1;
    // Delay grows linearly with the attempt number.
    const delayMs = RETRY_BASE_DELAY_MS * newRetryCount;
    const delaySec = delayMs / 1000;

    process.stderr.write(
      `[watcher] 529 detected for [${label}] (attempt ${newRetryCount}/${MAX_529_RETRIES}). Waiting ${delaySec}s before retry...\n`
    );
    notify(`🌶️ Dispatch: [${label}] hit 529 overload -- retry ${newRetryCount}/${MAX_529_RETRIES} in ${delaySec}s`);

    // Persist the new count before the caller waits, so the intent is recorded.
    setRetryCount(label, newRetryCount);

    return { retry: true, delayMs, newRetryCount };
  }

  // Max retries exceeded: record the failure and give up.
  updateExistingLabel(label, (entry) => {
    entry.status = 'error';
    entry.error = `max_retries_exceeded (${retryCount}x 529): ${errorMsg}`;
  });
  notify(`🌶️ Dispatch: [${label}] hit max retries (${MAX_529_RETRIES}x 529 overload) -- giving up`);
  return { retry: false };
}
|
|
352
|
+
|
|
353
|
+
/**
 * Start a fresh session for `label` via `dispatch enqueue --mode fresh`,
 * carrying over the model/thinking/origin/delivery settings stored on the
 * label entry, then clear the label's stale `error` field.
 *
 * Shared by both respawn paths so they build identical CLI arguments.
 * Throws if the child process fails or the label update cannot be written.
 *
 * @param {string} label - label key to enqueue under
 * @param {Object|undefined} entry - stored label entry (may be undefined)
 * @param {string} message - message sent to the new session
 */
function enqueueFreshSession(label, entry, message) {
  const enqueueArgs = [
    INDEX_PATH, 'enqueue',
    '--label', label,
    '--message', message,
    '--mode', 'fresh',
  ];
  if (entry?.model) enqueueArgs.push('--model', entry.model);
  if (entry?.thinking) enqueueArgs.push('--thinking', entry.thinking);
  if (entry?.origin) enqueueArgs.push('--origin', entry.origin);
  if (entry?.deliverTo) {
    // Delivery mode/channel are only forwarded together with a delivery target.
    enqueueArgs.push('--deliver-to', entry.deliverTo);
    if (entry.deliveryMode) enqueueArgs.push('--delivery-mode', entry.deliveryMode);
    if (entry.deliverChannel) enqueueArgs.push('--deliver-channel', entry.deliverChannel);
  }

  execFileSync(process.execPath, enqueueArgs, {
    encoding: 'utf-8',
    timeout: 30000,
    stdio: ['pipe', 'pipe', 'pipe'],
  });

  // Drop the error left over from the failed run so it does not linger on
  // the label after the respawn.
  updateExistingLabel(label, (current) => {
    current.error = null;
  });
}

/**
 * Respawn a session that failed with a 529/overload error.
 *
 * First tries `dispatch send` with a continuation message (which reuses the
 * existing session) and marks the label 'running'. If that fails for any
 * reason, including the label being absent, it falls back to a fresh
 * enqueue.
 *
 * @param {string} label - label key of the session to respawn
 * @returns {boolean} true if either attempt succeeded, false if both failed
 */
function respawnSession(label) {
  try {
    const entry = loadLabels()[label];
    if (!entry) throw new Error(`label "${label}" not found`);

    const continuationMsg = `[Auto-retry after 529 overload] Please continue your previous task. Pick up where you left off.`;

    execFileSync(process.execPath, [
      INDEX_PATH, 'send',
      '--label', label,
      '--message', continuationMsg,
    ], {
      encoding: 'utf-8',
      timeout: 30000,
      stdio: ['pipe', 'pipe', 'pipe'],
    });

    // updateExistingLabel re-reads labels.json, so changes made by the child
    // process are picked up before the status is reset.
    updateExistingLabel(label, (current) => {
      current.status = 'running';
      current.error = null;
    });

    process.stderr.write(`[watcher] respawned [${label}] via send (reuse session)\n`);
    return true;
  } catch (err) {
    process.stderr.write(`[watcher] respawn via send failed: ${err.message}\n`);

    // Fallback: try fresh enqueue if send fails (session may be dead).
    try {
      const entry = loadLabels()[label];
      const continuationMsg = `[Auto-retry after 529 overload] This is a retry of a previous task that failed due to API overload. Please continue the task from the beginning.`;

      enqueueFreshSession(label, entry, continuationMsg);

      process.stderr.write(`[watcher] respawned [${label}] via fresh enqueue (fallback)\n`);
      return true;
    } catch (err2) {
      process.stderr.write(`[watcher] respawn fallback also failed: ${err2.message}\n`);
      return false;
    }
  }
}

/**
 * Re-enqueue a label after a gateway-restart kill.
 *
 * Always uses fresh mode, since the original session is gone (the gateway
 * restart wiped it). The label's error field is cleared on success so the
 * watcher can continue polling the new session.
 *
 * @param {string} label - label key of the killed session
 * @returns {boolean} true if the fresh enqueue succeeded, false otherwise
 */
function respawnAfterGwRestart(label) {
  try {
    const entry = loadLabels()[label];
    if (!entry) throw new Error(`label "${label}" not found`);

    const continuationMsg =
      `[Auto-retry after gateway restart] Previous run was killed by gateway restart. ` +
      `Resume from the beginning.`;

    enqueueFreshSession(label, entry, continuationMsg);

    process.stderr.write(`[watcher] respawned [${label}] via fresh enqueue after gateway restart\n`);
    return true;
  } catch (err) {
    process.stderr.write(`[watcher] respawn after gw restart failed: ${err.message}\n`);
    return false;
  }
}
|
|
475
|
+
|
|
476
|
+
// -- Gateway Steer & Kill -------------------------------------
|
|
477
|
+
|
|
478
|
+
/**
 * Send a steer message into a running session through the gateway `agent`
 * RPC (synchronous).
 *
 * Note: gatewayCall signals failure by returning null, and that return value
 * is not inspected here. `true` therefore means the call was attempted, not
 * that the gateway accepted it.
 *
 * @param {string} sessionKey - target session
 * @param {string} message - steer text
 * @returns {boolean} false when no gateway token is available or the call
 *   throws; true otherwise
 */
function steerSession(sessionKey, message) {
  if (!GW_TOKEN) {
    process.stderr.write(`[watcher] steer skipped: no gateway token\n`);
    return false;
  }

  const params = {
    message,
    sessionKey,
    deliver: false,
    lane: 'nested',
  };
  try {
    gatewayCall('agent', params, { timeout: 15000 });
  } catch (err) {
    process.stderr.write(`[watcher] steer failed: ${err.message}\n`);
    return false;
  }
  return true;
}
|
|
499
|
+
|
|
500
|
+
/**
 * Ask the gateway to kill a session via the `subagents.kill` RPC (sync).
 *
 * Best-effort: the call is skipped without a gateway token, and any thrown
 * error is only logged to stderr.
 *
 * @param {string} sessionKey - session to kill (sent as `target`)
 */
function killSession(sessionKey) {
  if (GW_TOKEN) {
    try {
      gatewayCall('subagents.kill', { target: sessionKey }, { timeout: 10000 });
    } catch (err) {
      process.stderr.write(`[watcher] kill failed: ${err.message}\n`);
    }
    return;
  }
  process.stderr.write(`[watcher] kill skipped: no gateway token\n`);
}
|
|
514
|
+
|
|
515
|
+
/**
 * Read an agent's sessions.json straight from disk.
 *
 * This file is the primary ground truth for session state: sessions spawned
 * via the dispatcher HTTP agent endpoint appear here but NOT in
 * sessions_list API results.
 *
 * @param {string} agent - Agent ID (default: 'main')
 * @returns {Object|null} - Sessions store object, or null on read error
 */
function readSessionsStore(agent = 'main') {
  try {
    const storeFile = join(HOME_DIR, '.openclaw', 'agents', agent, 'sessions', 'sessions.json');
    const raw = readFileSync(storeFile, 'utf-8');
    return JSON.parse(raw);
  } catch {
    return null;
  }
}
|
|
531
|
+
|
|
532
|
+
/**
 * Return the mtime (ms) of a session's JSONL transcript file at
 * ~/.openclaw/agents/<agentDir>/sessions/<sessionId>.jsonl.
 *
 * Why this exists: for spawned subagent sessions, sessions.json is NOT
 * flushed (totalTokens / updatedAt stay stale) during active turns, whereas
 * the JSONL file is written continuously as the session processes messages,
 * tool calls and streaming chunks. Its mtime is therefore the more reliable
 * liveness signal. Without it the watcher can hit FLAT_WINDOW_MS mid-turn and
 * mark the session done prematurely, leaving a zombie session with no
 * delivery.
 *
 * @param {string} sessionId - Internal session UUID (entry.sessionId from sessions.json)
 * @param {string} agentDir - Agent directory (default: 'main')
 * @returns {number|null} mtimeMs if file exists, null otherwise
 */
function getSessionJsonlMtime(sessionId, agentDir = 'main') {
  if (!sessionId) return null;

  try {
    const transcript = join(HOME_DIR, '.openclaw', 'agents', agentDir, 'sessions', `${sessionId}.jsonl`);
    const { mtimeMs } = statSync(transcript);
    return mtimeMs;
  } catch {
    return null;
  }
}
|
|
560
|
+
|
|
561
|
+
|
|
562
|
+
/**
 * Parse the last N non-empty lines of a session's JSONL transcript.
 *
 * Lines that are not valid JSON, or that parse to a falsy value, are dropped,
 * so the result may hold fewer than N items.
 *
 * @param {string} sessionId - Internal session UUID
 * @param {string} agentDir - Agent directory (default: 'main')
 * @param {number} n - Number of lines to read from end (default: 3)
 * @returns {Array|null} parsed JSON objects, or null if the id is empty or
 *   the file cannot be read
 */
function readJsonlLastLines(sessionId, agentDir = 'main', n = 3) {
  if (!sessionId) return null;

  try {
    const transcript = join(HOME_DIR, '.openclaw', 'agents', agentDir, 'sessions', `${sessionId}.jsonl`);
    const text = readFileSync(transcript, 'utf-8');
    const nonEmpty = text.split('\n').filter((line) => line.trim());

    const parsed = [];
    for (const line of nonEmpty.slice(-n)) {
      let value = null;
      try {
        value = JSON.parse(line);
      } catch {
        value = null;
      }
      if (value) parsed.push(value);
    }
    return parsed;
  } catch {
    return null;
  }
}
|
|
586
|
+
|
|
587
|
+
/**
 * Inspect the tail of a session's JSONL transcript to tell whether the
 * session is in the middle of a turn.
 *
 * Mid-turn (returns a reason string) requires the file to have been modified
 * within FLAT_WINDOW_MS AND the last parsed entry to be one of:
 *   - role=assistant whose content array holds a tool_use item, or which
 *     has top-level type=tool_use  -> tool call dispatched, no result yet
 *   - role=user whose content array holds a tool_result item, or any entry
 *     with top-level type=tool_result -> result delivered, no reply yet
 *
 * Safe to proceed (returns null) when the session id is empty, the file is
 * missing, it has been idle for more than FLAT_WINDOW_MS, no entry could be
 * parsed, or the last entry matches none of the shapes above.
 *
 * @param {string} sessionId - Internal session UUID
 * @param {string} agentDir - Agent directory (default: 'main')
 * @returns {string|null} reason string if mid-turn, null if safe to proceed
 */
function getJsonlMidTurnReason(sessionId, agentDir = 'main') {
  if (!sessionId) return null;

  const transcript = join(HOME_DIR, '.openclaw', 'agents', agentDir, 'sessions', `${sessionId}.jsonl`);
  let modifiedAt;
  try {
    modifiedAt = statSync(transcript).mtimeMs;
  } catch {
    // File doesn't exist -- session is genuinely gone, safe to proceed.
    return null;
  }

  // A transcript untouched for longer than the flat window is not active.
  const idleMs = Date.now() - modifiedAt;
  if (idleMs > FLAT_WINDOW_MS) return null;

  const tail = readJsonlLastLines(sessionId, agentDir, 3);
  if (!tail || tail.length === 0) return null;

  const [last] = tail.slice(-1);
  const role = last?.role;
  const items = Array.isArray(last?.content) ? last.content : [];
  const awaitingReply = 'last entry is tool_result (tool executed, awaiting assistant reply)';

  if (role === 'assistant') {
    // Assistant dispatched a tool call inside its content array.
    const toolUse = items.find((item) => item?.type === 'tool_use');
    if (toolUse) {
      return `last assistant entry has tool_use (${toolUse.name || 'unknown'}) -- awaiting tool result`;
    }
    // Top-level type=tool_use (non-array content format).
    if (last.type === 'tool_use') {
      return `last entry is tool_use (${last.name || 'unknown'}) -- awaiting tool result`;
    }
  }

  // Tool result delivered as a user entry; the assistant has not replied yet.
  if (role === 'user' && items.some((item) => item?.type === 'tool_result')) {
    return awaitingReply;
  }

  // Top-level type=tool_result (alternative format).
  if (last?.type === 'tool_result') {
    return awaitingReply;
  }

  // None of the mid-turn shapes matched -- safe to proceed.
  return null;
}
|
|
658
|
+
|
|
659
|
+
/**
 * Mark the watched label as 'done' in labels.json (best-effort).
 *
 * Entries that are already 'done' are left untouched. Any failure while
 * updating is logged to stderr and swallowed.
 *
 * @param {string} label - label key
 * @param {string} [summary] - summary to store; falls back to the entry's
 *   existing summary, then null
 */
function markLabelDone(label, summary) {
  const finalize = (entry) => {
    if (entry.status === 'done') return false;
    Object.assign(entry, {
      status: 'done',
      summary: summary || entry.summary || null,
    });
  };

  try {
    updateExistingLabel(label, finalize);
  } catch (e) {
    process.stderr.write(`[watcher] markLabelDone failed: ${e.message}\n`);
  }
}
|
|
674
|
+
|
|
675
|
+
/**
 * Mark the watched label as 'error' in labels.json (best-effort).
 *
 * Used for sessions that did NOT complete successfully (gateway-restart
 * kill, timeout with no result, spawn failure) so the recorded status
 * reflects the failure. Entries already 'done' are never downgraded. Any
 * failure while updating is logged to stderr and swallowed.
 *
 * @param {string} label - label key
 * @param {string} [errorSummary] - stored as the entry's summary; defaults
 *   to 'failed without result'
 */
function markLabelError(label, errorSummary) {
  const fail = (entry) => {
    if (entry.status === 'done') return false;
    Object.assign(entry, {
      status: 'error',
      summary: errorSummary || 'failed without result',
    });
  };

  try {
    updateExistingLabel(label, fail);
  } catch (e) {
    process.stderr.write(`[watcher] markLabelError failed: ${e.message}\n`);
  }
}
|
|
692
|
+
|
|
693
|
+
/**
 * Cut `text` to at most `maxLen` UTF-16 code units without leaving a dangling
 * high surrogate at the end (a plain slice can split an emoji in half).
 */
function clipWithoutSplittingSurrogate(text, maxLen) {
  if (text.length <= maxLen) return text;
  const lastUnit = text.charCodeAt(maxLen - 1);
  const endsOnHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
  return text.slice(0, endsOnHighSurrogate ? maxLen - 1 : maxLen);
}

/**
 * Describe a verify-cmd failure that was NOT an ordinary non-zero exit
 * (timeout or signal kill). Returns null for a plain non-zero exit.
 */
function describeAbnormalVerifyExit(verifyErr, timeoutMs) {
  if (verifyErr.code === 'ETIMEDOUT') {
    return `timed out after ${Math.round(timeoutMs / 1000)}s`;
  }
  if (verifyErr.signal) {
    return `killed by signal ${verifyErr.signal}`;
  }
  return null;
}

/**
 * Format and output the delivery message, then exit 0.
 * Also marks the label as done in labels.json before exiting.
 *
 * If the label has a verifyCmd stored, it is run first (shell, 60s timeout).
 * If the verify command fails, the job is marked as error, a failure notice
 * is written to stdout, and the process exits 1 instead.
 * If labels.json cannot be loaded, the verify step is skipped (non-fatal).
 *
 * @param {string} label - Label being watched.
 * @param {string|null|undefined} lastReply - Last assistant reply, if captured.
 * @param {string|null|undefined} fallbackSummary - Summary stored on the label;
 *   when falsy, the first 500 code units of lastReply are used.
 */
function deliverResult(label, lastReply, fallbackSummary) {
  const VERIFY_TIMEOUT_MS = 60000;

  // -- verify-cmd check -----------------------------------------------------
  // Run the stored verify-cmd (if any) before declaring the job done.
  // A failure flips the job to error state and sends an alert instead.
  try {
    const labels = loadLabels();
    const entry = labels[label];
    if (entry?.verifyCmd) {
      process.stderr.write(`[watcher] running verify-cmd for ${label}: ${entry.verifyCmd}\n`);
      try {
        execSync(entry.verifyCmd, { stdio: 'pipe', timeout: VERIFY_TIMEOUT_MS, shell: true });
        process.stderr.write(`[watcher] verify-cmd passed for ${label}\n`);
      } catch (verifyErr) {
        const stderr = verifyErr.stderr ? verifyErr.stderr.toString().trim() : verifyErr.message;
        // On timeout / signal kill the exit status is null; report that
        // explicitly instead of pretending the command exited with code 1.
        const abnormal = describeAbnormalVerifyExit(verifyErr, VERIFY_TIMEOUT_MS);
        const errMsg = `verify-cmd failed: ${stderr || abnormal || 'exit code ' + (verifyErr.status ?? 1)}`;
        process.stderr.write(`[watcher] ${errMsg}\n`);
        markLabelError(label, errMsg);
        // Output failure notice -- scheduler delivers this to the delivery target
        process.stdout.write(
          `🌶️ *dispatch* [${label}] ⚠️ VERIFICATION FAILED\n\n` +
          `The agent session completed but the post-completion verify-cmd exited non-zero.\n\n` +
          `**Verify command:** \`${entry.verifyCmd}\`\n` +
          `**Error:** ${stderr || abnormal || 'non-zero exit'}\n\n` +
          `Job marked as \`error\`. The agent may have reported done without completing the actual work.\n`
        );
        process.exit(1);
        // Unreachable in production; keeps the failure path from falling
        // through to the success path when process.exit is stubbed.
        return;
      }
    }
  } catch (loadErr) {
    // Non-fatal -- if labels can't be read, skip verify check and proceed normally
    process.stderr.write(`[watcher] verify-cmd check skipped (labels load error): ${loadErr.message}\n`);
  }

  // Update labels.json before exiting -- prevents stuck detector false positives
  const summary = fallbackSummary || (lastReply ? clipWithoutSplittingSurrogate(lastReply, 500) : null);
  markLabelDone(label, summary);

  if (lastReply) {
    const maxLen = 3500;
    const reply = lastReply.length > maxLen
      ? clipWithoutSplittingSurrogate(lastReply, maxLen) + '\n\n..[truncated]'
      : lastReply;
    process.stdout.write(`🌶️ *dispatch* [${label}] completed:\n\n${reply}\n`);
  } else {
    process.stdout.write(
      `🌶️ *dispatch* [${label}] completed (no reply captured)\n` +
      `Summary: ${fallbackSummary || 'none'}\n`
    );
  }
  process.exit(0);
}
|
|
752
|
+
|
|
753
|
+
// -- Watcher heartbeat interval handle ------------------------------------
// Null until the heartbeat timer is started further down (after the label
// has been validated); the 'exit' handler below clears it again.
let _pingInterval = null;

// -- Sync on exit ----------------------------------------------------------
// On every watcher exit, run `<node> <INDEX_PATH> sync` synchronously so that
// stale 'running' entries in labels.json get reconciled promptly instead of
// tripping the stuck detector. Strictly best-effort: failures are ignored.
process.on('exit', function reconcileOnExit() {
  // Stop the heartbeat first so no further pings are scheduled.
  if (_pingInterval !== null) {
    clearInterval(_pingInterval);
    _pingInterval = null;
  }

  const syncOptions = {
    encoding: 'utf-8',
    timeout: 15000, // bound the time spent blocking exit
    stdio: ['pipe', 'pipe', 'pipe'],
  };

  try {
    execFileSync(process.execPath, [INDEX_PATH, 'sync'], syncOptions);
  } catch {
    // Best-effort -- never block exit
  }
});
|
|
778
|
+
|
|
779
|
+
// -- Main ----------------------------------------------------

/**
 * Parse a "seconds" CLI flag. Returns `fallbackSeconds` when the flag is
 * absent, and also (with a warning on stderr) when it is present but not a
 * non-negative integer. A NaN here would otherwise produce a NaN deadline
 * (poll loop skipped entirely) or a NaN sleep (busy polling).
 */
function parseSecondsFlag(rawValue, fallbackSeconds, flagName) {
  if (rawValue === undefined || rawValue === null || rawValue === '') {
    return fallbackSeconds;
  }
  const parsed = Number.parseInt(rawValue, 10);
  if (Number.isNaN(parsed) || parsed < 0) {
    process.stderr.write(
      `[watcher] invalid --${flagName} value "${rawValue}", using default ${fallbackSeconds}s\n`
    );
    return fallbackSeconds;
  }
  return parsed;
}

const flags = parseFlags(process.argv.slice(2));
const label = flags.label;
// Overall watch budget in seconds (default 600).
const timeoutS = parseSecondsFlag(flags.timeout, 600, 'timeout');
// Delay between status polls in seconds (default 20).
const pollS = parseSecondsFlag(flags['poll-interval'], 20, 'poll-interval');

// How long a session must be idle before we proactively check result
const IDLE_RESULT_CHECK_MS = 60000;

if (!label) {
  process.stderr.write('[watcher] --label is required\n');
  process.exit(2);
}
|
|
793
|
+
|
|
794
|
+
// -- Start heartbeat -----------------------------------------------------
// Every PING_INTERVAL_MS, stamp `lastPing` (ISO timestamp) onto this label's
// entry in labels.json -- but only while the entry is still 'running'.
// Per the original notes, the watchdog guard in index.mjs reads lastPing to
// tell that this watcher is alive, so it does not auto-resolve the session
// during slow tool calls, docker builds, long pytest runs, etc.
// The timer is cleared by the process.on('exit') handler.
//
// Race-condition note (from the original author): labels.json is cached by
// file mtime/size, but each tick re-validates the on-disk signature before
// patching lastPing. Worst case a concurrent writer wins one tick and the
// next tick repairs it.
_pingInterval = setInterval(function writeHeartbeat() {
  try {
    updateExistingLabel(label, (record) => {
      const stillRunning = record.status === 'running';
      if (!stillRunning) return false;
      record.lastPing = new Date().toISOString();
    });
  } catch {
    // Best-effort -- never crash the watcher over a ping failure
  }
}, PING_INTERVAL_MS);

// Unref'd so the heartbeat alone never keeps the Node.js process alive.
_pingInterval.unref();
|
|
815
|
+
|
|
816
|
+
// Watch window: starts now and initially ends timeoutS seconds later.
const spawnTime = Date.now();
let deadline = spawnTime + timeoutS * 1000;

// Give up after this many back-to-back failed status polls.
const MAX_CONSECUTIVE_FAILURES = 10;
let consecutiveFailures = 0;

// Most recent sessionKey seen while polling (kept for steer/kill recovery).
let recoverySessionKey = null;

// Module-level so the SIGTERM handler can deliver the latest known reply.
let lastKnownReply = null;
|
|
824
|
+
|
|
825
|
+
// -- SIGTERM handler (scheduler kills watcher with SIGTERM before SIGKILL) --
// Ensures labels.json is updated and a delivery attempt is made even when killed.
//
// NOTE(review): the log line says "marking as interrupted", but delivery goes
// through deliverResult, which marks the label 'done' and exits 0 (and runs any
// stored verify-cmd first). Confirm that is the intended outcome for a watcher
// that was killed, especially when no reply was captured.
process.on('SIGTERM', () => {
  process.stderr.write(`[watcher] SIGTERM received for ${label} -- marking as interrupted\n`);
  // Try to fetch the latest result before dying
  try {
    const result = dispatch('result', ['--label', label]);
    if (result?.lastReply) lastKnownReply = result.lastReply;
  } catch (fetchErr) {
    // Still best-effort, but leave a trace so a lost reply can be diagnosed.
    process.stderr.write(
      `[watcher] SIGTERM: final result fetch failed for ${label}: ${fetchErr?.message ?? fetchErr}\n`
    );
  }
  // deliverResult calls process.exit(0) internally
  deliverResult(label, lastKnownReply, 'interrupted by watcher timeout');
});
|
|
837
|
+
|
|
838
|
+
// -- Rolling deadline state ------------------------------------
let lastTokens = null;             // token count seen on the previous poll
let preDeadlineJsonlMtime = null;  // JSONL mtime sampled each poll cycle (subagent activity signal)
let preDeadlineSessionId = null;   // session the mtime belongs to; a change resets the baseline
const ROLLING_EXTEND_MS = 5 * 60 * 1000;            // each extension targets now + 5min
const MAX_DEADLINE_EXTENSION = 4 * 60 * 60 * 1000;  // 4h absolute ceiling

/**
 * Try to move the watcher deadline to now + ROLLING_EXTEND_MS, never beyond
 * spawnTime + min(timeoutS * 1000, MAX_DEADLINE_EXTENSION). The deadline is
 * only ever moved forward; a log line is written when it moves.
 *
 * NOTE(review): `deadline` starts at spawnTime + timeoutS * 1000, which is
 * already >= this cap, so as far as this section of the file shows the
 * function can never actually move the deadline and always returns false.
 * Confirm whether the cap is meant to be this tight.
 *
 * @param {string} reason - Human-readable reason for the log line
 * @returns {boolean} true if the deadline was actually moved
 */
function tryExtendDeadline(reason) {
  const hardCap = spawnTime + Math.min(timeoutS * 1000, MAX_DEADLINE_EXTENSION);
  const candidate = Math.min(Date.now() + ROLLING_EXTEND_MS, hardCap);

  if (candidate <= deadline) {
    return false;
  }

  deadline = candidate;
  const minutesLeft = Math.round((deadline - Date.now()) / 60000);
  process.stderr.write(
    `[watcher] [${label}] ${reason}, deadline extended to +${minutesLeft}min\n`
  );
  return true;
}
|
|
864
|
+
|
|
865
|
+
// Track whether the session has EVER appeared in the gateway sessions list.
// Used to distinguish spawn failures (session never appeared) from normal
// completions (session appeared, ran, then cleaned up).
let sessionEverFound = false;

// Main poll loop: runs until `deadline`, polling `dispatch status` every pollS
// seconds. Each terminal outcome exits the process from inside the loop.
while (Date.now() < deadline) {
  const status = dispatch('status', ['--label', label]);

  if (!status?.ok) {
    consecutiveFailures++;
    if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) {
      process.stdout.write(`⚠️ dispatch [${label}] watcher: gave up after ${MAX_CONSECUTIVE_FAILURES} consecutive status failures\n`);
      process.exit(1);
    }
    await sleep(pollS * 1000);
    continue;
  }

  consecutiveFailures = 0;

  // Capture sessionKey for recovery steer/kill
  if (status.sessionKey) recoverySessionKey = status.sessionKey;

  // -- Rolling deadline: extend when session shows token activity --
  const currentTokens = status?.liveness?.tokens ?? null;
  if (currentTokens !== null && lastTokens !== null && currentTokens > lastTokens) {
    tryExtendDeadline(`activity detected (${lastTokens}->${currentTokens} tokens)`);
  }
  if (currentTokens !== null) lastTokens = currentTokens;

  // -- Rolling deadline: extend on JSONL mtime advance (subagent sessions) --
  // Subagent sessions never populate totalTokens in sessions.json, so the token
  // signal above is always null for them. Use JSONL file mtime as an alternative
  // activity signal to prevent killing working subagent sessions mid-task.
  if (status.sessionKey) {
    const storeEntry = getSessionStoreEntry(status.sessionKey);
    const sessionId = storeEntry?.sessionId || null;
    const sessionAgent = status.sessionKey.split(':')[1] || 'main';

    // Reset mtime baseline when the tracked session changes (e.g. after respawn)
    if (sessionId && preDeadlineSessionId !== null && preDeadlineSessionId !== sessionId) {
      preDeadlineJsonlMtime = null;
    }
    if (sessionId) preDeadlineSessionId = sessionId;

    const curMtime = sessionId ? getSessionJsonlMtime(sessionId, sessionAgent) : null;
    if (curMtime !== null) {
      // Require >1s of advance so tiny mtime jitter does not count as activity.
      if (preDeadlineJsonlMtime !== null && curMtime > preDeadlineJsonlMtime + 1000) {
        tryExtendDeadline('JSONL mtime advanced (subagent active)');
      }
      preDeadlineJsonlMtime = curMtime;
    }
  }

  // Track session presence -- two independent signals, either is sufficient.
  //   1. Sessions.json store (primary ground truth for dispatcher-spawned sessions)
  //   2. Liveness field from dispatch status (secondary; also built from sessions.json
  //      in production, but test mocks may provide it directly)
  if (!sessionEverFound && status.sessionKey) {
    const sessionAgent = status.agent || 'main';
    const watcherStore = readSessionsStore(sessionAgent);
    if (watcherStore !== null && status.sessionKey in watcherStore) {
      // Found in sessions.json -- authoritative
      sessionEverFound = true;
    } else if (status.liveness && !status.liveness.error) {
      // Not in sessions.json (or store unavailable) but liveness signal says alive --
      // session may still be initializing. Trust liveness as a secondary signal.
      sessionEverFound = true;
    }
  }

  // -- Path 0a: agent-side done signal (push-based) ----------
  // If the agent ran `dispatch done --label <label>`, status is 'done' immediately.
  // This is the fast path -- no need to poll for idle timeout.
  // (Handled by Path 1 below since cmdDone sets status='done' in labels.json)

  // -- Path 0b: 529/overload auto-retry ----------------------
  if (status.status === 'error') {
    const errorMsg = status.error || status.summary || '';
    if (is529Error(errorMsg)) {
      const retryCount = getRetryCount(label);
      const retryResult = attempt529Retry(label, retryCount, errorMsg);

      if (!retryResult.retry) {
        // Max retries exceeded -- deliver error
        process.stdout.write(
          `🌶️ *dispatch* [${label}] failed after ${MAX_529_RETRIES} retries (529 overload)\n` +
          `Error: ${errorMsg}\n`
        );
        process.exit(1);
      }

      // Wait with backoff then respawn
      await sleep(retryResult.delayMs);

      if (respawnSession(label)) {
        // Session respawned -- reset consecutive failures for the fresh session
        consecutiveFailures = 0;
        process.stderr.write(`[watcher] [${label}] retry ${retryResult.newRetryCount} dispatched, continuing poll...\n`);
        await sleep(pollS * 1000);
        continue;
      } else {
        // Respawn failed -- deliver error
        process.stdout.write(
          `🌶️ *dispatch* [${label}] 529 retry failed -- could not respawn session\n` +
          `Error: ${errorMsg}\n`
        );
        process.exit(1);
      }
    }
  }

  // -- Path 1: status is no longer 'running' ------------------
  if (status.status !== 'running') {
    // -- Spawn failure detection -----------------------------------------
    // If the session was auto-resolved to 'done' (or 'spawn-warning') but was
    // never seen in the gateway, it never ran -- this is a spawn failure.
    // Causes: auth timeout, quota exhaustion, gateway error at spawn time.
    if (!sessionEverFound && (status.status === 'done' || status.status === 'spawn-warning' || status.status === 'error')) {
      const spawnErrMsg =
        `[dispatch] SPAWN FAILURE: session ${status.sessionKey || '(unknown)'} never appeared ` +
        `in gateway -- spawn likely failed (auth timeout, quota, or gateway error). Label: ${label}`;
      process.stderr.write(spawnErrMsg + '\n');
      markLabelError(label, `spawn-failure: session never appeared in gateway`);
      process.stdout.write(
        `🌶️ *dispatch* [${label}] SPAWN FAILURE: session never appeared in gateway -- ` +
        `spawn likely failed (auth timeout, quota, or gateway error)\n`
      );
      process.exit(1);
    }

    // -- Gateway-restart-kill detection ----------------------------------
    // When a gateway restart kills an in-flight session, the session disappears
    // from sessions.json and the status command auto-resolves it as 'done' with
    // a "session not found in sessions store" summary. This is NOT a real
    // completion -- the task was interrupted mid-run. Detect this pattern and
    // re-dispatch up to MAX_GW_RESTART_RETRIES times.
    //
    // Key distinction vs spawn failure:
    //   spawn failure:        sessionEverFound=false (session never appeared)
    //   gateway-restart-kill: sessionEverFound=true  (session ran, then was killed)
    //
    // If the session DID produce a lastReply before being killed, deliver it normally.
    if (sessionEverFound && isGatewayRestartKill(status.summary)) {
      const gwCheckResult = dispatch('result', ['--label', label]);
      if (!gwCheckResult?.lastReply) {
        // No result captured -- session was killed before completing
        const retryCount = getGwRestartRetryCount(label);
        if (retryCount >= MAX_GW_RESTART_RETRIES) {
          markLabelError(label,
            `gateway-restart-kill: max retries exceeded (${retryCount}x -- ${status.summary})`);
          notify(`🌶️ Dispatch: [${label}] gateway-restart-kill: max retries exceeded (${MAX_GW_RESTART_RETRIES}x)`);
          process.stdout.write(
            `🌶️ *dispatch* [${label}] failed: session killed by gateway restart, ` +
            `max retries (${MAX_GW_RESTART_RETRIES}) exceeded\n` +
            `Summary: ${status.summary}\n`
          );
          process.exit(1);
        }
        const newRetryCount = retryCount + 1;
        process.stderr.write(
          `[watcher] gateway-restart-kill detected for [${label}] -- ` +
          `attempt ${newRetryCount}/${MAX_GW_RESTART_RETRIES}\n`
        );
        notify(
          `🌶️ Dispatch: [${label}] session killed by gateway restart -- ` +
          `re-dispatching (${newRetryCount}/${MAX_GW_RESTART_RETRIES})`
        );
        setGwRestartRetryCount(label, newRetryCount);
        if (respawnAfterGwRestart(label)) {
          process.stderr.write(
            `[watcher] [${label}] gw-restart retry ${newRetryCount} dispatched, continuing poll...\n`
          );
          await sleep(pollS * 1000);
          continue;
        } else {
          markLabelError(label,
            `gateway-restart-kill: respawn failed (attempt ${newRetryCount})`);
          process.stdout.write(
            `🌶️ *dispatch* [${label}] failed: session killed by gateway restart, respawn failed\n`
          );
          process.exit(1);
        }
      }
      // lastReply present -- session completed before/during kill; fall through to normal delivery
    }

    // -- Error: session ended in a (non-529) error state --------------------
    // Without this branch an errored session would fall through to
    // deliverResult below, which rewrites the label to 'done', prints
    // "completed" and exits 0 -- masking the failure from the scheduler.
    // Retryable 529 errors were already handled (and exited/continued) in Path 0b.
    if (status.status === 'error') {
      const failureDetail = status.error || status.summary || 'session ended in error state';
      process.stderr.write(`[watcher] [${label}] session ended with status=error -- not delivering as completed\n`);
      process.stdout.write(
        `🌶️ *dispatch* [${label}] failed: session ended in error state\n` +
        `Error: ${failureDetail}\n`
      );
      process.exit(1);
    }

    // Reset gw-restart retry count on successful completion
    const gwRetryCount = getGwRestartRetryCount(label);
    if (gwRetryCount > 0) {
      setGwRestartRetryCount(label, 0);
      process.stderr.write(
        `[watcher] [${label}] completed after ${gwRetryCount} gw-restart retry(ies), reset gwRestartRetryCount\n`
      );
    }

    // -- Interrupted: session auto-resolved as incomplete ------------------
    // When cmdStatus auto-resolves a session as 'interrupted' (idle without
    // calling done), report it and exit non-zero so the scheduler run is
    // marked as error, not success.
    //
    // NOTE: Always resolve as 'interrupted', never 'done'. Only agent-side cmdDone may set status=done.
    if (status.status === 'interrupted') {
      process.stderr.write(`[watcher] [${label}] session auto-resolved as interrupted -- work may be incomplete\n`);
      process.stdout.write(
        `⚠️ dispatch [${label}] session went idle before completing -- work may be incomplete\n`
      );
      markLabelError(label, status.summary || 'interrupted: session went idle without calling done');
      process.exit(1);
    }

    // Reset 529 retryCount on successful completion
    if (status.status === 'done') {
      const currentRetryCount = getRetryCount(label);
      if (currentRetryCount > 0) {
        setRetryCount(label, 0);
        process.stderr.write(`[watcher] [${label}] completed after ${currentRetryCount} retry(ies), reset retryCount\n`);
      }
    }
    const result = dispatch('result', ['--label', label]);
    deliverResult(label, result?.lastReply, status.summary);
  }

  // -- Path 2: status says 'running' but session may be idle -
  // If the session has no recent activity, proactively check for a result.
  // This catches the gap where the session completed but status hasn't
  // auto-resolved yet. The watchdog guard in index.mjs defers auto-resolve
  // while this watcher's lastPing heartbeat is fresh (written every 60s);
  // this path handles normal completion before the ping goes stale.
  const ageMs = status.liveness?.ageMs;
  if (ageMs != null && ageMs >= IDLE_RESULT_CHECK_MS) {
    const result = dispatch('result', ['--label', label]);
    if (result?.lastReply) {
      deliverResult(label, result.lastReply, null);
    }
  }

  await sleep(pollS * 1000);
}
|
|
1105
|
+
|
|
1106
|
+
// Deadline reached: fetch result and status one last time before falling
// back to the activity-based wait further down.
const finalResult = dispatch('result', ['--label', label]);
const finalStatus = dispatch('status', ['--label', label]);

if (finalResult?.lastReply) {
  // A reply exists -- clear any leftover 529 retry counter, then deliver.
  if (getRetryCount(label) > 0) {
    setRetryCount(label, 0);
  }
  deliverResult(label, finalResult.lastReply, finalStatus?.summary || null);
}

if (finalStatus?.status === 'done') {
  // Explicitly done: exit cleanly even though no lastReply was captured.
  markDoneSync(finalStatus?.summary || 'completed');
  process.stdout.write(`✅ dispatch [${label}] completed (status=done, no lastReply captured)\n`);
  process.exit(0);
} else if (finalStatus?.status === 'interrupted') {
  // Auto-resolved as incomplete: record the error and exit non-zero.
  process.stderr.write(`[watcher] [${label}] final status=interrupted -- session idle without completion\n`);
  process.stdout.write(
    `⚠️ dispatch [${label}] session went idle before completing -- work may be incomplete\n`
  );
  markLabelError(label, finalStatus?.summary || 'interrupted: session went idle without calling done');
  process.exit(1);
}
|
|
1129
|
+
|
|
1130
|
+
// -- Token-based activity check before steering ----------------------------
|
|
1131
|
+
// Only steer if tokens have been flat for 3+ minutes post-deadline.
|
|
1132
|
+
// If the session is still making model calls (tokens growing), stay silent.
|
|
1133
|
+
/**
 * Best available token count for the watched session.
 *
 * Tries getSessionTokens(sessionKey) first (only when a key is given); if that
 * does not yield a number, falls back to `liveness.tokens` from a fresh
 * `dispatch status` call for the current label.
 *
 * @param {string|null} sessionKey - Session key to query, or null/empty to skip the direct lookup.
 * @returns {number|null} Token count, or null when neither source has a number
 *   (or the status fallback throws).
 */
function getTokenCount(sessionKey) {
  if (sessionKey) {
    const fromGateway = getSessionTokens(sessionKey);
    if (typeof fromGateway === 'number') {
      return fromGateway;
    }
  }

  let fromStatus;
  try {
    // sessions.list via gateway would be better but dispatch status has liveness
    fromStatus = dispatch('status', ['--label', label])?.liveness?.tokens;
  } catch {
    return null;
  }

  if (typeof fromStatus !== 'number') {
    return null;
  }
  return fromStatus;
}
|
|
1143
|
+
|
|
1144
|
+
/**
 * Best-effort: unconditionally set the watched label's status to 'done' and
 * overwrite its summary. Unlike markLabelDone there is no "already done"
 * guard and no fallback to the existing summary. Errors are logged to stderr
 * and otherwise ignored.
 *
 * @param {string} summary - Summary to store on the label entry.
 */
function markDoneSync(summary) {
  const forceDone = (record) => {
    record.status = 'done';
    record.summary = summary;
  };

  try {
    updateExistingLabel(label, forceDone);
  } catch (err) {
    process.stderr.write(`[watcher] markDoneSync failed: ${err.message}\n`);
  }
}
|
|
1154
|
+
|
|
1155
|
+
// Snapshot taken at the deadline: status, the session key to track, the
// token baseline, and the start of the "flat" (no-activity) window.
const statusAtDeadline = dispatch('status', ['--label', label]);
let tokenSessionKey = statusAtDeadline?.sessionKey || recoverySessionKey || null;
let baselineTokens = getTokenCount(tokenSessionKey);
let flatSince = Date.now();

// Resolve the internal sessionId (UUID) from sessions.json. It names the JSONL
// file and is distinct from the sessionKey (agent:main:subagent:UUID). The
// JSONL is written continuously during active turns, so its mtime is a usable
// activity signal when sessions.json totalTokens/updatedAt are stale.
const _deadlineEntry = getSessionStoreEntry(tokenSessionKey);
const sessionInternalId = _deadlineEntry?.sessionId || null;
const sessionAgent = (tokenSessionKey?.split(':')[1]) || 'main';
let lastJsonlMtime = getSessionJsonlMtime(sessionInternalId, sessionAgent);

process.stderr.write(`[watcher] deadline hit for ${label} -- watching token activity (baseline: ${baselineTokens})\n`);
if (sessionInternalId) {
  process.stderr.write(`[watcher] ${label} JSONL tracking: sessionId=${sessionInternalId} mtime=${lastJsonlMtime}\n`);
}

// Either the label already reads 'done', or no token count is available
// (e.g. the gateway pruned a finished session): try to wrap up right away.
if (statusAtDeadline?.status === 'done' || baselineTokens === null) {
  const doneAtDeadline = statusAtDeadline?.status === 'done';

  const lateResult = dispatch('result', ['--label', label]);
  if (lateResult?.lastReply) {
    // deliverResult calls process.exit(0) internally
    deliverResult(label, lateResult.lastReply, statusAtDeadline?.summary || null);
  }

  if (doneAtDeadline) {
    // Explicitly done -- exit cleanly, no timeout noise
    markDoneSync(statusAtDeadline?.summary || 'completed');
    process.stdout.write(`✅ dispatch [${label}] completed (status=done at deadline)\n`);
    process.exit(0);
  }

  if (baselineTokens === null) {
    // No result and no tokens. Distinguish "session gone" from "session still
    // in the store but without a token count yet" (possibly mid-tool-call).
    const storeRecord = getSessionStoreEntry(tokenSessionKey);
    if (!storeRecord) {
      // Session truly not found -- telemetry unavailable, exit
      process.stderr.write(`[watcher] token telemetry unavailable for ${label}; session not in store\n`);
      markLabelError(label, `timed out after ${timeoutS}s -- token telemetry unavailable`);
      process.stdout.write(`⏱ dispatch [${label}] timed out after ${timeoutS}s -- token telemetry unavailable; no steer/kill attempted\n`);
      process.exit(1);
    }
    // Still in the store: continue into the activity window in token-free mode.
    process.stderr.write(`[watcher] ${label} in store but no tokens (mid-tool-call?) -- using updatedAt as activity signal\n`);
    baselineTokens = -1; // sentinel: token-free mode
  }
}
|
|
1204
|
+
|
|
1205
|
+
// Post-deadline activity window: keep waiting while the session shows signs of
// life. The loop ends once FLAT_WINDOW_MS has elapsed with no activity signal
// (token growth, sessions.json updatedAt advance, or JSONL mtime advance).
while (Date.now() - flatSince < FLAT_WINDOW_MS) {
  await sleep(ACTIVITY_POLL_MS);

  // Delivered?
  const st = dispatch('status', ['--label', label]);
  if (st?.sessionKey && !tokenSessionKey) tokenSessionKey = st.sessionKey;
  if (st?.status === 'done') {
    const r = dispatch('result', ['--label', label]);
    // deliverResult calls process.exit(0) internally
    deliverResult(label, r?.lastReply, st.summary);
  }
  const r2 = dispatch('result', ['--label', label]);
  if (r2?.lastReply) {
    // deliverResult calls process.exit(0) internally
    deliverResult(label, r2.lastReply, null);
  }

  // Token growth?
  const cur = getTokenCount(tokenSessionKey);
  if (cur === null) {
    // No token count: fall back to sessions.json updatedAt -- if the session is
    // still in the store and was recently updated, keep waiting.
    const entry = getSessionStoreEntry(tokenSessionKey);
    if (!entry) {
      process.stderr.write(`[watcher] token telemetry lost for ${label}; session gone from store\n`);
      markLabelError(label, `timed out after ${timeoutS}s -- token telemetry lost`);
      process.stdout.write(`⏱ dispatch [${label}] timed out after ${timeoutS}s -- token telemetry lost; no steer/kill attempted\n`);
      process.exit(1);
    }
    // Still in store -- check if updatedAt advanced (tool call still running)
    // Normalize: updatedAt may be seconds or milliseconds depending on agent framework version
    const rawUpdatedAt = entry.updatedAt;
    const updatedAt = (typeof rawUpdatedAt === 'number' && rawUpdatedAt < 1e12)
      ? rawUpdatedAt * 1000  // seconds -> milliseconds
      : rawUpdatedAt;
    if (typeof updatedAt === 'number' && updatedAt > flatSince) {
      process.stderr.write(`[watcher] ${label} no tokens but updatedAt advanced -- tool call active, resetting flat timer\n`);
      flatSince = Date.now();
    } else {
      process.stderr.write(`[watcher] ${label} no tokens, updatedAt not advancing -- may be stuck\n`);
    }
    // Don't exit -- let FLAT_WINDOW_MS timeout handle the stuck case normally.
    // Deliberately NO `continue` here: the JSONL mtime check below must still
    // run, because sessions without token telemetry are exactly the ones for
    // which the JSONL file is the only live activity signal.
  } else if (baselineTokens !== -1 && cur > baselineTokens) {
    // Normal token comparison (skipped while in token-free sentinel mode)
    process.stderr.write(`[watcher] ${label} still active (${baselineTokens}->${cur} tokens), resetting flat timer\n`);
    baselineTokens = cur;
    flatSince = Date.now();
  } else if (baselineTokens === -1 && cur > 0) {
    // Tokens appeared for the first time -- switch from sentinel to real token tracking
    process.stderr.write(`[watcher] ${label} tokens now available (${cur}), switching to token tracking\n`);
    baselineTokens = cur;
    flatSince = Date.now();
  }

  // -- JSONL mtime check -----------------------------------------------------
  // Most reliable activity signal for spawned subagent sessions: OpenClaw does
  // NOT flush totalTokens or updatedAt in sessions.json during active turns, but
  // the JSONL file IS written continuously. If the mtime advanced since last
  // check by >1s, the session is actively processing -- reset the flat timer.
  const curJsonlMtime = getSessionJsonlMtime(sessionInternalId, sessionAgent);
  if (curJsonlMtime !== null) {
    if (lastJsonlMtime !== null && curJsonlMtime > lastJsonlMtime + 1000) {
      process.stderr.write(
        `[watcher] ${label} JSONL mtime advanced (${lastJsonlMtime}->${curJsonlMtime}ms), ` +
        `session active -- resetting flat timer\n`
      );
      lastJsonlMtime = curJsonlMtime;
      flatSince = Date.now();
    } else if (lastJsonlMtime === null) {
      // First observation -- just record, don't reset yet
      process.stderr.write(`[watcher] ${label} JSONL mtime first observation: ${curJsonlMtime}\n`);
      lastJsonlMtime = curJsonlMtime;
    }
  }
}
|
|
1281
|
+
|
|
1282
|
+
// -- Pre-steer JSONL sanity check ------------------------------------------
|
|
1283
|
+
// Before triggering steer/markDoneSync, verify the session is not currently
|
|
1284
|
+
// mid-turn. A mid-turn session has an in-flight tool call (JSONL last entry
|
|
1285
|
+
// is tool_use or tool_result) -- steering or declaring it done would interrupt
|
|
1286
|
+
// active work and produce a partial/zombie result.
|
|
1287
|
+
//
|
|
1288
|
+
// If mid-turn is detected AND the JSONL was modified recently, extend the flat
|
|
1289
|
+
// window one time to let the turn complete naturally.
|
|
1290
|
+
if (sessionInternalId) {
|
|
1291
|
+
const midTurnReason = getJsonlMidTurnReason(sessionInternalId, sessionAgent);
|
|
1292
|
+
if (midTurnReason) {
|
|
1293
|
+
process.stderr.write(
|
|
1294
|
+
`[watcher] ${label} pre-steer sanity check: ${midTurnReason} -- ` +
|
|
1295
|
+
`session is mid-turn, extending flat window once\n`
|
|
1296
|
+
);
|
|
1297
|
+
notify(`🌶️ Dispatch: [${label}] pre-steer: mid-turn detected (${midTurnReason}), extending wait`);
|
|
1298
|
+
flatSince = Date.now();
|
|
1299
|
+
// Re-enter the flat window loop for one more FLAT_WINDOW_MS extension
|
|
1300
|
+
while (Date.now() - flatSince < FLAT_WINDOW_MS) {
|
|
1301
|
+
await sleep(ACTIVITY_POLL_MS);
|
|
1302
|
+
|
|
1303
|
+
// Check for completion
|
|
1304
|
+
const stExt = dispatch('status', ['--label', label]);
|
|
1305
|
+
if (stExt?.status === 'done') {
|
|
1306
|
+
const rExt = dispatch('result', ['--label', label]);
|
|
1307
|
+
// deliverResult calls process.exit(0) internally
|
|
1308
|
+
deliverResult(label, rExt?.lastReply, stExt.summary);
|
|
1309
|
+
}
|
|
1310
|
+
const rExt2 = dispatch('result', ['--label', label]);
|
|
1311
|
+
if (rExt2?.lastReply) {
|
|
1312
|
+
// deliverResult calls process.exit(0) internally
|
|
1313
|
+
deliverResult(label, rExt2.lastReply, null);
|
|
1314
|
+
}
|
|
1315
|
+
|
|
1316
|
+
// JSONL mtime check during extended wait
|
|
1317
|
+
const extMtime = getSessionJsonlMtime(sessionInternalId, sessionAgent);
|
|
1318
|
+
if (extMtime !== null && lastJsonlMtime !== null && extMtime > lastJsonlMtime + 1000) {
|
|
1319
|
+
process.stderr.write(
|
|
1320
|
+
`[watcher] ${label} JSONL mtime advanced during extended wait (${lastJsonlMtime}->${extMtime}ms), resetting flat timer\n`
|
|
1321
|
+
);
|
|
1322
|
+
lastJsonlMtime = extMtime;
|
|
1323
|
+
flatSince = Date.now();
|
|
1324
|
+
} else if (extMtime !== null) {
|
|
1325
|
+
lastJsonlMtime = extMtime;
|
|
1326
|
+
}
|
|
1327
|
+
|
|
1328
|
+
// Token growth check during extended wait
|
|
1329
|
+
const extTokens = getTokenCount(tokenSessionKey);
|
|
1330
|
+
if (extTokens !== null && baselineTokens !== -1 && extTokens > baselineTokens) {
|
|
1331
|
+
process.stderr.write(`[watcher] ${label} tokens advanced during extended wait, resetting flat timer\n`);
|
|
1332
|
+
baselineTokens = extTokens;
|
|
1333
|
+
flatSince = Date.now();
|
|
1334
|
+
}
|
|
1335
|
+
}
|
|
1336
|
+
// Extended window expired -- proceed to steer regardless
|
|
1337
|
+
process.stderr.write(`[watcher] ${label} extended mid-turn wait expired -- proceeding to steer\n`);
|
|
1338
|
+
}
|
|
1339
|
+
}
|
|
1340
|
+
|
|
1341
|
+
// FLAT_WINDOW_MS (3 min) elapsed with no activity signal (token growth, updatedAt, JSONL mtime) -- now steer
|
|
1342
|
+
process.stderr.write(`[watcher] ${label} inactive 3min post-deadline -- entering steer\n`);
|
|
1343
|
+
|
|
1344
|
+
// Get sessionKey for steer/kill
|
|
1345
|
+
const statusForSteer = dispatch('status', ['--label', label]);
|
|
1346
|
+
const steerSessionKey = statusForSteer?.sessionKey || null;
|
|
1347
|
+
|
|
1348
|
+
const steerRounds = [
|
|
1349
|
+
{ waitMs: 30_000, msg: "Watcher check: if you're done, please send your final reply now. If still working, continue and ignore this." },
|
|
1350
|
+
{ waitMs: 60_000, msg: "Watcher final check: please send your final reply now, or the session will be terminated in 2 minutes." },
|
|
1351
|
+
{ waitMs: 120_000, msg: null }, // kill round
|
|
1352
|
+
];
|
|
1353
|
+
|
|
1354
|
+
for (const round of steerRounds) {
|
|
1355
|
+
if (round.msg && steerSessionKey) {
|
|
1356
|
+
process.stderr.write(`[watcher] steering ${label}: "${round.msg.slice(0, 60)}..."\n`);
|
|
1357
|
+
await steerSession(steerSessionKey, round.msg);
|
|
1358
|
+
}
|
|
1359
|
+
await sleep(round.waitMs);
|
|
1360
|
+
|
|
1361
|
+
const st2 = dispatch('status', ['--label', label]);
|
|
1362
|
+
if (st2?.status === 'done') {
|
|
1363
|
+
const r3 = dispatch('result', ['--label', label]);
|
|
1364
|
+
// deliverResult calls process.exit(0) internally
|
|
1365
|
+
deliverResult(label, r3?.lastReply, st2.summary);
|
|
1366
|
+
}
|
|
1367
|
+
const r3 = dispatch('result', ['--label', label]);
|
|
1368
|
+
if (r3?.lastReply) {
|
|
1369
|
+
// deliverResult calls process.exit(0) internally
|
|
1370
|
+
deliverResult(label, r3.lastReply, null);
|
|
1371
|
+
}
|
|
1372
|
+
|
|
1373
|
+
if (!round.msg && steerSessionKey) {
|
|
1374
|
+
process.stderr.write(`[watcher] killing stuck session ${label}\n`);
|
|
1375
|
+
await killSession(steerSessionKey);
|
|
1376
|
+
// Wait up to 30s for confirmation
|
|
1377
|
+
for (let i = 0; i < 6; i++) {
|
|
1378
|
+
await sleep(5000);
|
|
1379
|
+
const st3 = dispatch('status', ['--label', label]);
|
|
1380
|
+
if (st3?.status === 'done') {
|
|
1381
|
+
// Check if a result was captured before marking as error
|
|
1382
|
+
const r4 = dispatch('result', ['--label', label]);
|
|
1383
|
+
if (r4?.lastReply) {
|
|
1384
|
+
deliverResult(label, r4.lastReply, st3.summary); // deliverResult calls process.exit(0)
|
|
1385
|
+
}
|
|
1386
|
+
markLabelError(label, 'timed out -- killed after steer attempts (no result captured)');
|
|
1387
|
+
process.stdout.write(`⏱ dispatch [${label}] killed after steer attempts -- no result captured\n`);
|
|
1388
|
+
process.exit(1);
|
|
1389
|
+
}
|
|
1390
|
+
}
|
|
1391
|
+
}
|
|
1392
|
+
}
|
|
1393
|
+
|
|
1394
|
+
markLabelError(label, `timed out after ${timeoutS}s -- killed after steer attempts`);
|
|
1395
|
+
process.stdout.write(`⏱ dispatch [${label}] timed out after ${timeoutS}s -- session killed after steer attempts\n`);
|
|
1396
|
+
process.exit(1);
|