openclaw-scheduler 0.2.5 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dispatch/README.md +16 -2
- package/dispatch/completion.mjs +297 -20
- package/dispatch/index.mjs +80 -57
- package/dispatch/liveness.mjs +61 -0
- package/dispatch/watcher.mjs +299 -17
- package/dispatcher-strategies.js +82 -10
- package/dispatcher.js +6 -1
- package/gateway.js +39 -0
- package/package.json +2 -1
package/dispatch/index.mjs
CHANGED
|
@@ -40,6 +40,7 @@ import {
|
|
|
40
40
|
hasCompletionSignal,
|
|
41
41
|
taskRequiresGitSha,
|
|
42
42
|
} from './completion.mjs';
|
|
43
|
+
import { getDispatchLivenessPolicy } from './liveness.mjs';
|
|
43
44
|
import { onStarted, onFinished, onStuck } from './hooks.mjs';
|
|
44
45
|
import { resolveMessageInput } from './message-input.mjs';
|
|
45
46
|
import { buildDispatchDeliverySurface } from '../scripts/dispatch-cli-utils.mjs';
|
|
@@ -205,6 +206,19 @@ function setLabel(name, data) {
|
|
|
205
206
|
return labels[name];
|
|
206
207
|
}
|
|
207
208
|
|
|
209
|
+
function setLabelDone(name, data) {
|
|
210
|
+
const labels = mutateLabels((current) => {
|
|
211
|
+
current[name] = {
|
|
212
|
+
...current[name],
|
|
213
|
+
...data,
|
|
214
|
+
status: 'done',
|
|
215
|
+
error: null,
|
|
216
|
+
updatedAt: new Date().toISOString(),
|
|
217
|
+
};
|
|
218
|
+
});
|
|
219
|
+
return labels[name];
|
|
220
|
+
}
|
|
221
|
+
|
|
208
222
|
// -- Gateway Calls --------------------------------------------
|
|
209
223
|
|
|
210
224
|
/**
|
|
@@ -352,7 +366,17 @@ function getSessionJsonlPath(agent = 'main', sessionId) {
|
|
|
352
366
|
|
|
353
367
|
function inspectSessionActivitySignal(sessionKey, sessionsStore) {
|
|
354
368
|
if (!sessionKey || !sessionsStore?.[sessionKey]) {
|
|
355
|
-
return {
|
|
369
|
+
return {
|
|
370
|
+
found: false,
|
|
371
|
+
hasStartedSignal: false,
|
|
372
|
+
hasActivitySignal: false,
|
|
373
|
+
messageCount: null,
|
|
374
|
+
jsonlExists: false,
|
|
375
|
+
hasTokens: false,
|
|
376
|
+
updatedAtMs: null,
|
|
377
|
+
sessionStartedAtMs: null,
|
|
378
|
+
sessionId: null,
|
|
379
|
+
};
|
|
356
380
|
}
|
|
357
381
|
|
|
358
382
|
const agent = agentFromSessionKey(sessionKey) || 'main';
|
|
@@ -360,6 +384,9 @@ function inspectSessionActivitySignal(sessionKey, sessionsStore) {
|
|
|
360
384
|
const jsonlPath = getSessionJsonlPath(agent, entry.sessionId);
|
|
361
385
|
const jsonlExists = jsonlPath ? existsSync(jsonlPath) : false;
|
|
362
386
|
const hasTokens = typeof entry.totalTokens === 'number' && entry.totalTokens > 0;
|
|
387
|
+
const sessionStartedAtMs = toTimestampMs(entry.sessionStartedAt || entry.startedAt);
|
|
388
|
+
const updatedAtMs = toTimestampMs(entry.updatedAt);
|
|
389
|
+
const hasStartedSignal = Boolean(entry.sessionId) || sessionStartedAtMs !== null || updatedAtMs !== null;
|
|
363
390
|
let messageCount = null;
|
|
364
391
|
|
|
365
392
|
try {
|
|
@@ -371,11 +398,14 @@ function inspectSessionActivitySignal(sessionKey, sessionsStore) {
|
|
|
371
398
|
|
|
372
399
|
return {
|
|
373
400
|
found: true,
|
|
401
|
+
hasStartedSignal,
|
|
374
402
|
hasActivitySignal: jsonlExists || hasTokens || (typeof messageCount === 'number' && messageCount > 0),
|
|
375
403
|
messageCount,
|
|
376
404
|
jsonlExists,
|
|
377
405
|
hasTokens,
|
|
378
|
-
updatedAtMs
|
|
406
|
+
updatedAtMs,
|
|
407
|
+
sessionStartedAtMs,
|
|
408
|
+
sessionId: entry.sessionId || null,
|
|
379
409
|
};
|
|
380
410
|
}
|
|
381
411
|
|
|
@@ -385,12 +415,7 @@ function inspectSessionBootstrapFailure(sessionKey, sessionsStore, spawnedAtMs,
|
|
|
385
415
|
}
|
|
386
416
|
|
|
387
417
|
const ageMs = spawnedAtMs ? Date.now() - spawnedAtMs : Infinity;
|
|
388
|
-
if (ageMs < startupGraceMs
|
|
389
|
-
return { shouldResolve: false, reason: null, errorMsg: null };
|
|
390
|
-
}
|
|
391
|
-
|
|
392
|
-
const signal = inspectSessionActivitySignal(sessionKey, sessionsStore);
|
|
393
|
-
if (signal.hasActivitySignal) {
|
|
418
|
+
if (ageMs < startupGraceMs) {
|
|
394
419
|
return { shouldResolve: false, reason: null, errorMsg: null };
|
|
395
420
|
}
|
|
396
421
|
|
|
@@ -403,22 +428,10 @@ function inspectSessionBootstrapFailure(sessionKey, sessionsStore, spawnedAtMs,
|
|
|
403
428
|
};
|
|
404
429
|
}
|
|
405
430
|
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
errorMsg: 'spawn-failure: session entered sessions store but never wrote transcript/history',
|
|
411
|
-
};
|
|
412
|
-
}
|
|
413
|
-
|
|
414
|
-
if (signal.updatedAtMs !== null && spawnedAtMs && signal.updatedAtMs <= spawnedAtMs + 5000) {
|
|
415
|
-
return {
|
|
416
|
-
shouldResolve: true,
|
|
417
|
-
reason: 'session entered sessions store but never showed any activity',
|
|
418
|
-
errorMsg: 'spawn-failure: session entered sessions store but never showed any activity',
|
|
419
|
-
};
|
|
420
|
-
}
|
|
421
|
-
|
|
431
|
+
// A Codex session can enter the sessions store before chat.history, JSONL, or
|
|
432
|
+
// token counters are written. Treat that as "still booting"; the watcher and
|
|
433
|
+
// job timeout own later failure handling. Only fail fast when the gateway has
|
|
434
|
+
// recorded an explicit lane error above.
|
|
422
435
|
return { shouldResolve: false, reason: null, errorMsg: null };
|
|
423
436
|
}
|
|
424
437
|
|
|
@@ -683,7 +696,7 @@ function quoteForSingleQuotedShell(value) {
|
|
|
683
696
|
}
|
|
684
697
|
|
|
685
698
|
/**
|
|
686
|
-
* Schedule a
|
|
699
|
+
* Schedule a quick-poll delivery watcher shell job for a dispatch label.
|
|
687
700
|
* Used both for the initial watcher registration and SIGTERM handoffs.
|
|
688
701
|
*/
|
|
689
702
|
function scheduleDeliveryWatcherJob({
|
|
@@ -704,13 +717,19 @@ function scheduleDeliveryWatcherJob({
|
|
|
704
717
|
const watcherTimeoutS = Number(timeoutSeconds) + 120;
|
|
705
718
|
const idleThresholdS = Number(idleThresholdSeconds) || 300;
|
|
706
719
|
const sq = quoteForSingleQuotedShell;
|
|
707
|
-
const watcherCmd =
|
|
720
|
+
const watcherCmd =
|
|
721
|
+
`DISPATCH_LABELS_PATH='${sq(LABELS_PATH)}' ` +
|
|
722
|
+
`DISPATCH_INDEX_PATH='${sq(join(__dirname, 'index.mjs'))}' ` +
|
|
723
|
+
`'${sq(process.execPath)}' '${sq(watcherPath)}' ` +
|
|
724
|
+
`--label '${sq(label)}' --timeout ${watcherTimeoutS} ` +
|
|
725
|
+
`--poll-interval 20 --idle-threshold ${idleThresholdS} --once`;
|
|
708
726
|
|
|
709
727
|
const nowUtc = new Date().toISOString().replace('T', ' ').slice(0, 19);
|
|
710
728
|
const jobSpec = {
|
|
711
729
|
name: `${agentBrand}-deliver:${label}${nameSuffix}`,
|
|
712
|
-
schedule_kind: '
|
|
713
|
-
|
|
730
|
+
schedule_kind: 'cron',
|
|
731
|
+
schedule_cron: config.deliver_watcher_cron || '* * * * *',
|
|
732
|
+
next_run_at: nowUtc,
|
|
714
733
|
session_target: 'shell',
|
|
715
734
|
payload_kind: 'shellCommand',
|
|
716
735
|
payload_message: watcherCmd,
|
|
@@ -720,8 +739,7 @@ function scheduleDeliveryWatcherJob({
|
|
|
720
739
|
delivery_guarantee: 'at-least-once',
|
|
721
740
|
ttl_hours: config.deliver_watcher_ttl_hours ?? 48,
|
|
722
741
|
overlap_policy: 'skip',
|
|
723
|
-
run_timeout_ms:
|
|
724
|
-
+ 420 * 1000,
|
|
742
|
+
run_timeout_ms: 120_000,
|
|
725
743
|
delete_after_run: 1,
|
|
726
744
|
origin: origin || 'system',
|
|
727
745
|
};
|
|
@@ -1088,9 +1106,10 @@ async function cmdEnqueue(flags) {
|
|
|
1088
1106
|
}
|
|
1089
1107
|
|
|
1090
1108
|
// -- Register scheduler watcher for delivery ---------------
|
|
1091
|
-
// Creates a
|
|
1092
|
-
//
|
|
1093
|
-
//
|
|
1109
|
+
// Creates a quick-poll shell job that runs watcher.mjs once per tick. Empty
|
|
1110
|
+
// stdout means "still running" and advances the next tick without delivery.
|
|
1111
|
+
// Terminal stdout goes through the scheduler's handleDelivery with retry,
|
|
1112
|
+
// alias resolution, and audit trail in scheduler.db.
|
|
1094
1113
|
// The watcher is the only final-delivery path for dispatched jobs.
|
|
1095
1114
|
const sq = s => String(s).replace(/'/g, "'\\''");
|
|
1096
1115
|
let schedulerWatcherOk = false;
|
|
@@ -1204,9 +1223,10 @@ async function cmdEnqueue(flags) {
|
|
|
1204
1223
|
|
|
1205
1224
|
// -- Post-spawn verification (Fix 3) --------------------------------
|
|
1206
1225
|
// Canary: poll sessions.json up to 3 times at 10s intervals to confirm the
|
|
1207
|
-
// session appeared in the store.
|
|
1208
|
-
//
|
|
1209
|
-
//
|
|
1226
|
+
// session appeared in the store. A session store entry with sessionId or
|
|
1227
|
+
// startedAt/sessionStartedAt is enough: long first turns may not flush JSONL,
|
|
1228
|
+
// token counts, or chat.history until the model call completes. The delivery
|
|
1229
|
+
// watcher owns later completion/failure handling.
|
|
1210
1230
|
const SPAWN_POLL_MAX = 3;
|
|
1211
1231
|
const SPAWN_POLL_DELAY_MS = 10_000;
|
|
1212
1232
|
let spawnConfirmed = false;
|
|
@@ -1214,7 +1234,7 @@ async function cmdEnqueue(flags) {
|
|
|
1214
1234
|
await sleep(SPAWN_POLL_DELAY_MS);
|
|
1215
1235
|
const spawnStore = readSessionsStore(agent);
|
|
1216
1236
|
const signal = inspectSessionActivitySignal(sessionKey, spawnStore);
|
|
1217
|
-
if (signal.hasActivitySignal) {
|
|
1237
|
+
if (signal.hasStartedSignal || signal.hasActivitySignal) {
|
|
1218
1238
|
spawnConfirmed = true;
|
|
1219
1239
|
break;
|
|
1220
1240
|
}
|
|
@@ -1292,16 +1312,17 @@ function cmdStatus(flags) {
|
|
|
1292
1312
|
//
|
|
1293
1313
|
// PING_STALE_MS: 3x the 60s ping interval -- if we haven't heard from the
|
|
1294
1314
|
// watcher in 3 min, it's probably dead; fall through to check.
|
|
1295
|
-
// hardCeilingMs:
|
|
1296
|
-
//
|
|
1297
|
-
// idleThresholdMs:
|
|
1298
|
-
//
|
|
1299
|
-
const
|
|
1300
|
-
|
|
1301
|
-
|
|
1302
|
-
|
|
1303
|
-
|
|
1304
|
-
const
|
|
1315
|
+
// hardCeilingMs: timeout/reasoning-aware hard ceiling. High-thinking
|
|
1316
|
+
// work gets a larger quiet window before hard failure.
|
|
1317
|
+
// idleThresholdMs: timeout/reasoning-aware quiet threshold. Ambiguous or
|
|
1318
|
+
// missing liveness stays running until these thresholds.
|
|
1319
|
+
const livenessPolicy = getDispatchLivenessPolicy(entry, {
|
|
1320
|
+
startupGraceMs: STARTUP_GRACE_MS,
|
|
1321
|
+
defaultTimeoutSeconds: 600,
|
|
1322
|
+
});
|
|
1323
|
+
const PING_STALE_MS = livenessPolicy.pingStaleMs;
|
|
1324
|
+
const idleThresholdMs = livenessPolicy.idleFailureMs;
|
|
1325
|
+
const hardCeilingMs = livenessPolicy.hardCeilingMs;
|
|
1305
1326
|
|
|
1306
1327
|
let check;
|
|
1307
1328
|
if (ageMs < STARTUP_GRACE_MS) {
|
|
@@ -1314,13 +1335,13 @@ function cmdStatus(flags) {
|
|
|
1314
1335
|
check = { shouldResolve: false };
|
|
1315
1336
|
} else {
|
|
1316
1337
|
// Ping stale OR past hard ceiling: fall through to session store check
|
|
1317
|
-
const thresh = ageMs >= hardCeilingMs ?
|
|
1338
|
+
const thresh = ageMs >= hardCeilingMs ? livenessPolicy.hardTimeoutIdleMs : idleThresholdMs;
|
|
1318
1339
|
check = checkSessionDone(entry.sessionKey, sessionsStore, thresh, true, spawnedAtMs);
|
|
1319
1340
|
}
|
|
1320
1341
|
} else {
|
|
1321
1342
|
// No lastPing -- backward compat (sessions dispatched before heartbeat feature).
|
|
1322
1343
|
// Use idleThresholdMs (job-aware) instead of the old hardcoded 10 min.
|
|
1323
|
-
const thresh = ageMs >= hardCeilingMs ?
|
|
1344
|
+
const thresh = ageMs >= hardCeilingMs ? livenessPolicy.hardTimeoutIdleMs : idleThresholdMs;
|
|
1324
1345
|
check = checkSessionDone(entry.sessionKey, sessionsStore, thresh, true, spawnedAtMs);
|
|
1325
1346
|
}
|
|
1326
1347
|
|
|
@@ -1597,10 +1618,13 @@ function cmdSync(flags) {
|
|
|
1597
1618
|
// -- Heartbeat-based liveness guard (mirrors cmdStatus logic) ---------
|
|
1598
1619
|
// Skip auto-resolve when the watcher's lastPing heartbeat is fresh.
|
|
1599
1620
|
// See cmdStatus for full commentary on PING_STALE_MS / hardCeilingMs.
|
|
1600
|
-
const
|
|
1601
|
-
|
|
1602
|
-
|
|
1603
|
-
|
|
1621
|
+
const syncPolicy = getDispatchLivenessPolicy(entry, {
|
|
1622
|
+
startupGraceMs: STARTUP_GRACE_MS_SYNC,
|
|
1623
|
+
defaultTimeoutSeconds: 600,
|
|
1624
|
+
});
|
|
1625
|
+
const PING_STALE_MS_SYNC = syncPolicy.pingStaleMs;
|
|
1626
|
+
const idleThresholdMsSync = syncPolicy.idleFailureMs;
|
|
1627
|
+
const hardCeilingMsSync = syncPolicy.hardCeilingMs;
|
|
1604
1628
|
|
|
1605
1629
|
if (entry.lastPing) {
|
|
1606
1630
|
const pingAgeMs = Date.now() - new Date(entry.lastPing).getTime();
|
|
@@ -1610,7 +1634,7 @@ function cmdSync(flags) {
|
|
|
1610
1634
|
}
|
|
1611
1635
|
}
|
|
1612
1636
|
|
|
1613
|
-
const syncThresh = elapsedMs >= hardCeilingMsSync ?
|
|
1637
|
+
const syncThresh = elapsedMs >= hardCeilingMsSync ? syncPolicy.hardTimeoutIdleMs : idleThresholdMsSync;
|
|
1614
1638
|
const check = checkSessionDone(entry.sessionKey, syncStore, syncThresh, true, spawnedAtMs);
|
|
1615
1639
|
|
|
1616
1640
|
if (check.shouldResolve) {
|
|
@@ -1972,7 +1996,7 @@ async function cmdDone(flags) {
|
|
|
1972
1996
|
// Label was never registered (e.g. direct subagent spawn, not via enqueue).
|
|
1973
1997
|
// This is not an error -- the work completed, the label just wasn't tracked.
|
|
1974
1998
|
process.stderr.write(`[${BRAND}] warn: no session found for label "${label}" -- registering as done\n`);
|
|
1975
|
-
|
|
1999
|
+
setLabelDone(label, { summary, completion, ...(sha ? { sha } : {}) });
|
|
1976
2000
|
|
|
1977
2001
|
// No watcher is polling for this label, so actively notify via the gateway
|
|
1978
2002
|
// post office using delivery config from config.json as fallback target.
|
|
@@ -2001,8 +2025,7 @@ async function cmdDone(flags) {
|
|
|
2001
2025
|
return;
|
|
2002
2026
|
}
|
|
2003
2027
|
|
|
2004
|
-
|
|
2005
|
-
status: 'done',
|
|
2028
|
+
setLabelDone(label, {
|
|
2006
2029
|
summary,
|
|
2007
2030
|
completion,
|
|
2008
2031
|
...(sha ? { sha } : {}),
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
const MINUTE_MS = 60 * 1000;
|
|
2
|
+
|
|
3
|
+
function numberOrNull(value) {
|
|
4
|
+
const n = Number(value);
|
|
5
|
+
return Number.isFinite(n) && n > 0 ? n : null;
|
|
6
|
+
}
|
|
7
|
+
|
|
8
|
+
export function normalizeThinkingLevel(value) {
|
|
9
|
+
const text = typeof value === 'string' ? value.trim().toLowerCase() : '';
|
|
10
|
+
if (text === 'xhigh' || text === 'extra-high' || text === 'extra_high') return 'xhigh';
|
|
11
|
+
if (text === 'high') return 'high';
|
|
12
|
+
if (text === 'low') return 'low';
|
|
13
|
+
if (text === 'off' || text === 'none') return 'off';
|
|
14
|
+
return null;
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
export function getDispatchTimeoutSeconds(entry = {}, fallbackSeconds = 300) {
|
|
18
|
+
return numberOrNull(entry.timeoutSeconds)
|
|
19
|
+
?? numberOrNull(entry.timeout)
|
|
20
|
+
?? numberOrNull(fallbackSeconds)
|
|
21
|
+
?? 300;
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
export function getDispatchLivenessPolicy(entry = {}, opts = {}) {
|
|
25
|
+
const now = numberOrNull(opts.now) ?? Date.now();
|
|
26
|
+
const timeoutSeconds = getDispatchTimeoutSeconds(entry, opts.defaultTimeoutSeconds);
|
|
27
|
+
const timeoutMs = timeoutSeconds * 1000;
|
|
28
|
+
const thinking = normalizeThinkingLevel(entry.thinking);
|
|
29
|
+
const isHighThinking = thinking === 'high' || thinking === 'xhigh';
|
|
30
|
+
|
|
31
|
+
const startupGraceMs = numberOrNull(opts.startupGraceMs)
|
|
32
|
+
?? (isHighThinking ? 10 * MINUTE_MS : 5 * MINUTE_MS);
|
|
33
|
+
const pingStaleMs = numberOrNull(opts.pingStaleMs) ?? 3 * MINUTE_MS;
|
|
34
|
+
const idleProbeFloorMs = isHighThinking ? 10 * MINUTE_MS : 1 * MINUTE_MS;
|
|
35
|
+
const idleProbeMs = Math.max(
|
|
36
|
+
idleProbeFloorMs,
|
|
37
|
+
Math.min(timeoutMs * 0.25, isHighThinking ? 15 * MINUTE_MS : 5 * MINUTE_MS),
|
|
38
|
+
);
|
|
39
|
+
const idleFailureFloorMs = isHighThinking ? 20 * MINUTE_MS : 10 * MINUTE_MS;
|
|
40
|
+
const idleFailureMs = Math.max(timeoutMs, idleFailureFloorMs);
|
|
41
|
+
const hardCeilingMs = Math.max(timeoutMs * 1.5, idleFailureMs * (isHighThinking ? 2 : 1.5));
|
|
42
|
+
const hardTimeoutIdleMs = isHighThinking ? 5 * MINUTE_MS : 2 * MINUTE_MS;
|
|
43
|
+
const spawnedAtMs = entry.spawnedAt ? new Date(entry.spawnedAt).getTime() : 0;
|
|
44
|
+
const ageMs = spawnedAtMs ? now - spawnedAtMs : Infinity;
|
|
45
|
+
|
|
46
|
+
return {
|
|
47
|
+
thinking,
|
|
48
|
+
isHighThinking,
|
|
49
|
+
timeoutSeconds,
|
|
50
|
+
timeoutMs,
|
|
51
|
+
startupGraceMs,
|
|
52
|
+
pingStaleMs,
|
|
53
|
+
idleProbeMs,
|
|
54
|
+
idleFailureMs,
|
|
55
|
+
hardCeilingMs,
|
|
56
|
+
hardTimeoutIdleMs,
|
|
57
|
+
spawnedAtMs,
|
|
58
|
+
ageMs,
|
|
59
|
+
pastHardCeiling: ageMs >= hardCeilingMs,
|
|
60
|
+
};
|
|
61
|
+
}
|