openclaw-scheduler 0.2.5 → 0.2.7

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -40,6 +40,7 @@ import {
40
40
  hasCompletionSignal,
41
41
  taskRequiresGitSha,
42
42
  } from './completion.mjs';
43
+ import { getDispatchLivenessPolicy } from './liveness.mjs';
43
44
  import { onStarted, onFinished, onStuck } from './hooks.mjs';
44
45
  import { resolveMessageInput } from './message-input.mjs';
45
46
  import { buildDispatchDeliverySurface } from '../scripts/dispatch-cli-utils.mjs';
@@ -205,6 +206,19 @@ function setLabel(name, data) {
205
206
  return labels[name];
206
207
  }
207
208
 
209
/**
 * Record a label as finished.
 *
 * Merges `data` over whatever is already stored under `name`, then forces
 * `status` to 'done', resets `error` to null, and stamps `updatedAt` with the
 * current time as an ISO-8601 string. The forced fields are applied last, so
 * they win over same-named keys in `data`.
 *
 * @param {string} name - Label key to update.
 * @param {object} data - Extra fields to merge into the label entry.
 * @returns {object} The entry stored under `name` in the value returned by
 *   `mutateLabels` (presumably the updated label map -- confirm against
 *   `mutateLabels`).
 */
function setLabelDone(name, data) {
  const applyDone = (store) => {
    const previous = store[name];
    store[name] = Object.assign({}, previous, data, {
      status: 'done',
      error: null,
      updatedAt: new Date().toISOString(),
    });
  };
  const updated = mutateLabels(applyDone);
  return updated[name];
}
221
+
208
222
  // -- Gateway Calls --------------------------------------------
209
223
 
210
224
  /**
@@ -352,7 +366,17 @@ function getSessionJsonlPath(agent = 'main', sessionId) {
352
366
 
353
367
  function inspectSessionActivitySignal(sessionKey, sessionsStore) {
354
368
  if (!sessionKey || !sessionsStore?.[sessionKey]) {
355
- return { found: false, hasActivitySignal: false, messageCount: null, jsonlExists: false, hasTokens: false, updatedAtMs: null };
369
+ return {
370
+ found: false,
371
+ hasStartedSignal: false,
372
+ hasActivitySignal: false,
373
+ messageCount: null,
374
+ jsonlExists: false,
375
+ hasTokens: false,
376
+ updatedAtMs: null,
377
+ sessionStartedAtMs: null,
378
+ sessionId: null,
379
+ };
356
380
  }
357
381
 
358
382
  const agent = agentFromSessionKey(sessionKey) || 'main';
@@ -360,6 +384,9 @@ function inspectSessionActivitySignal(sessionKey, sessionsStore) {
360
384
  const jsonlPath = getSessionJsonlPath(agent, entry.sessionId);
361
385
  const jsonlExists = jsonlPath ? existsSync(jsonlPath) : false;
362
386
  const hasTokens = typeof entry.totalTokens === 'number' && entry.totalTokens > 0;
387
+ const sessionStartedAtMs = toTimestampMs(entry.sessionStartedAt || entry.startedAt);
388
+ const updatedAtMs = toTimestampMs(entry.updatedAt);
389
+ const hasStartedSignal = Boolean(entry.sessionId) || sessionStartedAtMs !== null || updatedAtMs !== null;
363
390
  let messageCount = null;
364
391
 
365
392
  try {
@@ -371,11 +398,14 @@ function inspectSessionActivitySignal(sessionKey, sessionsStore) {
371
398
 
372
399
  return {
373
400
  found: true,
401
+ hasStartedSignal,
374
402
  hasActivitySignal: jsonlExists || hasTokens || (typeof messageCount === 'number' && messageCount > 0),
375
403
  messageCount,
376
404
  jsonlExists,
377
405
  hasTokens,
378
- updatedAtMs: toTimestampMs(entry.updatedAt),
406
+ updatedAtMs,
407
+ sessionStartedAtMs,
408
+ sessionId: entry.sessionId || null,
379
409
  };
380
410
  }
381
411
 
@@ -385,12 +415,7 @@ function inspectSessionBootstrapFailure(sessionKey, sessionsStore, spawnedAtMs,
385
415
  }
386
416
 
387
417
  const ageMs = spawnedAtMs ? Date.now() - spawnedAtMs : Infinity;
388
- if (ageMs < startupGraceMs || ageMs > startupGraceMs * 2) {
389
- return { shouldResolve: false, reason: null, errorMsg: null };
390
- }
391
-
392
- const signal = inspectSessionActivitySignal(sessionKey, sessionsStore);
393
- if (signal.hasActivitySignal) {
418
+ if (ageMs < startupGraceMs) {
394
419
  return { shouldResolve: false, reason: null, errorMsg: null };
395
420
  }
396
421
 
@@ -403,22 +428,10 @@ function inspectSessionBootstrapFailure(sessionKey, sessionsStore, spawnedAtMs,
403
428
  };
404
429
  }
405
430
 
406
- if (signal.messageCount === 0) {
407
- return {
408
- shouldResolve: true,
409
- reason: 'session entered sessions store but never wrote transcript/history',
410
- errorMsg: 'spawn-failure: session entered sessions store but never wrote transcript/history',
411
- };
412
- }
413
-
414
- if (signal.updatedAtMs !== null && spawnedAtMs && signal.updatedAtMs <= spawnedAtMs + 5000) {
415
- return {
416
- shouldResolve: true,
417
- reason: 'session entered sessions store but never showed any activity',
418
- errorMsg: 'spawn-failure: session entered sessions store but never showed any activity',
419
- };
420
- }
421
-
431
+ // A Codex session can enter the sessions store before chat.history, JSONL, or
432
+ // token counters are written. Treat that as "still booting"; the watcher and
433
+ // job timeout own later failure handling. Only fail fast when the gateway has
434
+ // recorded an explicit lane error above.
422
435
  return { shouldResolve: false, reason: null, errorMsg: null };
423
436
  }
424
437
 
@@ -683,7 +696,7 @@ function quoteForSingleQuotedShell(value) {
683
696
  }
684
697
 
685
698
  /**
686
- * Schedule a one-shot delivery watcher shell job for a dispatch label.
699
+ * Schedule a quick-poll delivery watcher shell job for a dispatch label.
687
700
  * Used both for the initial watcher registration and SIGTERM handoffs.
688
701
  */
689
702
  function scheduleDeliveryWatcherJob({
@@ -704,13 +717,19 @@ function scheduleDeliveryWatcherJob({
704
717
  const watcherTimeoutS = Number(timeoutSeconds) + 120;
705
718
  const idleThresholdS = Number(idleThresholdSeconds) || 300;
706
719
  const sq = quoteForSingleQuotedShell;
707
- const watcherCmd = `DISPATCH_LABELS_PATH='${sq(LABELS_PATH)}' '${sq(process.execPath)}' '${sq(watcherPath)}' --label '${sq(label)}' --timeout ${watcherTimeoutS} --poll-interval 20 --idle-threshold ${idleThresholdS}`;
720
+ const watcherCmd =
721
+ `DISPATCH_LABELS_PATH='${sq(LABELS_PATH)}' ` +
722
+ `DISPATCH_INDEX_PATH='${sq(join(__dirname, 'index.mjs'))}' ` +
723
+ `'${sq(process.execPath)}' '${sq(watcherPath)}' ` +
724
+ `--label '${sq(label)}' --timeout ${watcherTimeoutS} ` +
725
+ `--poll-interval 20 --idle-threshold ${idleThresholdS} --once`;
708
726
 
709
727
  const nowUtc = new Date().toISOString().replace('T', ' ').slice(0, 19);
710
728
  const jobSpec = {
711
729
  name: `${agentBrand}-deliver:${label}${nameSuffix}`,
712
- schedule_kind: 'at',
713
- schedule_at: nowUtc,
730
+ schedule_kind: 'cron',
731
+ schedule_cron: config.deliver_watcher_cron || '* * * * *',
732
+ next_run_at: nowUtc,
714
733
  session_target: 'shell',
715
734
  payload_kind: 'shellCommand',
716
735
  payload_message: watcherCmd,
@@ -720,8 +739,7 @@ function scheduleDeliveryWatcherJob({
720
739
  delivery_guarantee: 'at-least-once',
721
740
  ttl_hours: config.deliver_watcher_ttl_hours ?? 48,
722
741
  overlap_policy: 'skip',
723
- run_timeout_ms: Math.max(watcherTimeoutS, 4 * 3600) * 1000
724
- + 420 * 1000,
742
+ run_timeout_ms: 120_000,
725
743
  delete_after_run: 1,
726
744
  origin: origin || 'system',
727
745
  };
@@ -1088,9 +1106,10 @@ async function cmdEnqueue(flags) {
1088
1106
  }
1089
1107
 
1090
1108
  // -- Register scheduler watcher for delivery ---------------
1091
- // Creates a one-shot shell job that runs watcher.mjs (blocks until session
1092
- // completes, outputs result). The scheduler's handleDelivery delivers with
1093
- // retry, alias resolution, and audit trail in scheduler.db.
1109
+ // Creates a quick-poll shell job that runs watcher.mjs once per tick. Empty
1110
+ // stdout means "still running" and advances the next tick without delivery.
1111
+ // Terminal stdout goes through the scheduler's handleDelivery with retry,
1112
+ // alias resolution, and audit trail in scheduler.db.
1094
1113
  // The watcher is the only final-delivery path for dispatched jobs.
1095
1114
  const sq = s => String(s).replace(/'/g, "'\\''");
1096
1115
  let schedulerWatcherOk = false;
@@ -1204,9 +1223,10 @@ async function cmdEnqueue(flags) {
1204
1223
 
1205
1224
  // -- Post-spawn verification (Fix 3) --------------------------------
1206
1225
  // Canary: poll sessions.json up to 3 times at 10s intervals to confirm the
1207
- // session appeared in the store. Non-fatal -- output is already written above.
1208
- // If the session never shows up, stderr gets a loud warning and ledger status
1209
- // is set to 'spawn-warning'. The watcher provides the definitive error path.
1226
+ // session appeared in the store. A session store entry with sessionId or
1227
+ // startedAt/sessionStartedAt is enough: long first turns may not flush JSONL,
1228
+ // token counts, or chat.history until the model call completes. The delivery
1229
+ // watcher owns later completion/failure handling.
1210
1230
  const SPAWN_POLL_MAX = 3;
1211
1231
  const SPAWN_POLL_DELAY_MS = 10_000;
1212
1232
  let spawnConfirmed = false;
@@ -1214,7 +1234,7 @@ async function cmdEnqueue(flags) {
1214
1234
  await sleep(SPAWN_POLL_DELAY_MS);
1215
1235
  const spawnStore = readSessionsStore(agent);
1216
1236
  const signal = inspectSessionActivitySignal(sessionKey, spawnStore);
1217
- if (signal.hasActivitySignal) {
1237
+ if (signal.hasStartedSignal || signal.hasActivitySignal) {
1218
1238
  spawnConfirmed = true;
1219
1239
  break;
1220
1240
  }
@@ -1292,16 +1312,17 @@ function cmdStatus(flags) {
1292
1312
  //
1293
1313
  // PING_STALE_MS: 3x the 60s ping interval -- if we haven't heard from the
1294
1314
  // watcher in 3 min, it's probably dead; fall through to check.
1295
- // hardCeilingMs: job timeout * 1.5 -- absolute max regardless of ping age.
1296
- // Catches zombie watchers (watcher alive but session is stuck).
1297
- // idleThresholdMs: max(job timeout, 10 min) -- replaces the old hardcoded 10-min
1298
- // threshold so longer jobs aren't killed at exactly 10 min.
1299
- const PING_STALE_MS = 3 * 60 * 1000;
1300
- const idleThresholdMs = Math.max((entry.timeoutSeconds || 600) * 1000, 10 * 60 * 1000);
1301
- // hardCeilingMs must be >= idleThresholdMs to avoid the ceiling undercutting the
1302
- // idle floor (e.g. timeoutSeconds=300 -> ceiling=7.5 min < idle=10 min would force
1303
- // zombie-guard threshold for sessions that should still use idleThresholdMs).
1304
- const hardCeilingMs = Math.max((entry.timeoutSeconds || 600) * 1000 * 1.5, idleThresholdMs * 1.5);
1315
+ // hardCeilingMs: timeout/reasoning-aware hard ceiling. High-thinking
1316
+ // work gets a larger quiet window before hard failure.
1317
+ // idleThresholdMs: timeout/reasoning-aware quiet threshold. Ambiguous or
1318
+ // missing liveness stays running until these thresholds.
1319
+ const livenessPolicy = getDispatchLivenessPolicy(entry, {
1320
+ startupGraceMs: STARTUP_GRACE_MS,
1321
+ defaultTimeoutSeconds: 600,
1322
+ });
1323
+ const PING_STALE_MS = livenessPolicy.pingStaleMs;
1324
+ const idleThresholdMs = livenessPolicy.idleFailureMs;
1325
+ const hardCeilingMs = livenessPolicy.hardCeilingMs;
1305
1326
 
1306
1327
  let check;
1307
1328
  if (ageMs < STARTUP_GRACE_MS) {
@@ -1314,13 +1335,13 @@ function cmdStatus(flags) {
1314
1335
  check = { shouldResolve: false };
1315
1336
  } else {
1316
1337
  // Ping stale OR past hard ceiling: fall through to session store check
1317
- const thresh = ageMs >= hardCeilingMs ? 2 * 60 * 1000 : idleThresholdMs;
1338
+ const thresh = ageMs >= hardCeilingMs ? livenessPolicy.hardTimeoutIdleMs : idleThresholdMs;
1318
1339
  check = checkSessionDone(entry.sessionKey, sessionsStore, thresh, true, spawnedAtMs);
1319
1340
  }
1320
1341
  } else {
1321
1342
  // No lastPing -- backward compat (sessions dispatched before heartbeat feature).
1322
1343
  // Use idleThresholdMs (job-aware) instead of the old hardcoded 10 min.
1323
- const thresh = ageMs >= hardCeilingMs ? 2 * 60 * 1000 : idleThresholdMs;
1344
+ const thresh = ageMs >= hardCeilingMs ? livenessPolicy.hardTimeoutIdleMs : idleThresholdMs;
1324
1345
  check = checkSessionDone(entry.sessionKey, sessionsStore, thresh, true, spawnedAtMs);
1325
1346
  }
1326
1347
 
@@ -1597,10 +1618,13 @@ function cmdSync(flags) {
1597
1618
  // -- Heartbeat-based liveness guard (mirrors cmdStatus logic) ---------
1598
1619
  // Skip auto-resolve when the watcher's lastPing heartbeat is fresh.
1599
1620
  // See cmdStatus for full commentary on PING_STALE_MS / hardCeilingMs.
1600
- const PING_STALE_MS_SYNC = 3 * 60 * 1000;
1601
- const idleThresholdMsSync = Math.max((entry.timeoutSeconds || 600) * 1000, 10 * 60 * 1000);
1602
- // hardCeilingMsSync must be >= idleThresholdMsSync (mirrors cmdStatus fix).
1603
- const hardCeilingMsSync = Math.max((entry.timeoutSeconds || 600) * 1000 * 1.5, idleThresholdMsSync * 1.5);
1621
+ const syncPolicy = getDispatchLivenessPolicy(entry, {
1622
+ startupGraceMs: STARTUP_GRACE_MS_SYNC,
1623
+ defaultTimeoutSeconds: 600,
1624
+ });
1625
+ const PING_STALE_MS_SYNC = syncPolicy.pingStaleMs;
1626
+ const idleThresholdMsSync = syncPolicy.idleFailureMs;
1627
+ const hardCeilingMsSync = syncPolicy.hardCeilingMs;
1604
1628
 
1605
1629
  if (entry.lastPing) {
1606
1630
  const pingAgeMs = Date.now() - new Date(entry.lastPing).getTime();
@@ -1610,7 +1634,7 @@ function cmdSync(flags) {
1610
1634
  }
1611
1635
  }
1612
1636
 
1613
- const syncThresh = elapsedMs >= hardCeilingMsSync ? 2 * 60 * 1000 : idleThresholdMsSync;
1637
+ const syncThresh = elapsedMs >= hardCeilingMsSync ? syncPolicy.hardTimeoutIdleMs : idleThresholdMsSync;
1614
1638
  const check = checkSessionDone(entry.sessionKey, syncStore, syncThresh, true, spawnedAtMs);
1615
1639
 
1616
1640
  if (check.shouldResolve) {
@@ -1972,7 +1996,7 @@ async function cmdDone(flags) {
1972
1996
  // Label was never registered (e.g. direct subagent spawn, not via enqueue).
1973
1997
  // This is not an error -- the work completed, the label just wasn't tracked.
1974
1998
  process.stderr.write(`[${BRAND}] warn: no session found for label "${label}" -- registering as done\n`);
1975
- setLabel(label, { status: 'done', summary, completion, ...(sha ? { sha } : {}) });
1999
+ setLabelDone(label, { summary, completion, ...(sha ? { sha } : {}) });
1976
2000
 
1977
2001
  // No watcher is polling for this label, so actively notify via the gateway
1978
2002
  // post office using delivery config from config.json as fallback target.
@@ -2001,8 +2025,7 @@ async function cmdDone(flags) {
2001
2025
  return;
2002
2026
  }
2003
2027
 
2004
- setLabel(label, {
2005
- status: 'done',
2028
+ setLabelDone(label, {
2006
2029
  summary,
2007
2030
  completion,
2008
2031
  ...(sha ? { sha } : {}),
@@ -0,0 +1,61 @@
1
// Liveness policy helpers for dispatched sessions: derive timeout-aware and
// thinking-level-aware thresholds (all durations in milliseconds).
const MINUTE_MS = 60 * 1000;

/**
 * Coerce a value to a strictly positive, finite number.
 *
 * @param {*} value - Anything `Number()` can coerce (number, numeric string, ...).
 * @returns {number|null} The coerced number, or null when it is NaN,
 *   infinite, zero, or negative.
 */
function numberOrNull(value) {
  const n = Number(value);
  return Number.isFinite(n) && n > 0 ? n : null;
}

/**
 * Normalize a free-form thinking level to one of the recognized levels.
 *
 * Matching is case-insensitive and ignores surrounding whitespace.
 *
 * @param {*} value - Raw thinking level; non-strings are treated as empty.
 * @returns {'xhigh'|'high'|'low'|'off'|null} The canonical level, or null for
 *   any value that is not one of the recognized spellings.
 */
export function normalizeThinkingLevel(value) {
  const text = typeof value === 'string' ? value.trim().toLowerCase() : '';
  if (text === 'xhigh' || text === 'extra-high' || text === 'extra_high') return 'xhigh';
  if (text === 'high') return 'high';
  if (text === 'low') return 'low';
  if (text === 'off' || text === 'none') return 'off';
  return null;
}

/**
 * Resolve the timeout (in seconds) for a dispatch entry.
 *
 * Uses the first positive finite value among `entry.timeoutSeconds`,
 * `entry.timeout`, and `fallbackSeconds`; falls back to 300 otherwise.
 *
 * @param {object} [entry={}] - Dispatch label entry; null is treated as empty.
 * @param {number} [fallbackSeconds=300] - Timeout used when the entry has none.
 * @returns {number} Timeout in seconds (always positive and finite).
 */
export function getDispatchTimeoutSeconds(entry = {}, fallbackSeconds = 300) {
  // Default parameters only cover `undefined`; guard explicit null as well.
  const safeEntry = entry ?? {};
  return numberOrNull(safeEntry.timeoutSeconds)
    ?? numberOrNull(safeEntry.timeout)
    ?? numberOrNull(fallbackSeconds)
    ?? 300;
}

/**
 * Compute the liveness thresholds for a dispatch entry.
 *
 * Entries whose thinking level normalizes to 'high' or 'xhigh' get larger
 * floors and quiet windows than all other entries.
 *
 * @param {object} [entry={}] - Dispatch label entry. Reads `timeoutSeconds`,
 *   `timeout`, `thinking`, and `spawnedAt`. Null is treated as empty.
 * @param {object} [opts={}] - Overrides. Null is treated as empty.
 * @param {number} [opts.now] - Current time in epoch ms (defaults to Date.now()).
 * @param {number} [opts.defaultTimeoutSeconds] - Timeout when the entry has none.
 * @param {number} [opts.startupGraceMs] - Overrides the startup grace window.
 * @param {number} [opts.pingStaleMs] - Overrides the ping staleness window.
 * @returns {{
 *   thinking: string|null,
 *   isHighThinking: boolean,
 *   timeoutSeconds: number,
 *   timeoutMs: number,
 *   startupGraceMs: number,
 *   pingStaleMs: number,
 *   idleProbeMs: number,
 *   idleFailureMs: number,
 *   hardCeilingMs: number,
 *   hardTimeoutIdleMs: number,
 *   spawnedAtMs: number,
 *   ageMs: number,
 *   pastHardCeiling: boolean
 * }} Policy object. `spawnedAtMs` is 0 and `ageMs` is Infinity when
 *   `entry.spawnedAt` is missing or cannot be parsed.
 */
export function getDispatchLivenessPolicy(entry = {}, opts = {}) {
  // Default parameters only cover `undefined`; guard explicit null as well.
  const safeEntry = entry ?? {};
  const safeOpts = opts ?? {};

  const now = numberOrNull(safeOpts.now) ?? Date.now();
  const timeoutSeconds = getDispatchTimeoutSeconds(safeEntry, safeOpts.defaultTimeoutSeconds);
  const timeoutMs = timeoutSeconds * 1000;
  const thinking = normalizeThinkingLevel(safeEntry.thinking);
  const isHighThinking = thinking === 'high' || thinking === 'xhigh';

  const startupGraceMs = numberOrNull(safeOpts.startupGraceMs)
    ?? (isHighThinking ? 10 * MINUTE_MS : 5 * MINUTE_MS);
  const pingStaleMs = numberOrNull(safeOpts.pingStaleMs) ?? 3 * MINUTE_MS;

  // Probe window: a quarter of the timeout, capped, but never below the floor.
  const idleProbeFloorMs = isHighThinking ? 10 * MINUTE_MS : 1 * MINUTE_MS;
  const idleProbeCapMs = isHighThinking ? 15 * MINUTE_MS : 5 * MINUTE_MS;
  const idleProbeMs = Math.max(idleProbeFloorMs, Math.min(timeoutMs * 0.25, idleProbeCapMs));

  const idleFailureFloorMs = isHighThinking ? 20 * MINUTE_MS : 10 * MINUTE_MS;
  const idleFailureMs = Math.max(timeoutMs, idleFailureFloorMs);

  // idleFailureMs >= timeoutMs, so the ceiling never undercuts the idle threshold.
  const hardCeilingMs = Math.max(timeoutMs * 1.5, idleFailureMs * (isHighThinking ? 2 : 1.5));
  const hardTimeoutIdleMs = isHighThinking ? 5 * MINUTE_MS : 2 * MINUTE_MS;

  // An unparsable spawnedAt yields NaN from getTime(); report it as 0
  // ("unknown"), the same value used when spawnedAt is missing.
  const parsedSpawnedAtMs = safeEntry.spawnedAt ? new Date(safeEntry.spawnedAt).getTime() : 0;
  const spawnedAtMs = Number.isFinite(parsedSpawnedAtMs) ? parsedSpawnedAtMs : 0;
  const ageMs = spawnedAtMs ? now - spawnedAtMs : Infinity;

  return {
    thinking,
    isHighThinking,
    timeoutSeconds,
    timeoutMs,
    startupGraceMs,
    pingStaleMs,
    idleProbeMs,
    idleFailureMs,
    hardCeilingMs,
    hardTimeoutIdleMs,
    spawnedAtMs,
    ageMs,
    pastHardCeiling: ageMs >= hardCeilingMs,
  };
}