openclaw-scheduler 0.2.5 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -36,6 +36,7 @@ import {
36
36
  hasCompletionSignal,
37
37
  resolveCompletionDelivery,
38
38
  } from './completion.mjs';
39
+ import { getDispatchLivenessPolicy } from './liveness.mjs';
39
40
  import { sendMessage } from '../messages.js';
40
41
 
41
42
  const __dirname = dirname(fileURLToPath(import.meta.url));
@@ -684,6 +685,112 @@ function getJsonlMidTurnReason(sessionId, agentDir = 'main') {
684
685
  return null; // Last assistant entry appears to be a complete text reply -- safe to proceed
685
686
  }
686
687
 
688
/**
 * Check the JSONL tail for a pending tool handoff without requiring recent
 * file activity. Long-running tool calls can leave the transcript flat for
 * minutes, so a stale mtime alone is not enough to declare the agent stuck.
 *
 * Only the final entry returned by readJsonlLastLines is inspected. It is
 * treated as a pending handoff when it is:
 *   - an assistant entry whose content array contains a `tool_use` part,
 *   - a user entry whose content array contains a `tool_result` part, or
 *   - a bare entry whose own `type` is `tool_use` or `tool_result`.
 *
 * The bare `tool_use` form is matched regardless of `role`, mirroring the
 * bare `tool_result` form; previously it was only reachable for entries
 * that also carried role === 'assistant'.
 *
 * @param {string} sessionId - Internal session UUID
 * @param {string} agentDir - Agent directory (default: 'main')
 * @returns {string|null} reason string if a tool handoff appears pending, otherwise null
 */
function getJsonlPendingToolReason(sessionId, agentDir = 'main') {
  const lastLines = readJsonlLastLines(sessionId, agentDir, 3);
  if (!lastLines || lastLines.length === 0) return null;

  const last = lastLines[lastLines.length - 1];
  if (!last) return null;

  const content = Array.isArray(last.content) ? last.content : [];

  if (last.role === 'assistant') {
    const toolUse = content.find((part) => part?.type === 'tool_use');
    if (toolUse) {
      return `last assistant entry has tool_use (${toolUse.name || 'unknown'}) -- awaiting tool result`;
    }
  }

  if (last.role === 'user' && content.some((part) => part?.type === 'tool_result')) {
    return 'last entry is tool_result (tool executed, awaiting assistant reply)';
  }

  // Bare tool_use entry: the tool has been requested but no result is recorded yet.
  if (last.type === 'tool_use') {
    return `last entry is tool_use (${last.name || 'unknown'}) -- awaiting tool result`;
  }

  if (last.type === 'tool_result') {
    return 'last entry is tool_result (tool executed, awaiting assistant reply)';
  }

  return null;
}
727
+
728
/**
 * Normalize a timestamp-like value to epoch milliseconds.
 *
 * Accepted inputs are finite numbers (returned as-is), Date instances, and
 * strings understood by Date.parse. Any other type yields null instead of
 * being implicitly stringified and parsed (an array such as [2020] would
 * otherwise parse as a year, and a Symbol would throw).
 *
 * Falsy inputs (null, undefined, '', 0, NaN) are treated as "no timestamp".
 *
 * @param {number|string|Date|null|undefined} value - Candidate timestamp
 * @returns {number|null} epoch milliseconds, or null when unusable
 */
function parseTimestampMs(value) {
  if (!value) return null;

  let timestamp;
  if (typeof value === 'number') {
    timestamp = value;
  } else if (value instanceof Date) {
    timestamp = value.getTime();
  } else if (typeof value === 'string') {
    timestamp = Date.parse(value);
  } else {
    // Unsupported type: refuse rather than rely on implicit string coercion.
    return null;
  }

  return Number.isFinite(timestamp) ? timestamp : null;
}
740
+
741
/**
 * Detect an agent session that has stopped making progress even though the
 * watcher process itself is still alive and writing lastPing.
 *
 * This closes the failure mode where OpenClaw's Codex app-server retires a
 * timed-out turn, but dispatch status keeps reporting "running" because the
 * delivery watcher is still polling.
 *
 * A session is reported as stalled only when all of the following hold:
 *   - status.liveness.ageMs is missing or at least thresholdMs,
 *   - the newest of the session-store timestamps and the JSONL mtime is
 *     missing or at least thresholdMs old,
 *   - the JSONL tail does not show a pending tool handoff.
 *
 * The liveness check runs before the session-store lookup and the JSONL
 * mtime read so a fresh session costs no extra reads. A non-finite
 * thresholdMs returns null rather than reporting a stall against a
 * meaningless threshold.
 *
 * @param {object} status - Dispatch status payload; reads sessionKey, label and liveness.ageMs
 * @param {number} thresholdMs - Idle time in milliseconds after which the session counts as stalled
 * @returns {string|null} human-readable stall reason, or null when the session is not considered stalled
 */
function getRunningSessionStallReason(status, thresholdMs) {
  if (!status?.sessionKey) return null;
  if (!Number.isFinite(thresholdMs)) return null;

  // Cheap check first: fresh liveness means there is nothing to investigate.
  const livenessAgeMs = status.liveness?.ageMs;
  if (typeof livenessAgeMs === 'number' && livenessAgeMs < thresholdMs) {
    return null;
  }

  const entry = getSessionStoreEntry(status.sessionKey);
  if (!entry) return null;

  const sessionAgent = status.sessionKey.split(':')[1] || 'main';
  const sessionId = entry.sessionId || null;
  const now = Date.now();

  const activityTimes = [
    entry.updatedAt,
    entry.lastActivityAt,
    entry.sessionStartedAt,
    entry.startedAt,
  ]
    .map((raw) => parseTimestampMs(raw))
    .filter((t) => typeof t === 'number');

  const jsonlMtime = sessionId ? getSessionJsonlMtime(sessionId, sessionAgent) : null;
  if (typeof jsonlMtime === 'number') activityTimes.push(jsonlMtime);

  const lastActivityMs = activityTimes.length ? Math.max(...activityTimes) : null;
  if (lastActivityMs !== null && now - lastActivityMs < thresholdMs) {
    return null;
  }

  // Stale telemetry alone is not conclusive: a long-running tool call keeps
  // the transcript flat, so a pending handoff vetoes the stall verdict.
  const pendingToolReason = sessionId ? getJsonlPendingToolReason(sessionId, sessionAgent) : null;
  if (pendingToolReason) {
    process.stderr.write(
      `[watcher] ${status.label || 'session'} stale telemetry but pending tool handoff detected: ${pendingToolReason}\n`
    );
    return null;
  }

  // With no activity timestamp at all, report the threshold itself as the idle estimate.
  const idleMinutes = lastActivityMs === null
    ? Math.ceil(thresholdMs / 60000)
    : Math.max(1, Math.floor((now - lastActivityMs) / 60000));
  return (
    `agent session stalled: no session/jsonl activity for ~${idleMinutes}min ` +
    `while delivery watcher remained alive; likely app-server turn retired or stopped producing events`
  );
}
793
+
687
794
  /**
688
795
  * Read the last assistant entry's stop_reason from the session JSONL.
689
796
  * Returns the stop_reason string (e.g. 'end_turn', 'tool_use') or null if unavailable.
@@ -754,6 +861,7 @@ function markLabelError(label, errorSummary) {
754
861
  updateExistingLabel(label, (entry) => {
755
862
  if (entry.status === 'done') return false;
756
863
  entry.status = 'error';
864
+ entry.error = errorSummary || 'failed without result';
757
865
  entry.summary = errorSummary || 'failed without result';
758
866
  });
759
867
  } catch (e) {
@@ -761,6 +869,8 @@ function markLabelError(label, errorSummary) {
761
869
  }
762
870
  }
763
871
 
872
+ let exitZeroOnTerminal = false;
873
+
764
874
  /**
765
875
  * Format and output the delivery message, then exit 0.
766
876
  * Also marks the label as done in labels.json before exiting.
@@ -794,7 +904,7 @@ function deliverResult(label, lastReply, fallbackSummary, completionPayload = nu
794
904
  `**Error:** ${stderr || 'non-zero exit'}\n\n` +
795
905
  `Job marked as \`error\`. The agent may have reported done without completing the actual work.\n`
796
906
  );
797
- process.exit(1);
907
+ process.exit(exitZeroOnTerminal ? 0 : 1);
798
908
  }
799
909
  }
800
910
  } catch (loadErr) {
@@ -816,10 +926,17 @@ function deliverResult(label, lastReply, fallbackSummary, completionPayload = nu
816
926
  ? completion.deliveryText.slice(0, maxLen) + '\n\n..[truncated]'
817
927
  : completion.deliveryText;
818
928
  process.stdout.write(`🌶️ *dispatch* [${label}] completed:\n\n${reply}\n`);
819
- } else {
820
- process.stderr.write(`[watcher] [${label}] completion delivery suppressed (no meaningful reply or summary)\n`);
929
+ process.exit(0);
821
930
  }
822
- process.exit(0);
931
+
932
+ const failureSummary = 'completed without a clean user-facing completion';
933
+ process.stderr.write(`[watcher] [${label}] completion delivery suppressed (no meaningful reply or summary)\n`);
934
+ markLabelError(label, failureSummary);
935
+ process.stdout.write(
936
+ `⚠️ dispatch [${label}] completed, but no clean user-facing completion was captured. ` +
937
+ `Internal diagnostics were suppressed; check scheduler run logs for details.\n`
938
+ );
939
+ process.exit(exitZeroOnTerminal ? 0 : 1);
823
940
  }
824
941
 
825
942
  function emitInterruptedOutcome(label, summary, result = null) {
@@ -829,12 +946,12 @@ function emitInterruptedOutcome(label, summary, result = null) {
829
946
  `⚠️ dispatch [${label}] session went idle before completing -- work may be incomplete` +
830
947
  `${formatDiagnosticSnippet(result?.diagnosticReply || result?.lastReply || null)}\n`
831
948
  );
832
- process.exit(1);
949
+ process.exit(exitZeroOnTerminal ? 0 : 1);
833
950
  }
834
951
 
835
952
  function emitTimeoutOutcome(label, message, result = null) {
836
953
  process.stdout.write(`${message}${formatDiagnosticSnippet(result?.diagnosticReply || result?.lastReply || null)}\n`);
837
- process.exit(1);
954
+ process.exit(exitZeroOnTerminal ? 0 : 1);
838
955
  }
839
956
 
840
957
  // -- Watcher heartbeat interval ref --------------------------------------
@@ -869,15 +986,165 @@ const flags = parseFlags(process.argv.slice(2));
869
986
  const label = flags.label;
870
987
  const timeoutS = parseInt(flags.timeout || '600', 10);
871
988
  const pollS = parseInt(flags['poll-interval'] || '20', 10);
989
+ const once = flags.once === true || flags.once === 'true';
990
+ exitZeroOnTerminal = once;
872
991
 
873
- // How long a session must be idle before we proactively check result
874
- const IDLE_RESULT_CHECK_MS = 60000;
992
/**
 * Resolve the liveness policy for the label this watcher is tracking.
 *
 * The label entry is re-read from loadLabels() on every call; when the label
 * has no entry, a minimal stand-in carrying only the CLI timeout is used.
 *
 * @returns {*} whatever getDispatchLivenessPolicy returns for the entry
 */
function getCurrentLivenessPolicy() {
  const labelEntry = loadLabels()[label] || { timeoutSeconds: timeoutS };
  const policyOptions = { defaultTimeoutSeconds: timeoutS };
  return getDispatchLivenessPolicy(labelEntry, policyOptions);
}
996
+
997
/**
 * Report whether a dispatch result carries a completion signal.
 *
 * @param {object|null|undefined} result - Dispatch `result` payload; only `completion` is read
 * @returns {*} the value returned by hasCompletionSignal for result.completion
 */
function hasStructuredCompletion(result) {
  const completion = result?.completion;
  return hasCompletionSignal(completion);
}
875
1000
 
876
1001
  if (!label) {
877
1002
  process.stderr.write('[watcher] --label is required\n');
878
1003
  process.exit(2);
879
1004
  }
880
1005
 
1006
/**
 * Refresh the watcher heartbeat (`lastPing`) for a label that is still running.
 *
 * Entries in any other status are left untouched; the updater returns false
 * for them (presumably telling updateExistingLabel nothing changed -- confirm
 * against that helper).
 *
 * @param {string} label - Label whose entry should be stamped
 */
function touchWatcherPing(label) {
  const stampIfRunning = (entry) => {
    if (entry.status === 'running') {
      entry.lastPing = new Date().toISOString();
      return undefined;
    }
    return false;
  };
  updateExistingLabel(label, stampIfRunning);
}
1012
+
1013
/**
 * Report that this tick found nothing terminal and end the process successfully.
 *
 * Writes a single WATCHER_PENDING marker line to stderr (nothing goes to
 * stdout) and exits with code 0.
 *
 * @param {string} label - Label being watched
 * @param {string} reason - Short explanation included in the marker line
 */
function markWatcherPending(label, reason = 'target still running') {
  const markerLine = `[watcher] WATCHER_PENDING label=${label} reason=${reason}\n`;
  process.stderr.write(markerLine);
  process.exit(0);
}
1017
+
1018
/**
 * Remove the `watcherRetryAfter` backoff marker from a label entry, if present.
 *
 * When the marker is absent the updater returns false (presumably signalling
 * "no change" to updateExistingLabel -- confirm against that helper).
 *
 * @param {string} label - Label whose backoff marker should be cleared
 */
function clearWatcherRetryAfter(label) {
  updateExistingLabel(label, (entry) => {
    if (entry.watcherRetryAfter) {
      delete entry.watcherRetryAfter;
      return undefined;
    }
    return false;
  });
}
1024
+
1025
/**
 * Handle a 529 (overload) error for one `--once` watcher tick.
 *
 * Exactly one of these outcomes is chosen, and each ends the process with
 * exit code 0 (directly or through markWatcherPending):
 *   - retry count already at MAX_529_RETRIES: mark the label as error and report failure;
 *   - no backoff recorded yet: ask attempt529Retry for a delay, persist it as
 *     `watcherRetryAfter`, and report pending;
 *   - backoff still active: report pending;
 *   - backoff elapsed: respawn the session, clear the marker, report pending;
 *   - respawn failed: mark the label as error and report failure.
 *
 * When attempt529Retry declines, the state is re-evaluated (the retry counter
 * may have advanced), but only a bounded number of times. The previous
 * self-recursion had no termination guarantee if the counter did not move.
 *
 * Every terminal call is followed by an explicit return so the flow does not
 * depend on process.exit never returning.
 *
 * @param {string} label - Label being watched
 * @param {string} errorMsg - Error text reported by dispatch status
 */
function handleOnce529(label, errorMsg) {
  // One pass per retry slot is enough for the counter to reach its cap.
  const maxPasses = MAX_529_RETRIES + 1;

  for (let pass = 0; pass < maxPasses; pass += 1) {
    const labels = loadLabels();
    const entry = labels[label] || {};
    const retryCount = getRetryCount(label);

    if (retryCount >= MAX_529_RETRIES) {
      markLabelError(label, `max_retries_exceeded (${retryCount}x 529): ${errorMsg}`);
      process.stdout.write(
        `🌶️ *dispatch* [${label}] failed after ${MAX_529_RETRIES} retries (529 overload)\n` +
        `Error: ${errorMsg}\n`
      );
      process.exit(0);
      return;
    }

    const retryAfterMs = parseTimestampMs(entry.watcherRetryAfter);
    if (!retryAfterMs) {
      const retryResult = attempt529Retry(label, retryCount, errorMsg);
      if (!retryResult?.retry) continue; // declined: re-read state on the next pass

      updateExistingLabel(label, (current) => {
        current.watcherRetryAfter = new Date(Date.now() + retryResult.delayMs).toISOString();
      });
      markWatcherPending(label, `529 retry scheduled for future tick (${retryResult.delayMs / 1000}s)`);
      return;
    }

    if (Date.now() < retryAfterMs) {
      markWatcherPending(label, '529 retry backoff active');
      return;
    }

    if (respawnSession(label)) {
      clearWatcherRetryAfter(label);
      markWatcherPending(label, '529 retry dispatched');
      return;
    }

    markLabelError(label, `529 retry failed -- could not respawn session: ${errorMsg}`);
    process.stdout.write(
      `🌶️ *dispatch* [${label}] 529 retry failed -- could not respawn session\n` +
      `Error: ${errorMsg}\n`
    );
    process.exit(0);
    return;
  }

  // Every pass was declined without the retry counter reaching its cap.
  markLabelError(label, `529 retry declined without progress: ${errorMsg}`);
  process.stdout.write(
    `🌶️ *dispatch* [${label}] 529 retry failed -- retry declined\n` +
    `Error: ${errorMsg}\n`
  );
  process.exit(0);
}
1065
+
1066
/**
 * Run a single non-blocking watcher tick (`--once` mode) for the current label.
 *
 * Checks, in order:
 *   1. Refresh the watcher heartbeat (best effort).
 *   2. Status unavailable -> report pending.
 *   3. Error status with a 529 message -> delegate to handleOnce529.
 *   4. Any other non-running status -> deliver the result (`done`), emit the
 *      interrupted outcome (`interrupted`), or mark the label as failed.
 *   5. Still running, but the transcript ended cleanly and the result carries
 *      a structured completion -> deliver.
 *   6. Still running and idle for at least the policy's idleProbeMs ->
 *      deliver a structured completion if present, otherwise fail the label
 *      when the session looks stalled.
 *   7. Otherwise -> report pending.
 *
 * Each terminal helper is expected to end the process. An explicit return
 * follows every one of them so later steps never run on a state that was
 * already handled (for example, dereferencing a null status).
 */
function runOnceAndExit() {
  try {
    touchWatcherPing(label);
  } catch {
    // Best-effort -- a quick-poll tick must not fail because heartbeat metadata raced.
  }

  const status = dispatch('status', ['--label', label]);
  if (!status?.ok) {
    markWatcherPending(label, 'status unavailable');
    return;
  }

  if (status.status === 'error') {
    const errorMsg = status.error || status.summary || '';
    if (is529Error(errorMsg)) {
      handleOnce529(label, errorMsg);
      return;
    }
  }

  if (status.status !== 'running') {
    const terminalResult = dispatch('result', ['--label', label]);
    const terminalCompletion = terminalResult?.completion || status?.completion || null;

    if (status.status === 'done') {
      // A successful finish clears both retry counters before delivery.
      const currentRetryCount = getRetryCount(label);
      if (currentRetryCount > 0) setRetryCount(label, 0);
      const gwRetryCount = getGwRestartRetryCount(label);
      if (gwRetryCount > 0) setGwRestartRetryCount(label, 0);
      deliverResult(label, terminalResult?.lastReply, status.summary, terminalCompletion);
      return;
    }

    if (status.status === 'interrupted') {
      emitInterruptedOutcome(label, status.summary, terminalResult);
      return;
    }

    const summary = status.error || status.summary || `terminal failure (${status.status || 'unknown'})`;
    markLabelError(label, summary);
    process.stdout.write(`🌶️ *dispatch* [${label}] failed\nSummary: ${summary}\n`);
    process.exit(0);
    return;
  }

  if (status.sessionKey) {
    const entry = getSessionStoreEntry(status.sessionKey);
    const sessionId = entry?.sessionId || null;
    const sessionAgent = status.sessionKey.split(':')[1] || 'main';
    const terminalJsonlReply = sessionId ? getSessionTerminalReply(sessionId, sessionAgent) : null;
    if (sessionId && terminalJsonlReply && isSessionCleanlyFinished(sessionId, sessionAgent)) {
      const result = dispatch('result', ['--label', label]);
      if (hasStructuredCompletion(result)) {
        deliverResult(label, result?.lastReply || terminalJsonlReply, 'completed (stop_reason=end_turn)', result?.completion || null);
        return;
      }
      process.stderr.write(`[watcher] stop_reason=end_turn observed without completion signal -- continuing to monitor\n`);
    }
  }

  const ageMs = status.liveness?.ageMs;
  const idleResultCheckMs = getCurrentLivenessPolicy().idleProbeMs;
  if (ageMs != null && ageMs >= idleResultCheckMs) {
    const result = dispatch('result', ['--label', label]);
    if (hasStructuredCompletion(result)) {
      deliverResult(label, result?.lastReply || null, null, result?.completion || null);
      return;
    }

    const stallReason = getRunningSessionStallReason(status, idleResultCheckMs);
    if (stallReason) {
      process.stderr.write(`[watcher] [${label}] ${stallReason}\n`);
      markLabelError(label, stallReason);
      process.stdout.write(
        `❌ *dispatch* [${label}] failed\n` +
        `Summary: ${stallReason}\n`
      );
      process.exit(0);
      return;
    }
  }

  markWatcherPending(label);
}
1143
+
1144
+ if (once) {
1145
+ runOnceAndExit();
1146
+ }
1147
+
881
1148
  // -- Start heartbeat -----------------------------------------------------
882
1149
  // Write lastPing to labels.json every PING_INTERVAL_MS while the session is
883
1150
  // still running. The watchdog guard in index.mjs reads lastPing to know this
@@ -1221,8 +1488,11 @@ while (Date.now() < deadline) {
1221
1488
  if (_sid2a && terminalJsonlReply && isSessionCleanlyFinished(_sid2a, _adir2a)) {
1222
1489
  process.stderr.write(`[watcher] stop_reason=end_turn detected -- delivering early\n`);
1223
1490
  const result = dispatch('result', ['--label', label]);
1224
- deliverResult(label, result?.lastReply || terminalJsonlReply, 'completed (stop_reason=end_turn)', result?.completion || null);
1225
- // deliverResult exits
1491
+ if (hasStructuredCompletion(result)) {
1492
+ deliverResult(label, result?.lastReply || terminalJsonlReply, 'completed (stop_reason=end_turn)', result?.completion || null);
1493
+ // deliverResult exits
1494
+ }
1495
+ process.stderr.write(`[watcher] stop_reason=end_turn observed without completion signal -- continuing to monitor\n`);
1226
1496
  }
1227
1497
  }
1228
1498
 
@@ -1233,11 +1503,23 @@ while (Date.now() < deadline) {
1233
1503
  // while this watcher's lastPing heartbeat is fresh (written every 60s);
1234
1504
  // this path handles normal completion before the ping goes stale.
1235
1505
  const ageMs = status.liveness?.ageMs;
1236
- if (ageMs != null && ageMs >= IDLE_RESULT_CHECK_MS) {
1506
+ const idleResultCheckMs = getCurrentLivenessPolicy().idleProbeMs;
1507
+ if (ageMs != null && ageMs >= idleResultCheckMs) {
1237
1508
  const result = dispatch('result', ['--label', label]);
1238
- if (result?.lastReply || hasCompletionSignal(result?.completion)) {
1509
+ if (hasStructuredCompletion(result)) {
1239
1510
  deliverResult(label, result?.lastReply || null, null, result?.completion || null);
1240
1511
  }
1512
+
1513
+ const stallReason = getRunningSessionStallReason(status, idleResultCheckMs);
1514
+ if (stallReason) {
1515
+ process.stderr.write(`[watcher] [${label}] ${stallReason}\n`);
1516
+ markLabelError(label, stallReason);
1517
+ process.stdout.write(
1518
+ `❌ *dispatch* [${label}] failed\n` +
1519
+ `Summary: ${stallReason}\n`
1520
+ );
1521
+ process.exit(1);
1522
+ }
1241
1523
  }
1242
1524
 
1243
1525
 
@@ -1310,7 +1592,7 @@ if (sessionInternalId) {
1310
1592
  // If the session already completed (gateway pruned it -> null tokens), exit cleanly.
1311
1593
  if (statusAtDeadline?.status === 'done' || baselineTokens === null) {
1312
1594
  const r = dispatch('result', ['--label', label]);
1313
- if (r?.lastReply || hasCompletionSignal(r?.completion)) {
1595
+ if (hasStructuredCompletion(r)) {
1314
1596
  // deliverResult calls process.exit(0) internally
1315
1597
  deliverResult(label, r?.lastReply || null, statusAtDeadline?.summary || null, r?.completion || null);
1316
1598
  }
@@ -1349,7 +1631,7 @@ while (Date.now() - flatSince < FLAT_WINDOW_MS) {
1349
1631
  deliverResult(label, r?.lastReply || null, st.summary, r?.completion || st?.completion || null);
1350
1632
  }
1351
1633
  const r2 = dispatch('result', ['--label', label]);
1352
- if (r2?.lastReply || hasCompletionSignal(r2?.completion)) {
1634
+ if (hasStructuredCompletion(r2)) {
1353
1635
  // deliverResult calls process.exit(0) internally
1354
1636
  deliverResult(label, r2?.lastReply || null, null, r2?.completion || null);
1355
1637
  }
@@ -1443,7 +1725,7 @@ if (sessionInternalId) {
1443
1725
  deliverResult(label, rExt?.lastReply || null, stExt.summary, rExt?.completion || stExt?.completion || null);
1444
1726
  }
1445
1727
  const rExt2 = dispatch('result', ['--label', label]);
1446
- if (rExt2?.lastReply || hasCompletionSignal(rExt2?.completion)) {
1728
+ if (hasStructuredCompletion(rExt2)) {
1447
1729
  // deliverResult calls process.exit(0) internally
1448
1730
  deliverResult(label, rExt2?.lastReply || null, null, rExt2?.completion || null);
1449
1731
  }
@@ -1500,7 +1782,7 @@ for (const round of steerRounds) {
1500
1782
  deliverResult(label, r3?.lastReply || null, st2.summary, r3?.completion || st2?.completion || null);
1501
1783
  }
1502
1784
  const r3 = dispatch('result', ['--label', label]);
1503
- if (r3?.lastReply || hasCompletionSignal(r3?.completion)) {
1785
+ if (hasStructuredCompletion(r3)) {
1504
1786
  // deliverResult calls process.exit(0) internally
1505
1787
  deliverResult(label, r3?.lastReply || null, null, r3?.completion || null);
1506
1788
  }
@@ -1515,7 +1797,7 @@ for (const round of steerRounds) {
1515
1797
  if (st3?.status === 'done') {
1516
1798
  // Check if a result was captured before marking as error
1517
1799
  const r4 = dispatch('result', ['--label', label]);
1518
- if (r4?.lastReply || hasCompletionSignal(r4?.completion)) {
1800
+ if (hasStructuredCompletion(r4)) {
1519
1801
  deliverResult(label, r4?.lastReply || null, st3.summary, r4?.completion || st3?.completion || null); // deliverResult calls process.exit(0)
1520
1802
  }
1521
1803
  markLabelError(label, 'timed out -- killed after steer attempts (no result captured)');
@@ -1095,6 +1095,25 @@ export async function executeMain(job, ctx, deps) {
1095
1095
 
1096
1096
  // -- Strategy: Shell -----------------------------------------
1097
1097
 
1098
/**
 * Tell whether a job is a completion-delivery watcher, judged by its name.
 *
 * A job qualifies when its name starts with `dispatch-deliver:` or
 * `chilisaus-deliver:`. A missing job or name is treated as an empty name.
 *
 * @param {{name?: string}|null|undefined} job - Scheduler job record
 * @returns {boolean} true when the name carries a watcher prefix
 */
function isCompletionDeliveryWatcherJob(job) {
  const jobName = String(job?.name || '');
  return /^(?:dispatch|chilisaus)-deliver:/.test(jobName);
}
1101
+
1102
/**
 * Tell whether a watcher run was a "still pending" tick.
 *
 * That is the case when stdout is empty (after trimming) and stderr contains
 * the standalone token WATCHER_PENDING.
 *
 * @param {{stdout?: string, stderr?: string}} shellResult - Normalized shell result
 * @returns {boolean} true for a pending tick
 */
function isCompletionWatcherPendingTick(shellResult) {
  const stdoutText = (shellResult.stdout || '').trim();
  if (stdoutText) return false;
  return /\bWATCHER_PENDING\b/.test(shellResult.stderr || '');
}
1106
+
1107
/**
 * Build the two-line notice used when a completion watcher produced no stdout payload.
 *
 * The first line names the job and says whether the shell run completed
 * (`status === 'ok'`) or failed, appending the error message in parentheses
 * when one is present. The second line points the reader at the run logs.
 *
 * @param {{name: string}} job - Scheduler job record
 * @param {{status: string, errorMessage?: string}} shellResult - Normalized shell result
 * @returns {string} the notice, with the two lines joined by a newline
 */
function buildCompletionWatcherNoPayloadMessage(job, shellResult) {
  let statusLabel;
  if (shellResult.status === 'ok') {
    statusLabel = 'completed without a deliverable result';
  } else {
    const detail = shellResult.errorMessage ? ` (${shellResult.errorMessage})` : '';
    statusLabel = `failed before producing a deliverable result${detail}`;
  }

  const headline = `⚠️ Completion delivery watcher for ${job.name} ${statusLabel}.`;
  const pointer = 'No internal diagnostics were delivered as the completion message; check the scheduler run logs for stderr/details.';
  return `${headline}\n${pointer}`;
}
1116
+
1098
1117
  export async function executeShell(job, ctx, deps) {
1099
1118
  const { runShellCommand, normalizeShellResult, log } = deps;
1100
1119
  const result = makeDefaultResult();
@@ -1129,18 +1148,61 @@ export async function executeShell(job, ctx, deps) {
1129
1148
  shell_stderr_bytes: shellResult.stderrBytes,
1130
1149
  };
1131
1150
 
1132
- // Shell delivery logic: announce-always sends on all results, announce sends on error only
1133
- const announcePayload = shellResult.deliveryText.trim() ? shellResult.deliveryText : shellResult.errorMessage;
1134
- if (job.delivery_mode === 'announce-always' && announcePayload) {
1135
- const prefix = shellResult.status === 'ok' ? '' : `\u26a0\ufe0f Shell job failed: ${job.name}\n\n`;
1136
- result.deliveryOverride = `${prefix}${announcePayload}`;
1137
- } else if (job.delivery_mode === 'announce' && shellResult.status !== 'ok' && announcePayload) {
1138
- result.deliveryOverride = announcePayload;
1151
+ if (isCompletionDeliveryWatcherJob(job)) {
1152
+ const watcherStdout = (shellResult.stdout || '').trim();
1153
+ const watcherStderr = (shellResult.stderr || '').trim();
1154
+
1155
+ if (isCompletionWatcherPendingTick(shellResult)) {
1156
+ result.status = 'skipped';
1157
+ result.summary = 'Completion delivery watcher pending; target session is still running';
1158
+ result.content = '';
1159
+ result.errorMessage = null;
1160
+ result.idemAction = 'release';
1161
+ result.skipDelivery = true;
1162
+ } else if (watcherStdout) {
1163
+ // Completion watcher stdout is the only user-facing contract. Stderr is
1164
+ // diagnostics-only and must never be repackaged as a "successful" final
1165
+ // completion if the watcher suppressed the real payload.
1166
+ result.summary = watcherStdout;
1167
+ result.content = watcherStdout;
1168
+ if (['announce', 'announce-always'].includes(job.delivery_mode)) {
1169
+ result.deliveryOverride = watcherStdout;
1170
+ } else {
1171
+ result.skipDelivery = true;
1172
+ }
1173
+ } else {
1174
+ const noPayloadMessage = buildCompletionWatcherNoPayloadMessage(job, shellResult);
1175
+ result.status = 'error';
1176
+ result.summary = noPayloadMessage;
1177
+ result.errorMessage = 'Completion delivery watcher produced no user-facing stdout payload';
1178
+ result.content = noPayloadMessage;
1179
+ if (['announce', 'announce-always'].includes(job.delivery_mode)) {
1180
+ result.deliveryOverride = noPayloadMessage;
1181
+ } else {
1182
+ result.skipDelivery = true;
1183
+ }
1184
+ log('warn', `Completion watcher produced no deliverable stdout: ${job.name}`, {
1185
+ runId: ctx.run.id,
1186
+ shellStatus: shellResult.status,
1187
+ exitCode: shellResult.exitCode,
1188
+ stderrExcerpt: watcherStderr.slice(0, 500),
1189
+ skippedOrDisabled: /\b(?:skipped|disabled)\b/i.test(watcherStderr),
1190
+ });
1191
+ }
1139
1192
  } else {
1140
- result.skipDelivery = true;
1193
+ // Shell delivery logic: announce-always sends on all results, announce sends on error only
1194
+ const announcePayload = shellResult.deliveryText.trim() ? shellResult.deliveryText : shellResult.errorMessage;
1195
+ if (job.delivery_mode === 'announce-always' && announcePayload) {
1196
+ const prefix = shellResult.status === 'ok' ? '' : `\u26a0\ufe0f Shell job failed: ${job.name}\n\n`;
1197
+ result.deliveryOverride = `${prefix}${announcePayload}`;
1198
+ } else if (job.delivery_mode === 'announce' && shellResult.status !== 'ok' && announcePayload) {
1199
+ result.deliveryOverride = announcePayload;
1200
+ } else {
1201
+ result.skipDelivery = true;
1202
+ }
1141
1203
  }
1142
1204
 
1143
- log('info', `Shell ${shellResult.status}: ${job.name}`, {
1205
+ log('info', `Shell ${result.status}: ${job.name}`, {
1144
1206
  runId: ctx.run.id,
1145
1207
  exitCode: shellResult.exitCode,
1146
1208
  signal: shellResult.signal,
@@ -1156,11 +1218,16 @@ export async function executeAgent(job, ctx, deps) {
1156
1218
  const {
1157
1219
  waitForGateway, updateRunSession, setAgentStatus,
1158
1220
  buildJobPrompt, runAgentTurnWithActivityTimeout,
1221
+ // Sanctioned isolated dispatch primitive. Falls back to the activity-aware
1222
+ // runner when callers (e.g. tests) wire only the older name -- both helpers
1223
+ // share the same HTTP-only contract, no subprocess spawn.
1224
+ runIsolatedAgentTurn,
1159
1225
  updateContextSummary, releaseDispatch, releaseIdempotencyKey,
1160
1226
  updateJob, matchesSentinel, detectTransientError,
1161
1227
  listSessions,
1162
1228
  sqliteNow, log,
1163
1229
  } = deps;
1230
+ const dispatchAgentTurn = runIsolatedAgentTurn || runAgentTurnWithActivityTimeout;
1164
1231
  const result = makeDefaultResult();
1165
1232
 
1166
1233
  // Gateway health check
@@ -1254,7 +1321,12 @@ export async function executeAgent(job, ctx, deps) {
1254
1321
  }
1255
1322
  }
1256
1323
 
1257
- const turnResult = await runAgentTurnWithActivityTimeout({
1324
+ // Isolated dispatch primitive: HTTP-only chat completions call. The
1325
+ // scheduler must never fork a sibling `openclaw` process to spawn an
1326
+ // isolated session -- that variant has historically SIGTERM'd the
1327
+ // launchd-tracked gateway parent and orphaned a node process on port
1328
+ // 18789 (see ISOLATED_DISPATCH_PRIMITIVE in gateway.js).
1329
+ const turnResult = await dispatchAgentTurn({
1258
1330
  message: prompt,
1259
1331
  agentId: job.agent_id || 'main',
1260
1332
  sessionKey,
package/dispatcher.js CHANGED
@@ -51,7 +51,8 @@ import {
51
51
  import { buildRetrievalContext } from './retrieval.js';
52
52
  import { upsertAgent, setAgentStatus } from './agents.js';
53
53
  import {
54
- runAgentTurnWithActivityTimeout, sendSystemEvent, getAllSubAgentSessions, listSessions,
54
+ runAgentTurnWithActivityTimeout, runIsolatedAgentTurn,
55
+ sendSystemEvent, getAllSubAgentSessions, listSessions,
55
56
  deliverMessage, checkGatewayHealth, waitForGateway, resolveDeliveryAlias,
56
57
  applyAuthProfileToSessionStore,
57
58
  syncAuthStoreToSession,
@@ -306,6 +307,10 @@ function buildDispatchDeps() {
306
307
  // Agent
307
308
  waitForGateway, updateRunSession, setAgentStatus,
308
309
  buildJobPrompt, runAgentTurnWithActivityTimeout,
310
+ // Isolated cron-dispatch primitive: HTTP-only wrapper around the
311
+ // chat-completions API; never forks a sibling openclaw process that
312
+ // could SIGTERM the launchd-tracked gateway parent.
313
+ runIsolatedAgentTurn,
309
314
  updateContextSummary, releaseIdempotencyKey,
310
315
  matchesSentinel, detectTransientError,
311
316
  listSessions,
package/gateway.js CHANGED
@@ -9,6 +9,22 @@ const GATEWAY_URL = process.env.OPENCLAW_GATEWAY_URL || 'http://127.0.0.1:18789'
9
9
  const HOME_DIR = process.env.HOME || homedir();
10
10
  export const TELEGRAM_MAX_MESSAGE_LENGTH = 4096;
11
11
 
12
+ // -- Isolated dispatch primitive contract --------------------
13
+ //
14
+ // Cron jobs with session_target=isolated must reach the gateway via the
15
+ // public HTTP API only. Forking a sibling `openclaw` process to spawn the
16
+ // session is rejected: in production that primitive has SIGTERM'd the
17
+ // launchd-tracked gateway parent (the child inherits the parent's listening
18
+ // socket on port 18789 and the parent dies), leaving an orphan node process
19
+ // holding the port. See rh-bot.lan zombie-cascade incident report.
20
+ //
21
+ // runIsolatedAgentTurn is the only sanctioned dispatch primitive for
22
+ // session_target=isolated cron jobs. It MUST NOT spawn, fork, or exec any
23
+ // child process. Any future change that needs subprocess execution belongs
24
+ // behind a different, explicitly-named helper so reviewers can keep this
25
+ // contract intact.
26
+ export const ISOLATED_DISPATCH_PRIMITIVE = 'http-chat-completions';
27
+
12
28
  let _cachedToken;
13
29
  let _tokenLoaded = false;
14
30
 
@@ -246,6 +262,29 @@ export async function runAgentTurnWithActivityTimeout(opts) {
246
262
  }
247
263
  }
248
264
 
265
+ // -- Isolated dispatch primitive -----------------------------
266
+
267
/**
 * Named entry point for dispatching session_target=isolated cron jobs.
 *
 * Functionally this only forwards `opts` to runAgentTurnWithActivityTimeout
 * and resolves (or rejects) with its outcome. The separate name exists to
 * state the contract: the isolated dispatch path talks to the gateway over
 * HTTP and must not spawn a child process. Routing isolated jobs through one
 * named helper gives reviewers a single grep target ("runIsolatedAgentTurn")
 * for auditing that no-spawn invariant, which matters because this dispatch
 * surface is the one the rh-bot.lan zombie-on-port outage cascaded through.
 *
 * @param {object} opts - Same options accepted by runAgentTurnWithActivityTimeout
 * @returns {Promise<*>} the turn result produced by runAgentTurnWithActivityTimeout
 */
export async function runIsolatedAgentTurn(opts) {
  const turnResult = await runAgentTurnWithActivityTimeout(opts);
  return turnResult;
}
287
+
249
288
  // -- System Events (main session) ----------------------------
250
289
 
251
290
  /**
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "openclaw-scheduler",
3
- "version": "0.2.5",
3
+ "version": "0.2.7",
4
4
  "description": "SQLite-backed job scheduler and workflow engine for OpenClaw agents",
5
5
  "type": "module",
6
6
  "main": "./index.js",
@@ -42,6 +42,7 @@
42
42
  "dispatch/deliver-watcher.sh",
43
43
  "dispatch/hooks.mjs",
44
44
  "dispatch/index.mjs",
45
+ "dispatch/liveness.mjs",
45
46
  "dispatch/message-input.mjs",
46
47
  "dispatch/README.md",
47
48
  "dispatch/watcher.mjs",