openclaw-scheduler 0.2.5 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dispatch/README.md +16 -2
- package/dispatch/completion.mjs +297 -20
- package/dispatch/index.mjs +80 -57
- package/dispatch/liveness.mjs +61 -0
- package/dispatch/watcher.mjs +299 -17
- package/dispatcher-strategies.js +82 -10
- package/dispatcher.js +6 -1
- package/gateway.js +39 -0
- package/package.json +2 -1
package/dispatch/watcher.mjs
CHANGED
|
@@ -36,6 +36,7 @@ import {
|
|
|
36
36
|
hasCompletionSignal,
|
|
37
37
|
resolveCompletionDelivery,
|
|
38
38
|
} from './completion.mjs';
|
|
39
|
+
import { getDispatchLivenessPolicy } from './liveness.mjs';
|
|
39
40
|
import { sendMessage } from '../messages.js';
|
|
40
41
|
|
|
41
42
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
@@ -684,6 +685,112 @@ function getJsonlMidTurnReason(sessionId, agentDir = 'main') {
|
|
|
684
685
|
return null; // Last assistant entry appears to be a complete text reply -- safe to proceed
|
|
685
686
|
}
|
|
686
687
|
|
|
688
|
+
/**
 * Inspect the newest transcript entry for a tool handoff that is still in
 * flight. Unlike an mtime-based check this does not need recent file
 * activity: a long-running tool call can leave the JSONL untouched for
 * minutes, so a flat transcript by itself does not prove the agent is stuck.
 *
 * Only the final entry returned by readJsonlLastLines is examined.
 *
 * @param {string} sessionId - Internal session UUID
 * @param {string} agentDir - Agent directory (default: 'main')
 * @returns {string|null} human-readable reason when a tool handoff looks
 *   pending, otherwise null (including when no transcript lines are available)
 */
function getJsonlPendingToolReason(sessionId, agentDir = 'main') {
  const tail = readJsonlLastLines(sessionId, agentDir, 3);
  if (!tail || tail.length === 0) {
    return null;
  }

  const newest = tail[tail.length - 1];
  const parts = Array.isArray(newest?.content) ? newest.content : [];

  switch (newest?.role) {
    case 'assistant': {
      // Assistant asked for a tool: either as a content part or as the entry itself.
      const call = parts.find((part) => part?.type === 'tool_use');
      if (call) {
        return `last assistant entry has tool_use (${call.name || 'unknown'}) -- awaiting tool result`;
      }
      if (newest.type === 'tool_use') {
        return `last entry is tool_use (${newest.name || 'unknown'}) -- awaiting tool result`;
      }
      break;
    }
    case 'user': {
      // Tool output was fed back in; the assistant has not answered yet.
      if (parts.some((part) => part?.type === 'tool_result')) {
        return 'last entry is tool_result (tool executed, awaiting assistant reply)';
      }
      break;
    }
    default:
      break;
  }

  // Entry-level tool_result is accepted regardless of role.
  if (newest?.type === 'tool_result') {
    return 'last entry is tool_result (tool executed, awaiting assistant reply)';
  }

  return null;
}
|
|
727
|
+
|
|
728
|
+
/**
 * Normalize a timestamp-like value to epoch milliseconds.
 *
 * Accepts a number (used as-is), a Date (via getTime()), or anything
 * Date.parse() understands. Falsy input (null, undefined, '', 0, NaN) and any
 * value that does not resolve to a finite number yield null.
 *
 * @param {number|string|Date|null|undefined} value - Candidate timestamp
 * @returns {number|null} epoch milliseconds, or null when unusable
 */
function parseTimestampMs(value) {
  if (!value) {
    return null;
  }

  let millis;
  if (typeof value === 'number') {
    millis = value;
  } else if (value instanceof Date) {
    millis = value.getTime();
  } else {
    millis = Date.parse(value);
  }

  return Number.isFinite(millis) ? millis : null;
}
|
|
740
|
+
|
|
741
|
+
/**
 * Decide whether a session reported as "running" has actually stopped making
 * progress while the watcher process keeps polling and refreshing lastPing.
 *
 * The session counts as alive (null is returned) when any of these hold:
 *   - status has no sessionKey, or the session store has no entry for it;
 *   - status.liveness.ageMs is a number below thresholdMs;
 *   - the newest of the store timestamps / JSONL mtime is younger than
 *     thresholdMs;
 *   - the transcript tail shows a pending tool handoff (logged to stderr).
 * Otherwise a failure summary string is returned.
 *
 * @param {object} status - Result of `dispatch status` (reads sessionKey,
 *   label and liveness.ageMs)
 * @param {number} thresholdMs - Idle time after which the session is
 *   considered stalled
 * @returns {string|null} stall reason, or null when no stall is detected
 */
function getRunningSessionStallReason(status, thresholdMs) {
  if (!status?.sessionKey) {
    return null;
  }

  // Session keys are colon-separated; the second segment names the agent dir.
  const agentDir = status.sessionKey.split(':')[1] || 'main';
  const storeEntry = getSessionStoreEntry(status.sessionKey);
  if (!storeEntry) {
    return null;
  }

  const sessionId = storeEntry.sessionId || null;
  const now = Date.now();

  // Gather every usable activity timestamp the store entry exposes.
  const observed = [];
  for (const field of ['updatedAt', 'lastActivityAt', 'sessionStartedAt', 'startedAt']) {
    const stamp = parseTimestampMs(storeEntry[field]);
    if (typeof stamp === 'number') {
      observed.push(stamp);
    }
  }

  const transcriptMtime = sessionId ? getSessionJsonlMtime(sessionId, agentDir) : null;
  if (typeof transcriptMtime === 'number') {
    observed.push(transcriptMtime);
  }

  const reportedAgeMs = status?.liveness?.ageMs;
  if (typeof reportedAgeMs === 'number' && reportedAgeMs < thresholdMs) {
    return null;
  }

  const lastActivityMs = observed.length ? Math.max(...observed) : null;
  const recentlyActive = lastActivityMs !== null && now - lastActivityMs < thresholdMs;
  if (recentlyActive) {
    return null;
  }

  // Stale telemetry is tolerated while a tool call is still outstanding.
  const pendingHandoff = sessionId ? getJsonlPendingToolReason(sessionId, agentDir) : null;
  if (pendingHandoff) {
    process.stderr.write(
      `[watcher] ${status.label || 'session'} stale telemetry but pending tool handoff detected: ${pendingHandoff}\n`
    );
    return null;
  }

  let idleMinutes;
  if (lastActivityMs === null) {
    // No timestamp at all: report the threshold itself as the idle estimate.
    idleMinutes = Math.ceil(thresholdMs / 60000);
  } else {
    idleMinutes = Math.max(1, Math.floor((now - lastActivityMs) / 60000));
  }

  return (
    `agent session stalled: no session/jsonl activity for ~${idleMinutes}min ` +
    `while delivery watcher remained alive; likely app-server turn retired or stopped producing events`
  );
}
|
|
793
|
+
|
|
687
794
|
/**
|
|
688
795
|
* Read the last assistant entry's stop_reason from the session JSONL.
|
|
689
796
|
* Returns the stop_reason string (e.g. 'end_turn', 'tool_use') or null if unavailable.
|
|
@@ -754,6 +861,7 @@ function markLabelError(label, errorSummary) {
|
|
|
754
861
|
updateExistingLabel(label, (entry) => {
|
|
755
862
|
if (entry.status === 'done') return false;
|
|
756
863
|
entry.status = 'error';
|
|
864
|
+
entry.error = errorSummary || 'failed without result';
|
|
757
865
|
entry.summary = errorSummary || 'failed without result';
|
|
758
866
|
});
|
|
759
867
|
} catch (e) {
|
|
@@ -761,6 +869,8 @@ function markLabelError(label, errorSummary) {
|
|
|
761
869
|
}
|
|
762
870
|
}
|
|
763
871
|
|
|
872
|
+
let exitZeroOnTerminal = false;
|
|
873
|
+
|
|
764
874
|
/**
|
|
765
875
|
* Format and output the delivery message, then exit 0.
|
|
766
876
|
* Also marks the label as done in labels.json before exiting.
|
|
@@ -794,7 +904,7 @@ function deliverResult(label, lastReply, fallbackSummary, completionPayload = nu
|
|
|
794
904
|
`**Error:** ${stderr || 'non-zero exit'}\n\n` +
|
|
795
905
|
`Job marked as \`error\`. The agent may have reported done without completing the actual work.\n`
|
|
796
906
|
);
|
|
797
|
-
process.exit(1);
|
|
907
|
+
process.exit(exitZeroOnTerminal ? 0 : 1);
|
|
798
908
|
}
|
|
799
909
|
}
|
|
800
910
|
} catch (loadErr) {
|
|
@@ -816,10 +926,17 @@ function deliverResult(label, lastReply, fallbackSummary, completionPayload = nu
|
|
|
816
926
|
? completion.deliveryText.slice(0, maxLen) + '\n\n..[truncated]'
|
|
817
927
|
: completion.deliveryText;
|
|
818
928
|
process.stdout.write(`🌶️ *dispatch* [${label}] completed:\n\n${reply}\n`);
|
|
819
|
-
|
|
820
|
-
process.stderr.write(`[watcher] [${label}] completion delivery suppressed (no meaningful reply or summary)\n`);
|
|
929
|
+
process.exit(0);
|
|
821
930
|
}
|
|
822
|
-
|
|
931
|
+
|
|
932
|
+
const failureSummary = 'completed without a clean user-facing completion';
|
|
933
|
+
process.stderr.write(`[watcher] [${label}] completion delivery suppressed (no meaningful reply or summary)\n`);
|
|
934
|
+
markLabelError(label, failureSummary);
|
|
935
|
+
process.stdout.write(
|
|
936
|
+
`⚠️ dispatch [${label}] completed, but no clean user-facing completion was captured. ` +
|
|
937
|
+
`Internal diagnostics were suppressed; check scheduler run logs for details.\n`
|
|
938
|
+
);
|
|
939
|
+
process.exit(exitZeroOnTerminal ? 0 : 1);
|
|
823
940
|
}
|
|
824
941
|
|
|
825
942
|
function emitInterruptedOutcome(label, summary, result = null) {
|
|
@@ -829,12 +946,12 @@ function emitInterruptedOutcome(label, summary, result = null) {
|
|
|
829
946
|
`⚠️ dispatch [${label}] session went idle before completing -- work may be incomplete` +
|
|
830
947
|
`${formatDiagnosticSnippet(result?.diagnosticReply || result?.lastReply || null)}\n`
|
|
831
948
|
);
|
|
832
|
-
process.exit(1);
|
|
949
|
+
process.exit(exitZeroOnTerminal ? 0 : 1);
|
|
833
950
|
}
|
|
834
951
|
|
|
835
952
|
function emitTimeoutOutcome(label, message, result = null) {
|
|
836
953
|
process.stdout.write(`${message}${formatDiagnosticSnippet(result?.diagnosticReply || result?.lastReply || null)}\n`);
|
|
837
|
-
process.exit(1);
|
|
954
|
+
process.exit(exitZeroOnTerminal ? 0 : 1);
|
|
838
955
|
}
|
|
839
956
|
|
|
840
957
|
// -- Watcher heartbeat interval ref --------------------------------------
|
|
@@ -869,15 +986,165 @@ const flags = parseFlags(process.argv.slice(2));
|
|
|
869
986
|
const label = flags.label;
|
|
870
987
|
const timeoutS = parseInt(flags.timeout || '600', 10);
|
|
871
988
|
const pollS = parseInt(flags['poll-interval'] || '20', 10);
|
|
989
|
+
const once = flags.once === true || flags.once === 'true';
|
|
990
|
+
exitZeroOnTerminal = once;
|
|
872
991
|
|
|
873
|
-
|
|
874
|
-
const
|
|
992
|
+
/**
 * Resolve the liveness policy for the label this watcher was started with.
 *
 * Re-reads the labels store on every call; when the label has no stored
 * entry, a minimal entry carrying only the CLI timeout is used instead.
 *
 * @returns {object} policy from getDispatchLivenessPolicy (callers in this
 *   file read its idleProbeMs field)
 */
function getCurrentLivenessPolicy() {
  const stored = loadLabels()[label];
  const labelEntry = stored || { timeoutSeconds: timeoutS };
  const fallbacks = { defaultTimeoutSeconds: timeoutS };
  return getDispatchLivenessPolicy(labelEntry, fallbacks);
}
|
|
996
|
+
|
|
997
|
+
/**
 * Report whether a `dispatch result` payload carries a completion signal.
 *
 * @param {object|null|undefined} result - Payload whose `completion` field is
 *   handed to hasCompletionSignal (undefined when result is nullish)
 * @returns {*} whatever hasCompletionSignal returns for that completion
 */
function hasStructuredCompletion(result) {
  const completion = result?.completion;
  return hasCompletionSignal(completion);
}
|
|
875
1000
|
|
|
876
1001
|
if (!label) {
|
|
877
1002
|
process.stderr.write('[watcher] --label is required\n');
|
|
878
1003
|
process.exit(2);
|
|
879
1004
|
}
|
|
880
1005
|
|
|
1006
|
+
/**
 * Refresh the label's lastPing heartbeat to the current time, but only while
 * the label is still marked 'running'.
 *
 * @param {string} label - Dispatch label to touch
 */
function touchWatcherPing(label) {
  const stampIfRunning = (record) => {
    if (record.status !== 'running') {
      // Returning false mirrors the other updaters in this file that skip a write.
      return false;
    }
    record.lastPing = new Date().toISOString();
    return undefined;
  };

  updateExistingLabel(label, stampIfRunning);
}
|
|
1012
|
+
|
|
1013
|
+
/**
 * Emit the WATCHER_PENDING marker on stderr and end this process with exit
 * code 0. Nothing is written to stdout.
 *
 * @param {string} label - Dispatch label being watched
 * @param {string} reason - Short explanation included in the marker line
 */
function markWatcherPending(label, reason = 'target still running') {
  const markerLine = `[watcher] WATCHER_PENDING label=${label} reason=${reason}\n`;
  process.stderr.write(markerLine);
  process.exit(0);
}
|
|
1017
|
+
|
|
1018
|
+
/**
 * Remove the watcherRetryAfter marker from a label entry, if one is set.
 *
 * @param {string} label - Dispatch label whose backoff marker should be cleared
 */
function clearWatcherRetryAfter(label) {
  const dropBackoffMarker = (record) => {
    const hasMarker = Boolean(record.watcherRetryAfter);
    if (!hasMarker) {
      // Nothing to clear; false mirrors the other skip-the-write updaters here.
      return false;
    }
    delete record.watcherRetryAfter;
    return undefined;
  };

  updateExistingLabel(label, dropBackoffMarker);
}
|
|
1024
|
+
|
|
1025
|
+
/**
 * Single-tick (--once) handling of a 529 overload failure for a label.
 *
 * Every path ends the process with exit code 0, either directly or through
 * markWatcherPending:
 *   1. retry budget spent            -> mark error, report on stdout, exit;
 *   2. no backoff marker stored yet  -> ask attempt529Retry for a delay, store
 *                                       watcherRetryAfter, exit pending;
 *   3. backoff marker in the future  -> exit pending;
 *   4. backoff elapsed               -> respawn the session; on success clear
 *                                       the marker and exit pending, otherwise
 *                                       mark error, report on stdout, exit.
 *
 * @param {string} label - Dispatch label that failed
 * @param {string} errorMsg - Error text reported by `dispatch status`
 */
function handleOnce529(label, errorMsg) {
  const allLabels = loadLabels();
  const labelEntry = allLabels[label] || {};
  const attemptsSoFar = getRetryCount(label);

  if (attemptsSoFar >= MAX_529_RETRIES) {
    markLabelError(label, `max_retries_exceeded (${attemptsSoFar}x 529): ${errorMsg}`);
    process.stdout.write(
      `🌶️ *dispatch* [${label}] failed after ${MAX_529_RETRIES} retries (529 overload)\n` +
      `Error: ${errorMsg}\n`
    );
    process.exit(0);
  }

  const backoffUntilMs = parseTimestampMs(labelEntry.watcherRetryAfter);

  if (!backoffUntilMs) {
    const decision = attempt529Retry(label, attemptsSoFar, errorMsg);
    if (!decision.retry) {
      // NOTE(review): re-entering here only terminates if attempt529Retry
      // advanced the stored retry count to MAX_529_RETRIES -- confirm.
      return handleOnce529(label, errorMsg);
    }
    updateExistingLabel(label, (record) => {
      record.watcherRetryAfter = new Date(Date.now() + decision.delayMs).toISOString();
    });
    markWatcherPending(label, `529 retry scheduled for future tick (${decision.delayMs / 1000}s)`);
  }

  const stillBackingOff = Date.now() < backoffUntilMs;
  if (stillBackingOff) {
    markWatcherPending(label, '529 retry backoff active');
  }

  const respawned = respawnSession(label);
  if (respawned) {
    clearWatcherRetryAfter(label);
    markWatcherPending(label, '529 retry dispatched');
  }

  markLabelError(label, `529 retry failed -- could not respawn session: ${errorMsg}`);
  process.stdout.write(
    `🌶️ *dispatch* [${label}] 529 retry failed -- could not respawn session\n` +
    `Error: ${errorMsg}\n`
  );
  process.exit(0);
}
|
|
1065
|
+
|
|
1066
|
+
/**
 * One-shot (--once) watcher tick: inspect the label a single time, then end
 * the process. Each branch finishes via process.exit or a helper that exits
 * (markWatcherPending, handleOnce529, deliverResult, emitInterruptedOutcome),
 * so later statements rely on the earlier helper not returning.
 *
 * Order of checks:
 *   1. refresh lastPing (best effort);
 *   2. status unavailable                  -> pending;
 *   3. status 'error' with a 529 message   -> handleOnce529;
 *   4. any non-running status              -> deliver / interrupted / failed;
 *   5. still running but the transcript ended cleanly and a completion
 *      signal exists                       -> deliver;
 *   6. liveness age at or past the idle probe threshold -> deliver when a
 *      completion signal exists, else fail when a stall is detected;
 *   7. otherwise                           -> pending.
 */
function runOnceAndExit() {
  const labelArgs = () => ['--label', label];

  try {
    touchWatcherPing(label);
  } catch {
    // Best-effort -- a quick-poll tick must not fail because heartbeat metadata raced.
  }

  const status = dispatch('status', labelArgs());
  if (!status?.ok) {
    markWatcherPending(label, 'status unavailable');
  }

  if (status.status === 'error') {
    const failureText = status.error || status.summary || '';
    if (is529Error(failureText)) {
      handleOnce529(label, failureText);
    }
  }

  if (status.status !== 'running') {
    const finalResult = dispatch('result', labelArgs());
    const finalCompletion = finalResult?.completion || status?.completion || null;

    if (status.status === 'done') {
      // Success clears both retry counters before handing off to delivery.
      if (getRetryCount(label) > 0) {
        setRetryCount(label, 0);
      }
      if (getGwRestartRetryCount(label) > 0) {
        setGwRestartRetryCount(label, 0);
      }
      deliverResult(label, finalResult?.lastReply, status.summary, finalCompletion);
    }

    if (status.status === 'interrupted') {
      emitInterruptedOutcome(label, status.summary, finalResult);
    }

    const failureSummary =
      status.error || status.summary || `terminal failure (${status.status || 'unknown'})`;
    markLabelError(label, failureSummary);
    process.stdout.write(`🌶️ *dispatch* [${label}] failed\nSummary: ${failureSummary}\n`);
    process.exit(0);
  }

  if (status.sessionKey) {
    const storeEntry = getSessionStoreEntry(status.sessionKey);
    const sessionId = storeEntry?.sessionId || null;
    const agentDir = status.sessionKey.split(':')[1] || 'main';
    const transcriptReply = sessionId ? getSessionTerminalReply(sessionId, agentDir) : null;

    if (sessionId && transcriptReply && isSessionCleanlyFinished(sessionId, agentDir)) {
      const earlyResult = dispatch('result', labelArgs());
      if (hasStructuredCompletion(earlyResult)) {
        deliverResult(
          label,
          earlyResult?.lastReply || transcriptReply,
          'completed (stop_reason=end_turn)',
          earlyResult?.completion || null
        );
      }
      process.stderr.write(`[watcher] stop_reason=end_turn observed without completion signal -- continuing to monitor\n`);
    }
  }

  const livenessAgeMs = status.liveness?.ageMs;
  const idleProbeMs = getCurrentLivenessPolicy().idleProbeMs;

  if (livenessAgeMs != null && livenessAgeMs >= idleProbeMs) {
    const idleResult = dispatch('result', labelArgs());
    if (hasStructuredCompletion(idleResult)) {
      deliverResult(label, idleResult?.lastReply || null, null, idleResult?.completion || null);
    }

    const stallReason = getRunningSessionStallReason(status, idleProbeMs);
    if (stallReason) {
      process.stderr.write(`[watcher] [${label}] ${stallReason}\n`);
      markLabelError(label, stallReason);
      process.stdout.write(
        `❌ *dispatch* [${label}] failed\n` +
        `Summary: ${stallReason}\n`
      );
      process.exit(0);
    }
  }

  markWatcherPending(label);
}
|
|
1143
|
+
|
|
1144
|
+
if (once) {
|
|
1145
|
+
runOnceAndExit();
|
|
1146
|
+
}
|
|
1147
|
+
|
|
881
1148
|
// -- Start heartbeat -----------------------------------------------------
|
|
882
1149
|
// Write lastPing to labels.json every PING_INTERVAL_MS while the session is
|
|
883
1150
|
// still running. The watchdog guard in index.mjs reads lastPing to know this
|
|
@@ -1221,8 +1488,11 @@ while (Date.now() < deadline) {
|
|
|
1221
1488
|
if (_sid2a && terminalJsonlReply && isSessionCleanlyFinished(_sid2a, _adir2a)) {
|
|
1222
1489
|
process.stderr.write(`[watcher] stop_reason=end_turn detected -- delivering early\n`);
|
|
1223
1490
|
const result = dispatch('result', ['--label', label]);
|
|
1224
|
-
|
|
1225
|
-
|
|
1491
|
+
if (hasStructuredCompletion(result)) {
|
|
1492
|
+
deliverResult(label, result?.lastReply || terminalJsonlReply, 'completed (stop_reason=end_turn)', result?.completion || null);
|
|
1493
|
+
// deliverResult exits
|
|
1494
|
+
}
|
|
1495
|
+
process.stderr.write(`[watcher] stop_reason=end_turn observed without completion signal -- continuing to monitor\n`);
|
|
1226
1496
|
}
|
|
1227
1497
|
}
|
|
1228
1498
|
|
|
@@ -1233,11 +1503,23 @@ while (Date.now() < deadline) {
|
|
|
1233
1503
|
// while this watcher's lastPing heartbeat is fresh (written every 60s);
|
|
1234
1504
|
// this path handles normal completion before the ping goes stale.
|
|
1235
1505
|
const ageMs = status.liveness?.ageMs;
|
|
1236
|
-
|
|
1506
|
+
const idleResultCheckMs = getCurrentLivenessPolicy().idleProbeMs;
|
|
1507
|
+
if (ageMs != null && ageMs >= idleResultCheckMs) {
|
|
1237
1508
|
const result = dispatch('result', ['--label', label]);
|
|
1238
|
-
if (
|
|
1509
|
+
if (hasStructuredCompletion(result)) {
|
|
1239
1510
|
deliverResult(label, result?.lastReply || null, null, result?.completion || null);
|
|
1240
1511
|
}
|
|
1512
|
+
|
|
1513
|
+
const stallReason = getRunningSessionStallReason(status, idleResultCheckMs);
|
|
1514
|
+
if (stallReason) {
|
|
1515
|
+
process.stderr.write(`[watcher] [${label}] ${stallReason}\n`);
|
|
1516
|
+
markLabelError(label, stallReason);
|
|
1517
|
+
process.stdout.write(
|
|
1518
|
+
`❌ *dispatch* [${label}] failed\n` +
|
|
1519
|
+
`Summary: ${stallReason}\n`
|
|
1520
|
+
);
|
|
1521
|
+
process.exit(1);
|
|
1522
|
+
}
|
|
1241
1523
|
}
|
|
1242
1524
|
|
|
1243
1525
|
|
|
@@ -1310,7 +1592,7 @@ if (sessionInternalId) {
|
|
|
1310
1592
|
// If the session already completed (gateway pruned it -> null tokens), exit cleanly.
|
|
1311
1593
|
if (statusAtDeadline?.status === 'done' || baselineTokens === null) {
|
|
1312
1594
|
const r = dispatch('result', ['--label', label]);
|
|
1313
|
-
if (
|
|
1595
|
+
if (hasStructuredCompletion(r)) {
|
|
1314
1596
|
// deliverResult calls process.exit(0) internally
|
|
1315
1597
|
deliverResult(label, r?.lastReply || null, statusAtDeadline?.summary || null, r?.completion || null);
|
|
1316
1598
|
}
|
|
@@ -1349,7 +1631,7 @@ while (Date.now() - flatSince < FLAT_WINDOW_MS) {
|
|
|
1349
1631
|
deliverResult(label, r?.lastReply || null, st.summary, r?.completion || st?.completion || null);
|
|
1350
1632
|
}
|
|
1351
1633
|
const r2 = dispatch('result', ['--label', label]);
|
|
1352
|
-
if (
|
|
1634
|
+
if (hasStructuredCompletion(r2)) {
|
|
1353
1635
|
// deliverResult calls process.exit(0) internally
|
|
1354
1636
|
deliverResult(label, r2?.lastReply || null, null, r2?.completion || null);
|
|
1355
1637
|
}
|
|
@@ -1443,7 +1725,7 @@ if (sessionInternalId) {
|
|
|
1443
1725
|
deliverResult(label, rExt?.lastReply || null, stExt.summary, rExt?.completion || stExt?.completion || null);
|
|
1444
1726
|
}
|
|
1445
1727
|
const rExt2 = dispatch('result', ['--label', label]);
|
|
1446
|
-
if (
|
|
1728
|
+
if (hasStructuredCompletion(rExt2)) {
|
|
1447
1729
|
// deliverResult calls process.exit(0) internally
|
|
1448
1730
|
deliverResult(label, rExt2?.lastReply || null, null, rExt2?.completion || null);
|
|
1449
1731
|
}
|
|
@@ -1500,7 +1782,7 @@ for (const round of steerRounds) {
|
|
|
1500
1782
|
deliverResult(label, r3?.lastReply || null, st2.summary, r3?.completion || st2?.completion || null);
|
|
1501
1783
|
}
|
|
1502
1784
|
const r3 = dispatch('result', ['--label', label]);
|
|
1503
|
-
if (
|
|
1785
|
+
if (hasStructuredCompletion(r3)) {
|
|
1504
1786
|
// deliverResult calls process.exit(0) internally
|
|
1505
1787
|
deliverResult(label, r3?.lastReply || null, null, r3?.completion || null);
|
|
1506
1788
|
}
|
|
@@ -1515,7 +1797,7 @@ for (const round of steerRounds) {
|
|
|
1515
1797
|
if (st3?.status === 'done') {
|
|
1516
1798
|
// Check if a result was captured before marking as error
|
|
1517
1799
|
const r4 = dispatch('result', ['--label', label]);
|
|
1518
|
-
if (
|
|
1800
|
+
if (hasStructuredCompletion(r4)) {
|
|
1519
1801
|
deliverResult(label, r4?.lastReply || null, st3.summary, r4?.completion || st3?.completion || null); // deliverResult calls process.exit(0)
|
|
1520
1802
|
}
|
|
1521
1803
|
markLabelError(label, 'timed out -- killed after steer attempts (no result captured)');
|
package/dispatcher-strategies.js
CHANGED
|
@@ -1095,6 +1095,25 @@ export async function executeMain(job, ctx, deps) {
|
|
|
1095
1095
|
|
|
1096
1096
|
// -- Strategy: Shell -----------------------------------------
|
|
1097
1097
|
|
|
1098
|
+
/**
 * Tell whether a job is a completion-delivery watcher, judged purely by its
 * name starting with `dispatch-deliver:` or `chilisaus-deliver:`.
 *
 * @param {{name?: *}|null|undefined} job - Scheduler job record
 * @returns {boolean} true when the job name carries a watcher prefix
 */
function isCompletionDeliveryWatcherJob(job) {
  const jobName = String(job?.name || '');
  const watcherPrefix = /^(?:dispatch|chilisaus)-deliver:/;
  return watcherPrefix.test(jobName);
}
|
|
1101
|
+
|
|
1102
|
+
/**
 * Recognize a watcher tick that only reported "still pending": stdout is
 * empty after trimming and stderr contains the WATCHER_PENDING marker as a
 * whole word.
 *
 * @param {{stdout?: string, stderr?: string}} shellResult - Shell run output
 * @returns {boolean} true for a pending tick with no user-facing payload
 */
function isCompletionWatcherPendingTick(shellResult) {
  const visibleOutput = (shellResult.stdout || '').trim();
  if (visibleOutput) {
    // Any stdout payload means the watcher had something to deliver.
    return false;
  }
  return /\bWATCHER_PENDING\b/.test(shellResult.stderr || '');
}
|
|
1106
|
+
|
|
1107
|
+
/**
 * Build the two-line notice used when a completion watcher finished without
 * writing anything to stdout. The wording depends on whether the shell run
 * itself reported 'ok'; a failure may carry the shell error in parentheses.
 *
 * @param {{name: string}} job - Scheduler job record (name is interpolated)
 * @param {{status: string, errorMessage?: string}} shellResult - Shell outcome
 * @returns {string} headline and guidance joined by a newline
 */
function buildCompletionWatcherNoPayloadMessage(job, shellResult) {
  let statusLabel;
  if (shellResult.status === 'ok') {
    statusLabel = 'completed without a deliverable result';
  } else {
    const detail = shellResult.errorMessage ? ` (${shellResult.errorMessage})` : '';
    statusLabel = `failed before producing a deliverable result${detail}`;
  }

  const headline = `⚠️ Completion delivery watcher for ${job.name} ${statusLabel}.`;
  const guidance =
    'No internal diagnostics were delivered as the completion message; check the scheduler run logs for stderr/details.';

  return `${headline}\n${guidance}`;
}
|
|
1116
|
+
|
|
1098
1117
|
export async function executeShell(job, ctx, deps) {
|
|
1099
1118
|
const { runShellCommand, normalizeShellResult, log } = deps;
|
|
1100
1119
|
const result = makeDefaultResult();
|
|
@@ -1129,18 +1148,61 @@ export async function executeShell(job, ctx, deps) {
|
|
|
1129
1148
|
shell_stderr_bytes: shellResult.stderrBytes,
|
|
1130
1149
|
};
|
|
1131
1150
|
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
|
|
1151
|
+
if (isCompletionDeliveryWatcherJob(job)) {
|
|
1152
|
+
const watcherStdout = (shellResult.stdout || '').trim();
|
|
1153
|
+
const watcherStderr = (shellResult.stderr || '').trim();
|
|
1154
|
+
|
|
1155
|
+
if (isCompletionWatcherPendingTick(shellResult)) {
|
|
1156
|
+
result.status = 'skipped';
|
|
1157
|
+
result.summary = 'Completion delivery watcher pending; target session is still running';
|
|
1158
|
+
result.content = '';
|
|
1159
|
+
result.errorMessage = null;
|
|
1160
|
+
result.idemAction = 'release';
|
|
1161
|
+
result.skipDelivery = true;
|
|
1162
|
+
} else if (watcherStdout) {
|
|
1163
|
+
// Completion watcher stdout is the only user-facing contract. Stderr is
|
|
1164
|
+
// diagnostics-only and must never be repackaged as a "successful" final
|
|
1165
|
+
// completion if the watcher suppressed the real payload.
|
|
1166
|
+
result.summary = watcherStdout;
|
|
1167
|
+
result.content = watcherStdout;
|
|
1168
|
+
if (['announce', 'announce-always'].includes(job.delivery_mode)) {
|
|
1169
|
+
result.deliveryOverride = watcherStdout;
|
|
1170
|
+
} else {
|
|
1171
|
+
result.skipDelivery = true;
|
|
1172
|
+
}
|
|
1173
|
+
} else {
|
|
1174
|
+
const noPayloadMessage = buildCompletionWatcherNoPayloadMessage(job, shellResult);
|
|
1175
|
+
result.status = 'error';
|
|
1176
|
+
result.summary = noPayloadMessage;
|
|
1177
|
+
result.errorMessage = 'Completion delivery watcher produced no user-facing stdout payload';
|
|
1178
|
+
result.content = noPayloadMessage;
|
|
1179
|
+
if (['announce', 'announce-always'].includes(job.delivery_mode)) {
|
|
1180
|
+
result.deliveryOverride = noPayloadMessage;
|
|
1181
|
+
} else {
|
|
1182
|
+
result.skipDelivery = true;
|
|
1183
|
+
}
|
|
1184
|
+
log('warn', `Completion watcher produced no deliverable stdout: ${job.name}`, {
|
|
1185
|
+
runId: ctx.run.id,
|
|
1186
|
+
shellStatus: shellResult.status,
|
|
1187
|
+
exitCode: shellResult.exitCode,
|
|
1188
|
+
stderrExcerpt: watcherStderr.slice(0, 500),
|
|
1189
|
+
skippedOrDisabled: /\b(?:skipped|disabled)\b/i.test(watcherStderr),
|
|
1190
|
+
});
|
|
1191
|
+
}
|
|
1139
1192
|
} else {
|
|
1140
|
-
|
|
1193
|
+
// Shell delivery logic: announce-always sends on all results, announce sends on error only
|
|
1194
|
+
const announcePayload = shellResult.deliveryText.trim() ? shellResult.deliveryText : shellResult.errorMessage;
|
|
1195
|
+
if (job.delivery_mode === 'announce-always' && announcePayload) {
|
|
1196
|
+
const prefix = shellResult.status === 'ok' ? '' : `\u26a0\ufe0f Shell job failed: ${job.name}\n\n`;
|
|
1197
|
+
result.deliveryOverride = `${prefix}${announcePayload}`;
|
|
1198
|
+
} else if (job.delivery_mode === 'announce' && shellResult.status !== 'ok' && announcePayload) {
|
|
1199
|
+
result.deliveryOverride = announcePayload;
|
|
1200
|
+
} else {
|
|
1201
|
+
result.skipDelivery = true;
|
|
1202
|
+
}
|
|
1141
1203
|
}
|
|
1142
1204
|
|
|
1143
|
-
log('info', `Shell ${
|
|
1205
|
+
log('info', `Shell ${result.status}: ${job.name}`, {
|
|
1144
1206
|
runId: ctx.run.id,
|
|
1145
1207
|
exitCode: shellResult.exitCode,
|
|
1146
1208
|
signal: shellResult.signal,
|
|
@@ -1156,11 +1218,16 @@ export async function executeAgent(job, ctx, deps) {
|
|
|
1156
1218
|
const {
|
|
1157
1219
|
waitForGateway, updateRunSession, setAgentStatus,
|
|
1158
1220
|
buildJobPrompt, runAgentTurnWithActivityTimeout,
|
|
1221
|
+
// Sanctioned isolated dispatch primitive. Falls back to the activity-aware
|
|
1222
|
+
// runner when callers (e.g. tests) wire only the older name -- both helpers
|
|
1223
|
+
// share the same HTTP-only contract, no subprocess spawn.
|
|
1224
|
+
runIsolatedAgentTurn,
|
|
1159
1225
|
updateContextSummary, releaseDispatch, releaseIdempotencyKey,
|
|
1160
1226
|
updateJob, matchesSentinel, detectTransientError,
|
|
1161
1227
|
listSessions,
|
|
1162
1228
|
sqliteNow, log,
|
|
1163
1229
|
} = deps;
|
|
1230
|
+
const dispatchAgentTurn = runIsolatedAgentTurn || runAgentTurnWithActivityTimeout;
|
|
1164
1231
|
const result = makeDefaultResult();
|
|
1165
1232
|
|
|
1166
1233
|
// Gateway health check
|
|
@@ -1254,7 +1321,12 @@ export async function executeAgent(job, ctx, deps) {
|
|
|
1254
1321
|
}
|
|
1255
1322
|
}
|
|
1256
1323
|
|
|
1257
|
-
|
|
1324
|
+
// Isolated dispatch primitive: HTTP-only chat completions call. The
|
|
1325
|
+
// scheduler must never fork a sibling `openclaw` process to spawn an
|
|
1326
|
+
// isolated session -- that variant has historically SIGTERM'd the
|
|
1327
|
+
// launchd-tracked gateway parent and orphaned a node process on port
|
|
1328
|
+
// 18789 (see ISOLATED_DISPATCH_PRIMITIVE in gateway.js).
|
|
1329
|
+
const turnResult = await dispatchAgentTurn({
|
|
1258
1330
|
message: prompt,
|
|
1259
1331
|
agentId: job.agent_id || 'main',
|
|
1260
1332
|
sessionKey,
|
package/dispatcher.js
CHANGED
|
@@ -51,7 +51,8 @@ import {
|
|
|
51
51
|
import { buildRetrievalContext } from './retrieval.js';
|
|
52
52
|
import { upsertAgent, setAgentStatus } from './agents.js';
|
|
53
53
|
import {
|
|
54
|
-
runAgentTurnWithActivityTimeout,
|
|
54
|
+
runAgentTurnWithActivityTimeout, runIsolatedAgentTurn,
|
|
55
|
+
sendSystemEvent, getAllSubAgentSessions, listSessions,
|
|
55
56
|
deliverMessage, checkGatewayHealth, waitForGateway, resolveDeliveryAlias,
|
|
56
57
|
applyAuthProfileToSessionStore,
|
|
57
58
|
syncAuthStoreToSession,
|
|
@@ -306,6 +307,10 @@ function buildDispatchDeps() {
|
|
|
306
307
|
// Agent
|
|
307
308
|
waitForGateway, updateRunSession, setAgentStatus,
|
|
308
309
|
buildJobPrompt, runAgentTurnWithActivityTimeout,
|
|
310
|
+
// Isolated cron-dispatch primitive: HTTP-only wrapper around the
|
|
311
|
+
// chat-completions API; never forks a sibling openclaw process that
|
|
312
|
+
// could SIGTERM the launchd-tracked gateway parent.
|
|
313
|
+
runIsolatedAgentTurn,
|
|
309
314
|
updateContextSummary, releaseIdempotencyKey,
|
|
310
315
|
matchesSentinel, detectTransientError,
|
|
311
316
|
listSessions,
|
package/gateway.js
CHANGED
|
@@ -9,6 +9,22 @@ const GATEWAY_URL = process.env.OPENCLAW_GATEWAY_URL || 'http://127.0.0.1:18789'
|
|
|
9
9
|
const HOME_DIR = process.env.HOME || homedir();
|
|
10
10
|
export const TELEGRAM_MAX_MESSAGE_LENGTH = 4096;
|
|
11
11
|
|
|
12
|
+
// -- Isolated dispatch primitive contract --------------------
|
|
13
|
+
//
|
|
14
|
+
// Cron jobs with session_target=isolated must reach the gateway via the
|
|
15
|
+
// public HTTP API only. Forking a sibling `openclaw` process to spawn the
|
|
16
|
+
// session is rejected: in production that primitive has SIGTERM'd the
|
|
17
|
+
// launchd-tracked gateway parent (the child inherits the parent's listening
|
|
18
|
+
// socket on port 18789 and the parent dies), leaving an orphan node process
|
|
19
|
+
// holding the port. See rh-bot.lan zombie-cascade incident report.
|
|
20
|
+
//
|
|
21
|
+
// runIsolatedAgentTurn is the only sanctioned dispatch primitive for
|
|
22
|
+
// session_target=isolated cron jobs. It MUST NOT spawn, fork, or exec any
|
|
23
|
+
// child process. Any future change that needs subprocess execution belongs
|
|
24
|
+
// behind a different, explicitly-named helper so reviewers can keep this
|
|
25
|
+
// contract intact.
|
|
26
|
+
export const ISOLATED_DISPATCH_PRIMITIVE = 'http-chat-completions';
|
|
27
|
+
|
|
12
28
|
let _cachedToken;
|
|
13
29
|
let _tokenLoaded = false;
|
|
14
30
|
|
|
@@ -246,6 +262,29 @@ export async function runAgentTurnWithActivityTimeout(opts) {
|
|
|
246
262
|
}
|
|
247
263
|
}
|
|
248
264
|
|
|
265
|
+
// -- Isolated dispatch primitive -----------------------------
|
|
266
|
+
|
|
267
|
+
/**
 * Named dispatch entry point for session_target=isolated cron jobs.
 *
 * The body is a straight delegation to runAgentTurnWithActivityTimeout with
 * the caller's options untouched. It exists as a separate export so the
 * module's stated contract for isolated dispatch -- reach the gateway over
 * HTTP only, never spawn/fork/exec a child process -- has a single,
 * greppable call site ("runIsolatedAgentTurn") that reviewers and the
 * no-subprocess regression test can target.
 *
 * @param {object} opts - Same options as runAgentTurnWithActivityTimeout
 * @returns {Promise<*>} whatever runAgentTurnWithActivityTimeout resolves to
 */
export async function runIsolatedAgentTurn(opts) {
  const turnOutcome = await runAgentTurnWithActivityTimeout(opts);
  return turnOutcome;
}
|
|
287
|
+
|
|
249
288
|
// -- System Events (main session) ----------------------------
|
|
250
289
|
|
|
251
290
|
/**
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "openclaw-scheduler",
|
|
3
|
-
"version": "0.2.
|
|
3
|
+
"version": "0.2.7",
|
|
4
4
|
"description": "SQLite-backed job scheduler and workflow engine for OpenClaw agents",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./index.js",
|
|
@@ -42,6 +42,7 @@
|
|
|
42
42
|
"dispatch/deliver-watcher.sh",
|
|
43
43
|
"dispatch/hooks.mjs",
|
|
44
44
|
"dispatch/index.mjs",
|
|
45
|
+
"dispatch/liveness.mjs",
|
|
45
46
|
"dispatch/message-input.mjs",
|
|
46
47
|
"dispatch/README.md",
|
|
47
48
|
"dispatch/watcher.mjs",
|