openclaw-scheduler 0.2.4 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +14 -0
- package/README.md +16 -6
- package/cli.js +13 -4
- package/dispatch/README.md +18 -3
- package/dispatch/completion.mjs +1312 -34
- package/dispatch/hooks.mjs +17 -5
- package/dispatch/index.mjs +600 -226
- package/dispatch/message-input.mjs +67 -0
- package/dispatch/watcher.mjs +381 -43
- package/dispatcher-strategies.js +203 -30
- package/dispatcher.js +6 -1
- package/gateway.js +71 -8
- package/index.d.ts +1 -0
- package/package.json +3 -1
- package/scripts/dispatch-cli-utils.mjs +53 -0
- package/scripts/inbox-watcher-guardrail.mjs +506 -0
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import { readFileSync } from 'fs';
|
|
2
|
+
import { isatty } from 'node:tty';
|
|
3
|
+
|
|
4
|
+
/**
 * Coerce a parsed CLI flag value into a string.
 *
 * @param {*} value - Raw flag value as produced by the flag parser.
 * @param {string} flagName - Flag name used in the error message.
 * @returns {string|null} null when the flag is absent (undefined/null),
 *   otherwise the value converted with String().
 * @throws {Error} When the value is exactly `true`, i.e. the flag was given
 *   without an accompanying value.
 */
function normalizeFlagValue(value, flagName) {
  const isAbsent = value === undefined || value === null;
  if (isAbsent) {
    return null;
  }

  if (value === true) {
    throw new Error(`${flagName} requires a value`);
  }

  return String(value);
}
|
|
9
|
+
|
|
10
|
+
/**
 * Resolve the prompt text from exactly one of the supported sources.
 *
 * Explicit sources are --message, --message-file (where "-" means stdin),
 * --message-env and --message-stdin; supplying more than one is an error.
 * When no explicit source is given and stdin is not a TTY, stdin is read
 * implicitly and a non-empty result is used as the message.
 *
 * @param {object} [options]
 * @param {*} [options.message] - Inline message text.
 * @param {*} [options.messageFile] - Path to read, or "-" for stdin.
 * @param {*} [options.messageEnv] - Name of the environment variable to read.
 * @param {boolean|string} [options.messageStdin] - true or 'true' to read stdin.
 * @param {boolean} [options.stdinIsTTY] - Whether stdin is an interactive terminal.
 * @param {object} [options.env] - Environment map consulted for --message-env.
 * @param {(path: string) => string} [options.readFile] - File reader (injectable).
 * @param {() => string} [options.readStdin] - Stdin reader (injectable).
 * @returns {Promise<string|null>} The resolved text, or null when no source applies.
 * @throws {Error} On a valueless flag, conflicting sources, an unreadable file,
 *   an unset environment variable, or stdin requested while it is a TTY.
 */
export async function resolveMessageInput({
  message = null,
  messageFile = null,
  messageEnv = null,
  messageStdin = false,
  stdinIsTTY = isatty(0),
  env = process.env,
  readFile = (path) => readFileSync(path, 'utf8'),
  readStdin = () => readFileSync(0, 'utf8'),
} = {}) {
  // Normalize in a fixed order so a valueless flag is reported consistently.
  const inlineText = normalizeFlagValue(message, '--message');
  const sourcePath = normalizeFlagValue(messageFile, '--message-file');
  const envName = normalizeFlagValue(messageEnv, '--message-env');
  const stdinRequested = messageStdin === true || messageStdin === 'true';
  const interactive = stdinIsTTY === true;

  // Names of every source the caller explicitly selected, in flag order.
  const requested = [
    ['--message', inlineText !== null],
    ['--message-file', sourcePath !== null],
    ['--message-env', envName !== null],
    ['--message-stdin', stdinRequested],
  ]
    .filter(([, selected]) => selected)
    .map(([flag]) => flag);

  if (requested.length > 1) {
    throw new Error(`choose only one of ${requested.join(', ')} for the prompt source`);
  }

  // At most one entry remains; undefined means "nothing explicit".
  const [chosen] = requested;

  switch (chosen) {
    case '--message':
      return inlineText;

    case '--message-file': {
      if (sourcePath === '-') {
        if (interactive) throw new Error('--message-file - requires piped stdin');
        return readStdin();
      }
      try {
        return readFile(sourcePath);
      } catch (err) {
        throw new Error(`--message-file: could not read file: ${err.message}`, { cause: err });
      }
    }

    case '--message-env': {
      const isSet = Object.prototype.hasOwnProperty.call(env, envName);
      if (!isSet) {
        throw new Error(`--message-env: environment variable ${envName} is not set`);
      }
      return String(env[envName] ?? '');
    }

    case '--message-stdin': {
      if (interactive) throw new Error('--message-stdin requires piped stdin');
      return readStdin();
    }

    default: {
      // No explicit source: fall back to piped stdin when there is one.
      if (interactive) return null;
      const piped = readStdin();
      return piped.length > 0 ? piped : null;
    }
  }
}
|
package/dispatch/watcher.mjs
CHANGED
|
@@ -31,7 +31,11 @@ import { readFileSync, writeFileSync, renameSync, statSync } from 'fs';
|
|
|
31
31
|
import { dirname, join } from 'path';
|
|
32
32
|
import { homedir } from 'os';
|
|
33
33
|
import { fileURLToPath } from 'url';
|
|
34
|
-
import {
|
|
34
|
+
import {
|
|
35
|
+
extractTerminalAssistantReplyFromEntries,
|
|
36
|
+
hasCompletionSignal,
|
|
37
|
+
resolveCompletionDelivery,
|
|
38
|
+
} from './completion.mjs';
|
|
35
39
|
import { sendMessage } from '../messages.js';
|
|
36
40
|
|
|
37
41
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
@@ -586,6 +590,28 @@ function readJsonlLastLines(sessionId, agentDir = 'main', n = 3) {
|
|
|
586
590
|
}
|
|
587
591
|
}
|
|
588
592
|
|
|
593
|
+
/**
 * Fetch the trailing entries of a session transcript.
 *
 * Delegates to readJsonlLastLines with a larger default window (200).
 *
 * @param {string} sessionId - Session identifier passed through unchanged.
 * @param {string} [agentDir='main'] - Agent directory passed through unchanged.
 * @param {number} [n=200] - Number of trailing entries requested.
 * @returns {*} Whatever readJsonlLastLines returns for these arguments.
 */
function readJsonlTailEntries(sessionId, agentDir = 'main', n = 200) {
  const tailEntries = readJsonlLastLines(sessionId, agentDir, n);
  return tailEntries;
}
|
|
596
|
+
|
|
597
|
+
/**
 * Extract the terminal assistant reply from the tail of a session transcript.
 *
 * Reads up to 200 trailing entries and hands them to
 * extractTerminalAssistantReplyFromEntries.
 *
 * @param {string} sessionId - Session identifier.
 * @param {string} [agentDir='main'] - Agent directory.
 * @returns {*} The extractor's result for the tail entries.
 */
function getSessionTerminalReply(sessionId, agentDir = 'main') {
  return extractTerminalAssistantReplyFromEntries(
    readJsonlTailEntries(sessionId, agentDir, 200),
  );
}
|
|
601
|
+
|
|
602
|
+
/**
 * Build the "last assistant report" suffix appended to failure notices.
 *
 * @param {*} reply - Candidate reply text; anything that is not a non-blank
 *   string produces an empty suffix.
 * @returns {string} '' when there is nothing to show, otherwise a block that
 *   starts with two newlines and contains the trimmed reply, clipped to
 *   1200 characters with a truncation marker.
 */
function formatDiagnosticSnippet(reply) {
  const snippetLimit = 1200;

  const text = typeof reply === 'string' ? reply.trim() : '';
  if (text === '') {
    return '';
  }

  let shown = text;
  if (text.length > snippetLimit) {
    shown = text.slice(0, snippetLimit) + '\n\n..[truncated]';
  }

  return `\n\nLast assistant report observed:\n${shown}`;
}
|
|
614
|
+
|
|
589
615
|
/**
|
|
590
616
|
* Check if a session is currently mid-turn by inspecting its JSONL tail.
|
|
591
617
|
* Returns a reason string if mid-turn is detected, null if safe to proceed.
|
|
@@ -658,6 +684,112 @@ function getJsonlMidTurnReason(sessionId, agentDir = 'main') {
|
|
|
658
684
|
return null; // Last assistant entry appears to be a complete text reply -- safe to proceed
|
|
659
685
|
}
|
|
660
686
|
|
|
687
|
+
/**
 * Inspect the newest JSONL entry for an unfinished tool handoff, regardless
 * of how recently the transcript file was touched. A long-running tool call
 * can leave the transcript unchanged for minutes, so a stale mtime by itself
 * does not prove the agent is stuck.
 *
 * @param {string} sessionId - Internal session UUID
 * @param {string} agentDir - Agent directory (default: 'main')
 * @returns {string|null} reason string if a tool handoff appears pending
 */
function getJsonlPendingToolReason(sessionId, agentDir = 'main') {
  const tail = readJsonlLastLines(sessionId, agentDir, 3);
  if (!tail || tail.length === 0) return null;

  const newest = tail[tail.length - 1];
  const blocks = Array.isArray(newest?.content) ? newest.content : [];
  const awaitingReply = 'last entry is tool_result (tool executed, awaiting assistant reply)';

  // Assistant asked for a tool: either as a content block or as the entry itself.
  if (newest?.role === 'assistant') {
    const pendingCall = blocks.find((block) => block?.type === 'tool_use');
    if (pendingCall) {
      return `last assistant entry has tool_use (${pendingCall.name || 'unknown'}) -- awaiting tool result`;
    }
    if (newest.type === 'tool_use') {
      return `last entry is tool_use (${newest.name || 'unknown'}) -- awaiting tool result`;
    }
  }

  // Tool result delivered as a user-role content block.
  const userCarriesResult =
    newest?.role === 'user' && blocks.some((block) => block?.type === 'tool_result');
  if (userCarriesResult) {
    return awaitingReply;
  }

  // Tool result delivered as a bare entry.
  if (newest?.type === 'tool_result') {
    return awaitingReply;
  }

  return null;
}
|
|
726
|
+
|
|
727
|
+
/**
 * Normalize a timestamp-like value to epoch milliseconds.
 *
 * Accepts a finite number, a Date, or any value Date.parse understands
 * (e.g. an ISO-8601 string). Session-store timestamps such as `updatedAt`
 * may be recorded in seconds or in milliseconds depending on the agent
 * framework version, so a numeric value whose magnitude is below 1e12 is
 * treated as epoch seconds and scaled to milliseconds. (1e12 ms is
 * September 2001; present-day epoch seconds are around 1.7e9.) Without this,
 * a seconds-valued timestamp looks decades old and never counts as recent
 * activity.
 *
 * @param {number|string|Date|null|undefined} value - Raw timestamp.
 * @returns {number|null} Epoch milliseconds, or null when the value is
 *   falsy (including 0 and ''), non-finite, or unparseable.
 */
function parseTimestampMs(value) {
  // Numbers smaller than this cannot be a plausible epoch-millisecond stamp.
  const EPOCH_SECONDS_CUTOFF = 1e12;

  if (!value) return null;

  if (typeof value === 'number') {
    if (!Number.isFinite(value)) return null;
    return Math.abs(value) < EPOCH_SECONDS_CUTOFF ? value * 1000 : value;
  }

  if (value instanceof Date) {
    const timestamp = value.getTime();
    return Number.isFinite(timestamp) ? timestamp : null;
  }

  const parsed = Date.parse(value);
  return Number.isFinite(parsed) ? parsed : null;
}
|
|
739
|
+
|
|
740
|
+
/**
 * Decide whether a label that still reports "running" has in fact stopped
 * making progress, even though the watcher process is alive and keeps
 * writing lastPing.
 *
 * Covers the case where OpenClaw's Codex app-server retires a timed-out
 * turn while dispatch status continues to say "running" because the delivery
 * watcher is still polling.
 *
 * Activity evidence comes from the session-store entry timestamps
 * (updatedAt, lastActivityAt, sessionStartedAt, startedAt) and the session
 * JSONL mtime. A pending tool handoff in the transcript suppresses the stall
 * verdict and is logged to stderr instead.
 *
 * @param {object} status - Result of the dispatch `status` command.
 * @param {number} thresholdMs - Idle time after which the session counts as stalled.
 * @returns {string|null} Human-readable stall reason, or null when the session
 *   is not considered stalled (or cannot be evaluated).
 */
function getRunningSessionStallReason(status, thresholdMs) {
  const sessionKey = status?.sessionKey;
  if (!sessionKey) return null;

  const agentDir = sessionKey.split(':')[1] || 'main';
  const storeEntry = getSessionStoreEntry(sessionKey);
  if (!storeEntry) return null;

  const internalId = storeEntry.sessionId || null;
  const checkedAt = Date.now();

  // Collect every usable activity timestamp (epoch ms).
  const signals = [];
  for (const field of ['updatedAt', 'lastActivityAt', 'sessionStartedAt', 'startedAt']) {
    const stamp = parseTimestampMs(storeEntry[field]);
    if (typeof stamp === 'number') signals.push(stamp);
  }

  const transcriptMtime = internalId ? getSessionJsonlMtime(internalId, agentDir) : null;
  if (typeof transcriptMtime === 'number') signals.push(transcriptMtime);

  // The status command's own liveness age still being under the threshold wins outright.
  const reportedAgeMs = status?.liveness?.ageMs;
  if (typeof reportedAgeMs === 'number' && reportedAgeMs < thresholdMs) {
    return null;
  }

  const newestSignal = signals.length > 0 ? Math.max(...signals) : null;
  if (newestSignal !== null && checkedAt - newestSignal < thresholdMs) {
    return null;
  }

  // Quiet telemetry is expected while a tool call is outstanding.
  const pendingTool = internalId ? getJsonlPendingToolReason(internalId, agentDir) : null;
  if (pendingTool) {
    process.stderr.write(
      `[watcher] ${status.label || 'session'} stale telemetry but pending tool handoff detected: ${pendingTool}\n`
    );
    return null;
  }

  // With no timestamps at all, report the threshold itself as the idle time.
  let idleMinutes;
  if (newestSignal === null) {
    idleMinutes = Math.ceil(thresholdMs / 60000);
  } else {
    idleMinutes = Math.max(1, Math.floor((checkedAt - newestSignal) / 60000));
  }

  return (
    `agent session stalled: no session/jsonl activity for ~${idleMinutes}min ` +
    `while delivery watcher remained alive; likely app-server turn retired or stopped producing events`
  );
}
|
|
792
|
+
|
|
661
793
|
/**
|
|
662
794
|
* Read the last assistant entry's stop_reason from the session JSONL.
|
|
663
795
|
* Returns the stop_reason string (e.g. 'end_turn', 'tool_use') or null if unavailable.
|
|
@@ -728,6 +860,7 @@ function markLabelError(label, errorSummary) {
|
|
|
728
860
|
updateExistingLabel(label, (entry) => {
|
|
729
861
|
if (entry.status === 'done') return false;
|
|
730
862
|
entry.status = 'error';
|
|
863
|
+
entry.error = errorSummary || 'failed without result';
|
|
731
864
|
entry.summary = errorSummary || 'failed without result';
|
|
732
865
|
});
|
|
733
866
|
} catch (e) {
|
|
@@ -735,6 +868,8 @@ function markLabelError(label, errorSummary) {
|
|
|
735
868
|
}
|
|
736
869
|
}
|
|
737
870
|
|
|
871
|
+
let exitZeroOnTerminal = false;
|
|
872
|
+
|
|
738
873
|
/**
|
|
739
874
|
* Format and output the delivery message, then exit 0.
|
|
740
875
|
* Also marks the label as done in labels.json before exiting.
|
|
@@ -768,7 +903,7 @@ function deliverResult(label, lastReply, fallbackSummary, completionPayload = nu
|
|
|
768
903
|
`**Error:** ${stderr || 'non-zero exit'}\n\n` +
|
|
769
904
|
`Job marked as \`error\`. The agent may have reported done without completing the actual work.\n`
|
|
770
905
|
);
|
|
771
|
-
process.exit(1);
|
|
906
|
+
process.exit(exitZeroOnTerminal ? 0 : 1);
|
|
772
907
|
}
|
|
773
908
|
}
|
|
774
909
|
} catch (loadErr) {
|
|
@@ -790,10 +925,32 @@ function deliverResult(label, lastReply, fallbackSummary, completionPayload = nu
|
|
|
790
925
|
? completion.deliveryText.slice(0, maxLen) + '\n\n..[truncated]'
|
|
791
926
|
: completion.deliveryText;
|
|
792
927
|
process.stdout.write(`🌶️ *dispatch* [${label}] completed:\n\n${reply}\n`);
|
|
793
|
-
|
|
794
|
-
process.stderr.write(`[watcher] [${label}] completion delivery suppressed (no meaningful reply or summary)\n`);
|
|
928
|
+
process.exit(0);
|
|
795
929
|
}
|
|
796
|
-
|
|
930
|
+
|
|
931
|
+
const failureSummary = 'completed without a clean user-facing completion';
|
|
932
|
+
process.stderr.write(`[watcher] [${label}] completion delivery suppressed (no meaningful reply or summary)\n`);
|
|
933
|
+
markLabelError(label, failureSummary);
|
|
934
|
+
process.stdout.write(
|
|
935
|
+
`⚠️ dispatch [${label}] completed, but no clean user-facing completion was captured. ` +
|
|
936
|
+
`Internal diagnostics were suppressed; check scheduler run logs for details.\n`
|
|
937
|
+
);
|
|
938
|
+
process.exit(exitZeroOnTerminal ? 0 : 1);
|
|
939
|
+
}
|
|
940
|
+
|
|
941
|
+
/**
 * Report a session that was auto-resolved as interrupted, then exit.
 *
 * Logs to stderr, records the label as errored, writes a user-facing warning
 * (with the last observed assistant reply, when one is available) to stdout,
 * and terminates the process. The exit code is 0 when exitZeroOnTerminal is
 * set and 1 otherwise.
 *
 * @param {string} label - Dispatch label being watched.
 * @param {string} [summary] - Error summary; a default is used when falsy.
 * @param {object|null} [result=null] - Result payload; its diagnosticReply
 *   (preferred) or lastReply is echoed as a diagnostic snippet.
 */
function emitInterruptedOutcome(label, summary, result = null) {
  const fallbackSummary = 'interrupted: session went idle without calling done';

  process.stderr.write(`[watcher] [${label}] session auto-resolved as interrupted -- work may be incomplete\n`);
  markLabelError(label, summary || fallbackSummary);

  const observedReply = result?.diagnosticReply || result?.lastReply || null;
  const notice = `⚠️ dispatch [${label}] session went idle before completing -- work may be incomplete`;
  process.stdout.write(`${notice}${formatDiagnosticSnippet(observedReply)}\n`);

  const exitCode = exitZeroOnTerminal ? 0 : 1;
  process.exit(exitCode);
}
|
|
950
|
+
|
|
951
|
+
/**
 * Print a timeout notice (plus the last observed assistant reply, if any)
 * to stdout and exit. The exit code is 0 when exitZeroOnTerminal is set and
 * 1 otherwise. The label is not marked here; visible callers do that first.
 *
 * @param {string} label - Dispatch label (not used inside this function).
 * @param {string} message - Fully formatted notice text.
 * @param {object|null} [result=null] - Result payload; its diagnosticReply
 *   (preferred) or lastReply is echoed as a diagnostic snippet.
 */
function emitTimeoutOutcome(label, message, result = null) {
  const observedReply = result?.diagnosticReply || result?.lastReply || null;
  const snippet = formatDiagnosticSnippet(observedReply);

  process.stdout.write(`${message}${snippet}\n`);
  process.exit(exitZeroOnTerminal ? 0 : 1);
}
|
|
798
955
|
|
|
799
956
|
// -- Watcher heartbeat interval ref --------------------------------------
|
|
@@ -828,6 +985,8 @@ const flags = parseFlags(process.argv.slice(2));
|
|
|
828
985
|
const label = flags.label;
|
|
829
986
|
const timeoutS = parseInt(flags.timeout || '600', 10);
|
|
830
987
|
const pollS = parseInt(flags['poll-interval'] || '20', 10);
|
|
988
|
+
const once = flags.once === true || flags.once === 'true';
|
|
989
|
+
exitZeroOnTerminal = once;
|
|
831
990
|
|
|
832
991
|
// How long a session must be idle before we proactively check result
|
|
833
992
|
const IDLE_RESULT_CHECK_MS = 60000;
|
|
@@ -837,6 +996,144 @@ if (!label) {
|
|
|
837
996
|
process.exit(2);
|
|
838
997
|
}
|
|
839
998
|
|
|
999
|
+
/**
 * Refresh the watcher heartbeat (lastPing) on a label that is still running.
 *
 * The updater returns false for any other status so that
 * updateExistingLabel is told nothing changed.
 *
 * @param {string} label - Dispatch label to touch.
 */
function touchWatcherPing(label) {
  const stampIfRunning = (record) => {
    if (record.status === 'running') {
      record.lastPing = new Date().toISOString();
      return undefined;
    }
    return false;
  };

  updateExistingLabel(label, stampIfRunning);
}
|
|
1005
|
+
|
|
1006
|
+
/**
 * End a single-shot watcher tick without a terminal outcome.
 *
 * Emits a WATCHER_PENDING marker on stderr and exits with code 0.
 *
 * @param {string} label - Dispatch label being watched.
 * @param {string} [reason='target still running'] - Why the tick is pending.
 */
function markWatcherPending(label, reason = 'target still running') {
  const marker = `[watcher] WATCHER_PENDING label=${label} reason=${reason}\n`;
  process.stderr.write(marker);
  process.exit(0);
}
|
|
1010
|
+
|
|
1011
|
+
/**
 * Remove the watcherRetryAfter marker from a label, if it has one.
 *
 * The updater returns false when there is nothing to remove so that
 * updateExistingLabel is told nothing changed.
 *
 * @param {string} label - Dispatch label to update.
 */
function clearWatcherRetryAfter(label) {
  const dropRetryMarker = (record) => {
    if (record.watcherRetryAfter) {
      delete record.watcherRetryAfter;
      return undefined;
    }
    return false;
  };

  updateExistingLabel(label, dropRetryMarker);
}
|
|
1017
|
+
|
|
1018
|
+
/**
 * Handle a 529 (overload) error during a single-shot watcher tick.
 *
 * Instead of sleeping through the backoff, the retry time is stored on the
 * label as watcherRetryAfter and the tick ends as pending; a later tick
 * respawns the session once that time has passed. Every path ends the
 * process (directly or through markWatcherPending).
 *
 * @param {string} label - Dispatch label being watched.
 * @param {string} errorMsg - Error text reported by the status command.
 */
function handleOnce529(label, errorMsg) {
  const record = loadLabels()[label] || {};
  const attemptsSoFar = getRetryCount(label);

  // Retry budget exhausted: report the failure and stop.
  if (attemptsSoFar >= MAX_529_RETRIES) {
    markLabelError(label, `max_retries_exceeded (${attemptsSoFar}x 529): ${errorMsg}`);
    process.stdout.write(
      `🌶️ *dispatch* [${label}] failed after ${MAX_529_RETRIES} retries (529 overload)\n` +
      `Error: ${errorMsg}\n`
    );
    process.exit(0);
  }

  const resumeAtMs = parseTimestampMs(record.watcherRetryAfter);
  if (!resumeAtMs) {
    // No backoff scheduled yet: register a retry and park until a later tick.
    const plan = attempt529Retry(label, attemptsSoFar, errorMsg);
    // NOTE(review): this re-entry only terminates if attempt529Retry changed the
    // stored retry count (or exited) -- confirm against its implementation.
    if (!plan.retry) return handleOnce529(label, errorMsg);
    updateExistingLabel(label, (current) => {
      current.watcherRetryAfter = new Date(Date.now() + plan.delayMs).toISOString();
    });
    markWatcherPending(label, `529 retry scheduled for future tick (${plan.delayMs / 1000}s)`);
  }

  // Backoff still running: nothing to do on this tick.
  if (Date.now() < resumeAtMs) {
    markWatcherPending(label, '529 retry backoff active');
  }

  // Backoff elapsed: try to start the session again.
  const respawned = respawnSession(label);
  if (respawned) {
    clearWatcherRetryAfter(label);
    markWatcherPending(label, '529 retry dispatched');
  }

  markLabelError(label, `529 retry failed -- could not respawn session: ${errorMsg}`);
  process.stdout.write(
    `🌶️ *dispatch* [${label}] 529 retry failed -- could not respawn session\n` +
    `Error: ${errorMsg}\n`
  );
  process.exit(0);
}
|
|
1058
|
+
|
|
1059
|
+
/**
 * Single-shot watcher tick (--once): inspect the label one time, emit a
 * terminal outcome if there is one, otherwise report WATCHER_PENDING.
 *
 * The flow relies on the outcome helpers terminating the process:
 * markWatcherPending, handleOnce529 and the emit* helpers call process.exit,
 * and deliverResult is documented in this file as exiting after delivery.
 * Statement order therefore matters; each step runs only if the previous
 * one did not end the process.
 */
function runOnceAndExit() {
  // Fresh argument array per call so no dispatch invocation shares state.
  const labelArgs = () => ['--label', label];

  try {
    touchWatcherPing(label);
  } catch {
    // Best-effort -- a quick-poll tick must not fail because heartbeat metadata raced.
  }

  const snapshot = dispatch('status', labelArgs());
  if (!snapshot?.ok) {
    markWatcherPending(label, 'status unavailable');
  }

  const state = snapshot.status;

  // 529 overload errors get their own tick-based retry handling.
  if (state === 'error') {
    const failureText = snapshot.error || snapshot.summary || '';
    if (is529Error(failureText)) {
      handleOnce529(label, failureText);
    }
  }

  // Terminal states: done, interrupted, or any other non-running status.
  if (state !== 'running') {
    const finalResult = dispatch('result', labelArgs());
    const finalCompletion = finalResult?.completion || snapshot?.completion || null;

    if (state === 'done') {
      // Successful completion clears both retry counters.
      if (getRetryCount(label) > 0) setRetryCount(label, 0);
      if (getGwRestartRetryCount(label) > 0) setGwRestartRetryCount(label, 0);
      deliverResult(label, finalResult?.lastReply, snapshot.summary, finalCompletion);
    }

    if (state === 'interrupted') {
      emitInterruptedOutcome(label, snapshot.summary, finalResult);
    }

    const failureSummary = snapshot.error || snapshot.summary || `terminal failure (${state || 'unknown'})`;
    markLabelError(label, failureSummary);
    process.stdout.write(`🌶️ *dispatch* [${label}] failed\nSummary: ${failureSummary}\n`);
    process.exit(0);
  }

  // Still "running": deliver early if the transcript shows a clean finish.
  const { sessionKey } = snapshot;
  if (sessionKey) {
    const storeEntry = getSessionStoreEntry(sessionKey);
    const internalId = storeEntry?.sessionId || null;
    const agentDir = sessionKey.split(':')[1] || 'main';
    const jsonlReply = internalId ? getSessionTerminalReply(internalId, agentDir) : null;
    if (internalId && jsonlReply && isSessionCleanlyFinished(internalId, agentDir)) {
      const earlyResult = dispatch('result', labelArgs());
      deliverResult(
        label,
        earlyResult?.lastReply || jsonlReply,
        'completed (stop_reason=end_turn)',
        earlyResult?.completion || null
      );
    }
  }

  // Idle long enough: look for a captured result, then for a stalled session.
  const idleForMs = snapshot.liveness?.ageMs;
  if (idleForMs != null && idleForMs >= IDLE_RESULT_CHECK_MS) {
    const idleResult = dispatch('result', labelArgs());
    if (idleResult?.lastReply || hasCompletionSignal(idleResult?.completion)) {
      deliverResult(label, idleResult?.lastReply || null, null, idleResult?.completion || null);
    }

    const stallReason = getRunningSessionStallReason(snapshot, IDLE_RESULT_CHECK_MS);
    if (stallReason) {
      process.stderr.write(`[watcher] [${label}] ${stallReason}\n`);
      markLabelError(label, stallReason);
      process.stdout.write(
        `❌ *dispatch* [${label}] failed\n` +
        `Summary: ${stallReason}\n`
      );
      process.exit(0);
    }
  }

  markWatcherPending(label);
}
|
|
1132
|
+
|
|
1133
|
+
if (once) {
|
|
1134
|
+
runOnceAndExit();
|
|
1135
|
+
}
|
|
1136
|
+
|
|
840
1137
|
// -- Start heartbeat -----------------------------------------------------
|
|
841
1138
|
// Write lastPing to labels.json every PING_INTERVAL_MS while the session is
|
|
842
1139
|
// still running. The watchdog guard in index.mjs reads lastPing to know this
|
|
@@ -870,17 +1167,47 @@ let lastKnownReply = null;
|
|
|
870
1167
|
let lastKnownCompletion = null;
|
|
871
1168
|
|
|
872
1169
|
// -- SIGTERM handler (scheduler kills watcher with SIGTERM before SIGKILL) --
|
|
873
|
-
//
|
|
1170
|
+
// Hand off to a fresh watcher instead of converting the kill into a fake success.
|
|
874
1171
|
process.on('SIGTERM', () => {
  process.stderr.write(`[watcher] SIGTERM received for ${label} -- attempting watcher handoff\n`);

  // Best-effort status snapshot; a failure here must not prevent the handoff.
  let statusSnapshot = null;
  try {
    statusSnapshot = dispatch('status', ['--label', label]);
  } catch {}

  // Best-effort refresh of the last known reply/completion before deciding.
  try {
    const latest = dispatch('result', ['--label', label]);
    if (latest?.lastReply) lastKnownReply = latest.lastReply;
    if (latest?.completion) lastKnownCompletion = latest.completion;
  } catch {}

  const state = statusSnapshot?.status;

  // Already finished: deliver normally (deliverResult exits).
  if (state === 'done') {
    deliverResult(
      label,
      lastKnownReply,
      statusSnapshot.summary || null,
      lastKnownCompletion || statusSnapshot?.completion || null
    );
  }

  if (state === 'interrupted') {
    markLabelError(label, statusSnapshot.summary || 'interrupted: session went idle without calling done');
    process.exit(1);
  }

  // Any other non-running status is a terminal failure.
  if (state && state !== 'running') {
    const summary = statusSnapshot.error || statusSnapshot.summary || `terminal failure (${state})`;
    markLabelError(label, summary);
    process.stdout.write(`🌶️ *dispatch* [${label}] failed\nSummary: ${summary}\n`);
    process.exit(1);
  }

  // Still running (or status unknown): ask dispatch to schedule a replacement watcher.
  const handoff = dispatch('watcher-handoff', ['--label', label, '--reason', 'sigterm']);
  const benignSkip =
    handoff?.reason === 'label already terminal' ||
    handoff?.reason === 'delivery disabled for this label';
  if (handoff?.ok && (handoff.scheduled || benignSkip)) {
    process.stderr.write(`[watcher] SIGTERM handoff ${handoff.scheduled ? 'scheduled' : 'skipped'} for ${label}\n`);
    process.exit(0);
  }

  const failureSummary = 'interrupted by watcher timeout (handoff failed)';
  markLabelError(label, failureSummary);
  process.stdout.write(`⚠️ dispatch [${label}] watcher interrupted and handoff failed\nSummary: ${failureSummary}\n`);
  process.exit(1);
});
|
|
885
1212
|
|
|
886
1213
|
// -- Rolling deadline vars ------------------------------------
|
|
@@ -1024,11 +1351,21 @@ while (Date.now() < deadline) {
|
|
|
1024
1351
|
|
|
1025
1352
|
// -- Path 1: status auto-resolved to done ------------------
|
|
1026
1353
|
if (status.status !== 'running') {
|
|
1354
|
+
const terminalResult = dispatch('result', ['--label', label]);
|
|
1355
|
+
const terminalCompletion = terminalResult?.completion || status?.completion || null;
|
|
1356
|
+
const hasTerminalCompletionEvidence = Boolean(
|
|
1357
|
+
terminalResult?.lastReply
|
|
1358
|
+
|| terminalResult?.completion?.deliveryText
|
|
1359
|
+
|| terminalResult?.completion?.summary
|
|
1360
|
+
|| status?.completion?.deliveryText
|
|
1361
|
+
|| status?.completion?.summary
|
|
1362
|
+
);
|
|
1363
|
+
|
|
1027
1364
|
// -- Spawn failure detection -----------------------------------------
|
|
1028
1365
|
// If the session was auto-resolved to 'done' (or 'spawn-warning') but was
|
|
1029
|
-
// never seen in the gateway, it never ran --
|
|
1030
|
-
//
|
|
1031
|
-
if (!sessionEverFound && (status.status === '
|
|
1366
|
+
// never seen in the gateway, it never ran -- unless a terminal completion
|
|
1367
|
+
// payload/reply proves the work already finished before this watcher saw it.
|
|
1368
|
+
if (!sessionEverFound && (status.status === 'spawn-warning' || status.status === 'error' || (status.status === 'done' && !hasTerminalCompletionEvidence))) {
|
|
1032
1369
|
const spawnErrMsg =
|
|
1033
1370
|
`[dispatch] SPAWN FAILURE: session ${status.sessionKey || '(unknown)'} never appeared ` +
|
|
1034
1371
|
`in gateway -- spawn likely failed (auth timeout, quota, or gateway error). Label: ${label}`;
|
|
@@ -1055,7 +1392,7 @@ while (Date.now() < deadline) {
|
|
|
1055
1392
|
// If the session DID produce a lastReply before being killed, deliver it normally.
|
|
1056
1393
|
if (sessionEverFound && isGatewayRestartKill(status.summary)) {
|
|
1057
1394
|
const gwCheckResult = dispatch('result', ['--label', label]);
|
|
1058
|
-
if (!gwCheckResult?.lastReply && !gwCheckResult?.completion
|
|
1395
|
+
if (!gwCheckResult?.lastReply && !hasCompletionSignal(gwCheckResult?.completion)) {
|
|
1059
1396
|
// No result captured -- session was killed before completing
|
|
1060
1397
|
const retryCount = getGwRestartRetryCount(label);
|
|
1061
1398
|
if (retryCount >= MAX_GW_RESTART_RETRIES) {
|
|
@@ -1113,12 +1450,8 @@ while (Date.now() < deadline) {
|
|
|
1113
1450
|
//
|
|
1114
1451
|
// NOTE: Always resolve as 'interrupted', never 'done'. Only agent-side cmdDone may set status=done.
|
|
1115
1452
|
if (status.status === 'interrupted') {
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
`⚠️ dispatch [${label}] session went idle before completing -- work may be incomplete\n`
|
|
1119
|
-
);
|
|
1120
|
-
markLabelError(label, status.summary || 'interrupted: session went idle without calling done');
|
|
1121
|
-
process.exit(1);
|
|
1453
|
+
const interruptedResult = dispatch('result', ['--label', label]);
|
|
1454
|
+
emitInterruptedOutcome(label, status.summary, interruptedResult);
|
|
1122
1455
|
}
|
|
1123
1456
|
|
|
1124
1457
|
// Reset 529 retryCount on successful completion
|
|
@@ -1129,8 +1462,7 @@ while (Date.now() < deadline) {
|
|
|
1129
1462
|
process.stderr.write(`[watcher] [${label}] completed after ${currentRetryCount} retry(ies), reset retryCount\n`);
|
|
1130
1463
|
}
|
|
1131
1464
|
}
|
|
1132
|
-
|
|
1133
|
-
deliverResult(label, result?.lastReply, status.summary, result?.completion || status?.completion || null);
|
|
1465
|
+
deliverResult(label, terminalResult?.lastReply, status.summary, terminalCompletion);
|
|
1134
1466
|
}
|
|
1135
1467
|
|
|
1136
1468
|
// -- Path 2a: stop_reason early delivery (clean end_turn) --
|
|
@@ -1141,10 +1473,11 @@ while (Date.now() < deadline) {
|
|
|
1141
1473
|
const _e2a = getSessionStoreEntry(status.sessionKey);
|
|
1142
1474
|
const _sid2a = _e2a?.sessionId || null;
|
|
1143
1475
|
const _adir2a = (status.sessionKey.split(':')[1]) || 'main';
|
|
1144
|
-
|
|
1476
|
+
const terminalJsonlReply = _sid2a ? getSessionTerminalReply(_sid2a, _adir2a) : null;
|
|
1477
|
+
if (_sid2a && terminalJsonlReply && isSessionCleanlyFinished(_sid2a, _adir2a)) {
|
|
1145
1478
|
process.stderr.write(`[watcher] stop_reason=end_turn detected -- delivering early\n`);
|
|
1146
1479
|
const result = dispatch('result', ['--label', label]);
|
|
1147
|
-
deliverResult(label, result?.lastReply, 'completed (stop_reason=end_turn)', result?.completion || null);
|
|
1480
|
+
deliverResult(label, result?.lastReply || terminalJsonlReply, 'completed (stop_reason=end_turn)', result?.completion || null);
|
|
1148
1481
|
// deliverResult exits
|
|
1149
1482
|
}
|
|
1150
1483
|
}
|
|
@@ -1158,9 +1491,20 @@ while (Date.now() < deadline) {
|
|
|
1158
1491
|
const ageMs = status.liveness?.ageMs;
|
|
1159
1492
|
if (ageMs != null && ageMs >= IDLE_RESULT_CHECK_MS) {
|
|
1160
1493
|
const result = dispatch('result', ['--label', label]);
|
|
1161
|
-
if (result?.lastReply || result?.completion
|
|
1494
|
+
if (result?.lastReply || hasCompletionSignal(result?.completion)) {
|
|
1162
1495
|
deliverResult(label, result?.lastReply || null, null, result?.completion || null);
|
|
1163
1496
|
}
|
|
1497
|
+
|
|
1498
|
+
const stallReason = getRunningSessionStallReason(status, IDLE_RESULT_CHECK_MS);
|
|
1499
|
+
if (stallReason) {
|
|
1500
|
+
process.stderr.write(`[watcher] [${label}] ${stallReason}\n`);
|
|
1501
|
+
markLabelError(label, stallReason);
|
|
1502
|
+
process.stdout.write(
|
|
1503
|
+
`❌ *dispatch* [${label}] failed\n` +
|
|
1504
|
+
`Summary: ${stallReason}\n`
|
|
1505
|
+
);
|
|
1506
|
+
process.exit(1);
|
|
1507
|
+
}
|
|
1164
1508
|
}
|
|
1165
1509
|
|
|
1166
1510
|
|
|
@@ -1183,11 +1527,7 @@ if (finalStatus?.status === 'done') {
|
|
|
1183
1527
|
// If status is interrupted (auto-resolved as incomplete), exit non-zero
|
|
1184
1528
|
if (finalStatus?.status === 'interrupted') {
|
|
1185
1529
|
process.stderr.write(`[watcher] [${label}] final status=interrupted -- session idle without completion\n`);
|
|
1186
|
-
|
|
1187
|
-
`⚠️ dispatch [${label}] session went idle before completing -- work may be incomplete\n`
|
|
1188
|
-
);
|
|
1189
|
-
markLabelError(label, finalStatus?.summary || 'interrupted: session went idle without calling done');
|
|
1190
|
-
process.exit(1);
|
|
1530
|
+
emitInterruptedOutcome(label, finalStatus?.summary, finalResult);
|
|
1191
1531
|
}
|
|
1192
1532
|
|
|
1193
1533
|
// -- Token-based activity check before steering ----------------------------
|
|
@@ -1237,7 +1577,7 @@ if (sessionInternalId) {
|
|
|
1237
1577
|
// If the session already completed (gateway pruned it -> null tokens), exit cleanly.
|
|
1238
1578
|
if (statusAtDeadline?.status === 'done' || baselineTokens === null) {
|
|
1239
1579
|
const r = dispatch('result', ['--label', label]);
|
|
1240
|
-
if (r?.lastReply || r?.completion
|
|
1580
|
+
if (r?.lastReply || hasCompletionSignal(r?.completion)) {
|
|
1241
1581
|
// deliverResult calls process.exit(0) internally
|
|
1242
1582
|
deliverResult(label, r?.lastReply || null, statusAtDeadline?.summary || null, r?.completion || null);
|
|
1243
1583
|
}
|
|
@@ -1255,8 +1595,7 @@ if (statusAtDeadline?.status === 'done' || baselineTokens === null) {
|
|
|
1255
1595
|
// Session truly not found -- telemetry unavailable, exit
|
|
1256
1596
|
process.stderr.write(`[watcher] token telemetry unavailable for ${label}; session not in store\n`);
|
|
1257
1597
|
markLabelError(label, `timed out after ${timeoutS}s -- token telemetry unavailable`);
|
|
1258
|
-
|
|
1259
|
-
process.exit(1);
|
|
1598
|
+
emitTimeoutOutcome(label, `⏱ dispatch [${label}] timed out after ${timeoutS}s -- token telemetry unavailable; no steer/kill attempted`, r);
|
|
1260
1599
|
}
|
|
1261
1600
|
// Session IS in store but no tokens -- mid-tool-call, fall through to activity window
|
|
1262
1601
|
// Use updatedAt as activity signal instead of tokens
|
|
@@ -1277,7 +1616,7 @@ while (Date.now() - flatSince < FLAT_WINDOW_MS) {
|
|
|
1277
1616
|
deliverResult(label, r?.lastReply || null, st.summary, r?.completion || st?.completion || null);
|
|
1278
1617
|
}
|
|
1279
1618
|
const r2 = dispatch('result', ['--label', label]);
|
|
1280
|
-
if (r2?.lastReply || r2?.completion
|
|
1619
|
+
if (r2?.lastReply || hasCompletionSignal(r2?.completion)) {
|
|
1281
1620
|
// deliverResult calls process.exit(0) internally
|
|
1282
1621
|
deliverResult(label, r2?.lastReply || null, null, r2?.completion || null);
|
|
1283
1622
|
}
|
|
@@ -1290,8 +1629,8 @@ while (Date.now() - flatSince < FLAT_WINDOW_MS) {
|
|
|
1290
1629
|
if (!entry) {
|
|
1291
1630
|
process.stderr.write(`[watcher] token telemetry lost for ${label}; session gone from store\n`);
|
|
1292
1631
|
markLabelError(label, `timed out after ${timeoutS}s -- token telemetry lost`);
|
|
1293
|
-
|
|
1294
|
-
|
|
1632
|
+
const tokenLostResult = dispatch('result', ['--label', label]);
|
|
1633
|
+
emitTimeoutOutcome(label, `⏱ dispatch [${label}] timed out after ${timeoutS}s -- token telemetry lost; no steer/kill attempted`, tokenLostResult);
|
|
1295
1634
|
}
|
|
1296
1635
|
// Still in store -- check if updatedAt advanced (tool call still running)
|
|
1297
1636
|
// Normalize: updatedAt may be seconds or milliseconds depending on agent framework version
|
|
@@ -1371,7 +1710,7 @@ if (sessionInternalId) {
|
|
|
1371
1710
|
deliverResult(label, rExt?.lastReply || null, stExt.summary, rExt?.completion || stExt?.completion || null);
|
|
1372
1711
|
}
|
|
1373
1712
|
const rExt2 = dispatch('result', ['--label', label]);
|
|
1374
|
-
if (rExt2?.lastReply || rExt2?.completion
|
|
1713
|
+
if (rExt2?.lastReply || hasCompletionSignal(rExt2?.completion)) {
|
|
1375
1714
|
// deliverResult calls process.exit(0) internally
|
|
1376
1715
|
deliverResult(label, rExt2?.lastReply || null, null, rExt2?.completion || null);
|
|
1377
1716
|
}
|
|
@@ -1428,7 +1767,7 @@ for (const round of steerRounds) {
|
|
|
1428
1767
|
deliverResult(label, r3?.lastReply || null, st2.summary, r3?.completion || st2?.completion || null);
|
|
1429
1768
|
}
|
|
1430
1769
|
const r3 = dispatch('result', ['--label', label]);
|
|
1431
|
-
if (r3?.lastReply || r3?.completion
|
|
1770
|
+
if (r3?.lastReply || hasCompletionSignal(r3?.completion)) {
|
|
1432
1771
|
// deliverResult calls process.exit(0) internally
|
|
1433
1772
|
deliverResult(label, r3?.lastReply || null, null, r3?.completion || null);
|
|
1434
1773
|
}
|
|
@@ -1443,17 +1782,16 @@ for (const round of steerRounds) {
|
|
|
1443
1782
|
if (st3?.status === 'done') {
|
|
1444
1783
|
// Check if a result was captured before marking as error
|
|
1445
1784
|
const r4 = dispatch('result', ['--label', label]);
|
|
1446
|
-
if (r4?.lastReply || r4?.completion
|
|
1785
|
+
if (r4?.lastReply || hasCompletionSignal(r4?.completion)) {
|
|
1447
1786
|
deliverResult(label, r4?.lastReply || null, st3.summary, r4?.completion || st3?.completion || null); // deliverResult calls process.exit(0)
|
|
1448
1787
|
}
|
|
1449
1788
|
markLabelError(label, 'timed out -- killed after steer attempts (no result captured)');
|
|
1450
|
-
|
|
1451
|
-
process.exit(1);
|
|
1789
|
+
emitTimeoutOutcome(label, `⏱ dispatch [${label}] killed after steer attempts -- no result captured`, r4);
|
|
1452
1790
|
}
|
|
1453
1791
|
}
|
|
1454
1792
|
}
|
|
1455
1793
|
}
|
|
1456
1794
|
|
|
1457
1795
|
markLabelError(label, `timed out after ${timeoutS}s -- killed after steer attempts`);
|
|
1458
|
-
|
|
1459
|
-
|
|
1796
|
+
const timeoutResult = dispatch('result', ['--label', label]);
|
|
1797
|
+
emitTimeoutOutcome(label, `⏱ dispatch [${label}] timed out after ${timeoutS}s -- session killed after steer attempts`, timeoutResult);
|