npm - openclaw-scheduler - Versions diffs - 0.2.5 → 0.2.7 - Mend

openclaw-scheduler 0.2.5 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

package/dispatch/README.md +16 -2
package/dispatch/completion.mjs +297 -20
package/dispatch/index.mjs +80 -57
package/dispatch/liveness.mjs +61 -0
package/dispatch/watcher.mjs +299 -17
package/dispatcher-strategies.js +82 -10
package/dispatcher.js +6 -1
package/gateway.js +39 -0
package/package.json +2 -1

package/dispatch/watcher.mjs CHANGED Viewed

@@ -36,6 +36,7 @@ import {
   hasCompletionSignal,
   resolveCompletionDelivery,
 } from './completion.mjs';
+import { getDispatchLivenessPolicy } from './liveness.mjs';
 import { sendMessage } from '../messages.js';
 const __dirname = dirname(fileURLToPath(import.meta.url));
@@ -684,6 +685,112 @@ function getJsonlMidTurnReason(sessionId, agentDir = 'main') {
   return null; // Last assistant entry appears to be a complete text reply -- safe to proceed
 }
+/**
+ * Check the JSONL tail for a pending tool handoff without requiring recent
+ * file activity. Long-running tool calls can leave the transcript flat for
+ * minutes, so stale mtime alone is not enough to declare the agent stuck.
+ *
+ * Only the final entry returned by readJsonlLastLines is inspected. A reason
+ * is reported when that entry is an assistant message carrying tool_use, a
+ * user message whose content includes tool_result, or an entry whose own
+ * type is tool_result. Anything else (including an empty tail) yields null.
+ *
+ * @param {string} sessionId - Internal session UUID
+ * @param {string} agentDir - Agent directory (default: 'main')
+ * @returns {string|null} reason string if a tool handoff appears pending
+ */
+function getJsonlPendingToolReason(sessionId, agentDir = 'main') {
+  const lastLines = readJsonlLastLines(sessionId, agentDir, 3); // presumably up to 3 parsed trailing entries -- confirm against readJsonlLastLines
+  if (!lastLines || lastLines.length === 0) return null;
+  const last = lastLines[lastLines.length - 1]; // earlier entries in the tail are not examined
+  if (last?.role === 'assistant') {
+    const content = Array.isArray(last.content) ? last.content : []; // non-array content is treated as having no parts
+    const toolUse = content.find(c => c?.type === 'tool_use');
+    if (toolUse) {
+      return `last assistant entry has tool_use (${toolUse.name || 'unknown'}) -- awaiting tool result`;
+    }
+    if (last.type === 'tool_use') { // entry-level tool_use is only considered for assistant-role entries
+      return `last entry is tool_use (${last.name || 'unknown'}) -- awaiting tool result`;
+    }
+  }
+  if (last?.role === 'user') {
+    const content = Array.isArray(last.content) ? last.content : [];
+    if (content.some(c => c?.type === 'tool_result')) {
+      return 'last entry is tool_result (tool executed, awaiting assistant reply)';
+    }
+  }
+  if (last?.type === 'tool_result') { // entry-level tool_result is accepted regardless of role
+    return 'last entry is tool_result (tool executed, awaiting assistant reply)';
+  }
+  return null;
+}
+function parseTimestampMs(value) { // Normalize a number, Date, or date string to a finite numeric timestamp; null when absent or unparseable.
+  if (!value) return null; // any falsy input (undefined, null, '', 0) counts as "no timestamp"
+  if (typeof value === 'number') {
+    return Number.isFinite(value) ? value : null; // numbers pass through unchanged; NaN and +/-Infinity are rejected
+  }
+  if (value instanceof Date) {
+    const timestamp = value.getTime();
+    return Number.isFinite(timestamp) ? timestamp : null; // an invalid Date yields NaN from getTime()
+  }
+  const parsed = Date.parse(value); // every other input type is handed to Date.parse
+  return Number.isFinite(parsed) ? parsed : null;
+}
+/**
+ * Detect an agent session that has stopped making progress even though the
+ * watcher process itself is still alive and writing lastPing.
+ *
+ * This closes the failure mode where OpenClaw's Codex app-server retires a
+ * timed-out turn, but dispatch status keeps reporting "running" because the
+ * delivery watcher is still polling.
+ *
+ * Returns null (not stalled) when the status has no sessionKey, the session
+ * store has no entry for it, status.liveness.ageMs is below the threshold,
+ * the newest known activity timestamp is within the threshold, or the JSONL
+ * tail shows a pending tool handoff.
+ *
+ * @param {object} status - dispatch status payload; reads sessionKey, liveness.ageMs and label
+ * @param {number} thresholdMs - idle time in milliseconds beyond which the session counts as stalled
+ * @returns {string|null} stall description, or null when the session is not considered stalled
+ */
+function getRunningSessionStallReason(status, thresholdMs) {
+  if (!status?.sessionKey) return null;
+  const sessionAgent = status.sessionKey.split(':')[1] || 'main'; // second ':'-separated segment of the key is used as the agent dir; falls back to 'main'
+  const entry = getSessionStoreEntry(status.sessionKey);
+  if (!entry) return null;
+  const sessionId = entry.sessionId || null;
+  const now = Date.now();
+  const activityTimes = [ // candidate "last activity" timestamps from the session store entry
+    parseTimestampMs(entry.updatedAt),
+    parseTimestampMs(entry.lastActivityAt),
+    parseTimestampMs(entry.sessionStartedAt),
+    parseTimestampMs(entry.startedAt),
+  ].filter(t => typeof t === 'number'); // drops the nulls parseTimestampMs returns for missing/unparseable fields
+  const jsonlMtime = sessionId ? getSessionJsonlMtime(sessionId, sessionAgent) : null; // presumably epoch ms of the transcript file -- confirm in getSessionJsonlMtime
+  if (typeof jsonlMtime === 'number') activityTimes.push(jsonlMtime);
+  if (typeof status?.liveness?.ageMs === 'number' && status.liveness.ageMs < thresholdMs) {
+    return null; // reported liveness age is still fresh
+  }
+  const lastActivityMs = activityTimes.length ? Math.max(...activityTimes) : null; // newest of all collected timestamps
+  if (lastActivityMs !== null && now - lastActivityMs < thresholdMs) {
+    return null;
+  }
+  const pendingToolReason = sessionId ? getJsonlPendingToolReason(sessionId, sessionAgent) : null;
+  if (pendingToolReason) { // stale timestamps alone are not treated as a stall while a tool handoff is pending
+    process.stderr.write(
+      `[watcher] ${status.label || 'session'} stale telemetry but pending tool handoff detected: ${pendingToolReason}\n`
+    );
+    return null;
+  }
+  const idleMinutes = lastActivityMs === null // no timestamp at all: report the threshold itself, rounded up to minutes
+    ? Math.ceil(thresholdMs / 60000)
+    : Math.max(1, Math.floor((now - lastActivityMs) / 60000)); // otherwise whole minutes idle, never less than 1
+  return (
+    `agent session stalled: no session/jsonl activity for ~${idleMinutes}min ` +
+    `while delivery watcher remained alive; likely app-server turn retired or stopped producing events`
+  );
+}
 /**
  * Read the last assistant entry's stop_reason from the session JSONL.
  * Returns the stop_reason string (e.g. 'end_turn', 'tool_use') or null if unavailable.
@@ -754,6 +861,7 @@ function markLabelError(label, errorSummary) {
     updateExistingLabel(label, (entry) => {
       if (entry.status === 'done') return false;
       entry.status = 'error';
+      entry.error = errorSummary || 'failed without result';
       entry.summary = errorSummary || 'failed without result';
     });
   } catch (e) {
@@ -761,6 +869,8 @@ function markLabelError(label, errorSummary) {
   }
 }
+let exitZeroOnTerminal = false; // when true, terminal failure paths call process.exit(0) instead of process.exit(1); assigned from the `once` flag at startup
 /**
  * Format and output the delivery message, then exit 0.
  * Also marks the label as done in labels.json before exiting.
@@ -794,7 +904,7 @@ function deliverResult(label, lastReply, fallbackSummary, completionPayload = nu
           `**Error:** ${stderr || 'non-zero exit'}\n\n` +
           `Job marked as \`error\`. The agent may have reported done without completing the actual work.\n`
         );
-        process.exit(1);
+        process.exit(exitZeroOnTerminal ? 0 : 1);
       }
     }
   } catch (loadErr) {
@@ -816,10 +926,17 @@ function deliverResult(label, lastReply, fallbackSummary, completionPayload = nu
       ? completion.deliveryText.slice(0, maxLen) + '\n\n..[truncated]'
       : completion.deliveryText;
     process.stdout.write(`🌶️ *dispatch* [${label}] completed:\n\n${reply}\n`);
-  } else {
-    process.stderr.write(`[watcher] [${label}] completion delivery suppressed (no meaningful reply or summary)\n`);
+    process.exit(0);
   }
-  process.exit(0);
+  const failureSummary = 'completed without a clean user-facing completion';
+  process.stderr.write(`[watcher] [${label}] completion delivery suppressed (no meaningful reply or summary)\n`);
+  markLabelError(label, failureSummary);
+  process.stdout.write(
+    `⚠️ dispatch [${label}] completed, but no clean user-facing completion was captured. ` +
+    `Internal diagnostics were suppressed; check scheduler run logs for details.\n`
+  );
+  process.exit(exitZeroOnTerminal ? 0 : 1);
 }
 function emitInterruptedOutcome(label, summary, result = null) {
@@ -829,12 +946,12 @@ function emitInterruptedOutcome(label, summary, result = null) {
     `⚠️ dispatch [${label}] session went idle before completing -- work may be incomplete` +
     `${formatDiagnosticSnippet(result?.diagnosticReply || result?.lastReply || null)}\n`
   );
-  process.exit(1);
+  process.exit(exitZeroOnTerminal ? 0 : 1);
 }
 function emitTimeoutOutcome(label, message, result = null) {
   process.stdout.write(`${message}${formatDiagnosticSnippet(result?.diagnosticReply || result?.lastReply || null)}\n`);
-  process.exit(1);
+  process.exit(exitZeroOnTerminal ? 0 : 1);
 }
 // -- Watcher heartbeat interval ref --------------------------------------
@@ -869,15 +986,165 @@ const flags = parseFlags(process.argv.slice(2));
 const label       = flags.label;
 const timeoutS    = parseInt(flags.timeout || '600', 10);
 const pollS       = parseInt(flags['poll-interval'] || '20', 10);
+const once        = flags.once === true || flags.once === 'true'; // single-tick mode: accepts a boolean flag or the literal string 'true'
+exitZeroOnTerminal = once; // in once mode, terminal failure paths exit 0 rather than 1
-// How long a session must be idle before we proactively check result
-const IDLE_RESULT_CHECK_MS = 60000;
+function getCurrentLivenessPolicy() { // Resolve the liveness policy for this watcher's label; calls loadLabels() on every invocation.
+  const entry = loadLabels()[label] || { timeoutSeconds: timeoutS }; // unknown label: fall back to a stub carrying only the CLI timeout
+  return getDispatchLivenessPolicy(entry, { defaultTimeoutSeconds: timeoutS });
+}
+function hasStructuredCompletion(result) { // Delegates to hasCompletionSignal on result.completion; result.lastReply is not consulted.
+  return hasCompletionSignal(result?.completion); // tolerates a null/undefined result
+}
 if (!label) {
   process.stderr.write('[watcher] --label is required\n');
   process.exit(2);
 }
+function touchWatcherPing(label) { // Stamp entry.lastPing with the current ISO time, but only while the label's status is 'running'.
+  updateExistingLabel(label, (entry) => {
+    if (entry.status !== 'running') return false; // presumably false tells updateExistingLabel to skip the write -- confirm
+    entry.lastPing = new Date().toISOString();
+  });
+}
+function markWatcherPending(label, reason = 'target still running') { // Log a WATCHER_PENDING marker to stderr and exit 0; never returns.
+  process.stderr.write(`[watcher] WATCHER_PENDING label=${label} reason=${reason}\n`); // nothing is written to stdout on this path
+  process.exit(0);
+}
+function clearWatcherRetryAfter(label) { // Remove the stored 529 backoff deadline (watcherRetryAfter) from the label entry, if one is set.
+  updateExistingLabel(label, (entry) => {
+    if (!entry.watcherRetryAfter) return false; // nothing to clear; presumably false skips the write -- confirm
+    delete entry.watcherRetryAfter;
+  });
+}
+function handleOnce529(label, errorMsg) { // Once-mode handling of a 529 error: give up, schedule a backoff, wait it out, or respawn. Every path ends in process.exit.
+  const labels = loadLabels();
+  const entry = labels[label] || {};
+  const retryCount = getRetryCount(label);
+  if (retryCount >= MAX_529_RETRIES) { // retry budget exhausted: record the error, report on stdout, exit
+    markLabelError(label, `max_retries_exceeded (${retryCount}x 529): ${errorMsg}`);
+    process.stdout.write(
+      `🌶️ *dispatch* [${label}] failed after ${MAX_529_RETRIES} retries (529 overload)\n` +
+      `Error: ${errorMsg}\n`
+    );
+    process.exit(0);
+  }
+  const retryAfterMs = parseTimestampMs(entry.watcherRetryAfter); // null when no backoff deadline has been stored
+  if (!retryAfterMs) {
+    const retryResult = attempt529Retry(label, retryCount, errorMsg);
+    if (!retryResult.retry) return handleOnce529(label, errorMsg); // NOTE(review): terminates only if attempt529Retry left the retry count at or above MAX_529_RETRIES -- confirm
+    updateExistingLabel(label, (current) => {
+      current.watcherRetryAfter = new Date(Date.now() + retryResult.delayMs).toISOString(); // persist the deadline so a later tick can honor it
+    });
+    markWatcherPending(label, `529 retry scheduled for future tick (${retryResult.delayMs / 1000}s)`); // exits; code below only runs when a deadline already existed
+  }
+  if (Date.now() < retryAfterMs) {
+    markWatcherPending(label, '529 retry backoff active'); // exits; deadline not reached yet
+  }
+  if (respawnSession(label)) { // deadline passed: try to respawn the session
+    clearWatcherRetryAfter(label);
+    markWatcherPending(label, '529 retry dispatched'); // exits
+  }
+  markLabelError(label, `529 retry failed -- could not respawn session: ${errorMsg}`); // reached only when respawnSession returned a falsy value
+  process.stdout.write(
+    `🌶️ *dispatch* [${label}] 529 retry failed -- could not respawn session\n` +
+    `Error: ${errorMsg}\n`
+  );
+  process.exit(0);
+}
+function runOnceAndExit() { // One-shot poll for once mode: inspect the label a single time, then terminate the process on every path.
+  try {
+    touchWatcherPing(label);
+  } catch {
+    // Best-effort -- a quick-poll tick must not fail because heartbeat metadata raced.
+  }
+  const status = dispatch('status', ['--label', label]);
+  if (!status?.ok) {
+    markWatcherPending(label, 'status unavailable'); // exits, so status is safe to dereference below
+  }
+  if (status.status === 'error') {
+    const errorMsg = status.error || status.summary || '';
+    if (is529Error(errorMsg)) {
+      handleOnce529(label, errorMsg); // exits on every path; non-529 errors fall through to the terminal branch below
+    }
+  }
+  if (status.status !== 'running') { // terminal state: deliver, report interruption, or report failure
+    const terminalResult = dispatch('result', ['--label', label]);
+    const terminalCompletion = terminalResult?.completion || status?.completion || null; // prefer the result's completion, then the status's
+    if (status.status === 'done') {
+      const currentRetryCount = getRetryCount(label);
+      if (currentRetryCount > 0) setRetryCount(label, 0); // reset the retry counter on success
+      const gwRetryCount = getGwRestartRetryCount(label);
+      if (gwRetryCount > 0) setGwRestartRetryCount(label, 0); // likewise for the gateway-restart retry counter
+      deliverResult(label, terminalResult?.lastReply, status.summary, terminalCompletion); // deliverResult exits
+    }
+    if (status.status === 'interrupted') {
+      emitInterruptedOutcome(label, status.summary, terminalResult); // exits
+    }
+    const summary = status.error || status.summary || `terminal failure (${status.status || 'unknown'})`; // any other terminal status is reported as a failure
+    markLabelError(label, summary);
+    process.stdout.write(`🌶️ *dispatch* [${label}] failed\nSummary: ${summary}\n`);
+    process.exit(0);
+  }
+  if (status.sessionKey) { // still 'running': check whether the session transcript already shows a clean finish
+    const entry = getSessionStoreEntry(status.sessionKey);
+    const sessionId = entry?.sessionId || null;
+    const sessionAgent = status.sessionKey.split(':')[1] || 'main'; // second ':'-separated segment of the key; falls back to 'main'
+    const terminalJsonlReply = sessionId ? getSessionTerminalReply(sessionId, sessionAgent) : null;
+    if (sessionId && terminalJsonlReply && isSessionCleanlyFinished(sessionId, sessionAgent)) {
+      const result = dispatch('result', ['--label', label]);
+      if (hasStructuredCompletion(result)) { // deliver early only when a completion signal is present
+        deliverResult(label, result?.lastReply || terminalJsonlReply, 'completed (stop_reason=end_turn)', result?.completion || null);
+      }
+      process.stderr.write(`[watcher] stop_reason=end_turn observed without completion signal -- continuing to monitor\n`);
+    }
+  }
+  const ageMs = status.liveness?.ageMs;
+  const idleResultCheckMs = getCurrentLivenessPolicy().idleProbeMs; // idle threshold comes from the label's liveness policy
+  if (ageMs != null && ageMs >= idleResultCheckMs) { // idle long enough: probe for a result, then for a stall
+    const result = dispatch('result', ['--label', label]);
+    if (hasStructuredCompletion(result)) {
+      deliverResult(label, result?.lastReply || null, null, result?.completion || null);
+    }
+    const stallReason = getRunningSessionStallReason(status, idleResultCheckMs);
+    if (stallReason) {
+      process.stderr.write(`[watcher] [${label}] ${stallReason}\n`);
+      markLabelError(label, stallReason);
+      process.stdout.write(
+        `❌ *dispatch* [${label}] failed\n` +
+        `Summary: ${stallReason}\n`
+      );
+      process.exit(0);
+    }
+  }
+  markWatcherPending(label); // nothing conclusive this tick: log WATCHER_PENDING and exit 0
+}
+if (once) {
+  runOnceAndExit(); // every path inside ends in process.exit, so the code after this block does not run in once mode
+}
 // -- Start heartbeat -----------------------------------------------------
 // Write lastPing to labels.json every PING_INTERVAL_MS while the session is
 // still running. The watchdog guard in index.mjs reads lastPing to know this
@@ -1221,8 +1488,11 @@ while (Date.now() < deadline) {
     if (_sid2a && terminalJsonlReply && isSessionCleanlyFinished(_sid2a, _adir2a)) {
       process.stderr.write(`[watcher] stop_reason=end_turn detected -- delivering early\n`);
       const result = dispatch('result', ['--label', label]);
-      deliverResult(label, result?.lastReply || terminalJsonlReply, 'completed (stop_reason=end_turn)', result?.completion || null);
-      // deliverResult exits
+      if (hasStructuredCompletion(result)) {
+        deliverResult(label, result?.lastReply || terminalJsonlReply, 'completed (stop_reason=end_turn)', result?.completion || null);
+        // deliverResult exits
+      }
+      process.stderr.write(`[watcher] stop_reason=end_turn observed without completion signal -- continuing to monitor\n`);
     }
   }
@@ -1233,11 +1503,23 @@ while (Date.now() < deadline) {
   // while this watcher's lastPing heartbeat is fresh (written every 60s);
   // this path handles normal completion before the ping goes stale.
   const ageMs = status.liveness?.ageMs;
-  if (ageMs != null && ageMs >= IDLE_RESULT_CHECK_MS) {
+  const idleResultCheckMs = getCurrentLivenessPolicy().idleProbeMs;
+  if (ageMs != null && ageMs >= idleResultCheckMs) {
     const result = dispatch('result', ['--label', label]);
-    if (result?.lastReply || hasCompletionSignal(result?.completion)) {
+    if (hasStructuredCompletion(result)) {
       deliverResult(label, result?.lastReply || null, null, result?.completion || null);
     }
+    const stallReason = getRunningSessionStallReason(status, idleResultCheckMs);
+    if (stallReason) {
+      process.stderr.write(`[watcher] [${label}] ${stallReason}\n`);
+      markLabelError(label, stallReason);
+      process.stdout.write(
+        `❌ *dispatch* [${label}] failed\n` +
+        `Summary: ${stallReason}\n`
+      );
+      process.exit(1);
+    }
   }
@@ -1310,7 +1592,7 @@ if (sessionInternalId) {
 // If the session already completed (gateway pruned it -> null tokens), exit cleanly.
 if (statusAtDeadline?.status === 'done' || baselineTokens === null) {
   const r = dispatch('result', ['--label', label]);
-  if (r?.lastReply || hasCompletionSignal(r?.completion)) {
+  if (hasStructuredCompletion(r)) {
     // deliverResult calls process.exit(0) internally
     deliverResult(label, r?.lastReply || null, statusAtDeadline?.summary || null, r?.completion || null);
   }
@@ -1349,7 +1631,7 @@ while (Date.now() - flatSince < FLAT_WINDOW_MS) {
     deliverResult(label, r?.lastReply || null, st.summary, r?.completion || st?.completion || null);
   }
   const r2 = dispatch('result', ['--label', label]);
-  if (r2?.lastReply || hasCompletionSignal(r2?.completion)) {
+  if (hasStructuredCompletion(r2)) {
     // deliverResult calls process.exit(0) internally
     deliverResult(label, r2?.lastReply || null, null, r2?.completion || null);
   }
@@ -1443,7 +1725,7 @@ if (sessionInternalId) {
         deliverResult(label, rExt?.lastReply || null, stExt.summary, rExt?.completion || stExt?.completion || null);
       }
       const rExt2 = dispatch('result', ['--label', label]);
-      if (rExt2?.lastReply || hasCompletionSignal(rExt2?.completion)) {
+      if (hasStructuredCompletion(rExt2)) {
         // deliverResult calls process.exit(0) internally
         deliverResult(label, rExt2?.lastReply || null, null, rExt2?.completion || null);
       }
@@ -1500,7 +1782,7 @@ for (const round of steerRounds) {
     deliverResult(label, r3?.lastReply || null, st2.summary, r3?.completion || st2?.completion || null);
   }
   const r3 = dispatch('result', ['--label', label]);
-  if (r3?.lastReply || hasCompletionSignal(r3?.completion)) {
+  if (hasStructuredCompletion(r3)) {
     // deliverResult calls process.exit(0) internally
     deliverResult(label, r3?.lastReply || null, null, r3?.completion || null);
   }
@@ -1515,7 +1797,7 @@ for (const round of steerRounds) {
       if (st3?.status === 'done') {
         // Check if a result was captured before marking as error
         const r4 = dispatch('result', ['--label', label]);
-        if (r4?.lastReply || hasCompletionSignal(r4?.completion)) {
+        if (hasStructuredCompletion(r4)) {
           deliverResult(label, r4?.lastReply || null, st3.summary, r4?.completion || st3?.completion || null); // deliverResult calls process.exit(0)
         }
         markLabelError(label, 'timed out -- killed after steer attempts (no result captured)');

package/dispatcher-strategies.js CHANGED Viewed

@@ -1095,6 +1095,25 @@ export async function executeMain(job, ctx, deps) {
 // -- Strategy: Shell -----------------------------------------
+function isCompletionDeliveryWatcherJob(job) { // True when the job name starts with "dispatch-deliver:" or "chilisaus-deliver:".
+  return /^(?:dispatch|chilisaus)-deliver:/.test(String(job?.name || '')); // a missing job or name is tested as ''
+}
+function isCompletionWatcherPendingTick(shellResult) { // True when stdout is blank and stderr contains the WATCHER_PENDING marker as a whole word.
+  return !(shellResult.stdout || '').trim() // whitespace-only stdout counts as blank
+    && /\bWATCHER_PENDING\b/.test(shellResult.stderr || '');
+}
+function buildCompletionWatcherNoPayloadMessage(job, shellResult) { // Build the two-line notice used when a completion watcher produced no stdout payload.
+  const statusLabel = shellResult.status === 'ok' // wording depends on whether the shell run itself reported 'ok'
+    ? 'completed without a deliverable result'
+    : `failed before producing a deliverable result${shellResult.errorMessage ? ` (${shellResult.errorMessage})` : ''}`; // errorMessage is appended in parentheses when present
+  return [
+    `⚠️ Completion delivery watcher for ${job.name} ${statusLabel}.`,
+    'No internal diagnostics were delivered as the completion message; check the scheduler run logs for stderr/details.',
+  ].join('\n');
+}
 export async function executeShell(job, ctx, deps) {
   const { runShellCommand, normalizeShellResult, log } = deps;
   const result = makeDefaultResult();
@@ -1129,18 +1148,61 @@ export async function executeShell(job, ctx, deps) {
     shell_stderr_bytes: shellResult.stderrBytes,
   };
-  // Shell delivery logic: announce-always sends on all results, announce sends on error only
-  const announcePayload = shellResult.deliveryText.trim() ? shellResult.deliveryText : shellResult.errorMessage;
-  if (job.delivery_mode === 'announce-always' && announcePayload) {
-    const prefix = shellResult.status === 'ok' ? '' : `\u26a0\ufe0f Shell job failed: ${job.name}\n\n`;
-    result.deliveryOverride = `${prefix}${announcePayload}`;
-  } else if (job.delivery_mode === 'announce' && shellResult.status !== 'ok' && announcePayload) {
-    result.deliveryOverride = announcePayload;
+  if (isCompletionDeliveryWatcherJob(job)) {
+    const watcherStdout = (shellResult.stdout || '').trim();
+    const watcherStderr = (shellResult.stderr || '').trim();
+    if (isCompletionWatcherPendingTick(shellResult)) {
+      result.status = 'skipped';
+      result.summary = 'Completion delivery watcher pending; target session is still running';
+      result.content = '';
+      result.errorMessage = null;
+      result.idemAction = 'release';
+      result.skipDelivery = true;
+    } else if (watcherStdout) {
+      // Completion watcher stdout is the only user-facing contract.  Stderr is
+      // diagnostics-only and must never be repackaged as a "successful" final
+      // completion if the watcher suppressed the real payload.
+      result.summary = watcherStdout;
+      result.content = watcherStdout;
+      if (['announce', 'announce-always'].includes(job.delivery_mode)) {
+        result.deliveryOverride = watcherStdout;
+      } else {
+        result.skipDelivery = true;
+      }
+    } else {
+      const noPayloadMessage = buildCompletionWatcherNoPayloadMessage(job, shellResult);
+      result.status = 'error';
+      result.summary = noPayloadMessage;
+      result.errorMessage = 'Completion delivery watcher produced no user-facing stdout payload';
+      result.content = noPayloadMessage;
+      if (['announce', 'announce-always'].includes(job.delivery_mode)) {
+        result.deliveryOverride = noPayloadMessage;
+      } else {
+        result.skipDelivery = true;
+      }
+      log('warn', `Completion watcher produced no deliverable stdout: ${job.name}`, {
+        runId: ctx.run.id,
+        shellStatus: shellResult.status,
+        exitCode: shellResult.exitCode,
+        stderrExcerpt: watcherStderr.slice(0, 500),
+        skippedOrDisabled: /\b(?:skipped|disabled)\b/i.test(watcherStderr),
+      });
+    }
   } else {
-    result.skipDelivery = true;
+    // Shell delivery logic: announce-always sends on all results, announce sends on error only
+    const announcePayload = shellResult.deliveryText.trim() ? shellResult.deliveryText : shellResult.errorMessage;
+    if (job.delivery_mode === 'announce-always' && announcePayload) {
+      const prefix = shellResult.status === 'ok' ? '' : `\u26a0\ufe0f Shell job failed: ${job.name}\n\n`;
+      result.deliveryOverride = `${prefix}${announcePayload}`;
+    } else if (job.delivery_mode === 'announce' && shellResult.status !== 'ok' && announcePayload) {
+      result.deliveryOverride = announcePayload;
+    } else {
+      result.skipDelivery = true;
+    }
   }
-  log('info', `Shell ${shellResult.status}: ${job.name}`, {
+  log('info', `Shell ${result.status}: ${job.name}`, {
     runId: ctx.run.id,
     exitCode: shellResult.exitCode,
     signal: shellResult.signal,
@@ -1156,11 +1218,16 @@ export async function executeAgent(job, ctx, deps) {
   const {
     waitForGateway, updateRunSession, setAgentStatus,
     buildJobPrompt, runAgentTurnWithActivityTimeout,
+    // Sanctioned isolated dispatch primitive. Falls back to the activity-aware
+    // runner when callers (e.g. tests) wire only the older name -- both helpers
+    // share the same HTTP-only contract, no subprocess spawn.
+    runIsolatedAgentTurn,
     updateContextSummary, releaseDispatch, releaseIdempotencyKey,
     updateJob, matchesSentinel, detectTransientError,
     listSessions,
     sqliteNow, log,
   } = deps;
+  const dispatchAgentTurn = runIsolatedAgentTurn || runAgentTurnWithActivityTimeout;
   const result = makeDefaultResult();
   // Gateway health check
@@ -1254,7 +1321,12 @@ export async function executeAgent(job, ctx, deps) {
     }
   }
-  const turnResult = await runAgentTurnWithActivityTimeout({
+  // Isolated dispatch primitive: HTTP-only chat completions call. The
+  // scheduler must never fork a sibling `openclaw` process to spawn an
+  // isolated session -- that variant has historically SIGTERM'd the
+  // launchd-tracked gateway parent and orphaned a node process on port
+  // 18789 (see ISOLATED_DISPATCH_PRIMITIVE in gateway.js).
+  const turnResult = await dispatchAgentTurn({
     message: prompt,
     agentId: job.agent_id || 'main',
     sessionKey,

package/dispatcher.js CHANGED Viewed

@@ -51,7 +51,8 @@ import {
 import { buildRetrievalContext } from './retrieval.js';
 import { upsertAgent, setAgentStatus } from './agents.js';
 import {
-  runAgentTurnWithActivityTimeout, sendSystemEvent, getAllSubAgentSessions, listSessions,
+  runAgentTurnWithActivityTimeout, runIsolatedAgentTurn,
+  sendSystemEvent, getAllSubAgentSessions, listSessions,
   deliverMessage, checkGatewayHealth, waitForGateway, resolveDeliveryAlias,
   applyAuthProfileToSessionStore,
   syncAuthStoreToSession,
@@ -306,6 +307,10 @@ function buildDispatchDeps() {
     // Agent
     waitForGateway, updateRunSession, setAgentStatus,
     buildJobPrompt, runAgentTurnWithActivityTimeout,
+    // Isolated cron-dispatch primitive: HTTP-only wrapper around the
+    // chat-completions API; never forks a sibling openclaw process that
+    // could SIGTERM the launchd-tracked gateway parent.
+    runIsolatedAgentTurn,
     updateContextSummary, releaseIdempotencyKey,
     matchesSentinel, detectTransientError,
     listSessions,

package/gateway.js CHANGED Viewed

@@ -9,6 +9,22 @@ const GATEWAY_URL = process.env.OPENCLAW_GATEWAY_URL || 'http://127.0.0.1:18789'
 const HOME_DIR = process.env.HOME || homedir();
 export const TELEGRAM_MAX_MESSAGE_LENGTH = 4096;
+// -- Isolated dispatch primitive contract --------------------
+//
+// Cron jobs with session_target=isolated must reach the gateway via the
+// public HTTP API only. Forking a sibling `openclaw` process to spawn the
+// session is rejected: in production that primitive has SIGTERM'd the
+// launchd-tracked gateway parent (the child inherits the parent's listening
+// socket on port 18789 and the parent dies), leaving an orphan node process
+// holding the port. See rh-bot.lan zombie-cascade incident report.
+//
+// runIsolatedAgentTurn is the only sanctioned dispatch primitive for
+// session_target=isolated cron jobs. It MUST NOT spawn, fork, or exec any
+// child process. Any future change that needs subprocess execution belongs
+// behind a different, explicitly-named helper so reviewers can keep this
+// contract intact.
+export const ISOLATED_DISPATCH_PRIMITIVE = 'http-chat-completions'; // exported string label naming the contract described above
 let _cachedToken;
 let _tokenLoaded = false;
@@ -246,6 +262,29 @@ export async function runAgentTurnWithActivityTimeout(opts) {
   }
 }
+// -- Isolated dispatch primitive -----------------------------
+/**
+ * Sanctioned dispatch primitive for session_target=isolated cron jobs.
+ *
+ * This is a thin wrapper around runAgentTurnWithActivityTimeout that names
+ * the contract: HTTP-only request to the gateway, no child process spawn.
+ * The scheduler routes every session_target=isolated job through this
+ * helper so the no-fork invariant is reviewable at one call site and
+ * testable in isolation (see the no-subprocess regression test in test.js).
+ *
+ * Why a named wrapper instead of calling runAgentTurnWithActivityTimeout
+ * directly: the dispatch primitive is the load-bearing surface that the
+ * rh-bot.lan zombie-on-port outage cascaded through. A named entry point
+ * gives operators and reviewers a single grep target ("runIsolatedAgentTurn")
+ * to audit the no-spawn invariant.
+ *
+ * Accepts the same options as runAgentTurnWithActivityTimeout.
+ *
+ * @param {object} opts - forwarded unchanged to runAgentTurnWithActivityTimeout
+ * @returns {Promise<*>} whatever runAgentTurnWithActivityTimeout resolves to; its rejections propagate
+ */
+export async function runIsolatedAgentTurn(opts) {
+  return await runAgentTurnWithActivityTimeout(opts); // pure delegation; no additional logic is added here
+}
 // -- System Events (main session) ----------------------------
 /**

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "openclaw-scheduler",
-  "version": "0.2.5",
+  "version": "0.2.7",
   "description": "SQLite-backed job scheduler and workflow engine for OpenClaw agents",
   "type": "module",
   "main": "./index.js",
@@ -42,6 +42,7 @@
     "dispatch/deliver-watcher.sh",
     "dispatch/hooks.mjs",
     "dispatch/index.mjs",
+    "dispatch/liveness.mjs",
     "dispatch/message-input.mjs",
     "dispatch/README.md",
     "dispatch/watcher.mjs",