npm - openclaw-scheduler - Versions diffs - 0.2.9 → 0.2.11 - Mend

openclaw-scheduler 0.2.9 → 0.2.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24)

package/CHANGELOG.md +7 -0
package/INSTALL-ADDITIONAL-HOST.md +1 -1
package/INSTALL-LINUX.md +1 -1
package/INSTALL-WINDOWS.md +1 -1
package/INSTALL.md +1 -1
package/JOB-QUICK-REF.md +2 -0
package/README.md +5 -5
package/cli.js +9 -1
package/dispatch/529-recovery.mjs +21 -2
package/dispatch/completion.mjs +50 -0
package/dispatch/index.mjs +179 -11
package/dispatch/watcher.mjs +106 -16
package/dispatcher-strategies.js +121 -72
package/dispatcher.js +4 -2
package/docs/gateway-contract.md +21 -0
package/gateway.js +140 -30
package/index.d.ts +5 -0
package/jobs.js +23 -8
package/migrate-consolidate.js +6 -2
package/package.json +3 -3
package/paths.js +43 -1
package/scheduler-schema.js +2 -0
package/schema.sql +6 -1
package/setup.mjs +24 -22

package/dispatch/watcher.mjs CHANGED

@@ -39,6 +39,7 @@ import {
 import { getDispatchLivenessPolicy } from './liveness.mjs';
 import { resolveLabelsPath } from './paths.mjs';
 import { sendMessage } from '../messages.js';
+import { ensureArtifactsDir, resolveArtifactsDir } from '../paths.js';
 const __dirname = dirname(fileURLToPath(import.meta.url));
 const INDEX_PATH = process.env.DISPATCH_INDEX_PATH || join(__dirname, 'index.mjs');
@@ -54,12 +55,68 @@ const MAX_GW_RESTART_RETRIES = 2; // Max retries for gateway-restart-kill recove
 const FLAT_WINDOW_MS = 3 * 60 * 1000; // 3 min flat = genuinely stuck
 const ACTIVITY_POLL_MS = 30_000;
+const COMPLETION_INLINE_LIMIT_BYTES = parsePositiveEnvInt('DISPATCH_COMPLETION_INLINE_LIMIT_BYTES', 60 * 1024);
 /** How often the watcher writes lastPing to labels.json (heartbeat signal).
  *  The watchdog guard in index.mjs treats pings older than 3x this as stale,
  *  so PING_INTERVAL_MS must stay well below PING_STALE_MS (3 * 60_000). */
 const PING_INTERVAL_MS = 60_000; // 60 seconds
+function parsePositiveEnvInt(name, fallback) {
+  const value = Number.parseInt(String(process.env[name] ?? ''), 10);
+  return Number.isFinite(value) && value > 0 ? value : fallback;
+}
+function byteLength(text) {
+  return Buffer.byteLength(String(text ?? ''), 'utf8');
+}
+function sliceUtf8Bytes(text, maxBytes) {
+  const source = String(text ?? '');
+  if (byteLength(source) <= maxBytes) return source;
+  let usedBytes = 0;
+  let endIndex = 0;
+  for (const char of source) {
+    const charBytes = byteLength(char);
+    if (usedBytes + charBytes > maxBytes) break;
+    usedBytes += charBytes;
+    endIndex += char.length;
+  }
+  return source.slice(0, endIndex).trimEnd();
+}
+function completionArtifactPath(label) {
+  const safeLabel = String(label || 'completion')
+    .replace(/[^a-z0-9._-]+/gi, '-')
+    .replace(/^-+|-+$/g, '')
+    .slice(0, 80) || 'completion';
+  const dir = ensureArtifactsDir(join(resolveArtifactsDir({ env: process.env }), 'dispatch-completions'));
+  return join(dir, `${new Date().toISOString().replace(/[:.]/g, '-')}-${safeLabel}.txt`);
+}
+function formatCompletionStdout(label, deliveryText) {
+  const header = `🌶️ *dispatch* [${label}] completed:\n\n`;
+  const body = String(deliveryText ?? '');
+  const bodyBytes = byteLength(body);
+  if (bodyBytes <= COMPLETION_INLINE_LIMIT_BYTES) {
+    return `${header}${body}\n`;
+  }
+  let artifactNote;
+  try {
+    const artifactPath = completionArtifactPath(label);
+    writeFileSync(artifactPath, body, 'utf8');
+    artifactNote = `\n\nFull completion report saved to ${artifactPath} (${bodyBytes} bytes). Inline delivery capped at ${COMPLETION_INLINE_LIMIT_BYTES} bytes to avoid dumping an oversized report.`;
+  } catch (err) {
+    artifactNote = `\n\nFull completion report was ${bodyBytes} bytes, but saving the oversized report failed: ${err.message}. Inline delivery capped at ${COMPLETION_INLINE_LIMIT_BYTES} bytes.`;
+  }
+  const bodyBudget = Math.max(0, COMPLETION_INLINE_LIMIT_BYTES - byteLength(artifactNote));
+  const inlineBody = sliceUtf8Bytes(body, bodyBudget);
+  return `${header}${inlineBody}${artifactNote}\n`;
+}
 function getGatewayToken() {
   if (process.env.OPENCLAW_GATEWAY_TOKEN) return process.env.OPENCLAW_GATEWAY_TOKEN;
@@ -922,11 +979,7 @@ function deliverResult(label, lastReply, fallbackSummary, completionPayload = nu
   markLabelDone(label, completion.summary);
   if (completion.deliveryText) {
-    const maxLen = 3500;
-    const reply = completion.deliveryText.length > maxLen
-      ? completion.deliveryText.slice(0, maxLen) + '\n\n..[truncated]'
-      : completion.deliveryText;
-    process.stdout.write(`🌶️ *dispatch* [${label}] completed:\n\n${reply}\n`);
+    process.stdout.write(formatCompletionStdout(label, completion.deliveryText));
     process.exit(0);
   }
@@ -999,6 +1052,22 @@ function hasStructuredCompletion(result) {
   return hasCompletionSignal(result?.completion);
 }
+function getCleanTerminalReply(status) {
+  if (!status?.sessionKey) return null;
+  const entry = getSessionStoreEntry(status.sessionKey);
+  const sessionId = entry?.sessionId || null;
+  const sessionAgent = status.sessionKey.split(':')[1] || 'main';
+  const terminalJsonlReply = sessionId ? getSessionTerminalReply(sessionId, sessionAgent) : null;
+  if (!sessionId || !terminalJsonlReply) return null;
+  return isSessionCleanlyFinished(sessionId, sessionAgent) ? terminalJsonlReply : null;
+}
+function getStrictTerminalReply(result, status) {
+  const terminalJsonlReply = getCleanTerminalReply(status);
+  if (!terminalJsonlReply) return null;
+  return result?.lastReply || terminalJsonlReply;
+}
 if (!label) {
   process.stderr.write('[watcher] --label is required\n');
   process.exit(2);
@@ -1106,28 +1175,33 @@ function runOnceAndExit() {
   }
   if (status.sessionKey) {
-    const entry = getSessionStoreEntry(status.sessionKey);
-    const sessionId = entry?.sessionId || null;
-    const sessionAgent = status.sessionKey.split(':')[1] || 'main';
-    const terminalJsonlReply = sessionId ? getSessionTerminalReply(sessionId, sessionAgent) : null;
-    if (sessionId && terminalJsonlReply && isSessionCleanlyFinished(sessionId, sessionAgent)) {
+    const terminalJsonlReply = getCleanTerminalReply(status);
+    if (terminalJsonlReply) {
       const result = dispatch('result', ['--label', label]);
       if (hasStructuredCompletion(result)) {
         deliverResult(label, result?.lastReply || terminalJsonlReply, 'completed (stop_reason=end_turn)', result?.completion || null);
       }
-      process.stderr.write(`[watcher] stop_reason=end_turn observed without completion signal -- continuing to monitor\n`);
+      deliverResult(label, terminalJsonlReply, 'completed (stop_reason=end_turn)', null);
     }
   }
   const ageMs = status.liveness?.ageMs;
-  const idleResultCheckMs = getCurrentLivenessPolicy().idleProbeMs;
+  const livenessPolicy = getCurrentLivenessPolicy();
+  const idleResultCheckMs = livenessPolicy.idleProbeMs;
+  const idleFailureMs = livenessPolicy.idleFailureMs;
   if (ageMs != null && ageMs >= idleResultCheckMs) {
     const result = dispatch('result', ['--label', label]);
     if (hasStructuredCompletion(result)) {
       deliverResult(label, result?.lastReply || null, null, result?.completion || null);
     }
+    const terminalReply = getStrictTerminalReply(result, status);
+    if (terminalReply) {
+      deliverResult(label, terminalReply, 'completed (stop_reason=end_turn)', null);
+    }
-    const stallReason = getRunningSessionStallReason(status, idleResultCheckMs);
+    const stallReason = ageMs >= idleFailureMs
+      ? getRunningSessionStallReason(status, idleFailureMs)
+      : null;
     if (stallReason) {
       process.stderr.write(`[watcher] [${label}] ${stallReason}\n`);
       markLabelError(label, stallReason);
@@ -1493,7 +1567,7 @@ while (Date.now() < deadline) {
         deliverResult(label, result?.lastReply || terminalJsonlReply, 'completed (stop_reason=end_turn)', result?.completion || null);
         // deliverResult exits
       }
-      process.stderr.write(`[watcher] stop_reason=end_turn observed without completion signal -- continuing to monitor\n`);
+      deliverResult(label, terminalJsonlReply, 'completed (stop_reason=end_turn)', null);
     }
   }
@@ -1504,14 +1578,22 @@ while (Date.now() < deadline) {
   // while this watcher's lastPing heartbeat is fresh (written every 60s);
   // this path handles normal completion before the ping goes stale.
   const ageMs = status.liveness?.ageMs;
-  const idleResultCheckMs = getCurrentLivenessPolicy().idleProbeMs;
+  const livenessPolicy = getCurrentLivenessPolicy();
+  const idleResultCheckMs = livenessPolicy.idleProbeMs;
+  const idleFailureMs = livenessPolicy.idleFailureMs;
   if (ageMs != null && ageMs >= idleResultCheckMs) {
     const result = dispatch('result', ['--label', label]);
     if (hasStructuredCompletion(result)) {
       deliverResult(label, result?.lastReply || null, null, result?.completion || null);
     }
+    const terminalReply = getStrictTerminalReply(result, status);
+    if (terminalReply) {
+      deliverResult(label, terminalReply, 'completed (stop_reason=end_turn)', null);
+    }
-    const stallReason = getRunningSessionStallReason(status, idleResultCheckMs);
+    const stallReason = ageMs >= idleFailureMs
+      ? getRunningSessionStallReason(status, idleFailureMs)
+      : null;
     if (stallReason) {
       process.stderr.write(`[watcher] [${label}] ${stallReason}\n`);
       markLabelError(label, stallReason);
@@ -1530,6 +1612,14 @@ while (Date.now() < deadline) {
 // Timed out -- try one last result check
 const finalResult = dispatch('result', ['--label', label]);
 const finalStatus = dispatch('status', ['--label', label]);
+if (hasStructuredCompletion(finalResult)) {
+  deliverResult(
+    label,
+    finalResult?.lastReply || null,
+    finalStatus?.summary || null,
+    finalResult?.completion || finalStatus?.completion || null,
+  );
+}
 if (finalStatus?.status === 'done') {
   const rc = getRetryCount(label);
   if (rc > 0) setRetryCount(label, 0);

package/dispatcher-strategies.js CHANGED

@@ -1214,6 +1214,93 @@ export async function executeShell(job, ctx, deps) {
 // -- Strategy: Agent (isolated session) ----------------------
+function describeAgentSelection(selection) {
+  return {
+    model: selection?.model || null,
+    auth_profile: selection?.authProfile || null,
+  };
+}
+function sameAgentSelection(left, right) {
+  return (left?.model || undefined) === (right?.model || undefined)
+    && (left?.authProfile || undefined) === (right?.authProfile || undefined);
+}
+async function resolveConfiguredAuthProfile(authProfile, deps, jobId, fieldName = 'auth_profile') {
+  const { listSessions, log } = deps;
+  let resolvedAuthProfile = authProfile || undefined;
+  if (resolvedAuthProfile !== 'inherit') return resolvedAuthProfile;
+  try {
+    const sessions = await listSessions({ kinds: ['main'], activeMinutes: 120, limit: 10 });
+    const sessionList = sessions?.result?.details?.sessions || sessions?.result?.sessions || sessions?.sessions || sessions || [];
+    const mainSession = Array.isArray(sessionList)
+      ? sessionList.find(s => {
+          const key = s.key || s.sessionKey || '';
+          return key.includes(':main:') || key.endsWith(':main') || key === 'main';
+        })
+      : null;
+    const profileId = mainSession?.authProfileOverride || mainSession?.authProfile || mainSession?.profile;
+    if (profileId) {
+      resolvedAuthProfile = profileId;
+      log('debug', `Resolved ${fieldName} 'inherit' -> '${profileId}'`, { jobId });
+    } else {
+      log('debug', `${fieldName} 'inherit' -- no main session profile found, passing 'inherit' as-is`, { jobId });
+    }
+  } catch (err) {
+    log('warn', `Failed to resolve ${fieldName} 'inherit': ${err.message}`, { jobId });
+    // Fall through with 'inherit' -- gateway may handle it.
+  }
+  return resolvedAuthProfile;
+}
+async function runAgentTurnForSelection(job, deps, prompt, sessionKey, selection, dispatchAgentTurn) {
+  const { log } = deps;
+  const { syncAuthStoreToSession: syncAuth, applySessionOverridesToSessionStore: applySessionOverrides } = deps;
+  // Always sync the live auth store before each attempt so refreshed credentials
+  // are visible to any embedded/isolated runner startup.
+  if (typeof syncAuth === 'function') {
+    const syncResult = syncAuth(job.agent_id || 'main');
+    if (syncResult.ok) {
+      log('debug', `Synced live auth store to agent '${job.agent_id || 'main'}'`, { jobId: job.id });
+    } else {
+      log('warn', `Failed to sync auth store: ${syncResult.error}`, { jobId: job.id });
+    }
+  }
+  if (typeof applySessionOverrides === 'function') {
+    const applyResult = applySessionOverrides(
+      sessionKey,
+      {
+        authProfile: selection.authProfile,
+        modelRef: selection.model || null,
+      },
+      job.agent_id || 'main',
+    );
+    if (applyResult.ok) {
+      log('debug', `Applied session overrides for ${sessionKey}`, {
+        jobId: job.id,
+        authProfile: selection.authProfile || null,
+        modelRef: selection.model || null,
+      });
+    } else {
+      log('warn', `Failed to apply session overrides: ${applyResult.error}`, { jobId: job.id, sessionKey });
+    }
+  }
+  return dispatchAgentTurn({
+    message: prompt,
+    agentId: job.agent_id || 'main',
+    sessionKey,
+    authProfile: selection.authProfile,
+    idleTimeoutMs: (job.payload_timeout_seconds || 120) * 1000,
+    pollIntervalMs: 60000,
+    absoluteTimeoutMs: job.run_timeout_ms || 300000,
+  });
+}
 export async function executeAgent(job, ctx, deps) {
   const {
     waitForGateway, updateRunSession, setAgentStatus,
@@ -1224,7 +1311,6 @@ export async function executeAgent(job, ctx, deps) {
     runIsolatedAgentTurn,
     updateContextSummary, releaseDispatch, releaseIdempotencyKey,
     updateJob, matchesSentinel, detectTransientError,
-    listSessions,
     sqliteNow, log,
   } = deps;
   const dispatchAgentTurn = runIsolatedAgentTurn || runAgentTurnWithActivityTimeout;
@@ -1264,82 +1350,45 @@ export async function executeAgent(job, ctx, deps) {
   const { prompt, contextMeta } = buildJobPrompt(job, ctx.run);
   try { updateContextSummary(ctx.run.id, contextMeta); } catch (_e) { /* column may not exist yet */ }
-  // Resolve auth_profile: use effective profile from child credential policy
-  // if available (set by 'inherit' policy), otherwise fall back to the job's own.
-  let resolvedAuthProfile = ctx.v02Outcomes?.effective_auth_profile || job.auth_profile || undefined;
-  if (resolvedAuthProfile === 'inherit') {
-    try {
-      const sessions = await listSessions({ kinds: ['main'], activeMinutes: 120, limit: 10 });
-      const sessionList = sessions?.result?.details?.sessions || sessions?.result?.sessions || sessions?.sessions || sessions || [];
-      const mainSession = Array.isArray(sessionList)
-        ? sessionList.find(s => {
-            const key = s.key || s.sessionKey || '';
-            return key.includes(':main:') || key.endsWith(':main') || key === 'main';
-          })
-        : null;
-      const profileId = mainSession?.authProfileOverride || mainSession?.authProfile || mainSession?.profile;
-      if (profileId) {
-        resolvedAuthProfile = profileId;
-        log('debug', `Resolved auth_profile 'inherit' -> '${profileId}'`, { jobId: job.id });
-      } else {
-        log('debug', `auth_profile 'inherit' -- no main session profile found, passing 'inherit' as-is`, { jobId: job.id });
-      }
-    } catch (err) {
-      log('warn', `Failed to resolve 'inherit' auth_profile: ${err.message}`, { jobId: job.id });
-      // Fall through with 'inherit' -- gateway may handle it
-    }
-  }
+  const primarySelection = {
+    model: job.payload_model || undefined,
+    authProfile: await resolveConfiguredAuthProfile(
+      ctx.v02Outcomes?.effective_auth_profile || job.auth_profile || undefined,
+      deps,
+      job.id,
+      ctx.v02Outcomes?.effective_auth_profile ? 'effective_auth_profile' : 'auth_profile'
+    ),
+  };
+  const hasConfiguredFallback = job.payload_model_fallback != null || job.auth_profile_fallback != null;
+  const fallbackSelection = hasConfiguredFallback ? {
+    model: job.payload_model_fallback || primarySelection.model || undefined,
+    authProfile: job.auth_profile_fallback != null
+      ? await resolveConfiguredAuthProfile(job.auth_profile_fallback, deps, job.id, 'auth_profile_fallback')
+      : primarySelection.authProfile,
+  } : null;
+  let turnResult;
+  try {
+    turnResult = await runAgentTurnForSelection(job, deps, prompt, sessionKey, primarySelection, dispatchAgentTurn);
+  } catch (primaryError) {
+    const canTryConfiguredFallback = fallbackSelection && !sameAgentSelection(primarySelection, fallbackSelection);
+    if (!canTryConfiguredFallback) throw primaryError;
-  // Always sync the live auth store to the agent's auth-profiles.json BEFORE
-  // every agent turn. This ensures sessions that reuse a stable key (scheduler:<jobId>)
-  // always have fresh credentials -- token refreshes, order changes, and new
-  // profiles are picked up automatically without requiring an explicit auth_profile
-  // on every job.
-  const { syncAuthStoreToSession: syncAuth } = deps;
-  if (typeof syncAuth === 'function') {
-    const syncResult = syncAuth(job.agent_id || 'main');
-    if (syncResult.ok) {
-      log('debug', `Synced live auth store to agent '${job.agent_id || 'main'}'`, { jobId: job.id });
-    } else {
-      log('warn', `Failed to sync auth store: ${syncResult.error}`, { jobId: job.id });
-    }
-  }
+    log('warn', 'Primary agent selection failed; retrying with configured fallback', {
+      jobId: job.id,
+      primary: describeAgentSelection(primarySelection),
+      fallback: describeAgentSelection(fallbackSelection),
+      error: primaryError.message,
+    });
-  // Apply auth profile to session store BEFORE the agent turn.
-  // The x-openclaw-auth-profile HTTP header is not read by the gateway (dead header).
-  // Writing authProfileOverride directly to sessions.json is the effective mechanism
-  // for auth profile propagation to isolated/embedded sessions.
-  if (resolvedAuthProfile && resolvedAuthProfile !== 'inherit') {
-    const { applyAuthProfileToSessionStore: applyAuthProfile } = deps;
-    if (typeof applyAuthProfile === 'function') {
-      const applyResult = applyAuthProfile(sessionKey, resolvedAuthProfile, job.agent_id || 'main');
-      if (applyResult.ok) {
-        log('debug', `Applied auth profile '${resolvedAuthProfile}' to session store for ${sessionKey}`, { jobId: job.id });
-      } else {
-        log('warn', `Failed to apply auth profile to session store: ${applyResult.error}`, { jobId: job.id, sessionKey });
-      }
+    try {
+      turnResult = await runAgentTurnForSelection(job, deps, prompt, sessionKey, fallbackSelection, dispatchAgentTurn);
+      log('info', 'Configured agent fallback succeeded', { jobId: job.id, fallback: describeAgentSelection(fallbackSelection) });
+    } catch (fallbackError) {
+      throw new Error(`Primary agent selection failed: ${primaryError.message}; configured fallback also failed: ${fallbackError.message}`, { cause: fallbackError });
     }
   }
-  // Isolated dispatch primitive: HTTP-only chat completions call. The
-  // scheduler must never fork a sibling `openclaw` process to spawn an
-  // isolated session -- that variant has historically SIGTERM'd the
-  // launchd-tracked gateway parent and orphaned a node process on port
-  // 18789 (see ISOLATED_DISPATCH_PRIMITIVE in gateway.js).
-  const turnResult = await dispatchAgentTurn({
-    message: prompt,
-    agentId: job.agent_id || 'main',
-    sessionKey,
-    model: job.payload_model || undefined,
-    authProfile: resolvedAuthProfile,
-    // materializedEnv deferred: the x-openclaw-env-inject header is not sent
-    // until the OpenClaw gateway implements the receiver side. See
-    // openclaw/docs/env-inject-proposal.md for the gateway spec.
-    idleTimeoutMs: (job.payload_timeout_seconds || 120) * 1000,
-    pollIntervalMs: 60000,
-    absoluteTimeoutMs: job.run_timeout_ms || 300000,
-  });
   const content = turnResult.content || '';
   const trimmed = content.trim();

package/dispatcher.js CHANGED

@@ -54,7 +54,7 @@ import {
   runAgentTurnWithActivityTimeout, runIsolatedAgentTurn,
   sendSystemEvent, getAllSubAgentSessions, listSessions,
   deliverMessage, checkGatewayHealth, waitForGateway, resolveDeliveryAlias,
-  applyAuthProfileToSessionStore,
+  applySessionOverridesToSessionStore,
   syncAuthStoreToSession,
 } from './gateway.js';
 import { normalizeShellResult } from './shell-result.js';
@@ -314,7 +314,7 @@ function buildDispatchDeps() {
     updateContextSummary, releaseIdempotencyKey,
     matchesSentinel, detectTransientError,
     listSessions,
-    applyAuthProfileToSessionStore,
+    applySessionOverridesToSessionStore,
     syncAuthStoreToSession,
     // Finalize
     updateIdempotencyResultHash,
@@ -430,8 +430,10 @@ function buildJobPrompt(job, run) {
     execution_intent: job.execution_intent || 'execute',
     execution_read_only: Boolean(job.execution_read_only),
     payload_model: job.payload_model || null,
+    payload_model_fallback: job.payload_model_fallback || null,
     payload_thinking: job.payload_thinking || null,
     auth_profile: job.auth_profile || null,
+    auth_profile_fallback: job.auth_profile_fallback || null,
   };
   const triggerContext = buildTriggeredRunContext(run);

package/docs/gateway-contract.md CHANGED

@@ -90,6 +90,10 @@ single user message to an agent and receives the complete assistant response.
 The `model` field defaults to `openclaw:<agentId>` but can be overridden via
 `job.payload_model`.
+If `job.payload_model_fallback` and/or `job.auth_profile_fallback` are set, the
+scheduler retries once in the same run with the configured fallback selection
+after a primary selection error.
 **Response body** (expected):
 ```json
@@ -653,6 +657,23 @@ directly as the `x-openclaw-auth-profile` header value without resolution.
 ---
+## Fallback Model / Auth Selection
+Jobs can optionally persist `payload_model_fallback` and `auth_profile_fallback`
+alongside the primary `payload_model` / `auth_profile` fields.
+Runtime behavior:
+- The scheduler attempts the primary selection first.
+- If the primary chat-completions request errors before a usable assistant
+  reply is returned, `executeAgent()` retries once in the same run using the
+  configured fallback overrides.
+- Any fallback dimension left unset keeps the primary effective value.
+- Existing jobs remain backward-compatible because both fallback fields default
+  to `NULL` and no retry is attempted unless a fallback override is configured.
+---
 ## Env-Inject Forwarding
 When credential materialization for an agent task produces a non-empty plain