openclaw-scheduler 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70):
  1. package/AGENTS.md +302 -0
  2. package/BEST-PRACTICES.md +506 -0
  3. package/CHANGELOG.md +82 -0
  4. package/CODE_OF_CONDUCT.md +22 -0
  5. package/CONTEXT.md +26 -0
  6. package/CONTRIBUTING.md +73 -0
  7. package/IMPLEMENTATION_SPEC.md +170 -0
  8. package/INSTALL-ADDITIONAL-HOST.md +333 -0
  9. package/INSTALL-LINUX.md +419 -0
  10. package/INSTALL-WINDOWS.md +305 -0
  11. package/INSTALL.md +364 -0
  12. package/JOB-QUICK-REF.md +222 -0
  13. package/LICENSE +21 -0
  14. package/QUICK-START.md +256 -0
  15. package/README.md +2170 -0
  16. package/SECURITY.md +34 -0
  17. package/UNINSTALL.md +129 -0
  18. package/UPGRADING.md +436 -0
  19. package/agents.js +67 -0
  20. package/approval.js +107 -0
  21. package/backup.js +390 -0
  22. package/bin/openclaw-scheduler.js +138 -0
  23. package/cli.js +1083 -0
  24. package/db.js +122 -0
  25. package/dispatch/529-recovery.mjs +204 -0
  26. package/dispatch/README.md +372 -0
  27. package/dispatch/config.example.json +24 -0
  28. package/dispatch/deliver-watcher.sh +57 -0
  29. package/dispatch/hooks.mjs +171 -0
  30. package/dispatch/index.mjs +1836 -0
  31. package/dispatch/watcher.mjs +1396 -0
  32. package/dispatch-queue.js +112 -0
  33. package/dispatcher-approvals.js +96 -0
  34. package/dispatcher-delivery.js +43 -0
  35. package/dispatcher-maintenance.js +242 -0
  36. package/dispatcher-shell.js +29 -0
  37. package/dispatcher-strategies.js +1280 -0
  38. package/dispatcher-utils.js +81 -0
  39. package/dispatcher.js +855 -0
  40. package/docs/adr-schedule-ownership.md +73 -0
  41. package/docs/gateway-contract.md +904 -0
  42. package/docs/plans/2026-03-09-fix-typescript-types.md +91 -0
  43. package/docs/plans/2026-03-09-test-coverage-gaps.md +83 -0
  44. package/docs/plans/2026-03-10-dispatcher-refactor.md +801 -0
  45. package/docs/trust-architecture.md +266 -0
  46. package/gateway.js +473 -0
  47. package/idempotency.js +119 -0
  48. package/index.d.ts +864 -0
  49. package/index.js +17 -0
  50. package/jobs.js +1224 -0
  51. package/messages.js +357 -0
  52. package/migrate-consolidate.js +694 -0
  53. package/migrate.js +125 -0
  54. package/package.json +130 -0
  55. package/paths.js +79 -0
  56. package/prompt-context.js +94 -0
  57. package/retrieval.js +176 -0
  58. package/runs.js +270 -0
  59. package/scheduler-schema.js +101 -0
  60. package/schema.sql +480 -0
  61. package/scripts/dispatch-cli-utils.mjs +65 -0
  62. package/scripts/inbox-consumer.mjs +288 -0
  63. package/scripts/stuck-detector.sh +18 -0
  64. package/scripts/stuck-run-detector.mjs +333 -0
  65. package/scripts/telegram-webhook-check.mjs +238 -0
  66. package/setup.mjs +724 -0
  67. package/shell-result.js +214 -0
  68. package/task-tracker.js +300 -0
  69. package/team-adapter.js +335 -0
  70. package/v02-runtime.js +599 -0
package/dispatcher.js ADDED
@@ -0,0 +1,855 @@
1
+ #!/usr/bin/env node
2
+ // OpenClaw Scheduler Dispatcher
3
+ //
4
+ // Full standalone scheduler + message router.
5
+ // Dispatches independently via chat completions API.
6
+ //
7
+ // Tick loop:
8
+ // 1. Check gateway health
9
+ // 2. Find due jobs -> dispatch via chat completions / system event
10
+ // 3. Check running runs for staleness (implicit heartbeat)
11
+ // 4. Deliver pending messages
12
+ // 5. Expire old messages
13
+ // 6. Prune old runs (hourly)
14
+
15
+ import { readFileSync } from 'fs';
16
+ import { dirname, join } from 'path';
17
+ import { fileURLToPath } from 'url';
18
+ import { initDb, closeDb, getDb, checkpointWal } from './db.js';
19
+ import {
20
+ generateIdempotencyKey as _genIdemKey,
21
+ generateChainIdempotencyKey as _genChainKey,
22
+ generateRunNowIdempotencyKey as _genRunNowKey,
23
+ claimIdempotencyKey as _claimIdemKey,
24
+ releaseIdempotencyKey as _releaseIdemKey,
25
+ updateIdempotencyResultHash as _updateIdemHash,
26
+ forcePruneIdempotency as _pruneIdemLedger,
27
+ } from './idempotency.js';
28
+
29
// Directory containing this module (ES modules have no built-in __dirname).
const __dirname = dirname(fileURLToPath(import.meta.url));
// Scheduler version, read from the package.json that sits next to this file.
// Falls back to '0.0.0' when package.json has no "version" field.
const { version: SCHEDULER_VERSION = '0.0.0' } = JSON.parse(
  readFileSync(join(__dirname, 'package.json'), 'utf8')
);
33
+ import { getDueJobs, getDueAtJobs, hasRunningRun, hasRunningRunForPool, updateJob, nextRunFromCron, deleteJob, getJob, pruneExpiredJobs, fireTriggeredChildren, createJob, shouldRetry, scheduleRetry, enqueueJob, dequeueJob, getDispatchBacklogCount } from './jobs.js';
34
+ import {
35
+ createRun, finishRun, getRun, getStaleRuns, getTimedOutRuns, getRunningRuns,
36
+ updateRunSession, pruneRuns, updateContextSummary, persistV02Outcomes
37
+ } from './runs.js';
38
+ import {
39
+ resolveIdentity, evaluateTrust, verifyAuthorizationProof,
40
+ evaluateAuthorization, generateEvidence, summarizeCredentialHandoff,
41
+ compareTrustLevels,
42
+ } from './v02-runtime.js';
43
+ import {
44
+ getInbox, markDelivered,
45
+ expireMessages, pruneMessages
46
+ } from './messages.js';
47
+ import {
48
+ createApproval, getPendingApproval,
49
+ resolveApproval, getTimedOutApprovals, pruneApprovals, countPendingApprovalsForJob
50
+ } from './approval.js';
51
+ import { buildRetrievalContext } from './retrieval.js';
52
+ import { upsertAgent, setAgentStatus } from './agents.js';
53
+ import {
54
+ runAgentTurnWithActivityTimeout, sendSystemEvent, getAllSubAgentSessions, listSessions,
55
+ deliverMessage, checkGatewayHealth, waitForGateway, resolveDeliveryAlias,
56
+ } from './gateway.js';
57
+ import { normalizeShellResult } from './shell-result.js';
58
+ import {
59
+ getDispatch, getDueDispatches, claimDispatch, releaseDispatch, setDispatchStatus,
60
+ enqueueDispatch,
61
+ } from './dispatch-queue.js';
62
+ import {
63
+ listActiveTaskGroups, checkDeadAgents, checkGroupCompletion, getTaskGroupStatus,
64
+ touchAgentHeartbeat,
65
+ } from './task-tracker.js';
66
+ import { mapTeamMessages, checkTeamTaskGates } from './team-adapter.js';
67
+ import { buildTriggeredRunContext } from './prompt-context.js';
68
+ import { runShellCommand } from './dispatcher-shell.js';
69
+ import {
70
+ sqliteNow,
71
+ adaptiveDeferralMs,
72
+ buildExecutionIntentNote,
73
+ matchesSentinel,
74
+ detectTransientError,
75
+ getBackoffMs,
76
+ isDrainError,
77
+ } from './dispatcher-utils.js';
78
+ import { createDeliveryHelpers } from './dispatcher-delivery.js';
79
+ import { checkApprovals } from './dispatcher-approvals.js';
80
+ import {
81
+ checkRunHealth,
82
+ checkTaskTrackers,
83
+ expireStaleMessages,
84
+ ensureAgentInboxJobs,
85
+ } from './dispatcher-maintenance.js';
86
+ import {
87
+ prepareDispatch,
88
+ executeStrategy,
89
+ finalizeDispatch,
90
+ } from './dispatcher-strategies.js';
91
+ import {
92
+ loadProviders, getIdentityProvider, getAuthorizationProvider, getProofVerifier,
93
+ } from './provider-registry.js';
94
+
95
+ // -- Idempotency Key Wrappers --------------------------------
96
+ // The shared module (idempotency.js) uses jobId strings; dispatcher wraps with job objects.
97
/**
 * Build the idempotency key for one execution of `job`.
 *
 * @param {object} job - Job row; `id` and `parent_id` are read.
 * @param {*} scheduledTime - Scheduled time forwarded to the shared key generator.
 * @returns {*} `null` for a job that has a parent and no scheduled time,
 *   otherwise whatever the shared generator returns for (job.id, scheduledTime).
 */
function generateIdempotencyKey(job, scheduledTime) {
  const isUnscheduledChild = Boolean(job.parent_id) && !scheduledTime;
  return isUnscheduledChild ? null : _genIdemKey(job.id, scheduledTime);
}

// The remaining idempotency helpers need no job-object adaptation, so they are
// plain aliases of the shared module's functions under dispatcher-local names.
const generateChainIdempotencyKey = _genChainKey;
const generateRunNowIdempotencyKey = _genRunNowKey;
const claimIdempotencyKey = _claimIdemKey;
const releaseIdempotencyKey = _releaseIdemKey;
const updateIdempotencyResultHash = _updateIdemHash;
const pruneIdempotencyLedger = _pruneIdemLedger;
107
+
108
+ // -- Config --------------------------------------------------
109
/**
 * Read an integer setting from the environment.
 *
 * An unset, empty, or non-numeric value yields `fallback`. Previously a
 * non-numeric value produced NaN, and Math.max(min, NaN) is NaN, so the
 * resulting interval was NaN rather than the documented default.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is unset or unparsable.
 * @param {number} min - Lower bound applied to the result.
 * @returns {number} The parsed (or fallback) value, clamped to at least `min`.
 */
function readIntEnv(name, fallback, min) {
  const parsed = Number.parseInt(process.env[name] || '', 10);
  return Math.max(min, Number.isNaN(parsed) ? fallback : parsed);
}

const TICK_INTERVAL_MS = readIntEnv('SCHEDULER_TICK_MS', 10000, 1000);
const STALE_THRESHOLD_S = readIntEnv('SCHEDULER_STALE_THRESHOLD_S', 90, 10);
const HEARTBEAT_CHECK_MS = readIntEnv('SCHEDULER_HEARTBEAT_CHECK_MS', 30000, 5000);
const MESSAGE_DELIVERY_MS = readIntEnv('SCHEDULER_MESSAGE_DELIVERY_MS', 15000, 5000);
const PRUNE_INTERVAL_MS = readIntEnv('SCHEDULER_PRUNE_MS', 3600000, 60000);
const BACKUP_INTERVAL_MS = readIntEnv('SCHEDULER_BACKUP_MS', 300000, 60000); // 5 min
// Backups are opt-in; tick() may switch this off at runtime when the backup tool is missing.
let backupEnabled = process.env.SCHEDULER_BACKUP === '1' || process.env.SCHEDULER_BACKUP === 'true';
const LOG_PREFIX = '[scheduler]';
117
+
118
+ // -- State ---------------------------------------------------
119
// Set to false by shutdown().
let running = true;
// Date.now() value recorded by tick() the last time each periodic phase ran;
// 0 means "never", so every phase runs on the first tick.
let lastHeartbeatCheck = 0;
let lastMessageDelivery = 0;
let lastPrune = 0;
let lastBackup = 0;
let lastGatewayCheck = 0;
// Result of the most recent checkGatewayHealth() call in tick().
let gatewayHealthy = true;
// Date.now() value of the last backup that ran in 'rollup' mode (vs 'snapshot').
let lastRollupBackup = 0;
127
+
128
+ // -- Logging -------------------------------------------------
129
/**
 * Write one timestamped log line to stderr.
 *
 * 'debug' lines are suppressed unless SCHEDULER_DEBUG is set in the
 * environment. `meta`, when truthy, is appended as JSON.
 *
 * The logger must never throw: it is called from catch handlers, so a
 * circular or BigInt-bearing `meta` (which makes JSON.stringify throw) is
 * replaced by a short placeholder instead of propagating.
 *
 * @param {string} level - Severity label, e.g. 'debug', 'info', 'warn', 'error'.
 * @param {string} msg - Human-readable message.
 * @param {*} [meta] - Optional structured context appended to the line.
 */
function log(level, msg, meta) {
  if (level === 'debug' && !process.env.SCHEDULER_DEBUG) return;
  const ts = new Date().toISOString();
  let metaStr = '';
  if (meta) {
    try {
      metaStr = ` ${JSON.stringify(meta)}`;
    } catch (err) {
      metaStr = ` [unserializable meta: ${err.message}]`;
    }
  }
  process.stderr.write(`${ts} ${LOG_PREFIX} [${level}] ${msg}${metaStr}\n`);
}
136
+
137
// Create the delivery helper, wired to this module's logger and the gateway's
// delivery-alias resolver. Only handleDelivery is used from the returned helpers.
const { handleDelivery } = createDeliveryHelpers({
  log,
  resolveDeliveryAlias,
});
141
+
142
+ // -- Replay orphaned runs on startup -------------------------
143
/**
 * Startup recovery for runs still marked 'running' in the database.
 *
 * Each such run is marked 'crashed', its dispatch-queue entry (if any) is set
 * to 'done', and its idempotency key (if any) is released. Then, depending on
 * the job's delivery guarantee:
 *   - at-least-once: the job's last-run fields are updated and a 'retry'
 *     dispatch is enqueued so the normal dispatch flow re-executes it;
 *   - otherwise: one-shot ('at') jobs are disabled, and cron jobs get their
 *     next_run_at advanced when a next occurrence can be computed.
 *
 * All steps for one run execute inside a single DB transaction so a crash
 * midway cannot leave a run marked crashed without its retry enqueued.
 */
async function replayOrphanedRuns() {
  const db = getDb();
  const orphanedRuns = db.prepare(`
    SELECT r.id, r.job_id, r.dispatch_queue_id, r.idempotency_key, j.delivery_guarantee, j.name as job_name, j.schedule_cron, j.schedule_tz, j.run_timeout_ms, j.schedule_kind
    FROM runs r
    JOIN jobs j ON r.job_id = j.id
    WHERE r.status = 'running'
  `).all();

  if (orphanedRuns.length === 0) return;
  log('info', `Found ${orphanedRuns.length} orphaned run(s) to process`);

  for (const orphan of orphanedRuns) {
    const { id: runId, job_id: jobId, job_name: jobName } = orphan;
    log('info', `Found orphaned run for ${jobName}`, { runId, jobId });

    db.transaction(() => {
      const crashedAt = sqliteNow();
      const isOneShot = orphan.schedule_kind === 'at';

      // Close out the old run and its queue entry.
      db.prepare(`UPDATE runs SET status = 'crashed', finished_at = ? WHERE id = ?`).run(crashedAt, runId);
      if (orphan.dispatch_queue_id) {
        setDispatchStatus(orphan.dispatch_queue_id, 'done');
      }

      // Free the key so a replay is able to claim it again.
      if (orphan.idempotency_key) {
        releaseIdempotencyKey(orphan.idempotency_key);
        log('info', `Released idempotency key for crashed run`, { runId, key: orphan.idempotency_key.slice(0, 8) });
      }

      // at-most-once: never replay; just move the schedule along.
      if (orphan.delivery_guarantee !== 'at-least-once') {
        if (isOneShot) {
          updateJob(jobId, { enabled: false });
          log('info', `Disabled at-job after crash (at-most-once): ${jobName}`, { jobId });
        } else {
          const upcoming = nextRunFromCron(orphan.schedule_cron, orphan.schedule_tz);
          if (upcoming) {
            updateJob(jobId, { next_run_at: upcoming });
          }
        }
        log('info', `Marked crashed: ${jobName} (at-most-once)`, { runId });
        return;
      }

      // at-least-once: record the crash on the job, then queue a retry.
      const jobPatch = { last_run_at: crashedAt, last_status: 'crashed' };
      if (!isOneShot) {
        jobPatch.next_run_at = nextRunFromCron(orphan.schedule_cron, orphan.schedule_tz);
      }
      updateJob(jobId, jobPatch);

      const queuedRetry = enqueueDispatch(jobId, {
        kind: 'retry',
        scheduled_for: sqliteNow(-1000),
        source_run_id: runId,
        retry_of_run_id: runId,
      });
      log('info', `Replaying run for ${jobName} (at-least-once)`, { oldRunId: runId, dispatchId: queuedRetry.id });
    })();
  }
}
209
+
210
/**
 * For enabled root jobs (no parent) that have a retry dispatch queued
 * (pending, claimed, or awaiting approval), bring the job's own schedule
 * fields up to date:
 *   - 'at' jobs whose last_run_at is missing or earlier than schedule_at get
 *     last_run_at set to now;
 *   - other jobs whose next_run_at is already in the past get next_run_at
 *     recomputed from their cron expression.
 * Jobs needing no change are left untouched.
 */
function reconcileQueuedRetrySchedules() {
  const rootsWithQueuedRetry = getDb().prepare(`
    SELECT DISTINCT
      j.id,
      j.name,
      j.parent_id,
      j.schedule_kind,
      j.schedule_cron,
      j.schedule_tz,
      j.next_run_at,
      j.schedule_at,
      j.last_run_at
    FROM jobs j
    JOIN job_dispatch_queue q ON q.job_id = j.id
    WHERE q.dispatch_kind = 'retry'
      AND q.status IN ('pending', 'claimed', 'awaiting_approval')
      AND j.enabled = 1
      AND j.parent_id IS NULL
  `).all();

  if (rootsWithQueuedRetry.length === 0) return;

  const nowMs = Date.now();

  // Strings containing 'T' are handed to Date as-is; space-separated
  // timestamps are treated as UTC. Anything unparsable yields null.
  const toDate = (raw) => {
    if (typeof raw !== 'string' || raw === '') return null;
    const date = raw.includes('T')
      ? new Date(raw)
      : new Date(raw.replace(' ', 'T') + 'Z');
    return Number.isNaN(date.getTime()) ? null : date;
  };

  // Returns the fields to update for a job, or null when nothing needs fixing.
  const computePatch = (job) => {
    if (job.schedule_kind === 'at') {
      const scheduledAt = toDate(job.schedule_at);
      const lastRunAt = toDate(job.last_run_at);
      const needsStamp = scheduledAt && (!lastRunAt || lastRunAt < scheduledAt);
      return needsStamp ? { last_run_at: sqliteNow() } : null;
    }
    const nextRunAt = toDate(job.next_run_at);
    const isOverdue = nextRunAt && nextRunAt.getTime() <= nowMs;
    return isOverdue
      ? { next_run_at: nextRunFromCron(job.schedule_cron, job.schedule_tz) }
      : null;
  };

  for (const job of rootsWithQueuedRetry) {
    const patch = computePatch(job);
    if (!patch) continue;
    updateJob(job.id, patch);
    log('info', `Reconciled root schedule while retry is queued: ${job.name}`, {
      jobId: job.id,
      patch,
    });
  }
}
264
+
265
+ // -- Triggered Children Helper -------------------------------
266
/**
 * Fire the child jobs triggered by a finished run and log them.
 *
 * Thin wrapper around fireTriggeredChildren(): forwards the arguments, writes
 * one info line naming the children when any were triggered, and returns the
 * triggered list unchanged.
 *
 * @param {*} jobId - Id of the parent job whose run just finished.
 * @param {string} status - Final status of the parent run.
 * @param {*} content - Run output forwarded to fireTriggeredChildren.
 * @param {*} runId - Id of the parent run.
 * @param {string} [logSuffix=''] - Text appended to the log message.
 * @returns {Array} The value returned by fireTriggeredChildren.
 */
function handleTriggeredChildren(jobId, status, content, runId, logSuffix = '') {
  const firedChildren = fireTriggeredChildren(jobId, status, content, runId);
  if (!(firedChildren.length > 0)) return firedChildren;

  const childNames = firedChildren.map((child) => child.name);
  log('info', `Triggered ${firedChildren.length} child job(s)${logSuffix}`, {
    parentId: jobId,
    children: childNames,
  });
  return firedChildren;
}
280
+
281
+
282
+ // -- Build dispatch dependencies bag -------------------------
283
/**
 * Assemble the dependency bag handed to prepareDispatch / executeStrategy /
 * finalizeDispatch. Every entry is a function or constant already in scope in
 * this module; a fresh object is returned on each call.
 */
function buildDispatchDeps() {
  return {
    // --- guards and dispatch queue ---
    claimDispatch,
    releaseDispatch,
    setDispatchStatus,
    countPendingApprovalsForJob,
    getPendingApproval,
    createApproval,
    createRun,
    getRun,
    hasRunningRunForPool,
    hasRunningRun,
    enqueueJob,
    getDispatchBacklogCount,
    generateIdempotencyKey,
    generateChainIdempotencyKey,
    generateRunNowIdempotencyKey,
    claimIdempotencyKey,
    finishRun,
    getDb,
    sqliteNow,
    adaptiveDeferralMs,
    handleDelivery,
    advanceNextRun,
    TICK_INTERVAL_MS,
    log,

    // --- watchdog ---
    runShellCommand,
    updateJob,
    deleteJob,

    // --- main session ---
    sendSystemEvent,
    buildExecutionIntentNote,

    // --- shell ---
    normalizeShellResult,

    // --- agent ---
    waitForGateway,
    updateRunSession,
    setAgentStatus,
    buildJobPrompt,
    runAgentTurnWithActivityTimeout,
    updateContextSummary,
    releaseIdempotencyKey,
    matchesSentinel,
    detectTransientError,
    listSessions,

    // --- finalize ---
    updateIdempotencyResultHash,
    shouldRetry,
    scheduleRetry,
    updateJobAfterRun,
    handleTriggeredChildren,
    dequeueJob,

    // --- drain-error retry ---
    isDrainError,
    enqueueDispatch,
    getJob,

    // --- v0.2 runtime ---
    resolveIdentity,
    evaluateTrust,
    verifyAuthorizationProof,
    evaluateAuthorization,
    generateEvidence,
    summarizeCredentialHandoff,
    compareTrustLevels,
    persistV02Outcomes,

    // --- provider registry ---
    getIdentityProvider,
    getAuthorizationProvider,
    getProofVerifier,
  };
}
328
+
329
+ // -- Dispatch a single job -----------------------------------
330
/**
 * Run one job through the three dispatch phases: prepare, execute, finalize.
 * When prepareDispatch yields a falsy context, nothing further happens.
 *
 * @param {object} job - Job row to dispatch.
 * @param {object} [opts={}] - Dispatch options forwarded to prepareDispatch.
 * @returns {Promise<void>}
 */
async function dispatchJob(job, opts = {}) {
  const deps = buildDispatchDeps();
  const ctx = await prepareDispatch(job, opts, deps);
  if (ctx) {
    const outcome = await executeStrategy(job, ctx, deps);
    await finalizeDispatch(job, ctx, outcome, deps);
  }
}
337
+
338
+
339
+ // -- Build the prompt sent to the agent ----------------------
340
/**
 * Build the prompt sent to the agent for a given job and run.
 *
 * The prompt is assembled from, in order: a scheduler header, the execution
 * intent note, an optional reasoning-depth note, the pre-compaction-flush
 * preamble, the global-scope note, up to five pending inbox messages for the
 * job's agent, triggered-run context, retrieval context, an idempotency-key
 * notice for at-least-once jobs, and finally the job's payload message.
 *
 * Side effect: calls markDelivered() on each pending inbox message injected
 * into the prompt, so those messages will not be delivered again.
 *
 * Message bodies are excerpted to 500 UTF-16 units. A missing (null/undefined)
 * body is treated as empty rather than throwing, and the cut point is moved
 * back by one unit when it would otherwise split a surrogate pair.
 *
 * @param {object} job - Job row.
 * @param {object} run - Run row; `idempotency_key` is read, and the object is
 *   passed to buildTriggeredRunContext().
 * @returns {{prompt: string, contextMeta: object}} The prompt text and
 *   metadata describing what was injected into it.
 */
function buildJobPrompt(job, run) {
  const MAX_BODY_UNITS = 500;
  // Kinds that get no "[kind]" label in front of the body.
  const UNLABELLED_KINDS = ['text', 'result', 'status', 'system', 'spawn'];

  // Excerpt a message body without leaving a lone high surrogate at the cut.
  const excerptBody = (rawBody) => {
    const body = rawBody == null ? '' : String(rawBody);
    if (body.length <= MAX_BODY_UNITS) return body;
    let cut = MAX_BODY_UNITS;
    const lastUnit = body.charCodeAt(cut - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) cut -= 1;
    return body.slice(0, cut) + '\n[... message truncated]';
  };

  const parts = [`[scheduler:${job.id} ${job.name}]`];
  const executionNote = buildExecutionIntentNote(job);
  if (executionNote) parts.push(`\n${executionNote}`);
  if (job.payload_thinking) {
    parts.push(
      '\n[SYSTEM NOTE -- model policy]',
      `Prefer reasoning depth: ${job.payload_thinking}.`,
      '[END SYSTEM NOTE]',
    );
  }

  // Flush preamble for pre_compaction_flush jobs
  if (job.job_class === 'pre_compaction_flush') {
    parts.push('\n[SYSTEM: Pre-compaction flush required]');
    parts.push('Write a structured summary of: active decisions, constraints, task owners, open questions.');
    parts.push('Format as labeled sections. If nothing needs flushing, respond with exactly: NO_FLUSH');
    parts.push('[END SYSTEM]');
  }

  // Global sub-agent scope: instruct the agent to query across all sessions
  if (job.payload_scope === 'global') {
    parts.push(
      '\n[SYSTEM NOTE -- scope=global]',
      'This job has cross-session sub-agent visibility enabled.',
      'When you need to list or inspect sub-agents, do NOT use `subagents list`',
      '(which only shows sub-agents spawned by the current session).',
      'Instead, call `sessions_list` with no session filter to enumerate ALL active',
      'sessions across every requester, then filter by session key prefix or agent id.',
      'This lets you observe sub-agents spawned from the main Telegram session or any',
      'other session -- not just this isolated scheduler session.',
      '[END SYSTEM NOTE]',
    );
  }

  // Include any pending messages for this agent
  const inbox = getInbox(job.agent_id || 'main', { limit: 5 });
  if (inbox.length > 0) {
    parts.push('\n--- Pending Messages ---');
    for (const msg of inbox) {
      const kindLabel = msg.kind && !UNLABELLED_KINDS.includes(msg.kind)
        ? `[${msg.kind}]${msg.owner ? ` (owner: ${msg.owner})` : ''} `
        : '';
      parts.push(`From: ${msg.from_agent} | ${msg.kind} | ${msg.subject || '(no subject)'}`);
      const bodyExcerpt = excerptBody(msg.body);
      if (kindLabel) {
        parts.push(`${kindLabel}${bodyExcerpt}`);
      } else {
        parts.push(bodyExcerpt);
      }
      parts.push('---');
      // Acknowledge as soon as the message is part of the prompt.
      markDelivered(msg.id);
    }
  }

  // Collect context metadata
  const contextMeta = {
    messages_injected: inbox.length,
    scope: job.payload_scope || 'own',
    job_class: job.job_class || 'standard',
    delivery_guarantee: job.delivery_guarantee || 'at-most-once',
    context_retrieval: job.context_retrieval || 'none',
    execution_intent: job.execution_intent || 'execute',
    execution_read_only: Boolean(job.execution_read_only),
    payload_model: job.payload_model || null,
    payload_thinking: job.payload_thinking || null,
    auth_profile: job.auth_profile || null,
  };

  const triggerContext = buildTriggeredRunContext(run);
  if (triggerContext.text) {
    parts.push(triggerContext.text);
    Object.assign(contextMeta, triggerContext.meta);
  }

  // Add retrieval context if configured
  if (job.context_retrieval && job.context_retrieval !== 'none') {
    try {
      const retrievalCtx = buildRetrievalContext(job);
      if (retrievalCtx) {
        parts.push(retrievalCtx);
        // Counts lines that begin with '[' after a newline.
        contextMeta.retrieval_results = (retrievalCtx.match(/\n\[/g) || []).length;
      }
    } catch (err) {
      // Retrieval is best-effort: the prompt is still built without it.
      log('warn', `Retrieval context error for ${job.name}: ${err.message}`);
    }
  }

  // Inject idempotency key for at-least-once jobs
  if (run.idempotency_key && job.delivery_guarantee === 'at-least-once') {
    parts.push(`\n[IDEMPOTENCY KEY: ${run.idempotency_key}]`);
    parts.push('This is an at-least-once job. Before performing side effects, verify this key');
    parts.push('has not already been processed. If you\'ve already handled this exact execution,');
    parts.push('respond with: IDEMPOTENT_SKIP');
  }

  parts.push('\n' + (job.payload_message ?? ''));
  return { prompt: parts.join('\n'), contextMeta };
}
447
+
448
+ // -- Advance next_run_at -------------------------------------
449
/**
 * Recompute the job's next occurrence from its cron expression and timezone
 * and store it as next_run_at (whatever nextRunFromCron returns is written).
 *
 * @param {object} job - Job row; `id`, `schedule_cron` and `schedule_tz` are read.
 */
function advanceNextRun(job) {
  const { id, schedule_cron: cron, schedule_tz: tz } = job;
  updateJob(id, { next_run_at: nextRunFromCron(cron, tz) });
}
453
+
454
+ // -- Update job state after run ------------------------------
455
/**
 * Update a job's bookkeeping after one of its runs finished.
 *
 * Records last_run_at / last_status, maintains consecutive_errors
 * ('error' and 'timeout' increment it, 'ok' resets it), then:
 *   - one-shot ('at') jobs are deleted when delete_after_run is set,
 *     otherwise disabled;
 *   - cron jobs get next_run_at advanced, pushed further out by an error
 *     backoff when that lands later than the next cron occurrence, and are
 *     deleted after a successful run when delete_after_run is set.
 *
 * The job is re-read from the database first so retries see the current
 * consecutive_errors; if the row no longer exists nothing is done.
 *
 * @param {object} job - Job as known to the caller; `id` and `name` are read.
 * @param {string} status - Final run status, e.g. 'ok', 'error', 'timeout'.
 */
function updateJobAfterRun(job, status) {
  // Re-read from DB to get current state (avoids stale consecutive_errors during retries)
  const freshJob = getJob(job.id);
  if (!freshJob) return; // Job was already deleted (e.g. delete_after_run race)
  const currentErrors = freshJob.consecutive_errors || 0;
  const patch = { last_run_at: sqliteNow(), last_status: status };

  if (status === 'error' || status === 'timeout') {
    patch.consecutive_errors = currentErrors + 1;
  } else if (status === 'ok') {
    patch.consecutive_errors = 0;
  }

  // At-jobs (one-shot): don't advance cron schedule -- delete or disable
  if (freshJob.schedule_kind === 'at') {
    if (freshJob.delete_after_run) {
      getDb().transaction(() => {
        updateJob(job.id, patch);
        deleteJob(job.id);
      })();
      log('info', `Deleting one-shot at-job: ${job.name}`, { jobId: job.id });
    } else {
      patch.enabled = 0; // Disable so it won't fire again via getDueAtJobs
      updateJob(job.id, patch);
      log('info', `Disabling completed at-job: ${job.name}`, { jobId: job.id });
    }
    return;
  }

  // Cron job: advance schedule
  const nextRun = nextRunFromCron(freshJob.schedule_cron, freshJob.schedule_tz);
  patch.next_run_at = nextRun;

  // Backoff for errors
  if (patch.consecutive_errors > 0 && nextRun) {
    const backoffMs = getBackoffMs(patch.consecutive_errors);
    const backoffDate = new Date(Date.now() + backoffMs);
    // Space-separated timestamps ("YYYY-MM-DD HH:MM:SS") carry no zone marker
    // and are written in UTC by this module; handing them to Date directly
    // would parse them as local time and skew the comparison on non-UTC
    // hosts. Interpret them as UTC, as reconcileQueuedRetrySchedules does.
    let nextDate;
    if (typeof nextRun === 'string' && !nextRun.includes('T')) {
      const iso = nextRun.replace(' ', 'T');
      const hasZone = /(?:Z|[+-]\d{2}:?\d{2})$/i.test(iso);
      nextDate = new Date(hasZone ? iso : `${iso}Z`);
    } else {
      nextDate = new Date(nextRun);
    }
    // An unparsable nextDate compares false, leaving the cron time in place.
    if (backoffDate > nextDate) patch.next_run_at = backoffDate.toISOString().replace('T', ' ').replace(/\.\d{3}Z$/, '');
  }

  if (status === 'ok' && freshJob.delete_after_run) {
    getDb().transaction(() => {
      updateJob(job.id, patch);
      deleteJob(freshJob.id);
    })();
    log('info', `Deleting one-shot: ${freshJob.name}`);
  } else {
    updateJob(job.id, patch);
  }
}
506
+
507
+ // -- Main tick -----------------------------------------------
508
/**
 * One pass of the scheduler loop.
 *
 * Phases, in order:
 *   0. Gateway health probe (at most once a minute while healthy, every tick
 *      while unhealthy).
 *   1. Dispatch due cron jobs, due one-shot ('at') jobs, and due queued
 *      dispatches. Isolated-session work is deferred while the gateway is down.
 *   2. Run-health and approval checks (every HEARTBEAT_CHECK_MS).
 *   3. Spawn-message handling, team adapter mapping/gates, message expiry and
 *      task-tracker checks (every MESSAGE_DELIVERY_MS).
 *   4. Pruning and WAL checkpoint (every PRUNE_INTERVAL_MS).
 *   5. Backup in a child process (every BACKUP_INTERVAL_MS, when enabled).
 *
 * Each phase logs and swallows its own errors so a failure in one does not
 * prevent the others from running. Within phase 1 every dispatch is isolated
 * as well: a job whose dispatch throws is logged and skipped, instead of
 * aborting all remaining due jobs in this tick (and, if it keeps throwing,
 * starving them on every later tick).
 *
 * @returns {Promise<void>}
 */
async function tick() {
  const now = Date.now();

  // Dispatch one job, containing any failure to that job.
  const dispatchIsolated = async (job, opts) => {
    try {
      await dispatchJob(job, opts);
    } catch (err) {
      log('error', `Dispatch error for ${job.name}: ${err.message}`, { jobId: job.id });
    }
  };

  // Gateway health check
  if (!gatewayHealthy || now - lastGatewayCheck >= 60000) {
    lastGatewayCheck = now;
    gatewayHealthy = await checkGatewayHealth();
    if (!gatewayHealthy) {
      log('warn', 'Gateway unreachable -- isolated jobs will be deferred; shell/main jobs continue');
    }
  }

  // 1. Dispatch due jobs
  try {
    const dueJobs = getDueJobs();
    for (const job of dueJobs) {
      if (!gatewayHealthy && job.session_target === 'isolated') {
        // Push the job one minute out so it is retried once the gateway may be back.
        const deferredAt = new Date(Date.now() + 60000).toISOString().replace('T', ' ').replace(/\.\d{3}Z$/, '');
        updateJob(job.id, { next_run_at: deferredAt });
        log('info', `Deferred isolated job while gateway is down: ${job.name}`, { jobId: job.id, nextRunAt: deferredAt });
        continue;
      }
      await dispatchIsolated(job);
    }

    // 1b. Dispatch due at-jobs (one-shot scheduling)
    const dueAtJobs = getDueAtJobs();
    for (const job of dueAtJobs) {
      if (!gatewayHealthy && job.session_target === 'isolated') {
        // Gateway down: skip this tick, at-job will be retried next tick
        // (schedule_at condition still holds, enabled=1 unchanged)
        log('info', `Deferred at-job while gateway is down: ${job.name}`, { jobId: job.id, scheduleAt: job.schedule_at });
        continue;
      }
      await dispatchIsolated(job);
    }

    // 1c. Dispatch due entries from the dispatch queue
    const dueDispatches = getDueDispatches();
    for (const dispatchRecord of dueDispatches) {
      const job = getJob(dispatchRecord.job_id);
      if (!job) {
        setDispatchStatus(dispatchRecord.id, 'cancelled');
        continue;
      }
      // Disabled jobs only run when the dispatch was requested manually.
      if (!job.enabled && dispatchRecord.dispatch_kind !== 'manual') {
        setDispatchStatus(dispatchRecord.id, 'cancelled');
        continue;
      }
      if (!gatewayHealthy && job.session_target === 'isolated') {
        releaseDispatch(dispatchRecord.id, sqliteNow(60000));
        log('info', `Deferred queued dispatch while gateway is down: ${job.name}`, {
          jobId: job.id,
          dispatchId: dispatchRecord.id,
        });
        continue;
      }
      await dispatchIsolated(job, { dispatchRecord });
    }
  } catch (err) {
    log('error', `Dispatch error: ${err.message}`);
  }

  // 2. Health check + approval gates (every HEARTBEAT_CHECK_MS)
  if (now - lastHeartbeatCheck >= HEARTBEAT_CHECK_MS) {
    lastHeartbeatCheck = now;
    try {
      await checkRunHealth({
        log,
        getDb,
        getRunningRuns,
        getStaleRuns,
        getTimedOutRuns,
        finishRun,
        getJob,
        updateJobAfterRun,
        handleDelivery,
        dequeueJob,
        shouldRetry,
        scheduleRetry,
        staleThresholdSeconds: STALE_THRESHOLD_S,
      });
    } catch (err) {
      log('error', `Health check error: ${err.message}`);
    }
    try {
      await checkApprovals({
        log,
        getDb,
        getTimedOutApprovals,
        getJob,
        resolveApproval,
        dispatchJob,
        getDispatch,
        setDispatchStatus,
      });
    } catch (err) {
      log('error', `Approval check error: ${err.message}`);
    }
  }

  // 3. Message delivery + spawn handling (every MESSAGE_DELIVERY_MS)
  if (now - lastMessageDelivery >= MESSAGE_DELIVERY_MS) {
    lastMessageDelivery = now;
    // Handle spawn messages -- running jobs can request child job creation
    try {
      const spawnMsgs = getDb().prepare(`
        SELECT * FROM messages WHERE kind = 'spawn' AND delivered_at IS NULL
      `).all();
      for (const msg of spawnMsgs) {
        try {
          const spec = JSON.parse(msg.body);
          if (!spec.payload_message || typeof spec.payload_message !== 'string' || !spec.payload_message.trim()) {
            log('error', `Spawn message missing payload_message`, { msgId: msg.id, fromAgent: msg.from_agent });
            markDelivered(msg.id);
            continue;
          }
          const VALID_SPAWN_SESSION_TARGETS = ['isolated', 'shell'];
          const VALID_SPAWN_DELIVERY_MODES = ['none', 'announce', 'announce-always'];

          let sessionTarget = spec.session_target || 'isolated';
          if (!VALID_SPAWN_SESSION_TARGETS.includes(sessionTarget)) {
            log('warn', `Spawn: invalid session_target "${sessionTarget}", defaulting to "isolated"`, {
              msgId: msg.id, fromAgent: msg.from_agent,
            });
            sessionTarget = 'isolated';
          }

          let deliveryMode = spec.delivery_mode || 'none';
          if (!VALID_SPAWN_DELIVERY_MODES.includes(deliveryMode)) {
            log('warn', `Spawn: invalid delivery_mode "${deliveryMode}", defaulting to "none"`, {
              msgId: msg.id, fromAgent: msg.from_agent,
            });
            deliveryMode = 'none';
          }

          // Wrap job creation + message ack in a transaction so a crash
          // between the two cannot leave an unacked spawn that replays.
          const child = getDb().transaction(() => {
            const c = createJob({
              name: spec.name || `Spawned by ${msg.from_agent}`,
              parent_id: msg.job_id || null,
              schedule_cron: spec.schedule_cron,
              payload_message: spec.payload_message,
              session_target: sessionTarget,
              agent_id: spec.agent_id || msg.to_agent || 'main',
              delivery_mode: deliveryMode,
              delivery_channel: spec.delivery_channel,
              delivery_to: spec.delivery_to,
              delivery_opt_out_reason: spec.delivery_opt_out_reason
                || (deliveryMode === 'none' ? 'spawned-child' : null),
              delete_after_run: spec.delete_after_run !== false ? 1 : 0,
              enabled: true,
              run_timeout_ms: spec.run_timeout_ms || 300_000,
              origin: spec.origin || 'system',
            });
            // Fire immediately
            getDb().prepare(`UPDATE jobs SET next_run_at = datetime('now', '-1 second') WHERE id = ?`).run(c.id);
            markDelivered(msg.id);
            return c;
          })();
          log('info', `Spawned child job: ${child.name}`, { childId: child.id, parentJobId: msg.job_id });
        } catch (e) {
          log('error', `Spawn message parse error: ${e.message}`, { msgId: msg.id, fromAgent: msg.from_agent });
          markDelivered(msg.id); // Don't retry bad messages
        }
      }
    } catch (err) {
      log('error', `Spawn handler error: ${err.message}`);
    }
    try {
      const mapped = mapTeamMessages(200);
      if (mapped > 0) {
        log('debug', `Team adapter mapped ${mapped} message(s)`);
      }
    } catch (err) {
      log('error', `Team adapter map error: ${err.message}`);
    }
    try {
      const gates = checkTeamTaskGates(100);
      if (gates.passed > 0 || gates.failed > 0) {
        log('info', `Team task gates updated`, gates);
      } else if (gates.pending > 0) {
        log('debug', `Team task gates pending`, gates);
      }
    } catch (err) {
      log('error', `Team gate check error: ${err.message}`);
    }
    try {
      expireStaleMessages({ expireMessages });
    } catch (err) {
      log('error', `Message delivery error: ${err.message}`);
    }
    try {
      await checkTaskTrackers({
        log,
        getDb,
        getAllSubAgentSessions,
        touchAgentHeartbeat,
        checkDeadAgents,
        listActiveTaskGroups,
        checkGroupCompletion,
        getTaskGroupStatus,
        resolveDeliveryAlias,
        deliverMessage,
      });
    } catch (err) {
      log('error', `Task tracker error: ${err.message}`);
    }
  }

  // 4. Prune (hourly)
  if (now - lastPrune >= PRUNE_INTERVAL_MS) {
    lastPrune = now;
    try {
      pruneRuns(100);
      pruneMessages(30);
      pruneApprovals(30);
      pruneIdempotencyLedger();
      const expiredCount = pruneExpiredJobs();
      if (expiredCount > 0) log('info', `Pruned ${expiredCount} expired disabled job(s)`);
      // Ensure inbox consumer jobs exist for agents with delivery config
      ensureAgentInboxJobs({ log, getDb, createJob });
      // Checkpoint WAL to disk -- reduces data loss window on crash/SIGKILL
      const cpResult = checkpointWal();
      if (cpResult) {
        log('debug', `WAL checkpoint: log=${cpResult.log}, checkpointed=${cpResult.checkpointed}, busy=${cpResult.busy}`);
      }
      log('info', 'Pruned old runs + messages');
    } catch (err) {
      log('error', `Prune error: ${err.message}`);
    }
  }

  // 5. Backup to MinIO (every BACKUP_INTERVAL_MS, default 5 min; set SCHEDULER_BACKUP=1 to enable)
  if (backupEnabled && now - lastBackup >= BACKUP_INTERVAL_MS) {
    lastBackup = now;
    // At most one backup per hour runs in 'rollup' mode; the rest are snapshots.
    const isRollup = now - lastRollupBackup >= 3600000;
    if (isRollup) lastRollupBackup = now;
    const mode = isRollup ? 'rollup' : 'snapshot';
    // Run backup in a child process without blocking the event loop
    const { execFile } = await import('child_process');
    execFile(process.execPath, [join(__dirname, 'backup.js'), mode], {
      timeout: 30000,
      stdio: ['ignore', 'pipe', 'pipe'],
    }, (err, _stdout, stderr) => {
      if (err) {
        const msg = stderr?.trim() || err.message;
        if (msg.includes('not found') || msg.includes('ENOENT')) {
          // Missing tool is permanent for this process: stop trying.
          log('warn', `Backup disabled: mc binary not found. Install mc to use backups.`);
          backupEnabled = false;
        } else {
          log('error', `Backup failed: ${msg}`);
        }
      } else {
        log('debug', `Backup ${mode} completed`);
      }
    });
  }
}
767
+
768
+ // -- Lifecycle -----------------------------------------------
769
/**
 * Signal handler body: stop the scheduler loop, flush the WAL, close the
 * database and terminate the process with exit code 0.
 *
 * A failing checkpoint is logged and does not prevent the database from
 * being closed or the process from exiting.
 *
 * @param {string} signal - Name of the signal that triggered the shutdown;
 *   only used in the log line.
 * @returns {void} Does not return in practice: ends with process.exit(0).
 */
function shutdown(signal) {
  log('info', `Shutting down (${signal})`);

  // Clear the flag that main()'s `while (running)` loop checks.
  running = false;

  try {
    // Checkpoint before closing so the data sitting in the WAL is written
    // into the main database file.
    const checkpoint = checkpointWal();
    if (checkpoint) {
      const summary = `log=${checkpoint.log}, checkpointed=${checkpoint.checkpointed}, busy=${checkpoint.busy}`;
      log('info', `Shutdown WAL checkpoint: ${summary}`);
    }
  } catch (checkpointError) {
    log('error', `Shutdown checkpoint failed: ${checkpointError.message}`);
  }

  closeDb();
  log('info', 'Shutdown complete');
  process.exit(0);
}
785
+
786
+ // -- Startup repair -----------------------------------------
787
/**
 * Find enabled root cron jobs with NULL next_run_at and recompute their schedule.
 *
 * Guards against insertion bugs (e.g. a direct DB write or a CLI code path that
 * skips nextRunFromCron) that would otherwise leave a job permanently dormant.
 *
 * Only rows that are enabled, have no parent, have a cron expression and whose
 * expression is not '0 0 31 2 *' are considered.
 * NOTE(review): '0 0 31 2 *' (Feb 31) looks like a deliberate "never fires"
 * placeholder -- confirm against the code that writes it.
 *
 * Each job is repaired independently: if computing the next run throws for one
 * job, the failure is logged and the remaining jobs are still processed, so a
 * single malformed row cannot abort scheduler startup. Database errors are not
 * caught here and propagate to the caller.
 *
 * @returns {void}
 */
function repairNullNextRunAt() {
  const db = getDb();
  const dormantJobs = db.prepare(`
    SELECT id, name, schedule_cron, schedule_tz
    FROM jobs
    WHERE enabled = 1
      AND next_run_at IS NULL
      AND parent_id IS NULL
      AND schedule_cron IS NOT NULL
      AND schedule_cron != '0 0 31 2 *'
  `).all();

  if (dormantJobs.length === 0) return;

  const setNextRun = db.prepare(`UPDATE jobs SET next_run_at = ? WHERE id = ?`);
  for (const job of dormantJobs) {
    let next;
    try {
      // Rows reaching this point bypassed normal validation, so the stored
      // expression or timezone may be unusable.
      // NOTE(review): assumes nextRunFromCron can throw on bad input -- confirm.
      next = nextRunFromCron(job.schedule_cron, job.schedule_tz || 'UTC');
    } catch (err) {
      log('error', `Cannot repair null next_run_at for job "${job.name}": ${err.message}`);
      continue;
    }

    // Nothing was computed for this job; leave the row untouched.
    if (!next) continue;

    setNextRun.run(next, job.id);
    log('warn', `Repaired null next_run_at for job "${job.name}" -> ${next}`);
  }
}
815
+
816
/**
 * Scheduler entry point.
 *
 * Startup sequence, in order: log the banner, initialise the database, load
 * provider plugins when SCHEDULER_PROVIDER_PATH is set, register the default
 * 'main' agent, replay orphaned runs, reconcile queued retry schedules, repair
 * NULL next_run_at values, and install the SIGINT/SIGTERM handlers. It then
 * runs tick() repeatedly, pausing TICK_INTERVAL_MS between iterations, until
 * `running` becomes false.
 *
 * @returns {Promise<void>} Resolves once the loop observes `running === false`;
 *   rejects if any awaited startup step or tick() rejects.
 */
async function main() {
  const banner = `Starting OpenClaw Scheduler v${SCHEDULER_VERSION}`;
  const startupSettings = {
    tickMs: TICK_INTERVAL_MS,
    staleThresholdS: STALE_THRESHOLD_S,
    heartbeatCheckMs: HEARTBEAT_CHECK_MS,
  };
  log('info', banner, startupSettings);

  await initDb();

  // Provider plugins are optional and only loaded when a path is configured.
  const providerPath = process.env.SCHEDULER_PROVIDER_PATH;
  if (providerPath) {
    await loadProviders(providerPath);
  }

  // Make sure the default agent row exists.
  upsertAgent('main', { name: 'Main Agent', status: 'idle', capabilities: ['*'] });

  log('info', 'Database initialized');

  // Runs left orphaned by a previous crash are replayed first (delivery
  // guarantee support), then queued retry schedules are reconciled.
  await replayOrphanedRuns();
  reconcileQueuedRetrySchedules();

  // Defence against scheduling bugs: enabled cron jobs with NULL next_run_at.
  repairNullNextRunAt();

  for (const signalName of ['SIGINT', 'SIGTERM']) {
    process.on(signalName, () => shutdown(signalName));
  }

  const pauseBetweenTicks = () =>
    new Promise((resolve) => setTimeout(resolve, TICK_INTERVAL_MS));

  while (running) {
    await tick();
    await pauseBetweenTicks();
  }
}
850
+
851
// Start the scheduler. If main() rejects, log the failure, close the
// database and exit with status 1.
main().catch(function onFatalError(fatal) {
  log('error', `Fatal: ${fatal.message}`);
  closeDb();
  process.exit(1);
});