openclaw-scheduler 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +302 -0
- package/BEST-PRACTICES.md +506 -0
- package/CHANGELOG.md +82 -0
- package/CODE_OF_CONDUCT.md +22 -0
- package/CONTEXT.md +26 -0
- package/CONTRIBUTING.md +73 -0
- package/IMPLEMENTATION_SPEC.md +170 -0
- package/INSTALL-ADDITIONAL-HOST.md +333 -0
- package/INSTALL-LINUX.md +419 -0
- package/INSTALL-WINDOWS.md +305 -0
- package/INSTALL.md +364 -0
- package/JOB-QUICK-REF.md +222 -0
- package/LICENSE +21 -0
- package/QUICK-START.md +256 -0
- package/README.md +2170 -0
- package/SECURITY.md +34 -0
- package/UNINSTALL.md +129 -0
- package/UPGRADING.md +436 -0
- package/agents.js +67 -0
- package/approval.js +107 -0
- package/backup.js +390 -0
- package/bin/openclaw-scheduler.js +138 -0
- package/cli.js +1083 -0
- package/db.js +122 -0
- package/dispatch/529-recovery.mjs +204 -0
- package/dispatch/README.md +372 -0
- package/dispatch/config.example.json +24 -0
- package/dispatch/deliver-watcher.sh +57 -0
- package/dispatch/hooks.mjs +171 -0
- package/dispatch/index.mjs +1836 -0
- package/dispatch/watcher.mjs +1396 -0
- package/dispatch-queue.js +112 -0
- package/dispatcher-approvals.js +96 -0
- package/dispatcher-delivery.js +43 -0
- package/dispatcher-maintenance.js +242 -0
- package/dispatcher-shell.js +29 -0
- package/dispatcher-strategies.js +1280 -0
- package/dispatcher-utils.js +81 -0
- package/dispatcher.js +855 -0
- package/docs/adr-schedule-ownership.md +73 -0
- package/docs/gateway-contract.md +904 -0
- package/docs/plans/2026-03-09-fix-typescript-types.md +91 -0
- package/docs/plans/2026-03-09-test-coverage-gaps.md +83 -0
- package/docs/plans/2026-03-10-dispatcher-refactor.md +801 -0
- package/docs/trust-architecture.md +266 -0
- package/gateway.js +473 -0
- package/idempotency.js +119 -0
- package/index.d.ts +864 -0
- package/index.js +17 -0
- package/jobs.js +1224 -0
- package/messages.js +357 -0
- package/migrate-consolidate.js +694 -0
- package/migrate.js +125 -0
- package/package.json +130 -0
- package/paths.js +79 -0
- package/prompt-context.js +94 -0
- package/retrieval.js +176 -0
- package/runs.js +270 -0
- package/scheduler-schema.js +101 -0
- package/schema.sql +480 -0
- package/scripts/dispatch-cli-utils.mjs +65 -0
- package/scripts/inbox-consumer.mjs +288 -0
- package/scripts/stuck-detector.sh +18 -0
- package/scripts/stuck-run-detector.mjs +333 -0
- package/scripts/telegram-webhook-check.mjs +238 -0
- package/setup.mjs +724 -0
- package/shell-result.js +214 -0
- package/task-tracker.js +300 -0
- package/team-adapter.js +335 -0
- package/v02-runtime.js +599 -0
package/dispatcher.js
ADDED
|
@@ -0,0 +1,855 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
// OpenClaw Scheduler Dispatcher
|
|
3
|
+
//
|
|
4
|
+
// Full standalone scheduler + message router.
|
|
5
|
+
// Dispatches independently via chat completions API.
|
|
6
|
+
//
|
|
7
|
+
// Tick loop:
|
|
8
|
+
// 1. Check gateway health
|
|
9
|
+
// 2. Find due jobs -> dispatch via chat completions / system event
|
|
10
|
+
// 3. Check running runs for staleness (implicit heartbeat)
|
|
11
|
+
// 4. Deliver pending messages
|
|
12
|
+
// 5. Expire old messages
|
|
13
|
+
// 6. Prune old runs (hourly)
|
|
14
|
+
|
|
15
|
+
import { readFileSync } from 'fs';
|
|
16
|
+
import { dirname, join } from 'path';
|
|
17
|
+
import { fileURLToPath } from 'url';
|
|
18
|
+
import { initDb, closeDb, getDb, checkpointWal } from './db.js';
|
|
19
|
+
import {
|
|
20
|
+
generateIdempotencyKey as _genIdemKey,
|
|
21
|
+
generateChainIdempotencyKey as _genChainKey,
|
|
22
|
+
generateRunNowIdempotencyKey as _genRunNowKey,
|
|
23
|
+
claimIdempotencyKey as _claimIdemKey,
|
|
24
|
+
releaseIdempotencyKey as _releaseIdemKey,
|
|
25
|
+
updateIdempotencyResultHash as _updateIdemHash,
|
|
26
|
+
forcePruneIdempotency as _pruneIdemLedger,
|
|
27
|
+
} from './idempotency.js';
|
|
28
|
+
|
|
29
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
30
|
+
const { version: SCHEDULER_VERSION = '0.0.0' } = JSON.parse(
|
|
31
|
+
readFileSync(join(__dirname, 'package.json'), 'utf8')
|
|
32
|
+
);
|
|
33
|
+
import { getDueJobs, getDueAtJobs, hasRunningRun, hasRunningRunForPool, updateJob, nextRunFromCron, deleteJob, getJob, pruneExpiredJobs, fireTriggeredChildren, createJob, shouldRetry, scheduleRetry, enqueueJob, dequeueJob, getDispatchBacklogCount } from './jobs.js';
|
|
34
|
+
import {
|
|
35
|
+
createRun, finishRun, getRun, getStaleRuns, getTimedOutRuns, getRunningRuns,
|
|
36
|
+
updateRunSession, pruneRuns, updateContextSummary, persistV02Outcomes
|
|
37
|
+
} from './runs.js';
|
|
38
|
+
import {
|
|
39
|
+
resolveIdentity, evaluateTrust, verifyAuthorizationProof,
|
|
40
|
+
evaluateAuthorization, generateEvidence, summarizeCredentialHandoff,
|
|
41
|
+
compareTrustLevels,
|
|
42
|
+
} from './v02-runtime.js';
|
|
43
|
+
import {
|
|
44
|
+
getInbox, markDelivered,
|
|
45
|
+
expireMessages, pruneMessages
|
|
46
|
+
} from './messages.js';
|
|
47
|
+
import {
|
|
48
|
+
createApproval, getPendingApproval,
|
|
49
|
+
resolveApproval, getTimedOutApprovals, pruneApprovals, countPendingApprovalsForJob
|
|
50
|
+
} from './approval.js';
|
|
51
|
+
import { buildRetrievalContext } from './retrieval.js';
|
|
52
|
+
import { upsertAgent, setAgentStatus } from './agents.js';
|
|
53
|
+
import {
|
|
54
|
+
runAgentTurnWithActivityTimeout, sendSystemEvent, getAllSubAgentSessions, listSessions,
|
|
55
|
+
deliverMessage, checkGatewayHealth, waitForGateway, resolveDeliveryAlias,
|
|
56
|
+
} from './gateway.js';
|
|
57
|
+
import { normalizeShellResult } from './shell-result.js';
|
|
58
|
+
import {
|
|
59
|
+
getDispatch, getDueDispatches, claimDispatch, releaseDispatch, setDispatchStatus,
|
|
60
|
+
enqueueDispatch,
|
|
61
|
+
} from './dispatch-queue.js';
|
|
62
|
+
import {
|
|
63
|
+
listActiveTaskGroups, checkDeadAgents, checkGroupCompletion, getTaskGroupStatus,
|
|
64
|
+
touchAgentHeartbeat,
|
|
65
|
+
} from './task-tracker.js';
|
|
66
|
+
import { mapTeamMessages, checkTeamTaskGates } from './team-adapter.js';
|
|
67
|
+
import { buildTriggeredRunContext } from './prompt-context.js';
|
|
68
|
+
import { runShellCommand } from './dispatcher-shell.js';
|
|
69
|
+
import {
|
|
70
|
+
sqliteNow,
|
|
71
|
+
adaptiveDeferralMs,
|
|
72
|
+
buildExecutionIntentNote,
|
|
73
|
+
matchesSentinel,
|
|
74
|
+
detectTransientError,
|
|
75
|
+
getBackoffMs,
|
|
76
|
+
isDrainError,
|
|
77
|
+
} from './dispatcher-utils.js';
|
|
78
|
+
import { createDeliveryHelpers } from './dispatcher-delivery.js';
|
|
79
|
+
import { checkApprovals } from './dispatcher-approvals.js';
|
|
80
|
+
import {
|
|
81
|
+
checkRunHealth,
|
|
82
|
+
checkTaskTrackers,
|
|
83
|
+
expireStaleMessages,
|
|
84
|
+
ensureAgentInboxJobs,
|
|
85
|
+
} from './dispatcher-maintenance.js';
|
|
86
|
+
import {
|
|
87
|
+
prepareDispatch,
|
|
88
|
+
executeStrategy,
|
|
89
|
+
finalizeDispatch,
|
|
90
|
+
} from './dispatcher-strategies.js';
|
|
91
|
+
import {
|
|
92
|
+
loadProviders, getIdentityProvider, getAuthorizationProvider, getProofVerifier,
|
|
93
|
+
} from './provider-registry.js';
|
|
94
|
+
|
|
95
|
+
// -- Idempotency Key Wrappers --------------------------------
|
|
96
|
+
// The shared module (idempotency.js) uses jobId strings; dispatcher wraps with job objects.
|
|
97
|
+
/**
 * Dispatcher-side wrapper around the shared idempotency key generator.
 *
 * The shared module takes a job id string; this wrapper accepts the job
 * object and forwards `job.id`.
 *
 * @param {object} job - Job row; `id` and `parent_id` are read.
 * @param {*} scheduledTime - Forwarded unchanged to the shared generator.
 * @returns {*} `null` when the job has a truthy `parent_id` and no scheduled
 *   time was supplied; otherwise whatever the shared generator returns.
 */
function generateIdempotencyKey(job, scheduledTime) {
  const isChildWithoutSchedule = Boolean(job.parent_id) && !scheduledTime;
  return isChildWithoutSchedule ? null : _genIdemKey(job.id, scheduledTime);
}
|
|
101
|
+
const generateChainIdempotencyKey = _genChainKey;
|
|
102
|
+
const generateRunNowIdempotencyKey = _genRunNowKey;
|
|
103
|
+
const claimIdempotencyKey = _claimIdemKey;
|
|
104
|
+
const releaseIdempotencyKey = _releaseIdemKey;
|
|
105
|
+
const updateIdempotencyResultHash = _updateIdemHash;
|
|
106
|
+
const pruneIdempotencyLedger = _pruneIdemLedger;
|
|
107
|
+
|
|
108
|
+
// -- Config --------------------------------------------------
|
|
109
|
+
/**
 * Read an integer tuning knob from the environment.
 *
 * An unset, empty, or non-numeric value falls back to `fallback` instead of
 * producing NaN. A NaN interval would make every `now - last >= interval`
 * comparison false and silently disable the corresponding periodic work.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is unset or unparsable.
 * @param {number} min - Lower bound applied to the result.
 * @returns {number} The parsed (or fallback) value, never below `min`.
 */
function readIntEnv(name, fallback, min) {
  const parsed = Number.parseInt(process.env[name] || '', 10);
  return Math.max(min, Number.isNaN(parsed) ? fallback : parsed);
}

const TICK_INTERVAL_MS = readIntEnv('SCHEDULER_TICK_MS', 10000, 1000);
const STALE_THRESHOLD_S = readIntEnv('SCHEDULER_STALE_THRESHOLD_S', 90, 10);
const HEARTBEAT_CHECK_MS = readIntEnv('SCHEDULER_HEARTBEAT_CHECK_MS', 30000, 5000);
const MESSAGE_DELIVERY_MS = readIntEnv('SCHEDULER_MESSAGE_DELIVERY_MS', 15000, 5000);
const PRUNE_INTERVAL_MS = readIntEnv('SCHEDULER_PRUNE_MS', 3600000, 60000);
const BACKUP_INTERVAL_MS = readIntEnv('SCHEDULER_BACKUP_MS', 300000, 60000); // 5 min
|
|
115
|
+
let backupEnabled = process.env.SCHEDULER_BACKUP === '1' || process.env.SCHEDULER_BACKUP === 'true';
|
|
116
|
+
const LOG_PREFIX = '[scheduler]';
|
|
117
|
+
|
|
118
|
+
// -- State ---------------------------------------------------
|
|
119
|
+
let running = true;
|
|
120
|
+
let lastHeartbeatCheck = 0;
|
|
121
|
+
let lastMessageDelivery = 0;
|
|
122
|
+
let lastPrune = 0;
|
|
123
|
+
let lastBackup = 0;
|
|
124
|
+
let lastGatewayCheck = 0;
|
|
125
|
+
let gatewayHealthy = true;
|
|
126
|
+
let lastRollupBackup = 0;
|
|
127
|
+
|
|
128
|
+
// -- Logging -------------------------------------------------
|
|
129
|
+
/**
 * Write one timestamped log line to stderr.
 *
 * Lines have the form `<ISO timestamp> <LOG_PREFIX> [<level>] <msg> <meta JSON>`.
 * `debug` lines are dropped unless the SCHEDULER_DEBUG environment variable
 * is set to a non-empty value.
 *
 * @param {string} level - Severity label, printed verbatim.
 * @param {string} msg - Human-readable message.
 * @param {*} [meta] - Optional structured context, appended as JSON when truthy.
 * @returns {void}
 */
function log(level, msg, meta) {
  if (level === 'debug' && !process.env.SCHEDULER_DEBUG) return;

  let metaStr = '';
  if (meta) {
    try {
      metaStr = ` ${JSON.stringify(meta)}`;
    } catch (err) {
      // JSON.stringify throws on circular structures and BigInt values.
      // Logging must never take the caller down, so degrade to a marker.
      metaStr = ` [unserializable meta: ${err.message}]`;
    }
  }

  const ts = new Date().toISOString();
  process.stderr.write(`${ts} ${LOG_PREFIX} [${level}] ${msg}${metaStr}\n`);
}
|
|
136
|
+
|
|
137
|
+
const { handleDelivery } = createDeliveryHelpers({
|
|
138
|
+
log,
|
|
139
|
+
resolveDeliveryAlias,
|
|
140
|
+
});
|
|
141
|
+
|
|
142
|
+
// -- Replay orphaned runs on startup -------------------------
|
|
143
|
+
/**
 * Startup recovery: settle every run still recorded as `running`.
 *
 * Each such run is marked `crashed`, its dispatch queue entry (if any) is set
 * to `done`, and its idempotency key (if any) is released. What happens next
 * depends on the job's delivery guarantee:
 *   - `at-least-once`: the job row is stamped as crashed, cron jobs get a new
 *     `next_run_at`, and a `retry` dispatch is enqueued.
 *   - anything else: at-jobs are disabled; cron jobs only get a new
 *     `next_run_at` when one can be computed.
 *
 * All writes for one run happen inside a single transaction so the run cannot
 * end up marked crashed without its follow-up being recorded.
 *
 * @returns {Promise<void>}
 */
async function replayOrphanedRuns() {
  const db = getDb();
  const orphanedRuns = db.prepare(`
    SELECT r.id, r.job_id, r.dispatch_queue_id, r.idempotency_key, j.delivery_guarantee, j.name as job_name, j.schedule_cron, j.schedule_tz, j.run_timeout_ms, j.schedule_kind
    FROM runs r
    JOIN jobs j ON r.job_id = j.id
    WHERE r.status = 'running'
  `).all();

  if (orphanedRuns.length === 0) return;
  log('info', `Found ${orphanedRuns.length} orphaned run(s) to process`);

  // at-least-once: record the crash on the job and queue a retry dispatch.
  const requeueForReplay = (run, crashedAt) => {
    const jobPatch = { last_run_at: crashedAt, last_status: 'crashed' };
    if (run.schedule_kind !== 'at') {
      jobPatch.next_run_at = nextRunFromCron(run.schedule_cron, run.schedule_tz);
    }
    updateJob(run.job_id, jobPatch);

    // The regular dispatch flow creates and executes the replacement run.
    const queued = enqueueDispatch(run.job_id, {
      kind: 'retry',
      scheduled_for: sqliteNow(-1000),
      source_run_id: run.id,
      retry_of_run_id: run.id,
    });
    log('info', `Replaying run for ${run.job_name} (at-least-once)`, { oldRunId: run.id, dispatchId: queued.id });
  };

  // Any other guarantee: never replay, just keep the schedule sane.
  const settleWithoutReplay = (run) => {
    if (run.schedule_kind === 'at') {
      updateJob(run.job_id, { enabled: false });
      log('info', `Disabled at-job after crash (at-most-once): ${run.job_name}`, { jobId: run.job_id });
    } else {
      const upcoming = nextRunFromCron(run.schedule_cron, run.schedule_tz);
      if (upcoming) {
        updateJob(run.job_id, { next_run_at: upcoming });
      }
    }
    log('info', `Marked crashed: ${run.job_name} (at-most-once)`, { runId: run.id });
  };

  for (const run of orphanedRuns) {
    log('info', `Found orphaned run for ${run.job_name}`, { runId: run.id, jobId: run.job_id });

    db.transaction(() => {
      const crashedAt = sqliteNow();

      db.prepare(`UPDATE runs SET status = 'crashed', finished_at = ? WHERE id = ?`).run(crashedAt, run.id);
      if (run.dispatch_queue_id) {
        setDispatchStatus(run.dispatch_queue_id, 'done');
      }

      // Free the key so a replay is able to claim it again.
      if (run.idempotency_key) {
        releaseIdempotencyKey(run.idempotency_key);
        log('info', `Released idempotency key for crashed run`, { runId: run.id, key: run.idempotency_key.slice(0, 8) });
      }

      if (run.delivery_guarantee === 'at-least-once') {
        requeueForReplay(run, crashedAt);
      } else {
        settleWithoutReplay(run);
      }
    })();
  }
}
|
|
209
|
+
|
|
210
|
+
/**
 * Keep root job schedules consistent while a retry dispatch is still queued.
 *
 * Selects enabled root jobs (no `parent_id`) that have a `retry` dispatch in
 * `pending`, `claimed`, or `awaiting_approval` state, then:
 *   - at-jobs: stamps `last_run_at` with the current time when the job has a
 *     parsable `schedule_at` and either never ran or last ran before it;
 *   - other jobs: recomputes `next_run_at` from the cron expression when the
 *     stored `next_run_at` is parsable and not in the future.
 *
 * Timestamps containing a `T` are parsed as-is; anything else is treated as a
 * zone-less `YYYY-MM-DD HH:MM:SS` value in UTC.
 *
 * @returns {void}
 */
function reconcileQueuedRetrySchedules() {
  const rootJobsWithQueuedRetry = getDb().prepare(`
    SELECT DISTINCT
      j.id,
      j.name,
      j.parent_id,
      j.schedule_kind,
      j.schedule_cron,
      j.schedule_tz,
      j.next_run_at,
      j.schedule_at,
      j.last_run_at
    FROM jobs j
    JOIN job_dispatch_queue q ON q.job_id = j.id
    WHERE q.dispatch_kind = 'retry'
      AND q.status IN ('pending', 'claimed', 'awaiting_approval')
      AND j.enabled = 1
      AND j.parent_id IS NULL
  `).all();

  if (rootJobsWithQueuedRetry.length === 0) return;

  const startedAtMs = Date.now();

  // Returns a valid Date, or null for empty / non-string / unparsable input.
  const toDate = (raw) => {
    if (typeof raw !== 'string' || raw === '') return null;
    const candidate = new Date(raw.includes('T') ? raw : raw.replace(' ', 'T') + 'Z');
    return Number.isNaN(candidate.getTime()) ? null : candidate;
  };

  const patchForAtJob = (job) => {
    const scheduledAt = toDate(job.schedule_at);
    const lastRunAt = toDate(job.last_run_at);
    const ranSinceSchedule = lastRunAt !== null && scheduledAt !== null && !(lastRunAt < scheduledAt);
    if (scheduledAt === null || ranSinceSchedule) return {};
    return { last_run_at: sqliteNow() };
  };

  const patchForRecurringJob = (job) => {
    const due = toDate(job.next_run_at);
    if (due === null || due.getTime() > startedAtMs) return {};
    return { next_run_at: nextRunFromCron(job.schedule_cron, job.schedule_tz) };
  };

  for (const job of rootJobsWithQueuedRetry) {
    const patch = job.schedule_kind === 'at' ? patchForAtJob(job) : patchForRecurringJob(job);
    if (Object.keys(patch).length > 0) {
      updateJob(job.id, patch);
      log('info', `Reconciled root schedule while retry is queued: ${job.name}`, {
        jobId: job.id,
        patch,
      });
    }
  }
}
|
|
264
|
+
|
|
265
|
+
// -- Triggered Children Helper -------------------------------
|
|
266
|
+
/**
 * Fire the triggered child jobs of a finished run and log which ones fired.
 *
 * Thin wrapper over `fireTriggeredChildren` that adds a single info log line
 * when at least one child was triggered.
 *
 * @param {*} jobId - Parent job id, forwarded and logged as `parentId`.
 * @param {*} status - Forwarded unchanged.
 * @param {*} content - Forwarded unchanged.
 * @param {*} runId - Forwarded unchanged.
 * @param {string} [logSuffix=''] - Text appended to the log message.
 * @returns {Array} The children returned by `fireTriggeredChildren`.
 */
function handleTriggeredChildren(jobId, status, content, runId, logSuffix = '') {
  const firedChildren = fireTriggeredChildren(jobId, status, content, runId);
  if (firedChildren.length === 0) return firedChildren;

  const childNames = firedChildren.map((child) => child.name);
  log('info', `Triggered ${firedChildren.length} child job(s)${logSuffix}`, {
    parentId: jobId,
    children: childNames,
  });
  return firedChildren;
}
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
// -- Build dispatch dependencies bag -------------------------
|
|
283
|
+
/**
 * Assemble the dependency bag handed to the dispatch pipeline
 * (`prepareDispatch`, `executeStrategy`, `finalizeDispatch`).
 *
 * A fresh object is built on every call. The groups below only organise the
 * source; the returned object is flat and its keys are the bare identifiers.
 *
 * @returns {object} Flat map of helper name to implementation.
 */
function buildDispatchDeps() {
  const guardsAndQueue = {
    claimDispatch, releaseDispatch, setDispatchStatus,
    countPendingApprovalsForJob, getPendingApproval,
    createApproval, createRun, getRun,
    hasRunningRunForPool, hasRunningRun,
    enqueueJob, getDispatchBacklogCount,
    generateIdempotencyKey, generateChainIdempotencyKey,
    generateRunNowIdempotencyKey, claimIdempotencyKey,
    finishRun, getDb,
    sqliteNow, adaptiveDeferralMs,
    handleDelivery, advanceNextRun,
    TICK_INTERVAL_MS,
    log,
  };
  const watchdog = { runShellCommand, updateJob, deleteJob };
  const mainSession = { sendSystemEvent, buildExecutionIntentNote };
  const shell = { normalizeShellResult };
  const agent = {
    waitForGateway, updateRunSession, setAgentStatus,
    buildJobPrompt, runAgentTurnWithActivityTimeout,
    updateContextSummary, releaseIdempotencyKey,
    matchesSentinel, detectTransientError,
    listSessions,
  };
  const finalize = {
    updateIdempotencyResultHash,
    shouldRetry, scheduleRetry,
    updateJobAfterRun, handleTriggeredChildren,
    dequeueJob,
  };
  const drainErrorRetry = { isDrainError, enqueueDispatch, getJob };
  const v02Runtime = {
    resolveIdentity, evaluateTrust, verifyAuthorizationProof,
    evaluateAuthorization, generateEvidence, summarizeCredentialHandoff,
    compareTrustLevels,
    persistV02Outcomes,
  };
  const providerRegistry = {
    getIdentityProvider,
    getAuthorizationProvider,
    getProofVerifier,
  };

  return {
    ...guardsAndQueue,
    ...watchdog,
    ...mainSession,
    ...shell,
    ...agent,
    ...finalize,
    ...drainErrorRetry,
    ...v02Runtime,
    ...providerRegistry,
  };
}
|
|
328
|
+
|
|
329
|
+
// -- Dispatch a single job -----------------------------------
|
|
330
|
+
/**
 * Run one job through the dispatch pipeline: prepare, execute, finalize.
 *
 * When preparation yields no context (falsy result) the remaining two stages
 * are skipped.
 *
 * @param {object} job - Job to dispatch.
 * @param {object} [opts={}] - Options forwarded to `prepareDispatch`.
 * @returns {Promise<void>}
 */
async function dispatchJob(job, opts = {}) {
  const deps = buildDispatchDeps();
  const ctx = await prepareDispatch(job, opts, deps);
  if (ctx) {
    const outcome = await executeStrategy(job, ctx, deps);
    await finalizeDispatch(job, ctx, outcome, deps);
  }
}
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
// -- Build the prompt sent to the agent ----------------------
|
|
340
|
+
/**
 * Build the prompt sent to the agent for a given job and run.
 *
 * The prompt is assembled from, in order: a `[scheduler:<id> <name>]` header,
 * the execution-intent note, an optional model-policy note, the
 * pre-compaction flush preamble, the global-scope note, up to five pending
 * inbox messages (bodies truncated to 500 characters), triggered-run context,
 * retrieval context, the idempotency notice for at-least-once jobs, and
 * finally the job's own `payload_message`.
 *
 * Side effect: calls markDelivered() on each pending inbox message injected
 * into the prompt, so those messages will not be delivered again.
 *
 * A message whose body is null or undefined is rendered as an empty body
 * instead of throwing. A throw part-way through the loop would abort the
 * dispatch after earlier messages had already been marked delivered.
 *
 * @param {object} job - Job row (id, name, payload_* and context settings are read).
 * @param {object} run - Run row; `idempotency_key` is read, and the run is
 *   passed to `buildTriggeredRunContext`.
 * @returns {{prompt: string, contextMeta: object}} The prompt text and a
 *   metadata record describing what was injected.
 */
function buildJobPrompt(job, run) {
  const INBOX_LIMIT = 5;
  const MAX_BODY_CHARS = 500;
  // Message kinds rendered without a `[kind]` label in front of the body.
  const UNLABELED_KINDS = ['text', 'result', 'status', 'system', 'spawn'];

  const parts = [`[scheduler:${job.id} ${job.name}]`];
  const executionNote = buildExecutionIntentNote(job);
  if (executionNote) parts.push(`\n${executionNote}`);
  if (job.payload_thinking) {
    parts.push(
      '\n[SYSTEM NOTE -- model policy]',
      `Prefer reasoning depth: ${job.payload_thinking}.`,
      '[END SYSTEM NOTE]',
    );
  }

  // Flush preamble for pre_compaction_flush jobs
  if (job.job_class === 'pre_compaction_flush') {
    parts.push('\n[SYSTEM: Pre-compaction flush required]');
    parts.push('Write a structured summary of: active decisions, constraints, task owners, open questions.');
    parts.push('Format as labeled sections. If nothing needs flushing, respond with exactly: NO_FLUSH');
    parts.push('[END SYSTEM]');
  }

  // Global sub-agent scope: instruct the agent to query across all sessions
  if (job.payload_scope === 'global') {
    parts.push(
      '\n[SYSTEM NOTE -- scope=global]',
      'This job has cross-session sub-agent visibility enabled.',
      'When you need to list or inspect sub-agents, do NOT use `subagents list`',
      '(which only shows sub-agents spawned by the current session).',
      'Instead, call `sessions_list` with no session filter to enumerate ALL active',
      'sessions across every requester, then filter by session key prefix or agent id.',
      'This lets you observe sub-agents spawned from the main Telegram session or any',
      'other session -- not just this isolated scheduler session.',
      '[END SYSTEM NOTE]',
    );
  }

  // Include any pending messages for this agent
  const inbox = getInbox(job.agent_id || 'main', { limit: INBOX_LIMIT });
  if (inbox.length > 0) {
    parts.push('\n--- Pending Messages ---');
    for (const msg of inbox) {
      const kindLabel = msg.kind && !UNLABELED_KINDS.includes(msg.kind)
        ? `[${msg.kind}]${msg.owner ? ` (owner: ${msg.owner})` : ''} `
        : '';
      parts.push(`From: ${msg.from_agent} | ${msg.kind} | ${msg.subject || '(no subject)'}`);
      // Guard against a missing body so one malformed row cannot abort the build.
      const body = msg.body ?? '';
      const bodyExcerpt = body.length > MAX_BODY_CHARS
        ? body.slice(0, MAX_BODY_CHARS) + '\n[... message truncated]'
        : body;
      parts.push(kindLabel ? `${kindLabel}${bodyExcerpt}` : bodyExcerpt);
      parts.push('---');
      markDelivered(msg.id);
    }
  }

  // Collect context metadata
  const contextMeta = {
    messages_injected: inbox.length,
    scope: job.payload_scope || 'own',
    job_class: job.job_class || 'standard',
    delivery_guarantee: job.delivery_guarantee || 'at-most-once',
    context_retrieval: job.context_retrieval || 'none',
    execution_intent: job.execution_intent || 'execute',
    execution_read_only: Boolean(job.execution_read_only),
    payload_model: job.payload_model || null,
    payload_thinking: job.payload_thinking || null,
    auth_profile: job.auth_profile || null,
  };

  const triggerContext = buildTriggeredRunContext(run);
  if (triggerContext.text) {
    parts.push(triggerContext.text);
    Object.assign(contextMeta, triggerContext.meta);
  }

  // Add retrieval context if configured. Failures are logged and the prompt
  // is built without retrieval rather than failing the dispatch.
  if (job.context_retrieval && job.context_retrieval !== 'none') {
    try {
      const retrievalCtx = buildRetrievalContext(job);
      if (retrievalCtx) {
        parts.push(retrievalCtx);
        // Counts newline-then-"[" occurrences in the retrieval text.
        contextMeta.retrieval_results = (retrievalCtx.match(/\n\[/g) || []).length;
      }
    } catch (err) {
      log('warn', `Retrieval context error for ${job.name}: ${err.message}`);
    }
  }

  // Inject idempotency key for at-least-once jobs
  if (run.idempotency_key && job.delivery_guarantee === 'at-least-once') {
    parts.push(`\n[IDEMPOTENCY KEY: ${run.idempotency_key}]`);
    parts.push('This is an at-least-once job. Before performing side effects, verify this key');
    parts.push('has not already been processed. If you\'ve already handled this exact execution,');
    parts.push('respond with: IDEMPOTENT_SKIP');
  }

  parts.push('\n' + (job.payload_message ?? ''));
  return { prompt: parts.join('\n'), contextMeta };
}
|
|
447
|
+
|
|
448
|
+
// -- Advance next_run_at -------------------------------------
|
|
449
|
+
/**
 * Recompute a job's `next_run_at` from its cron expression and time zone and
 * store it on the job.
 *
 * @param {object} job - Job row; `id`, `schedule_cron` and `schedule_tz` are read.
 * @returns {void}
 */
function advanceNextRun(job) {
  const upcoming = nextRunFromCron(job.schedule_cron, job.schedule_tz);
  const patch = { next_run_at: upcoming };
  updateJob(job.id, patch);
}
|
|
453
|
+
|
|
454
|
+
// -- Update job state after run ------------------------------
|
|
455
|
+
/**
 * Record the outcome of a run on its job and work out when it runs next.
 *
 * - `error` / `timeout` increment `consecutive_errors`; `ok` resets it to 0;
 *   any other status leaves the counter untouched.
 * - At-jobs (`schedule_kind === 'at'`) are never rescheduled: they are deleted
 *   when `delete_after_run` is set, otherwise disabled.
 * - Other jobs get `next_run_at` from the cron expression. While the job is
 *   failing, the next run is pushed out to the backoff time when that is
 *   later than the cron time.
 * - A job with `delete_after_run` is deleted after an `ok` run.
 *
 * @param {object} job - Job as seen by the caller; only `id` and `name` are
 *   used, everything else is re-read from the database.
 * @param {string} status - Final run status.
 * @returns {void}
 */
function updateJobAfterRun(job, status) {
  // Parse a next-run value for comparison. Zone-less `YYYY-MM-DD HH:MM:SS`
  // strings are the UTC format this file writes, but `new Date()` would read
  // them as local time and skew the backoff comparison by the host's UTC
  // offset. Anything else is parsed as before.
  const parseNextRun = (value) => {
    const isZonelessSqlite = typeof value === 'string'
      && /^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(\.\d+)?$/.test(value);
    return new Date(isZonelessSqlite ? `${value.replace(' ', 'T')}Z` : value);
  };

  // Re-read from DB to get current state (avoids stale consecutive_errors during retries)
  const freshJob = getJob(job.id);
  if (!freshJob) return; // Job was already deleted (e.g. delete_after_run race)
  const currentErrors = freshJob.consecutive_errors || 0;
  const patch = { last_run_at: sqliteNow(), last_status: status };

  if (status === 'error' || status === 'timeout') {
    patch.consecutive_errors = currentErrors + 1;
  } else if (status === 'ok') {
    patch.consecutive_errors = 0;
  }

  // At-jobs (one-shot): don't advance cron schedule -- delete or disable
  if (freshJob.schedule_kind === 'at') {
    if (freshJob.delete_after_run) {
      getDb().transaction(() => {
        updateJob(job.id, patch);
        deleteJob(job.id);
      })();
      log('info', `Deleting one-shot at-job: ${job.name}`, { jobId: job.id });
    } else {
      patch.enabled = 0; // Disable so it won't fire again via getDueAtJobs
      updateJob(job.id, patch);
      log('info', `Disabling completed at-job: ${job.name}`, { jobId: job.id });
    }
    return;
  }

  // Cron job: advance schedule
  const nextRun = nextRunFromCron(freshJob.schedule_cron, freshJob.schedule_tz);
  patch.next_run_at = nextRun;

  // Backoff for errors: only ever pushes the next run later, never earlier.
  // An unparsable nextRun yields an Invalid Date, which makes the comparison
  // false and leaves the cron time in place.
  if (patch.consecutive_errors > 0 && nextRun) {
    const backoffMs = getBackoffMs(patch.consecutive_errors);
    const backoffDate = new Date(Date.now() + backoffMs);
    const nextDate = parseNextRun(nextRun);
    if (backoffDate > nextDate) {
      patch.next_run_at = backoffDate.toISOString().replace('T', ' ').replace(/\.\d{3}Z$/, '');
    }
  }

  if (status === 'ok' && freshJob.delete_after_run) {
    getDb().transaction(() => {
      updateJob(job.id, patch);
      deleteJob(freshJob.id);
    })();
    log('info', `Deleting one-shot: ${freshJob.name}`);
  } else {
    updateJob(job.id, patch);
  }
}
|
|
506
|
+
|
|
507
|
+
// -- Main tick -----------------------------------------------
|
|
508
|
+
async function tick() {
|
|
509
|
+
const now = Date.now();
|
|
510
|
+
|
|
511
|
+
// Gateway health check
|
|
512
|
+
if (!gatewayHealthy || now - lastGatewayCheck >= 60000) {
|
|
513
|
+
lastGatewayCheck = now;
|
|
514
|
+
gatewayHealthy = await checkGatewayHealth();
|
|
515
|
+
if (!gatewayHealthy) {
|
|
516
|
+
log('warn', 'Gateway unreachable -- isolated jobs will be deferred; shell/main jobs continue');
|
|
517
|
+
}
|
|
518
|
+
}
|
|
519
|
+
|
|
520
|
+
// 1. Dispatch due jobs
|
|
521
|
+
try {
|
|
522
|
+
const dueJobs = getDueJobs();
|
|
523
|
+
for (const job of dueJobs) {
|
|
524
|
+
if (!gatewayHealthy && job.session_target === 'isolated') {
|
|
525
|
+
const deferredAt = new Date(Date.now() + 60000).toISOString().replace('T', ' ').replace(/\.\d{3}Z$/, '');
|
|
526
|
+
updateJob(job.id, { next_run_at: deferredAt });
|
|
527
|
+
log('info', `Deferred isolated job while gateway is down: ${job.name}`, { jobId: job.id, nextRunAt: deferredAt });
|
|
528
|
+
continue;
|
|
529
|
+
}
|
|
530
|
+
await dispatchJob(job);
|
|
531
|
+
}
|
|
532
|
+
|
|
533
|
+
// 1b. Dispatch due at-jobs (one-shot scheduling)
|
|
534
|
+
const dueAtJobs = getDueAtJobs();
|
|
535
|
+
for (const job of dueAtJobs) {
|
|
536
|
+
if (!gatewayHealthy && job.session_target === 'isolated') {
|
|
537
|
+
// Gateway down: skip this tick, at-job will be retried next tick
|
|
538
|
+
// (schedule_at condition still holds, enabled=1 unchanged)
|
|
539
|
+
log('info', `Deferred at-job while gateway is down: ${job.name}`, { jobId: job.id, scheduleAt: job.schedule_at });
|
|
540
|
+
continue;
|
|
541
|
+
}
|
|
542
|
+
await dispatchJob(job);
|
|
543
|
+
}
|
|
544
|
+
|
|
545
|
+
const dueDispatches = getDueDispatches();
|
|
546
|
+
for (const dispatchRecord of dueDispatches) {
|
|
547
|
+
const job = getJob(dispatchRecord.job_id);
|
|
548
|
+
if (!job) {
|
|
549
|
+
setDispatchStatus(dispatchRecord.id, 'cancelled');
|
|
550
|
+
continue;
|
|
551
|
+
}
|
|
552
|
+
if (!job.enabled && dispatchRecord.dispatch_kind !== 'manual') {
|
|
553
|
+
setDispatchStatus(dispatchRecord.id, 'cancelled');
|
|
554
|
+
continue;
|
|
555
|
+
}
|
|
556
|
+
if (!gatewayHealthy && job.session_target === 'isolated') {
|
|
557
|
+
releaseDispatch(dispatchRecord.id, sqliteNow(60000));
|
|
558
|
+
log('info', `Deferred queued dispatch while gateway is down: ${job.name}`, {
|
|
559
|
+
jobId: job.id,
|
|
560
|
+
dispatchId: dispatchRecord.id,
|
|
561
|
+
});
|
|
562
|
+
continue;
|
|
563
|
+
}
|
|
564
|
+
await dispatchJob(job, { dispatchRecord });
|
|
565
|
+
}
|
|
566
|
+
} catch (err) {
|
|
567
|
+
log('error', `Dispatch error: ${err.message}`);
|
|
568
|
+
}
|
|
569
|
+
|
|
570
|
+
// 2. Health check + approval gates (every HEARTBEAT_CHECK_MS)
|
|
571
|
+
if (now - lastHeartbeatCheck >= HEARTBEAT_CHECK_MS) {
|
|
572
|
+
lastHeartbeatCheck = now;
|
|
573
|
+
try {
|
|
574
|
+
await checkRunHealth({
|
|
575
|
+
log,
|
|
576
|
+
getDb,
|
|
577
|
+
getRunningRuns,
|
|
578
|
+
getStaleRuns,
|
|
579
|
+
getTimedOutRuns,
|
|
580
|
+
finishRun,
|
|
581
|
+
getJob,
|
|
582
|
+
updateJobAfterRun,
|
|
583
|
+
handleDelivery,
|
|
584
|
+
dequeueJob,
|
|
585
|
+
shouldRetry,
|
|
586
|
+
scheduleRetry,
|
|
587
|
+
staleThresholdSeconds: STALE_THRESHOLD_S,
|
|
588
|
+
});
|
|
589
|
+
} catch (err) {
|
|
590
|
+
log('error', `Health check error: ${err.message}`);
|
|
591
|
+
}
|
|
592
|
+
try {
|
|
593
|
+
await checkApprovals({
|
|
594
|
+
log,
|
|
595
|
+
getDb,
|
|
596
|
+
getTimedOutApprovals,
|
|
597
|
+
getJob,
|
|
598
|
+
resolveApproval,
|
|
599
|
+
dispatchJob,
|
|
600
|
+
getDispatch,
|
|
601
|
+
setDispatchStatus,
|
|
602
|
+
});
|
|
603
|
+
} catch (err) {
|
|
604
|
+
log('error', `Approval check error: ${err.message}`);
|
|
605
|
+
}
|
|
606
|
+
}
|
|
607
|
+
|
|
608
|
+
// 3. Message delivery + spawn handling (every MESSAGE_DELIVERY_MS)
|
|
609
|
+
if (now - lastMessageDelivery >= MESSAGE_DELIVERY_MS) {
|
|
610
|
+
lastMessageDelivery = now;
|
|
611
|
+
// Handle spawn messages -- running jobs can request child job creation
|
|
612
|
+
try {
|
|
613
|
+
const spawnMsgs = getDb().prepare(`
|
|
614
|
+
SELECT * FROM messages WHERE kind = 'spawn' AND delivered_at IS NULL
|
|
615
|
+
`).all();
|
|
616
|
+
for (const msg of spawnMsgs) {
|
|
617
|
+
try {
|
|
618
|
+
const spec = JSON.parse(msg.body);
|
|
619
|
+
if (!spec.payload_message || typeof spec.payload_message !== 'string' || !spec.payload_message.trim()) {
|
|
620
|
+
log('error', `Spawn message missing payload_message`, { msgId: msg.id, fromAgent: msg.from_agent });
|
|
621
|
+
markDelivered(msg.id);
|
|
622
|
+
continue;
|
|
623
|
+
}
|
|
624
|
+
const VALID_SPAWN_SESSION_TARGETS = ['isolated', 'shell'];
|
|
625
|
+
const VALID_SPAWN_DELIVERY_MODES = ['none', 'announce', 'announce-always'];
|
|
626
|
+
|
|
627
|
+
let sessionTarget = spec.session_target || 'isolated';
|
|
628
|
+
if (!VALID_SPAWN_SESSION_TARGETS.includes(sessionTarget)) {
|
|
629
|
+
log('warn', `Spawn: invalid session_target "${sessionTarget}", defaulting to "isolated"`, {
|
|
630
|
+
msgId: msg.id, fromAgent: msg.from_agent,
|
|
631
|
+
});
|
|
632
|
+
sessionTarget = 'isolated';
|
|
633
|
+
}
|
|
634
|
+
|
|
635
|
+
let deliveryMode = spec.delivery_mode || 'none';
|
|
636
|
+
if (!VALID_SPAWN_DELIVERY_MODES.includes(deliveryMode)) {
|
|
637
|
+
log('warn', `Spawn: invalid delivery_mode "${deliveryMode}", defaulting to "none"`, {
|
|
638
|
+
msgId: msg.id, fromAgent: msg.from_agent,
|
|
639
|
+
});
|
|
640
|
+
deliveryMode = 'none';
|
|
641
|
+
}
|
|
642
|
+
|
|
643
|
+
// Wrap job creation + message ack in a transaction so a crash
|
|
644
|
+
// between the two cannot leave an unacked spawn that replays.
|
|
645
|
+
const child = getDb().transaction(() => {
|
|
646
|
+
const c = createJob({
|
|
647
|
+
name: spec.name || `Spawned by ${msg.from_agent}`,
|
|
648
|
+
parent_id: msg.job_id || null,
|
|
649
|
+
schedule_cron: spec.schedule_cron,
|
|
650
|
+
payload_message: spec.payload_message,
|
|
651
|
+
session_target: sessionTarget,
|
|
652
|
+
agent_id: spec.agent_id || msg.to_agent || 'main',
|
|
653
|
+
delivery_mode: deliveryMode,
|
|
654
|
+
delivery_channel: spec.delivery_channel,
|
|
655
|
+
delivery_to: spec.delivery_to,
|
|
656
|
+
delivery_opt_out_reason: spec.delivery_opt_out_reason
|
|
657
|
+
|| (deliveryMode === 'none' ? 'spawned-child' : null),
|
|
658
|
+
delete_after_run: spec.delete_after_run !== false ? 1 : 0,
|
|
659
|
+
enabled: true,
|
|
660
|
+
run_timeout_ms: spec.run_timeout_ms || 300_000,
|
|
661
|
+
origin: spec.origin || 'system',
|
|
662
|
+
});
|
|
663
|
+
// Fire immediately
|
|
664
|
+
getDb().prepare(`UPDATE jobs SET next_run_at = datetime('now', '-1 second') WHERE id = ?`).run(c.id);
|
|
665
|
+
markDelivered(msg.id);
|
|
666
|
+
return c;
|
|
667
|
+
})();
|
|
668
|
+
log('info', `Spawned child job: ${child.name}`, { childId: child.id, parentJobId: msg.job_id });
|
|
669
|
+
} catch (e) {
|
|
670
|
+
log('error', `Spawn message parse error: ${e.message}`, { msgId: msg.id, fromAgent: msg.from_agent });
|
|
671
|
+
markDelivered(msg.id); // Don't retry bad messages
|
|
672
|
+
}
|
|
673
|
+
}
|
|
674
|
+
} catch (err) {
|
|
675
|
+
log('error', `Spawn handler error: ${err.message}`);
|
|
676
|
+
}
|
|
677
|
+
try {
|
|
678
|
+
const mapped = mapTeamMessages(200);
|
|
679
|
+
if (mapped > 0) {
|
|
680
|
+
log('debug', `Team adapter mapped ${mapped} message(s)`);
|
|
681
|
+
}
|
|
682
|
+
} catch (err) {
|
|
683
|
+
log('error', `Team adapter map error: ${err.message}`);
|
|
684
|
+
}
|
|
685
|
+
try {
|
|
686
|
+
const gates = checkTeamTaskGates(100);
|
|
687
|
+
if (gates.passed > 0 || gates.failed > 0) {
|
|
688
|
+
log('info', `Team task gates updated`, gates);
|
|
689
|
+
} else if (gates.pending > 0) {
|
|
690
|
+
log('debug', `Team task gates pending`, gates);
|
|
691
|
+
}
|
|
692
|
+
} catch (err) {
|
|
693
|
+
log('error', `Team gate check error: ${err.message}`);
|
|
694
|
+
}
|
|
695
|
+
try {
|
|
696
|
+
expireStaleMessages({ expireMessages });
|
|
697
|
+
} catch (err) {
|
|
698
|
+
log('error', `Message delivery error: ${err.message}`);
|
|
699
|
+
}
|
|
700
|
+
try {
|
|
701
|
+
await checkTaskTrackers({
|
|
702
|
+
log,
|
|
703
|
+
getDb,
|
|
704
|
+
getAllSubAgentSessions,
|
|
705
|
+
touchAgentHeartbeat,
|
|
706
|
+
checkDeadAgents,
|
|
707
|
+
listActiveTaskGroups,
|
|
708
|
+
checkGroupCompletion,
|
|
709
|
+
getTaskGroupStatus,
|
|
710
|
+
resolveDeliveryAlias,
|
|
711
|
+
deliverMessage,
|
|
712
|
+
});
|
|
713
|
+
} catch (err) {
|
|
714
|
+
log('error', `Task tracker error: ${err.message}`);
|
|
715
|
+
}
|
|
716
|
+
}
|
|
717
|
+
|
|
718
|
+
// 4. Prune (hourly)
|
|
719
|
+
if (now - lastPrune >= PRUNE_INTERVAL_MS) {
|
|
720
|
+
lastPrune = now;
|
|
721
|
+
try {
|
|
722
|
+
pruneRuns(100);
|
|
723
|
+
pruneMessages(30);
|
|
724
|
+
pruneApprovals(30);
|
|
725
|
+
pruneIdempotencyLedger();
|
|
726
|
+
const expiredCount = pruneExpiredJobs();
|
|
727
|
+
if (expiredCount > 0) log('info', `Pruned ${expiredCount} expired disabled job(s)`);
|
|
728
|
+
// Ensure inbox consumer jobs exist for agents with delivery config
|
|
729
|
+
ensureAgentInboxJobs({ log, getDb, createJob });
|
|
730
|
+
// Checkpoint WAL to disk -- reduces data loss window on crash/SIGKILL
|
|
731
|
+
const cpResult = checkpointWal();
|
|
732
|
+
if (cpResult) {
|
|
733
|
+
log('debug', `WAL checkpoint: log=${cpResult.log}, checkpointed=${cpResult.checkpointed}, busy=${cpResult.busy}`);
|
|
734
|
+
}
|
|
735
|
+
log('info', 'Pruned old runs + messages');
|
|
736
|
+
} catch (err) {
|
|
737
|
+
log('error', `Prune error: ${err.message}`);
|
|
738
|
+
}
|
|
739
|
+
}
|
|
740
|
+
|
|
741
|
+
// 5. Backup to MinIO (every BACKUP_INTERVAL_MS, default 5 min; set SCHEDULER_BACKUP=1 to enable)
|
|
742
|
+
if (backupEnabled && now - lastBackup >= BACKUP_INTERVAL_MS) {
|
|
743
|
+
lastBackup = now;
|
|
744
|
+
const isRollup = now - lastRollupBackup >= 3600000;
|
|
745
|
+
if (isRollup) lastRollupBackup = now;
|
|
746
|
+
const mode = isRollup ? 'rollup' : 'snapshot';
|
|
747
|
+
// Run backup in a child process without blocking the event loop
|
|
748
|
+
const { execFile } = await import('child_process');
|
|
749
|
+
execFile(process.execPath, [join(__dirname, 'backup.js'), mode], {
|
|
750
|
+
timeout: 30000,
|
|
751
|
+
stdio: ['ignore', 'pipe', 'pipe'],
|
|
752
|
+
}, (err, _stdout, stderr) => {
|
|
753
|
+
if (err) {
|
|
754
|
+
const msg = stderr?.trim() || err.message;
|
|
755
|
+
if (msg.includes('not found') || msg.includes('ENOENT')) {
|
|
756
|
+
log('warn', `Backup disabled: mc binary not found. Install mc to use backups.`);
|
|
757
|
+
backupEnabled = false;
|
|
758
|
+
} else {
|
|
759
|
+
log('error', `Backup failed: ${msg}`);
|
|
760
|
+
}
|
|
761
|
+
} else {
|
|
762
|
+
log('debug', `Backup ${mode} completed`);
|
|
763
|
+
}
|
|
764
|
+
});
|
|
765
|
+
}
|
|
766
|
+
}
|
|
767
|
+
|
|
768
|
+
// -- Lifecycle -----------------------------------------------
|
|
769
|
+
/**
 * Stop the scheduler in response to a process signal and exit the process.
 *
 * Steps, in order:
 *   1. clear the module-level `running` flag (the main loop condition),
 *   2. attempt a WAL checkpoint (failure is logged and does not stop shutdown),
 *   3. close the database,
 *   4. exit: code 0 when the close succeeded, code 1 when it threw.
 *
 * A failing closeDb() is caught here so it is reported through the scheduler
 * logger and still ends in an explicit process.exit, instead of escaping the
 * signal handler as an uncaught exception.
 *
 * @param {string} signal - Name of the signal that triggered shutdown; only used in the log line.
 * @returns {void} Does not return in production because process.exit is called.
 */
function shutdown(signal) {
  log('info', `Shutting down (${signal})`);
  running = false;

  try {
    // Force WAL checkpoint before close to ensure all data is in main DB
    const cpResult = checkpointWal();
    if (cpResult) {
      log('info', `Shutdown WAL checkpoint: log=${cpResult.log}, checkpointed=${cpResult.checkpointed}, busy=${cpResult.busy}`);
    }
  } catch (err) {
    // Best effort: a failed checkpoint must not prevent closing the DB.
    log('error', `Shutdown checkpoint failed: ${err.message}`);
  }

  let exitCode = 0;
  try {
    closeDb();
    log('info', 'Shutdown complete');
  } catch (err) {
    log('error', `Shutdown close failed: ${err.message}`);
    exitCode = 1;
  }

  process.exit(exitCode);
}
|
|
785
|
+
|
|
786
|
+
// -- Startup repair -----------------------------------------
|
|
787
|
+
/**
 * Find enabled root cron jobs with NULL next_run_at and recompute their schedule.
 * Guards against insertion bugs (e.g. via direct DB write or a CLI code-path that
 * skips nextRunFromCron) that leave a job permanently dormant.
 *
 * Only rows that are enabled, have no parent, and have a cron expression are
 * considered. Jobs whose cron is exactly `0 0 31 2 *` (Feb 31, a date that never
 * occurs) are excluded -- presumably a deliberate "never run" sentinel; confirm
 * against the code that writes it.
 *
 * Each job is repaired independently: if computing or storing the next run for
 * one job throws, the failure is logged and the remaining jobs are still
 * processed, so a single bad row cannot abort the repair pass or its caller.
 *
 * @returns {void}
 */
function repairNullNextRunAt() {
  const db = getDb();
  const broken = db.prepare(`
    SELECT id, name, schedule_cron, schedule_tz
    FROM jobs
    WHERE enabled = 1
      AND next_run_at IS NULL
      AND parent_id IS NULL
      AND schedule_cron IS NOT NULL
      AND schedule_cron != '0 0 31 2 *'
  `).all();

  if (broken.length === 0) return;

  const fix = db.prepare(`UPDATE jobs SET next_run_at = ? WHERE id = ?`);
  for (const job of broken) {
    try {
      // Jobs without an explicit timezone are scheduled in UTC.
      const next = nextRunFromCron(job.schedule_cron, job.schedule_tz || 'UTC');
      if (!next) {
        // Nothing to store; say so instead of skipping the job silently.
        log('warn', `Could not compute next_run_at for job "${job.name}" (cron "${job.schedule_cron}")`);
        continue;
      }
      fix.run(next, job.id);
      log('warn', `Repaired null next_run_at for job "${job.name}" -> ${next}`);
    } catch (err) {
      log('error', `Failed to repair next_run_at for job "${job.name}": ${err.message}`);
    }
  }
}
|
|
815
|
+
|
|
816
|
+
/**
 * Scheduler entry point: initialise, run startup recovery, then tick until stopped.
 *
 * Order of operations:
 *   - log the startup banner with the timing configuration,
 *   - initialise the database,
 *   - load provider plugins when SCHEDULER_PROVIDER_PATH is set,
 *   - register the default "main" agent,
 *   - replay orphaned runs, reconcile queued retry schedules, and repair
 *     enabled cron jobs whose next_run_at is NULL,
 *   - install SIGINT/SIGTERM handlers that call shutdown(),
 *   - loop: tick, then wait TICK_INTERVAL_MS, for as long as `running` is truthy.
 *
 * @returns {Promise<void>} Resolves once the loop observes `running` as falsy.
 */
async function main() {
  const startupMeta = {
    tickMs: TICK_INTERVAL_MS,
    staleThresholdS: STALE_THRESHOLD_S,
    heartbeatCheckMs: HEARTBEAT_CHECK_MS,
  };
  log('info', `Starting OpenClaw Scheduler v${SCHEDULER_VERSION}`, startupMeta);

  await initDb();

  // Provider plugins are optional and only loaded when a path is configured.
  const providerPath = process.env.SCHEDULER_PROVIDER_PATH;
  if (providerPath) {
    await loadProviders(providerPath);
  }

  // The default agent is registered on every start.
  upsertAgent('main', { name: 'Main Agent', status: 'idle', capabilities: ['*'] });

  log('info', 'Database initialized');

  // Startup recovery, before the first tick.
  await replayOrphanedRuns();
  reconcileQueuedRetrySchedules();
  repairNullNextRunAt();

  for (const signalName of ['SIGINT', 'SIGTERM']) {
    process.on(signalName, () => shutdown(signalName));
  }

  const sleepUntilNextTick = () => new Promise((resolve) => {
    setTimeout(resolve, TICK_INTERVAL_MS);
  });

  while (running) {
    await tick();
    await sleepUntilNextTick();
  }
}
|
|
850
|
+
|
|
851
|
+
// Process entry: start the scheduler; any rejection from main() is logged as
// fatal, the database is closed, and the process exits with code 1.
main().catch((err) => {
  // A rejection value is not guaranteed to be an Error (or even an object).
  log('error', `Fatal: ${err?.message ?? err}`);
  try {
    closeDb();
  } catch (closeErr) {
    // A failing close must not mask the fatal error or skip the exit below.
    log('error', `Fatal: closing database failed: ${closeErr?.message ?? closeErr}`);
  } finally {
    process.exit(1);
  }
});
|