openclaw-scheduler 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +302 -0
- package/BEST-PRACTICES.md +506 -0
- package/CHANGELOG.md +82 -0
- package/CODE_OF_CONDUCT.md +22 -0
- package/CONTEXT.md +26 -0
- package/CONTRIBUTING.md +73 -0
- package/IMPLEMENTATION_SPEC.md +170 -0
- package/INSTALL-ADDITIONAL-HOST.md +333 -0
- package/INSTALL-LINUX.md +419 -0
- package/INSTALL-WINDOWS.md +305 -0
- package/INSTALL.md +364 -0
- package/JOB-QUICK-REF.md +222 -0
- package/LICENSE +21 -0
- package/QUICK-START.md +256 -0
- package/README.md +2170 -0
- package/SECURITY.md +34 -0
- package/UNINSTALL.md +129 -0
- package/UPGRADING.md +436 -0
- package/agents.js +67 -0
- package/approval.js +107 -0
- package/backup.js +390 -0
- package/bin/openclaw-scheduler.js +138 -0
- package/cli.js +1083 -0
- package/db.js +122 -0
- package/dispatch/529-recovery.mjs +204 -0
- package/dispatch/README.md +372 -0
- package/dispatch/config.example.json +24 -0
- package/dispatch/deliver-watcher.sh +57 -0
- package/dispatch/hooks.mjs +171 -0
- package/dispatch/index.mjs +1836 -0
- package/dispatch/watcher.mjs +1396 -0
- package/dispatch-queue.js +112 -0
- package/dispatcher-approvals.js +96 -0
- package/dispatcher-delivery.js +43 -0
- package/dispatcher-maintenance.js +242 -0
- package/dispatcher-shell.js +29 -0
- package/dispatcher-strategies.js +1280 -0
- package/dispatcher-utils.js +81 -0
- package/dispatcher.js +855 -0
- package/docs/adr-schedule-ownership.md +73 -0
- package/docs/gateway-contract.md +904 -0
- package/docs/plans/2026-03-09-fix-typescript-types.md +91 -0
- package/docs/plans/2026-03-09-test-coverage-gaps.md +83 -0
- package/docs/plans/2026-03-10-dispatcher-refactor.md +801 -0
- package/docs/trust-architecture.md +266 -0
- package/gateway.js +473 -0
- package/idempotency.js +119 -0
- package/index.d.ts +864 -0
- package/index.js +17 -0
- package/jobs.js +1224 -0
- package/messages.js +357 -0
- package/migrate-consolidate.js +694 -0
- package/migrate.js +125 -0
- package/package.json +130 -0
- package/paths.js +79 -0
- package/prompt-context.js +94 -0
- package/retrieval.js +176 -0
- package/runs.js +270 -0
- package/scheduler-schema.js +101 -0
- package/schema.sql +480 -0
- package/scripts/dispatch-cli-utils.mjs +65 -0
- package/scripts/inbox-consumer.mjs +288 -0
- package/scripts/stuck-detector.sh +18 -0
- package/scripts/stuck-run-detector.mjs +333 -0
- package/scripts/telegram-webhook-check.mjs +238 -0
- package/setup.mjs +724 -0
- package/shell-result.js +214 -0
- package/task-tracker.js +300 -0
- package/team-adapter.js +335 -0
- package/v02-runtime.js +599 -0
|
@@ -0,0 +1,1280 @@
|
|
|
1
|
+
// dispatcher-strategies.js
|
|
2
|
+
// Strategy pattern for dispatchJob: each execution target returns a DispatchResult,
|
|
3
|
+
// and finalizeDispatch processes it uniformly.
|
|
4
|
+
|
|
5
|
+
/**
 * DispatchResult shape (returned by every strategy):
 * {
 *   status: 'ok' | 'error' | 'skipped',
 *   summary: string,
 *   content: string,              // for delivery + trigger condition eval
 *   errorMessage: string | null,
 *   runFinishFields: object,      // extra fields for finishRun (shell_exit_code, etc.)
 *   deliveryOverride: string | null, // override delivery content (null = use content)
 *   skipDelivery: boolean,        // suppress delivery entirely
 *   skipJobUpdate: boolean,       // strategy handled job state itself
 *   skipChildren: boolean,        // don't fire triggered children
 *   skipDequeue: boolean,         // don't drain overlap queue
 *   skipAgentCleanup: boolean,    // don't reset the job's agent to 'idle' in finalizeDispatch
 *   idemAction: 'keep' | 'release' | 'noop', // what to do with idempotency key
 *   retryFiresChildren: boolean,  // whether retry path fires triggered children
 *   earlyReturn: boolean,         // finalize should skip everything (strategy fully handled it)
 * }
 */

/**
 * Build a fresh DispatchResult populated with the default value of every field.
 *
 * A new object (including a new `runFinishFields` object) is created on each
 * call, so strategies can mutate the result without affecting other callers.
 *
 * @returns {object} DispatchResult with default values
 */
export function makeDefaultResult() {
  return {
    status: 'ok',
    summary: '',
    content: '',
    errorMessage: null,
    runFinishFields: {},
    deliveryOverride: null,
    skipDelivery: false,
    skipJobUpdate: false,
    skipChildren: false,
    skipDequeue: false,
    // Defaults to true: finalizeDispatch only resets agent status when a
    // strategy explicitly clears this flag.
    skipAgentCleanup: true,
    idemAction: 'noop',
    retryFiresChildren: false,
    earlyReturn: false,
  };
}
|
|
42
|
+
|
|
43
|
+
/**
 * Parse a JSON string without throwing.
 *
 * @param {string|null|undefined} str - JSON text to decode
 * @returns {*} the decoded value, or null when the input is null, undefined,
 *   the empty string, or cannot be parsed
 */
function safeParse(str) {
  const isMissing = str === null || str === undefined || str === '';
  if (isMissing) {
    return null;
  }

  let decoded;
  try {
    decoded = JSON.parse(str);
  } catch (_e) {
    // Malformed input is an expected case here; report it as "no value".
    decoded = null;
  }
  return decoded;
}
|
|
52
|
+
|
|
53
|
+
/**
 * Extract a trust level from an identity-like object.
 *
 * Candidate locations are checked in priority order and the first truthy
 * value wins: the top-level `trust_level`, then `trust`, then
 * `session.trust`, then the `raw` blob (its `trust_level`, then its `trust`).
 * Within a `trust` object, `effective_level` takes precedence over `level`.
 *
 * @param {object|null|undefined} identity
 * @returns {*} the first truthy trust level found, or null
 */
function getIdentityTrustLevel(identity) {
  if (identity === null || typeof identity !== 'object') {
    return null;
  }

  const { trust, session, raw } = identity;
  const candidates = [
    identity.trust_level,
    trust?.effective_level,
    trust?.level,
    session?.trust?.effective_level,
    session?.trust?.level,
    raw?.trust_level,
    raw?.trust?.effective_level,
    raw?.trust?.level,
  ];

  return candidates.find(Boolean) ?? null;
}
|
|
65
|
+
|
|
66
|
+
/**
 * Determine the trust level declared for a job.
 *
 * The identity blob (the caller-supplied `parsedIdentity`, or else the job's
 * `identity` JSON parsed here) is consulted first; the job's
 * `identity_trust_level` column is the fallback.
 *
 * @param {object|null|undefined} job
 * @param {object|null} [parsedIdentity=null] - already-parsed identity blob
 * @returns {*} the trust level, or null when none is declared
 */
function getJobTrustLevel(job, parsedIdentity = null) {
  const blob = parsedIdentity ? parsedIdentity : safeParse(job?.identity);
  const fromIdentity = getIdentityTrustLevel(blob);
  if (fromIdentity) {
    return fromIdentity;
  }
  return job?.identity_trust_level || null;
}
|
|
70
|
+
|
|
71
|
+
/**
 * Report whether a job declares any identity-related field.
 *
 * A field counts as declared when it is neither null nor undefined, so
 * empty strings and 0 still count.
 *
 * @param {object|null|undefined} job
 * @returns {boolean} true when at least one identity field is set
 */
function hasIdentityDeclaration(job) {
  if (!job) {
    return false;
  }

  const identityFields = [
    'identity',
    'identity_ref',
    'identity_principal',
    'identity_run_as',
    'identity_attestation',
    'identity_subject_kind',
    'identity_subject_principal',
    'identity_trust_level',
    'identity_delegation_mode',
  ];

  return identityFields.some((field) => job[field] != null);
}
|
|
83
|
+
|
|
84
|
+
/**
 * Redact session credentials from v02Outcomes before DB persistence.
 *
 * Uses the provider's describeSession() for redaction when available,
 * otherwise strips the `credentials` key directly. As a safety net, if the
 * provider's description still carries the very same raw credentials value
 * it was given (i.e. it passed them through unredacted), the key is stripped
 * anyway. Provider-supplied redaction markers (any different value) are kept.
 *
 * The input is never mutated; when there is nothing to redact the original
 * `outcomes` reference is returned unchanged.
 *
 * @param {object|null|undefined} outcomes - v0.2 outcomes, possibly holding
 *   `identity_resolved.session.credentials`
 * @param {object} [deps] - may provide `getIdentityProvider(name)`
 * @returns {object|null|undefined} outcomes that are safe to persist
 */
export function redactOutcomesForPersistence(outcomes, deps) {
  const rawCredentials = outcomes?.identity_resolved?.session?.credentials;
  if (!rawCredentials) return outcomes;

  // Shallow copy of a session with the `credentials` key removed.
  const withoutCredentials = (session) => {
    const { credentials: _credentials, ...rest } = session;
    return rest;
  };

  const redacted = { ...outcomes };
  const ir = { ...redacted.identity_resolved };
  // The provider receives a copy, so it cannot mutate the live session.
  const session = { ...ir.session };

  const providerName = ir.provider;
  const provider = providerName && deps?.getIdentityProvider?.(providerName);
  if (provider && typeof provider.describeSession === 'function') {
    try {
      const described = provider.describeSession(session);
      const leakedThrough = described !== null
        && typeof described === 'object'
        && described.credentials === rawCredentials;
      // Never persist the raw secret, even if the provider echoed it back.
      ir.session = leakedThrough ? withoutCredentials(described) : described;
    } catch (_err) {
      // Deliberate best-effort: a failing provider must not block persistence
      // or leak secrets, so fall back to plain stripping.
      ir.session = withoutCredentials(session);
    }
  } else {
    ir.session = withoutCredentials(session);
  }

  redacted.identity_resolved = ir;
  return redacted;
}
|
|
112
|
+
|
|
113
|
+
/**
 * Abort a run that was already created during dispatch preparation.
 *
 * In order: marks the run as errored with `summary`, persists the redacted
 * v0.2 outcomes, releases the idempotency key (if any), records the error on
 * the job, completes the dispatch record (if any), optionally fires triggered
 * children, and drains the overlap queue.
 *
 * @param {object} job - The job record
 * @param {object} run - The run created for this dispatch
 * @param {string} summary - Used as both run summary and error message
 * @param {object} outcomes - v0.2 outcomes collected so far
 * @param {object} state - { dispatchRecord, idemKey }
 * @param {object} deps - Injected dependencies
 * @param {object} [opts] - { skipChildren }: suppress triggered children
 * @returns {null} always null, so callers can `return abortPreparedRun(...)`
 */
function abortPreparedRun(job, run, summary, outcomes, state, deps, opts = {}) {
  const {
    finishRun, persistV02Outcomes, releaseIdempotencyKey, updateJobAfterRun,
    setDispatchStatus, handleTriggeredChildren, dequeueJob, log,
  } = deps;
  const runId = run.id;

  finishRun(runId, 'error', { summary, error_message: summary });

  const persistable = redactOutcomesForPersistence(outcomes, deps);
  persistV02Outcomes(runId, persistable);

  if (state.idemKey) {
    releaseIdempotencyKey(state.idemKey);
  }

  updateJobAfterRun(job, 'error');

  if (state.dispatchRecord) {
    setDispatchStatus(state.dispatchRecord.id, 'done');
  }

  // Security-related aborts (identity/trust/auth/proof/credential failures)
  // pass skipChildren: a parent that failed a security gate must not trigger
  // downstream work that may have weaker security requirements.
  const fireChildren = !opts.skipChildren;
  if (fireChildren) {
    handleTriggeredChildren(job.id, 'error', summary, runId);
  }

  const drained = dequeueJob(job.id);
  if (drained) {
    log('info', `Dequeued pending dispatch for ${job.name}`);
  }

  return null;
}
|
|
138
|
+
|
|
139
|
+
/**
 * Uniform post-execution ceremony. Processes the DispatchResult from any strategy.
 *
 * Steps, in order: finish the run, persist v0.2 evidence/outcomes, run provider
 * cleanup, manage the idempotency key, reset agent status, deliver output,
 * schedule a retry on error, update job state, complete the dispatch record,
 * fire triggered children, and drain the overlap queue.
 *
 * Provider cleanup runs in a `finally`, so materialized state is released even
 * when finishing the run or persisting outcomes throws; the original error
 * still propagates to the caller afterwards.
 *
 * @param {object} job - The job record
 * @param {object} ctx - DispatchContext from prepareDispatch
 * @param {object} result - DispatchResult from the strategy
 * @param {object} deps - Injected dependencies
 * @returns {Promise<void>}
 */
export async function finalizeDispatch(job, ctx, result, deps) {
  const {
    finishRun, updateIdempotencyResultHash, releaseIdempotencyKey,
    setAgentStatus, handleDelivery, shouldRetry, scheduleRetry,
    getDb, updateJobAfterRun, setDispatchStatus, handleTriggeredChildren,
    dequeueJob, log,
  } = deps;

  // The strategy already handled everything, including any cleanup.
  if (result.earlyReturn) return;

  try {
    // 1. Finish the run
    finishRun(ctx.run.id, result.status, {
      summary: result.summary,
      error_message: result.errorMessage,
      ...result.runFinishFields,
    });

    // 1b. v0.2 evidence and outcome persistence
    if (ctx.v02Outcomes) {
      const { generateEvidence, persistV02Outcomes } = deps;
      if (job.evidence || job.evidence_ref) {
        const runMetadata = { id: ctx.run.id, status: result.status };
        const evidence = generateEvidence(job, runMetadata, ctx.v02Outcomes);
        if (evidence) ctx.v02Outcomes.evidence_record = evidence;
      }
      persistV02Outcomes(ctx.run.id, redactOutcomesForPersistence(ctx.v02Outcomes, deps));
    }
  } finally {
    // 1c. Provider cleanup -- must run even if step 1/1b threw, otherwise
    // whatever the provider materialized for this run is never released.
    if (ctx.materializationCleanup) {
      try {
        const { provider, cleanupState } = ctx.materializationCleanup;
        if (typeof provider.cleanup === 'function') {
          await provider.cleanup(cleanupState, { env: process.env, cwd: process.cwd() });
        }
      } catch (err) {
        // Best-effort: a cleanup failure is logged, never fatal. Tolerate
        // non-Error throwables so this handler cannot itself throw.
        log('warn', `Provider cleanup failed for ${job.name}: ${err?.message ?? err}`, { jobId: job.id });
      }
    }
  }

  // 2. Idempotency key management
  if (ctx.idemKey) {
    if (result.idemAction === 'keep') {
      updateIdempotencyResultHash(ctx.idemKey, result.content);
    } else if (result.idemAction === 'release') {
      releaseIdempotencyKey(ctx.idemKey);
    }
    // 'noop' -- leave key claimed without writing result hash
  }

  // 3. Agent status cleanup (only for strategies that set busy)
  if (!result.skipAgentCleanup && job.agent_id) setAgentStatus(job.agent_id, 'idle', null);

  // 4. Delivery
  if (!result.skipDelivery) {
    const deliveryContent = result.deliveryOverride ?? result.content;
    const shouldAnnounce = ['announce', 'announce-always'].includes(job.delivery_mode)
      && deliveryContent?.trim();

    if (shouldAnnounce) {
      if (result.deliveryOverride) {
        await handleDelivery(job, result.deliveryOverride);
      } else if (result.status === 'error') {
        // The label is derived from the configured retry budget and this run's
        // retry_count, independently of the shouldRetry() decision in step 5.
        const willRetry = (job.max_retries ?? 0) > 0 && (ctx.run.retry_count || 0) < job.max_retries;
        const retryLabel = willRetry ? 'will retry' : 'no retries configured';
        await handleDelivery(job, `\u26a0\ufe0f Job soft-failed (${retryLabel}): ${job.name}\n\n${deliveryContent}`);
      } else {
        await handleDelivery(job, deliveryContent);
      }
    }
  }

  // 5. Retry on error
  if (result.status === 'error' && shouldRetry(job, ctx.run.id)) {
    const retry = scheduleRetry(job, ctx.run.id);
    if (retry.dispatch) {
      log('info', `Scheduling retry ${retry.retryCount}/${job.max_retries} in ${retry.delaySec}s`, {
        jobId: job.id, runId: ctx.run.id,
      });
      getDb().prepare('UPDATE runs SET retry_count = ? WHERE id = ?').run(retry.retryCount, ctx.run.id);
      if (ctx.dispatchRecord) setDispatchStatus(ctx.dispatchRecord.id, 'done');
      if (!result.skipDequeue && dequeueJob(job.id)) {
        log('info', `Dequeued pending dispatch for ${job.name}`);
      }
      if (result.retryFiresChildren && !result.skipChildren) {
        handleTriggeredChildren(job.id, 'error', result.content, ctx.run.id, ' on soft failure');
      }
      log('info', `Failed: ${job.name} (retry scheduled)`, { runId: ctx.run.id });
      return; // retry path handles everything
    }
    log('warn', `Retry skipped for ${job.name} -- dispatch backlog limit reached`, {
      jobId: job.id, runId: ctx.run.id,
      maxQueuedDispatches: job.max_queued_dispatches || 25,
    });
    // Fall through to steps 6-9: updateJobAfterRun, dispatch status, children, dequeue
  }

  // 6. Update job state
  if (!result.skipJobUpdate) {
    updateJobAfterRun(job, result.status);
  }

  // 7. Complete dispatch
  if (ctx.dispatchRecord) {
    setDispatchStatus(ctx.dispatchRecord.id, 'done');
  }

  // 8. Triggered children
  if (!result.skipChildren) {
    handleTriggeredChildren(job.id, result.status, result.content, ctx.run.id);
  }

  // 9. Dequeue overlap
  if (!result.skipDequeue && dequeueJob(job.id)) {
    log('info', `Dequeued pending dispatch for ${job.name}`);
  }
}
|
|
264
|
+
|
|
265
|
+
// -- Phase 1: Guards + run creation --------------------------
|
|
266
|
+
|
|
267
|
+
/**
|
|
268
|
+
* DispatchContext shape (returned by prepareDispatch):
|
|
269
|
+
* {
|
|
270
|
+
* dispatchRecord: object | null,
|
|
271
|
+
* idemKey: string | null,
|
|
272
|
+
* run: object, // the created run record
|
|
273
|
+
* retryCount: number,
|
|
274
|
+
* dispatchKind: string | null,
|
|
275
|
+
* isChainDispatch: boolean,
|
|
276
|
+
* }
|
|
277
|
+
*/
|
|
278
|
+
|
|
279
|
+
/**
|
|
280
|
+
* Phase 1: Guards + run creation. Returns DispatchContext or null (guard rejected).
|
|
281
|
+
*
|
|
282
|
+
* @param {object} job
|
|
283
|
+
* @param {object} opts - { approvalBypass, dispatchRecord }
|
|
284
|
+
* @param {object} deps - Injected dependencies
|
|
285
|
+
* @returns {object|null}
|
|
286
|
+
*/
|
|
287
|
+
export async function prepareDispatch(job, opts, deps) {
|
|
288
|
+
const {
|
|
289
|
+
claimDispatch, releaseDispatch, setDispatchStatus,
|
|
290
|
+
countPendingApprovalsForJob, getPendingApproval,
|
|
291
|
+
createApproval, createRun, getRun,
|
|
292
|
+
hasRunningRunForPool, hasRunningRun,
|
|
293
|
+
enqueueJob, getDispatchBacklogCount,
|
|
294
|
+
generateIdempotencyKey, generateChainIdempotencyKey,
|
|
295
|
+
generateRunNowIdempotencyKey, claimIdempotencyKey,
|
|
296
|
+
finishRun, getDb,
|
|
297
|
+
sqliteNow, adaptiveDeferralMs,
|
|
298
|
+
handleDelivery, advanceNextRun,
|
|
299
|
+
TICK_INTERVAL_MS,
|
|
300
|
+
log,
|
|
301
|
+
} = deps;
|
|
302
|
+
|
|
303
|
+
const approvalBypass = opts.approvalBypass === true;
|
|
304
|
+
let dispatchRecord = opts.dispatchRecord || null;
|
|
305
|
+
|
|
306
|
+
// Claim pending dispatch
|
|
307
|
+
if (dispatchRecord && dispatchRecord.status === 'pending') {
|
|
308
|
+
dispatchRecord = claimDispatch(dispatchRecord.id);
|
|
309
|
+
if (!dispatchRecord) {
|
|
310
|
+
log('debug', `Skipping claimed dispatch for ${job.name}`, { dispatchId: opts.dispatchRecord.id });
|
|
311
|
+
return null;
|
|
312
|
+
}
|
|
313
|
+
}
|
|
314
|
+
|
|
315
|
+
const completeCurrentDispatch = (status = 'done') => {
|
|
316
|
+
if (!dispatchRecord) return null;
|
|
317
|
+
return setDispatchStatus(dispatchRecord.id, status);
|
|
318
|
+
};
|
|
319
|
+
|
|
320
|
+
const dispatchKind = dispatchRecord?.dispatch_kind || null;
|
|
321
|
+
const isChainDispatch = dispatchKind === 'chain';
|
|
322
|
+
const dispatchBacklogDepth = getDispatchBacklogCount(job.id);
|
|
323
|
+
|
|
324
|
+
// HITL approval gate
|
|
325
|
+
if (job.approval_required && isChainDispatch && !approvalBypass) {
|
|
326
|
+
const pendingApprovalCount = countPendingApprovalsForJob(job.id);
|
|
327
|
+
if (pendingApprovalCount >= (job.max_pending_approvals || 10)) {
|
|
328
|
+
completeCurrentDispatch('cancelled');
|
|
329
|
+
log('warn', `Approval backlog limit reached for ${job.name}`, {
|
|
330
|
+
jobId: job.id,
|
|
331
|
+
pendingApprovals: pendingApprovalCount,
|
|
332
|
+
maxPendingApprovals: job.max_pending_approvals || 10,
|
|
333
|
+
});
|
|
334
|
+
return null;
|
|
335
|
+
}
|
|
336
|
+
const existing = getPendingApproval(job.id);
|
|
337
|
+
if (existing) {
|
|
338
|
+
releaseDispatch(dispatchRecord.id, sqliteNow(adaptiveDeferralMs(dispatchBacklogDepth)));
|
|
339
|
+
log('debug', `Skipping ${job.name} -- approval already pending`, {
|
|
340
|
+
approvalId: existing.id,
|
|
341
|
+
dispatchId: dispatchRecord?.id || null,
|
|
342
|
+
deferredMs: adaptiveDeferralMs(dispatchBacklogDepth),
|
|
343
|
+
});
|
|
344
|
+
return null;
|
|
345
|
+
}
|
|
346
|
+
const run = createRun(job.id, {
|
|
347
|
+
run_timeout_ms: job.run_timeout_ms,
|
|
348
|
+
status: 'awaiting_approval',
|
|
349
|
+
dispatch_queue_id: dispatchRecord?.id || null,
|
|
350
|
+
triggered_by_run: dispatchRecord?.source_run_id || null,
|
|
351
|
+
retry_of: dispatchRecord?.retry_of_run_id || null,
|
|
352
|
+
});
|
|
353
|
+
const approval = createApproval(job.id, run.id, dispatchRecord?.id || null);
|
|
354
|
+
if (dispatchRecord) setDispatchStatus(dispatchRecord.id, 'awaiting_approval');
|
|
355
|
+
log('info', `Approval required for ${job.name} -- awaiting operator`, { approvalId: approval.id, runId: run.id });
|
|
356
|
+
const msg = `\u26a0\ufe0f Job '${job.name}' requires approval.\nApprove: openclaw-scheduler jobs approve ${job.id}\nReject: openclaw-scheduler jobs reject ${job.id}`;
|
|
357
|
+
await handleDelivery({ ...job, delivery_mode: 'announce-always' }, msg);
|
|
358
|
+
return null;
|
|
359
|
+
}
|
|
360
|
+
|
|
361
|
+
// Resource pool concurrency
|
|
362
|
+
if (job.resource_pool && hasRunningRunForPool(job.resource_pool)) {
|
|
363
|
+
log('info', `Skipping ${job.name} -- resource pool '${job.resource_pool}' busy`, { jobId: job.id, pool: job.resource_pool });
|
|
364
|
+
if (dispatchRecord) {
|
|
365
|
+
releaseDispatch(dispatchRecord.id, sqliteNow(TICK_INTERVAL_MS));
|
|
366
|
+
} else {
|
|
367
|
+
advanceNextRun(job);
|
|
368
|
+
}
|
|
369
|
+
return null;
|
|
370
|
+
}
|
|
371
|
+
|
|
372
|
+
// Overlap control
|
|
373
|
+
if (hasRunningRun(job.id)) {
|
|
374
|
+
if (job.overlap_policy === 'skip') {
|
|
375
|
+
log('info', `Skipping ${job.name} -- previous run still active`, { jobId: job.id });
|
|
376
|
+
if (dispatchRecord) {
|
|
377
|
+
completeCurrentDispatch('cancelled');
|
|
378
|
+
} else {
|
|
379
|
+
advanceNextRun(job);
|
|
380
|
+
}
|
|
381
|
+
return null;
|
|
382
|
+
}
|
|
383
|
+
if (job.overlap_policy === 'queue') {
|
|
384
|
+
const queueResult = enqueueJob(job.id);
|
|
385
|
+
if (!queueResult.queued) {
|
|
386
|
+
log('warn', `Queue limit reached for ${job.name} -- dropping overlap dispatch`, {
|
|
387
|
+
jobId: job.id,
|
|
388
|
+
queuedCount: queueResult.queued_count,
|
|
389
|
+
maxQueuedDispatches: job.max_queued_dispatches || 25,
|
|
390
|
+
});
|
|
391
|
+
if (dispatchRecord) {
|
|
392
|
+
completeCurrentDispatch('cancelled');
|
|
393
|
+
} else {
|
|
394
|
+
advanceNextRun(job);
|
|
395
|
+
}
|
|
396
|
+
return null;
|
|
397
|
+
}
|
|
398
|
+
log('info', `Queueing ${job.name} -- previous run still active`, {
|
|
399
|
+
jobId: job.id,
|
|
400
|
+
queuedCount: queueResult.queued_count,
|
|
401
|
+
});
|
|
402
|
+
if (dispatchRecord) {
|
|
403
|
+
completeCurrentDispatch('done');
|
|
404
|
+
} else {
|
|
405
|
+
advanceNextRun(job);
|
|
406
|
+
}
|
|
407
|
+
return null;
|
|
408
|
+
}
|
|
409
|
+
// 'allow' falls through
|
|
410
|
+
}
|
|
411
|
+
|
|
412
|
+
// Idempotency key generation
|
|
413
|
+
const scheduledTime = job.schedule_at || job.next_run_at;
|
|
414
|
+
let idemKey;
|
|
415
|
+
if (dispatchKind === 'chain') {
|
|
416
|
+
idemKey = generateChainIdempotencyKey(dispatchRecord.source_run_id || dispatchRecord.id, job.id);
|
|
417
|
+
} else if (dispatchKind === 'manual') {
|
|
418
|
+
idemKey = generateRunNowIdempotencyKey(job.id);
|
|
419
|
+
} else if (dispatchKind === 'retry') {
|
|
420
|
+
idemKey = generateChainIdempotencyKey(dispatchRecord.retry_of_run_id || dispatchRecord.id, job.id);
|
|
421
|
+
} else {
|
|
422
|
+
idemKey = generateIdempotencyKey(job, scheduledTime);
|
|
423
|
+
}
|
|
424
|
+
|
|
425
|
+
// Idempotency dedup
|
|
426
|
+
if (idemKey) {
|
|
427
|
+
const existing = getDb().prepare("SELECT * FROM idempotency_ledger WHERE key = ? AND status = 'claimed'").get(idemKey);
|
|
428
|
+
if (existing) {
|
|
429
|
+
log('info', `Idempotency skip: ${job.name} (key ${idemKey.slice(0,8)}... already claimed by run ${existing.run_id.slice(0,8)}...)`);
|
|
430
|
+
if (dispatchRecord) {
|
|
431
|
+
completeCurrentDispatch('done');
|
|
432
|
+
} else {
|
|
433
|
+
advanceNextRun(job);
|
|
434
|
+
}
|
|
435
|
+
return null;
|
|
436
|
+
}
|
|
437
|
+
}
|
|
438
|
+
|
|
439
|
+
log('info', `Dispatching: ${job.name}`, { jobId: job.id, target: job.session_target });
|
|
440
|
+
|
|
441
|
+
const retryCount = dispatchKind === 'retry' && dispatchRecord?.retry_of_run_id
|
|
442
|
+
? (getRun(dispatchRecord.retry_of_run_id)?.retry_count || 0)
|
|
443
|
+
: 0;
|
|
444
|
+
|
|
445
|
+
const run = createRun(job.id, {
|
|
446
|
+
run_timeout_ms: job.run_timeout_ms,
|
|
447
|
+
idempotency_key: idemKey,
|
|
448
|
+
retry_count: retryCount,
|
|
449
|
+
dispatch_queue_id: dispatchRecord?.id || null,
|
|
450
|
+
triggered_by_run: dispatchRecord?.source_run_id || null,
|
|
451
|
+
retry_of: dispatchRecord?.retry_of_run_id || null,
|
|
452
|
+
});
|
|
453
|
+
|
|
454
|
+
// Claim idempotency key
|
|
455
|
+
if (idemKey) {
|
|
456
|
+
const expiresAt = job.delete_after_run
|
|
457
|
+
? sqliteNow(24 * 60 * 60 * 1000)
|
|
458
|
+
: sqliteNow(7 * 24 * 60 * 60 * 1000);
|
|
459
|
+
const claimed = claimIdempotencyKey(idemKey, job.id, run.id, expiresAt);
|
|
460
|
+
if (!claimed) {
|
|
461
|
+
log('warn', `Idempotency race: ${job.name} key ${idemKey.slice(0,8)}... claimed by concurrent dispatch`);
|
|
462
|
+
finishRun(run.id, 'skipped', { summary: 'Idempotency key already claimed (race)' });
|
|
463
|
+
if (dispatchRecord) {
|
|
464
|
+
completeCurrentDispatch('done');
|
|
465
|
+
} else {
|
|
466
|
+
advanceNextRun(job);
|
|
467
|
+
}
|
|
468
|
+
return null;
|
|
469
|
+
}
|
|
470
|
+
}
|
|
471
|
+
|
|
472
|
+
// v0.2 runtime evaluation
|
|
473
|
+
const {
|
|
474
|
+
resolveIdentity, evaluateTrust, verifyAuthorizationProof,
|
|
475
|
+
evaluateAuthorization, summarizeCredentialHandoff,
|
|
476
|
+
} = deps;
|
|
477
|
+
|
|
478
|
+
// Build provider context for v0.2 runtime calls
|
|
479
|
+
const providerCtx = {
|
|
480
|
+
getIdentityProvider: deps.getIdentityProvider,
|
|
481
|
+
getAuthorizationProvider: deps.getAuthorizationProvider,
|
|
482
|
+
getProofVerifier: deps.getProofVerifier,
|
|
483
|
+
env: process.env,
|
|
484
|
+
cwd: process.cwd(),
|
|
485
|
+
};
|
|
486
|
+
|
|
487
|
+
const v02Outcomes = {};
|
|
488
|
+
const hasV02Identity = hasIdentityDeclaration(job);
|
|
489
|
+
const hasV02Contract = job.contract_required_trust_level;
|
|
490
|
+
const needsAuthorization = job.authorization || job.authorization_ref;
|
|
491
|
+
const shouldResolveIdentity = hasV02Identity || hasV02Contract || needsAuthorization;
|
|
492
|
+
|
|
493
|
+
if (shouldResolveIdentity) {
|
|
494
|
+
v02Outcomes.identity_resolved = await resolveIdentity(job, providerCtx);
|
|
495
|
+
}
|
|
496
|
+
|
|
497
|
+
if (hasV02Identity) {
|
|
498
|
+
const handoff = summarizeCredentialHandoff(job);
|
|
499
|
+
if (handoff) v02Outcomes.credential_handoff_summary = handoff;
|
|
500
|
+
}
|
|
501
|
+
|
|
502
|
+
const hasDeclaredCredentialHandoff = v02Outcomes.credential_handoff_summary
|
|
503
|
+
&& (v02Outcomes.credential_handoff_summary.mode != null
|
|
504
|
+
|| v02Outcomes.credential_handoff_summary.bindings_count > 0);
|
|
505
|
+
if (hasDeclaredCredentialHandoff && job.session_target !== 'shell') {
|
|
506
|
+
return abortPreparedRun(
|
|
507
|
+
job,
|
|
508
|
+
run,
|
|
509
|
+
'Credential handoff presentation is only supported for shell jobs',
|
|
510
|
+
v02Outcomes,
|
|
511
|
+
{ dispatchRecord, idemKey },
|
|
512
|
+
deps,
|
|
513
|
+
{ skipChildren: true },
|
|
514
|
+
);
|
|
515
|
+
}
|
|
516
|
+
|
|
517
|
+
// Child credential policy enforcement.
|
|
518
|
+
// Apply this BEFORE trust/auth evaluation so later gates see the effective
|
|
519
|
+
// identity that will actually be materialized for the run. The policy can
|
|
520
|
+
// narrow (downscope) or remove (none) credentials, and it may also inherit
|
|
521
|
+
// the parent's auth_profile for downstream gateway calls.
|
|
522
|
+
if (job.parent_id) {
|
|
523
|
+
const { getDb: getDatabase } = deps;
|
|
524
|
+
const parentJob = getDatabase().prepare(
|
|
525
|
+
'SELECT id, child_credential_policy, identity, identity_trust_level, auth_profile FROM jobs WHERE id = ?'
|
|
526
|
+
).get(job.parent_id);
|
|
527
|
+
|
|
528
|
+
if (parentJob) {
|
|
529
|
+
const effectivePolicy = job.child_credential_policy
|
|
530
|
+
|| parentJob.child_credential_policy
|
|
531
|
+
|| 'none';
|
|
532
|
+
const parentIdentityBlob = safeParse(parentJob.identity);
|
|
533
|
+
const lastSuccessfulParentRun = (effectivePolicy === 'downscope' || effectivePolicy === 'independent')
|
|
534
|
+
? getDatabase().prepare(
|
|
535
|
+
'SELECT identity_resolved FROM runs WHERE job_id = ? AND status = ? ORDER BY started_at DESC LIMIT 1'
|
|
536
|
+
).get(parentJob.id, 'ok')
|
|
537
|
+
: null;
|
|
538
|
+
const parentResolvedIdentity = lastSuccessfulParentRun?.identity_resolved
|
|
539
|
+
? safeParse(lastSuccessfulParentRun.identity_resolved)
|
|
540
|
+
: null;
|
|
541
|
+
|
|
542
|
+
if (effectivePolicy === 'none') {
|
|
543
|
+
// No credentials from parent; suppress any identity the child resolved on its own
|
|
544
|
+
v02Outcomes.identity_resolved = null;
|
|
545
|
+
} else if (effectivePolicy === 'inherit' && parentJob.auth_profile) {
|
|
546
|
+
// Inherit parent's auth profile. Store in v02Outcomes rather than
|
|
547
|
+
// mutating the job DB record, which could leak to downstream writes.
|
|
548
|
+
v02Outcomes.effective_auth_profile = parentJob.auth_profile;
|
|
549
|
+
} else if (effectivePolicy === 'downscope') {
|
|
550
|
+
// Downscope: resolve narrower credentials via provider.
|
|
551
|
+
// Fail closed on every path -- if downscope is declared, we must
|
|
552
|
+
// either produce a downscoped session or abort dispatch.
|
|
553
|
+
const providerName = parentIdentityBlob?.provider || parentIdentityBlob?.auth?.provider;
|
|
554
|
+
const provider = deps.getIdentityProvider?.(providerName);
|
|
555
|
+
let downscopeApplied = false;
|
|
556
|
+
|
|
557
|
+
if (provider && typeof provider.prepareHandoff === 'function') {
|
|
558
|
+
// Get parent session from last run or re-resolve
|
|
559
|
+
let parentSession = parentResolvedIdentity?.session || null;
|
|
560
|
+
|
|
561
|
+
if (!parentSession && provider.resolveSession) {
|
|
562
|
+
// Fallback: re-resolve parent identity
|
|
563
|
+
try {
|
|
564
|
+
const parentScope = parentIdentityBlob?.scope || parentIdentityBlob?.auth?.scopes?.[0] || null;
|
|
565
|
+
const reResolved = await provider.resolveSession(
|
|
566
|
+
{ profile: parentIdentityBlob, instanceId: parentJob.id, scope: parentScope },
|
|
567
|
+
{ env: process.env, cwd: process.cwd() }
|
|
568
|
+
);
|
|
569
|
+
if (reResolved.ok) parentSession = reResolved.session;
|
|
570
|
+
} catch (resolveErr) {
|
|
571
|
+
log('warn', `Downscope parent re-resolve failed for ${job.name}: ${resolveErr.message}`, { jobId: job.id });
|
|
572
|
+
}
|
|
573
|
+
}
|
|
574
|
+
|
|
575
|
+
if (parentSession) {
|
|
576
|
+
const childIdentityBlob = safeParse(job.identity) || {};
|
|
577
|
+
const childScope = childIdentityBlob?.scope || childIdentityBlob?.auth?.scopes?.[0] || null;
|
|
578
|
+
|
|
579
|
+
try {
|
|
580
|
+
const handoffResult = await provider.prepareHandoff(
|
|
581
|
+
parentSession,
|
|
582
|
+
{ target_scope: childScope, parent_profile: parentIdentityBlob },
|
|
583
|
+
{ env: process.env, cwd: process.cwd() }
|
|
584
|
+
);
|
|
585
|
+
|
|
586
|
+
if (handoffResult.prepared) {
|
|
587
|
+
// Verify handoff actually downscoped: child trust must not
|
|
588
|
+
// exceed parent. A provider that returns an elevated session
|
|
589
|
+
// violates the downscope contract.
|
|
590
|
+
const parentTrustLevel = getIdentityTrustLevel(parentResolvedIdentity)
|
|
591
|
+
|| getIdentityTrustLevel({ session: parentSession })
|
|
592
|
+
|| getJobTrustLevel(parentJob, parentIdentityBlob);
|
|
593
|
+
const childTrustLevel = getIdentityTrustLevel({ session: handoffResult.session });
|
|
594
|
+
const { compareTrustLevels } = deps;
|
|
595
|
+
if (parentTrustLevel && childTrustLevel && compareTrustLevels(childTrustLevel, parentTrustLevel) > 0) {
|
|
596
|
+
log('warn', `Downscope handoff elevated trust from "${parentTrustLevel}" to "${childTrustLevel}" for ${job.name}`, { jobId: job.id });
|
|
597
|
+
// Do not set downscopeApplied -- will abort below
|
|
598
|
+
} else {
|
|
599
|
+
// Override the identity resolution with the handoff session
|
|
600
|
+
v02Outcomes.identity_resolved = {
|
|
601
|
+
provider: providerName,
|
|
602
|
+
session: handoffResult.session,
|
|
603
|
+
source: 'provider',
|
|
604
|
+
subject_kind: handoffResult.session?.subject?.kind || 'unknown',
|
|
605
|
+
principal: handoffResult.session?.subject?.principal || null,
|
|
606
|
+
trust_level: childTrustLevel,
|
|
607
|
+
delegation_mode: null,
|
|
608
|
+
raw: childIdentityBlob,
|
|
609
|
+
};
|
|
610
|
+
downscopeApplied = true;
|
|
611
|
+
}
|
|
612
|
+
}
|
|
613
|
+
} catch (err) {
|
|
614
|
+
log('warn', `Downscope handoff error for ${job.name}: ${err.message}`, { jobId: job.id });
|
|
615
|
+
}
|
|
616
|
+
}
|
|
617
|
+
}
|
|
618
|
+
|
|
619
|
+
if (!downscopeApplied) {
|
|
620
|
+
const reason = !provider
|
|
621
|
+
? `identity provider ${providerName || '(none)'} not loaded`
|
|
622
|
+
: typeof provider.prepareHandoff !== 'function'
|
|
623
|
+
? `provider ${providerName} does not support prepareHandoff`
|
|
624
|
+
: 'parent session unavailable or handoff did not produce a downscoped session';
|
|
625
|
+
return abortPreparedRun(
|
|
626
|
+
job,
|
|
627
|
+
run,
|
|
628
|
+
`Downscope credential policy failed: ${reason}`,
|
|
629
|
+
v02Outcomes,
|
|
630
|
+
{ dispatchRecord, idemKey },
|
|
631
|
+
deps,
|
|
632
|
+
{ skipChildren: true },
|
|
633
|
+
);
|
|
634
|
+
}
|
|
635
|
+
} else if (effectivePolicy === 'independent') {
|
|
636
|
+
// Child uses its own resolved identity, but cannot exceed the parent's
|
|
637
|
+
// trust level. Without this cap, a child could declare a higher trust
|
|
638
|
+
// level than the parent and bypass the parent's authorization scope.
|
|
639
|
+
const parentTrustLevel = getIdentityTrustLevel(parentResolvedIdentity)
|
|
640
|
+
|| getJobTrustLevel(parentJob, parentIdentityBlob);
|
|
641
|
+
const childTrustLevel = v02Outcomes.identity_resolved?.trust_level || null;
|
|
642
|
+
if (parentTrustLevel && childTrustLevel) {
|
|
643
|
+
const { compareTrustLevels } = deps;
|
|
644
|
+
if (compareTrustLevels(childTrustLevel, parentTrustLevel) > 0) {
|
|
645
|
+
return abortPreparedRun(
|
|
646
|
+
job,
|
|
647
|
+
run,
|
|
648
|
+
`Independent child trust level "${childTrustLevel}" exceeds parent trust level "${parentTrustLevel}"`,
|
|
649
|
+
v02Outcomes,
|
|
650
|
+
{ dispatchRecord, idemKey },
|
|
651
|
+
deps,
|
|
652
|
+
{ skipChildren: true },
|
|
653
|
+
);
|
|
654
|
+
}
|
|
655
|
+
}
|
|
656
|
+
}
|
|
657
|
+
}
|
|
658
|
+
}
|
|
659
|
+
|
|
660
|
+
if (v02Outcomes.identity_resolved?.source === 'provider-error') {
|
|
661
|
+
return abortPreparedRun(
|
|
662
|
+
job,
|
|
663
|
+
run,
|
|
664
|
+
'Identity resolution failed: ' + (v02Outcomes.identity_resolved.error || 'provider error'),
|
|
665
|
+
v02Outcomes,
|
|
666
|
+
{ dispatchRecord, idemKey },
|
|
667
|
+
deps,
|
|
668
|
+
{ skipChildren: true },
|
|
669
|
+
);
|
|
670
|
+
}
|
|
671
|
+
|
|
672
|
+
if (hasV02Identity || hasV02Contract || v02Outcomes.identity_resolved != null) {
|
|
673
|
+
v02Outcomes.trust_evaluation = evaluateTrust(job, v02Outcomes.identity_resolved);
|
|
674
|
+
if (v02Outcomes.trust_evaluation?.decision === 'warn') {
|
|
675
|
+
log('warn', `Trust evaluation warning for ${job.name}: ${v02Outcomes.trust_evaluation.reason}`, {
|
|
676
|
+
jobId: job.id,
|
|
677
|
+
runId: run.id,
|
|
678
|
+
});
|
|
679
|
+
}
|
|
680
|
+
if (v02Outcomes.trust_evaluation?.decision === 'deny') {
|
|
681
|
+
return abortPreparedRun(
|
|
682
|
+
job,
|
|
683
|
+
run,
|
|
684
|
+
'Trust enforcement blocked dispatch: ' + v02Outcomes.trust_evaluation.reason,
|
|
685
|
+
v02Outcomes,
|
|
686
|
+
{ dispatchRecord, idemKey },
|
|
687
|
+
deps,
|
|
688
|
+
{ skipChildren: true },
|
|
689
|
+
);
|
|
690
|
+
}
|
|
691
|
+
}
|
|
692
|
+
|
|
693
|
+
if (job.authorization_proof || job.authorization_proof_ref) {
|
|
694
|
+
v02Outcomes.authorization_proof_verification = await verifyAuthorizationProof(job, providerCtx);
|
|
695
|
+
if (v02Outcomes.authorization_proof_verification?.verified === false) {
|
|
696
|
+
const proofError = v02Outcomes.authorization_proof_verification.error || 'verification returned false';
|
|
697
|
+
// Proof verification failure is blocking: the job declared a proof
|
|
698
|
+
// requirement, so proceeding without a valid proof violates policy.
|
|
699
|
+
return abortPreparedRun(
|
|
700
|
+
job,
|
|
701
|
+
run,
|
|
702
|
+
'Authorization proof verification failed: ' + proofError,
|
|
703
|
+
v02Outcomes,
|
|
704
|
+
{ dispatchRecord, idemKey },
|
|
705
|
+
deps,
|
|
706
|
+
{ skipChildren: true },
|
|
707
|
+
);
|
|
708
|
+
}
|
|
709
|
+
}
|
|
710
|
+
|
|
711
|
+
if (needsAuthorization) {
|
|
712
|
+
v02Outcomes.authorization_decision = await evaluateAuthorization(
|
|
713
|
+
job, v02Outcomes.identity_resolved, v02Outcomes.trust_evaluation, providerCtx
|
|
714
|
+
);
|
|
715
|
+
|
|
716
|
+
if (v02Outcomes.authorization_decision?.decision === 'deny') {
|
|
717
|
+
return abortPreparedRun(
|
|
718
|
+
job,
|
|
719
|
+
run,
|
|
720
|
+
'Authorization denied: ' + v02Outcomes.authorization_decision.reason,
|
|
721
|
+
v02Outcomes,
|
|
722
|
+
{ dispatchRecord, idemKey },
|
|
723
|
+
deps,
|
|
724
|
+
{ skipChildren: true },
|
|
725
|
+
);
|
|
726
|
+
}
|
|
727
|
+
if (v02Outcomes.authorization_decision?.decision === 'escalate') {
|
|
728
|
+
// Escalation means the authorization provider wants a human decision.
|
|
729
|
+
// Abort the dispatch so the approval system (or operator) can intervene.
|
|
730
|
+
return abortPreparedRun(
|
|
731
|
+
job,
|
|
732
|
+
run,
|
|
733
|
+
'Authorization requires escalation: ' + (v02Outcomes.authorization_decision.reason || 'provider requested escalation'),
|
|
734
|
+
v02Outcomes,
|
|
735
|
+
{ dispatchRecord, idemKey },
|
|
736
|
+
deps,
|
|
737
|
+
{ skipChildren: true },
|
|
738
|
+
);
|
|
739
|
+
}
|
|
740
|
+
if (v02Outcomes.authorization_decision?.advisory) {
|
|
741
|
+
log('warn', `Authorization advisory for ${job.name}: ${v02Outcomes.authorization_decision.reason}`, { jobId: job.id });
|
|
742
|
+
}
|
|
743
|
+
}
|
|
744
|
+
|
|
745
|
+
// Materialization phase
|
|
746
|
+
let materializedEnv = null;
|
|
747
|
+
let materializationCleanup = null;
|
|
748
|
+
|
|
749
|
+
if (v02Outcomes.identity_resolved?.source === 'provider' && v02Outcomes.identity_resolved.session) {
|
|
750
|
+
const providerName = v02Outcomes.identity_resolved.provider;
|
|
751
|
+
const provider = deps.getIdentityProvider?.(providerName);
|
|
752
|
+
const identityBlob = safeParse(job.identity) || {};
|
|
753
|
+
const presentation = identityBlob.presentation || {};
|
|
754
|
+
const hasPresentation = presentation && Object.keys(presentation).length > 0;
|
|
755
|
+
|
|
756
|
+
if (provider && typeof provider.materialize === 'function') {
|
|
757
|
+
try {
|
|
758
|
+
const matResult = await provider.materialize(
|
|
759
|
+
v02Outcomes.identity_resolved.session,
|
|
760
|
+
presentation,
|
|
761
|
+
{ env: process.env, cwd: process.cwd() }
|
|
762
|
+
);
|
|
763
|
+
if (matResult?.materialized) {
|
|
764
|
+
materializedEnv = matResult.env_vars || null;
|
|
765
|
+
if (matResult.cleanup_required) {
|
|
766
|
+
materializationCleanup = {
|
|
767
|
+
provider,
|
|
768
|
+
cleanupState: {
|
|
769
|
+
session: v02Outcomes.identity_resolved.session,
|
|
770
|
+
...matResult,
|
|
771
|
+
},
|
|
772
|
+
};
|
|
773
|
+
}
|
|
774
|
+
} else if (hasPresentation) {
|
|
775
|
+
// Materialization returned false but credentials were declared required
|
|
776
|
+
return abortPreparedRun(
|
|
777
|
+
job,
|
|
778
|
+
run,
|
|
779
|
+
`Credential materialization failed for provider ${providerName}: provider returned materialized=false`,
|
|
780
|
+
v02Outcomes,
|
|
781
|
+
{ dispatchRecord, idemKey },
|
|
782
|
+
deps,
|
|
783
|
+
{ skipChildren: true },
|
|
784
|
+
);
|
|
785
|
+
}
|
|
786
|
+
} catch (err) {
|
|
787
|
+
if (hasPresentation) {
|
|
788
|
+
return abortPreparedRun(
|
|
789
|
+
job,
|
|
790
|
+
run,
|
|
791
|
+
`Credential materialization error for provider ${providerName}: ${err.message}`,
|
|
792
|
+
v02Outcomes,
|
|
793
|
+
{ dispatchRecord, idemKey },
|
|
794
|
+
deps,
|
|
795
|
+
{ skipChildren: true },
|
|
796
|
+
);
|
|
797
|
+
}
|
|
798
|
+
// No presentation declared: provider materializes opportunistically.
|
|
799
|
+
// Warn and continue -- the shell job can still run without injected
|
|
800
|
+
// credentials when the identity blob has no presentation block.
|
|
801
|
+
log('warn', `Materialization failed for ${job.name}: ${err.message}`, { jobId: job.id });
|
|
802
|
+
}
|
|
803
|
+
} else if (hasPresentation) {
|
|
804
|
+
// Job declared credential presentation but provider has no materialize method
|
|
805
|
+
return abortPreparedRun(
|
|
806
|
+
job,
|
|
807
|
+
run,
|
|
808
|
+
`Job declares credential presentation but provider ${providerName || '(none)'} does not support materialization`,
|
|
809
|
+
v02Outcomes,
|
|
810
|
+
{ dispatchRecord, idemKey },
|
|
811
|
+
deps,
|
|
812
|
+
{ skipChildren: true },
|
|
813
|
+
);
|
|
814
|
+
}
|
|
815
|
+
}
|
|
816
|
+
|
|
817
|
+
return { dispatchRecord, idemKey, run, retryCount, dispatchKind, isChainDispatch, v02Outcomes, materializedEnv, materializationCleanup };
|
|
818
|
+
}
|
|
819
|
+
|
|
820
|
+
// -- Strategy: Watchdog --------------------------------------
|
|
821
|
+
|
|
822
|
+
/**
 * Strategy: Watchdog.
 *
 * Runs the job's `watchdog_check_cmd` and interprets its exit code:
 *   - 2                      -> transient check failure (logged at debug, nothing delivered)
 *   - 0 with stdout output   -> watched target completed; watchdog is disarmed
 *   - 1, or timeout exceeded -> target appears stuck; an alert is sent
 *   - 0 without output       -> target still running
 *   - anything else          -> unexpected exit code, reported as an error
 *
 * The branches are evaluated in that order, so a transient failure or a
 * completion report takes precedence over the elapsed-time threshold.
 *
 * @param {object} job  Job row; reads the `watchdog_*` fields, `run_timeout_ms`, `id` and `name`.
 * @param {object} ctx  Dispatch context (not read by this strategy).
 * @param {object} deps Injected helpers: runShellCommand, handleDelivery, updateJob, deleteJob, log.
 * @returns {Promise<object>} Result object built from makeDefaultResult().
 */
export async function executeWatchdog(job, ctx, deps) {
  const { runShellCommand, handleDelivery, updateJob, deleteJob, log } = deps;
  const result = makeDefaultResult();
  result.skipChildren = true;
  result.skipDequeue = true;

  const checkCmd = job.watchdog_check_cmd;
  if (!checkCmd) {
    result.status = 'error';
    result.errorMessage = 'Watchdog job missing watchdog_check_cmd';
    result.skipJobUpdate = false;
    return result;
  }

  // Sends a message to the watchdog's own alert channel, when one is configured.
  const sendWatchdogMessage = async (message) => {
    if (!job.watchdog_alert_channel || !job.watchdog_alert_target) return;
    await handleDelivery({
      ...job,
      delivery_mode: 'announce-always',
      delivery_channel: job.watchdog_alert_channel,
      delivery_to: job.watchdog_alert_target,
    }, message);
  };

  // The check command is capped at 60s regardless of the job's run timeout.
  const shellExec = await runShellCommand(checkCmd, Math.min(job.run_timeout_ms || 300000, 60000));
  const exitCode = shellExec.exitCode;
  const stdout = (shellExec.stdout || '').trim();
  const stderr = (shellExec.stderr || '').trim();

  let timedOut = false;
  let elapsedMin = 0;
  if (job.watchdog_started_at && job.watchdog_timeout_min) {
    const startedAt = new Date(job.watchdog_started_at).getTime();
    if (Number.isFinite(startedAt)) {
      elapsedMin = Math.round((Date.now() - startedAt) / 60000);
      if (elapsedMin >= job.watchdog_timeout_min) timedOut = true;
    } else {
      // An unparseable timestamp would otherwise surface as "NaNmin" in
      // summaries and alerts; keep elapsedMin at 0 and make the cause visible.
      log('warn', `Watchdog: unparseable watchdog_started_at for ${job.name}`, {
        jobId: job.id, watchdogStartedAt: job.watchdog_started_at,
      });
    }
  }

  if (exitCode === 2) {
    result.summary = `Watchdog check failed (transient): ${stderr || stdout}`;
    result.skipDelivery = true;
    log('debug', `Watchdog check transient failure: ${job.name}`, { exitCode, stderr: stderr.slice(0, 200) });

  } else if (exitCode === 0 && stdout) {
    const completionMsg = `\u2705 [watchdog] Task "${job.watchdog_target_label}" completed -- watchdog disarmed`;
    result.summary = completionMsg;
    result.content = completionMsg;
    log('info', `Watchdog: target completed: ${job.watchdog_target_label}`, { jobId: job.id });

    await sendWatchdogMessage(completionMsg);
    // Delivery (if any) was handled above; suppress the generic delivery step.
    result.skipDelivery = true;

    if (job.watchdog_self_destruct) {
      result.skipJobUpdate = true;
      updateJob(job.id, { enabled: 0 });
      deleteJob(job.id);
      log('info', `Watchdog self-destructed: ${job.name}`, { jobId: job.id });
    }

  } else if (exitCode === 1 || timedOut) {
    const reason = timedOut
      ? `running for ${elapsedMin}min (threshold: ${job.watchdog_timeout_min}min)`
      : `check command reported stuck`;
    const alertMsg = [
      `\ud83d\udea8 [watchdog] Task "${job.watchdog_target_label}" appears stuck`,
      `- Dispatched: ${job.watchdog_started_at || 'unknown'}`,
      `- Running for: ${elapsedMin} minutes (threshold: ${job.watchdog_timeout_min || '?'} min)`,
      `- Reason: ${reason}`,
      // Only the first token of the check command is included in the alert.
      `- Check: ${checkCmd.split(/\s/)[0]}${checkCmd.length > 80 ? ' [...]' : ''}`,
      stderr ? `- Error: ${stderr.slice(0, 500)}` : null,
      stdout ? `- Output: ${stdout.slice(0, 500)}` : null,
    ].filter(Boolean).join('\n');
    result.summary = `Watchdog alert fired: ${reason}`;
    result.content = alertMsg;

    log('warn', `Watchdog alert: ${job.watchdog_target_label} stuck`, {
      jobId: job.id, elapsedMin, timedOut, exitCode,
    });

    await sendWatchdogMessage(alertMsg);
    result.skipDelivery = true;

  } else if (exitCode === 0) {
    result.summary = `Watchdog check: target still running (${elapsedMin}min elapsed)`;
    result.skipDelivery = true;
    log('debug', `Watchdog: target still running: ${job.watchdog_target_label}`, {
      jobId: job.id, elapsedMin,
    });
  } else {
    result.summary = `Watchdog check command returned unexpected exit code ${exitCode}`;
    result.status = 'error';
    // Mirror the missing-command branch: an error status carries an error message.
    result.errorMessage = result.summary;
    log('warn', `Watchdog: unexpected exit code for ${job.watchdog_target_label}`, {
      jobId: job.id, exitCode, stderr: stderr.slice(0, 200),
    });
  }

  return result;
}
|
|
923
|
+
|
|
924
|
+
// -- Strategy: Main session ----------------------------------
|
|
925
|
+
|
|
926
|
+
/**
 * Strategy: Main session.
 *
 * Dispatch mode depends on `job.execution_intent`:
 *   - anything other than 'fire-and-forget' (including missing): delegate to
 *     executeAgent against the main session and wait for the response, so the
 *     content can be captured for delivery. Best for quick tasks where a few
 *     seconds of session latency is acceptable.
 *   - 'fire-and-forget': inject a system event and return immediately. No
 *     response is captured; when delivery is configured, the prompt carries a
 *     reply-to instruction so the agent can send results via the message tool.
 *
 * Choose based on expected duration:
 *   Quick tasks (< 10s): sync is simpler and captures output
 *   Long tasks (> 30s): fire-and-forget avoids blocking interactive chat
 *
 * @param {object} job  Job row; reads execution_intent, preferred_session_key,
 *                      payload_message, payload_thinking and the delivery_* fields.
 * @param {object} ctx  Dispatch context; `ctx.run.id` is logged on the fire-and-forget path.
 * @param {object} deps Injected helpers: sendSystemEvent, buildExecutionIntentNote, log
 *                      (plus whatever executeAgent needs on the sync path).
 * @returns {Promise<object>} executeAgent's result (sync) or a default result (fire-and-forget).
 */
export async function executeMain(job, ctx, deps) {
  const isFireAndForget = job.execution_intent === 'fire-and-forget';

  if (!isFireAndForget) {
    // Sync path: reuse executeAgent with the main session key.
    // The job's preferred_session_key defaults to 'main' for main-session jobs.
    const originalSessionKey = job.preferred_session_key;
    job.preferred_session_key = originalSessionKey || 'main';
    try {
      return await executeAgent(job, ctx, deps);
    } finally {
      // Restore the caller's job object even when executeAgent throws, so the
      // temporary 'main' override never leaks into error handling or retries.
      job.preferred_session_key = originalSessionKey;
    }
  }

  // Fire-and-forget path: inject system event, return immediately.
  const { sendSystemEvent, buildExecutionIntentNote, log } = deps;
  const result = makeDefaultResult();

  const executionNote = buildExecutionIntentNote(job);
  const modelNote = job.payload_thinking
    ? `[SYSTEM NOTE -- model policy]\nPrefer reasoning depth: ${job.payload_thinking}.\n[END SYSTEM NOTE]\n\n`
    : '';

  // Build the delivery reply-to instruction so the agent can send results
  // back through the scheduler post office when it finishes processing.
  let deliveryInstruction = '';
  if (job.delivery_mode && job.delivery_mode !== 'none' && job.delivery_channel && job.delivery_to) {
    deliveryInstruction = [
      '\n[SYSTEM NOTE -- delivery]',
      `When you have completed this task, send your results using the message tool.`,
      `Channel: ${job.delivery_channel}`,
      `Target: ${job.delivery_to}`,
      `Keep the message concise and actionable.`,
      `If there is nothing noteworthy to report, do not send a message.`,
      '[END SYSTEM NOTE]\n',
    ].join('\n');
  }

  const prompt = `${executionNote ? `${executionNote}\n\n` : ''}${modelNote}${deliveryInstruction}${job.payload_message}`;
  await sendSystemEvent(prompt, 'now');

  result.summary = 'System event dispatched (fire-and-forget)';
  result.content = job.payload_message;
  result.skipDelivery = true; // Agent handles delivery via message tool
  result.skipChildren = true;
  result.skipDequeue = true;

  log('info', `Dispatched (main/fire-and-forget): ${job.name}`, { runId: ctx.run.id });

  return result;
}
|
|
990
|
+
|
|
991
|
+
// -- Strategy: Shell -----------------------------------------
|
|
992
|
+
|
|
993
|
+
/**
 * Strategy: Shell.
 *
 * Runs `job.payload_message` through the injected shell runner (passing any
 * materialized environment from the dispatch context), normalizes the raw
 * execution via `normalizeShellResult`, and maps the normalized fields onto
 * the strategy result and the run-finish columns.
 *
 * Delivery rules:
 *   - 'announce-always': deliver on every result (failures get a warning prefix)
 *   - 'announce':        deliver only when the status is not 'ok'
 *   - otherwise, or when there is nothing to announce: skip delivery
 *
 * @param {object} job  Job row; reads payload_message, run_timeout_ms, output_*_bytes, delivery_mode, name.
 * @param {object} ctx  Dispatch context; reads `ctx.run.id` and `ctx.materializedEnv`.
 * @param {object} deps Injected helpers: runShellCommand, normalizeShellResult, log.
 * @returns {Promise<object>} Result object built from makeDefaultResult().
 */
export async function executeShell(job, ctx, deps) {
  const { runShellCommand, normalizeShellResult, log } = deps;
  const result = makeDefaultResult();

  const shellExec = await runShellCommand(job.payload_message, job.run_timeout_ms, ctx.materializedEnv || null);
  const shellResult = normalizeShellResult(shellExec, {
    runId: ctx.run.id,
    timeoutMs: job.run_timeout_ms,
    storeLimit: job.output_store_limit_bytes || undefined,
    excerptLimit: job.output_excerpt_limit_bytes || undefined,
    summaryLimit: job.output_summary_limit_bytes || undefined,
    offloadThreshold: job.output_offload_threshold_bytes || undefined,
  });

  result.status = shellResult.status;
  result.summary = shellResult.summary;
  result.errorMessage = shellResult.errorMessage;
  result.content = shellResult.deliveryText;
  result.runFinishFields = {
    context_summary: shellResult.contextSummary,
    shell_exit_code: shellResult.exitCode,
    shell_signal: shellResult.signal,
    shell_timed_out: shellResult.timedOut,
    shell_stdout: shellResult.stdout,
    shell_stderr: shellResult.stderr,
    shell_stdout_path: shellResult.stdoutPath,
    shell_stderr_path: shellResult.stderrPath,
    shell_stdout_bytes: shellResult.stdoutBytes,
    shell_stderr_bytes: shellResult.stderrBytes,
  };

  // Shell delivery logic: announce-always sends on all results, announce sends on error only.
  // A missing (non-string) deliveryText is treated as empty so the error message
  // is announced instead of a TypeError masking the real shell failure.
  const deliveryText = typeof shellResult.deliveryText === 'string' ? shellResult.deliveryText : '';
  const announcePayload = deliveryText.trim() ? deliveryText : shellResult.errorMessage;
  if (job.delivery_mode === 'announce-always' && announcePayload) {
    const prefix = shellResult.status === 'ok' ? '' : `\u26a0\ufe0f Shell job failed: ${job.name}\n\n`;
    result.deliveryOverride = `${prefix}${announcePayload}`;
  } else if (job.delivery_mode === 'announce' && shellResult.status !== 'ok' && announcePayload) {
    result.deliveryOverride = announcePayload;
  } else {
    result.skipDelivery = true;
  }

  log('info', `Shell ${shellResult.status}: ${job.name}`, {
    runId: ctx.run.id,
    exitCode: shellResult.exitCode,
    signal: shellResult.signal,
    timedOut: shellResult.timedOut,
  });

  return result;
}
|
|
1044
|
+
|
|
1045
|
+
// -- Strategy: Agent (isolated session) ----------------------
|
|
1046
|
+
|
|
1047
|
+
/**
 * Strategy: Agent (isolated session).
 *
 * Waits for the gateway, then runs one agent turn for the job and translates
 * the reply into a strategy result. When the gateway does not come up in time
 * the run is finished as an error, the idempotency key is released, and the
 * dispatch (or the job's next_run_at) is deferred by 60 seconds.
 *
 * Sentinel replies (HEARTBEAT_OK, NO_FLUSH, IDEMPOTENT_SKIP) suppress delivery;
 * TASK_FAILED or a detected transient error turns the run into an error.
 *
 * @param {object} job  Job row; reads id, name, agent_id, preferred_session_key, auth_profile,
 *                      payload_model, payload_timeout_seconds, run_timeout_ms, delivery_mode.
 * @param {object} ctx  Dispatch context; reads run, idemKey, dispatchRecord, v02Outcomes.
 * @param {object} deps Injected helpers (gateway, session, prompt, queue and logging functions).
 * @returns {Promise<object>} Result object built from makeDefaultResult().
 */
export async function executeAgent(job, ctx, deps) {
  const {
    waitForGateway, updateRunSession, setAgentStatus,
    buildJobPrompt, runAgentTurnWithActivityTimeout,
    updateContextSummary, releaseDispatch, releaseIdempotencyKey,
    updateJob, matchesSentinel, detectTransientError,
    listSessions,
    sqliteNow, log,
  } = deps;
  const result = makeDefaultResult();

  // Gateway health check
  const gatewayUp = await waitForGateway(30000, 2000);
  if (!gatewayUp) {
    log('warn', `Gateway unavailable after 30s -- deferring: ${job.name}`, { jobId: job.id });
    // The strategy itself finishes the run and reschedules in the gateway-down case.
    deps.finishRun(ctx.run.id, 'error', { error_message: 'Gateway unavailable -- deferred' });
    if (ctx.idemKey) releaseIdempotencyKey(ctx.idemKey);
    const retryAt = sqliteNow(60000);
    if (ctx.dispatchRecord) {
      releaseDispatch(ctx.dispatchRecord.id, retryAt);
    } else {
      updateJob(job.id, { next_run_at: retryAt });
    }
    result.earlyReturn = true;
    return result;
  }

  // The session key is stable per job (not per run), so later runs of the same
  // job address the same session instead of bootstrapping a fresh one.
  const sessionKey = job.preferred_session_key || `scheduler:${job.id}`;
  updateRunSession(ctx.run.id, sessionKey, null);

  // Mark agent as busy
  if (job.agent_id) setAgentStatus(job.agent_id, 'busy', sessionKey);

  // Build prompt and record context metadata (best effort).
  const { prompt, contextMeta } = buildJobPrompt(job, ctx.run);
  try {
    updateContextSummary(ctx.run.id, contextMeta);
  } catch (_e) {
    /* column may not exist yet */
  }

  // Looks up the auth profile of an active main session, if there is one.
  const findMainSessionProfile = async () => {
    const listing = await listSessions({ kinds: ['main'], activeMinutes: 120, limit: 10 });
    const candidates = listing?.result?.details?.sessions || listing?.result?.sessions || listing?.sessions || listing || [];
    if (!Array.isArray(candidates)) return undefined;
    const mainEntry = candidates.find((entry) => {
      const key = entry.key || entry.sessionKey || '';
      return key.includes(':main:') || key.endsWith(':main') || key === 'main';
    });
    return mainEntry?.authProfileOverride || mainEntry?.authProfile || mainEntry?.profile;
  };

  // Resolve auth_profile: the effective profile from the child credential
  // policy wins when present, otherwise the job's own setting is used.
  let authProfile = ctx.v02Outcomes?.effective_auth_profile || job.auth_profile || undefined;
  if (authProfile === 'inherit') {
    try {
      const inheritedProfile = await findMainSessionProfile();
      if (inheritedProfile) {
        authProfile = inheritedProfile;
        log('debug', `Resolved auth_profile 'inherit' -> '${inheritedProfile}'`, { jobId: job.id });
      } else {
        log('debug', `auth_profile 'inherit' -- no main session profile found, passing 'inherit' as-is`, { jobId: job.id });
      }
    } catch (err) {
      log('warn', `Failed to resolve 'inherit' auth_profile: ${err.message}`, { jobId: job.id });
      // Fall through with 'inherit' -- gateway may handle it
    }
  }

  const turnResult = await runAgentTurnWithActivityTimeout({
    message: prompt,
    agentId: job.agent_id || 'main',
    sessionKey,
    model: job.payload_model || undefined,
    authProfile,
    // materializedEnv deferred: the x-openclaw-env-inject header is not sent
    // until the OpenClaw gateway implements the receiver side. See
    // openclaw/docs/env-inject-proposal.md for the gateway spec.
    idleTimeoutMs: (job.payload_timeout_seconds || 120) * 1000,
    pollIntervalMs: 60000,
    absoluteTimeoutMs: job.run_timeout_ms || 300000,
  });

  const content = turnResult.content || '';
  const trimmed = content.trim();

  const isHeartbeatOk = matchesSentinel(trimmed, 'HEARTBEAT_OK');
  const isNoFlush = matchesSentinel(trimmed, 'NO_FLUSH');
  const isIdempotentSkip = matchesSentinel(trimmed, 'IDEMPOTENT_SKIP');
  const isTaskFailed = matchesSentinel(trimmed, 'TASK_FAILED');
  const isTransientError = detectTransientError(content);

  if (isNoFlush) log('info', `Flush: nothing to flush for ${job.name}`);
  if (isIdempotentSkip) log('info', `Idempotent skip (agent): ${job.name}`);
  if (isTaskFailed) log('warn', `Agent signalled TASK_FAILED: ${job.name}`, { runId: ctx.run.id });
  if (isTransientError) log('warn', `Transient error detected in agent reply: ${job.name}`, { runId: ctx.run.id, snippet: content.slice(0, 200) });

  const failed = isTaskFailed || isTransientError;
  const effectiveStatus = failed ? 'error' : 'ok';

  let errorMessage = null;
  if (failed) {
    errorMessage = isTaskFailed ? 'Agent signalled TASK_FAILED' : 'Transient error in agent reply';
  }

  result.status = effectiveStatus;
  result.summary = content.slice(0, 5000);
  result.content = content;
  result.errorMessage = errorMessage;
  result.idemAction = failed ? 'release' : 'keep';
  result.skipAgentCleanup = false;
  result.retryFiresChildren = true;

  // Delivery is suppressed for sentinel responses, and in announce mode for
  // successful runs (announce only delivers on error, like shell jobs).
  const isSentinelReply = isHeartbeatOk || isNoFlush || isIdempotentSkip;
  const announceWithoutError = job.delivery_mode === 'announce' && effectiveStatus === 'ok';
  if (isSentinelReply || announceWithoutError) {
    result.skipDelivery = true;
  }

  // started_at is normalized to an ISO-style UTC string before parsing:
  // the space separator becomes 'T' and a trailing 'Z' is appended if absent.
  let durationMs = null;
  if (ctx.run.started_at) {
    const startedIso = ctx.run.started_at.replace(' ', 'T') + (ctx.run.started_at.endsWith('Z') ? '' : 'Z');
    durationMs = Date.now() - new Date(startedIso).getTime();
  }

  log('info', `Completed: ${job.name} (${turnResult.usage?.total_tokens || '?'} tokens)`, {
    runId: ctx.run.id,
    durationMs,
  });

  return result;
}
|
|
1174
|
+
|
|
1175
|
+
// -- Strategy dispatcher with error-catch wrapper ------------
|
|
1176
|
+
|
|
1177
|
+
export async function executeStrategy(job, ctx, deps) {
|
|
1178
|
+
const { handleDelivery, log } = deps;
|
|
1179
|
+
try {
|
|
1180
|
+
if (job.job_type === 'watchdog') return await executeWatchdog(job, ctx, deps);
|
|
1181
|
+
if (job.session_target === 'main') return await executeMain(job, ctx, deps);
|
|
1182
|
+
if (job.session_target === 'shell') return await executeShell(job, ctx, deps);
|
|
1183
|
+
return await executeAgent(job, ctx, deps);
|
|
1184
|
+
} catch (err) {
|
|
1185
|
+
const {
|
|
1186
|
+
finishRun, releaseIdempotencyKey, setAgentStatus,
|
|
1187
|
+
isDrainError, enqueueDispatch, getJob, getDispatchBacklogCount,
|
|
1188
|
+
shouldRetry, scheduleRetry, getDb, updateJobAfterRun,
|
|
1189
|
+
setDispatchStatus, handleTriggeredChildren, dequeueJob,
|
|
1190
|
+
sqliteNow,
|
|
1191
|
+
} = deps;
|
|
1192
|
+
|
|
1193
|
+
log('error', `Failed: ${job.name}: ${err.message}`, { jobId: job.id });
|
|
1194
|
+
|
|
1195
|
+
// -- Drain-error retry for isolated agentTurn jobs ----------
|
|
1196
|
+
// Gateway drain errors are transient infra noise -- the job never ran.
|
|
1197
|
+
// Don't increment consecutive_errors, and schedule a single retry after 90s.
|
|
1198
|
+
const isIsolatedAgent = job.session_target !== 'main' && job.session_target !== 'shell' && job.job_type !== 'watchdog';
|
|
1199
|
+
if (isIsolatedAgent && isDrainError(err.message)) {
|
|
1200
|
+
finishRun(ctx.run.id, 'error', { error_message: err.message });
|
|
1201
|
+
if (ctx.idemKey) releaseIdempotencyKey(ctx.idemKey);
|
|
1202
|
+
if (job.agent_id) setAgentStatus(job.agent_id, 'idle', null);
|
|
1203
|
+
|
|
1204
|
+
// Check: max 1 drain retry per run, job must still be enabled, and respect overlap_policy:skip
|
|
1205
|
+
const freshJob = getJob(job.id);
|
|
1206
|
+
const canDrainRetry = freshJob && freshJob.enabled
|
|
1207
|
+
&& (ctx.run.retry_count || 0) < 1
|
|
1208
|
+
&& !(freshJob.overlap_policy === 'skip' && getDispatchBacklogCount(job.id) > 0);
|
|
1209
|
+
|
|
1210
|
+
if (canDrainRetry) {
|
|
1211
|
+
const drainDispatch = enqueueDispatch(job.id, {
|
|
1212
|
+
kind: 'retry',
|
|
1213
|
+
scheduled_for: sqliteNow(90000),
|
|
1214
|
+
source_run_id: ctx.run.id,
|
|
1215
|
+
retry_of_run_id: ctx.run.id,
|
|
1216
|
+
});
|
|
1217
|
+
getDb().prepare('UPDATE runs SET retry_count = 1 WHERE id = ?').run(ctx.run.id);
|
|
1218
|
+
log('info', `[drain-retry] scheduling retry for ${job.name} in 90s (run ${ctx.run.id})`, {
|
|
1219
|
+
jobId: job.id, dispatchId: drainDispatch.id,
|
|
1220
|
+
});
|
|
1221
|
+
} else {
|
|
1222
|
+
log('info', `[drain-retry] skipping retry for ${job.name} (enabled=${freshJob?.enabled}, retry_count=${ctx.run.retry_count || 0}, overlap_backlog=${getDispatchBacklogCount(job.id)})`, {
|
|
1223
|
+
jobId: job.id, runId: ctx.run.id,
|
|
1224
|
+
});
|
|
1225
|
+
}
|
|
1226
|
+
|
|
1227
|
+
// Do NOT call updateJobAfterRun -- avoid incrementing consecutive_errors for drain noise
|
|
1228
|
+
if (ctx.dispatchRecord) setDispatchStatus(ctx.dispatchRecord.id, 'done');
|
|
1229
|
+
return { ...makeDefaultResult(), status: 'error', earlyReturn: true };
|
|
1230
|
+
}
|
|
1231
|
+
|
|
1232
|
+
finishRun(ctx.run.id, 'error', { error_message: err.message });
|
|
1233
|
+
if (ctx.idemKey) releaseIdempotencyKey(ctx.idemKey);
|
|
1234
|
+
if (job.agent_id) setAgentStatus(job.agent_id, 'idle', null);
|
|
1235
|
+
|
|
1236
|
+
if (shouldRetry(job, ctx.run.id)) {
|
|
1237
|
+
const retry = scheduleRetry(job, ctx.run.id);
|
|
1238
|
+
if (retry.dispatch) {
|
|
1239
|
+
log('info', `Scheduling retry ${retry.retryCount}/${job.max_retries} in ${retry.delaySec}s`, {
|
|
1240
|
+
jobId: job.id, runId: ctx.run.id,
|
|
1241
|
+
});
|
|
1242
|
+
if (job.delivery_mode === 'announce' || job.delivery_mode === 'announce-always') {
|
|
1243
|
+
const retryMsg = `Job "${job.name}" failed with exception, retry ${retry.retryCount}/${job.max_retries} scheduled`;
|
|
1244
|
+
await handleDelivery(job, retryMsg);
|
|
1245
|
+
}
|
|
1246
|
+
getDb().prepare('UPDATE runs SET retry_count = ? WHERE id = ?').run(retry.retryCount, ctx.run.id);
|
|
1247
|
+
if (ctx.dispatchRecord) setDispatchStatus(ctx.dispatchRecord.id, 'done');
|
|
1248
|
+
if (dequeueJob(job.id)) {
|
|
1249
|
+
log('info', `Dequeued pending dispatch for ${job.name} (after exception-retry)`);
|
|
1250
|
+
}
|
|
1251
|
+
} else {
|
|
1252
|
+
log('warn', `Retry skipped for ${job.name} -- dispatch backlog limit reached`, {
|
|
1253
|
+
jobId: job.id, runId: ctx.run.id,
|
|
1254
|
+
maxQueuedDispatches: job.max_queued_dispatches || 25,
|
|
1255
|
+
});
|
|
1256
|
+
if (['announce', 'announce-always'].includes(job.delivery_mode)) {
|
|
1257
|
+
await handleDelivery(job, `\u26a0\ufe0f Job failed: ${job.name}\n\n${err.message}`);
|
|
1258
|
+
}
|
|
1259
|
+
handleTriggeredChildren(job.id, 'error', err.message, ctx.run.id, ' on exception-retry-skipped');
|
|
1260
|
+
if (dequeueJob(job.id)) {
|
|
1261
|
+
log('info', `Dequeued pending dispatch for ${job.name} (after exception-retry-skipped)`);
|
|
1262
|
+
}
|
|
1263
|
+
updateJobAfterRun(job, 'error');
|
|
1264
|
+
if (ctx.dispatchRecord) setDispatchStatus(ctx.dispatchRecord.id, 'done');
|
|
1265
|
+
}
|
|
1266
|
+
} else {
|
|
1267
|
+
if (['announce', 'announce-always'].includes(job.delivery_mode)) {
|
|
1268
|
+
await handleDelivery(job, `\u26a0\ufe0f Job failed: ${job.name}\n\n${err.message}`);
|
|
1269
|
+
}
|
|
1270
|
+
handleTriggeredChildren(job.id, 'error', err.message, ctx.run.id, ' on failure');
|
|
1271
|
+
if (dequeueJob(job.id)) {
|
|
1272
|
+
log('info', `Dequeued pending dispatch for ${job.name} (after failure)`);
|
|
1273
|
+
}
|
|
1274
|
+
updateJobAfterRun(job, 'error');
|
|
1275
|
+
if (ctx.dispatchRecord) setDispatchStatus(ctx.dispatchRecord.id, 'done');
|
|
1276
|
+
}
|
|
1277
|
+
|
|
1278
|
+
return { ...makeDefaultResult(), status: 'error', earlyReturn: true };
|
|
1279
|
+
}
|
|
1280
|
+
}
|