@donkeylabs/server 0.4.7 → 0.4.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/docs/external-jobs.md +420 -0
- package/docs/workflows.md +509 -0
- package/package.json +1 -1
- package/src/core/external-job-socket.ts +356 -0
- package/src/core/external-jobs.ts +237 -0
- package/src/core/index.ts +49 -0
- package/src/core/jobs.ts +652 -9
- package/src/core/workflows.ts +1173 -0
- package/src/core.ts +2 -0
- package/src/harness.ts +3 -0
- package/src/server.ts +15 -2
package/src/core/jobs.ts
CHANGED
|
@@ -1,7 +1,30 @@
|
|
|
1
1
|
// Core Jobs Service
|
|
2
2
|
// Background job queue with scheduling
|
|
3
|
+
// Supports both in-process handlers and external processes (Python, Go, Shell, etc.)
|
|
3
4
|
|
|
4
5
|
import type { Events } from "./events";
|
|
6
|
+
import type {
|
|
7
|
+
ExternalJobConfig,
|
|
8
|
+
ExternalJob,
|
|
9
|
+
ExternalJobProcessState,
|
|
10
|
+
ExternalJobsConfig,
|
|
11
|
+
AnyExternalJobMessage,
|
|
12
|
+
} from "./external-jobs";
|
|
13
|
+
import {
|
|
14
|
+
isProcessAlive,
|
|
15
|
+
isExternalJob,
|
|
16
|
+
isProgressMessage,
|
|
17
|
+
isHeartbeatMessage,
|
|
18
|
+
isLogMessage,
|
|
19
|
+
isCompletedMessage,
|
|
20
|
+
isFailedMessage,
|
|
21
|
+
isStartedMessage,
|
|
22
|
+
createInitialPayload,
|
|
23
|
+
} from "./external-jobs";
|
|
24
|
+
import {
|
|
25
|
+
createExternalJobSocketServer,
|
|
26
|
+
type ExternalJobSocketServer,
|
|
27
|
+
} from "./external-job-socket";
|
|
5
28
|
|
|
6
29
|
export type JobStatus = "pending" | "running" | "completed" | "failed" | "scheduled";
|
|
7
30
|
|
|
@@ -18,6 +41,19 @@ export interface Job {
|
|
|
18
41
|
error?: string;
|
|
19
42
|
attempts: number;
|
|
20
43
|
maxAttempts: number;
|
|
44
|
+
// External job fields (null/undefined for in-process jobs)
|
|
45
|
+
/** Flag indicating this is an external job */
|
|
46
|
+
external?: boolean;
|
|
47
|
+
/** Process ID of the external process */
|
|
48
|
+
pid?: number;
|
|
49
|
+
/** Unix socket path for communication */
|
|
50
|
+
socketPath?: string;
|
|
51
|
+
/** TCP port for Windows fallback */
|
|
52
|
+
tcpPort?: number;
|
|
53
|
+
/** Timestamp of last heartbeat */
|
|
54
|
+
lastHeartbeat?: Date;
|
|
55
|
+
/** Current process state */
|
|
56
|
+
processState?: ExternalJobProcessState;
|
|
21
57
|
}
|
|
22
58
|
|
|
23
59
|
export interface JobHandler<T = any, R = any> {
|
|
@@ -32,6 +68,10 @@ export interface JobAdapter {
|
|
|
32
68
|
getPending(limit?: number): Promise<Job[]>;
|
|
33
69
|
getScheduledReady(now: Date): Promise<Job[]>;
|
|
34
70
|
getByName(name: string, status?: JobStatus): Promise<Job[]>;
|
|
71
|
+
/** Get all running external jobs */
|
|
72
|
+
getRunningExternal(): Promise<Job[]>;
|
|
73
|
+
/** Get external jobs that need reconnection after server restart */
|
|
74
|
+
getOrphanedExternal(): Promise<Job[]>;
|
|
35
75
|
}
|
|
36
76
|
|
|
37
77
|
export interface JobsConfig {
|
|
@@ -40,16 +80,30 @@ export interface JobsConfig {
|
|
|
40
80
|
concurrency?: number; // Max concurrent jobs, default 5
|
|
41
81
|
pollInterval?: number; // ms, default 1000
|
|
42
82
|
maxAttempts?: number; // Default retry attempts, default 3
|
|
83
|
+
/** External jobs configuration */
|
|
84
|
+
external?: ExternalJobsConfig;
|
|
43
85
|
}
|
|
44
86
|
|
|
45
87
|
/**
 * Public surface of the job queue: handler registration, enqueueing and
 * scheduling, lookup, cancellation, and lifecycle control.
 */
export interface Jobs {
  /** Registers an in-process handler under `name`. */
  register<T = any, R = any>(name: string, handler: JobHandler<T, R>): void;

  /** Registers a job that runs as an external process (Python, Go, Shell, etc.). */
  registerExternal(name: string, config: ExternalJobConfig): void;

  /** Queues a job for execution; used for in-process and external jobs alike. */
  enqueue<T = any>(name: string, data: T, options?: { maxAttempts?: number }): Promise<string>;

  /** Queues a job that should run at `runAt`. */
  schedule<T = any>(name: string, data: T, runAt: Date, options?: { maxAttempts?: number }): Promise<string>;

  /** Looks a job up by its ID. */
  get(jobId: string): Promise<Job | null>;

  /** Cancels a pending job. */
  cancel(jobId: string): Promise<boolean>;

  /** Lists jobs registered under `name`, optionally restricted to one status. */
  getByName(name: string, status?: JobStatus): Promise<Job[]>;

  /** Lists every external job that is currently running. */
  getRunningExternal(): Promise<Job[]>;

  /** Starts the job processing loop. */
  start(): void;

  /** Stops job processing and cleans up. */
  stop(): Promise<void>;
}
|
|
55
109
|
|
|
@@ -110,6 +164,26 @@ export class MemoryJobAdapter implements JobAdapter {
|
|
|
110
164
|
}
|
|
111
165
|
return results;
|
|
112
166
|
}
|
|
167
|
+
|
|
168
|
+
/** Returns every stored job that is flagged as external and has status "running". */
async getRunningExternal(): Promise<Job[]> {
  return Array.from(this.jobs.values()).filter(
    (candidate) => candidate.external && candidate.status === "running"
  );
}
|
|
177
|
+
|
|
178
|
+
/**
 * Returns external jobs that may need reconnection: jobs still marked
 * "running" whose process state is "orphaned" or "running".
 *
 * Jobs that already reached a final status are excluded. Their processState
 * is not reset when they finish, so without the status check a completed job
 * would be reported as orphaned and could be re-marked as failed.
 */
async getOrphanedExternal(): Promise<Job[]> {
  const results: Job[] = [];
  for (const job of this.jobs.values()) {
    if (!job.external || job.status !== "running") continue;
    if (job.processState === "orphaned" || job.processState === "running") {
      results.push(job);
    }
  }
  return results;
}
|
|
113
187
|
}
|
|
114
188
|
|
|
115
189
|
class JobsImpl implements Jobs {
|
|
@@ -118,28 +192,49 @@ class JobsImpl implements Jobs {
|
|
|
118
192
|
private handlers = new Map<string, JobHandler>();
|
|
119
193
|
private running = false;
|
|
120
194
|
private timer: ReturnType<typeof setInterval> | null = null;
|
|
195
|
+
private heartbeatTimer: ReturnType<typeof setInterval> | null = null;
|
|
121
196
|
private activeJobs = 0;
|
|
122
197
|
private concurrency: number;
|
|
123
198
|
private pollInterval: number;
|
|
124
199
|
private defaultMaxAttempts: number;
|
|
125
200
|
|
|
201
|
+
// External jobs support
|
|
202
|
+
private externalConfigs = new Map<string, ExternalJobConfig>();
|
|
203
|
+
private externalConfig: ExternalJobsConfig;
|
|
204
|
+
private socketServer: ExternalJobSocketServer | null = null;
|
|
205
|
+
private externalProcesses = new Map<string, { pid: number; timeout?: ReturnType<typeof setTimeout> }>();
|
|
206
|
+
|
|
126
207
|
/**
 * Builds the job service from `config`, falling back to an in-memory adapter,
 * 5 concurrent jobs, a 1000 ms poll interval, 3 attempts, and an empty
 * external-jobs configuration when the corresponding option is absent.
 */
constructor(config: JobsConfig = {}) {
  const { adapter, events, concurrency, pollInterval, maxAttempts, external } = config;

  this.adapter = adapter ?? new MemoryJobAdapter();
  this.events = events;
  this.concurrency = concurrency ?? 5;
  this.pollInterval = pollInterval ?? 1000;
  this.defaultMaxAttempts = maxAttempts ?? 3;
  this.externalConfig = external ?? {};
}
|
|
133
215
|
|
|
134
216
|
/**
 * Registers an in-process handler for `name`.
 *
 * @throws Error if `name` is already used by an in-process handler or an external job.
 */
register<T = any, R = any>(name: string, handler: JobHandler<T, R>): void {
  const nameTaken = this.handlers.has(name) || this.externalConfigs.has(name);
  if (nameTaken) {
    throw new Error(`Job handler "${name}" is already registered`);
  }

  this.handlers.set(name, handler);
}
|
|
140
222
|
|
|
223
|
+
/**
 * Registers the external-process configuration for `name`.
 *
 * @throws Error if `name` is already used by an in-process handler or an external job.
 */
registerExternal(name: string, config: ExternalJobConfig): void {
  const nameIsFree = !this.handlers.has(name) && !this.externalConfigs.has(name);
  if (!nameIsFree) {
    throw new Error(`Job handler "${name}" is already registered`);
  }

  this.externalConfigs.set(name, config);
}
|
|
229
|
+
|
|
230
|
+
/** Tells whether `name` was registered through `registerExternal`. */
private isExternalJob(name: string): boolean {
  const registered = this.externalConfigs.has(name);
  return registered;
}
|
|
233
|
+
|
|
141
234
|
async enqueue<T = any>(name: string, data: T, options: { maxAttempts?: number } = {}): Promise<string> {
|
|
142
|
-
|
|
235
|
+
const isExternal = this.isExternalJob(name);
|
|
236
|
+
|
|
237
|
+
if (!isExternal && !this.handlers.has(name)) {
|
|
143
238
|
throw new Error(`No handler registered for job "${name}"`);
|
|
144
239
|
}
|
|
145
240
|
|
|
@@ -150,6 +245,8 @@ class JobsImpl implements Jobs {
|
|
|
150
245
|
createdAt: new Date(),
|
|
151
246
|
attempts: 0,
|
|
152
247
|
maxAttempts: options.maxAttempts ?? this.defaultMaxAttempts,
|
|
248
|
+
external: isExternal || undefined,
|
|
249
|
+
processState: isExternal ? "spawning" : undefined,
|
|
153
250
|
});
|
|
154
251
|
|
|
155
252
|
return job.id;
|
|
@@ -161,7 +258,9 @@ class JobsImpl implements Jobs {
|
|
|
161
258
|
runAt: Date,
|
|
162
259
|
options: { maxAttempts?: number } = {}
|
|
163
260
|
): Promise<string> {
|
|
164
|
-
|
|
261
|
+
const isExternal = this.isExternalJob(name);
|
|
262
|
+
|
|
263
|
+
if (!isExternal && !this.handlers.has(name)) {
|
|
165
264
|
throw new Error(`No handler registered for job "${name}"`);
|
|
166
265
|
}
|
|
167
266
|
|
|
@@ -173,6 +272,8 @@ class JobsImpl implements Jobs {
|
|
|
173
272
|
runAt,
|
|
174
273
|
attempts: 0,
|
|
175
274
|
maxAttempts: options.maxAttempts ?? this.defaultMaxAttempts,
|
|
275
|
+
external: isExternal || undefined,
|
|
276
|
+
processState: isExternal ? "spawning" : undefined,
|
|
176
277
|
});
|
|
177
278
|
|
|
178
279
|
return job.id;
|
|
@@ -187,8 +288,18 @@ class JobsImpl implements Jobs {
|
|
|
187
288
|
if (!job) return false;
|
|
188
289
|
|
|
189
290
|
if (job.status === "running") {
|
|
190
|
-
//
|
|
191
|
-
|
|
291
|
+
// For external jobs, we can try to kill the process
|
|
292
|
+
if (job.external && job.pid) {
|
|
293
|
+
try {
|
|
294
|
+
process.kill(job.pid, "SIGTERM");
|
|
295
|
+
} catch {
|
|
296
|
+
// Process may already be dead
|
|
297
|
+
}
|
|
298
|
+
await this.cleanupExternalJob(jobId);
|
|
299
|
+
} else {
|
|
300
|
+
// Can't cancel running in-process job
|
|
301
|
+
return false;
|
|
302
|
+
}
|
|
192
303
|
}
|
|
193
304
|
|
|
194
305
|
return this.adapter.delete(jobId);
|
|
@@ -198,10 +309,22 @@ class JobsImpl implements Jobs {
|
|
|
198
309
|
return this.adapter.getByName(name, status);
|
|
199
310
|
}
|
|
200
311
|
|
|
312
|
+
/** Delegates to the adapter to list the external jobs that are currently running. */
async getRunningExternal(): Promise<Job[]> {
  const runningExternal = await this.adapter.getRunningExternal();
  return runningExternal;
}
|
|
315
|
+
|
|
201
316
|
start(): void {
|
|
202
317
|
if (this.running) return;
|
|
203
318
|
this.running = true;
|
|
204
319
|
|
|
320
|
+
// Initialize socket server for external jobs
|
|
321
|
+
if (this.externalConfigs.size > 0) {
|
|
322
|
+
this.initializeSocketServer();
|
|
323
|
+
this.startHeartbeatMonitor();
|
|
324
|
+
// Attempt to reconnect to orphaned jobs from previous run
|
|
325
|
+
this.reconnectOrphanedJobs();
|
|
326
|
+
}
|
|
327
|
+
|
|
205
328
|
this.timer = setInterval(() => this.tick(), this.pollInterval);
|
|
206
329
|
// Run immediately too
|
|
207
330
|
this.tick();
|
|
@@ -209,19 +332,322 @@ class JobsImpl implements Jobs {
|
|
|
209
332
|
|
|
210
333
|
/**
 * Stops job processing: cancels the poll and heartbeat timers, sends SIGTERM
 * to every tracked external process, shuts the socket server down, and then
 * waits up to 30 seconds for active jobs to drain.
 */
async stop(): Promise<void> {
  this.running = false;

  if (this.timer) {
    clearInterval(this.timer);
    this.timer = null;
  }

  if (this.heartbeatTimer) {
    clearInterval(this.heartbeatTimer);
    this.heartbeatTimer = null;
  }

  // Terminate external processes and drop their pending timeouts.
  for (const { pid, timeout } of this.externalProcesses.values()) {
    if (timeout) {
      clearTimeout(timeout);
    }
    try {
      process.kill(pid, "SIGTERM");
    } catch {
      // Process may already be dead
    }
  }
  this.externalProcesses.clear();

  if (this.socketServer) {
    await this.socketServer.shutdown();
    this.socketServer = null;
  }

  // Poll until the active jobs finish or the grace period runs out.
  const maxWait = 30000; // 30 seconds
  const deadline = Date.now() + maxWait;
  while (this.activeJobs > 0 && Date.now() < deadline) {
    await new Promise(resolve => setTimeout(resolve, 100));
  }
}
|
|
224
372
|
|
|
373
|
+
/**
 * Creates the socket server used to talk to external job processes and wires
 * its callbacks: messages go to `handleExternalMessage`, disconnects to
 * `handleExternalDisconnect`, and connects/errors are logged.
 */
private initializeSocketServer(): void {
  this.socketServer = createExternalJobSocketServer(this.externalConfig, {
    onMessage: (incoming) => this.handleExternalMessage(incoming),
    onConnect: (connectedId) => {
      console.log(`[Jobs] External job ${connectedId} connected`);
    },
    onDisconnect: (droppedId) => {
      console.log(`[Jobs] External job ${droppedId} disconnected`);
      // The job may still be alive; the handler marks it orphaned if it is still running.
      this.handleExternalDisconnect(droppedId);
    },
    onError: (socketError, failedId) => {
      console.error(`[Jobs] External job socket error:`, socketError, failedId);
    },
  });
}
|
|
389
|
+
|
|
390
|
+
/**
 * Starts the periodic heartbeat check. The period comes from
 * `externalConfig.heartbeatCheckInterval` and defaults to 10000 ms.
 */
private startHeartbeatMonitor(): void {
  const intervalMs = this.externalConfig.heartbeatCheckInterval ?? 10000;
  this.heartbeatTimer = setInterval(() => this.checkHeartbeats(), intervalMs);
}
|
|
397
|
+
|
|
398
|
+
/**
 * Inspects every running external job for missed heartbeats.
 *
 * A job whose last heartbeat is older than its timeout (per-job config, then
 * the global default, then 30000 ms) is reported as stale. Once the silence
 * exceeds twice the timeout the process is sent SIGKILL, the job is marked
 * failed and cleaned up, and a "job.failed" event is emitted. Any error raised
 * during the scan is logged and not rethrown.
 */
private async checkHeartbeats(): Promise<void> {
  try {
    const candidates = await this.adapter.getRunningExternal();
    const now = Date.now();

    for (const job of candidates) {
      // Jobs without a recorded heartbeat are not evaluated.
      if (!(job.external && job.lastHeartbeat)) continue;

      const heartbeatTimeout =
        this.externalConfigs.get(job.name)?.heartbeatTimeout ??
        this.externalConfig.defaultHeartbeatTimeout ??
        30000;
      const elapsedMs = now - job.lastHeartbeat.getTime();

      // Negated ">" (rather than "<=") keeps a NaN elapsed time on the skip path.
      if (!(elapsedMs > heartbeatTimeout)) continue;

      console.warn(`[Jobs] External job ${job.id} is stale (no heartbeat for ${elapsedMs}ms)`);

      if (this.events) {
        await this.events.emit("job.stale", {
          jobId: job.id,
          name: job.name,
          timeSinceHeartbeat: elapsedMs,
        });
      }

      // Only kill once the job has been silent for twice its timeout.
      if (!(elapsedMs > heartbeatTimeout * 2)) continue;

      console.error(`[Jobs] Killing stale external job ${job.id}`);

      if (job.pid) {
        try {
          process.kill(job.pid, "SIGKILL");
        } catch {
          // Process may already be dead
        }
      }

      await this.adapter.update(job.id, {
        status: "failed",
        error: "Heartbeat timeout - job process unresponsive",
        completedAt: new Date(),
        processState: "orphaned",
      });

      await this.cleanupExternalJob(job.id);

      if (this.events) {
        await this.events.emit("job.failed", {
          jobId: job.id,
          name: job.name,
          error: "Heartbeat timeout",
        });
      }
    }
  } catch (err) {
    console.error("[Jobs] Heartbeat check error:", err);
  }
}
|
|
457
|
+
|
|
458
|
+
/**
 * Reconciles external jobs reported by the adapter as possibly orphaned.
 *
 * For each job that has not already finished:
 * - no PID recorded: the job is marked failed and "job.failed" is emitted;
 * - process alive: a socket reconnect is attempted; on success the job is
 *   marked running with a fresh heartbeat and "job.reconnected" is emitted,
 *   otherwise it is marked orphaned and "job.lost" is emitted;
 * - process dead: the job is marked failed and "job.failed" is emitted.
 *
 * Finally the socket server is asked to clean sockets that do not belong to
 * a job whose process is still alive. Errors are logged and not rethrown.
 */
private async reconnectOrphanedJobs(): Promise<void> {
  try {
    const orphaned = await this.adapter.getOrphanedExternal();
    const activeJobIds = new Set<string>();

    for (const job of orphaned) {
      // Never rewrite a job that already reached a final state; an adapter may
      // still report it because processState is not reset on completion.
      if (job.status === "completed" || job.status === "failed") {
        continue;
      }

      if (!job.pid) {
        // No PID, mark as failed
        const error = "Lost job state - no PID available";
        await this.adapter.update(job.id, {
          status: "failed",
          error,
          completedAt: new Date(),
        });

        // Emit the failure like the dead-process branch does, so listeners
        // learn about every job that is failed here.
        if (this.events) {
          await this.events.emit("job.failed", {
            jobId: job.id,
            name: job.name,
            error,
          });
        }
        continue;
      }

      // Check if process is still alive
      if (isProcessAlive(job.pid)) {
        console.log(`[Jobs] Found orphaned job ${job.id} with PID ${job.pid}, attempting reconnect`);
        activeJobIds.add(job.id);

        // Try to reconnect to the socket
        const reconnected = await this.socketServer?.reconnect(
          job.id,
          job.socketPath,
          job.tcpPort
        );

        if (reconnected) {
          await this.adapter.update(job.id, {
            processState: "running",
            lastHeartbeat: new Date(),
          });

          if (this.events) {
            await this.events.emit("job.reconnected", {
              jobId: job.id,
              name: job.name,
            });
          }
        } else {
          // Mark as orphaned, but keep tracking
          await this.adapter.update(job.id, { processState: "orphaned" });

          if (this.events) {
            await this.events.emit("job.lost", {
              jobId: job.id,
              name: job.name,
            });
          }
        }
      } else {
        // Process is dead, mark job as failed
        console.log(`[Jobs] Orphaned job ${job.id} process (PID ${job.pid}) is dead`);
        await this.adapter.update(job.id, {
          status: "failed",
          error: "Process died unexpectedly",
          completedAt: new Date(),
        });

        if (this.events) {
          await this.events.emit("job.failed", {
            jobId: job.id,
            name: job.name,
            error: "Process died unexpectedly",
          });
        }
      }
    }

    // Clean orphaned socket files
    await this.socketServer?.cleanOrphanedSockets(activeJobIds);
  } catch (err) {
    console.error("[Jobs] Orphan reconnection error:", err);
  }
}
|
|
534
|
+
|
|
535
|
+
/**
 * Applies a message received from an external job process.
 *
 * - started: marks the process running and emits "job.external.spawned";
 * - heartbeat / progress / log: refreshes `lastHeartbeat` (progress and log
 *   also emit "job.external.progress" / "job.external.log");
 * - completed / failed: stores the final state, cleans the job up, and emits
 *   the generic and the per-name completion or failure events.
 *
 * Messages for unknown jobs are logged and dropped. Messages for a job that
 * is already completed or failed are dropped as well, so a late message (for
 * example one sent while the process is being terminated after a timeout)
 * cannot overwrite the final state or produce duplicate terminal events.
 *
 * @param message Message decoded from the job's socket.
 */
private async handleExternalMessage(message: AnyExternalJobMessage): Promise<void> {
  const job = await this.adapter.get(message.jobId);
  if (!job) {
    console.warn(`[Jobs] Received message for unknown job: ${message.jobId}`);
    return;
  }

  // Final states are sticky.
  if (job.status === "completed" || job.status === "failed") {
    return;
  }

  if (isStartedMessage(message)) {
    await this.adapter.update(message.jobId, {
      processState: "running",
      lastHeartbeat: new Date(message.timestamp),
    });

    if (this.events) {
      await this.events.emit("job.external.spawned", {
        jobId: message.jobId,
        name: job.name,
      });
    }
  } else if (isHeartbeatMessage(message)) {
    await this.adapter.update(message.jobId, {
      lastHeartbeat: new Date(message.timestamp),
    });
  } else if (isProgressMessage(message)) {
    // Any traffic from the process counts as a sign of life.
    await this.adapter.update(message.jobId, {
      lastHeartbeat: new Date(message.timestamp),
    });

    if (this.events) {
      await this.events.emit("job.external.progress", {
        jobId: message.jobId,
        name: job.name,
        percent: message.percent,
        message: message.message,
        data: message.data,
      });
    }
  } else if (isLogMessage(message)) {
    await this.adapter.update(message.jobId, {
      lastHeartbeat: new Date(message.timestamp),
    });

    if (this.events) {
      await this.events.emit("job.external.log", {
        jobId: message.jobId,
        name: job.name,
        level: message.level,
        message: message.message,
        data: message.data,
      });
    }
  } else if (isCompletedMessage(message)) {
    await this.adapter.update(message.jobId, {
      status: "completed",
      result: message.result,
      completedAt: new Date(message.timestamp),
    });

    await this.cleanupExternalJob(message.jobId);

    if (this.events) {
      await this.events.emit("job.completed", {
        jobId: message.jobId,
        name: job.name,
        result: message.result,
      });
      await this.events.emit(`job.${job.name}.completed`, {
        jobId: message.jobId,
        result: message.result,
      });
    }
  } else if (isFailedMessage(message)) {
    await this.adapter.update(message.jobId, {
      status: "failed",
      error: message.error,
      completedAt: new Date(message.timestamp),
    });

    await this.cleanupExternalJob(message.jobId);

    if (this.events) {
      await this.events.emit("job.failed", {
        jobId: message.jobId,
        name: job.name,
        error: message.error,
        stack: message.stack,
      });
      await this.events.emit(`job.${job.name}.failed`, {
        jobId: message.jobId,
        error: message.error,
        stack: message.stack,
      });
    }
  }
}
|
|
630
|
+
|
|
631
|
+
/**
 * Reacts to a job's socket closing: if the job is still recorded as running,
 * its process state is set to "orphaned" because the process may still be alive.
 */
private async handleExternalDisconnect(jobId: string): Promise<void> {
  const current = await this.adapter.get(jobId);

  if (current?.status === "running") {
    await this.adapter.update(jobId, { processState: "orphaned" });
  }
}
|
|
638
|
+
|
|
639
|
+
/**
 * Releases the bookkeeping for an external job: cancels its pending timeout,
 * forgets the tracked process, and closes the job's socket.
 */
private async cleanupExternalJob(jobId: string): Promise<void> {
  const pendingTimeout = this.externalProcesses.get(jobId)?.timeout;
  if (pendingTimeout) {
    clearTimeout(pendingTimeout);
  }
  this.externalProcesses.delete(jobId);

  await this.socketServer?.closeSocket(jobId);
}
|
|
650
|
+
|
|
225
651
|
private async tick(): Promise<void> {
|
|
226
652
|
if (!this.running) return;
|
|
227
653
|
|
|
@@ -240,13 +666,230 @@ class JobsImpl implements Jobs {
|
|
|
240
666
|
const pending = await this.adapter.getPending(availableSlots);
|
|
241
667
|
for (const job of pending) {
|
|
242
668
|
if (this.activeJobs >= this.concurrency) break;
|
|
243
|
-
|
|
669
|
+
|
|
670
|
+
if (job.external) {
|
|
671
|
+
this.processExternalJob(job);
|
|
672
|
+
} else {
|
|
673
|
+
this.processJob(job);
|
|
674
|
+
}
|
|
244
675
|
}
|
|
245
676
|
} catch (err) {
|
|
246
677
|
console.error("[Jobs] Tick error:", err);
|
|
247
678
|
}
|
|
248
679
|
}
|
|
249
680
|
|
|
681
|
+
/**
 * Runs one external job: creates its socket, marks it running, spawns the
 * configured command with the payload on stdin, arms the optional timeout,
 * and watches for process exit.
 *
 * A job without a registered external config is marked failed immediately.
 * Any error raised while setting the job up marks it failed, cleans it up,
 * and emits "job.failed".
 *
 * The concurrency slot taken here is always released: by the exit handler
 * (in a `finally`, so adapter or event errors cannot leak it) or by the setup
 * error path. Errors inside the timeout callback and the exit handler are
 * logged instead of surfacing as unhandled rejections.
 *
 * @param job The pending external job to start.
 */
private async processExternalJob(job: Job): Promise<void> {
  const config = this.externalConfigs.get(job.name);
  if (!config) {
    await this.adapter.update(job.id, {
      status: "failed",
      error: `No external config registered for job "${job.name}"`,
      completedAt: new Date(),
    });
    return;
  }

  this.activeJobs++;
  const startedAt = new Date();

  try {
    // Create socket for this job
    if (!this.socketServer) {
      this.initializeSocketServer();
    }

    const { socketPath, tcpPort } = await this.socketServer!.createSocket(job.id);

    // Update job with socket info
    await this.adapter.update(job.id, {
      status: "running",
      startedAt,
      attempts: job.attempts + 1,
      socketPath,
      tcpPort,
      processState: "spawning",
      lastHeartbeat: startedAt,
    });

    // Create initial payload
    const payload = createInitialPayload(
      job.id,
      job.name,
      job.data,
      socketPath ?? `tcp://127.0.0.1:${tcpPort}`
    );

    // Spawn the external process
    const env = {
      ...process.env,
      ...config.env,
      DONKEYLABS_JOB_ID: job.id,
      DONKEYLABS_JOB_NAME: job.name,
      DONKEYLABS_SOCKET_PATH: socketPath ?? "",
      DONKEYLABS_TCP_PORT: String(tcpPort ?? ""),
    };

    const proc = Bun.spawn([config.command, ...(config.args ?? [])], {
      cwd: config.cwd,
      env,
      stdin: "pipe",
      stdout: "pipe",
      stderr: "pipe",
    });

    // Store process info
    this.externalProcesses.set(job.id, { pid: proc.pid });

    // Update job with PID
    await this.adapter.update(job.id, { pid: proc.pid });

    // Send payload via stdin
    proc.stdin.write(payload + "\n");
    proc.stdin.end();

    // Set up process timeout if configured
    if (config.timeout) {
      const timeout = setTimeout(async () => {
        console.warn(`[Jobs] External job ${job.id} timed out after ${config.timeout}ms`);
        try {
          process.kill(proc.pid, "SIGTERM");
        } catch {
          // Process may already be dead
        }

        try {
          await this.adapter.update(job.id, {
            status: "failed",
            error: `Job timed out after ${config.timeout}ms`,
            completedAt: new Date(),
          });

          await this.cleanupExternalJob(job.id);

          if (this.events) {
            await this.events.emit("job.failed", {
              jobId: job.id,
              name: job.name,
              error: "Timeout",
            });
          }
        } catch (err) {
          // Nothing awaits a timer callback, so a rejection here would be unhandled.
          console.error(`[Jobs] External job ${job.id} timeout handling error:`, err);
        }
      }, config.timeout);

      const procInfo = this.externalProcesses.get(job.id);
      if (procInfo) {
        procInfo.timeout = timeout;
      }
    }

    // Handle process exit
    void proc.exited.then(async (code) => {
      try {
        // Only handle exit if job is still running (not already completed/failed via message)
        const currentJob = await this.adapter.get(job.id);
        if (currentJob?.status === "running") {
          if (code === 0) {
            // Process exited cleanly but didn't send completion message
            // This might be ok, or might indicate an issue
            console.warn(`[Jobs] External job ${job.id} exited with code 0 but no completion message`);
          } else {
            // Process failed
            await this.adapter.update(job.id, {
              status: "failed",
              error: `Process exited with code ${code}`,
              completedAt: new Date(),
            });

            if (this.events) {
              await this.events.emit("job.failed", {
                jobId: job.id,
                name: job.name,
                error: `Process exited with code ${code}`,
              });
            }
          }

          await this.cleanupExternalJob(job.id);
        }
      } catch (err) {
        console.error(`[Jobs] External job ${job.id} exit handling error:`, err);
      } finally {
        // Release the concurrency slot even when the bookkeeping above fails.
        this.activeJobs--;
      }
    });

    // Stream stdout/stderr to logs
    this.streamProcessOutput(job.id, job.name, proc);
  } catch (err) {
    const error = err instanceof Error ? err.message : String(err);

    await this.adapter.update(job.id, {
      status: "failed",
      error,
      completedAt: new Date(),
    });

    await this.cleanupExternalJob(job.id);

    if (this.events) {
      await this.events.emit("job.failed", {
        jobId: job.id,
        name: job.name,
        error,
      });
    }

    this.activeJobs--;
  }
}
|
|
839
|
+
|
|
840
|
+
/**
 * Forwards an external process's stdout and stderr to the console and, when
 * an event bus is configured, to "job.external.log" events (stdout as level
 * "info", stderr as level "error"). Reading happens in the background; this
 * method returns immediately.
 *
 * Each stream gets its own streaming UTF-8 decoder, so a multi-byte character
 * split across two chunks is decoded correctly instead of being replaced by
 * U+FFFD, and stdout/stderr cannot corrupt each other's partial sequences.
 *
 * @param jobId   ID used in log prefixes and event payloads.
 * @param jobName Job name included in event payloads.
 * @param proc    The spawned process whose output is read.
 */
private streamProcessOutput(
  jobId: string,
  jobName: string,
  proc: ReturnType<typeof Bun.spawn>
): void {
  const events = this.events;

  // Logs one decoded piece of output and mirrors it to the event bus.
  const report = async (text: string, level: "info" | "error") => {
    if (level === "error") {
      console.error(`[Jobs:${jobId}] stderr:`, text.trim());
    } else {
      console.log(`[Jobs:${jobId}] stdout:`, text.trim());
    }

    if (events) {
      await events.emit("job.external.log", {
        jobId,
        name: jobName,
        level,
        message: text.trim(),
      });
    }
  };

  // Helper to stream a ReadableStream
  const streamOutput = async (
    stream: ReadableStream<Uint8Array> | undefined,
    level: "info" | "error"
  ) => {
    if (!stream) return;

    const decoder = new TextDecoder();

    try {
      const reader = stream.getReader();
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;

        // stream: true keeps an incomplete trailing byte sequence buffered
        // until the next chunk arrives.
        const text = decoder.decode(value, { stream: true });
        // An empty string means the chunk held only a partial character.
        if (text.length === 0) continue;

        await report(text, level);
      }

      // Flush whatever the decoder still holds at end of stream.
      const tail = decoder.decode();
      if (tail.length > 0) {
        await report(tail, level);
      }
    } catch {
      // Stream may be closed
    }
  };

  // Stream stdout
  if (proc.stdout && typeof proc.stdout !== "number") {
    streamOutput(proc.stdout as ReadableStream<Uint8Array>, "info");
  }

  // Stream stderr
  if (proc.stderr && typeof proc.stderr !== "number") {
    streamOutput(proc.stderr as ReadableStream<Uint8Array>, "error");
  }
}
|
|
892
|
+
|
|
250
893
|
private async processJob(job: Job): Promise<void> {
|
|
251
894
|
const handler = this.handlers.get(job.name);
|
|
252
895
|
if (!handler) {
|