@donkeylabs/server 0.4.7 → 0.4.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/docs/external-jobs.md +420 -0
- package/docs/workflows.md +509 -0
- package/package.json +1 -1
- package/src/core/external-job-socket.ts +356 -0
- package/src/core/external-jobs.ts +237 -0
- package/src/core/index.ts +49 -0
- package/src/core/jobs.ts +652 -9
- package/src/core/workflows.ts +1173 -0
- package/src/core.ts +2 -0
- package/src/harness.ts +3 -0
- package/src/server.ts +15 -2
|
@@ -0,0 +1,420 @@
|
|
|
1
|
+
# External Jobs
|
|
2
|
+
|
|
3
|
+
External jobs extend the built-in Jobs service to support processes written in any language (Python, Go, Rust, shell scripts, etc.) with bidirectional communication, server restart resilience, and progress updates delivered over Server-Sent Events (SSE).
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
External jobs allow you to:
|
|
8
|
+
- Run long-running tasks in any language
|
|
9
|
+
- Report progress back to the server in real-time
|
|
10
|
+
- Survive server restarts (jobs continue running)
|
|
11
|
+
- Broadcast progress updates via SSE to clients
|
|
12
|
+
- Monitor job health via heartbeats
|
|
13
|
+
|
|
14
|
+
## Architecture
|
|
15
|
+
|
|
16
|
+
```
|
|
17
|
+
┌─────────────────────────────────────────────────────────────────┐
|
|
18
|
+
│ @donkeylabs/server │
|
|
19
|
+
│ │
|
|
20
|
+
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
|
|
21
|
+
│ │ Jobs │────▶│ Events │────▶│ SSE │───────┼──▶ Client
|
|
22
|
+
│ │ Service │ │ Service │ │ Service │ │
|
|
23
|
+
│ └─────────────┘ └─────────────┘ └─────────────┘ │
|
|
24
|
+
│ │ │
|
|
25
|
+
│ │ spawn + Unix socket │
|
|
26
|
+
│ ▼ │
|
|
27
|
+
│ ┌─────────────────────────────────────────────────────┐ │
|
|
28
|
+
│ │ External Job Manager │ │
|
|
29
|
+
│ │ - Spawn processes │ │
|
|
30
|
+
│ │ - Unix socket communication │ │
|
|
31
|
+
│ │ - Heartbeat monitoring │ │
|
|
32
|
+
│ │ - State persistence for restart resilience │ │
|
|
33
|
+
│ └─────────────────────────────────────────────────────┘ │
|
|
34
|
+
└─────────────────────────────────────────────────────────────────┘
|
|
35
|
+
│
|
|
36
|
+
│ bidirectional (Unix socket)
|
|
37
|
+
▼
|
|
38
|
+
┌───────────────┐
|
|
39
|
+
│ External Job │ (Python, Go, Rust, Shell, etc.)
|
|
40
|
+
│ - Wrapper lib │
|
|
41
|
+
│ - Heartbeat │
|
|
42
|
+
│ - Progress │
|
|
43
|
+
└───────────────┘
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Quick Start
|
|
47
|
+
|
|
48
|
+
### 1. Register an External Job
|
|
49
|
+
|
|
50
|
+
```typescript
|
|
51
|
+
import { AppServer } from "@donkeylabs/server";
|
|
52
|
+
|
|
53
|
+
const server = new AppServer({
|
|
54
|
+
db: createDatabase(),
|
|
55
|
+
port: 3000,
|
|
56
|
+
});
|
|
57
|
+
|
|
58
|
+
// Register an external job that runs a Python script
|
|
59
|
+
server.getCore().jobs.registerExternal("process-video", {
|
|
60
|
+
command: "python",
|
|
61
|
+
args: ["-m", "video_processor"],
|
|
62
|
+
cwd: "./workers",
|
|
63
|
+
heartbeatTimeout: 60000, // 60 seconds
|
|
64
|
+
timeout: 3600000, // 1 hour max
|
|
65
|
+
});
|
|
66
|
+
|
|
67
|
+
await server.start();
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### 2. Enqueue the Job
|
|
71
|
+
|
|
72
|
+
```typescript
|
|
73
|
+
// Same API as regular jobs
|
|
74
|
+
const jobId = await ctx.core.jobs.enqueue("process-video", {
|
|
75
|
+
videoId: "abc123",
|
|
76
|
+
operations: ["transcode", "thumbnail"],
|
|
77
|
+
});
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### 3. Listen for Progress (Optional)
|
|
81
|
+
|
|
82
|
+
```typescript
|
|
83
|
+
// In your server setup
|
|
84
|
+
ctx.core.events.on("job.external.progress", (data) => {
|
|
85
|
+
// Broadcast to SSE clients
|
|
86
|
+
ctx.core.sse.broadcast(`job:${data.jobId}`, "progress", data);
|
|
87
|
+
});
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### 4. Write the Worker (Python)
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
# workers/video_processor.py
|
|
94
|
+
from donkeylabs_job import DonkeylabsJob, run_job
|
|
95
|
+
|
|
96
|
+
def process_video(job: DonkeylabsJob):
|
|
97
|
+
video_id = job.data["videoId"]
|
|
98
|
+
operations = job.data["operations"]
|
|
99
|
+
|
|
100
|
+
for i, op in enumerate(operations):
|
|
101
|
+
progress = (i / len(operations)) * 100
|
|
102
|
+
job.progress(progress, f"Running {op}")
|
|
103
|
+
|
|
104
|
+
# Do the actual work...
|
|
105
|
+
if op == "transcode":
|
|
106
|
+
transcode_video(video_id)
|
|
107
|
+
elif op == "thumbnail":
|
|
108
|
+
generate_thumbnail(video_id)
|
|
109
|
+
|
|
110
|
+
return {"videoId": video_id, "processed": True}
|
|
111
|
+
|
|
112
|
+
if __name__ == "__main__":
|
|
113
|
+
run_job(process_video)
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## External Job Configuration
|
|
117
|
+
|
|
118
|
+
```typescript
|
|
119
|
+
interface ExternalJobConfig {
|
|
120
|
+
/** Command to execute (e.g., "python", "node", "./script.sh") */
|
|
121
|
+
command: string;
|
|
122
|
+
|
|
123
|
+
/** Arguments to pass to the command */
|
|
124
|
+
args?: string[];
|
|
125
|
+
|
|
126
|
+
/** Working directory for the process */
|
|
127
|
+
cwd?: string;
|
|
128
|
+
|
|
129
|
+
/** Environment variables to set */
|
|
130
|
+
env?: Record<string, string>;
|
|
131
|
+
|
|
132
|
+
/** Heartbeat timeout in milliseconds (default: 30000) */
|
|
133
|
+
heartbeatTimeout?: number;
|
|
134
|
+
|
|
135
|
+
/** Job timeout in milliseconds (optional) */
|
|
136
|
+
timeout?: number;
|
|
137
|
+
}
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## Global External Jobs Configuration
|
|
141
|
+
|
|
142
|
+
Configure external jobs behavior in `ServerConfig`:
|
|
143
|
+
|
|
144
|
+
```typescript
|
|
145
|
+
const server = new AppServer({
|
|
146
|
+
db: createDatabase(),
|
|
147
|
+
jobs: {
|
|
148
|
+
concurrency: 5,
|
|
149
|
+
external: {
|
|
150
|
+
/** Directory for Unix sockets (default: /tmp/donkeylabs-jobs) */
|
|
151
|
+
socketDir: "/tmp/donkeylabs-jobs",
|
|
152
|
+
|
|
153
|
+
/** TCP port range for Windows fallback (default: [49152, 65535]) */
|
|
154
|
+
tcpPortRange: [49152, 65535],
|
|
155
|
+
|
|
156
|
+
/** Default heartbeat timeout in ms (default: 30000) */
|
|
157
|
+
defaultHeartbeatTimeout: 30000,
|
|
158
|
+
|
|
159
|
+
/** Heartbeat check interval in ms (default: 10000) */
|
|
160
|
+
heartbeatCheckInterval: 10000,
|
|
161
|
+
},
|
|
162
|
+
},
|
|
163
|
+
});
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
## Communication Protocol
|
|
167
|
+
|
|
168
|
+
External jobs communicate with the server via Unix sockets (or TCP on Windows) using newline-delimited JSON messages.
|
|
169
|
+
|
|
170
|
+
### Message Types
|
|
171
|
+
|
|
172
|
+
| Type | Direction | Description |
|
|
173
|
+
|------|-----------|-------------|
|
|
174
|
+
| `started` | Job → Server | Job has initialized and is ready |
|
|
175
|
+
| `progress` | Job → Server | Progress update |
|
|
176
|
+
| `heartbeat` | Job → Server | Health check (auto-sent by wrappers) |
|
|
177
|
+
| `log` | Job → Server | Log message |
|
|
178
|
+
| `completed` | Job → Server | Job finished successfully |
|
|
179
|
+
| `failed` | Job → Server | Job encountered an error |
|
|
180
|
+
|
|
181
|
+
### Message Format
|
|
182
|
+
|
|
183
|
+
```json
|
|
184
|
+
{
|
|
185
|
+
"type": "progress",
|
|
186
|
+
"jobId": "job_123_1234567890",
|
|
187
|
+
"timestamp": 1234567890123,
|
|
188
|
+
"percent": 50,
|
|
189
|
+
"message": "Processing step 5 of 10",
|
|
190
|
+
"data": { "currentStep": "resize" }
|
|
191
|
+
}
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
## Events
|
|
195
|
+
|
|
196
|
+
External jobs emit the following events:
|
|
197
|
+
|
|
198
|
+
| Event | Data | Description |
|
|
199
|
+
|-------|------|-------------|
|
|
200
|
+
| `job.external.spawned` | `{ jobId, name }` | Process started |
|
|
201
|
+
| `job.external.progress` | `{ jobId, name, percent, message, data }` | Progress update |
|
|
202
|
+
| `job.external.log` | `{ jobId, name, level, message, data }` | Log message |
|
|
203
|
+
| `job.completed` | `{ jobId, name, result }` | Job completed |
|
|
204
|
+
| `job.failed` | `{ jobId, name, error, stack }` | Job failed |
|
|
205
|
+
| `job.stale` | `{ jobId, name, timeSinceHeartbeat }` | No heartbeat received within `heartbeatTimeout` |
|
|
206
|
+
| `job.reconnected` | `{ jobId, name }` | Reconnected after restart |
|
|
207
|
+
| `job.lost` | `{ jobId, name }` | Lost job after restart |
|
|
208
|
+
|
|
209
|
+
### Listening for Events
|
|
210
|
+
|
|
211
|
+
```typescript
|
|
212
|
+
// Subscribe to all job progress
|
|
213
|
+
ctx.core.events.on("job.external.progress", (data) => {
|
|
214
|
+
console.log(`Job ${data.jobId}: ${data.percent}% - ${data.message}`);
|
|
215
|
+
});
|
|
216
|
+
|
|
217
|
+
// Subscribe to specific job completion
|
|
218
|
+
ctx.core.events.on("job.process-video.completed", (data) => {
|
|
219
|
+
console.log(`Video processing completed: ${data.result}`);
|
|
220
|
+
});
|
|
221
|
+
|
|
222
|
+
// Handle stale jobs
|
|
223
|
+
ctx.core.events.on("job.stale", (data) => {
|
|
224
|
+
console.warn(`Job ${data.jobId} hasn't sent heartbeat in ${data.timeSinceHeartbeat}ms`);
|
|
225
|
+
});
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
## SSE Integration
|
|
229
|
+
|
|
230
|
+
Broadcast job progress to clients via Server-Sent Events:
|
|
231
|
+
|
|
232
|
+
```typescript
|
|
233
|
+
// Server setup
|
|
234
|
+
ctx.core.events.on("job.external.progress", (data) => {
|
|
235
|
+
// Broadcast to channel "job:<jobId>"
|
|
236
|
+
ctx.core.sse.broadcast(`job:${data.jobId}`, "progress", {
|
|
237
|
+
percent: data.percent,
|
|
238
|
+
message: data.message,
|
|
239
|
+
});
|
|
240
|
+
});
|
|
241
|
+
|
|
242
|
+
// In route handler - subscribe client to job updates
|
|
243
|
+
router.route("subscribe-job").raw({
|
|
244
|
+
handle: async (req, ctx) => {
|
|
245
|
+
const url = new URL(req.url);
|
|
246
|
+
const jobId = url.searchParams.get("jobId");
|
|
247
|
+
|
|
248
|
+
// Get SSE response
|
|
249
|
+
const { client, response } = ctx.core.sse.addClient();
|
|
250
|
+
|
|
251
|
+
// Subscribe to job channel
|
|
252
|
+
ctx.core.sse.subscribe(client.id, `job:${jobId}`);
|
|
253
|
+
|
|
254
|
+
return response;
|
|
255
|
+
},
|
|
256
|
+
});
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
## Wrapper Libraries
|
|
260
|
+
|
|
261
|
+
### Python Wrapper
|
|
262
|
+
|
|
263
|
+
Located at `examples/external-jobs/python/donkeylabs_job.py`:
|
|
264
|
+
|
|
265
|
+
```python
|
|
266
|
+
from donkeylabs_job import DonkeylabsJob, run_job
|
|
267
|
+
|
|
268
|
+
def my_job(job: DonkeylabsJob):
|
|
269
|
+
# Access job data
|
|
270
|
+
data = job.data
|
|
271
|
+
|
|
272
|
+
# Report progress
|
|
273
|
+
job.progress(50, "Halfway done", extra_key="value")
|
|
274
|
+
|
|
275
|
+
# Log messages
|
|
276
|
+
job.info("Processing...")
|
|
277
|
+
job.debug("Debug info")
|
|
278
|
+
job.warn("Warning!")
|
|
279
|
+
job.error("Error occurred")
|
|
280
|
+
|
|
281
|
+
# Return result (auto-completes)
|
|
282
|
+
return {"result": "success"}
|
|
283
|
+
|
|
284
|
+
# Or manually complete/fail:
|
|
285
|
+
def manual_job(job: DonkeylabsJob):
|
|
286
|
+
try:
|
|
287
|
+
result = do_work()
|
|
288
|
+
job.complete(result)
|
|
289
|
+
except Exception as e:
|
|
290
|
+
job.fail(str(e))
|
|
291
|
+
|
|
292
|
+
if __name__ == "__main__":
|
|
293
|
+
run_job(my_job)
|
|
294
|
+
```
|
|
295
|
+
|
|
296
|
+
### Shell Wrapper
|
|
297
|
+
|
|
298
|
+
Located at `examples/external-jobs/shell/donkeylabs-job.sh`:
|
|
299
|
+
|
|
300
|
+
```bash
|
|
301
|
+
#!/bin/bash
|
|
302
|
+
source /path/to/donkeylabs-job.sh
|
|
303
|
+
|
|
304
|
+
# Initialize (reads stdin, starts heartbeat)
|
|
305
|
+
job_init 5 # 5 second heartbeat interval
|
|
306
|
+
|
|
307
|
+
# Report progress
|
|
308
|
+
job_progress 0 "Starting..."
|
|
309
|
+
|
|
310
|
+
# Log messages
|
|
311
|
+
job_info "Processing data..."
|
|
312
|
+
job_debug "Debug info"
|
|
313
|
+
job_warn "Warning!"
|
|
314
|
+
job_error "Error!"
|
|
315
|
+
|
|
316
|
+
# Access job data (requires jq)
|
|
317
|
+
STEPS=$(job_data_get '.steps // 5')
|
|
318
|
+
|
|
319
|
+
# Do work...
|
|
320
|
+
for i in $(seq 1 $STEPS); do
|
|
321
|
+
job_progress $((i * 100 / STEPS)) "Step $i"
|
|
322
|
+
sleep 1
|
|
323
|
+
done
|
|
324
|
+
|
|
325
|
+
# Complete
|
|
326
|
+
job_complete '{"result": "success"}'
|
|
327
|
+
|
|
328
|
+
# Or fail
|
|
329
|
+
# job_fail "Something went wrong"
|
|
330
|
+
```
|
|
331
|
+
|
|
332
|
+
## Server Restart Resilience
|
|
333
|
+
|
|
334
|
+
External jobs survive server restarts:
|
|
335
|
+
|
|
336
|
+
1. **On Shutdown**: Job state (PID, socket path) is persisted in the database
|
|
337
|
+
2. **On Startup**: Server checks for orphaned jobs:
|
|
338
|
+
- If process is still alive, attempts reconnection
|
|
339
|
+
- If process died, marks job as failed
|
|
340
|
+
3. **Reconnection**: External process continues sending heartbeats; server picks them up
|
|
341
|
+
|
|
342
|
+
### Best Practices
|
|
343
|
+
|
|
344
|
+
- External workers should handle reconnection gracefully
|
|
345
|
+
- Use heartbeats to detect server restarts
|
|
346
|
+
- Design operations to be idempotent in case a job is re-executed
|
|
347
|
+
|
|
348
|
+
## Error Handling
|
|
349
|
+
|
|
350
|
+
### Heartbeat Timeout
|
|
351
|
+
|
|
352
|
+
If a job stops sending heartbeats:
|
|
353
|
+
|
|
354
|
+
1. After `heartbeatTimeout`: Emits `job.stale` event
|
|
355
|
+
2. After `2 * heartbeatTimeout`: Kills process, marks job as failed
|
|
356
|
+
|
|
357
|
+
### Process Exit
|
|
358
|
+
|
|
359
|
+
If the external process exits:
|
|
360
|
+
|
|
361
|
+
- Exit code 0 without a completion message: a warning is logged
|
|
362
|
+
- Non-zero exit code: Job marked as failed
|
|
363
|
+
|
|
364
|
+
### Job Timeout
|
|
365
|
+
|
|
366
|
+
If configured, jobs are killed after `timeout` milliseconds.
|
|
367
|
+
|
|
368
|
+
## API Reference
|
|
369
|
+
|
|
370
|
+
### Jobs Service
|
|
371
|
+
|
|
372
|
+
```typescript
|
|
373
|
+
interface Jobs {
|
|
374
|
+
// Register external job configuration
|
|
375
|
+
registerExternal(name: string, config: ExternalJobConfig): void;
|
|
376
|
+
|
|
377
|
+
// Enqueue (works for both internal and external)
|
|
378
|
+
enqueue<T>(name: string, data: T, options?: { maxAttempts?: number }): Promise<string>;
|
|
379
|
+
|
|
380
|
+
// Schedule for later
|
|
381
|
+
schedule<T>(name: string, data: T, runAt: Date, options?: { maxAttempts?: number }): Promise<string>;
|
|
382
|
+
|
|
383
|
+
// Get job by ID
|
|
384
|
+
get(jobId: string): Promise<Job | null>;
|
|
385
|
+
|
|
386
|
+
// Cancel a job (kills external process if running)
|
|
387
|
+
cancel(jobId: string): Promise<boolean>;
|
|
388
|
+
|
|
389
|
+
// Get all running external jobs
|
|
390
|
+
getRunningExternal(): Promise<Job[]>;
|
|
391
|
+
}
|
|
392
|
+
```
|
|
393
|
+
|
|
394
|
+
### Extended Job Interface
|
|
395
|
+
|
|
396
|
+
```typescript
|
|
397
|
+
interface Job {
|
|
398
|
+
id: string;
|
|
399
|
+
name: string;
|
|
400
|
+
data: any;
|
|
401
|
+
status: JobStatus;
|
|
402
|
+
// ... standard fields ...
|
|
403
|
+
|
|
404
|
+
// External job fields
|
|
405
|
+
external?: boolean;
|
|
406
|
+
pid?: number;
|
|
407
|
+
socketPath?: string;
|
|
408
|
+
tcpPort?: number;
|
|
409
|
+
lastHeartbeat?: Date;
|
|
410
|
+
processState?: "spawning" | "running" | "orphaned" | "reconnecting";
|
|
411
|
+
}
|
|
412
|
+
```
|
|
413
|
+
|
|
414
|
+
## Examples
|
|
415
|
+
|
|
416
|
+
See the `examples/external-jobs/` directory for complete examples:
|
|
417
|
+
|
|
418
|
+
- `python/donkeylabs_job.py` - Python wrapper library
|
|
419
|
+
- `shell/donkeylabs-job.sh` - Shell wrapper library
|
|
420
|
+
- `shell/example-job.sh` - Example shell script job
|