@donkeylabs/server 0.4.7 → 0.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,420 @@
1
+ # External Jobs
2
+
3
+ External jobs extend the built-in Jobs service to support processes written in any language (Python, Go, Rust, shell scripts, etc.) with bidirectional communication, server restart resilience, and SSE progress updates.
4
+
5
+ ## Overview
6
+
7
+ External jobs allow you to:
8
+ - Run long-running tasks in any language
9
+ - Report progress back to the server in real-time
10
+ - Survive server restarts (jobs continue running)
11
+ - Broadcast progress updates via SSE to clients
12
+ - Monitor job health via heartbeats
13
+
14
+ ## Architecture
15
+
16
+ ```
17
+ ┌─────────────────────────────────────────────────────────────────┐
18
+ │ @donkeylabs/server │
19
+ │ │
20
+ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
21
+ │ │ Jobs │────▶│ Events │────▶│ SSE │───────┼──▶ Client
22
+ │ │ Service │ │ Service │ │ Service │ │
23
+ │ └─────────────┘ └─────────────┘ └─────────────┘ │
24
+ │ │ │
25
+ │ │ spawn + Unix socket │
26
+ │ ▼ │
27
+ │ ┌─────────────────────────────────────────────────────┐ │
28
+ │ │ External Job Manager │ │
29
+ │ │ - Spawn processes │ │
30
+ │ │ - Unix socket communication │ │
31
+ │ │ - Heartbeat monitoring │ │
32
+ │ │ - State persistence for restart resilience │ │
33
+ │ └─────────────────────────────────────────────────────┘ │
34
+ └─────────────────────────────────────────────────────────────────┘
35
+
36
+ │ bidirectional (Unix socket)
37
+
38
+ ┌───────────────┐
39
+ │ External Job │ (Python, Go, Rust, Shell, etc.)
40
+ │ - Wrapper lib │
41
+ │ - Heartbeat │
42
+ │ - Progress │
43
+ └───────────────┘
44
+ ```
45
+
46
+ ## Quick Start
47
+
48
+ ### 1. Register an External Job
49
+
50
+ ```typescript
51
+ import { AppServer } from "@donkeylabs/server";
52
+
53
+ const server = new AppServer({
54
+ db: createDatabase(),
55
+ port: 3000,
56
+ });
57
+
58
+ // Register an external job that runs a Python script
59
+ server.getCore().jobs.registerExternal("process-video", {
60
+ command: "python",
61
+ args: ["-m", "video_processor"],
62
+ cwd: "./workers",
63
+ heartbeatTimeout: 60000, // 60 seconds
64
+ timeout: 3600000, // 1 hour max
65
+ });
66
+
67
+ await server.start();
68
+ ```
69
+
70
+ ### 2. Enqueue the Job
71
+
72
+ ```typescript
73
+ // Same API as regular jobs
74
+ const jobId = await ctx.core.jobs.enqueue("process-video", {
75
+ videoId: "abc123",
76
+ operations: ["transcode", "thumbnail"],
77
+ });
78
+ ```
79
+
80
+ ### 3. Listen for Progress (Optional)
81
+
82
+ ```typescript
83
+ // In your server setup
84
+ ctx.core.events.on("job.external.progress", (data) => {
85
+ // Broadcast to SSE clients
86
+ ctx.core.sse.broadcast(`job:${data.jobId}`, "progress", data);
87
+ });
88
+ ```
89
+
90
+ ### 4. Write the Worker (Python)
91
+
92
+ ```python
93
+ # workers/video_processor.py
94
+ from donkeylabs_job import DonkeylabsJob, run_job
95
+
96
+ def process_video(job: DonkeylabsJob):
97
+ video_id = job.data["videoId"]
98
+ operations = job.data["operations"]
99
+
100
+ for i, op in enumerate(operations):
101
+ progress = (i / len(operations)) * 100
102
+ job.progress(progress, f"Running {op}")
103
+
104
+ # Do the actual work...
105
+ if op == "transcode":
106
+ transcode_video(video_id)
107
+ elif op == "thumbnail":
108
+ generate_thumbnail(video_id)
109
+
110
+ return {"videoId": video_id, "processed": True}
111
+
112
+ if __name__ == "__main__":
113
+ run_job(process_video)
114
+ ```
115
+
116
+ ## External Job Configuration
117
+
118
+ ```typescript
119
+ interface ExternalJobConfig {
120
+ /** Command to execute (e.g., "python", "node", "./script.sh") */
121
+ command: string;
122
+
123
+ /** Arguments to pass to the command */
124
+ args?: string[];
125
+
126
+ /** Working directory for the process */
127
+ cwd?: string;
128
+
129
+ /** Environment variables to set */
130
+ env?: Record<string, string>;
131
+
132
+ /** Heartbeat timeout in milliseconds (default: 30000) */
133
+ heartbeatTimeout?: number;
134
+
135
+ /** Job timeout in milliseconds (optional) */
136
+ timeout?: number;
137
+ }
138
+ ```
139
+
140
+ ## Global External Jobs Configuration
141
+
142
+ Configure global external-job behavior in `ServerConfig`:
143
+
144
+ ```typescript
145
+ const server = new AppServer({
146
+ db: createDatabase(),
147
+ jobs: {
148
+ concurrency: 5,
149
+ external: {
150
+ /** Directory for Unix sockets (default: /tmp/donkeylabs-jobs) */
151
+ socketDir: "/tmp/donkeylabs-jobs",
152
+
153
+ /** TCP port range for Windows fallback (default: [49152, 65535]) */
154
+ tcpPortRange: [49152, 65535],
155
+
156
+ /** Default heartbeat timeout in ms (default: 30000) */
157
+ defaultHeartbeatTimeout: 30000,
158
+
159
+ /** Heartbeat check interval in ms (default: 10000) */
160
+ heartbeatCheckInterval: 10000,
161
+ },
162
+ },
163
+ });
164
+ ```
165
+
166
+ ## Communication Protocol
167
+
168
+ External jobs communicate with the server via Unix sockets (or TCP on Windows) using newline-delimited JSON messages.
169
+
170
+ ### Message Types
171
+
172
+ | Type | Direction | Description |
173
+ |------|-----------|-------------|
174
+ | `started` | Job → Server | Job has initialized and is ready |
175
+ | `progress` | Job → Server | Progress update |
176
+ | `heartbeat` | Job → Server | Health check (auto-sent by wrappers) |
177
+ | `log` | Job → Server | Log message |
178
+ | `completed` | Job → Server | Job finished successfully |
179
+ | `failed` | Job → Server | Job encountered an error |
180
+
181
+ ### Message Format
182
+
183
+ ```json
184
+ {
185
+ "type": "progress",
186
+ "jobId": "job_123_1234567890",
187
+ "timestamp": 1234567890123,
188
+ "percent": 50,
189
+ "message": "Processing step 5 of 10",
190
+ "data": { "currentStep": "resize" }
191
+ }
192
+ ```
193
+
194
+ ## Events
195
+
196
+ External jobs emit the following events:
197
+
198
+ | Event | Data | Description |
199
+ |-------|------|-------------|
200
+ | `job.external.spawned` | `{ jobId, name }` | Process started |
201
+ | `job.external.progress` | `{ jobId, name, percent, message, data }` | Progress update |
202
+ | `job.external.log` | `{ jobId, name, level, message, data }` | Log message |
203
+ | `job.completed` | `{ jobId, name, result }` | Job completed |
204
+ | `job.failed` | `{ jobId, name, error, stack }` | Job failed |
205
+ | `job.stale` | `{ jobId, name, timeSinceHeartbeat }` | No heartbeat received within `heartbeatTimeout` |
206
+ | `job.reconnected` | `{ jobId, name }` | Reconnected after restart |
207
+ | `job.lost` | `{ jobId, name }` | Lost job after restart |
208
+
209
+ ### Listening for Events
210
+
211
+ ```typescript
212
+ // Subscribe to all job progress
213
+ ctx.core.events.on("job.external.progress", (data) => {
214
+ console.log(`Job ${data.jobId}: ${data.percent}% - ${data.message}`);
215
+ });
216
+
217
+ // Subscribe to specific job completion
218
+ ctx.core.events.on("job.process-video.completed", (data) => {
219
+ console.log("Video processing completed:", data.result);
220
+ });
221
+
222
+ // Handle stale jobs
223
+ ctx.core.events.on("job.stale", (data) => {
224
+ console.warn(`Job ${data.jobId} hasn't sent heartbeat in ${data.timeSinceHeartbeat}ms`);
225
+ });
226
+ ```
227
+
228
+ ## SSE Integration
229
+
230
+ Broadcast job progress to clients via Server-Sent Events:
231
+
232
+ ```typescript
233
+ // Server setup
234
+ ctx.core.events.on("job.external.progress", (data) => {
235
+ // Broadcast to channel "job:<jobId>"
236
+ ctx.core.sse.broadcast(`job:${data.jobId}`, "progress", {
237
+ percent: data.percent,
238
+ message: data.message,
239
+ });
240
+ });
241
+
242
+ // In route handler - subscribe client to job updates
243
+ router.route("subscribe-job").raw({
244
+ handle: async (req, ctx) => {
245
+ const url = new URL(req.url);
246
+ const jobId = url.searchParams.get("jobId");
247
+
248
+ // Get SSE response
249
+ const { client, response } = ctx.core.sse.addClient();
250
+
251
+ // Subscribe to job channel
252
+ ctx.core.sse.subscribe(client.id, `job:${jobId}`);
253
+
254
+ return response;
255
+ },
256
+ });
257
+ ```
258
+
259
+ ## Wrapper Libraries
260
+
261
+ ### Python Wrapper
262
+
263
+ Located at `examples/external-jobs/python/donkeylabs_job.py`:
264
+
265
+ ```python
266
+ from donkeylabs_job import DonkeylabsJob, run_job
267
+
268
+ def my_job(job: DonkeylabsJob):
269
+ # Access job data
270
+ data = job.data
271
+
272
+ # Report progress
273
+ job.progress(50, "Halfway done", extra_key="value")
274
+
275
+ # Log messages
276
+ job.info("Processing...")
277
+ job.debug("Debug info")
278
+ job.warn("Warning!")
279
+ job.error("Error occurred")
280
+
281
+ # Return result (auto-completes)
282
+ return {"result": "success"}
283
+
284
+ # Or manually complete/fail:
285
+ def manual_job(job: DonkeylabsJob):
286
+ try:
287
+ result = do_work()
288
+ job.complete(result)
289
+ except Exception as e:
290
+ job.fail(str(e))
291
+
292
+ if __name__ == "__main__":
293
+ run_job(my_job)
294
+ ```
295
+
296
+ ### Shell Wrapper
297
+
298
+ Located at `examples/external-jobs/shell/donkeylabs-job.sh`:
299
+
300
+ ```bash
301
+ #!/bin/bash
302
+ source /path/to/donkeylabs-job.sh
303
+
304
+ # Initialize (reads stdin, starts heartbeat)
305
+ job_init 5 # 5 second heartbeat interval
306
+
307
+ # Report progress
308
+ job_progress 0 "Starting..."
309
+
310
+ # Log messages
311
+ job_info "Processing data..."
312
+ job_debug "Debug info"
313
+ job_warn "Warning!"
314
+ job_error "Error!"
315
+
316
+ # Access job data (requires jq)
317
+ STEPS=$(job_data_get '.steps // 5')
318
+
319
+ # Do work...
320
+ for i in $(seq 1 $STEPS); do
321
+ job_progress $((i * 100 / STEPS)) "Step $i"
322
+ sleep 1
323
+ done
324
+
325
+ # Complete
326
+ job_complete '{"result": "success"}'
327
+
328
+ # Or fail
329
+ # job_fail "Something went wrong"
330
+ ```
331
+
332
+ ## Server Restart Resilience
333
+
334
+ External jobs survive server restarts:
335
+
336
+ 1. **On Shutdown**: Job state (PID, socket path) is persisted in the database
337
+ 2. **On Startup**: Server checks for orphaned jobs:
338
+ - If process is still alive, attempts reconnection
339
+ - If process died, marks job as failed
340
+ 3. **Reconnection**: The external process keeps sending heartbeats, and the server picks them up again once it has reconnected
341
+
342
+ ### Best Practices
343
+
344
+ - External workers should handle reconnection gracefully
345
+ - Use heartbeats to detect server restarts
346
+ - Make operations idempotent where possible, since a job may be re-executed
347
+
348
+ ## Error Handling
349
+
350
+ ### Heartbeat Timeout
351
+
352
+ If a job stops sending heartbeats:
353
+
354
+ 1. After `heartbeatTimeout`: Emits `job.stale` event
355
+ 2. After `2 * heartbeatTimeout`: Kills process, marks job as failed
356
+
357
+ ### Process Exit
358
+
359
+ If the external process exits:
360
+
361
+ - Exit code 0 without a `completed` message: a warning is logged
362
+ - Non-zero exit code: Job marked as failed
363
+
364
+ ### Job Timeout
365
+
366
+ If configured, jobs are killed after `timeout` milliseconds.
367
+
368
+ ## API Reference
369
+
370
+ ### Jobs Service
371
+
372
+ ```typescript
373
+ interface Jobs {
374
+ // Register external job configuration
375
+ registerExternal(name: string, config: ExternalJobConfig): void;
376
+
377
+ // Enqueue (works for both internal and external)
378
+ enqueue<T>(name: string, data: T, options?: { maxAttempts?: number }): Promise<string>;
379
+
380
+ // Schedule for later
381
+ schedule<T>(name: string, data: T, runAt: Date, options?: { maxAttempts?: number }): Promise<string>;
382
+
383
+ // Get job by ID
384
+ get(jobId: string): Promise<Job | null>;
385
+
386
+ // Cancel a job (kills external process if running)
387
+ cancel(jobId: string): Promise<boolean>;
388
+
389
+ // Get all running external jobs
390
+ getRunningExternal(): Promise<Job[]>;
391
+ }
392
+ ```
393
+
394
+ ### Extended Job Interface
395
+
396
+ ```typescript
397
+ interface Job {
398
+ id: string;
399
+ name: string;
400
+ data: any;
401
+ status: JobStatus;
402
+ // ... standard fields ...
403
+
404
+ // External job fields
405
+ external?: boolean;
406
+ pid?: number;
407
+ socketPath?: string;
408
+ tcpPort?: number;
409
+ lastHeartbeat?: Date;
410
+ processState?: "spawning" | "running" | "orphaned" | "reconnecting";
411
+ }
412
+ ```
413
+
414
+ ## Examples
415
+
416
+ See the `examples/external-jobs/` directory for complete examples:
417
+
418
+ - `python/donkeylabs_job.py` - Python wrapper library
419
+ - `shell/donkeylabs-job.sh` - Shell wrapper library
420
+ - `shell/example-job.sh` - Example shell script job