@donkeylabs/server 0.5.0 → 0.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -258,9 +258,17 @@ router.route("subscribe-job").raw({
258
258
 
259
259
  ## Wrapper Libraries
260
260
 
261
- ### Python Wrapper
261
+ After installing `@donkeylabs/server`, copy the wrapper(s) you need into your project:
262
+
263
+ ```bash
264
+ # Python
265
+ cp node_modules/@donkeylabs/server/examples/external-jobs/python/donkeylabs_job.py ./workers/
266
+
267
+ # Shell
268
+ cp node_modules/@donkeylabs/server/examples/external-jobs/shell/donkeylabs-job.sh ./workers/
269
+ ```
262
270
 
263
- Located at `examples/external-jobs/python/donkeylabs_job.py`:
271
+ ### Python Wrapper
264
272
 
265
273
  ```python
266
274
  from donkeylabs_job import DonkeylabsJob, run_job
@@ -331,19 +339,131 @@ job_complete '{"result": "success"}'
331
339
 
332
340
  ## Server Restart Resilience
333
341
 
334
- External jobs survive server restarts:
342
+ External jobs automatically survive server restarts through built-in SQLite persistence.
343
+
344
+ ### Default Behavior (SQLite Persistence)
345
+
346
+ Jobs are automatically persisted to `.donkeylabs/jobs.db` by default:
347
+
348
+ ```typescript
349
+ import { AppServer } from "@donkeylabs/server";
350
+
351
+ const server = new AppServer({
352
+ db: createDatabase(),
353
+ // Jobs automatically use SQLite persistence - no config needed!
354
+ });
355
+
356
+ server.getCore().jobs.registerExternal("process-video", {
357
+ command: "python",
358
+ args: ["-m", "video_processor"],
359
+ });
360
+ ```
361
+
362
+ ### Configuration Options
363
+
364
+ ```typescript
365
+ const server = new AppServer({
366
+ db: createDatabase(),
367
+ jobs: {
368
+ // SQLite is used by default (persist: true)
369
+ persist: true, // Set to false for in-memory only
370
+ dbPath: ".donkeylabs/jobs.db", // Custom database path
371
+ external: {
372
+ socketDir: "/tmp/donkeylabs-jobs",
373
+ },
374
+ },
375
+ });
376
+ ```
377
+
378
+ ### Custom Adapter
379
+
380
+ For Postgres, MySQL, or other databases, provide your own adapter:
381
+
382
+ ```typescript
383
+ import { AppServer, SqliteJobAdapter } from "@donkeylabs/server";
384
+ import { MyPostgresJobAdapter } from "./adapters/postgres";
385
+
386
+ const server = new AppServer({
387
+ db: createDatabase(),
388
+ jobs: {
389
+ adapter: new MyPostgresJobAdapter(db), // Custom adapter
390
+ },
391
+ });
392
+ ```
393
+
394
+ ### What Gets Persisted
395
+
396
+ The adapter must persist these fields for external jobs:
397
+
398
+ | Field | Description |
399
+ |-------|-------------|
400
+ | `id` | Unique job ID |
401
+ | `name` | Job name |
402
+ | `data` | Job payload (JSON) |
403
+ | `status` | pending, running, completed, failed |
404
+ | `pid` | External process ID |
405
+ | `socketPath` | Unix socket path |
406
+ | `tcpPort` | TCP port (Windows) |
407
+ | `lastHeartbeat` | Last heartbeat timestamp |
408
+ | `processState` | spawning, running, orphaned |
409
+
410
+ ### How Reconnection Works
411
+
412
+ 1. **On Server Shutdown**: Job state is already persisted in the database
413
+ 2. **On Server Restart**:
414
+ - Server queries for jobs where `status = 'running'` and `external = true`
415
+ - Checks if the process is still alive (via PID)
416
+ - Checks that the heartbeat hasn't expired
417
+ - **Reserves** the socket path/port to prevent new jobs from using it
418
+ - Recreates the socket server on the **same path/port**
419
+ - External process detects disconnection and retries connecting
420
+ 3. **Reconnection**: Once reconnected, the job resumes normal operation
421
+ 4. **Cleanup**: When the job completes, fails, or is killed, the reservation is released
422
+
423
+ ### Socket/Port Reservation
424
+
425
+ The server prevents new jobs from accidentally using socket paths or TCP ports that are reserved for orphaned jobs awaiting reconnection:
426
+
427
+ - When an orphaned job is detected on startup, its socket path/port is **reserved**
428
+ - New jobs cannot use reserved paths/ports (an error is thrown if attempted)
429
+ - Reservations are automatically released when:
430
+ - The job completes successfully
431
+ - The job fails
432
+ - The job is killed due to stale heartbeat
433
+ - The process is confirmed dead
434
+
435
+ This ensures that running external processes can always reconnect to their original socket path/port even if the server restarts multiple times.
436
+
437
+ ### Python Wrapper Reconnection
438
+
439
+ The Python wrapper automatically handles reconnection:
440
+
441
+ ```python
442
+ # Default reconnection settings
443
+ job = DonkeylabsJob(
444
+ job_id=job_id,
445
+ name=name,
446
+ data=data,
447
+ socket_path=socket_path,
448
+ heartbeat_interval=5.0, # Heartbeat every 5 seconds
449
+ reconnect_interval=2.0, # Retry every 2 seconds
450
+ max_reconnect_attempts=30, # Try for up to 60 seconds
451
+ )
452
+ ```
335
453
 
336
- 1. **On Shutdown**: Job state (PID, socket path) is persisted in the database
337
- 2. **On Startup**: Server checks for orphaned jobs:
338
- - If process is still alive, attempts reconnection
339
- - If process died, marks job as failed
340
- 3. **Reconnection**: External process continues sending heartbeats; server picks them up
454
+ When the connection is lost:
455
+ 1. Heartbeat/progress messages fail to send
456
+ 2. Background reconnection thread starts
457
+ 3. Retries connecting to the same socket path
458
+ 4. Once reconnected, sends "started" message to server
459
+ 5. Normal operation resumes
341
460
 
342
461
  ### Best Practices
343
462
 
344
- - External workers should handle reconnection gracefully
345
- - Use heartbeats to detect server restarts
346
- - Consider idempotent operations for potential re-execution
463
+ - **Always use a persistent adapter in production**
464
+ - External workers should be idempotent when possible
465
+ - Set `heartbeatTimeout` appropriately (longer = more time to reconnect)
466
+ - Consider longer `max_reconnect_attempts` for critical jobs
347
467
 
348
468
  ## Error Handling
349
469
 
package/docs/router.md CHANGED
@@ -304,6 +304,99 @@ router.route("getUser").typed({
304
304
 
305
305
  ---
306
306
 
307
+ ## Type Generation
308
+
309
+ When using `donkeylabs generate` to create a typed API client, **you must provide explicit `output` schemas if your route returns data**.
310
+
311
+ ### Output Schema Rules
312
+
313
+ - **No `output` schema** → Generated type is `void` (handler should return nothing)
314
+ - **With `output` schema** → Generated type matches the schema
315
+
316
+ This enforces explicitness: if you want to return data, you must declare what you're returning.
317
+
318
+ **Without `output` schema (returns void):**
319
+ ```ts
320
+ // Handler should NOT return anything
321
+ router.route("delete").typed({
322
+ input: z.object({ id: z.string() }),
323
+ handle: async (input, ctx) => {
324
+ await ctx.plugins.recordings.delete(input.id);
325
+ // No return - this is correct for void output
326
+ },
327
+ });
328
+ ```
329
+
330
+ **With `output` schema (returns data):**
331
+ ```ts
332
+ // ✅ Generated type will be: Output = Expand<{ recordings: Recording[]; total: number; }>
333
+ router.route("list").typed({
334
+ input: z.object({ page: z.number() }),
335
+ output: z.object({
336
+ recordings: z.array(RecordingSchema),
337
+ total: z.number(),
338
+ }),
339
+ handle: async (input, ctx) => {
340
+ return ctx.plugins.recordings.list(input);
341
+ },
342
+ });
343
+ ```
344
+
345
+ ### Best Practice: Always Define Output Schemas
346
+
347
+ For proper type safety in generated clients:
348
+
349
+ 1. **Define Zod schemas for outputs**:
350
+ ```ts
351
+ // schemas.ts
352
+ export const RecordingSchema = z.object({
353
+ id: z.string(),
354
+ name: z.string(),
355
+ duration: z.number(),
356
+ createdAt: z.string(),
357
+ });
358
+
359
+ export const RecordingListOutput = z.object({
360
+ recordings: z.array(RecordingSchema),
361
+ total: z.number(),
362
+ page: z.number(),
363
+ });
364
+ ```
365
+
366
+ 2. **Use them in routes**:
367
+ ```ts
368
+ import { RecordingListOutput } from "./schemas";
369
+
370
+ router.route("list").typed({
371
+ input: z.object({ page: z.number().default(1) }),
372
+ output: RecordingListOutput,
373
+ handle: async (input, ctx) => {
374
+ return ctx.plugins.recordings.list(input);
375
+ },
376
+ });
377
+ ```
378
+
379
+ 3. **Run type generation**:
380
+ ```bash
381
+ donkeylabs generate
382
+ ```
383
+
384
+ The generated client will have properly typed methods:
385
+ ```ts
386
+ // Generated API client
387
+ api.recordings.list({ page: 1 }) // Returns Promise<{ recordings: Recording[]; total: number; page: number; }>
388
+ ```
389
+
390
+ ### Debugging Missing Types
391
+
392
+ If your generated client shows `Output = Expand<void>` but your handler returns data:
393
+
394
+ 1. Add an explicit `output` Zod schema that matches your return type
395
+ 2. Run `donkeylabs generate` to regenerate the client
396
+ 3. Check the warning logs - routes without output schemas are listed
397
+
398
+ ---
399
+
307
400
  ## Real-World Examples
308
401
 
309
402
  ### CRUD Operations
@@ -0,0 +1,366 @@
1
+ """
2
+ Donkeylabs External Job Python Wrapper
3
+
4
+ This module provides a simple interface for Python scripts to communicate
5
+ with the Donkeylabs job system via Unix sockets or TCP.
6
+
7
+ Usage:
8
+ from donkeylabs_job import DonkeylabsJob, run_job
9
+
10
+ def my_job(job: DonkeylabsJob):
11
+ job.progress(0, "Starting...")
12
+ # Do work...
13
+ job.progress(50, "Halfway done")
14
+ # More work...
15
+ return {"result": "success"}
16
+
17
+ if __name__ == "__main__":
18
+ run_job(my_job)
19
+ """
20
+
21
+ import json
22
+ import os
23
+ import socket
24
+ import sys
25
+ import threading
26
+ import time
27
+ from typing import Any, Callable, Dict, Optional
28
+
29
+
30
+ class DonkeylabsJob:
31
+ """Interface for communicating with the Donkeylabs job system."""
32
+
33
+ def __init__(
34
+ self,
35
+ job_id: str,
36
+ name: str,
37
+ data: Any,
38
+ socket_path: str,
39
+ heartbeat_interval: float = 5.0,
40
+ reconnect_interval: float = 2.0,
41
+ max_reconnect_attempts: int = 30,
42
+ ):
43
+ self.job_id = job_id
44
+ self.name = name
45
+ self.data = data
46
+ self._socket_path = socket_path
47
+ self._heartbeat_interval = heartbeat_interval
48
+ self._reconnect_interval = reconnect_interval
49
+ self._max_reconnect_attempts = max_reconnect_attempts
50
+ self._socket: Optional[socket.socket] = None
51
+ self._heartbeat_thread: Optional[threading.Thread] = None
52
+ self._reconnect_thread: Optional[threading.Thread] = None
53
+ self._running = False
54
+ self._connected = False
55
+ self._lock = threading.Lock()
56
+ self._reconnect_lock = threading.Lock()
57
+
58
+ def connect(self) -> None:
59
+ """Connect to the job server socket."""
60
+ self._do_connect()
61
+ self._running = True
62
+ self._connected = True
63
+ self._start_heartbeat()
64
+ self._send_started()
65
+
66
+ def _do_connect(self) -> None:
67
+ """Internal connection logic."""
68
+ if self._socket_path.startswith("tcp://"):
69
+ # TCP connection (Windows fallback)
70
+ addr = self._socket_path[6:] # Remove "tcp://"
71
+ host, port = addr.rsplit(":", 1)
72
+ self._socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
73
+ self._socket.connect((host, int(port)))
74
+ else:
75
+ # Unix socket
76
+ self._socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
77
+ self._socket.connect(self._socket_path)
78
+
79
+ def _try_reconnect(self) -> bool:
80
+ """Attempt to reconnect to the server (for server restart resilience)."""
81
+ with self._reconnect_lock:
82
+ if self._connected:
83
+ return True
84
+
85
+ print(f"[DonkeylabsJob] Attempting to reconnect...", file=sys.stderr)
86
+
87
+ for attempt in range(self._max_reconnect_attempts):
88
+ try:
89
+ # Close old socket
90
+ if self._socket:
91
+ try:
92
+ self._socket.close()
93
+ except Exception:
94
+ pass
95
+
96
+ # Try to reconnect
97
+ self._do_connect()
98
+ self._connected = True
99
+ print(f"[DonkeylabsJob] Reconnected after {attempt + 1} attempts", file=sys.stderr)
100
+
101
+ # Send started message to let server know we're back
102
+ self._send_started()
103
+ return True
104
+ except Exception as e:
105
+ print(f"[DonkeylabsJob] Reconnect attempt {attempt + 1}/{self._max_reconnect_attempts} failed: {e}", file=sys.stderr)
106
+ time.sleep(self._reconnect_interval)
107
+
108
+ print(f"[DonkeylabsJob] Failed to reconnect after {self._max_reconnect_attempts} attempts", file=sys.stderr)
109
+ return False
110
+
111
+ def disconnect(self) -> None:
112
+ """Disconnect from the job server."""
113
+ self._running = False
114
+ if self._heartbeat_thread:
115
+ self._heartbeat_thread.join(timeout=2.0)
116
+ if self._socket:
117
+ try:
118
+ self._socket.close()
119
+ except Exception:
120
+ pass
121
+
122
+ def _send_message(self, message: Dict[str, Any]) -> bool:
123
+ """Send a JSON message to the server. Returns True if sent successfully."""
124
+ if not self._socket:
125
+ return False
126
+
127
+ message["jobId"] = self.job_id
128
+ message["timestamp"] = int(time.time() * 1000)
129
+
130
+ with self._lock:
131
+ try:
132
+ data = json.dumps(message) + "\n"
133
+ self._socket.sendall(data.encode("utf-8"))
134
+ return True
135
+ except (BrokenPipeError, ConnectionResetError, OSError) as e:
136
+ print(f"[DonkeylabsJob] Connection lost: {e}", file=sys.stderr)
137
+ self._connected = False
138
+
139
+ # Try to reconnect in background (don't block the caller)
140
+ if self._running and not self._reconnect_thread:
141
+ self._reconnect_thread = threading.Thread(
142
+ target=self._reconnect_loop,
143
+ daemon=True
144
+ )
145
+ self._reconnect_thread.start()
146
+ return False
147
+ except Exception as e:
148
+ print(f"[DonkeylabsJob] Failed to send message: {e}", file=sys.stderr)
149
+ return False
150
+
151
+ def _reconnect_loop(self) -> None:
152
+ """Background thread that attempts to reconnect."""
153
+ if self._try_reconnect():
154
+ print(f"[DonkeylabsJob] Reconnection successful, resuming operation", file=sys.stderr)
155
+ else:
156
+ print(f"[DonkeylabsJob] Reconnection failed, job may be lost", file=sys.stderr)
157
+ self._reconnect_thread = None
158
+
159
+ def _send_started(self) -> None:
160
+ """Send a started message to the server."""
161
+ self._send_message({"type": "started"})
162
+
163
+ def _start_heartbeat(self) -> None:
164
+ """Start the background heartbeat thread."""
165
+
166
+ def heartbeat_loop():
167
+ while self._running:
168
+ self._send_message({"type": "heartbeat"})
169
+ time.sleep(self._heartbeat_interval)
170
+
171
+ self._heartbeat_thread = threading.Thread(target=heartbeat_loop, daemon=True)
172
+ self._heartbeat_thread.start()
173
+
174
+ def progress(
175
+ self,
176
+ percent: float,
177
+ message: Optional[str] = None,
178
+ **data: Any,
179
+ ) -> None:
180
+ """
181
+ Report progress to the job server.
182
+
183
+ Args:
184
+ percent: Progress percentage (0-100)
185
+ message: Optional status message
186
+ **data: Additional data to include
187
+ """
188
+ msg: Dict[str, Any] = {
189
+ "type": "progress",
190
+ "percent": percent,
191
+ }
192
+ if message:
193
+ msg["message"] = message
194
+ if data:
195
+ msg["data"] = data
196
+
197
+ self._send_message(msg)
198
+
199
+ def log(
200
+ self,
201
+ level: str,
202
+ message: str,
203
+ **data: Any,
204
+ ) -> None:
205
+ """
206
+ Send a log message to the job server.
207
+
208
+ Args:
209
+ level: Log level (debug, info, warn, error)
210
+ message: Log message
211
+ **data: Additional data to include
212
+ """
213
+ msg: Dict[str, Any] = {
214
+ "type": "log",
215
+ "level": level,
216
+ "message": message,
217
+ }
218
+ if data:
219
+ msg["data"] = data
220
+
221
+ self._send_message(msg)
222
+
223
+ def debug(self, message: str, **data: Any) -> None:
224
+ """Send a debug log message."""
225
+ self.log("debug", message, **data)
226
+
227
+ def info(self, message: str, **data: Any) -> None:
228
+ """Send an info log message."""
229
+ self.log("info", message, **data)
230
+
231
+ def warn(self, message: str, **data: Any) -> None:
232
+ """Send a warning log message."""
233
+ self.log("warn", message, **data)
234
+
235
+ def error(self, message: str, **data: Any) -> None:
236
+ """Send an error log message."""
237
+ self.log("error", message, **data)
238
+
239
+ def complete(self, result: Any = None) -> None:
240
+ """
241
+ Mark the job as completed.
242
+
243
+ Args:
244
+ result: Optional result data to return
245
+ """
246
+ msg: Dict[str, Any] = {"type": "completed"}
247
+ if result is not None:
248
+ msg["result"] = result
249
+
250
+ self._send_message(msg)
251
+
252
+ def fail(self, error: str, stack: Optional[str] = None) -> None:
253
+ """
254
+ Mark the job as failed.
255
+
256
+ Args:
257
+ error: Error message
258
+ stack: Optional stack trace
259
+ """
260
+ msg: Dict[str, Any] = {
261
+ "type": "failed",
262
+ "error": error,
263
+ }
264
+ if stack:
265
+ msg["stack"] = stack
266
+
267
+ self._send_message(msg)
268
+
269
+
270
+ def run_job(
271
+ handler: Callable[[DonkeylabsJob], Any],
272
+ heartbeat_interval: float = 5.0,
273
+ ) -> None:
274
+ """
275
+ Run a job handler function.
276
+
277
+ This function reads the job payload from stdin, connects to the job server,
278
+ runs the handler, and reports the result.
279
+
280
+ Args:
281
+ handler: A function that takes a DonkeylabsJob and returns the result
282
+ heartbeat_interval: How often to send heartbeats (seconds)
283
+
284
+ Example:
285
+ def my_job(job: DonkeylabsJob):
286
+ job.progress(0, "Starting...")
287
+ result = do_work(job.data)
288
+ return result
289
+
290
+ if __name__ == "__main__":
291
+ run_job(my_job)
292
+ """
293
+ # Read payload from stdin
294
+ payload_line = sys.stdin.readline()
295
+ if not payload_line:
296
+ print("No payload received on stdin", file=sys.stderr)
297
+ sys.exit(1)
298
+
299
+ try:
300
+ payload = json.loads(payload_line)
301
+ except json.JSONDecodeError as e:
302
+ print(f"Failed to parse payload: {e}", file=sys.stderr)
303
+ sys.exit(1)
304
+
305
+ job_id = payload.get("jobId")
306
+ name = payload.get("name")
307
+ data = payload.get("data")
308
+ socket_path = payload.get("socketPath")
309
+
310
+ # Fall back to environment variables if not in payload
311
+ if not job_id:
312
+ job_id = os.environ.get("DONKEYLABS_JOB_ID")
313
+ if not socket_path:
314
+ socket_path = os.environ.get("DONKEYLABS_SOCKET_PATH")
315
+ tcp_port = os.environ.get("DONKEYLABS_TCP_PORT")
316
+ if tcp_port and not socket_path:
317
+ socket_path = f"tcp://127.0.0.1:{tcp_port}"
318
+
319
+ if not job_id or not socket_path:
320
+ print("Missing jobId or socketPath", file=sys.stderr)
321
+ sys.exit(1)
322
+
323
+ job = DonkeylabsJob(
324
+ job_id=job_id,
325
+ name=name or "unknown",
326
+ data=data,
327
+ socket_path=socket_path,
328
+ heartbeat_interval=heartbeat_interval,
329
+ )
330
+
331
+ try:
332
+ job.connect()
333
+
334
+ # Run the handler
335
+ result = handler(job)
336
+
337
+ # Send completion
338
+ job.complete(result)
339
+ except Exception as e:
340
+ import traceback
341
+
342
+ job.fail(str(e), traceback.format_exc())
343
+ sys.exit(1)
344
+ finally:
345
+ job.disconnect()
346
+
347
+
348
+ # Example job handler
349
+ def example_handler(job: DonkeylabsJob) -> Dict[str, Any]:
350
+ """Example job handler that processes data in steps."""
351
+ job.info(f"Starting job with data: {job.data}")
352
+
353
+ total_steps = job.data.get("steps", 5)
354
+
355
+ for i in range(total_steps):
356
+ progress = (i / total_steps) * 100
357
+ job.progress(progress, f"Processing step {i + 1} of {total_steps}")
358
+ time.sleep(0.5) # Simulate work
359
+
360
+ job.progress(100, "Complete!")
361
+ return {"processed": True, "steps": total_steps}
362
+
363
+
364
+ if __name__ == "__main__":
365
+ # If run directly, use the example handler
366
+ run_job(example_handler)