@donkeylabs/server 2.0.28 → 2.0.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/docs/jobs.md CHANGED
@@ -192,6 +192,26 @@ const cancelled = await ctx.core.jobs.cancel(jobId);
 
  ---
 
+ ## External Jobs (Subprocess)
+
+ External jobs run in a separate process and are monitored by a watchdog.
+
+ ```ts
+ ctx.core.jobs.registerExternal("batchWorker", {
+   command: "bun",
+   args: ["./workers/batch-worker.ts"],
+   heartbeatTimeout: 30000,
+   timeout: 10 * 60 * 1000,
+   killGraceMs: 5000,
+ });
+ ```
+
+ Watchdog events:
+ - `job.watchdog.stale`
+ - `job.watchdog.killed`
+
+ ---
+
  ## Event Integration
 
  Jobs automatically emit events on completion and failure:
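
> Editor's note: `killGraceMs` configures the usual two-phase termination described in this release: the watchdog first sends SIGTERM, waits for the grace period, then SIGKILLs anything still alive. The sketch below is a standalone illustration of that sequence using plain Bun/Node `process.kill`; it is not the package's internal helper.

```ts
// Minimal sketch of the killGraceMs semantics (illustrative, not library code).
// process.kill(pid, 0) throws if the process no longer exists, so it doubles
// as a liveness probe before escalating to SIGKILL.
async function terminateWithGrace(pid: number, graceMs: number): Promise<void> {
  try {
    process.kill(pid, "SIGTERM"); // ask the worker to shut down cleanly
  } catch {
    return; // already gone
  }

  await new Promise((resolve) => setTimeout(resolve, graceMs));

  try {
    process.kill(pid, 0);         // still alive after the grace period?
    process.kill(pid, "SIGKILL"); // escalate
  } catch {
    // exited during the grace period
  }
}
```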
package/docs/processes.md CHANGED
@@ -8,6 +8,7 @@ Processes provide:
  - Long-running daemon management (start, stop, restart)
  - Typed event communication from process to server
  - Automatic heartbeat monitoring
+ - Watchdog termination for unresponsive processes
  - Connection resilience with auto-reconnection
  - Metadata passing to spawned processes
  - Cross-platform support (Unix sockets / TCP on Windows)
@@ -89,6 +90,13 @@ server.getCore().processes.define("video-encoder", {
 
    // Heartbeat configuration
    heartbeatTimeout: 30000, // 30 seconds
+
+   // Optional hard limits (requires stats for memory/CPU)
+   limits: {
+     maxRuntimeMs: 60_000,
+     maxMemoryMb: 512,
+     maxCpuPercent: 90,
+   },
  });
  ```
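
> Editor's note: the memory and CPU limits above are only checked when the child process reports stats, so the worker's `ProcessClient` needs stats enabled, and the reporting `interval` bounds how quickly a breach is detected. The sketch below combines the two calls shown in these docs; option names come from this diff, everything else (the process name, omitted required options) is illustrative.

```ts
// Server side: define the process with hard limits (other define options omitted).
server.getCore().processes.define("video-encoder", {
  heartbeatTimeout: 30000,
  limits: {
    maxRuntimeMs: 60_000, // enforced by a server-side timer
    maxMemoryMb: 512,     // checked against reported RSS
    maxCpuPercent: 90,    // checked against reported CPU usage
  },
});

// Worker side: stats must be reported for the memory/CPU checks to run.
// A 5s interval means a breach is noticed within roughly one report cycle.
const client = await ProcessClient.connect({
  stats: { enabled: true, interval: 5000 },
});
```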
 
@@ -221,6 +229,25 @@ const client = await ProcessClient.connect({
  });
  ```
 
+ ---
+
+ ## Hard Limits
+
+ Processes can be terminated automatically when limits are exceeded:
+
+ - `maxRuntimeMs` always enforced by the server watchdog
+ - `maxMemoryMb` and `maxCpuPercent` require `ProcessClient` stats enabled
+
+ ```ts
+ const client = await ProcessClient.connect({
+   stats: { enabled: true, interval: 5000 },
+ });
+ ```
+
+ Watchdog events:
+ - `process.watchdog.stale`
+ - `process.watchdog.killed`
+
  ### Properties
 
  ```typescript
package/docs/workflows.md CHANGED
@@ -309,6 +309,30 @@ workflow("batch.status")
 
  Each poll cycle emits `workflow.step.poll` events and persists progress to the instance.
 
+ ---
+
+ ## Watchdog and Subprocess Settings
+
+ You can tune subprocess termination and SQLite pragmas used by isolated workflows:
+
+ ```ts
+ const server = new AppServer({
+   db,
+   workflows: {
+     killGraceMs: 5000,
+     sqlitePragmas: {
+       busyTimeout: 5000,
+       journalMode: "WAL",
+       synchronous: "NORMAL",
+     },
+   },
+ });
+ ```
+
+ Watchdog events:
+ - `workflow.watchdog.stale` (heartbeat missed)
+ - `workflow.watchdog.killed` (process terminated)
+
  ### Loop
 
  Use a loop step to jump back to a previous step until a condition is false.
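
> Editor's note: the `workflows.sqlitePragmas` block shown in the hunk above is forwarded to each isolated workflow subprocess, which applies it to its own `bun:sqlite` connection as plain PRAGMA statements (see the `bootstrapSubprocess` change further down in this diff). A standalone sketch of the equivalent setup; the database path is illustrative.

```ts
import { Database } from "bun:sqlite";

// Equivalent of the pragmas configured above, applied by hand.
// busy_timeout makes concurrent writers wait instead of failing immediately,
// WAL lets readers run alongside a single writer, and synchronous=NORMAL
// relaxes fsync frequency for better write throughput.
const sqlite = new Database("./app.db");
sqlite.run("PRAGMA busy_timeout = 5000");
sqlite.run("PRAGMA journal_mode = WAL");
sqlite.run("PRAGMA synchronous = NORMAL");
```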
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@donkeylabs/server",
-   "version": "2.0.28",
+   "version": "2.0.29",
    "type": "module",
    "description": "Type-safe plugin system for building RPC-style APIs with Bun",
    "main": "./src/index.ts",
@@ -201,6 +201,23 @@ export class ExternalJobSocketServerImpl implements ExternalJobSocketServer {
 
  let buffer = "";
 
+ const queue: AnyExternalJobMessage[] = [];
+ let processing = false;
+
+ const processQueue = async () => {
+   if (processing) return;
+   processing = true;
+   while (queue.length > 0) {
+     const message = queue.shift()!;
+     try {
+       await this.onMessage(message);
+     } catch (err) {
+       this.onError?.(err instanceof Error ? err : new Error(String(err)), jobId);
+     }
+   }
+   processing = false;
+ };
+
  socket.on("data", (data) => {
    buffer += data.toString();
 
@@ -213,11 +230,13 @@ export class ExternalJobSocketServerImpl implements ExternalJobSocketServer {
 
      const message = parseJobMessage(line);
      if (message) {
-       this.onMessage(message);
+       queue.push(message);
      } else {
        this.onError?.(new Error(`Invalid message: ${line}`), jobId);
      }
    }
+
+   processQueue().catch(() => undefined);
  });
 
  socket.on("error", (err) => {
@@ -80,6 +80,8 @@ export interface ExternalJobConfig {
    heartbeatTimeout?: number;
    /** Job timeout in milliseconds (optional) */
    timeout?: number;
+   /** Grace period before SIGKILL when terminating (ms, default: 5000) */
+   killGraceMs?: number;
  }
 
  // ============================================
@@ -120,6 +122,8 @@ export interface ExternalJobsConfig {
    defaultHeartbeatTimeout?: number;
    /** Heartbeat check interval in ms (default: 10000) */
    heartbeatCheckInterval?: number;
+   /** Default grace period before SIGKILL when terminating (ms, default: 5000) */
+   killGraceMs?: number;
  }
 
  // ============================================
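
> Editor's note: as the jobs.ts changes further down show, the effective grace period for an external job is resolved per job first, then from the `ExternalJobsConfig` default, then from a hard-coded 5000 ms. A tiny sketch of that precedence; the function name is illustrative.

```ts
// Per-job override → registry-wide default → built-in 5000 ms fallback,
// mirroring `config?.killGraceMs ?? this.externalConfig.killGraceMs ?? 5000`.
function resolveKillGraceMs(
  jobConfig: { killGraceMs?: number },
  externalConfig: { killGraceMs?: number }
): number {
  return jobConfig.killGraceMs ?? externalConfig.killGraceMs ?? 5000;
}
```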
package/src/core/index.ts CHANGED
@@ -134,6 +134,7 @@ export {
  export {
    type Workflows,
    type WorkflowsConfig,
+   type SqlitePragmaConfig,
    type WorkflowRegisterOptions,
    type WorkflowDefinition,
    type WorkflowInstance,
package/src/core/jobs.ts CHANGED
@@ -273,7 +273,10 @@ class JobsImpl implements Jobs {
  private externalConfigs = new Map<string, ExternalJobConfig>();
  private externalConfig: ExternalJobsConfig;
  private socketServer: ExternalJobSocketServer | null = null;
- private externalProcesses = new Map<string, { pid: number; timeout?: ReturnType<typeof setTimeout> }>();
+ private externalProcesses = new Map<
+   string,
+   { pid: number; timeout?: ReturnType<typeof setTimeout>; killTimer?: ReturnType<typeof setTimeout> }
+ >();
 
  constructor(config: JobsConfig = {}) {
    this.events = config.events;
@@ -521,6 +524,7 @@ class JobsImpl implements Jobs {
 
  const config = this.externalConfigs.get(job.name);
  const heartbeatTimeout = config?.heartbeatTimeout ?? this.externalConfig.defaultHeartbeatTimeout ?? 30000;
+ const killGraceMs = config?.killGraceMs ?? this.externalConfig.killGraceMs ?? 5000;
  const timeSinceHeartbeat = now - job.lastHeartbeat.getTime();
 
  if (timeSinceHeartbeat > heartbeatTimeout) {
@@ -533,36 +537,22 @@ class JobsImpl implements Jobs {
          name: job.name,
          timeSinceHeartbeat,
        });
-     }
-
-     // If stale for 2x timeout, kill the process
-     if (timeSinceHeartbeat > heartbeatTimeout * 2) {
-       console.error(`[Jobs] Killing stale external job ${job.id}`);
-
-       if (job.pid) {
-         try {
-           process.kill(job.pid, "SIGKILL");
-         } catch {
-           // Process may already be dead
-         }
-       }
-
-       await this.adapter.update(job.id, {
-         status: "failed",
-         error: "Heartbeat timeout - job process unresponsive",
-         completedAt: new Date(),
-         processState: "orphaned",
+       await this.events.emit("job.watchdog.stale", {
+         jobId: job.id,
+         name: job.name,
+         timeSinceHeartbeat,
        });
+     }
 
-       await this.cleanupExternalJob(job.id);
-
-       if (this.events) {
-         await this.events.emit("job.failed", {
-           jobId: job.id,
-           name: job.name,
-           error: "Heartbeat timeout",
-         });
-       }
+     const procInfo = this.externalProcesses.get(job.id);
+     if (job.pid && !procInfo?.killTimer) {
+       console.error(`[Jobs] Terminating stale external job ${job.id}`);
+       await this.terminateExternalProcess(
+         job.id,
+         job.pid,
+         killGraceMs,
+         "Heartbeat timeout - job process unresponsive"
+       );
      }
    }
  }
@@ -764,6 +754,9 @@ class JobsImpl implements Jobs {
  if (procInfo?.timeout) {
    clearTimeout(procInfo.timeout);
  }
+ if (procInfo?.killTimer) {
+   clearTimeout(procInfo.killTimer);
+ }
  this.externalProcesses.delete(jobId);
 
  // Close the socket
@@ -901,27 +894,13 @@ class JobsImpl implements Jobs {
  if (config.timeout) {
    const timeout = setTimeout(async () => {
      console.warn(`[Jobs] External job ${job.id} timed out after ${config.timeout}ms`);
-     try {
-       process.kill(proc.pid, "SIGTERM");
-     } catch {
-       // Process may already be dead
-     }
-
-     await this.adapter.update(job.id, {
-       status: "failed",
-       error: `Job timed out after ${config.timeout}ms`,
-       completedAt: new Date(),
-     });
-
-     await this.cleanupExternalJob(job.id);
-
-     if (this.events) {
-       await this.events.emit("job.failed", {
-         jobId: job.id,
-         name: job.name,
-         error: "Timeout",
-       });
-     }
+     const killGraceMs = config.killGraceMs ?? this.externalConfig.killGraceMs ?? 5000;
+     await this.terminateExternalProcess(
+       job.id,
+       proc.pid,
+       killGraceMs,
+       `Job timed out after ${config.timeout}ms`
+     );
    }, config.timeout);
 
    const procInfo = this.externalProcesses.get(job.id);
@@ -998,6 +977,73 @@ class JobsImpl implements Jobs {
    }
  }
 
+ private async terminateExternalProcess(
+   jobId: string,
+   pid: number,
+   killGraceMs: number,
+   error: string
+ ): Promise<void> {
+   try {
+     process.kill(pid, "SIGTERM");
+   } catch {
+     return;
+   }
+
+   if (killGraceMs <= 0) {
+     try {
+       process.kill(pid, "SIGKILL");
+     } catch {
+       // ignore
+     }
+     await this.handleExternalFailure(jobId, error);
+     return;
+   }
+
+   const timer = setTimeout(async () => {
+     try {
+       process.kill(pid, 0);
+       process.kill(pid, "SIGKILL");
+     } catch {
+       // ignore
+     }
+
+     await this.handleExternalFailure(jobId, error);
+   }, killGraceMs);
+
+   const procInfo = this.externalProcesses.get(jobId);
+   if (procInfo) {
+     procInfo.killTimer = timer;
+   }
+ }
+
+ private async handleExternalFailure(jobId: string, error: string): Promise<void> {
+   await this.adapter.update(jobId, {
+     status: "failed",
+     error,
+     completedAt: new Date(),
+     processState: "orphaned",
+   });
+
+   const job = await this.adapter.get(jobId);
+   if (this.events && job) {
+     await this.events.emit("job.watchdog.killed", {
+       jobId,
+       name: job.name,
+       reason: error,
+     });
+   }
+
+   await this.cleanupExternalJob(jobId);
+
+   if (this.events && job) {
+     await this.events.emit("job.failed", {
+       jobId,
+       name: job.name,
+       error,
+     });
+   }
+ }
+
  private streamProcessOutput(
    jobId: string,
    jobName: string,
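
> Editor's note: the two job-level watchdog events introduced in this file carry the following payloads, read off the `emit()` calls in the hunks above. The interface names are illustrative; the field names are the ones actually emitted.

```ts
interface JobWatchdogStaleEvent {
  jobId: string;
  name: string;
  timeSinceHeartbeat: number; // ms since the last heartbeat
}

interface JobWatchdogKilledEvent {
  jobId: string;
  name: string;
  reason: string; // e.g. "Heartbeat timeout - job process unresponsive"
}
```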
@@ -207,6 +207,23 @@ export class ProcessSocketServerImpl implements ProcessSocketServer {
 
  let buffer = "";
 
+ const queue: ProcessMessage[] = [];
+ let processing = false;
+
+ const processQueue = async () => {
+   if (processing) return;
+   processing = true;
+   while (queue.length > 0) {
+     const message = queue.shift()!;
+     try {
+       await this.onMessage(message);
+     } catch (err) {
+       this.onError?.(err instanceof Error ? err : new Error(String(err)), processId);
+     }
+   }
+   processing = false;
+ };
+
  socket.on("data", (data) => {
    buffer += data.toString();
 
@@ -219,11 +236,13 @@ export class ProcessSocketServerImpl implements ProcessSocketServer {
 
      const message = this.parseMessage(line);
      if (message) {
-       this.onMessage(message);
+       queue.push(message);
      } else {
        this.onError?.(new Error(`Invalid message: ${line}`), processId);
      }
    }
+
+   processQueue().catch(() => undefined);
  });
 
  socket.on("error", (err) => {
@@ -61,6 +61,15 @@ export interface ProcessConfig {
      /** Timeout before considering unhealthy in ms (default: 60000) */
      timeoutMs?: number;
    };
+   /** Hard limits for the process (optional) */
+   limits?: {
+     /** Max runtime in ms before termination */
+     maxRuntimeMs?: number;
+     /** Max memory (RSS) in MB before termination (requires stats enabled) */
+     maxMemoryMb?: number;
+     /** Max CPU percent before termination (requires stats enabled) */
+     maxCpuPercent?: number;
+   };
  }
 
  export interface ManagedProcess {
@@ -171,6 +180,8 @@ export interface ProcessesConfig {
    heartbeatCheckInterval?: number;
    /** Enable auto-reconnect to orphaned processes on startup (default: true) */
    autoRecoverOrphans?: boolean;
+   /** Grace period before SIGKILL when stopping/killing (ms, default: 5000) */
+   killGraceMs?: number;
  }
 
  // ============================================
@@ -251,6 +262,8 @@ export class ProcessesImpl implements Processes {
    private events?: Events;
    private heartbeatCheckInterval: number;
    private autoRecoverOrphans: boolean;
+   private killGraceMs: number;
+   private runtimeLimitTimers = new Map<string, ReturnType<typeof setTimeout>>();
 
    // Track running Bun subprocesses
    private subprocesses = new Map<string, Subprocess>();
@@ -266,6 +279,7 @@ export class ProcessesImpl implements Processes {
    this.events = config.events;
    this.heartbeatCheckInterval = config.heartbeatCheckInterval ?? 10000;
    this.autoRecoverOrphans = config.autoRecoverOrphans ?? true;
+   this.killGraceMs = config.killGraceMs ?? 5000;
 
    // Create socket server with callbacks
    this.socketServer = createProcessSocketServer(config.socket ?? {}, {
@@ -361,6 +375,21 @@ export class ProcessesImpl implements Processes {
    // Set up exit handler for crash detection
    proc.exited.then((exitCode) => this.handleExit(process.id, exitCode));
 
+   const maxRuntimeMs = config.limits?.maxRuntimeMs;
+   if (maxRuntimeMs && maxRuntimeMs > 0) {
+     const timer = setTimeout(async () => {
+       console.warn(`[Processes] Max runtime exceeded for ${name} (${process.id})`);
+       await this.emitEvent("process.limits_exceeded", {
+         processId: process.id,
+         name,
+         reason: "maxRuntimeMs",
+         limit: maxRuntimeMs,
+       });
+       await this.stop(process.id);
+     }, maxRuntimeMs);
+     this.runtimeLimitTimers.set(process.id, timer);
+   }
+
    console.log(`[Processes] Spawned ${name} (${process.id}) with PID ${proc.pid}`);
    return process.id;
  } catch (error) {
@@ -395,7 +424,7 @@ export class ProcessesImpl implements Processes {
  // Wait for process to exit (with timeout)
  const exitPromise = subprocess.exited;
  const timeoutPromise = new Promise<null>((resolve) =>
-   setTimeout(() => resolve(null), 5000)
+   setTimeout(() => resolve(null), this.killGraceMs)
  );
 
  const result = await Promise.race([exitPromise, timeoutPromise]);
@@ -412,6 +441,11 @@ export class ProcessesImpl implements Processes {
  // Cleanup
  await this.socketServer.closeSocket(processId);
  this.subprocesses.delete(processId);
+ const runtimeTimer = this.runtimeLimitTimers.get(processId);
+ if (runtimeTimer) {
+   clearTimeout(runtimeTimer);
+   this.runtimeLimitTimers.delete(processId);
+ }
 
  await this.adapter.update(processId, {
    status: "stopped",
@@ -443,6 +477,11 @@ export class ProcessesImpl implements Processes {
  // Cleanup
  await this.socketServer.closeSocket(processId);
  this.subprocesses.delete(processId);
+ const runtimeTimer = this.runtimeLimitTimers.get(processId);
+ if (runtimeTimer) {
+   clearTimeout(runtimeTimer);
+   this.runtimeLimitTimers.delete(processId);
+ }
 
  await this.adapter.update(processId, {
    status: "stopped",
@@ -590,6 +629,47 @@ export class ProcessesImpl implements Processes {
      await definition.onStats(proc, stats);
    }
 
+   const limits = proc.config.limits;
+   if (limits) {
+     if (limits.maxMemoryMb && stats.memory.rss / 1e6 > limits.maxMemoryMb) {
+       console.warn(`[Processes] Memory limit exceeded for ${proc.name} (${proc.id})`);
+       await this.emitEvent("process.limits_exceeded", {
+         processId,
+         name: proc.name,
+         reason: "maxMemoryMb",
+         limit: limits.maxMemoryMb,
+         value: stats.memory.rss / 1e6,
+       });
+       await this.emitEvent("process.watchdog.killed", {
+         processId,
+         name: proc.name,
+         reason: "maxMemoryMb",
+         value: stats.memory.rss / 1e6,
+       });
+       await this.stop(proc.id);
+       return;
+     }
+
+     if (limits.maxCpuPercent && stats.cpu.percent > limits.maxCpuPercent) {
+       console.warn(`[Processes] CPU limit exceeded for ${proc.name} (${proc.id})`);
+       await this.emitEvent("process.limits_exceeded", {
+         processId,
+         name: proc.name,
+         reason: "maxCpuPercent",
+         limit: limits.maxCpuPercent,
+         value: stats.cpu.percent,
+       });
+       await this.emitEvent("process.watchdog.killed", {
+         processId,
+         name: proc.name,
+         reason: "maxCpuPercent",
+         value: stats.cpu.percent,
+       });
+       await this.stop(proc.id);
+       return;
+     }
+   }
+
    return;
  }
 
@@ -835,16 +915,27 @@ export class ProcessesImpl implements Processes {
      processId: proc.id,
      name: proc.name,
    });
+   await this.emitEvent("process.watchdog.stale", {
+     processId: proc.id,
+     name: proc.name,
+     reason: "heartbeat",
+     timeoutMs,
+   });
 
    const definition = this.definitions.get(proc.name);
    if (definition?.onUnhealthy) {
      await definition.onUnhealthy(proc);
    }
 
-   // If heartbeat is way overdue (2x timeout), kill and restart
+   // If heartbeat is way overdue (2x timeout), stop and restart
    if (now - lastHeartbeat > timeoutMs * 2) {
-     console.warn(`[Processes] Killing unresponsive process ${proc.name} (${proc.id})`);
-     await this.kill(proc.id);
+     console.warn(`[Processes] Stopping unresponsive process ${proc.name} (${proc.id})`);
+     await this.stop(proc.id);
+     await this.emitEvent("process.watchdog.killed", {
+       processId: proc.id,
+       name: proc.name,
+       reason: "heartbeat",
+     });
      // handleExit will trigger auto-restart if configured
    }
  }
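
> Editor's note: the process-level events added above carry slightly different payloads depending on what triggered them. The shapes below are reconstructed from the `emitEvent` calls in these hunks; interface names are illustrative, field names are as emitted.

```ts
interface ProcessWatchdogStaleEvent {
  processId: string;
  name: string;
  reason: "heartbeat";
  timeoutMs: number;
}

interface ProcessLimitsExceededEvent {
  processId: string;
  name: string;
  reason: "maxRuntimeMs" | "maxMemoryMb" | "maxCpuPercent";
  limit: number;
  value?: number; // reported RSS in MB or CPU percent; absent for maxRuntimeMs
}

interface ProcessWatchdogKilledEvent {
  processId: string;
  name: string;
  reason: "heartbeat" | "maxMemoryMb" | "maxCpuPercent";
  value?: number; // absent when the trigger was a missed heartbeat
}
```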
@@ -32,6 +32,11 @@ export interface SubprocessPluginMetadata {
  export interface SubprocessBootstrapOptions {
    dbPath: string;
    coreConfig?: Record<string, any>;
+   sqlitePragmas?: {
+     busyTimeout?: number;
+     synchronous?: "OFF" | "NORMAL" | "FULL" | "EXTRA";
+     journalMode?: "DELETE" | "TRUNCATE" | "PERSIST" | "MEMORY" | "WAL" | "OFF";
+   };
    pluginMetadata: SubprocessPluginMetadata;
    startServices?: {
      cron?: boolean;
@@ -53,7 +58,15 @@ export async function bootstrapSubprocess(
    options: SubprocessBootstrapOptions
  ): Promise<SubprocessBootstrapResult> {
    const sqlite = new Database(options.dbPath);
-   sqlite.run("PRAGMA busy_timeout = 5000");
+   const pragmas = options.sqlitePragmas;
+   const busyTimeout = pragmas?.busyTimeout ?? 5000;
+   sqlite.run(`PRAGMA busy_timeout = ${busyTimeout}`);
+   if (pragmas?.journalMode) {
+     sqlite.run(`PRAGMA journal_mode = ${pragmas.journalMode}`);
+   }
+   if (pragmas?.synchronous) {
+     sqlite.run(`PRAGMA synchronous = ${pragmas.synchronous}`);
+   }
 
    const db = new Kysely<any>({
      dialect: new BunSqliteDialect({ database: sqlite }),
@@ -25,6 +25,11 @@ interface ExecutorConfig {
    pluginModulePaths: Record<string, string>;
    pluginConfigs: Record<string, any>;
    coreConfig?: Record<string, any>;
+   sqlitePragmas?: {
+     busyTimeout?: number;
+     synchronous?: "OFF" | "NORMAL" | "FULL" | "EXTRA";
+     journalMode?: "DELETE" | "TRUNCATE" | "PERSIST" | "MEMORY" | "WAL" | "OFF";
+   };
  }
 
  // ============================================
@@ -47,6 +52,7 @@ async function main(): Promise<void> {
    pluginModulePaths,
    pluginConfigs,
    coreConfig,
+   sqlitePragmas,
  } = config;
 
  const socket = await connectToSocket(socketPath, tcpPort);
@@ -71,6 +77,7 @@ async function main(): Promise<void> {
  const bootstrap = await bootstrapSubprocess({
    dbPath,
    coreConfig,
+   sqlitePragmas,
    pluginMetadata: {
      names: pluginNames,
      modulePaths: pluginModulePaths,
@@ -248,7 +248,24 @@ export class WorkflowSocketServerImpl implements WorkflowSocketServer {
 
  let buffer = "";
 
- socket.on("data", async (data) => {
+ const queue: WorkflowMessage[] = [];
+ let processing = false;
+
+ const processQueue = async () => {
+   if (processing) return;
+   processing = true;
+   while (queue.length > 0) {
+     const message = queue.shift()!;
+     try {
+       await this.handleMessage(instanceId, message);
+     } catch (err) {
+       this.onError?.(err instanceof Error ? err : new Error(String(err)), instanceId);
+     }
+   }
+   processing = false;
+ };
+
+ socket.on("data", (data) => {
    buffer += data.toString();
 
    // Process complete messages (newline-delimited JSON)
@@ -260,11 +277,13 @@ export class WorkflowSocketServerImpl implements WorkflowSocketServer {
 
      try {
        const message = JSON.parse(line) as WorkflowMessage;
-       await this.handleMessage(instanceId, message);
+       queue.push(message);
      } catch (err) {
        this.onError?.(new Error(`Invalid message: ${line}`), instanceId);
      }
    }
+
+   processQueue().catch(() => undefined);
  });
 
  socket.on("error", (err) => {
@@ -760,12 +760,22 @@ export interface WorkflowsConfig {
    heartbeatTimeout?: number;
    /** Timeout waiting for isolated subprocess readiness (ms, default: 10000) */
    readyTimeout?: number;
+   /** Grace period before SIGKILL when terminating isolated subprocesses (ms, default: 5000) */
+   killGraceMs?: number;
+   /** SQLite pragmas for isolated subprocess connections */
+   sqlitePragmas?: SqlitePragmaConfig;
    /** Resume strategy for orphaned workflows (default: "blocking") */
    resumeStrategy?: WorkflowResumeStrategy;
  }
 
  export type WorkflowResumeStrategy = "blocking" | "background" | "skip";
 
+ export interface SqlitePragmaConfig {
+   busyTimeout?: number;
+   synchronous?: "OFF" | "NORMAL" | "FULL" | "EXTRA";
+   journalMode?: "DELETE" | "TRUNCATE" | "PERSIST" | "MEMORY" | "WAL" | "OFF";
+ }
+
  /** Options for registering a workflow */
  export interface WorkflowRegisterOptions {
    /**
@@ -854,6 +864,8 @@ class WorkflowsImpl implements Workflows {
    private dbPath?: string;
    private heartbeatTimeoutMs: number;
    private readyTimeoutMs: number;
+   private killGraceMs: number;
+   private sqlitePragmas?: SqlitePragmaConfig;
    private resumeStrategy!: WorkflowResumeStrategy;
    private workflowModulePaths = new Map<string, string>();
    private isolatedProcesses = new Map<string, IsolatedProcessInfo>();
@@ -888,6 +900,8 @@ class WorkflowsImpl implements Workflows {
    this.dbPath = config.dbPath;
    this.heartbeatTimeoutMs = config.heartbeatTimeout ?? 60000;
    this.readyTimeoutMs = config.readyTimeout ?? 10000;
+   this.killGraceMs = config.killGraceMs ?? 5000;
+   this.sqlitePragmas = config.sqlitePragmas;
    this.resumeStrategy = config.resumeStrategy ?? "blocking";
  }
 
@@ -1049,11 +1063,7 @@ class WorkflowsImpl implements Workflows {
  // Kill isolated process if running
  const isolatedInfo = this.isolatedProcesses.get(instanceId);
  if (isolatedInfo) {
-   try {
-     process.kill(isolatedInfo.pid, "SIGTERM");
-   } catch {
-     // Process might already be dead
-   }
+   await killProcessWithGrace(isolatedInfo.pid, this.killGraceMs);
    if (isolatedInfo.timeout) clearTimeout(isolatedInfo.timeout);
    if (isolatedInfo.heartbeatTimeout) clearTimeout(isolatedInfo.heartbeatTimeout);
    this.isolatedProcesses.delete(instanceId);
@@ -1470,6 +1480,7 @@ class WorkflowsImpl implements Workflows {
    pluginModulePaths: this.pluginModulePaths,
    pluginConfigs,
    coreConfig,
+   sqlitePragmas: this.sqlitePragmas,
  };
 
  // Spawn the subprocess
@@ -1995,6 +2006,11 @@ class WorkflowsImpl implements Workflows {
    }
 
    console.error(`[Workflows] No heartbeat from isolated workflow ${instanceId} for ${this.heartbeatTimeoutMs}ms`);
+   await this.emitEvent("workflow.watchdog.stale", {
+     instanceId,
+     reason: "heartbeat",
+     timeoutMs: this.heartbeatTimeoutMs,
+   });
    await this.handleIsolatedTimeout(instanceId, pid);
  }, this.heartbeatTimeoutMs);
  }
@@ -2006,12 +2022,12 @@ class WorkflowsImpl implements Workflows {
  const info = this.isolatedProcesses.get(instanceId);
  if (!info) return;
 
- // Kill the process
- try {
-   process.kill(pid, "SIGKILL");
- } catch {
-   // Process might already be dead
- }
+ await killProcessWithGrace(pid, this.killGraceMs);
+ await this.emitEvent("workflow.watchdog.killed", {
+   instanceId,
+   reason: "timeout",
+   timeoutMs: this.heartbeatTimeoutMs,
+ });
 
  // Clean up
  if (info.timeout) clearTimeout(info.timeout);
@@ -2148,3 +2164,29 @@ function isPlainObject(value: Record<string, any>): boolean {
  export function createWorkflows(config?: WorkflowsConfig): Workflows {
    return new WorkflowsImpl(config);
  }
+
+ async function killProcessWithGrace(pid: number, graceMs: number): Promise<void> {
+   try {
+     process.kill(pid, "SIGTERM");
+   } catch {
+     return;
+   }
+
+   if (graceMs <= 0) {
+     try {
+       process.kill(pid, "SIGKILL");
+     } catch {
+       return;
+     }
+     return;
+   }
+
+   await new Promise((resolve) => setTimeout(resolve, graceMs));
+
+   try {
+     process.kill(pid, 0);
+     process.kill(pid, "SIGKILL");
+   } catch {
+     // Process already exited
+   }
+ }
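
> Editor's note: the workflow-level watchdog events emitted above carry the following payloads, reconstructed from the `emitEvent` calls in this file. Interface names are illustrative; fields are as emitted.

```ts
interface WorkflowWatchdogStaleEvent {
  instanceId: string;
  reason: "heartbeat";
  timeoutMs: number;
}

interface WorkflowWatchdogKilledEvent {
  instanceId: string;
  reason: "timeout";
  timeoutMs: number;
}
```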