@donkeylabs/server 2.0.28 → 2.0.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/docs/jobs.md CHANGED
@@ -192,6 +192,33 @@ const cancelled = await ctx.core.jobs.cancel(jobId);

  ---

+ ## External Jobs (Subprocess)
+
+ External jobs run in a separate process and are monitored by a watchdog.
+
+ ```ts
+ ctx.core.jobs.registerExternal("batchWorker", {
+   command: "bun",
+   args: ["./workers/batch-worker.ts"],
+   heartbeatTimeout: 30000,
+   timeout: 10 * 60 * 1000,
+   killGraceMs: 5000,
+ });
+
+ // Disable in-process timers when using the watchdog subprocess
+ const server = new AppServer({
+   db,
+   watchdog: { enabled: true },
+   jobs: { external: { useWatchdog: true } },
+ });
+ ```
+
+ Watchdog events:
+ - `job.watchdog.stale`
+ - `job.watchdog.killed`
+
+ ---
+
  ## Event Integration

  Jobs automatically emit events on completion and failure:
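The `job.watchdog.stale` and `job.watchdog.killed` events added here carry the payloads emitted in `jobs.ts` further down in this diff (`jobId`, `name`, plus `timeSinceHeartbeat` or `reason`). A minimal sketch of consuming them follows; the `events.on(...)` subscription shape is an assumption for illustration, since this diff only shows the emitting side:

```ts
// Sketch only: assumes the core event bus exposes an `on`-style subscription.
// Payload fields mirror the emit() calls added in jobs.ts in this release.
ctx.core.events.on(
  "job.watchdog.stale",
  async (payload: { jobId: string; name: string; timeSinceHeartbeat: number }) => {
    console.warn(`[alerts] ${payload.name} (${payload.jobId}) silent for ${payload.timeSinceHeartbeat}ms`);
  }
);

ctx.core.events.on(
  "job.watchdog.killed",
  async (payload: { jobId: string; name: string; reason: string }) => {
    // The job has already been marked failed; hook alerting or retry bookkeeping here.
    console.error(`[alerts] ${payload.name} (${payload.jobId}) killed by watchdog: ${payload.reason}`);
  }
);
```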
package/docs/processes.md CHANGED
@@ -8,6 +8,7 @@ Processes provide:
  - Long-running daemon management (start, stop, restart)
  - Typed event communication from process to server
  - Automatic heartbeat monitoring
+ - Watchdog termination for unresponsive processes
  - Connection resilience with auto-reconnection
  - Metadata passing to spawned processes
  - Cross-platform support (Unix sockets / TCP on Windows)
@@ -89,6 +90,13 @@ server.getCore().processes.define("video-encoder", {

    // Heartbeat configuration
    heartbeatTimeout: 30000, // 30 seconds
+
+   // Optional hard limits (requires stats for memory/CPU)
+   limits: {
+     maxRuntimeMs: 60_000,
+     maxMemoryMb: 512,
+     maxCpuPercent: 90,
+   },
  });
  ```

@@ -221,6 +229,27 @@ const client = await ProcessClient.connect({
  });
  ```

+ ---
+
+ ## Hard Limits
+
+ Processes can be terminated automatically when limits are exceeded:
+
+ - `maxRuntimeMs` is always enforced by the server watchdog
+ - `maxMemoryMb` and `maxCpuPercent` require `ProcessClient` stats to be enabled
+
+ ```ts
+ const client = await ProcessClient.connect({
+   stats: { enabled: true, interval: 5000 },
+ });
+ ```
+
+ Watchdog events:
+ - `process.watchdog.stale`
+ - `process.watchdog.killed`
+
+ When `watchdog.enabled` is true, heartbeat monitoring runs in the watchdog subprocess.
+
  ### Properties

  ```typescript
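The hard-limit feature has two halves shown separately above: limits are declared in `processes.define(...)`, while memory/CPU enforcement needs the process itself to report stats via `ProcessClient`. A combined sketch, using only the option shapes from the snippets in this doc (other definition options are omitted):

```ts
// Server side: declare limits on the definition.
server.getCore().processes.define("video-encoder", {
  // ...other definition options omitted
  heartbeatTimeout: 30000,
  limits: {
    maxRuntimeMs: 60_000, // enforced by the server watchdog, no stats required
    maxMemoryMb: 512,     // enforced only when the client reports stats
    maxCpuPercent: 90,    // enforced only when the client reports stats
  },
});

// Inside the spawned process: enable stats reporting so memory/CPU limits apply.
const client = await ProcessClient.connect({
  stats: { enabled: true, interval: 5000 },
});
```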
package/docs/workflows.md CHANGED
@@ -309,6 +309,38 @@ workflow("batch.status")

  Each poll cycle emits `workflow.step.poll` events and persists progress to the instance.

+ ---
+
+ ## Watchdog and Subprocess Settings
+
+ You can tune subprocess termination and SQLite pragmas used by isolated workflows:
+
+ ```ts
+ const server = new AppServer({
+   db,
+   watchdog: {
+     enabled: true,
+     intervalMs: 5000,
+     services: ["workflows", "jobs", "processes"],
+     killGraceMs: 5000,
+   },
+   workflows: {
+     killGraceMs: 5000,
+     sqlitePragmas: {
+       busyTimeout: 5000,
+       journalMode: "WAL",
+       synchronous: "NORMAL",
+     },
+   },
+ });
+ ```
+
+ When `watchdog.enabled` is true, workflow heartbeat timers run in the watchdog subprocess instead of the main server.
+
+ Watchdog events:
+ - `workflow.watchdog.stale` (heartbeat missed)
+ - `workflow.watchdog.killed` (process terminated)
+
  ### Loop

  Use a loop step to jump back to a previous step until a condition is false.
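The `sqlitePragmas` options map directly onto PRAGMA statements executed by the workflow subprocess bootstrap (see the `bootstrapSubprocess` change later in this diff). The example above is roughly equivalent to the subprocess running:

```ts
import { Database } from "bun:sqlite";

// What the isolated workflow subprocess does with the pragmas above
// (the database path is illustrative; the real one comes from dbPath).
const sqlite = new Database("./data/app.db");
sqlite.run("PRAGMA busy_timeout = 5000");
sqlite.run("PRAGMA journal_mode = WAL");
sqlite.run("PRAGMA synchronous = NORMAL");
```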
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@donkeylabs/server",
-   "version": "2.0.28",
+   "version": "2.0.30",
    "type": "module",
    "description": "Type-safe plugin system for building RPC-style APIs with Bun",
    "main": "./src/index.ts",
@@ -201,6 +201,23 @@ export class ExternalJobSocketServerImpl implements ExternalJobSocketServer {

      let buffer = "";

+     const queue: AnyExternalJobMessage[] = [];
+     let processing = false;
+
+     const processQueue = async () => {
+       if (processing) return;
+       processing = true;
+       while (queue.length > 0) {
+         const message = queue.shift()!;
+         try {
+           await this.onMessage(message);
+         } catch (err) {
+           this.onError?.(err instanceof Error ? err : new Error(String(err)), jobId);
+         }
+       }
+       processing = false;
+     };
+
      socket.on("data", (data) => {
        buffer += data.toString();

@@ -213,11 +230,13 @@ export class ExternalJobSocketServerImpl implements ExternalJobSocketServer {

          const message = parseJobMessage(line);
          if (message) {
-           this.onMessage(message);
+           queue.push(message);
          } else {
            this.onError?.(new Error(`Invalid message: ${line}`), jobId);
          }
        }
+
+       processQueue().catch(() => undefined);
      });

      socket.on("error", (err) => {
@@ -80,6 +80,8 @@ export interface ExternalJobConfig {
    heartbeatTimeout?: number;
    /** Job timeout in milliseconds (optional) */
    timeout?: number;
+   /** Grace period before SIGKILL when terminating (ms, default: 5000) */
+   killGraceMs?: number;
  }

  // ============================================
@@ -120,6 +122,10 @@ export interface ExternalJobsConfig {
    defaultHeartbeatTimeout?: number;
    /** Heartbeat check interval in ms (default: 10000) */
    heartbeatCheckInterval?: number;
+   /** Default grace period before SIGKILL when terminating (ms, default: 5000) */
+   killGraceMs?: number;
+   /** Disable in-process watchdog timers (use external watchdog instead) */
+   useWatchdog?: boolean;
  }

  // ============================================
package/src/core/index.ts CHANGED
@@ -134,6 +134,7 @@ export {
  export {
    type Workflows,
    type WorkflowsConfig,
+   type SqlitePragmaConfig,
    type WorkflowRegisterOptions,
    type WorkflowDefinition,
    type WorkflowInstance,
package/src/core/jobs.ts CHANGED
@@ -141,6 +141,8 @@ export interface Jobs {
    getRunningExternal(): Promise<Job[]>;
    /** Get all jobs with optional filtering (for admin dashboard) */
    getAll(options?: GetAllJobsOptions): Promise<Job[]>;
+   /** Get external job config snapshot for watchdog */
+   getExternalJobConfigs(): Record<string, ExternalJobConfig>;
    /** Start the job processing loop */
    start(): void;
    /** Stop the job processing and cleanup */
@@ -273,7 +275,10 @@ class JobsImpl implements Jobs {
    private externalConfigs = new Map<string, ExternalJobConfig>();
    private externalConfig: ExternalJobsConfig;
    private socketServer: ExternalJobSocketServer | null = null;
-   private externalProcesses = new Map<string, { pid: number; timeout?: ReturnType<typeof setTimeout> }>();
+   private externalProcesses = new Map<
+     string,
+     { pid: number; timeout?: ReturnType<typeof setTimeout>; killTimer?: ReturnType<typeof setTimeout> }
+   >();

    constructor(config: JobsConfig = {}) {
      this.events = config.events;
@@ -322,6 +327,14 @@ class JobsImpl implements Jobs {
      this.externalConfigs.set(name, config);
    }

+   getExternalJobConfigs(): Record<string, ExternalJobConfig> {
+     const snapshot: Record<string, ExternalJobConfig> = {};
+     for (const [name, config] of this.externalConfigs.entries()) {
+       snapshot[name] = { ...config };
+     }
+     return snapshot;
+   }
+
    private isExternalJob(name: string): boolean {
      return this.externalConfigs.has(name);
    }
@@ -419,7 +432,9 @@ class JobsImpl implements Jobs {
      // Initialize socket server for external jobs
      if (this.externalConfigs.size > 0) {
        this.initializeSocketServer();
-       this.startHeartbeatMonitor();
+       if (!this.externalConfig.useWatchdog) {
+         this.startHeartbeatMonitor();
+       }
        // Attempt to reconnect to orphaned jobs from previous run
        this.reconnectOrphanedJobs();
      }
@@ -521,6 +536,7 @@ class JobsImpl implements Jobs {

        const config = this.externalConfigs.get(job.name);
        const heartbeatTimeout = config?.heartbeatTimeout ?? this.externalConfig.defaultHeartbeatTimeout ?? 30000;
+       const killGraceMs = config?.killGraceMs ?? this.externalConfig.killGraceMs ?? 5000;
        const timeSinceHeartbeat = now - job.lastHeartbeat.getTime();

        if (timeSinceHeartbeat > heartbeatTimeout) {
@@ -533,36 +549,22 @@ class JobsImpl implements Jobs {
            name: job.name,
            timeSinceHeartbeat,
          });
-       }
-
-       // If stale for 2x timeout, kill the process
-       if (timeSinceHeartbeat > heartbeatTimeout * 2) {
-         console.error(`[Jobs] Killing stale external job ${job.id}`);
-
-         if (job.pid) {
-           try {
-             process.kill(job.pid, "SIGKILL");
-           } catch {
-             // Process may already be dead
-           }
-         }
-
-         await this.adapter.update(job.id, {
-           status: "failed",
-           error: "Heartbeat timeout - job process unresponsive",
-           completedAt: new Date(),
-           processState: "orphaned",
+         await this.events.emit("job.watchdog.stale", {
+           jobId: job.id,
+           name: job.name,
+           timeSinceHeartbeat,
          });
+       }

-       await this.cleanupExternalJob(job.id);
-
-       if (this.events) {
-         await this.events.emit("job.failed", {
-           jobId: job.id,
-           name: job.name,
-           error: "Heartbeat timeout",
-         });
-       }
+       const procInfo = this.externalProcesses.get(job.id);
+       if (job.pid && !procInfo?.killTimer) {
+         console.error(`[Jobs] Terminating stale external job ${job.id}`);
+         await this.terminateExternalProcess(
+           job.id,
+           job.pid,
+           killGraceMs,
+           "Heartbeat timeout - job process unresponsive"
+         );
        }
      }
    }
@@ -764,6 +766,9 @@ class JobsImpl implements Jobs {
      if (procInfo?.timeout) {
        clearTimeout(procInfo.timeout);
      }
+     if (procInfo?.killTimer) {
+       clearTimeout(procInfo.killTimer);
+     }
      this.externalProcesses.delete(jobId);

      // Close the socket
@@ -898,30 +903,16 @@ class JobsImpl implements Jobs {
      proc.stdin.end();

      // Set up process timeout if configured
-     if (config.timeout) {
+     if (config.timeout && !this.externalConfig.useWatchdog) {
        const timeout = setTimeout(async () => {
          console.warn(`[Jobs] External job ${job.id} timed out after ${config.timeout}ms`);
-         try {
-           process.kill(proc.pid, "SIGTERM");
-         } catch {
-           // Process may already be dead
-         }
-
-         await this.adapter.update(job.id, {
-           status: "failed",
-           error: `Job timed out after ${config.timeout}ms`,
-           completedAt: new Date(),
-         });
-
-         await this.cleanupExternalJob(job.id);
-
-         if (this.events) {
-           await this.events.emit("job.failed", {
-             jobId: job.id,
-             name: job.name,
-             error: "Timeout",
-           });
-         }
+         const killGraceMs = config.killGraceMs ?? this.externalConfig.killGraceMs ?? 5000;
+         await this.terminateExternalProcess(
+           job.id,
+           proc.pid,
+           killGraceMs,
+           `Job timed out after ${config.timeout}ms`
+         );
        }, config.timeout);

        const procInfo = this.externalProcesses.get(job.id);
@@ -998,6 +989,73 @@ class JobsImpl implements Jobs {
      }
    }

+   private async terminateExternalProcess(
+     jobId: string,
+     pid: number,
+     killGraceMs: number,
+     error: string
+   ): Promise<void> {
+     try {
+       process.kill(pid, "SIGTERM");
+     } catch {
+       return;
+     }
+
+     if (killGraceMs <= 0) {
+       try {
+         process.kill(pid, "SIGKILL");
+       } catch {
+         // ignore
+       }
+       await this.handleExternalFailure(jobId, error);
+       return;
+     }
+
+     const timer = setTimeout(async () => {
+       try {
+         process.kill(pid, 0);
+         process.kill(pid, "SIGKILL");
+       } catch {
+         // ignore
+       }
+
+       await this.handleExternalFailure(jobId, error);
+     }, killGraceMs);
+
+     const procInfo = this.externalProcesses.get(jobId);
+     if (procInfo) {
+       procInfo.killTimer = timer;
+     }
+   }
+
+   private async handleExternalFailure(jobId: string, error: string): Promise<void> {
+     await this.adapter.update(jobId, {
+       status: "failed",
+       error,
+       completedAt: new Date(),
+       processState: "orphaned",
+     });
+
+     const job = await this.adapter.get(jobId);
+     if (this.events && job) {
+       await this.events.emit("job.watchdog.killed", {
+         jobId,
+         name: job.name,
+         reason: error,
+       });
+     }
+
+     await this.cleanupExternalJob(jobId);
+
+     if (this.events && job) {
+       await this.events.emit("job.failed", {
+         jobId,
+         name: job.name,
+         error,
+       });
+     }
+   }
+
    private streamProcessOutput(
      jobId: string,
      jobName: string,
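The new `terminateExternalProcess` helper implements a SIGTERM, grace period, SIGKILL sequence. The `process.kill(pid, 0)` call sends no signal; it only throws if the pid no longer exists, which is how the SIGKILL is skipped when the process exited on its own during the grace window. The same idea as a standalone sketch (not the package API):

```ts
// Standalone sketch of graceful termination with a kill grace period.
async function terminateWithGrace(pid: number, graceMs: number): Promise<void> {
  try {
    process.kill(pid, "SIGTERM");   // ask politely first
  } catch {
    return;                          // process is already gone
  }

  await new Promise((resolve) => setTimeout(resolve, graceMs));

  try {
    process.kill(pid, 0);            // throws if the process no longer exists
    process.kill(pid, "SIGKILL");    // still alive after the grace period: force it
  } catch {
    // exited on its own during the grace window
  }
}
```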
@@ -207,6 +207,23 @@ export class ProcessSocketServerImpl implements ProcessSocketServer {

      let buffer = "";

+     const queue: ProcessMessage[] = [];
+     let processing = false;
+
+     const processQueue = async () => {
+       if (processing) return;
+       processing = true;
+       while (queue.length > 0) {
+         const message = queue.shift()!;
+         try {
+           await this.onMessage(message);
+         } catch (err) {
+           this.onError?.(err instanceof Error ? err : new Error(String(err)), processId);
+         }
+       }
+       processing = false;
+     };
+
      socket.on("data", (data) => {
        buffer += data.toString();

@@ -219,11 +236,13 @@ export class ProcessSocketServerImpl implements ProcessSocketServer {

          const message = this.parseMessage(line);
          if (message) {
-           this.onMessage(message);
+           queue.push(message);
          } else {
            this.onError?.(new Error(`Invalid message: ${line}`), processId);
          }
        }
+
+       processQueue().catch(() => undefined);
      });

      socket.on("error", (err) => {
@@ -61,6 +61,15 @@ export interface ProcessConfig {
      /** Timeout before considering unhealthy in ms (default: 60000) */
      timeoutMs?: number;
    };
+   /** Hard limits for the process (optional) */
+   limits?: {
+     /** Max runtime in ms before termination */
+     maxRuntimeMs?: number;
+     /** Max memory (RSS) in MB before termination (requires stats enabled) */
+     maxMemoryMb?: number;
+     /** Max CPU percent before termination (requires stats enabled) */
+     maxCpuPercent?: number;
+   };
  }

  export interface ManagedProcess {
@@ -171,6 +180,10 @@ export interface ProcessesConfig {
    heartbeatCheckInterval?: number;
    /** Enable auto-reconnect to orphaned processes on startup (default: true) */
    autoRecoverOrphans?: boolean;
+   /** Grace period before SIGKILL when stopping/killing (ms, default: 5000) */
+   killGraceMs?: number;
+   /** Disable in-process watchdog timers (use external watchdog instead) */
+   useWatchdog?: boolean;
  }

  // ============================================
@@ -251,6 +264,9 @@ export class ProcessesImpl implements Processes {
    private events?: Events;
    private heartbeatCheckInterval: number;
    private autoRecoverOrphans: boolean;
+   private killGraceMs: number;
+   private runtimeLimitTimers = new Map<string, ReturnType<typeof setTimeout>>();
+   private useWatchdog: boolean;

    // Track running Bun subprocesses
    private subprocesses = new Map<string, Subprocess>();
@@ -266,6 +282,8 @@ export class ProcessesImpl implements Processes {
      this.events = config.events;
      this.heartbeatCheckInterval = config.heartbeatCheckInterval ?? 10000;
      this.autoRecoverOrphans = config.autoRecoverOrphans ?? true;
+     this.killGraceMs = config.killGraceMs ?? 5000;
+     this.useWatchdog = config.useWatchdog ?? false;

      // Create socket server with callbacks
      this.socketServer = createProcessSocketServer(config.socket ?? {}, {
@@ -361,6 +379,21 @@ export class ProcessesImpl implements Processes {
      // Set up exit handler for crash detection
      proc.exited.then((exitCode) => this.handleExit(process.id, exitCode));

+     const maxRuntimeMs = config.limits?.maxRuntimeMs;
+     if (!this.useWatchdog && maxRuntimeMs && maxRuntimeMs > 0) {
+       const timer = setTimeout(async () => {
+         console.warn(`[Processes] Max runtime exceeded for ${name} (${process.id})`);
+         await this.emitEvent("process.limits_exceeded", {
+           processId: process.id,
+           name,
+           reason: "maxRuntimeMs",
+           limit: maxRuntimeMs,
+         });
+         await this.stop(process.id);
+       }, maxRuntimeMs);
+       this.runtimeLimitTimers.set(process.id, timer);
+     }
+
      console.log(`[Processes] Spawned ${name} (${process.id}) with PID ${proc.pid}`);
      return process.id;
    } catch (error) {
@@ -395,7 +428,7 @@ export class ProcessesImpl implements Processes {
      // Wait for process to exit (with timeout)
      const exitPromise = subprocess.exited;
      const timeoutPromise = new Promise<null>((resolve) =>
-       setTimeout(() => resolve(null), 5000)
+       setTimeout(() => resolve(null), this.killGraceMs)
      );

      const result = await Promise.race([exitPromise, timeoutPromise]);
@@ -412,6 +445,11 @@ export class ProcessesImpl implements Processes {
      // Cleanup
      await this.socketServer.closeSocket(processId);
      this.subprocesses.delete(processId);
+     const runtimeTimer = this.runtimeLimitTimers.get(processId);
+     if (runtimeTimer) {
+       clearTimeout(runtimeTimer);
+       this.runtimeLimitTimers.delete(processId);
+     }

      await this.adapter.update(processId, {
        status: "stopped",
@@ -443,6 +481,11 @@ export class ProcessesImpl implements Processes {
      // Cleanup
      await this.socketServer.closeSocket(processId);
      this.subprocesses.delete(processId);
+     const runtimeTimer = this.runtimeLimitTimers.get(processId);
+     if (runtimeTimer) {
+       clearTimeout(runtimeTimer);
+       this.runtimeLimitTimers.delete(processId);
+     }

      await this.adapter.update(processId, {
        status: "stopped",
@@ -590,6 +633,47 @@ export class ProcessesImpl implements Processes {
        await definition.onStats(proc, stats);
      }

+     const limits = proc.config.limits;
+     if (limits) {
+       if (limits.maxMemoryMb && stats.memory.rss / 1e6 > limits.maxMemoryMb) {
+         console.warn(`[Processes] Memory limit exceeded for ${proc.name} (${proc.id})`);
+         await this.emitEvent("process.limits_exceeded", {
+           processId,
+           name: proc.name,
+           reason: "maxMemoryMb",
+           limit: limits.maxMemoryMb,
+           value: stats.memory.rss / 1e6,
+         });
+         await this.emitEvent("process.watchdog.killed", {
+           processId,
+           name: proc.name,
+           reason: "maxMemoryMb",
+           value: stats.memory.rss / 1e6,
+         });
+         await this.stop(proc.id);
+         return;
+       }
+
+       if (limits.maxCpuPercent && stats.cpu.percent > limits.maxCpuPercent) {
+         console.warn(`[Processes] CPU limit exceeded for ${proc.name} (${proc.id})`);
+         await this.emitEvent("process.limits_exceeded", {
+           processId,
+           name: proc.name,
+           reason: "maxCpuPercent",
+           limit: limits.maxCpuPercent,
+           value: stats.cpu.percent,
+         });
+         await this.emitEvent("process.watchdog.killed", {
+           processId,
+           name: proc.name,
+           reason: "maxCpuPercent",
+           value: stats.cpu.percent,
+         });
+         await this.stop(proc.id);
+         return;
+       }
+     }
+
      return;
    }

@@ -815,6 +899,7 @@ export class ProcessesImpl implements Processes {
    }

    private startHeartbeatMonitor(): void {
+     if (this.useWatchdog) return;
      this.heartbeatMonitor = setInterval(async () => {
        if (this.isShuttingDown) return;

@@ -835,16 +920,27 @@ export class ProcessesImpl implements Processes {
            processId: proc.id,
            name: proc.name,
          });
+         await this.emitEvent("process.watchdog.stale", {
+           processId: proc.id,
+           name: proc.name,
+           reason: "heartbeat",
+           timeoutMs,
+         });

          const definition = this.definitions.get(proc.name);
          if (definition?.onUnhealthy) {
            await definition.onUnhealthy(proc);
          }

-         // If heartbeat is way overdue (2x timeout), kill and restart
+         // If heartbeat is way overdue (2x timeout), stop and restart
          if (now - lastHeartbeat > timeoutMs * 2) {
-           console.warn(`[Processes] Killing unresponsive process ${proc.name} (${proc.id})`);
-           await this.kill(proc.id);
+           console.warn(`[Processes] Stopping unresponsive process ${proc.name} (${proc.id})`);
+           await this.stop(proc.id);
+           await this.emitEvent("process.watchdog.killed", {
+             processId: proc.id,
+             name: proc.name,
+             reason: "heartbeat",
+           });
            // handleExit will trigger auto-restart if configured
          }
        }
@@ -32,6 +32,11 @@ export interface SubprocessPluginMetadata {
  export interface SubprocessBootstrapOptions {
    dbPath: string;
    coreConfig?: Record<string, any>;
+   sqlitePragmas?: {
+     busyTimeout?: number;
+     synchronous?: "OFF" | "NORMAL" | "FULL" | "EXTRA";
+     journalMode?: "DELETE" | "TRUNCATE" | "PERSIST" | "MEMORY" | "WAL" | "OFF";
+   };
    pluginMetadata: SubprocessPluginMetadata;
    startServices?: {
      cron?: boolean;
@@ -53,7 +58,15 @@ export async function bootstrapSubprocess(
    options: SubprocessBootstrapOptions
  ): Promise<SubprocessBootstrapResult> {
    const sqlite = new Database(options.dbPath);
-   sqlite.run("PRAGMA busy_timeout = 5000");
+   const pragmas = options.sqlitePragmas;
+   const busyTimeout = pragmas?.busyTimeout ?? 5000;
+   sqlite.run(`PRAGMA busy_timeout = ${busyTimeout}`);
+   if (pragmas?.journalMode) {
+     sqlite.run(`PRAGMA journal_mode = ${pragmas.journalMode}`);
+   }
+   if (pragmas?.synchronous) {
+     sqlite.run(`PRAGMA synchronous = ${pragmas.synchronous}`);
+   }

    const db = new Kysely<any>({
      dialect: new BunSqliteDialect({ database: sqlite }),