@donkeylabs/server 2.0.27 → 2.0.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/docs/jobs.md CHANGED
@@ -192,6 +192,26 @@ const cancelled = await ctx.core.jobs.cancel(jobId);
 
  ---
 
+ ## External Jobs (Subprocess)
+
+ External jobs run in a separate process and are monitored by a watchdog.
+
+ ```ts
+ ctx.core.jobs.registerExternal("batchWorker", {
+   command: "bun",
+   args: ["./workers/batch-worker.ts"],
+   heartbeatTimeout: 30000,
+   timeout: 10 * 60 * 1000,
+   killGraceMs: 5000,
+ });
+ ```
+
+ Watchdog events:
+ - `job.watchdog.stale`
+ - `job.watchdog.killed`
+
+ ---
+
  ## Event Integration
 
  Jobs automatically emit events on completion and failure:
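
The docs above name the two watchdog events but not their payloads. Judging from the `events.emit(...)` calls added to `package/src/core/jobs.ts` later in this diff, the payloads look roughly like the sketch below; the interface names are illustrative, not exports of the package.

```ts
// Inferred from the emit sites in src/core/jobs.ts (see the hunks further down).
interface JobWatchdogStaleEvent {
  jobId: string;
  name: string;
  timeSinceHeartbeat: number; // ms since the last heartbeat was received
}

interface JobWatchdogKilledEvent {
  jobId: string;
  name: string;
  reason: string; // e.g. "Heartbeat timeout - job process unresponsive"
}
```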
package/docs/processes.md CHANGED
@@ -8,6 +8,7 @@ Processes provide:
  - Long-running daemon management (start, stop, restart)
  - Typed event communication from process to server
  - Automatic heartbeat monitoring
+ - Watchdog termination for unresponsive processes
  - Connection resilience with auto-reconnection
  - Metadata passing to spawned processes
  - Cross-platform support (Unix sockets / TCP on Windows)
@@ -89,6 +90,13 @@ server.getCore().processes.define("video-encoder", {
 
    // Heartbeat configuration
    heartbeatTimeout: 30000, // 30 seconds
+
+   // Optional hard limits (requires stats for memory/CPU)
+   limits: {
+     maxRuntimeMs: 60_000,
+     maxMemoryMb: 512,
+     maxCpuPercent: 90,
+   },
  });
  ```
 
@@ -221,6 +229,25 @@ const client = await ProcessClient.connect({
  });
  ```
 
+ ---
+
+ ## Hard Limits
+
+ Processes can be terminated automatically when limits are exceeded:
+
+ - `maxRuntimeMs` always enforced by the server watchdog
+ - `maxMemoryMb` and `maxCpuPercent` require `ProcessClient` stats enabled
+
+ ```ts
+ const client = await ProcessClient.connect({
+   stats: { enabled: true, interval: 5000 },
+ });
+ ```
+
+ Watchdog events:
+ - `process.watchdog.stale`
+ - `process.watchdog.killed`
+
  ### Properties
 
  ```typescript
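
The Hard Limits section above lists event names only. The `emitEvent(...)` calls added to `ProcessesImpl` at the end of this diff suggest payloads along these lines; the interface names below are illustrative, not package exports.

```ts
// Inferred from the emitEvent(...) calls in the ProcessesImpl hunks below.
interface ProcessWatchdogStaleEvent {
  processId: string;
  name: string;
  reason: "heartbeat";
  timeoutMs: number;
}

interface ProcessWatchdogKilledEvent {
  processId: string;
  name: string;
  reason: "heartbeat" | "maxMemoryMb" | "maxCpuPercent";
  value?: number; // observed RSS in MB or CPU percent when a stats limit tripped
}

interface ProcessLimitsExceededEvent {
  processId: string;
  name: string;
  reason: "maxRuntimeMs" | "maxMemoryMb" | "maxCpuPercent";
  limit: number;
  value?: number; // absent for maxRuntimeMs
}
```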
package/docs/workflows.md CHANGED
@@ -285,6 +285,75 @@ workflow("example")
    .end("done")
  ```
 
+ ### Poll
+
+ Use a poll step for wait → check loops that persist across restarts.
+
+ ```typescript
+ workflow("batch.status")
+   .poll("wait-for-result", {
+     interval: 5000,
+     timeout: 600000,
+     maxAttempts: 120,
+     check: async (input, ctx) => {
+       const status = await fetchStatus(input.operationId);
+       if (status.state === "FAILED") throw new Error(status.error);
+       if (status.state === "SUCCEEDED") {
+         return { done: true, result: status.data };
+       }
+       return { done: false };
+     },
+   })
+   .build();
+ ```
+
+ Each poll cycle emits `workflow.step.poll` events and persists progress to the instance.
+
+ ---
+
+ ## Watchdog and Subprocess Settings
+
+ You can tune subprocess termination and SQLite pragmas used by isolated workflows:
+
+ ```ts
+ const server = new AppServer({
+   db,
+   workflows: {
+     killGraceMs: 5000,
+     sqlitePragmas: {
+       busyTimeout: 5000,
+       journalMode: "WAL",
+       synchronous: "NORMAL",
+     },
+   },
+ });
+ ```
+
+ Watchdog events:
+ - `workflow.watchdog.stale` (heartbeat missed)
+ - `workflow.watchdog.killed` (process terminated)
+
+ ### Loop
+
+ Use a loop step to jump back to a previous step until a condition is false.
+
+ ```typescript
+ workflow("loop-example")
+   .task("increment", {
+     handler: async (input) => ({ count: (input.count ?? 0) + 1 }),
+   })
+   .loop("repeat", {
+     condition: (ctx) => ctx.steps.increment.count < 3,
+     target: "increment",
+     interval: 1000,
+     maxIterations: 10,
+     timeout: 30000,
+   })
+   .build();
+ ```
+
+ Each loop iteration emits `workflow.step.loop` and persists loop counters to the instance.
+
  ## Workflow Context
 
  Every step receives a `WorkflowContext` with:
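
The docs never spell out the return type of the poll `check` callback. Going by the example above and the new `PollStepResult` export from `src/core/index.ts` in this diff, it presumably resembles the sketch below (the actual definition may differ).

```ts
// Presumed shape of PollStepResult, inferred from the docs example; not copied from the source.
type PollStepResult<T = unknown> =
  | { done: true; result?: T } // stop polling; result is stored on the workflow instance
  | { done: false };           // keep polling until timeout or maxAttempts is reached
```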
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@donkeylabs/server",
-   "version": "2.0.27",
+   "version": "2.0.29",
    "type": "module",
    "description": "Type-safe plugin system for building RPC-style APIs with Bun",
    "main": "./src/index.ts",
@@ -505,6 +505,17 @@ export function createAdminRouter(config: AdminRouteContext) {
    stepName: z.string(),
    error: z.string(),
  }),
+ "step.poll": z.object({
+   stepName: z.string(),
+   pollCount: z.number(),
+   done: z.boolean(),
+   result: z.any().optional(),
+ }),
+ "step.loop": z.object({
+   stepName: z.string(),
+   loopCount: z.number(),
+   target: z.string(),
+ }),
  completed: z.object({
    output: z.any().optional(),
  }),
@@ -548,6 +559,19 @@ export function createAdminRouter(config: AdminRouteContext) {
    workflowName: z.string(),
    error: z.string(),
  }),
+ "workflow.step.poll": z.object({
+   instanceId: z.string(),
+   stepName: z.string(),
+   pollCount: z.number(),
+   done: z.boolean(),
+   result: z.any().optional(),
+ }),
+ "workflow.step.loop": z.object({
+   instanceId: z.string(),
+   stepName: z.string(),
+   loopCount: z.number(),
+   target: z.string(),
+ }),
  },
  handle: (input, ctx) => {
    if (!checkAuth(ctx)) {
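
For reference, a payload that satisfies the new `workflow.step.poll` schema above would look like the following; the schema is reproduced standalone so the snippet is self-contained, and the values are made up.

```ts
import { z } from "zod";

// The "workflow.step.poll" schema as added above, reproduced standalone.
const workflowStepPoll = z.object({
  instanceId: z.string(),
  stepName: z.string(),
  pollCount: z.number(),
  done: z.boolean(),
  result: z.any().optional(),
});

// Illustrative payload (values are invented) that parses successfully.
workflowStepPoll.parse({
  instanceId: "instance-123",
  stepName: "wait-for-result",
  pollCount: 3,
  done: false,
});
```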
@@ -201,6 +201,23 @@ export class ExternalJobSocketServerImpl implements ExternalJobSocketServer {
 
    let buffer = "";
 
+   const queue: AnyExternalJobMessage[] = [];
+   let processing = false;
+
+   const processQueue = async () => {
+     if (processing) return;
+     processing = true;
+     while (queue.length > 0) {
+       const message = queue.shift()!;
+       try {
+         await this.onMessage(message);
+       } catch (err) {
+         this.onError?.(err instanceof Error ? err : new Error(String(err)), jobId);
+       }
+     }
+     processing = false;
+   };
+
    socket.on("data", (data) => {
      buffer += data.toString();
 
@@ -213,11 +230,13 @@
 
      const message = parseJobMessage(line);
      if (message) {
-       this.onMessage(message);
+       queue.push(message);
      } else {
        this.onError?.(new Error(`Invalid message: ${line}`), jobId);
      }
    }
+
+   processQueue().catch(() => undefined);
  });
 
  socket.on("error", (err) => {
@@ -80,6 +80,8 @@ export interface ExternalJobConfig {
    heartbeatTimeout?: number;
    /** Job timeout in milliseconds (optional) */
    timeout?: number;
+   /** Grace period before SIGKILL when terminating (ms, default: 5000) */
+   killGraceMs?: number;
  }
 
  // ============================================
@@ -120,6 +122,8 @@ export interface ExternalJobsConfig {
    defaultHeartbeatTimeout?: number;
    /** Heartbeat check interval in ms (default: 10000) */
    heartbeatCheckInterval?: number;
+   /** Default grace period before SIGKILL when terminating (ms, default: 5000) */
+   killGraceMs?: number;
  }
 
  // ============================================
package/src/core/index.ts CHANGED
@@ -134,6 +134,7 @@ export {
  export {
    type Workflows,
    type WorkflowsConfig,
+   type SqlitePragmaConfig,
    type WorkflowRegisterOptions,
    type WorkflowDefinition,
    type WorkflowInstance,
@@ -149,6 +150,9 @@ export {
    type ChoiceStepDefinition,
    type ChoiceCondition,
    type PassStepDefinition,
+   type PollStepDefinition,
+   type PollStepResult,
+   type LoopStepDefinition,
    type RetryConfig,
    type GetAllWorkflowsOptions,
    type PluginMetadata,
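
`SqlitePragmaConfig` is newly exported here, but its fields are only hinted at by the `sqlitePragmas` example in `docs/workflows.md` above. A guess at its shape, for orientation only:

```ts
// Guessed shape of SqlitePragmaConfig based on the docs example; the real type may differ.
interface SqlitePragmaConfig {
  busyTimeout?: number; // PRAGMA busy_timeout, in ms
  journalMode?: string; // e.g. "WAL"
  synchronous?: string; // e.g. "NORMAL"
}
```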
package/src/core/jobs.ts CHANGED
@@ -273,7 +273,10 @@ class JobsImpl implements Jobs {
    private externalConfigs = new Map<string, ExternalJobConfig>();
    private externalConfig: ExternalJobsConfig;
    private socketServer: ExternalJobSocketServer | null = null;
-   private externalProcesses = new Map<string, { pid: number; timeout?: ReturnType<typeof setTimeout> }>();
+   private externalProcesses = new Map<
+     string,
+     { pid: number; timeout?: ReturnType<typeof setTimeout>; killTimer?: ReturnType<typeof setTimeout> }
+   >();
 
    constructor(config: JobsConfig = {}) {
      this.events = config.events;
@@ -521,6 +524,7 @@
 
      const config = this.externalConfigs.get(job.name);
      const heartbeatTimeout = config?.heartbeatTimeout ?? this.externalConfig.defaultHeartbeatTimeout ?? 30000;
+     const killGraceMs = config?.killGraceMs ?? this.externalConfig.killGraceMs ?? 5000;
      const timeSinceHeartbeat = now - job.lastHeartbeat.getTime();
 
      if (timeSinceHeartbeat > heartbeatTimeout) {
@@ -533,36 +537,22 @@
          name: job.name,
          timeSinceHeartbeat,
        });
-     }
-
-     // If stale for 2x timeout, kill the process
-     if (timeSinceHeartbeat > heartbeatTimeout * 2) {
-       console.error(`[Jobs] Killing stale external job ${job.id}`);
-
-       if (job.pid) {
-         try {
-           process.kill(job.pid, "SIGKILL");
-         } catch {
-           // Process may already be dead
-         }
-       }
-
-       await this.adapter.update(job.id, {
-         status: "failed",
-         error: "Heartbeat timeout - job process unresponsive",
-         completedAt: new Date(),
-         processState: "orphaned",
+       await this.events.emit("job.watchdog.stale", {
+         jobId: job.id,
+         name: job.name,
+         timeSinceHeartbeat,
        });
+     }
 
-       await this.cleanupExternalJob(job.id);
-
-       if (this.events) {
-         await this.events.emit("job.failed", {
-           jobId: job.id,
-           name: job.name,
-           error: "Heartbeat timeout",
-         });
-       }
+     const procInfo = this.externalProcesses.get(job.id);
+     if (job.pid && !procInfo?.killTimer) {
+       console.error(`[Jobs] Terminating stale external job ${job.id}`);
+       await this.terminateExternalProcess(
+         job.id,
+         job.pid,
+         killGraceMs,
+         "Heartbeat timeout - job process unresponsive"
+       );
      }
    }
  }
@@ -764,6 +754,9 @@
    if (procInfo?.timeout) {
      clearTimeout(procInfo.timeout);
    }
+   if (procInfo?.killTimer) {
+     clearTimeout(procInfo.killTimer);
+   }
    this.externalProcesses.delete(jobId);
 
    // Close the socket
@@ -901,27 +894,13 @@
    if (config.timeout) {
      const timeout = setTimeout(async () => {
        console.warn(`[Jobs] External job ${job.id} timed out after ${config.timeout}ms`);
-       try {
-         process.kill(proc.pid, "SIGTERM");
-       } catch {
-         // Process may already be dead
-       }
-
-       await this.adapter.update(job.id, {
-         status: "failed",
-         error: `Job timed out after ${config.timeout}ms`,
-         completedAt: new Date(),
-       });
-
-       await this.cleanupExternalJob(job.id);
-
-       if (this.events) {
-         await this.events.emit("job.failed", {
-           jobId: job.id,
-           name: job.name,
-           error: "Timeout",
-         });
-       }
+       const killGraceMs = config.killGraceMs ?? this.externalConfig.killGraceMs ?? 5000;
+       await this.terminateExternalProcess(
+         job.id,
+         proc.pid,
+         killGraceMs,
+         `Job timed out after ${config.timeout}ms`
+       );
      }, config.timeout);
 
      const procInfo = this.externalProcesses.get(job.id);
@@ -998,6 +977,73 @@
      }
    }
 
+   private async terminateExternalProcess(
+     jobId: string,
+     pid: number,
+     killGraceMs: number,
+     error: string
+   ): Promise<void> {
+     try {
+       process.kill(pid, "SIGTERM");
+     } catch {
+       return;
+     }
+
+     if (killGraceMs <= 0) {
+       try {
+         process.kill(pid, "SIGKILL");
+       } catch {
+         // ignore
+       }
+       await this.handleExternalFailure(jobId, error);
+       return;
+     }
+
+     const timer = setTimeout(async () => {
+       try {
+         process.kill(pid, 0);
+         process.kill(pid, "SIGKILL");
+       } catch {
+         // ignore
+       }
+
+       await this.handleExternalFailure(jobId, error);
+     }, killGraceMs);
+
+     const procInfo = this.externalProcesses.get(jobId);
+     if (procInfo) {
+       procInfo.killTimer = timer;
+     }
+   }
+
+   private async handleExternalFailure(jobId: string, error: string): Promise<void> {
+     await this.adapter.update(jobId, {
+       status: "failed",
+       error,
+       completedAt: new Date(),
+       processState: "orphaned",
+     });
+
+     const job = await this.adapter.get(jobId);
+     if (this.events && job) {
+       await this.events.emit("job.watchdog.killed", {
+         jobId,
+         name: job.name,
+         reason: error,
+       });
+     }
+
+     await this.cleanupExternalJob(jobId);
+
+     if (this.events && job) {
+       await this.events.emit("job.failed", {
+         jobId,
+         name: job.name,
+         error,
+       });
+     }
+   }
+
    private streamProcessOutput(
      jobId: string,
      jobName: string,
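
One detail worth noting in `terminateExternalProcess`: inside the grace timer, `process.kill(pid, 0)` delivers no signal; it only probes whether the PID still exists, so if the child already exited during the grace period the call throws and the `SIGKILL` line is skipped. A standalone illustration of that probe:

```ts
// Signal 0 is an existence check, not a real signal (standard Node/Bun behavior).
function isProcessAlive(pid: number): boolean {
  try {
    process.kill(pid, 0);
    return true;
  } catch {
    // ESRCH: no such process (EPERM would mean it exists but belongs to another user)
    return false;
  }
}
```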
@@ -207,6 +207,23 @@ export class ProcessSocketServerImpl implements ProcessSocketServer {
 
    let buffer = "";
 
+   const queue: ProcessMessage[] = [];
+   let processing = false;
+
+   const processQueue = async () => {
+     if (processing) return;
+     processing = true;
+     while (queue.length > 0) {
+       const message = queue.shift()!;
+       try {
+         await this.onMessage(message);
+       } catch (err) {
+         this.onError?.(err instanceof Error ? err : new Error(String(err)), processId);
+       }
+     }
+     processing = false;
+   };
+
    socket.on("data", (data) => {
      buffer += data.toString();
 
@@ -219,11 +236,13 @@
 
      const message = this.parseMessage(line);
      if (message) {
-       this.onMessage(message);
+       queue.push(message);
      } else {
        this.onError?.(new Error(`Invalid message: ${line}`), processId);
      }
    }
+
+   processQueue().catch(() => undefined);
  });
 
  socket.on("error", (err) => {
@@ -61,6 +61,15 @@ export interface ProcessConfig {
      /** Timeout before considering unhealthy in ms (default: 60000) */
      timeoutMs?: number;
    };
+   /** Hard limits for the process (optional) */
+   limits?: {
+     /** Max runtime in ms before termination */
+     maxRuntimeMs?: number;
+     /** Max memory (RSS) in MB before termination (requires stats enabled) */
+     maxMemoryMb?: number;
+     /** Max CPU percent before termination (requires stats enabled) */
+     maxCpuPercent?: number;
+   };
  }
 
  export interface ManagedProcess {
@@ -171,6 +180,8 @@ export interface ProcessesConfig {
    heartbeatCheckInterval?: number;
    /** Enable auto-reconnect to orphaned processes on startup (default: true) */
    autoRecoverOrphans?: boolean;
+   /** Grace period before SIGKILL when stopping/killing (ms, default: 5000) */
+   killGraceMs?: number;
  }
 
  // ============================================
@@ -251,6 +262,8 @@ export class ProcessesImpl implements Processes {
    private events?: Events;
    private heartbeatCheckInterval: number;
    private autoRecoverOrphans: boolean;
+   private killGraceMs: number;
+   private runtimeLimitTimers = new Map<string, ReturnType<typeof setTimeout>>();
 
    // Track running Bun subprocesses
    private subprocesses = new Map<string, Subprocess>();
@@ -266,6 +279,7 @@
      this.events = config.events;
      this.heartbeatCheckInterval = config.heartbeatCheckInterval ?? 10000;
      this.autoRecoverOrphans = config.autoRecoverOrphans ?? true;
+     this.killGraceMs = config.killGraceMs ?? 5000;
 
      // Create socket server with callbacks
      this.socketServer = createProcessSocketServer(config.socket ?? {}, {
@@ -361,6 +375,21 @@
      // Set up exit handler for crash detection
      proc.exited.then((exitCode) => this.handleExit(process.id, exitCode));
 
+     const maxRuntimeMs = config.limits?.maxRuntimeMs;
+     if (maxRuntimeMs && maxRuntimeMs > 0) {
+       const timer = setTimeout(async () => {
+         console.warn(`[Processes] Max runtime exceeded for ${name} (${process.id})`);
+         await this.emitEvent("process.limits_exceeded", {
+           processId: process.id,
+           name,
+           reason: "maxRuntimeMs",
+           limit: maxRuntimeMs,
+         });
+         await this.stop(process.id);
+       }, maxRuntimeMs);
+       this.runtimeLimitTimers.set(process.id, timer);
+     }
+
      console.log(`[Processes] Spawned ${name} (${process.id}) with PID ${proc.pid}`);
      return process.id;
    } catch (error) {
@@ -395,7 +424,7 @@
      // Wait for process to exit (with timeout)
      const exitPromise = subprocess.exited;
      const timeoutPromise = new Promise<null>((resolve) =>
-       setTimeout(() => resolve(null), 5000)
+       setTimeout(() => resolve(null), this.killGraceMs)
      );
 
      const result = await Promise.race([exitPromise, timeoutPromise]);
@@ -412,6 +441,11 @@
      // Cleanup
      await this.socketServer.closeSocket(processId);
      this.subprocesses.delete(processId);
+     const runtimeTimer = this.runtimeLimitTimers.get(processId);
+     if (runtimeTimer) {
+       clearTimeout(runtimeTimer);
+       this.runtimeLimitTimers.delete(processId);
+     }
 
      await this.adapter.update(processId, {
        status: "stopped",
@@ -443,6 +477,11 @@
      // Cleanup
      await this.socketServer.closeSocket(processId);
      this.subprocesses.delete(processId);
+     const runtimeTimer = this.runtimeLimitTimers.get(processId);
+     if (runtimeTimer) {
+       clearTimeout(runtimeTimer);
+       this.runtimeLimitTimers.delete(processId);
+     }
 
      await this.adapter.update(processId, {
        status: "stopped",
@@ -590,6 +629,47 @@
        await definition.onStats(proc, stats);
      }
 
+     const limits = proc.config.limits;
+     if (limits) {
+       if (limits.maxMemoryMb && stats.memory.rss / 1e6 > limits.maxMemoryMb) {
+         console.warn(`[Processes] Memory limit exceeded for ${proc.name} (${proc.id})`);
+         await this.emitEvent("process.limits_exceeded", {
+           processId,
+           name: proc.name,
+           reason: "maxMemoryMb",
+           limit: limits.maxMemoryMb,
+           value: stats.memory.rss / 1e6,
+         });
+         await this.emitEvent("process.watchdog.killed", {
+           processId,
+           name: proc.name,
+           reason: "maxMemoryMb",
+           value: stats.memory.rss / 1e6,
+         });
+         await this.stop(proc.id);
+         return;
+       }
+
+       if (limits.maxCpuPercent && stats.cpu.percent > limits.maxCpuPercent) {
+         console.warn(`[Processes] CPU limit exceeded for ${proc.name} (${proc.id})`);
+         await this.emitEvent("process.limits_exceeded", {
+           processId,
+           name: proc.name,
+           reason: "maxCpuPercent",
+           limit: limits.maxCpuPercent,
+           value: stats.cpu.percent,
+         });
+         await this.emitEvent("process.watchdog.killed", {
+           processId,
+           name: proc.name,
+           reason: "maxCpuPercent",
+           value: stats.cpu.percent,
+         });
+         await this.stop(proc.id);
+         return;
+       }
+     }
+
      return;
    }
 
@@ -835,16 +915,27 @@
        processId: proc.id,
        name: proc.name,
      });
+     await this.emitEvent("process.watchdog.stale", {
+       processId: proc.id,
+       name: proc.name,
+       reason: "heartbeat",
+       timeoutMs,
+     });
 
      const definition = this.definitions.get(proc.name);
      if (definition?.onUnhealthy) {
        await definition.onUnhealthy(proc);
      }
 
-     // If heartbeat is way overdue (2x timeout), kill and restart
+     // If heartbeat is way overdue (2x timeout), stop and restart
      if (now - lastHeartbeat > timeoutMs * 2) {
-       console.warn(`[Processes] Killing unresponsive process ${proc.name} (${proc.id})`);
-       await this.kill(proc.id);
+       console.warn(`[Processes] Stopping unresponsive process ${proc.name} (${proc.id})`);
+       await this.stop(proc.id);
+       await this.emitEvent("process.watchdog.killed", {
+         processId: proc.id,
+         name: proc.name,
+         reason: "heartbeat",
+       });
        // handleExit will trigger auto-restart if configured
      }
    }
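
Both stats-driven checks in the hunk above share the same comparison; distilled, the decision reduces to something like this standalone helper (names are illustrative, not part of the package):

```ts
// Illustrative distillation of the stats-limit checks above; not package API.
interface ProcessStatsLike {
  memory: { rss: number }; // resident set size in bytes
  cpu: { percent: number };
}

interface ProcessLimitsLike {
  maxMemoryMb?: number;
  maxCpuPercent?: number;
}

// Returns the first exceeded limit, or null if the process is within bounds.
function exceededLimit(stats: ProcessStatsLike, limits: ProcessLimitsLike) {
  const rssMb = stats.memory.rss / 1e6; // same bytes-to-MB conversion as the code above
  if (limits.maxMemoryMb && rssMb > limits.maxMemoryMb) {
    return { reason: "maxMemoryMb" as const, limit: limits.maxMemoryMb, value: rssMb };
  }
  if (limits.maxCpuPercent && stats.cpu.percent > limits.maxCpuPercent) {
    return { reason: "maxCpuPercent" as const, limit: limits.maxCpuPercent, value: stats.cpu.percent };
  }
  return null;
}
```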