@donkeylabs/server 0.5.1 → 0.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,758 @@
1
+ /**
2
+ * Processes Core Service
3
+ *
4
+ * Manages persistent, long-running processes (daemons) with supervision capabilities.
5
+ * Use cases: FFmpeg subprocesses, Firecracker VMs, any long-running daemon requiring supervision.
6
+ */
7
+
8
+ import type { Subprocess } from "bun";
9
+ import type { Events } from "./events";
10
+ import {
11
+ createProcessSocketServer,
12
+ type ProcessSocketServer,
13
+ type ProcessMessage,
14
+ type ProcessSocketConfig,
15
+ } from "./process-socket";
16
+ import {
17
+ SqliteProcessAdapter,
18
+ type ProcessAdapter,
19
+ type SqliteProcessAdapterConfig,
20
+ } from "./process-adapter-sqlite";
21
+
22
+ // ============================================
23
+ // Types
24
+ // ============================================
25
+
26
/**
 * Lifecycle states for a managed process.
 *
 * - "spawning": DB record created, OS process not yet confirmed running
 * - "running":  process alive and supervised
 * - "stopping": intentional shutdown in progress (suppresses auto-restart)
 * - "stopped":  intentionally terminated via stop()/kill()
 * - "crashed":  exited unexpectedly (or spawn itself failed)
 * - "orphaned": found alive after a supervisor restart but socket reconnect failed
 * - "dead":     unrecoverable — max restarts exhausted or orphan confirmed gone
 */
export type ProcessStatus =
  | "spawning"
  | "running"
  | "stopping"
  | "stopped"
  | "crashed"
  | "orphaned"
  | "dead";
34
+
35
/**
 * Per-process configuration. A definition provides the base config; spawn-time
 * overrides are shallow-merged on top (env maps are merged key-by-key, args are
 * replaced wholesale — see ProcessesImpl.spawn).
 */
export interface ProcessConfig {
  /** Command to execute (e.g., "ffmpeg", "python", "./script.sh") */
  command: string;
  /** Arguments to pass to the command */
  args?: string[];
  /** Working directory for the process */
  cwd?: string;
  /** Environment variables to set (DONKEYLABS_* socket vars are added at spawn) */
  env?: Record<string, string>;
  /** Auto-restart on crash (default: false) */
  autoRestart?: boolean;
  /** Maximum number of restarts before giving up (default: 10, -1 for unlimited) */
  maxRestarts?: number;
  /** Backoff configuration for restarts */
  backoff?: {
    /** Initial delay in ms (default: 1000) */
    initialDelayMs?: number;
    /** Maximum delay in ms (default: 30000) */
    maxDelayMs?: number;
    /** Multiplier for exponential backoff (default: 2) */
    multiplier?: number;
  };
  /** Heartbeat configuration; omit to disable heartbeat monitoring entirely */
  heartbeat?: {
    /** Expected interval between heartbeats in ms (default: 30000) */
    intervalMs?: number;
    /** Timeout before considering unhealthy in ms (default: 60000) */
    timeoutMs?: number;
  };
}
65
+
66
/**
 * Persistent record of a supervised process, as stored by the adapter.
 */
export interface ManagedProcess {
  /** Unique process record ID (assigned by the adapter on create) */
  id: string;
  /** Name of the definition this instance was spawned from */
  name: string;
  /** OS process ID; unset until the spawn succeeds */
  pid?: number;
  /** Unix socket path for IPC, when socket transport is used */
  socketPath?: string;
  /** TCP port for IPC, when TCP transport is used */
  tcpPort?: number;
  status: ProcessStatus;
  /** Effective (merged) config this instance was spawned with */
  config: ProcessConfig;
  /** Caller-supplied metadata carried across restarts */
  metadata?: Record<string, any>;
  createdAt: Date;
  startedAt?: Date;
  stoppedAt?: Date;
  /** Updated whenever a "heartbeat" message arrives over the socket */
  lastHeartbeat?: Date;
  /** Total restarts across this instance's lineage */
  restartCount: number;
  /** Crashes since the last healthy run; drives backoff and maxRestarts */
  consecutiveFailures: number;
  /** Last error message (exit code or spawn failure) */
  error?: string;
}
83
+
84
/**
 * A named process template plus lifecycle callbacks. Register once, then spawn
 * any number of instances by name.
 */
export interface ProcessDefinition {
  name: string;
  /** Base config; spelled as Omit + optional args to keep args optional here */
  config: Omit<ProcessConfig, "args"> & { args?: string[] };
  /** Called when a message is received from the process */
  onMessage?: (process: ManagedProcess, message: any) => void | Promise<void>;
  /** Called when the process crashes unexpectedly */
  onCrash?: (process: ManagedProcess, exitCode: number | null) => void | Promise<void>;
  /** Called when heartbeat is missed */
  onUnhealthy?: (process: ManagedProcess) => void | Promise<void>;
  /** Called when the process is restarted */
  onRestart?: (oldProcess: ManagedProcess, newProcess: ManagedProcess, attempt: number) => void | Promise<void>;
}
96
+
97
/**
 * Per-spawn customization applied on top of the registered definition.
 */
export interface SpawnOptions {
  /** Override config fields for this spawn (env merged, args replaced) */
  configOverrides?: Partial<ProcessConfig>;
  /** Metadata to store with the process */
  metadata?: Record<string, any>;
}
103
+
104
+ // ============================================
105
+ // Configuration
106
+ // ============================================
107
+
108
/**
 * Service-level configuration for createProcesses()/ProcessesImpl.
 */
export interface ProcessesConfig {
  /** SQLite adapter configuration */
  adapter?: SqliteProcessAdapterConfig;
  /** Socket server configuration */
  socket?: ProcessSocketConfig;
  /** Events service for emitting process events; omit to disable event emission */
  events?: Events;
  /** Heartbeat check interval in ms (default: 10000) */
  heartbeatCheckInterval?: number;
  /** Enable auto-reconnect to orphaned processes on startup (default: true) */
  autoRecoverOrphans?: boolean;
}
120
+
121
+ // ============================================
122
+ // Service Interface
123
+ // ============================================
124
+
125
/**
 * Public contract of the process supervision service.
 */
export interface Processes {
  /** Register a process definition (re-registering a name overwrites it) */
  register(definition: ProcessDefinition): void;
  /** Spawn a new process instance; resolves to the new process ID */
  spawn(name: string, options?: SpawnOptions): Promise<string>;
  /** Gracefully stop a process (SIGTERM, escalating to SIGKILL on timeout) */
  stop(processId: string): Promise<boolean>;
  /** Force kill a process (SIGKILL) */
  kill(processId: string): Promise<boolean>;
  /** Restart a process; resolves to the replacement's process ID */
  restart(processId: string): Promise<string>;
  /** Get a process by ID */
  get(processId: string): Promise<ManagedProcess | null>;
  /** Get all processes by name */
  getByName(name: string): Promise<ManagedProcess[]>;
  /** Get all running processes */
  getRunning(): Promise<ManagedProcess[]>;
  /** Send a message to a process via socket; resolves false if undeliverable */
  send(processId: string, message: any): Promise<boolean>;
  /** Start the service (orphan recovery, heartbeat monitoring) */
  start(): void;
  /** Shutdown the service and all managed processes */
  shutdown(): Promise<void>;
}
149
+
150
+ // ============================================
151
+ // Helper Functions
152
+ // ============================================
153
+
154
+ /**
155
+ * Check if a process with given PID is still alive
156
+ */
157
+ export function isProcessAlive(pid: number): boolean {
158
+ try {
159
+ // Sending signal 0 doesn't actually send a signal,
160
+ // it just checks if the process exists and we have permission to signal it
161
+ process.kill(pid, 0);
162
+ return true;
163
+ } catch {
164
+ return false;
165
+ }
166
+ }
167
+
168
+ /**
169
+ * Calculate backoff delay with jitter
170
+ */
171
+ function calculateBackoff(
172
+ consecutiveFailures: number,
173
+ config: ProcessConfig["backoff"]
174
+ ): number {
175
+ const initialDelay = config?.initialDelayMs ?? 1000;
176
+ const maxDelay = config?.maxDelayMs ?? 30000;
177
+ const multiplier = config?.multiplier ?? 2;
178
+
179
+ const delay = Math.min(
180
+ initialDelay * Math.pow(multiplier, consecutiveFailures),
181
+ maxDelay
182
+ );
183
+
184
+ // Add jitter (0.5 to 1.5x the delay)
185
+ return delay * (0.5 + Math.random());
186
+ }
187
+
188
+ // ============================================
189
+ // Implementation
190
+ // ============================================
191
+
192
/**
 * Default Processes implementation.
 *
 * Persists process records through a SQLite-backed adapter, communicates with
 * children over a per-process socket server, and supervises lifecycle: crash
 * detection, exponential-backoff restarts, heartbeat monitoring, and orphan
 * recovery after a supervisor restart.
 */
export class ProcessesImpl implements Processes {
  // Registered process definitions, keyed by definition name.
  private definitions = new Map<string, ProcessDefinition>();
  private adapter: ProcessAdapter;
  private socketServer: ProcessSocketServer;
  private events?: Events;
  private heartbeatCheckInterval: number;
  private autoRecoverOrphans: boolean;

  // Track running Bun subprocesses (only those spawned by THIS supervisor run)
  private subprocesses = new Map<string, Subprocess>();
  // Track pending restarts (processId -> timeout)
  private pendingRestarts = new Map<string, ReturnType<typeof setTimeout>>();
  // Heartbeat monitor interval
  private heartbeatMonitor?: ReturnType<typeof setInterval>;
  // Shutdown flag; suppresses crash handling and heartbeat sweeps during teardown
  private isShuttingDown = false;

  constructor(config: ProcessesConfig = {}) {
    this.adapter = new SqliteProcessAdapter(config.adapter);
    this.events = config.events;
    this.heartbeatCheckInterval = config.heartbeatCheckInterval ?? 10000;
    this.autoRecoverOrphans = config.autoRecoverOrphans ?? true;

    // Create socket server with callbacks; arrow functions preserve `this`.
    this.socketServer = createProcessSocketServer(config.socket ?? {}, {
      onMessage: (message) => this.handleMessage(message),
      onConnect: (processId) => this.handleConnect(processId),
      onDisconnect: (processId) => this.handleDisconnect(processId),
      onError: (error, processId) => this.handleError(error, processId),
    });
  }
223
+
224
+ register(definition: ProcessDefinition): void {
225
+ if (this.definitions.has(definition.name)) {
226
+ console.warn(`[Processes] Overwriting existing definition for '${definition.name}'`);
227
+ }
228
+ this.definitions.set(definition.name, definition);
229
+ console.log(`[Processes] Registered process definition: ${definition.name}`);
230
+ }
231
+
232
  /**
   * Spawn a new instance of a registered process definition.
   *
   * Order matters: the DB record is created with status "spawning" BEFORE the
   * OS process starts, so a supervisor crash mid-spawn leaves a row that
   * startup recovery can reconcile. On spawn failure the record is marked
   * "crashed" and the socket is torn down before rethrowing.
   *
   * @param name Name of a definition previously passed to register()
   * @param options Per-spawn config overrides and metadata
   * @returns The new process record ID
   * @throws If the definition is unknown, or socket/spawn setup fails
   */
  async spawn(name: string, options?: SpawnOptions): Promise<string> {
    const definition = this.definitions.get(name);
    if (!definition) {
      throw new Error(`Process definition '${name}' not found. Did you call register()?`);
    }

    // Merge config with overrides: env maps merge key-by-key (override wins),
    // args are replaced wholesale when provided.
    const config: ProcessConfig = {
      ...definition.config,
      ...options?.configOverrides,
      args: options?.configOverrides?.args ?? definition.config.args,
      env: {
        ...definition.config.env,
        ...options?.configOverrides?.env,
      },
    };

    // Create DB record with status "spawning" (before spawn for crash recovery)
    const process = await this.adapter.create({
      name,
      status: "spawning",
      config,
      metadata: options?.metadata,
      createdAt: new Date(),
      restartCount: 0,
      consecutiveFailures: 0,
    });

    try {
      // Create the per-process Unix socket (or TCP fallback) for IPC
      const { socketPath, tcpPort } = await this.socketServer.createSocket(process.id);

      // Expose the socket endpoint to the child via well-known env vars
      const env: Record<string, string> = {
        ...config.env,
        DONKEYLABS_PROCESS_ID: process.id,
      };
      if (socketPath) {
        env.DONKEYLABS_SOCKET_PATH = socketPath;
      }
      if (tcpPort) {
        env.DONKEYLABS_TCP_PORT = tcpPort.toString();
      }

      // Spawn the process; stdio is inherited so child output goes to our console
      const proc = Bun.spawn([config.command, ...(config.args || [])], {
        cwd: config.cwd,
        env,
        stdout: "inherit",
        stderr: "inherit",
      });

      // Store subprocess reference
      this.subprocesses.set(process.id, proc);

      // Update DB with PID and status
      await this.adapter.update(process.id, {
        pid: proc.pid,
        socketPath,
        tcpPort,
        status: "running",
        startedAt: new Date(),
      });

      // Emit event
      await this.emitEvent("process.spawned", {
        processId: process.id,
        name,
        pid: proc.pid,
      });

      // Exit handler doubles as crash detection (see handleExit)
      proc.exited.then((exitCode) => this.handleExit(process.id, exitCode));

      console.log(`[Processes] Spawned ${name} (${process.id}) with PID ${proc.pid}`);
      return process.id;
    } catch (error) {
      // Cleanup on spawn failure
      await this.adapter.update(process.id, {
        status: "crashed",
        error: error instanceof Error ? error.message : String(error),
        stoppedAt: new Date(),
      });
      await this.socketServer.closeSocket(process.id);
      throw error;
    }
  }
319
+
320
  /**
   * Gracefully stop a running process: SIGTERM, wait up to 5s for exit, then
   * escalate to SIGKILL.
   *
   * Status is flipped to "stopping" BEFORE signalling so handleExit treats the
   * exit as intentional and does not schedule an auto-restart.
   *
   * NOTE(review): when the supervisor restarted and this process was only
   * reconnected via socket, there is no entry in `subprocesses`, so no signal
   * is sent here and the OS process may keep running — confirm whether orphan
   * reconnection is expected to repopulate the subprocess map.
   *
   * @returns true if a stop was performed; false if unknown or not running/spawning
   */
  async stop(processId: string): Promise<boolean> {
    const proc = await this.adapter.get(processId);
    if (!proc) return false;

    if (proc.status !== "running" && proc.status !== "spawning") {
      return false;
    }

    // Mark as stopping to prevent auto-restart
    await this.adapter.update(processId, { status: "stopping" });

    const subprocess = this.subprocesses.get(processId);
    if (subprocess && proc.pid) {
      try {
        // Send SIGTERM for graceful shutdown
        subprocess.kill("SIGTERM");

        // Wait for process to exit (with timeout)
        const exitPromise = subprocess.exited;
        const timeoutPromise = new Promise<null>((resolve) =>
          setTimeout(() => resolve(null), 5000)
        );

        const result = await Promise.race([exitPromise, timeoutPromise]);
        if (result === null) {
          // Timed out, force kill
          subprocess.kill("SIGKILL");
          await subprocess.exited;
        }
      } catch {
        // Process may have already exited
      }
    }

    // Cleanup socket and in-memory handle, then record final state
    await this.socketServer.closeSocket(processId);
    this.subprocesses.delete(processId);

    await this.adapter.update(processId, {
      status: "stopped",
      stoppedAt: new Date(),
    });

    await this.emitEvent("process.stopped", { processId, name: proc.name });
    console.log(`[Processes] Stopped ${proc.name} (${processId})`);
    return true;
  }
367
+
368
+ async kill(processId: string): Promise<boolean> {
369
+ const proc = await this.adapter.get(processId);
370
+ if (!proc) return false;
371
+
372
+ // Mark as stopping to prevent auto-restart
373
+ await this.adapter.update(processId, { status: "stopping" });
374
+
375
+ const subprocess = this.subprocesses.get(processId);
376
+ if (subprocess && proc.pid) {
377
+ try {
378
+ subprocess.kill("SIGKILL");
379
+ await subprocess.exited;
380
+ } catch {
381
+ // Process may have already exited
382
+ }
383
+ }
384
+
385
+ // Cleanup
386
+ await this.socketServer.closeSocket(processId);
387
+ this.subprocesses.delete(processId);
388
+
389
+ await this.adapter.update(processId, {
390
+ status: "stopped",
391
+ stoppedAt: new Date(),
392
+ });
393
+
394
+ await this.emitEvent("process.stopped", { processId, name: proc.name });
395
+ console.log(`[Processes] Killed ${proc.name} (${processId})`);
396
+ return true;
397
+ }
398
+
399
+ async restart(processId: string): Promise<string> {
400
+ const oldProcess = await this.adapter.get(processId);
401
+ if (!oldProcess) {
402
+ throw new Error(`Process ${processId} not found`);
403
+ }
404
+
405
+ // Stop the old process
406
+ await this.stop(processId);
407
+
408
+ // Spawn new instance with same config
409
+ const newProcessId = await this.spawn(oldProcess.name, {
410
+ configOverrides: oldProcess.config,
411
+ metadata: oldProcess.metadata,
412
+ });
413
+
414
+ const newProcess = await this.adapter.get(newProcessId);
415
+ if (newProcess) {
416
+ // Update restart count
417
+ await this.adapter.update(newProcessId, {
418
+ restartCount: oldProcess.restartCount + 1,
419
+ });
420
+
421
+ const definition = this.definitions.get(oldProcess.name);
422
+ if (definition?.onRestart) {
423
+ await definition.onRestart(oldProcess, newProcess, oldProcess.restartCount + 1);
424
+ }
425
+
426
+ await this.emitEvent("process.restarted", {
427
+ oldProcessId: processId,
428
+ newProcessId,
429
+ name: oldProcess.name,
430
+ attempt: oldProcess.restartCount + 1,
431
+ });
432
+ }
433
+
434
+ return newProcessId;
435
+ }
436
+
437
  /** Look up a single process record by ID; null when unknown. */
  async get(processId: string): Promise<ManagedProcess | null> {
    return this.adapter.get(processId);
  }

  /** All process records spawned from the named definition. */
  async getByName(name: string): Promise<ManagedProcess[]> {
    return this.adapter.getByName(name);
  }

  /** All records the adapter currently considers running. */
  async getRunning(): Promise<ManagedProcess[]> {
    return this.adapter.getRunning();
  }

  /** Forward a message to the process over its socket; false if undeliverable. */
  async send(processId: string, message: any): Promise<boolean> {
    return this.socketServer.send(processId, message);
  }
452
+
453
+ start(): void {
454
+ // Recover orphaned processes
455
+ if (this.autoRecoverOrphans) {
456
+ this.reconcileOrphans().catch((err) => {
457
+ console.error("[Processes] Error recovering orphans:", err);
458
+ });
459
+ }
460
+
461
+ // Start heartbeat monitoring
462
+ this.startHeartbeatMonitor();
463
+
464
+ console.log("[Processes] Service started");
465
+ }
466
+
467
  /**
   * Shut down the service and every managed process.
   *
   * Teardown order matters: the shutdown flag and monitor/restart cancellation
   * come first so no new restarts are scheduled while processes are being
   * stopped; the socket server and adapter are closed last since stop() needs
   * both.
   */
  async shutdown(): Promise<void> {
    this.isShuttingDown = true;
    console.log("[Processes] Shutting down...");

    // Stop heartbeat monitor
    if (this.heartbeatMonitor) {
      clearInterval(this.heartbeatMonitor);
      this.heartbeatMonitor = undefined;
    }

    // Cancel pending restarts
    for (const timeout of this.pendingRestarts.values()) {
      clearTimeout(timeout);
    }
    this.pendingRestarts.clear();

    // Stop all running processes (in parallel; each stop has its own timeout)
    const running = await this.adapter.getRunning();
    await Promise.all(running.map((p) => this.stop(p.id)));

    // Shutdown socket server
    await this.socketServer.shutdown();

    // Stop adapter
    this.adapter.stop();

    console.log("[Processes] Shutdown complete");
  }
495
+
496
+ // ============================================
497
+ // Private Methods
498
+ // ============================================
499
+
500
  /**
   * Handle an inbound socket message from a child process.
   *
   * Heartbeats only refresh `lastHeartbeat`; all other messages are emitted as
   * "process.message" events and forwarded to the definition's onMessage hook.
   * Messages for unknown process IDs are silently dropped.
   */
  private async handleMessage(message: ProcessMessage): Promise<void> {
    const { processId, type } = message;
    const proc = await this.adapter.get(processId);
    if (!proc) return;

    // Handle heartbeat messages
    if (type === "heartbeat") {
      await this.adapter.update(processId, { lastHeartbeat: new Date() });
      return;
    }

    // Emit generic message event
    await this.emitEvent("process.message", {
      processId,
      name: proc.name,
      message,
    });

    // Call definition callback
    const definition = this.definitions.get(proc.name);
    if (definition?.onMessage) {
      await definition.onMessage(proc, message);
    }
  }

  /** Log-only hook for a child connecting to its socket. */
  private handleConnect(processId: string): void {
    console.log(`[Processes] Socket connected: ${processId}`);
  }

  /** Log-only hook for socket disconnect; crash detection is handleExit's job. */
  private async handleDisconnect(processId: string): Promise<void> {
    console.log(`[Processes] Socket disconnected: ${processId}`);
    // Socket disconnect doesn't mean process crashed - wait for exit handler
  }

  /** Log-only hook for socket-level errors (processId absent for server errors). */
  private handleError(error: Error, processId?: string): void {
    console.error(`[Processes] Socket error${processId ? ` for ${processId}` : ""}:`, error.message);
  }
537
+
538
  /**
   * React to a subprocess exiting (wired to Bun's `proc.exited` in spawn).
   *
   * Intentional exits are filtered out by the "stopping"/"stopped" status that
   * stop()/kill() set before signalling; everything else is treated as a crash:
   * the record is updated, IPC is torn down, the crash event/hook fire, and
   * auto-restart is considered with the incremented failure count.
   */
  private async handleExit(processId: string, exitCode: number | null): Promise<void> {
    // During shutdown every exit is expected; do nothing.
    if (this.isShuttingDown) return;

    const proc = await this.adapter.get(processId);
    if (!proc) return;

    // If we're intentionally stopping, don't treat as crash
    if (proc.status === "stopping" || proc.status === "stopped") {
      return;
    }

    console.log(`[Processes] Process ${proc.name} (${processId}) exited with code ${exitCode}`);

    // Increment consecutive failures
    const newConsecutiveFailures = proc.consecutiveFailures + 1;

    // Unexpected crash
    await this.adapter.update(processId, {
      status: "crashed",
      stoppedAt: new Date(),
      consecutiveFailures: newConsecutiveFailures,
      error: `Exited with code ${exitCode}`,
    });

    // Cleanup
    await this.socketServer.closeSocket(processId);
    this.subprocesses.delete(processId);

    // Emit event
    await this.emitEvent("process.crashed", {
      processId,
      name: proc.name,
      exitCode,
    });

    // Call definition callback
    const definition = this.definitions.get(proc.name);
    if (definition?.onCrash) {
      await definition.onCrash(proc, exitCode);
    }

    // Handle auto-restart with updated consecutive failures (proc is stale;
    // the fresh count was only written to the DB above)
    const updatedProc = { ...proc, consecutiveFailures: newConsecutiveFailures };
    await this.handleAutoRestart(processId, updatedProc);
  }
583
+
584
  /**
   * Schedule an auto-restart for a crashed/orphaned process, if its config
   * allows it.
   *
   * Marks the record "dead" once consecutiveFailures reaches maxRestarts
   * (unless -1 = unlimited). Otherwise schedules a spawn after a jittered
   * exponential backoff; the pending timeout is tracked so shutdown() can
   * cancel it. Spawn failures inside the timer are logged, not rethrown —
   * there is no caller awaiting a setTimeout callback.
   */
  private async handleAutoRestart(processId: string, proc: ManagedProcess): Promise<void> {
    const config = proc.config;
    if (!config.autoRestart) return;

    const maxRestarts = config.maxRestarts ?? 10;
    if (maxRestarts !== -1 && proc.consecutiveFailures >= maxRestarts) {
      console.log(`[Processes] ${proc.name} (${processId}) reached max restarts (${maxRestarts})`);
      await this.adapter.update(processId, { status: "dead" });
      return;
    }

    // Calculate backoff delay
    const delay = calculateBackoff(proc.consecutiveFailures, config.backoff);
    console.log(`[Processes] Scheduling restart of ${proc.name} in ${Math.round(delay)}ms`);

    // Schedule restart
    const timeout = setTimeout(async () => {
      this.pendingRestarts.delete(processId);

      try {
        const newProcessId = await this.spawn(proc.name, {
          configOverrides: proc.config,
          metadata: proc.metadata,
        });

        // Preserve restart count; consecutiveFailures carries over so repeated
        // crashes keep growing the backoff until a healthy run resets it
        await this.adapter.update(newProcessId, {
          restartCount: proc.restartCount + 1,
          consecutiveFailures: proc.consecutiveFailures,
        });

        const newProcess = await this.adapter.get(newProcessId);
        const definition = this.definitions.get(proc.name);
        if (definition?.onRestart && newProcess) {
          await definition.onRestart(proc, newProcess, proc.restartCount + 1);
        }

        await this.emitEvent("process.restarted", {
          oldProcessId: processId,
          newProcessId,
          name: proc.name,
          attempt: proc.restartCount + 1,
        });
      } catch (err) {
        console.error(`[Processes] Failed to restart ${proc.name}:`, err);
      }
    }, delay);

    this.pendingRestarts.set(processId, timeout);
  }
634
+
635
  /**
   * Startup recovery: reconcile DB records left behind by a previous
   * supervisor run.
   *
   * For each orphaned record: if the PID is still alive, try to reattach over
   * its recorded socket endpoint; on success the record goes back to
   * "running", otherwise it is marked "orphaned" (and, when autoRestart is
   * configured, the stray OS process is killed and a restart scheduled). Dead
   * PIDs are marked "dead" and their endpoints released, with the same
   * optional restart.
   *
   * Socket endpoints of known orphans are reserved up front so the subsequent
   * orphan-socket cleanup does not delete files we may still reconnect to.
   */
  private async reconcileOrphans(): Promise<void> {
    console.log("[Processes] Checking for orphaned processes...");
    const orphaned = await this.adapter.getOrphaned();

    if (orphaned.length === 0) {
      console.log("[Processes] No orphaned processes found");
      return;
    }

    // Reserve socket paths for potential reconnection
    const activeIds = new Set<string>();
    for (const proc of orphaned) {
      if (proc.socketPath || proc.tcpPort) {
        this.socketServer.reserve(proc.id, proc.socketPath, proc.tcpPort);
        activeIds.add(proc.id);
      }
    }

    // Clean orphaned socket files not belonging to our processes
    await this.socketServer.cleanOrphanedSockets(activeIds);

    for (const proc of orphaned) {
      if (proc.pid && isProcessAlive(proc.pid)) {
        // Process is still running! Try to reconnect
        console.log(`[Processes] Found orphaned process ${proc.name} (${proc.id}) with PID ${proc.pid}`);

        const reconnected = await this.socketServer.reconnect(
          proc.id,
          proc.socketPath,
          proc.tcpPort
        );

        if (reconnected) {
          await this.adapter.update(proc.id, { status: "running" });
          await this.emitEvent("process.reconnected", {
            processId: proc.id,
            name: proc.name,
            pid: proc.pid,
          });
          console.log(`[Processes] Reconnected to ${proc.name} (${proc.id})`);
        } else {
          // Couldn't reconnect, mark as orphaned
          await this.adapter.update(proc.id, { status: "orphaned" });
          console.log(`[Processes] Could not reconnect to ${proc.name} (${proc.id}), marked as orphaned`);

          // Try auto-restart if configured
          const definition = this.definitions.get(proc.name);
          if (definition?.config.autoRestart) {
            console.log(`[Processes] Killing orphaned process and restarting ${proc.name}`);
            try {
              process.kill(proc.pid, "SIGKILL");
            } catch {
              // Process may have already exited
            }
            // NOTE(review): proc.consecutiveFailures here is whatever the old
            // supervisor last persisted — confirm that is the intended backoff input
            await this.handleAutoRestart(proc.id, proc);
          }
        }
      } else {
        // Process is dead
        console.log(`[Processes] Orphaned process ${proc.name} (${proc.id}) is dead`);
        await this.adapter.update(proc.id, { status: "dead", stoppedAt: new Date() });
        this.socketServer.release(proc.id);

        // Try auto-restart if configured
        const definition = this.definitions.get(proc.name);
        if (definition?.config.autoRestart) {
          await this.handleAutoRestart(proc.id, proc);
        }
      }
    }
  }
706
+
707
+ private startHeartbeatMonitor(): void {
708
+ this.heartbeatMonitor = setInterval(async () => {
709
+ if (this.isShuttingDown) return;
710
+
711
+ const running = await this.adapter.getRunning();
712
+ const now = Date.now();
713
+
714
+ for (const proc of running) {
715
+ const heartbeatConfig = proc.config.heartbeat;
716
+ if (!heartbeatConfig) continue;
717
+
718
+ const timeoutMs = heartbeatConfig.timeoutMs ?? 60000;
719
+ const lastHeartbeat = proc.lastHeartbeat?.getTime() ?? proc.startedAt?.getTime() ?? 0;
720
+
721
+ if (now - lastHeartbeat > timeoutMs) {
722
+ console.warn(`[Processes] Heartbeat missed for ${proc.name} (${proc.id})`);
723
+
724
+ await this.emitEvent("process.heartbeat_missed", {
725
+ processId: proc.id,
726
+ name: proc.name,
727
+ });
728
+
729
+ const definition = this.definitions.get(proc.name);
730
+ if (definition?.onUnhealthy) {
731
+ await definition.onUnhealthy(proc);
732
+ }
733
+
734
+ // If heartbeat is way overdue (2x timeout), kill and restart
735
+ if (now - lastHeartbeat > timeoutMs * 2) {
736
+ console.warn(`[Processes] Killing unresponsive process ${proc.name} (${proc.id})`);
737
+ await this.kill(proc.id);
738
+ // handleExit will trigger auto-restart if configured
739
+ }
740
+ }
741
+ }
742
+ }, this.heartbeatCheckInterval);
743
+ }
744
+
745
  /**
   * Emit a named event through the optional Events service.
   * No-op when the service was constructed without one.
   */
  private async emitEvent(eventName: string, data: any): Promise<void> {
    if (this.events) {
      await this.events.emit(eventName, data);
    }
  }
}
751
+
752
+ // ============================================
753
+ // Factory Function
754
+ // ============================================
755
+
756
+ export function createProcesses(config?: ProcessesConfig): Processes {
757
+ return new ProcessesImpl(config);
758
+ }