@donkeylabs/server 0.5.1 → 0.6.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/docs/router.md +93 -0
- package/package.json +2 -2
- package/src/core/index.ts +24 -0
- package/src/core/process-adapter-sqlite.ts +282 -0
- package/src/core/process-socket.ts +521 -0
- package/src/core/processes.ts +758 -0
- package/src/core.ts +2 -0
- package/src/harness.ts +3 -0
- package/src/server.ts +32 -3
|
@@ -0,0 +1,758 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Processes Core Service
|
|
3
|
+
*
|
|
4
|
+
* Manages persistent, long-running processes (daemons) with supervision capabilities.
|
|
5
|
+
* Use cases: FFmpeg subprocesses, Firecracker VMs, any long-running daemon requiring supervision.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type { Subprocess } from "bun";
|
|
9
|
+
import type { Events } from "./events";
|
|
10
|
+
import {
|
|
11
|
+
createProcessSocketServer,
|
|
12
|
+
type ProcessSocketServer,
|
|
13
|
+
type ProcessMessage,
|
|
14
|
+
type ProcessSocketConfig,
|
|
15
|
+
} from "./process-socket";
|
|
16
|
+
import {
|
|
17
|
+
SqliteProcessAdapter,
|
|
18
|
+
type ProcessAdapter,
|
|
19
|
+
type SqliteProcessAdapterConfig,
|
|
20
|
+
} from "./process-adapter-sqlite";
|
|
21
|
+
|
|
22
|
+
// ============================================
|
|
23
|
+
// Types
|
|
24
|
+
// ============================================
|
|
25
|
+
|
|
26
|
+
/**
 * Lifecycle state of a managed process.
 *
 * Transitions implemented in this module:
 *  - "spawning"  -> "running"  once Bun.spawn succeeds
 *  - "running"   -> "stopping" -> "stopped" via stop()/kill()
 *  - "running"   -> "crashed"  on an unexpected exit
 *  - "crashed"   -> "dead"     when maxRestarts is exhausted
 *  - startup recovery marks records "orphaned" (PID alive but socket
 *    reconnect failed) or "dead" (PID gone)
 */
export type ProcessStatus =
  | "spawning"   // DB record created; Bun.spawn not yet completed
  | "running"    // spawned (or reconnected) and under supervision
  | "stopping"   // stop()/kill() in progress; suppresses crash handling
  | "stopped"    // intentionally stopped via stop() or kill()
  | "crashed"    // exited unexpectedly while "running"
  | "orphaned"   // PID still alive after supervisor restart, but reconnect failed
  | "dead";      // unrecoverable: restart budget exhausted or orphan PID not alive
|
34
|
+
|
|
35
|
+
/**
 * Static configuration for launching and supervising one process.
 * Persisted with the process record via the adapter, so supervision
 * settings survive a supervisor restart.
 */
export interface ProcessConfig {
  /** Command to execute (e.g., "ffmpeg", "python", "./script.sh") */
  command: string;
  /** Arguments to pass to the command */
  args?: string[];
  /** Working directory for the process */
  cwd?: string;
  /** Environment variables to set (merged over the definition's env at spawn) */
  env?: Record<string, string>;
  /** Auto-restart on crash (default: false) */
  autoRestart?: boolean;
  /** Maximum number of restarts before giving up (default: 10, -1 for unlimited) */
  maxRestarts?: number;
  /** Backoff configuration for restarts (exponential with jitter) */
  backoff?: {
    /** Initial delay in ms (default: 1000) */
    initialDelayMs?: number;
    /** Maximum delay in ms (default: 30000) */
    maxDelayMs?: number;
    /** Multiplier for exponential backoff (default: 2) */
    multiplier?: number;
  };
  /** Heartbeat configuration; when omitted, the heartbeat monitor skips this process */
  heartbeat?: {
    /** Expected interval between heartbeats in ms (default: 30000) */
    intervalMs?: number;
    /** Timeout before considering unhealthy in ms (default: 60000) */
    timeoutMs?: number;
  };
}
|
|
65
|
+
|
|
66
|
+
/**
 * Runtime record of a supervised process, as persisted by the adapter.
 */
export interface ManagedProcess {
  /** Unique identifier assigned when the adapter creates the record */
  id: string;
  /** Name of the ProcessDefinition this instance was spawned from */
  name: string;
  /** OS process id; set once Bun.spawn succeeds */
  pid?: number;
  /** Unix socket path for IPC, when the socket server provided one */
  socketPath?: string;
  /** TCP port for IPC, when the socket server provided one */
  tcpPort?: number;
  /** Current lifecycle state (see ProcessStatus) */
  status: ProcessStatus;
  /** Effective config captured at spawn time (definition + overrides) */
  config: ProcessConfig;
  /** Arbitrary caller-supplied metadata from SpawnOptions */
  metadata?: Record<string, any>;
  /** When the DB record was created (before the actual spawn) */
  createdAt: Date;
  /** When the process transitioned to "running" */
  startedAt?: Date;
  /** When the process stopped, crashed, or was found dead */
  stoppedAt?: Date;
  /** Timestamp of the last heartbeat message received over the socket */
  lastHeartbeat?: Date;
  /** Total restarts carried across this process's lineage */
  restartCount: number;
  /** Crashes since the last healthy run; drives backoff and maxRestarts */
  consecutiveFailures: number;
  /** Last error message (spawn failure text or exit-code description) */
  error?: string;
}
|
|
83
|
+
|
|
84
|
+
/**
 * A named process template plus lifecycle callbacks. Register one with
 * Processes.register() before calling spawn() with the same name.
 * Callbacks may be async; the service awaits them.
 */
export interface ProcessDefinition {
  /** Unique name used to look up this definition in spawn() */
  name: string;
  /** Base configuration; spawn() may override fields per instance */
  config: Omit<ProcessConfig, "args"> & { args?: string[] };
  /** Called when a (non-heartbeat) message is received from the process */
  onMessage?: (process: ManagedProcess, message: any) => void | Promise<void>;
  /** Called when the process crashes unexpectedly */
  onCrash?: (process: ManagedProcess, exitCode: number | null) => void | Promise<void>;
  /** Called when heartbeat is missed */
  onUnhealthy?: (process: ManagedProcess) => void | Promise<void>;
  /** Called when the process is restarted (manually or by auto-restart) */
  onRestart?: (oldProcess: ManagedProcess, newProcess: ManagedProcess, attempt: number) => void | Promise<void>;
}
|
|
96
|
+
|
|
97
|
+
/**
 * Per-spawn options applied on top of a registered ProcessDefinition.
 */
export interface SpawnOptions {
  /** Override config fields for this spawn (env is merged over the definition's; args replace) */
  configOverrides?: Partial<ProcessConfig>;
  /** Metadata to store with the process record */
  metadata?: Record<string, any>;
}
|
|
103
|
+
|
|
104
|
+
// ============================================
|
|
105
|
+
// Configuration
|
|
106
|
+
// ============================================
|
|
107
|
+
|
|
108
|
+
/**
 * Construction options for the Processes service.
 */
export interface ProcessesConfig {
  /** SQLite adapter configuration */
  adapter?: SqliteProcessAdapterConfig;
  /** Socket server configuration */
  socket?: ProcessSocketConfig;
  /** Events service for emitting process events (omit to disable event emission) */
  events?: Events;
  /** Heartbeat check interval in ms (default: 10000) */
  heartbeatCheckInterval?: number;
  /** Enable auto-reconnect to orphaned processes on startup (default: true) */
  autoRecoverOrphans?: boolean;
}
|
|
120
|
+
|
|
121
|
+
// ============================================
|
|
122
|
+
// Service Interface
|
|
123
|
+
// ============================================
|
|
124
|
+
|
|
125
|
+
/**
 * Public API of the process supervision service.
 * Typical flow: register() definitions, start() the service, then spawn().
 */
export interface Processes {
  /** Register a process definition (overwrites an existing one with the same name) */
  register(definition: ProcessDefinition): void;
  /** Spawn a new process instance; resolves to the new process id */
  spawn(name: string, options?: SpawnOptions): Promise<string>;
  /** Gracefully stop a process (SIGTERM, escalating to SIGKILL on timeout) */
  stop(processId: string): Promise<boolean>;
  /** Force kill a process (SIGKILL) */
  kill(processId: string): Promise<boolean>;
  /** Restart a process; resolves to the replacement process id */
  restart(processId: string): Promise<string>;
  /** Get a process by ID */
  get(processId: string): Promise<ManagedProcess | null>;
  /** Get all processes by name */
  getByName(name: string): Promise<ManagedProcess[]>;
  /** Get all running processes */
  getRunning(): Promise<ManagedProcess[]>;
  /** Send a message to a process via socket; resolves false if undeliverable */
  send(processId: string, message: any): Promise<boolean>;
  /** Start the service (orphan recovery, heartbeat monitoring) */
  start(): void;
  /** Shutdown the service and all managed processes */
  shutdown(): Promise<void>;
}
|
|
149
|
+
|
|
150
|
+
// ============================================
|
|
151
|
+
// Helper Functions
|
|
152
|
+
// ============================================
|
|
153
|
+
|
|
154
|
+
/**
|
|
155
|
+
* Check if a process with given PID is still alive
|
|
156
|
+
*/
|
|
157
|
+
export function isProcessAlive(pid: number): boolean {
|
|
158
|
+
try {
|
|
159
|
+
// Sending signal 0 doesn't actually send a signal,
|
|
160
|
+
// it just checks if the process exists and we have permission to signal it
|
|
161
|
+
process.kill(pid, 0);
|
|
162
|
+
return true;
|
|
163
|
+
} catch {
|
|
164
|
+
return false;
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
/**
|
|
169
|
+
* Calculate backoff delay with jitter
|
|
170
|
+
*/
|
|
171
|
+
function calculateBackoff(
|
|
172
|
+
consecutiveFailures: number,
|
|
173
|
+
config: ProcessConfig["backoff"]
|
|
174
|
+
): number {
|
|
175
|
+
const initialDelay = config?.initialDelayMs ?? 1000;
|
|
176
|
+
const maxDelay = config?.maxDelayMs ?? 30000;
|
|
177
|
+
const multiplier = config?.multiplier ?? 2;
|
|
178
|
+
|
|
179
|
+
const delay = Math.min(
|
|
180
|
+
initialDelay * Math.pow(multiplier, consecutiveFailures),
|
|
181
|
+
maxDelay
|
|
182
|
+
);
|
|
183
|
+
|
|
184
|
+
// Add jitter (0.5 to 1.5x the delay)
|
|
185
|
+
return delay * (0.5 + Math.random());
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
// ============================================
|
|
189
|
+
// Implementation
|
|
190
|
+
// ============================================
|
|
191
|
+
|
|
192
|
+
/**
 * Default Processes implementation.
 *
 * Responsibilities:
 *  - spawns and tracks Bun subprocesses, persisting state via a
 *    SqliteProcessAdapter so records survive supervisor restarts;
 *  - runs a socket server for per-process IPC (heartbeats + messages);
 *  - detects crashes via the subprocess exit promise and schedules
 *    auto-restarts with exponential backoff;
 *  - on start(), reconciles "orphaned" records left by a previous
 *    supervisor run (reconnect, restart, or mark dead);
 *  - periodically checks heartbeats and kills unresponsive processes.
 */
export class ProcessesImpl implements Processes {
  // Registered templates, keyed by definition name.
  private definitions = new Map<string, ProcessDefinition>();
  private adapter: ProcessAdapter;
  private socketServer: ProcessSocketServer;
  private events?: Events;
  private heartbeatCheckInterval: number;
  private autoRecoverOrphans: boolean;

  // Track running Bun subprocesses (only those spawned by THIS supervisor
  // instance; reconnected orphans have no entry here).
  private subprocesses = new Map<string, Subprocess>();
  // Track pending restarts (processId -> timeout)
  private pendingRestarts = new Map<string, ReturnType<typeof setTimeout>>();
  // Heartbeat monitor interval
  private heartbeatMonitor?: ReturnType<typeof setInterval>;
  // Shutdown flag; suppresses crash handling and heartbeat checks.
  private isShuttingDown = false;

  constructor(config: ProcessesConfig = {}) {
    this.adapter = new SqliteProcessAdapter(config.adapter);
    this.events = config.events;
    this.heartbeatCheckInterval = config.heartbeatCheckInterval ?? 10000;
    this.autoRecoverOrphans = config.autoRecoverOrphans ?? true;

    // Create socket server with callbacks
    this.socketServer = createProcessSocketServer(config.socket ?? {}, {
      onMessage: (message) => this.handleMessage(message),
      onConnect: (processId) => this.handleConnect(processId),
      onDisconnect: (processId) => this.handleDisconnect(processId),
      onError: (error, processId) => this.handleError(error, processId),
    });
  }

  /** Register (or overwrite) a process definition under its name. */
  register(definition: ProcessDefinition): void {
    if (this.definitions.has(definition.name)) {
      console.warn(`[Processes] Overwriting existing definition for '${definition.name}'`);
    }
    this.definitions.set(definition.name, definition);
    console.log(`[Processes] Registered process definition: ${definition.name}`);
  }

  /**
   * Spawn a new instance of a registered definition.
   *
   * The DB record is created with status "spawning" BEFORE the actual
   * spawn, so a supervisor crash mid-spawn leaves a recoverable record.
   * Socket coordinates are passed to the child via DONKEYLABS_* env vars.
   *
   * @returns the new process id
   * @throws if the definition is unknown or the spawn fails (the record
   *         is then marked "crashed" and its socket closed)
   */
  async spawn(name: string, options?: SpawnOptions): Promise<string> {
    const definition = this.definitions.get(name);
    if (!definition) {
      throw new Error(`Process definition '${name}' not found. Did you call register()?`);
    }

    // Merge config with overrides: shallow spread, but env is merged
    // key-by-key and args are taken wholesale from overrides when present.
    const config: ProcessConfig = {
      ...definition.config,
      ...options?.configOverrides,
      args: options?.configOverrides?.args ?? definition.config.args,
      env: {
        ...definition.config.env,
        ...options?.configOverrides?.env,
      },
    };

    // Create DB record with status "spawning" (before spawn for crash recovery)
    const process = await this.adapter.create({
      name,
      status: "spawning",
      config,
      metadata: options?.metadata,
      createdAt: new Date(),
      restartCount: 0,
      consecutiveFailures: 0,
    });

    try {
      // Create Unix socket (or TCP port, depending on socket server config)
      const { socketPath, tcpPort } = await this.socketServer.createSocket(process.id);

      // Build environment with socket info so the child can connect back.
      const env: Record<string, string> = {
        ...config.env,
        DONKEYLABS_PROCESS_ID: process.id,
      };
      if (socketPath) {
        env.DONKEYLABS_SOCKET_PATH = socketPath;
      }
      if (tcpPort) {
        env.DONKEYLABS_TCP_PORT = tcpPort.toString();
      }

      // Spawn the process; stdout/stderr pass through to the supervisor's.
      const proc = Bun.spawn([config.command, ...(config.args || [])], {
        cwd: config.cwd,
        env,
        stdout: "inherit",
        stderr: "inherit",
      });

      // Store subprocess reference
      this.subprocesses.set(process.id, proc);

      // Update DB with PID and status
      await this.adapter.update(process.id, {
        pid: proc.pid,
        socketPath,
        tcpPort,
        status: "running",
        startedAt: new Date(),
      });

      // Emit event
      await this.emitEvent("process.spawned", {
        processId: process.id,
        name,
        pid: proc.pid,
      });

      // Set up exit handler for crash detection.
      // NOTE(review): no .catch() here — if handleExit rejects, this becomes
      // an unhandled promise rejection; consider adding a catch with logging.
      proc.exited.then((exitCode) => this.handleExit(process.id, exitCode));

      console.log(`[Processes] Spawned ${name} (${process.id}) with PID ${proc.pid}`);
      return process.id;
    } catch (error) {
      // Cleanup on spawn failure
      await this.adapter.update(process.id, {
        status: "crashed",
        error: error instanceof Error ? error.message : String(error),
        stoppedAt: new Date(),
      });
      await this.socketServer.closeSocket(process.id);
      throw error;
    }
  }

  /**
   * Gracefully stop a process: SIGTERM, wait up to 5s, then SIGKILL.
   * Setting status to "stopping" first makes handleExit treat the exit
   * as intentional (no crash handling / auto-restart).
   *
   * NOTE(review): only processes spawned by THIS supervisor instance have
   * a Subprocess handle; for reconnected orphans the signal block is
   * skipped and the record is simply marked "stopped" — confirm intended.
   *
   * @returns false if the process is unknown or not running/spawning
   */
  async stop(processId: string): Promise<boolean> {
    const proc = await this.adapter.get(processId);
    if (!proc) return false;

    if (proc.status !== "running" && proc.status !== "spawning") {
      return false;
    }

    // Mark as stopping to prevent auto-restart
    await this.adapter.update(processId, { status: "stopping" });

    const subprocess = this.subprocesses.get(processId);
    if (subprocess && proc.pid) {
      try {
        // Send SIGTERM for graceful shutdown
        subprocess.kill("SIGTERM");

        // Wait for process to exit (with timeout)
        const exitPromise = subprocess.exited;
        const timeoutPromise = new Promise<null>((resolve) =>
          setTimeout(() => resolve(null), 5000)
        );

        const result = await Promise.race([exitPromise, timeoutPromise]);
        if (result === null) {
          // Timed out, force kill
          subprocess.kill("SIGKILL");
          await subprocess.exited;
        }
      } catch {
        // Process may have already exited
      }
    }

    // Cleanup
    await this.socketServer.closeSocket(processId);
    this.subprocesses.delete(processId);

    await this.adapter.update(processId, {
      status: "stopped",
      stoppedAt: new Date(),
    });

    await this.emitEvent("process.stopped", { processId, name: proc.name });
    console.log(`[Processes] Stopped ${proc.name} (${processId})`);
    return true;
  }

  /**
   * Force kill a process with SIGKILL (no grace period).
   * Like stop(), marks the record "stopping" first so handleExit does not
   * treat the exit as a crash, then finalizes it as "stopped".
   *
   * @returns false only if the process record does not exist
   */
  async kill(processId: string): Promise<boolean> {
    const proc = await this.adapter.get(processId);
    if (!proc) return false;

    // Mark as stopping to prevent auto-restart
    await this.adapter.update(processId, { status: "stopping" });

    const subprocess = this.subprocesses.get(processId);
    if (subprocess && proc.pid) {
      try {
        subprocess.kill("SIGKILL");
        await subprocess.exited;
      } catch {
        // Process may have already exited
      }
    }

    // Cleanup
    await this.socketServer.closeSocket(processId);
    this.subprocesses.delete(processId);

    await this.adapter.update(processId, {
      status: "stopped",
      stoppedAt: new Date(),
    });

    await this.emitEvent("process.stopped", { processId, name: proc.name });
    console.log(`[Processes] Killed ${proc.name} (${processId})`);
    return true;
  }

  /**
   * Manually restart a process: stop the old instance, spawn a new one
   * with the old instance's captured config and metadata, carry the
   * restart count forward, and fire onRestart plus "process.restarted".
   *
   * @returns the replacement process id
   * @throws if the process record does not exist
   */
  async restart(processId: string): Promise<string> {
    const oldProcess = await this.adapter.get(processId);
    if (!oldProcess) {
      throw new Error(`Process ${processId} not found`);
    }

    // Stop the old process
    await this.stop(processId);

    // Spawn new instance with same config
    const newProcessId = await this.spawn(oldProcess.name, {
      configOverrides: oldProcess.config,
      metadata: oldProcess.metadata,
    });

    const newProcess = await this.adapter.get(newProcessId);
    if (newProcess) {
      // Update restart count (spawn() created the record with 0)
      await this.adapter.update(newProcessId, {
        restartCount: oldProcess.restartCount + 1,
      });

      const definition = this.definitions.get(oldProcess.name);
      if (definition?.onRestart) {
        await definition.onRestart(oldProcess, newProcess, oldProcess.restartCount + 1);
      }

      await this.emitEvent("process.restarted", {
        oldProcessId: processId,
        newProcessId,
        name: oldProcess.name,
        attempt: oldProcess.restartCount + 1,
      });
    }

    return newProcessId;
  }

  /** Look up a process record by id (null if unknown). */
  async get(processId: string): Promise<ManagedProcess | null> {
    return this.adapter.get(processId);
  }

  /** All process records spawned from the named definition. */
  async getByName(name: string): Promise<ManagedProcess[]> {
    return this.adapter.getByName(name);
  }

  /** All records the adapter currently considers running. */
  async getRunning(): Promise<ManagedProcess[]> {
    return this.adapter.getRunning();
  }

  /** Send a message to a process over its socket; false if undeliverable. */
  async send(processId: string, message: any): Promise<boolean> {
    return this.socketServer.send(processId, message);
  }

  /**
   * Start supervision: kick off orphan recovery (fire-and-forget) and the
   * heartbeat monitor. Safe to call before definitions are registered,
   * but orphan auto-restart needs the matching definition to be present.
   */
  start(): void {
    // Recover orphaned processes
    if (this.autoRecoverOrphans) {
      this.reconcileOrphans().catch((err) => {
        console.error("[Processes] Error recovering orphans:", err);
      });
    }

    // Start heartbeat monitoring
    this.startHeartbeatMonitor();

    console.log("[Processes] Service started");
  }

  /**
   * Shut everything down: stop monitoring, cancel scheduled restarts,
   * stop all running processes in parallel, then close the socket server
   * and the adapter. Sets isShuttingDown first so exit handlers firing
   * during teardown do not schedule new restarts.
   */
  async shutdown(): Promise<void> {
    this.isShuttingDown = true;
    console.log("[Processes] Shutting down...");

    // Stop heartbeat monitor
    if (this.heartbeatMonitor) {
      clearInterval(this.heartbeatMonitor);
      this.heartbeatMonitor = undefined;
    }

    // Cancel pending restarts
    for (const timeout of this.pendingRestarts.values()) {
      clearTimeout(timeout);
    }
    this.pendingRestarts.clear();

    // Stop all running processes
    const running = await this.adapter.getRunning();
    await Promise.all(running.map((p) => this.stop(p.id)));

    // Shutdown socket server
    await this.socketServer.shutdown();

    // Stop adapter
    this.adapter.stop();

    console.log("[Processes] Shutdown complete");
  }

  // ============================================
  // Private Methods
  // ============================================

  /**
   * Dispatch an inbound socket message: heartbeats update lastHeartbeat
   * and stop there; anything else is re-emitted as "process.message" and
   * forwarded to the definition's onMessage callback.
   */
  private async handleMessage(message: ProcessMessage): Promise<void> {
    const { processId, type } = message;
    const proc = await this.adapter.get(processId);
    if (!proc) return;

    // Handle heartbeat messages
    if (type === "heartbeat") {
      await this.adapter.update(processId, { lastHeartbeat: new Date() });
      return;
    }

    // Emit generic message event
    await this.emitEvent("process.message", {
      processId,
      name: proc.name,
      message,
    });

    // Call definition callback
    const definition = this.definitions.get(proc.name);
    if (definition?.onMessage) {
      await definition.onMessage(proc, message);
    }
  }

  /** Log socket connections; no state change needed on connect. */
  private handleConnect(processId: string): void {
    console.log(`[Processes] Socket connected: ${processId}`);
  }

  /** Log socket disconnects; liveness is decided by the exit handler. */
  private async handleDisconnect(processId: string): Promise<void> {
    console.log(`[Processes] Socket disconnected: ${processId}`);
    // Socket disconnect doesn't mean process crashed - wait for exit handler
  }

  /** Log socket-level errors (no recovery attempted here). */
  private handleError(error: Error, processId?: string): void {
    console.error(`[Processes] Socket error${processId ? ` for ${processId}` : ""}:`, error.message);
  }

  /**
   * Exit handler wired to each subprocess's exited promise.
   * Intentional stops (status "stopping"/"stopped") are ignored; anything
   * else is recorded as a crash, cleaned up, reported via event + onCrash,
   * and handed to the auto-restart logic.
   */
  private async handleExit(processId: string, exitCode: number | null): Promise<void> {
    if (this.isShuttingDown) return;

    const proc = await this.adapter.get(processId);
    if (!proc) return;

    // If we're intentionally stopping, don't treat as crash
    if (proc.status === "stopping" || proc.status === "stopped") {
      return;
    }

    console.log(`[Processes] Process ${proc.name} (${processId}) exited with code ${exitCode}`);

    // Increment consecutive failures
    const newConsecutiveFailures = proc.consecutiveFailures + 1;

    // Unexpected crash
    await this.adapter.update(processId, {
      status: "crashed",
      stoppedAt: new Date(),
      consecutiveFailures: newConsecutiveFailures,
      error: `Exited with code ${exitCode}`,
    });

    // Cleanup
    await this.socketServer.closeSocket(processId);
    this.subprocesses.delete(processId);

    // Emit event
    await this.emitEvent("process.crashed", {
      processId,
      name: proc.name,
      exitCode,
    });

    // Call definition callback
    const definition = this.definitions.get(proc.name);
    if (definition?.onCrash) {
      await definition.onCrash(proc, exitCode);
    }

    // Handle auto-restart with updated consecutive failures
    const updatedProc = { ...proc, consecutiveFailures: newConsecutiveFailures };
    await this.handleAutoRestart(processId, updatedProc);
  }

  /**
   * Schedule a restart for a crashed/orphaned process if its config asks
   * for it. Honors maxRestarts (marking the record "dead" when exhausted)
   * and delays the respawn by an exponential backoff with jitter. The
   * pending timer is tracked so shutdown() can cancel it.
   */
  private async handleAutoRestart(processId: string, proc: ManagedProcess): Promise<void> {
    const config = proc.config;
    if (!config.autoRestart) return;

    const maxRestarts = config.maxRestarts ?? 10;
    if (maxRestarts !== -1 && proc.consecutiveFailures >= maxRestarts) {
      console.log(`[Processes] ${proc.name} (${processId}) reached max restarts (${maxRestarts})`);
      await this.adapter.update(processId, { status: "dead" });
      return;
    }

    // Calculate backoff delay
    const delay = calculateBackoff(proc.consecutiveFailures, config.backoff);
    console.log(`[Processes] Scheduling restart of ${proc.name} in ${Math.round(delay)}ms`);

    // Schedule restart
    const timeout = setTimeout(async () => {
      this.pendingRestarts.delete(processId);

      try {
        const newProcessId = await this.spawn(proc.name, {
          configOverrides: proc.config,
          metadata: proc.metadata,
        });

        // Preserve restart count (and failure streak, so backoff keeps
        // growing until the process stays healthy)
        await this.adapter.update(newProcessId, {
          restartCount: proc.restartCount + 1,
          consecutiveFailures: proc.consecutiveFailures,
        });

        const newProcess = await this.adapter.get(newProcessId);
        const definition = this.definitions.get(proc.name);
        if (definition?.onRestart && newProcess) {
          await definition.onRestart(proc, newProcess, proc.restartCount + 1);
        }

        await this.emitEvent("process.restarted", {
          oldProcessId: processId,
          newProcessId,
          name: proc.name,
          attempt: proc.restartCount + 1,
        });
      } catch (err) {
        console.error(`[Processes] Failed to restart ${proc.name}:`, err);
      }
    }, delay);

    this.pendingRestarts.set(processId, timeout);
  }

  /**
   * Startup reconciliation of records left over from a previous supervisor
   * run. For each orphan: reserve its socket coordinates, then either
   * reconnect (PID alive + socket reachable), kill-and-restart (PID alive,
   * unreachable, autoRestart set), or mark dead (PID gone), restarting the
   * dead ones too when their definition asks for it.
   */
  private async reconcileOrphans(): Promise<void> {
    console.log("[Processes] Checking for orphaned processes...");
    const orphaned = await this.adapter.getOrphaned();

    if (orphaned.length === 0) {
      console.log("[Processes] No orphaned processes found");
      return;
    }

    // Reserve socket paths for potential reconnection
    const activeIds = new Set<string>();
    for (const proc of orphaned) {
      if (proc.socketPath || proc.tcpPort) {
        this.socketServer.reserve(proc.id, proc.socketPath, proc.tcpPort);
        activeIds.add(proc.id);
      }
    }

    // Clean orphaned socket files not belonging to our processes
    await this.socketServer.cleanOrphanedSockets(activeIds);

    for (const proc of orphaned) {
      if (proc.pid && isProcessAlive(proc.pid)) {
        // Process is still running! Try to reconnect
        console.log(`[Processes] Found orphaned process ${proc.name} (${proc.id}) with PID ${proc.pid}`);

        const reconnected = await this.socketServer.reconnect(
          proc.id,
          proc.socketPath,
          proc.tcpPort
        );

        if (reconnected) {
          await this.adapter.update(proc.id, { status: "running" });
          await this.emitEvent("process.reconnected", {
            processId: proc.id,
            name: proc.name,
            pid: proc.pid,
          });
          console.log(`[Processes] Reconnected to ${proc.name} (${proc.id})`);
        } else {
          // Couldn't reconnect, mark as orphaned
          await this.adapter.update(proc.id, { status: "orphaned" });
          console.log(`[Processes] Could not reconnect to ${proc.name} (${proc.id}), marked as orphaned`);

          // Try auto-restart if configured
          const definition = this.definitions.get(proc.name);
          if (definition?.config.autoRestart) {
            console.log(`[Processes] Killing orphaned process and restarting ${proc.name}`);
            try {
              process.kill(proc.pid, "SIGKILL");
            } catch {
              // Process may have already exited
            }
            await this.handleAutoRestart(proc.id, proc);
          }
        }
      } else {
        // Process is dead
        console.log(`[Processes] Orphaned process ${proc.name} (${proc.id}) is dead`);
        await this.adapter.update(proc.id, { status: "dead", stoppedAt: new Date() });
        this.socketServer.release(proc.id);

        // Try auto-restart if configured
        const definition = this.definitions.get(proc.name);
        if (definition?.config.autoRestart) {
          await this.handleAutoRestart(proc.id, proc);
        }
      }
    }
  }

  /**
   * Periodic heartbeat sweep. For each running process whose config has a
   * heartbeat section: if the last heartbeat (or startedAt, when none has
   * arrived yet) is older than timeoutMs, emit "process.heartbeat_missed"
   * and call onUnhealthy; past 2x timeoutMs, kill the process outright.
   */
  private startHeartbeatMonitor(): void {
    this.heartbeatMonitor = setInterval(async () => {
      if (this.isShuttingDown) return;

      const running = await this.adapter.getRunning();
      const now = Date.now();

      for (const proc of running) {
        const heartbeatConfig = proc.config.heartbeat;
        if (!heartbeatConfig) continue;

        const timeoutMs = heartbeatConfig.timeoutMs ?? 60000;
        const lastHeartbeat = proc.lastHeartbeat?.getTime() ?? proc.startedAt?.getTime() ?? 0;

        if (now - lastHeartbeat > timeoutMs) {
          console.warn(`[Processes] Heartbeat missed for ${proc.name} (${proc.id})`);

          await this.emitEvent("process.heartbeat_missed", {
            processId: proc.id,
            name: proc.name,
          });

          const definition = this.definitions.get(proc.name);
          if (definition?.onUnhealthy) {
            await definition.onUnhealthy(proc);
          }

          // If heartbeat is way overdue (2x timeout), kill and restart
          if (now - lastHeartbeat > timeoutMs * 2) {
            console.warn(`[Processes] Killing unresponsive process ${proc.name} (${proc.id})`);
            await this.kill(proc.id);
            // NOTE(review): kill() sets status to "stopping" before the exit
            // fires, so handleExit's stopping/stopped guard returns early and
            // NO auto-restart happens here, contrary to the original comment
            // ("handleExit will trigger auto-restart if configured").
            // Likely a bug — confirm intended behavior.
          }
        }
      }
    }, this.heartbeatCheckInterval);
  }

  /** Emit a named event through the optional Events service (no-op when unset). */
  private async emitEvent(eventName: string, data: any): Promise<void> {
    if (this.events) {
      await this.events.emit(eventName, data);
    }
  }
}
|
|
751
|
+
|
|
752
|
+
// ============================================
|
|
753
|
+
// Factory Function
|
|
754
|
+
// ============================================
|
|
755
|
+
|
|
756
|
+
export function createProcesses(config?: ProcessesConfig): Processes {
|
|
757
|
+
return new ProcessesImpl(config);
|
|
758
|
+
}
|