@donkeylabs/server 2.0.28 → 2.0.30
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/docs/jobs.md +27 -0
- package/docs/processes.md +29 -0
- package/docs/workflows.md +32 -0
- package/package.json +1 -1
- package/src/core/external-job-socket.ts +20 -1
- package/src/core/external-jobs.ts +6 -0
- package/src/core/index.ts +1 -0
- package/src/core/jobs.ts +110 -52
- package/src/core/process-socket.ts +20 -1
- package/src/core/processes.ts +100 -4
- package/src/core/subprocess-bootstrap.ts +14 -1
- package/src/core/watchdog-executor.ts +80 -0
- package/src/core/watchdog-runner.ts +276 -0
- package/src/core/workflow-executor.ts +7 -0
- package/src/core/workflow-socket.ts +21 -2
- package/src/core/workflows.ts +88 -12
- package/src/server.ts +84 -1
package/docs/jobs.md
CHANGED
@@ -192,6 +192,33 @@ const cancelled = await ctx.core.jobs.cancel(jobId);
 
 ---
 
+## External Jobs (Subprocess)
+
+External jobs run in a separate process and are monitored by a watchdog.
+
+```ts
+ctx.core.jobs.registerExternal("batchWorker", {
+  command: "bun",
+  args: ["./workers/batch-worker.ts"],
+  heartbeatTimeout: 30000,
+  timeout: 10 * 60 * 1000,
+  killGraceMs: 5000,
+});
+
+// Disable in-process timers when using the watchdog subprocess
+const server = new AppServer({
+  db,
+  watchdog: { enabled: true },
+  jobs: { external: { useWatchdog: true } },
+});
+```
+
+Watchdog events:
+- `job.watchdog.stale`
+- `job.watchdog.killed`
+
+---
+
 ## Event Integration
 
 Jobs automatically emit events on completion and failure:
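For orientation, here is a minimal sketch of what a worker entry point such as `./workers/batch-worker.ts` could look like. This is hypothetical and not part of the package: the `jobs.ts` changes later in this diff show the server writing the job payload to the worker's stdin and closing it, but the payload shape and the heartbeat/result socket protocol are internal to the package and not visible here.

```ts
// Hypothetical external-job worker skeleton (illustration only, not the package's API).
// Assumption: the job payload arrives as JSON on stdin; heartbeats and results would
// normally go through the package's job socket client, omitted here because its
// protocol is not shown in this diff.
const payloadText = await Bun.stdin.text();
const payload = payloadText ? JSON.parse(payloadText) : {};

try {
  // ... perform the batch work described by `payload` ...
  process.exit(0); // clean exit
} catch (err) {
  console.error(err);
  process.exit(1); // non-zero exit signals failure to the parent
}
```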
package/docs/processes.md
CHANGED
@@ -8,6 +8,7 @@ Processes provide:
 - Long-running daemon management (start, stop, restart)
 - Typed event communication from process to server
 - Automatic heartbeat monitoring
+- Watchdog termination for unresponsive processes
 - Connection resilience with auto-reconnection
 - Metadata passing to spawned processes
 - Cross-platform support (Unix sockets / TCP on Windows)
@@ -89,6 +90,13 @@ server.getCore().processes.define("video-encoder", {
 
   // Heartbeat configuration
   heartbeatTimeout: 30000, // 30 seconds
+
+  // Optional hard limits (requires stats for memory/CPU)
+  limits: {
+    maxRuntimeMs: 60_000,
+    maxMemoryMb: 512,
+    maxCpuPercent: 90,
+  },
 });
 ```
 
@@ -221,6 +229,27 @@ const client = await ProcessClient.connect({
 });
 ```
 
+---
+
+## Hard Limits
+
+Processes can be terminated automatically when limits are exceeded:
+
+- `maxRuntimeMs` always enforced by the server watchdog
+- `maxMemoryMb` and `maxCpuPercent` require `ProcessClient` stats enabled
+
+```ts
+const client = await ProcessClient.connect({
+  stats: { enabled: true, interval: 5000 },
+});
+```
+
+Watchdog events:
+- `process.watchdog.stale`
+- `process.watchdog.killed`
+
+When `watchdog.enabled` is true, heartbeat monitoring runs in the watchdog subprocess.
+
 ### Properties
 
 ```typescript
package/docs/workflows.md
CHANGED
@@ -309,6 +309,38 @@ workflow("batch.status")
 
 Each poll cycle emits `workflow.step.poll` events and persists progress to the instance.
 
+---
+
+## Watchdog and Subprocess Settings
+
+You can tune subprocess termination and SQLite pragmas used by isolated workflows:
+
+```ts
+const server = new AppServer({
+  db,
+  watchdog: {
+    enabled: true,
+    intervalMs: 5000,
+    services: ["workflows", "jobs", "processes"],
+    killGraceMs: 5000,
+  },
+  workflows: {
+    killGraceMs: 5000,
+    sqlitePragmas: {
+      busyTimeout: 5000,
+      journalMode: "WAL",
+      synchronous: "NORMAL",
+    },
+  },
+});
+```
+
+When `watchdog.enabled` is true, workflow heartbeat timers run in the watchdog subprocess instead of the main server.
+
+Watchdog events:
+- `workflow.watchdog.stale` (heartbeat missed)
+- `workflow.watchdog.killed` (process terminated)
+
 ### Loop
 
 Use a loop step to jump back to a previous step until a condition is false.
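As a point of reference, the `sqlitePragmas` values map directly onto SQLite `PRAGMA` statements that the workflow subprocess applies on startup (see the `subprocess-bootstrap.ts` hunk later in this diff). A standalone sketch with `bun:sqlite`, using a hypothetical database path:

```ts
// Standalone illustration of what the sqlitePragmas settings do at the SQLite level.
// Not part of the package; the path below is hypothetical.
import { Database } from "bun:sqlite";

const sqlite = new Database("./data/app.db");
sqlite.run("PRAGMA busy_timeout = 5000");  // wait up to 5s instead of failing on a locked DB
sqlite.run("PRAGMA journal_mode = WAL");   // write-ahead log: readers don't block the writer
sqlite.run("PRAGMA synchronous = NORMAL"); // fewer fsyncs; a common pairing with WAL

// Pragmas can be read back to confirm they took effect.
console.log(sqlite.query("PRAGMA journal_mode").get());
```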
package/package.json
CHANGED
package/src/core/external-job-socket.ts
CHANGED

@@ -201,6 +201,23 @@ export class ExternalJobSocketServerImpl implements ExternalJobSocketServer {
 
     let buffer = "";
 
+    const queue: AnyExternalJobMessage[] = [];
+    let processing = false;
+
+    const processQueue = async () => {
+      if (processing) return;
+      processing = true;
+      while (queue.length > 0) {
+        const message = queue.shift()!;
+        try {
+          await this.onMessage(message);
+        } catch (err) {
+          this.onError?.(err instanceof Error ? err : new Error(String(err)), jobId);
+        }
+      }
+      processing = false;
+    };
+
     socket.on("data", (data) => {
       buffer += data.toString();
 
@@ -213,11 +230,13 @@ export class ExternalJobSocketServerImpl implements ExternalJobSocketServer {
 
         const message = parseJobMessage(line);
         if (message) {
-
+          queue.push(message);
         } else {
           this.onError?.(new Error(`Invalid message: ${line}`), jobId);
         }
       }
+
+      processQueue().catch(() => undefined);
     });
 
     socket.on("error", (err) => {
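The two hunks above change the socket server from handling each parsed message inline to buffering messages in a FIFO queue and draining them one at a time, so async handlers run strictly in arrival order and handler rejections cannot escape the `data` callback. A standalone sketch of the same pattern (illustrative names, not the package's API):

```ts
// Standalone sketch: newline-delimited JSON over a socket, with the async handler
// serialized through a FIFO queue (the pattern used in the hunks above).
import type { Socket } from "node:net";

function attachSerializedHandler(
  socket: Socket,
  onMessage: (msg: unknown) => Promise<void>,
  onError: (err: Error) => void
): void {
  let buffer = "";
  const queue: unknown[] = [];
  let processing = false;

  const drain = async () => {
    if (processing) return; // a drain loop is already running
    processing = true;
    while (queue.length > 0) {
      const msg = queue.shift()!;
      try {
        await onMessage(msg); // one message at a time, in arrival order
      } catch (err) {
        onError(err instanceof Error ? err : new Error(String(err)));
      }
    }
    processing = false;
  };

  socket.on("data", (data) => {
    buffer += data.toString();
    const lines = buffer.split("\n");
    buffer = lines.pop() ?? ""; // keep the trailing partial line for the next chunk
    for (const line of lines) {
      if (!line.trim()) continue;
      try {
        queue.push(JSON.parse(line));
      } catch {
        onError(new Error(`Invalid message: ${line}`));
      }
    }
    drain().catch(() => undefined); // never let a rejection escape the 'data' handler
  });
}
```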
package/src/core/external-jobs.ts
CHANGED

@@ -80,6 +80,8 @@ export interface ExternalJobConfig {
   heartbeatTimeout?: number;
   /** Job timeout in milliseconds (optional) */
   timeout?: number;
+  /** Grace period before SIGKILL when terminating (ms, default: 5000) */
+  killGraceMs?: number;
 }
 
 // ============================================
@@ -120,6 +122,10 @@ export interface ExternalJobsConfig {
   defaultHeartbeatTimeout?: number;
   /** Heartbeat check interval in ms (default: 10000) */
   heartbeatCheckInterval?: number;
+  /** Default grace period before SIGKILL when terminating (ms, default: 5000) */
+  killGraceMs?: number;
+  /** Disable in-process watchdog timers (use external watchdog instead) */
+  useWatchdog?: boolean;
 }
 
 // ============================================
package/src/core/index.ts
CHANGED
package/src/core/jobs.ts
CHANGED
@@ -141,6 +141,8 @@ export interface Jobs {
   getRunningExternal(): Promise<Job[]>;
   /** Get all jobs with optional filtering (for admin dashboard) */
   getAll(options?: GetAllJobsOptions): Promise<Job[]>;
+  /** Get external job config snapshot for watchdog */
+  getExternalJobConfigs(): Record<string, ExternalJobConfig>;
   /** Start the job processing loop */
   start(): void;
   /** Stop the job processing and cleanup */
@@ -273,7 +275,10 @@ class JobsImpl implements Jobs {
   private externalConfigs = new Map<string, ExternalJobConfig>();
   private externalConfig: ExternalJobsConfig;
   private socketServer: ExternalJobSocketServer | null = null;
-  private externalProcesses = new Map<
+  private externalProcesses = new Map<
+    string,
+    { pid: number; timeout?: ReturnType<typeof setTimeout>; killTimer?: ReturnType<typeof setTimeout> }
+  >();
 
   constructor(config: JobsConfig = {}) {
     this.events = config.events;
@@ -322,6 +327,14 @@ class JobsImpl implements Jobs {
     this.externalConfigs.set(name, config);
   }
 
+  getExternalJobConfigs(): Record<string, ExternalJobConfig> {
+    const snapshot: Record<string, ExternalJobConfig> = {};
+    for (const [name, config] of this.externalConfigs.entries()) {
+      snapshot[name] = { ...config };
+    }
+    return snapshot;
+  }
+
   private isExternalJob(name: string): boolean {
     return this.externalConfigs.has(name);
   }
@@ -419,7 +432,9 @@
     // Initialize socket server for external jobs
     if (this.externalConfigs.size > 0) {
       this.initializeSocketServer();
-      this.
+      if (!this.externalConfig.useWatchdog) {
+        this.startHeartbeatMonitor();
+      }
       // Attempt to reconnect to orphaned jobs from previous run
       this.reconnectOrphanedJobs();
     }
@@ -521,6 +536,7 @@
 
       const config = this.externalConfigs.get(job.name);
       const heartbeatTimeout = config?.heartbeatTimeout ?? this.externalConfig.defaultHeartbeatTimeout ?? 30000;
+      const killGraceMs = config?.killGraceMs ?? this.externalConfig.killGraceMs ?? 5000;
       const timeSinceHeartbeat = now - job.lastHeartbeat.getTime();
 
       if (timeSinceHeartbeat > heartbeatTimeout) {
@@ -533,36 +549,22 @@
            name: job.name,
            timeSinceHeartbeat,
          });
-
-
-
-
-        console.error(`[Jobs] Killing stale external job ${job.id}`);
-
-        if (job.pid) {
-          try {
-            process.kill(job.pid, "SIGKILL");
-          } catch {
-            // Process may already be dead
-          }
-        }
-
-          await this.adapter.update(job.id, {
-            status: "failed",
-            error: "Heartbeat timeout - job process unresponsive",
-            completedAt: new Date(),
-            processState: "orphaned",
+          await this.events.emit("job.watchdog.stale", {
+            jobId: job.id,
+            name: job.name,
+            timeSinceHeartbeat,
           });
+        }
 
-
-
-
-
-
-
-
-
+        const procInfo = this.externalProcesses.get(job.id);
+        if (job.pid && !procInfo?.killTimer) {
+          console.error(`[Jobs] Terminating stale external job ${job.id}`);
+          await this.terminateExternalProcess(
+            job.id,
+            job.pid,
+            killGraceMs,
+            "Heartbeat timeout - job process unresponsive"
+          );
        }
      }
    }
@@ -764,6 +766,9 @@
     if (procInfo?.timeout) {
       clearTimeout(procInfo.timeout);
     }
+    if (procInfo?.killTimer) {
+      clearTimeout(procInfo.killTimer);
+    }
     this.externalProcesses.delete(jobId);
 
     // Close the socket
@@ -898,30 +903,16 @@
     proc.stdin.end();
 
     // Set up process timeout if configured
-    if (config.timeout) {
+    if (config.timeout && !this.externalConfig.useWatchdog) {
       const timeout = setTimeout(async () => {
         console.warn(`[Jobs] External job ${job.id} timed out after ${config.timeout}ms`);
-
-
-
-
-
-
-
-          status: "failed",
-          error: `Job timed out after ${config.timeout}ms`,
-          completedAt: new Date(),
-        });
-
-        await this.cleanupExternalJob(job.id);
-
-        if (this.events) {
-          await this.events.emit("job.failed", {
-            jobId: job.id,
-            name: job.name,
-            error: "Timeout",
-          });
-        }
+        const killGraceMs = config.killGraceMs ?? this.externalConfig.killGraceMs ?? 5000;
+        await this.terminateExternalProcess(
+          job.id,
+          proc.pid,
+          killGraceMs,
+          `Job timed out after ${config.timeout}ms`
+        );
       }, config.timeout);
 
       const procInfo = this.externalProcesses.get(job.id);
@@ -998,6 +989,73 @@
     }
   }
 
+  private async terminateExternalProcess(
+    jobId: string,
+    pid: number,
+    killGraceMs: number,
+    error: string
+  ): Promise<void> {
+    try {
+      process.kill(pid, "SIGTERM");
+    } catch {
+      return;
+    }
+
+    if (killGraceMs <= 0) {
+      try {
+        process.kill(pid, "SIGKILL");
+      } catch {
+        // ignore
+      }
+      await this.handleExternalFailure(jobId, error);
+      return;
+    }
+
+    const timer = setTimeout(async () => {
+      try {
+        process.kill(pid, 0);
+        process.kill(pid, "SIGKILL");
+      } catch {
+        // ignore
+      }
+
+      await this.handleExternalFailure(jobId, error);
+    }, killGraceMs);
+
+    const procInfo = this.externalProcesses.get(jobId);
+    if (procInfo) {
+      procInfo.killTimer = timer;
+    }
+  }
+
+  private async handleExternalFailure(jobId: string, error: string): Promise<void> {
+    await this.adapter.update(jobId, {
+      status: "failed",
+      error,
+      completedAt: new Date(),
+      processState: "orphaned",
+    });
+
+    const job = await this.adapter.get(jobId);
+    if (this.events && job) {
+      await this.events.emit("job.watchdog.killed", {
+        jobId,
+        name: job.name,
+        reason: error,
+      });
+    }
+
+    await this.cleanupExternalJob(jobId);
+
+    if (this.events && job) {
+      await this.events.emit("job.failed", {
+        jobId,
+        name: job.name,
+        error,
+      });
+    }
+  }
+
   private streamProcessOutput(
     jobId: string,
     jobName: string,
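The new `terminateExternalProcess` above replaces the old immediate `SIGKILL` with an escalation: send `SIGTERM`, wait `killGraceMs`, then probe the pid with signal `0` and force-kill only if the process is still alive. A standalone sketch of that escalation using plain Node process APIs (illustrative only; the helper name is not from the package):

```ts
// Standalone sketch of the SIGTERM -> grace period -> SIGKILL escalation used above.
function terminateWithGrace(pid: number, killGraceMs: number, onDead: () => void): void {
  try {
    process.kill(pid, "SIGTERM"); // ask the process to shut down cleanly
  } catch {
    return; // the pid no longer exists
  }

  setTimeout(() => {
    try {
      process.kill(pid, 0);         // signal 0 is a liveness probe: throws if the pid is gone
      process.kill(pid, "SIGKILL"); // still alive after the grace period, so force-kill
    } catch {
      // the process exited on its own during the grace period
    }
    onDead();
  }, killGraceMs);
}
```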
package/src/core/process-socket.ts
CHANGED

@@ -207,6 +207,23 @@ export class ProcessSocketServerImpl implements ProcessSocketServer {
 
     let buffer = "";
 
+    const queue: ProcessMessage[] = [];
+    let processing = false;
+
+    const processQueue = async () => {
+      if (processing) return;
+      processing = true;
+      while (queue.length > 0) {
+        const message = queue.shift()!;
+        try {
+          await this.onMessage(message);
+        } catch (err) {
+          this.onError?.(err instanceof Error ? err : new Error(String(err)), processId);
+        }
+      }
+      processing = false;
+    };
+
     socket.on("data", (data) => {
       buffer += data.toString();
 
@@ -219,11 +236,13 @@
 
         const message = this.parseMessage(line);
         if (message) {
-
+          queue.push(message);
         } else {
           this.onError?.(new Error(`Invalid message: ${line}`), processId);
         }
       }
+
+      processQueue().catch(() => undefined);
     });
 
     socket.on("error", (err) => {
package/src/core/processes.ts
CHANGED
@@ -61,6 +61,15 @@ export interface ProcessConfig {
     /** Timeout before considering unhealthy in ms (default: 60000) */
     timeoutMs?: number;
   };
+  /** Hard limits for the process (optional) */
+  limits?: {
+    /** Max runtime in ms before termination */
+    maxRuntimeMs?: number;
+    /** Max memory (RSS) in MB before termination (requires stats enabled) */
+    maxMemoryMb?: number;
+    /** Max CPU percent before termination (requires stats enabled) */
+    maxCpuPercent?: number;
+  };
 }
 
 export interface ManagedProcess {
@@ -171,6 +180,10 @@
   heartbeatCheckInterval?: number;
   /** Enable auto-reconnect to orphaned processes on startup (default: true) */
   autoRecoverOrphans?: boolean;
+  /** Grace period before SIGKILL when stopping/killing (ms, default: 5000) */
+  killGraceMs?: number;
+  /** Disable in-process watchdog timers (use external watchdog instead) */
+  useWatchdog?: boolean;
 }
 
 // ============================================
@@ -251,6 +264,9 @@
   private events?: Events;
   private heartbeatCheckInterval: number;
   private autoRecoverOrphans: boolean;
+  private killGraceMs: number;
+  private runtimeLimitTimers = new Map<string, ReturnType<typeof setTimeout>>();
+  private useWatchdog: boolean;
 
   // Track running Bun subprocesses
   private subprocesses = new Map<string, Subprocess>();
@@ -266,6 +282,8 @@
     this.events = config.events;
     this.heartbeatCheckInterval = config.heartbeatCheckInterval ?? 10000;
     this.autoRecoverOrphans = config.autoRecoverOrphans ?? true;
+    this.killGraceMs = config.killGraceMs ?? 5000;
+    this.useWatchdog = config.useWatchdog ?? false;
 
     // Create socket server with callbacks
     this.socketServer = createProcessSocketServer(config.socket ?? {}, {
@@ -361,6 +379,21 @@
       // Set up exit handler for crash detection
       proc.exited.then((exitCode) => this.handleExit(process.id, exitCode));
 
+      const maxRuntimeMs = config.limits?.maxRuntimeMs;
+      if (!this.useWatchdog && maxRuntimeMs && maxRuntimeMs > 0) {
+        const timer = setTimeout(async () => {
+          console.warn(`[Processes] Max runtime exceeded for ${name} (${process.id})`);
+          await this.emitEvent("process.limits_exceeded", {
+            processId: process.id,
+            name,
+            reason: "maxRuntimeMs",
+            limit: maxRuntimeMs,
+          });
+          await this.stop(process.id);
+        }, maxRuntimeMs);
+        this.runtimeLimitTimers.set(process.id, timer);
+      }
+
       console.log(`[Processes] Spawned ${name} (${process.id}) with PID ${proc.pid}`);
       return process.id;
     } catch (error) {
@@ -395,7 +428,7 @@
     // Wait for process to exit (with timeout)
     const exitPromise = subprocess.exited;
     const timeoutPromise = new Promise<null>((resolve) =>
-      setTimeout(() => resolve(null),
+      setTimeout(() => resolve(null), this.killGraceMs)
     );
 
     const result = await Promise.race([exitPromise, timeoutPromise]);
@@ -412,6 +445,11 @@
     // Cleanup
     await this.socketServer.closeSocket(processId);
     this.subprocesses.delete(processId);
+    const runtimeTimer = this.runtimeLimitTimers.get(processId);
+    if (runtimeTimer) {
+      clearTimeout(runtimeTimer);
+      this.runtimeLimitTimers.delete(processId);
+    }
 
     await this.adapter.update(processId, {
       status: "stopped",
@@ -443,6 +481,11 @@
     // Cleanup
     await this.socketServer.closeSocket(processId);
     this.subprocesses.delete(processId);
+    const runtimeTimer = this.runtimeLimitTimers.get(processId);
+    if (runtimeTimer) {
+      clearTimeout(runtimeTimer);
+      this.runtimeLimitTimers.delete(processId);
+    }
 
     await this.adapter.update(processId, {
       status: "stopped",
@@ -590,6 +633,47 @@
       await definition.onStats(proc, stats);
     }
 
+    const limits = proc.config.limits;
+    if (limits) {
+      if (limits.maxMemoryMb && stats.memory.rss / 1e6 > limits.maxMemoryMb) {
+        console.warn(`[Processes] Memory limit exceeded for ${proc.name} (${proc.id})`);
+        await this.emitEvent("process.limits_exceeded", {
+          processId,
+          name: proc.name,
+          reason: "maxMemoryMb",
+          limit: limits.maxMemoryMb,
+          value: stats.memory.rss / 1e6,
+        });
+        await this.emitEvent("process.watchdog.killed", {
+          processId,
+          name: proc.name,
+          reason: "maxMemoryMb",
+          value: stats.memory.rss / 1e6,
+        });
+        await this.stop(proc.id);
+        return;
+      }
+
+      if (limits.maxCpuPercent && stats.cpu.percent > limits.maxCpuPercent) {
+        console.warn(`[Processes] CPU limit exceeded for ${proc.name} (${proc.id})`);
+        await this.emitEvent("process.limits_exceeded", {
+          processId,
+          name: proc.name,
+          reason: "maxCpuPercent",
+          limit: limits.maxCpuPercent,
+          value: stats.cpu.percent,
+        });
+        await this.emitEvent("process.watchdog.killed", {
+          processId,
+          name: proc.name,
+          reason: "maxCpuPercent",
+          value: stats.cpu.percent,
+        });
+        await this.stop(proc.id);
+        return;
+      }
+    }
+
     return;
   }
 
@@ -815,6 +899,7 @@
   }
 
   private startHeartbeatMonitor(): void {
+    if (this.useWatchdog) return;
     this.heartbeatMonitor = setInterval(async () => {
       if (this.isShuttingDown) return;
 
@@ -835,16 +920,27 @@
             processId: proc.id,
             name: proc.name,
           });
+          await this.emitEvent("process.watchdog.stale", {
+            processId: proc.id,
+            name: proc.name,
+            reason: "heartbeat",
+            timeoutMs,
+          });
 
           const definition = this.definitions.get(proc.name);
           if (definition?.onUnhealthy) {
             await definition.onUnhealthy(proc);
           }
 
-          // If heartbeat is way overdue (2x timeout),
+          // If heartbeat is way overdue (2x timeout), stop and restart
           if (now - lastHeartbeat > timeoutMs * 2) {
-            console.warn(`[Processes]
-            await this.
+            console.warn(`[Processes] Stopping unresponsive process ${proc.name} (${proc.id})`);
+            await this.stop(proc.id);
+            await this.emitEvent("process.watchdog.killed", {
+              processId: proc.id,
+              name: proc.name,
+              reason: "heartbeat",
+            });
             // handleExit will trigger auto-restart if configured
           }
         }
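In the `stop()` path above, the wait for `subprocess.exited` is now bounded by the configurable `this.killGraceMs` instead of a hard-coded value. A standalone sketch of that bounded-wait pattern with Bun's subprocess API (illustrative only; the helper name is not from the package):

```ts
// Standalone sketch: give a Bun subprocess a grace period to exit, then force-kill.
import type { Subprocess } from "bun";

async function stopWithGrace(proc: Subprocess, killGraceMs: number): Promise<void> {
  proc.kill(); // politely ask the subprocess to exit

  const timedOut = Symbol("timeout");
  const result = await Promise.race([
    proc.exited, // resolves with the exit code
    new Promise<typeof timedOut>((resolve) => setTimeout(() => resolve(timedOut), killGraceMs)),
  ]);

  if (result === timedOut) {
    process.kill(proc.pid, "SIGKILL"); // grace period expired; force-kill
    await proc.exited;
  }
}
```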
package/src/core/subprocess-bootstrap.ts
CHANGED

@@ -32,6 +32,11 @@ export interface SubprocessPluginMetadata {
 export interface SubprocessBootstrapOptions {
   dbPath: string;
   coreConfig?: Record<string, any>;
+  sqlitePragmas?: {
+    busyTimeout?: number;
+    synchronous?: "OFF" | "NORMAL" | "FULL" | "EXTRA";
+    journalMode?: "DELETE" | "TRUNCATE" | "PERSIST" | "MEMORY" | "WAL" | "OFF";
+  };
   pluginMetadata: SubprocessPluginMetadata;
   startServices?: {
     cron?: boolean;
@@ -53,7 +58,15 @@ export async function bootstrapSubprocess(
   options: SubprocessBootstrapOptions
 ): Promise<SubprocessBootstrapResult> {
   const sqlite = new Database(options.dbPath);
-
+  const pragmas = options.sqlitePragmas;
+  const busyTimeout = pragmas?.busyTimeout ?? 5000;
+  sqlite.run(`PRAGMA busy_timeout = ${busyTimeout}`);
+  if (pragmas?.journalMode) {
+    sqlite.run(`PRAGMA journal_mode = ${pragmas.journalMode}`);
+  }
+  if (pragmas?.synchronous) {
+    sqlite.run(`PRAGMA synchronous = ${pragmas.synchronous}`);
+  }
 
   const db = new Kysely<any>({
     dialect: new BunSqliteDialect({ database: sqlite }),