@donkeylabs/server 2.0.28 → 2.0.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/docs/jobs.md +20 -0
- package/docs/processes.md +27 -0
- package/docs/workflows.md +24 -0
- package/package.json +1 -1
- package/src/core/external-job-socket.ts +20 -1
- package/src/core/external-jobs.ts +4 -0
- package/src/core/index.ts +1 -0
- package/src/core/jobs.ts +96 -50
- package/src/core/process-socket.ts +20 -1
- package/src/core/processes.ts +95 -4
- package/src/core/subprocess-bootstrap.ts +14 -1
- package/src/core/workflow-executor.ts +7 -0
- package/src/core/workflow-socket.ts +21 -2
- package/src/core/workflows.ts +53 -11
package/docs/jobs.md
CHANGED

@@ -192,6 +192,26 @@ const cancelled = await ctx.core.jobs.cancel(jobId);
 
 ---
 
+## External Jobs (Subprocess)
+
+External jobs run in a separate process and are monitored by a watchdog.
+
+```ts
+ctx.core.jobs.registerExternal("batchWorker", {
+  command: "bun",
+  args: ["./workers/batch-worker.ts"],
+  heartbeatTimeout: 30000,
+  timeout: 10 * 60 * 1000,
+  killGraceMs: 5000,
+});
+```
+
+Watchdog events:
+- `job.watchdog.stale`
+- `job.watchdog.killed`
+
+---
+
 ## Event Integration
 
 Jobs automatically emit events on completion and failure:
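For reference, the payload shapes behind these two events can be read directly off the `jobs.ts` changes later in this diff. The interface names below are illustrative only and are not exported by the package:

```ts
// Illustrative shapes inferred from the jobs.ts hunks below; not part of the public API.
interface JobWatchdogStalePayload {
  jobId: string;
  name: string;
  timeSinceHeartbeat: number; // ms since the last heartbeat was received
}

interface JobWatchdogKilledPayload {
  jobId: string;
  name: string;
  reason: string; // e.g. "Heartbeat timeout - job process unresponsive"
}
```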
package/docs/processes.md
CHANGED

@@ -8,6 +8,7 @@ Processes provide:
 - Long-running daemon management (start, stop, restart)
 - Typed event communication from process to server
 - Automatic heartbeat monitoring
+- Watchdog termination for unresponsive processes
 - Connection resilience with auto-reconnection
 - Metadata passing to spawned processes
 - Cross-platform support (Unix sockets / TCP on Windows)

@@ -89,6 +90,13 @@ server.getCore().processes.define("video-encoder", {
 
   // Heartbeat configuration
   heartbeatTimeout: 30000, // 30 seconds
+
+  // Optional hard limits (requires stats for memory/CPU)
+  limits: {
+    maxRuntimeMs: 60_000,
+    maxMemoryMb: 512,
+    maxCpuPercent: 90,
+  },
 });
 ```
 

@@ -221,6 +229,25 @@ const client = await ProcessClient.connect({
 });
 ```
 
+---
+
+## Hard Limits
+
+Processes can be terminated automatically when limits are exceeded:
+
+- `maxRuntimeMs` always enforced by the server watchdog
+- `maxMemoryMb` and `maxCpuPercent` require `ProcessClient` stats enabled
+
+```ts
+const client = await ProcessClient.connect({
+  stats: { enabled: true, interval: 5000 },
+});
+```
+
+Watchdog events:
+- `process.watchdog.stale`
+- `process.watchdog.killed`
+
 ### Properties
 
 ```typescript
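Alongside the two watchdog events, this release also emits `process.limits_exceeded` before a limit-triggered kill. The shapes below are inferred from the `processes.ts` hunks later in this diff; the interface names are illustrative, not exported:

```ts
// Illustrative shapes inferred from the processes.ts hunks below; not public API.
interface ProcessWatchdogStalePayload {
  processId: string;
  name: string;
  reason: "heartbeat";
  timeoutMs: number;
}

interface ProcessLimitsExceededPayload {
  processId: string;
  name: string;
  reason: "maxRuntimeMs" | "maxMemoryMb" | "maxCpuPercent";
  limit: number;
  value?: number; // observed RSS in MB or CPU percent; absent for maxRuntimeMs
}

interface ProcessWatchdogKilledPayload {
  processId: string;
  name: string;
  reason: "heartbeat" | "maxMemoryMb" | "maxCpuPercent";
  value?: number; // present only for the stats-based limits
}
```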
package/docs/workflows.md
CHANGED

@@ -309,6 +309,30 @@ workflow("batch.status")
 
 Each poll cycle emits `workflow.step.poll` events and persists progress to the instance.
 
+---
+
+## Watchdog and Subprocess Settings
+
+You can tune subprocess termination and SQLite pragmas used by isolated workflows:
+
+```ts
+const server = new AppServer({
+  db,
+  workflows: {
+    killGraceMs: 5000,
+    sqlitePragmas: {
+      busyTimeout: 5000,
+      journalMode: "WAL",
+      synchronous: "NORMAL",
+    },
+  },
+});
+```
+
+Watchdog events:
+- `workflow.watchdog.stale` (heartbeat missed)
+- `workflow.watchdog.killed` (process terminated)
+
 ### Loop
 
 Use a loop step to jump back to a previous step until a condition is false.
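The payloads for these two events, as emitted by the `workflows.ts` changes later in this diff (interface names illustrative, not exported):

```ts
// Illustrative shapes inferred from the workflows.ts hunks below; not public API.
interface WorkflowWatchdogStalePayload {
  instanceId: string;
  reason: "heartbeat";
  timeoutMs: number; // the configured heartbeatTimeout
}

interface WorkflowWatchdogKilledPayload {
  instanceId: string;
  reason: "timeout";
  timeoutMs: number;
}
```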
package/package.json
CHANGED

-  "version": "2.0.28",
+  "version": "2.0.29",

package/src/core/external-job-socket.ts
CHANGED

@@ -201,6 +201,23 @@ export class ExternalJobSocketServerImpl implements ExternalJobSocketServer {
 
     let buffer = "";
 
+    const queue: AnyExternalJobMessage[] = [];
+    let processing = false;
+
+    const processQueue = async () => {
+      if (processing) return;
+      processing = true;
+      while (queue.length > 0) {
+        const message = queue.shift()!;
+        try {
+          await this.onMessage(message);
+        } catch (err) {
+          this.onError?.(err instanceof Error ? err : new Error(String(err)), jobId);
+        }
+      }
+      processing = false;
+    };
+
     socket.on("data", (data) => {
       buffer += data.toString();
 

@@ -213,11 +230,13 @@ export class ExternalJobSocketServerImpl implements ExternalJobSocketServer {
 
         const message = parseJobMessage(line);
         if (message) {
-
+          queue.push(message);
         } else {
           this.onError?.(new Error(`Invalid message: ${line}`), jobId);
         }
       }
+
+      processQueue().catch(() => undefined);
     });
 
     socket.on("error", (err) => {
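The same change lands in `process-socket.ts` and `workflow-socket.ts` below: instead of dispatching each parsed line directly, all three socket servers now push messages onto a per-connection queue drained by a single async loop, so handlers run strictly in arrival order even when they await. A standalone sketch of the pattern (names illustrative):

```ts
// Minimal sketch of the serialized-queue pattern used by the three socket servers.
// `handle` stands in for this.onMessage / this.handleMessage; `report` for this.onError.
function createSerialQueue<T>(
  handle: (msg: T) => Promise<void>,
  report: (err: Error) => void
) {
  const queue: T[] = [];
  let processing = false;

  const drain = async () => {
    if (processing) return; // a drain loop is already running
    processing = true;
    while (queue.length > 0) {
      const msg = queue.shift()!;
      try {
        await handle(msg);
      } catch (err) {
        report(err instanceof Error ? err : new Error(String(err)));
      }
    }
    processing = false;
  };

  return (msg: T) => {
    queue.push(msg);
    drain().catch(() => undefined); // fire-and-forget, as in the diff
  };
}
```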
package/src/core/external-jobs.ts
CHANGED

@@ -80,6 +80,8 @@ export interface ExternalJobConfig {
   heartbeatTimeout?: number;
   /** Job timeout in milliseconds (optional) */
   timeout?: number;
+  /** Grace period before SIGKILL when terminating (ms, default: 5000) */
+  killGraceMs?: number;
 }
 
 // ============================================

@@ -120,6 +122,8 @@ export interface ExternalJobsConfig {
   defaultHeartbeatTimeout?: number;
   /** Heartbeat check interval in ms (default: 10000) */
   heartbeatCheckInterval?: number;
+  /** Default grace period before SIGKILL when terminating (ms, default: 5000) */
+  killGraceMs?: number;
 }
 
 // ============================================
package/src/core/index.ts
CHANGED
package/src/core/jobs.ts
CHANGED

@@ -273,7 +273,10 @@ class JobsImpl implements Jobs {
   private externalConfigs = new Map<string, ExternalJobConfig>();
   private externalConfig: ExternalJobsConfig;
   private socketServer: ExternalJobSocketServer | null = null;
-  private externalProcesses = new Map<
+  private externalProcesses = new Map<
+    string,
+    { pid: number; timeout?: ReturnType<typeof setTimeout>; killTimer?: ReturnType<typeof setTimeout> }
+  >();
 
   constructor(config: JobsConfig = {}) {
     this.events = config.events;

@@ -521,6 +524,7 @@
 
       const config = this.externalConfigs.get(job.name);
       const heartbeatTimeout = config?.heartbeatTimeout ?? this.externalConfig.defaultHeartbeatTimeout ?? 30000;
+      const killGraceMs = config?.killGraceMs ?? this.externalConfig.killGraceMs ?? 5000;
      const timeSinceHeartbeat = now - job.lastHeartbeat.getTime();
 
      if (timeSinceHeartbeat > heartbeatTimeout) {

@@ -533,36 +537,22 @@
           name: job.name,
           timeSinceHeartbeat,
         });
-
-
-
-
-        console.error(`[Jobs] Killing stale external job ${job.id}`);
-
-        if (job.pid) {
-          try {
-            process.kill(job.pid, "SIGKILL");
-          } catch {
-            // Process may already be dead
-          }
-        }
-
-        await this.adapter.update(job.id, {
-          status: "failed",
-          error: "Heartbeat timeout - job process unresponsive",
-          completedAt: new Date(),
-          processState: "orphaned",
+          await this.events.emit("job.watchdog.stale", {
+            jobId: job.id,
+            name: job.name,
+            timeSinceHeartbeat,
         });
+        }
 
-
-
-
-
-
-
-
-
-
+        const procInfo = this.externalProcesses.get(job.id);
+        if (job.pid && !procInfo?.killTimer) {
+          console.error(`[Jobs] Terminating stale external job ${job.id}`);
+          await this.terminateExternalProcess(
+            job.id,
+            job.pid,
+            killGraceMs,
+            "Heartbeat timeout - job process unresponsive"
+          );
         }
       }
     }

@@ -764,6 +754,9 @@
     if (procInfo?.timeout) {
       clearTimeout(procInfo.timeout);
     }
+    if (procInfo?.killTimer) {
+      clearTimeout(procInfo.killTimer);
+    }
     this.externalProcesses.delete(jobId);
 
     // Close the socket

@@ -901,27 +894,13 @@
     if (config.timeout) {
       const timeout = setTimeout(async () => {
         console.warn(`[Jobs] External job ${job.id} timed out after ${config.timeout}ms`);
-
-
-
-
-
-
-
-          status: "failed",
-          error: `Job timed out after ${config.timeout}ms`,
-          completedAt: new Date(),
-        });
-
-        await this.cleanupExternalJob(job.id);
-
-        if (this.events) {
-          await this.events.emit("job.failed", {
-            jobId: job.id,
-            name: job.name,
-            error: "Timeout",
-          });
-        }
+        const killGraceMs = config.killGraceMs ?? this.externalConfig.killGraceMs ?? 5000;
+        await this.terminateExternalProcess(
+          job.id,
+          proc.pid,
+          killGraceMs,
+          `Job timed out after ${config.timeout}ms`
+        );
       }, config.timeout);
 
       const procInfo = this.externalProcesses.get(job.id);

@@ -998,6 +977,73 @@
     }
   }
 
+  private async terminateExternalProcess(
+    jobId: string,
+    pid: number,
+    killGraceMs: number,
+    error: string
+  ): Promise<void> {
+    try {
+      process.kill(pid, "SIGTERM");
+    } catch {
+      return;
+    }
+
+    if (killGraceMs <= 0) {
+      try {
+        process.kill(pid, "SIGKILL");
+      } catch {
+        // ignore
+      }
+      await this.handleExternalFailure(jobId, error);
+      return;
+    }
+
+    const timer = setTimeout(async () => {
+      try {
+        process.kill(pid, 0);
+        process.kill(pid, "SIGKILL");
+      } catch {
+        // ignore
+      }
+
+      await this.handleExternalFailure(jobId, error);
+    }, killGraceMs);
+
+    const procInfo = this.externalProcesses.get(jobId);
+    if (procInfo) {
+      procInfo.killTimer = timer;
+    }
+  }
+
+  private async handleExternalFailure(jobId: string, error: string): Promise<void> {
+    await this.adapter.update(jobId, {
+      status: "failed",
+      error,
+      completedAt: new Date(),
+      processState: "orphaned",
+    });
+
+    const job = await this.adapter.get(jobId);
+    if (this.events && job) {
+      await this.events.emit("job.watchdog.killed", {
+        jobId,
+        name: job.name,
+        reason: error,
+      });
+    }
+
+    await this.cleanupExternalJob(jobId);
+
+    if (this.events && job) {
+      await this.events.emit("job.failed", {
+        jobId,
+        name: job.name,
+        error,
+      });
+    }
+  }
+
   private streamProcessOutput(
     jobId: string,
     jobName: string,
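`terminateExternalProcess` above encodes a two-phase shutdown: SIGTERM first, and SIGKILL only if the process still exists once `killGraceMs` elapses (`process.kill(pid, 0)` throws for a dead pid, which skips the SIGKILL). A condensed standalone sketch of that sequence, not the package's API:

```ts
// Condensed two-phase termination, mirroring terminateExternalProcess above.
function killWithGrace(pid: number, graceMs: number): void {
  try {
    process.kill(pid, "SIGTERM"); // polite shutdown request
  } catch {
    return; // process is already gone
  }

  setTimeout(() => {
    try {
      process.kill(pid, 0);         // signal 0 only probes: throws if pid has exited
      process.kill(pid, "SIGKILL"); // still alive after the grace period: force kill
    } catch {
      // exited during the grace period; nothing to do
    }
  }, graceMs);
}
```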
package/src/core/process-socket.ts
CHANGED

@@ -207,6 +207,23 @@ export class ProcessSocketServerImpl implements ProcessSocketServer {
 
     let buffer = "";
 
+    const queue: ProcessMessage[] = [];
+    let processing = false;
+
+    const processQueue = async () => {
+      if (processing) return;
+      processing = true;
+      while (queue.length > 0) {
+        const message = queue.shift()!;
+        try {
+          await this.onMessage(message);
+        } catch (err) {
+          this.onError?.(err instanceof Error ? err : new Error(String(err)), processId);
+        }
+      }
+      processing = false;
+    };
+
     socket.on("data", (data) => {
       buffer += data.toString();
 

@@ -219,11 +236,13 @@ export class ProcessSocketServerImpl implements ProcessSocketServer {
 
         const message = this.parseMessage(line);
         if (message) {
-
+          queue.push(message);
         } else {
           this.onError?.(new Error(`Invalid message: ${line}`), processId);
         }
       }
+
+      processQueue().catch(() => undefined);
     });
 
     socket.on("error", (err) => {
package/src/core/processes.ts
CHANGED

@@ -61,6 +61,15 @@ export interface ProcessConfig {
     /** Timeout before considering unhealthy in ms (default: 60000) */
     timeoutMs?: number;
   };
+  /** Hard limits for the process (optional) */
+  limits?: {
+    /** Max runtime in ms before termination */
+    maxRuntimeMs?: number;
+    /** Max memory (RSS) in MB before termination (requires stats enabled) */
+    maxMemoryMb?: number;
+    /** Max CPU percent before termination (requires stats enabled) */
+    maxCpuPercent?: number;
+  };
 }
 
 export interface ManagedProcess {

@@ -171,6 +180,8 @@ export interface ProcessesConfig {
   heartbeatCheckInterval?: number;
   /** Enable auto-reconnect to orphaned processes on startup (default: true) */
   autoRecoverOrphans?: boolean;
+  /** Grace period before SIGKILL when stopping/killing (ms, default: 5000) */
+  killGraceMs?: number;
 }
 
 // ============================================

@@ -251,6 +262,8 @@ export class ProcessesImpl implements Processes {
   private events?: Events;
   private heartbeatCheckInterval: number;
   private autoRecoverOrphans: boolean;
+  private killGraceMs: number;
+  private runtimeLimitTimers = new Map<string, ReturnType<typeof setTimeout>>();
 
   // Track running Bun subprocesses
   private subprocesses = new Map<string, Subprocess>();

@@ -266,6 +279,7 @@ export class ProcessesImpl implements Processes {
     this.events = config.events;
     this.heartbeatCheckInterval = config.heartbeatCheckInterval ?? 10000;
     this.autoRecoverOrphans = config.autoRecoverOrphans ?? true;
+    this.killGraceMs = config.killGraceMs ?? 5000;
 
     // Create socket server with callbacks
     this.socketServer = createProcessSocketServer(config.socket ?? {}, {

@@ -361,6 +375,21 @@
     // Set up exit handler for crash detection
     proc.exited.then((exitCode) => this.handleExit(process.id, exitCode));
 
+    const maxRuntimeMs = config.limits?.maxRuntimeMs;
+    if (maxRuntimeMs && maxRuntimeMs > 0) {
+      const timer = setTimeout(async () => {
+        console.warn(`[Processes] Max runtime exceeded for ${name} (${process.id})`);
+        await this.emitEvent("process.limits_exceeded", {
+          processId: process.id,
+          name,
+          reason: "maxRuntimeMs",
+          limit: maxRuntimeMs,
+        });
+        await this.stop(process.id);
+      }, maxRuntimeMs);
+      this.runtimeLimitTimers.set(process.id, timer);
+    }
+
     console.log(`[Processes] Spawned ${name} (${process.id}) with PID ${proc.pid}`);
     return process.id;
   } catch (error) {

@@ -395,7 +424,7 @@
     // Wait for process to exit (with timeout)
     const exitPromise = subprocess.exited;
     const timeoutPromise = new Promise<null>((resolve) =>
-      setTimeout(() => resolve(null),
+      setTimeout(() => resolve(null), this.killGraceMs)
     );
 
     const result = await Promise.race([exitPromise, timeoutPromise]);

@@ -412,6 +441,11 @@
     // Cleanup
     await this.socketServer.closeSocket(processId);
     this.subprocesses.delete(processId);
+    const runtimeTimer = this.runtimeLimitTimers.get(processId);
+    if (runtimeTimer) {
+      clearTimeout(runtimeTimer);
+      this.runtimeLimitTimers.delete(processId);
+    }
 
     await this.adapter.update(processId, {
       status: "stopped",

@@ -443,6 +477,11 @@
     // Cleanup
     await this.socketServer.closeSocket(processId);
     this.subprocesses.delete(processId);
+    const runtimeTimer = this.runtimeLimitTimers.get(processId);
+    if (runtimeTimer) {
+      clearTimeout(runtimeTimer);
+      this.runtimeLimitTimers.delete(processId);
+    }
 
     await this.adapter.update(processId, {
       status: "stopped",

@@ -590,6 +629,47 @@
       await definition.onStats(proc, stats);
     }
 
+    const limits = proc.config.limits;
+    if (limits) {
+      if (limits.maxMemoryMb && stats.memory.rss / 1e6 > limits.maxMemoryMb) {
+        console.warn(`[Processes] Memory limit exceeded for ${proc.name} (${proc.id})`);
+        await this.emitEvent("process.limits_exceeded", {
+          processId,
+          name: proc.name,
+          reason: "maxMemoryMb",
+          limit: limits.maxMemoryMb,
+          value: stats.memory.rss / 1e6,
+        });
+        await this.emitEvent("process.watchdog.killed", {
+          processId,
+          name: proc.name,
+          reason: "maxMemoryMb",
+          value: stats.memory.rss / 1e6,
+        });
+        await this.stop(proc.id);
+        return;
+      }
+
+      if (limits.maxCpuPercent && stats.cpu.percent > limits.maxCpuPercent) {
+        console.warn(`[Processes] CPU limit exceeded for ${proc.name} (${proc.id})`);
+        await this.emitEvent("process.limits_exceeded", {
+          processId,
+          name: proc.name,
+          reason: "maxCpuPercent",
+          limit: limits.maxCpuPercent,
+          value: stats.cpu.percent,
+        });
+        await this.emitEvent("process.watchdog.killed", {
+          processId,
+          name: proc.name,
+          reason: "maxCpuPercent",
+          value: stats.cpu.percent,
+        });
+        await this.stop(proc.id);
+        return;
+      }
+    }
+
     return;
   }

@@ -835,16 +915,27 @@
         processId: proc.id,
         name: proc.name,
       });
+      await this.emitEvent("process.watchdog.stale", {
+        processId: proc.id,
+        name: proc.name,
+        reason: "heartbeat",
+        timeoutMs,
+      });
 
       const definition = this.definitions.get(proc.name);
       if (definition?.onUnhealthy) {
         await definition.onUnhealthy(proc);
       }
 
-      // If heartbeat is way overdue (2x timeout),
+      // If heartbeat is way overdue (2x timeout), stop and restart
       if (now - lastHeartbeat > timeoutMs * 2) {
-        console.warn(`[Processes]
-        await this.
+        console.warn(`[Processes] Stopping unresponsive process ${proc.name} (${proc.id})`);
+        await this.stop(proc.id);
+        await this.emitEvent("process.watchdog.killed", {
+          processId: proc.id,
+          name: proc.name,
+          reason: "heartbeat",
+        });
         // handleExit will trigger auto-restart if configured
       }
     }
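Note the unit convention in the checks above: `stats.memory.rss / 1e6` treats `maxMemoryMb` as decimal megabytes (10^6 bytes), not mebibytes. A tiny worked example with illustrative numbers:

```ts
// Illustrative check matching the limit logic above (decimal MB, base 10^6).
const rssBytes = 600_000_000; // as reported in ProcessClient stats (assumed bytes)
const maxMemoryMb = 512;

const rssMb = rssBytes / 1e6; // 600 MB (roughly 572 MiB in base-2 units)
if (rssMb > maxMemoryMb) {
  // here the watchdog emits process.limits_exceeded, then process.watchdog.killed,
  // and stops the process
}
```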
package/src/core/subprocess-bootstrap.ts
CHANGED

@@ -32,6 +32,11 @@ export interface SubprocessPluginMetadata {
 export interface SubprocessBootstrapOptions {
   dbPath: string;
   coreConfig?: Record<string, any>;
+  sqlitePragmas?: {
+    busyTimeout?: number;
+    synchronous?: "OFF" | "NORMAL" | "FULL" | "EXTRA";
+    journalMode?: "DELETE" | "TRUNCATE" | "PERSIST" | "MEMORY" | "WAL" | "OFF";
+  };
   pluginMetadata: SubprocessPluginMetadata;
   startServices?: {
     cron?: boolean;

@@ -53,7 +58,15 @@ export async function bootstrapSubprocess(
   options: SubprocessBootstrapOptions
 ): Promise<SubprocessBootstrapResult> {
   const sqlite = new Database(options.dbPath);
-
+  const pragmas = options.sqlitePragmas;
+  const busyTimeout = pragmas?.busyTimeout ?? 5000;
+  sqlite.run(`PRAGMA busy_timeout = ${busyTimeout}`);
+  if (pragmas?.journalMode) {
+    sqlite.run(`PRAGMA journal_mode = ${pragmas.journalMode}`);
+  }
+  if (pragmas?.synchronous) {
+    sqlite.run(`PRAGMA synchronous = ${pragmas.synchronous}`);
+  }
 
   const db = new Kysely<any>({
     dialect: new BunSqliteDialect({ database: sqlite }),
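The pragmas map one-to-one onto statements run on the subprocess's own connection before Kysely wraps it. For the `sqlitePragmas` example shown in `docs/workflows.md` above, the effect is the following (sketch; the database path is illustrative):

```ts
import { Database } from "bun:sqlite";

// What bootstrapSubprocess executes for the workflows.md example config.
const sqlite = new Database("./data/app.db"); // illustrative dbPath
sqlite.run("PRAGMA busy_timeout = 5000");     // always set (default 5000)
sqlite.run("PRAGMA journal_mode = WAL");      // only when journalMode is provided
sqlite.run("PRAGMA synchronous = NORMAL");    // only when synchronous is provided
```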
package/src/core/workflow-executor.ts
CHANGED

@@ -25,6 +25,11 @@ interface ExecutorConfig {
   pluginModulePaths: Record<string, string>;
   pluginConfigs: Record<string, any>;
   coreConfig?: Record<string, any>;
+  sqlitePragmas?: {
+    busyTimeout?: number;
+    synchronous?: "OFF" | "NORMAL" | "FULL" | "EXTRA";
+    journalMode?: "DELETE" | "TRUNCATE" | "PERSIST" | "MEMORY" | "WAL" | "OFF";
+  };
 }
 
 // ============================================

@@ -47,6 +52,7 @@ async function main(): Promise<void> {
     pluginModulePaths,
     pluginConfigs,
     coreConfig,
+    sqlitePragmas,
   } = config;
 
   const socket = await connectToSocket(socketPath, tcpPort);

@@ -71,6 +77,7 @@
   const bootstrap = await bootstrapSubprocess({
     dbPath,
     coreConfig,
+    sqlitePragmas,
     pluginMetadata: {
       names: pluginNames,
       modulePaths: pluginModulePaths,
package/src/core/workflow-socket.ts
CHANGED

@@ -248,7 +248,24 @@ export class WorkflowSocketServerImpl implements WorkflowSocketServer {
 
     let buffer = "";
 
-
+    const queue: WorkflowMessage[] = [];
+    let processing = false;
+
+    const processQueue = async () => {
+      if (processing) return;
+      processing = true;
+      while (queue.length > 0) {
+        const message = queue.shift()!;
+        try {
+          await this.handleMessage(instanceId, message);
+        } catch (err) {
+          this.onError?.(err instanceof Error ? err : new Error(String(err)), instanceId);
+        }
+      }
+      processing = false;
+    };
+
     socket.on("data", (data) => {
       buffer += data.toString();
       // Process complete messages (newline-delimited JSON)

@@ -260,11 +277,13 @@ export class WorkflowSocketServerImpl implements WorkflowSocketServer {
 
       try {
         const message = JSON.parse(line) as WorkflowMessage;
-
+        queue.push(message);
       } catch (err) {
         this.onError?.(new Error(`Invalid message: ${line}`), instanceId);
       }
     }
+
+    processQueue().catch(() => undefined);
   });
 
   socket.on("error", (err) => {
package/src/core/workflows.ts
CHANGED

@@ -760,12 +760,22 @@ export interface WorkflowsConfig {
   heartbeatTimeout?: number;
   /** Timeout waiting for isolated subprocess readiness (ms, default: 10000) */
   readyTimeout?: number;
+  /** Grace period before SIGKILL when terminating isolated subprocesses (ms, default: 5000) */
+  killGraceMs?: number;
+  /** SQLite pragmas for isolated subprocess connections */
+  sqlitePragmas?: SqlitePragmaConfig;
   /** Resume strategy for orphaned workflows (default: "blocking") */
   resumeStrategy?: WorkflowResumeStrategy;
 }
 
 export type WorkflowResumeStrategy = "blocking" | "background" | "skip";
 
+export interface SqlitePragmaConfig {
+  busyTimeout?: number;
+  synchronous?: "OFF" | "NORMAL" | "FULL" | "EXTRA";
+  journalMode?: "DELETE" | "TRUNCATE" | "PERSIST" | "MEMORY" | "WAL" | "OFF";
+}
+
 /** Options for registering a workflow */
 export interface WorkflowRegisterOptions {
   /**

@@ -854,6 +864,8 @@ class WorkflowsImpl implements Workflows {
   private dbPath?: string;
   private heartbeatTimeoutMs: number;
   private readyTimeoutMs: number;
+  private killGraceMs: number;
+  private sqlitePragmas?: SqlitePragmaConfig;
   private resumeStrategy!: WorkflowResumeStrategy;
   private workflowModulePaths = new Map<string, string>();
   private isolatedProcesses = new Map<string, IsolatedProcessInfo>();

@@ -888,6 +900,8 @@ class WorkflowsImpl implements Workflows {
     this.dbPath = config.dbPath;
     this.heartbeatTimeoutMs = config.heartbeatTimeout ?? 60000;
     this.readyTimeoutMs = config.readyTimeout ?? 10000;
+    this.killGraceMs = config.killGraceMs ?? 5000;
+    this.sqlitePragmas = config.sqlitePragmas;
     this.resumeStrategy = config.resumeStrategy ?? "blocking";
   }
 

@@ -1049,11 +1063,7 @@
     // Kill isolated process if running
     const isolatedInfo = this.isolatedProcesses.get(instanceId);
     if (isolatedInfo) {
-      try {
-        process.kill(isolatedInfo.pid, "SIGTERM");
-      } catch {
-        // Process might already be dead
-      }
+      await killProcessWithGrace(isolatedInfo.pid, this.killGraceMs);
       if (isolatedInfo.timeout) clearTimeout(isolatedInfo.timeout);
       if (isolatedInfo.heartbeatTimeout) clearTimeout(isolatedInfo.heartbeatTimeout);
       this.isolatedProcesses.delete(instanceId);

@@ -1470,6 +1480,7 @@
       pluginModulePaths: this.pluginModulePaths,
       pluginConfigs,
       coreConfig,
+      sqlitePragmas: this.sqlitePragmas,
     };
 
     // Spawn the subprocess

@@ -1995,6 +2006,11 @@
       }
 
       console.error(`[Workflows] No heartbeat from isolated workflow ${instanceId} for ${this.heartbeatTimeoutMs}ms`);
+      await this.emitEvent("workflow.watchdog.stale", {
+        instanceId,
+        reason: "heartbeat",
+        timeoutMs: this.heartbeatTimeoutMs,
+      });
       await this.handleIsolatedTimeout(instanceId, pid);
     }, this.heartbeatTimeoutMs);
   }

@@ -2006,12 +2022,12 @@
     const info = this.isolatedProcesses.get(instanceId);
     if (!info) return;
 
-
-
-
-
-
-    }
+    await killProcessWithGrace(pid, this.killGraceMs);
+    await this.emitEvent("workflow.watchdog.killed", {
+      instanceId,
+      reason: "timeout",
+      timeoutMs: this.heartbeatTimeoutMs,
+    });
 
     // Clean up
     if (info.timeout) clearTimeout(info.timeout);

@@ -2148,3 +2164,29 @@ function isPlainObject(value: Record<string, any>): boolean {
 export function createWorkflows(config?: WorkflowsConfig): Workflows {
   return new WorkflowsImpl(config);
 }
+
+async function killProcessWithGrace(pid: number, graceMs: number): Promise<void> {
+  try {
+    process.kill(pid, "SIGTERM");
+  } catch {
+    return;
+  }
+
+  if (graceMs <= 0) {
+    try {
+      process.kill(pid, "SIGKILL");
+    } catch {
+      return;
+    }
+    return;
+  }
+
+  await new Promise((resolve) => setTimeout(resolve, graceMs));
+
+  try {
+    process.kill(pid, 0);
+    process.kill(pid, "SIGKILL");
+  } catch {
+    // Process already exited
+  }
+}
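One design difference worth noting: this `killProcessWithGrace` waits out the grace period inline, so callers resolve only after the SIGKILL decision, while `jobs.ts` schedules a `killTimer` and returns immediately. A condensed comparison (illustrative; SIGTERM error handling elided):

```ts
// Blocking variant (as in workflows.ts): the caller awaits the full grace window.
async function blockingKill(pid: number, graceMs: number): Promise<void> {
  process.kill(pid, "SIGTERM");
  await new Promise((resolve) => setTimeout(resolve, graceMs));
  try {
    process.kill(pid, 0);         // probe: throws if already exited
    process.kill(pid, "SIGKILL");
  } catch {}
}

// Scheduled variant (as in jobs.ts): returns at once; the timer fires later and
// can be cancelled via clearTimeout if the job completes within the grace window.
function scheduledKill(pid: number, graceMs: number): ReturnType<typeof setTimeout> {
  process.kill(pid, "SIGTERM");
  return setTimeout(() => {
    try {
      process.kill(pid, 0);
      process.kill(pid, "SIGKILL");
    } catch {}
  }, graceMs);
}
```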