@donkeylabs/server 2.0.27 → 2.0.29
- package/docs/jobs.md +20 -0
- package/docs/processes.md +27 -0
- package/docs/workflows.md +69 -0
- package/package.json +1 -1
- package/src/admin/routes.ts +24 -0
- package/src/core/external-job-socket.ts +20 -1
- package/src/core/external-jobs.ts +4 -0
- package/src/core/index.ts +4 -0
- package/src/core/jobs.ts +96 -50
- package/src/core/process-socket.ts +20 -1
- package/src/core/processes.ts +95 -4
- package/src/core/subprocess-bootstrap.ts +14 -1
- package/src/core/workflow-adapter-kysely.ts +5 -0
- package/src/core/workflow-executor.ts +28 -0
- package/src/core/workflow-socket.ts +28 -2
- package/src/core/workflow-state-machine.ts +147 -2
- package/src/core/workflows.ts +260 -13
package/docs/jobs.md
CHANGED

@@ -192,6 +192,26 @@ const cancelled = await ctx.core.jobs.cancel(jobId);
 
 ---
 
+## External Jobs (Subprocess)
+
+External jobs run in a separate process and are monitored by a watchdog.
+
+```ts
+ctx.core.jobs.registerExternal("batchWorker", {
+  command: "bun",
+  args: ["./workers/batch-worker.ts"],
+  heartbeatTimeout: 30000,
+  timeout: 10 * 60 * 1000,
+  killGraceMs: 5000,
+});
+```
+
+Watchdog events:
+- `job.watchdog.stale`
+- `job.watchdog.killed`
+
+---
+
 ## Event Integration
 
 Jobs automatically emit events on completion and failure:
package/docs/processes.md
CHANGED

@@ -8,6 +8,7 @@ Processes provide:
 - Long-running daemon management (start, stop, restart)
 - Typed event communication from process to server
 - Automatic heartbeat monitoring
+- Watchdog termination for unresponsive processes
 - Connection resilience with auto-reconnection
 - Metadata passing to spawned processes
 - Cross-platform support (Unix sockets / TCP on Windows)

@@ -89,6 +90,13 @@ server.getCore().processes.define("video-encoder", {
 
   // Heartbeat configuration
   heartbeatTimeout: 30000, // 30 seconds
+
+  // Optional hard limits (requires stats for memory/CPU)
+  limits: {
+    maxRuntimeMs: 60_000,
+    maxMemoryMb: 512,
+    maxCpuPercent: 90,
+  },
 });
 ```
 

@@ -221,6 +229,25 @@ const client = await ProcessClient.connect({
 });
 ```
 
+---
+
+## Hard Limits
+
+Processes can be terminated automatically when limits are exceeded:
+
+- `maxRuntimeMs` always enforced by the server watchdog
+- `maxMemoryMb` and `maxCpuPercent` require `ProcessClient` stats enabled
+
+```ts
+const client = await ProcessClient.connect({
+  stats: { enabled: true, interval: 5000 },
+});
+```
+
+Watchdog events:
+- `process.watchdog.stale`
+- `process.watchdog.killed`
+
 ### Properties
 
 ```typescript
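The memory and CPU limits documented above are evaluated against the stats the process client reports. A rough sketch of that comparison, with the stats shape taken from this diff and illustrative limit values:

```ts
// Sketch: compare reported stats against configured hard limits (limit values illustrative).
interface ProcessStats {
  memory: { rss: number };  // resident set size in bytes
  cpu: { percent: number };
}

const limits = { maxMemoryMb: 512, maxCpuPercent: 90 };

function exceededLimit(stats: ProcessStats): "maxMemoryMb" | "maxCpuPercent" | null {
  if (limits.maxMemoryMb && stats.memory.rss / 1e6 > limits.maxMemoryMb) return "maxMemoryMb";
  if (limits.maxCpuPercent && stats.cpu.percent > limits.maxCpuPercent) return "maxCpuPercent";
  return null; // within limits
}
```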
package/docs/workflows.md
CHANGED

@@ -285,6 +285,75 @@ workflow("example")
   .end("done")
 ```
 
+### Poll
+
+Use a poll step for wait → check loops that persist across restarts.
+
+```typescript
+workflow("batch.status")
+  .poll("wait-for-result", {
+    interval: 5000,
+    timeout: 600000,
+    maxAttempts: 120,
+    check: async (input, ctx) => {
+      const status = await fetchStatus(input.operationId);
+      if (status.state === "FAILED") throw new Error(status.error);
+      if (status.state === "SUCCEEDED") {
+        return { done: true, result: status.data };
+      }
+      return { done: false };
+    },
+  })
+  .build();
+```
+
+Each poll cycle emits `workflow.step.poll` events and persists progress to the instance.
+
+---
+
+## Watchdog and Subprocess Settings
+
+You can tune subprocess termination and SQLite pragmas used by isolated workflows:
+
+```ts
+const server = new AppServer({
+  db,
+  workflows: {
+    killGraceMs: 5000,
+    sqlitePragmas: {
+      busyTimeout: 5000,
+      journalMode: "WAL",
+      synchronous: "NORMAL",
+    },
+  },
+});
+```
+
+Watchdog events:
+- `workflow.watchdog.stale` (heartbeat missed)
+- `workflow.watchdog.killed` (process terminated)
+
+### Loop
+
+Use a loop step to jump back to a previous step until a condition is false.
+
+```typescript
+workflow("loop-example")
+  .task("increment", {
+    handler: async (input) => ({ count: (input.count ?? 0) + 1 }),
+  })
+  .loop("repeat", {
+    condition: (ctx) => ctx.steps.increment.count < 3,
+    target: "increment",
+    interval: 1000,
+    maxIterations: 10,
+    timeout: 30000,
+  })
+  .build();
+```
+
+Each loop iteration emits `workflow.step.loop` and persists loop counters to the instance.
+
 ## Workflow Context
 
 Every step receives a `WorkflowContext` with:
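For intuition, the poll options documented above behave roughly like the loop below. This is a plain-TypeScript sketch of the semantics only, not the executor's implementation (which also persists progress and emits events between attempts):

```ts
// Sketch of poll semantics: run check() every `interval` ms until done, timeout, or maxAttempts.
type PollResult<T> = { done: true; result: T } | { done: false };

async function pollSketch<T>(
  check: () => Promise<PollResult<T>>,
  opts: { interval: number; timeout: number; maxAttempts: number },
): Promise<T> {
  const deadline = Date.now() + opts.timeout;
  for (let attempt = 1; attempt <= opts.maxAttempts; attempt++) {
    const outcome = await check();           // throwing here fails the step
    if (outcome.done) return outcome.result; // becomes the step output
    if (Date.now() >= deadline) break;
    await new Promise((resolve) => setTimeout(resolve, opts.interval));
  }
  throw new Error("poll timed out");
}
```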
package/package.json
CHANGED
package/src/admin/routes.ts
CHANGED

@@ -505,6 +505,17 @@ export function createAdminRouter(config: AdminRouteContext) {
         stepName: z.string(),
         error: z.string(),
       }),
+      "step.poll": z.object({
+        stepName: z.string(),
+        pollCount: z.number(),
+        done: z.boolean(),
+        result: z.any().optional(),
+      }),
+      "step.loop": z.object({
+        stepName: z.string(),
+        loopCount: z.number(),
+        target: z.string(),
+      }),
       completed: z.object({
         output: z.any().optional(),
       }),

@@ -548,6 +559,19 @@ export function createAdminRouter(config: AdminRouteContext) {
         workflowName: z.string(),
         error: z.string(),
       }),
+      "workflow.step.poll": z.object({
+        instanceId: z.string(),
+        stepName: z.string(),
+        pollCount: z.number(),
+        done: z.boolean(),
+        result: z.any().optional(),
+      }),
+      "workflow.step.loop": z.object({
+        instanceId: z.string(),
+        stepName: z.string(),
+        loopCount: z.number(),
+        target: z.string(),
+      }),
     },
     handle: (input, ctx) => {
       if (!checkAuth(ctx)) {
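For reference, payloads that satisfy the two new workflow event schemas above look like this (all values illustrative):

```ts
// Illustrative payloads matching the new admin event schemas.
const pollEvent = {
  instanceId: "wf_123",
  stepName: "wait-for-result",
  pollCount: 4,
  done: false, // `result` is optional and typically present once done is true
};

const loopEvent = {
  instanceId: "wf_123",
  stepName: "repeat",
  loopCount: 2,
  target: "increment",
};
```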
package/src/core/external-job-socket.ts
CHANGED

@@ -201,6 +201,23 @@ export class ExternalJobSocketServerImpl implements ExternalJobSocketServer {
 
     let buffer = "";
 
+    const queue: AnyExternalJobMessage[] = [];
+    let processing = false;
+
+    const processQueue = async () => {
+      if (processing) return;
+      processing = true;
+      while (queue.length > 0) {
+        const message = queue.shift()!;
+        try {
+          await this.onMessage(message);
+        } catch (err) {
+          this.onError?.(err instanceof Error ? err : new Error(String(err)), jobId);
+        }
+      }
+      processing = false;
+    };
+
     socket.on("data", (data) => {
       buffer += data.toString();
 

@@ -213,11 +230,13 @@ export class ExternalJobSocketServerImpl implements ExternalJobSocketServer {
 
         const message = parseJobMessage(line);
         if (message) {
-
+          queue.push(message);
         } else {
           this.onError?.(new Error(`Invalid message: ${line}`), jobId);
         }
       }
+
+      processQueue().catch(() => undefined);
     });
 
     socket.on("error", (err) => {
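The queue added above serializes asynchronous handling of messages that arrive back-to-back on the socket. The framing itself (splitting the buffered data into lines before `parseJobMessage`) is not shown in this hunk; assuming newline-delimited JSON, the combined pattern looks roughly like this standalone sketch:

```ts
// Sketch: newline-delimited framing plus in-order async handling (types and parsing simplified).
type Message = { type: string; [key: string]: unknown };

let buffer = "";
const queue: Message[] = [];
let processing = false;

async function drain(handle: (m: Message) => Promise<void>): Promise<void> {
  if (processing) return; // a drain pass is already running
  processing = true;
  while (queue.length > 0) {
    await handle(queue.shift()!); // one message at a time, in arrival order
  }
  processing = false;
}

function onData(chunk: Buffer, handle: (m: Message) => Promise<void>): void {
  buffer += chunk.toString();
  const lines = buffer.split("\n");
  buffer = lines.pop() ?? ""; // keep any trailing partial line for the next chunk
  for (const line of lines) {
    if (line.trim()) queue.push(JSON.parse(line) as Message);
  }
  void drain(handle);
}
```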
package/src/core/external-jobs.ts
CHANGED

@@ -80,6 +80,8 @@ export interface ExternalJobConfig {
   heartbeatTimeout?: number;
   /** Job timeout in milliseconds (optional) */
   timeout?: number;
+  /** Grace period before SIGKILL when terminating (ms, default: 5000) */
+  killGraceMs?: number;
 }
 
 // ============================================

@@ -120,6 +122,8 @@ export interface ExternalJobsConfig {
   defaultHeartbeatTimeout?: number;
   /** Heartbeat check interval in ms (default: 10000) */
   heartbeatCheckInterval?: number;
+  /** Default grace period before SIGKILL when terminating (ms, default: 5000) */
+  killGraceMs?: number;
 }
 
 // ============================================
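Per the two interfaces above, the grace period can be set on an individual external job or as a registry-wide default. Elsewhere in this diff the effective value resolves per-job first, then the registry default, then 5000 ms; a small illustration (variable names hypothetical):

```ts
// Effective grace period: per-job ExternalJobConfig → ExternalJobsConfig default → 5000 ms.
const jobConfig: { killGraceMs?: number } = {};                          // no per-job override
const registryConfig: { killGraceMs?: number } = { killGraceMs: 10_000 };

const killGraceMs = jobConfig.killGraceMs ?? registryConfig.killGraceMs ?? 5000; // → 10000
```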
package/src/core/index.ts
CHANGED

@@ -134,6 +134,7 @@ export {
 export {
   type Workflows,
   type WorkflowsConfig,
+  type SqlitePragmaConfig,
   type WorkflowRegisterOptions,
   type WorkflowDefinition,
   type WorkflowInstance,

@@ -149,6 +150,9 @@ export {
   type ChoiceStepDefinition,
   type ChoiceCondition,
   type PassStepDefinition,
+  type PollStepDefinition,
+  type PollStepResult,
+  type LoopStepDefinition,
   type RetryConfig,
   type GetAllWorkflowsOptions,
   type PluginMetadata,
package/src/core/jobs.ts
CHANGED

@@ -273,7 +273,10 @@ class JobsImpl implements Jobs {
   private externalConfigs = new Map<string, ExternalJobConfig>();
   private externalConfig: ExternalJobsConfig;
   private socketServer: ExternalJobSocketServer | null = null;
-  private externalProcesses = new Map<
+  private externalProcesses = new Map<
+    string,
+    { pid: number; timeout?: ReturnType<typeof setTimeout>; killTimer?: ReturnType<typeof setTimeout> }
+  >();
 
   constructor(config: JobsConfig = {}) {
     this.events = config.events;

@@ -521,6 +524,7 @@ class JobsImpl implements Jobs {
 
       const config = this.externalConfigs.get(job.name);
       const heartbeatTimeout = config?.heartbeatTimeout ?? this.externalConfig.defaultHeartbeatTimeout ?? 30000;
+      const killGraceMs = config?.killGraceMs ?? this.externalConfig.killGraceMs ?? 5000;
      const timeSinceHeartbeat = now - job.lastHeartbeat.getTime();
 
      if (timeSinceHeartbeat > heartbeatTimeout) {

@@ -533,36 +537,22 @@ class JobsImpl implements Jobs {
            name: job.name,
            timeSinceHeartbeat,
          });
-
-
-
-
-          console.error(`[Jobs] Killing stale external job ${job.id}`);
-
-          if (job.pid) {
-            try {
-              process.kill(job.pid, "SIGKILL");
-            } catch {
-              // Process may already be dead
-            }
-          }
-
-          await this.adapter.update(job.id, {
-            status: "failed",
-            error: "Heartbeat timeout - job process unresponsive",
-            completedAt: new Date(),
-            processState: "orphaned",
+          await this.events.emit("job.watchdog.stale", {
+            jobId: job.id,
+            name: job.name,
+            timeSinceHeartbeat,
          });
+        }
 
-
-
-
-
-
-
-
-
+        const procInfo = this.externalProcesses.get(job.id);
+        if (job.pid && !procInfo?.killTimer) {
+          console.error(`[Jobs] Terminating stale external job ${job.id}`);
+          await this.terminateExternalProcess(
+            job.id,
+            job.pid,
+            killGraceMs,
+            "Heartbeat timeout - job process unresponsive"
+          );
        }
      }
    }

@@ -764,6 +754,9 @@ class JobsImpl implements Jobs {
     if (procInfo?.timeout) {
       clearTimeout(procInfo.timeout);
     }
+    if (procInfo?.killTimer) {
+      clearTimeout(procInfo.killTimer);
+    }
     this.externalProcesses.delete(jobId);
 
     // Close the socket

@@ -901,27 +894,13 @@ class JobsImpl implements Jobs {
     if (config.timeout) {
       const timeout = setTimeout(async () => {
         console.warn(`[Jobs] External job ${job.id} timed out after ${config.timeout}ms`);
-
-
-
-
-
-
-
-          status: "failed",
-          error: `Job timed out after ${config.timeout}ms`,
-          completedAt: new Date(),
-        });
-
-        await this.cleanupExternalJob(job.id);
-
-        if (this.events) {
-          await this.events.emit("job.failed", {
-            jobId: job.id,
-            name: job.name,
-            error: "Timeout",
-          });
-        }
+        const killGraceMs = config.killGraceMs ?? this.externalConfig.killGraceMs ?? 5000;
+        await this.terminateExternalProcess(
+          job.id,
+          proc.pid,
+          killGraceMs,
+          `Job timed out after ${config.timeout}ms`
+        );
       }, config.timeout);
 
       const procInfo = this.externalProcesses.get(job.id);

@@ -998,6 +977,73 @@ class JobsImpl implements Jobs {
     }
   }
 
+  private async terminateExternalProcess(
+    jobId: string,
+    pid: number,
+    killGraceMs: number,
+    error: string
+  ): Promise<void> {
+    try {
+      process.kill(pid, "SIGTERM");
+    } catch {
+      return;
+    }
+
+    if (killGraceMs <= 0) {
+      try {
+        process.kill(pid, "SIGKILL");
+      } catch {
+        // ignore
+      }
+      await this.handleExternalFailure(jobId, error);
+      return;
+    }
+
+    const timer = setTimeout(async () => {
+      try {
+        process.kill(pid, 0);
+        process.kill(pid, "SIGKILL");
+      } catch {
+        // ignore
+      }
+
+      await this.handleExternalFailure(jobId, error);
+    }, killGraceMs);
+
+    const procInfo = this.externalProcesses.get(jobId);
+    if (procInfo) {
+      procInfo.killTimer = timer;
+    }
+  }
+
+  private async handleExternalFailure(jobId: string, error: string): Promise<void> {
+    await this.adapter.update(jobId, {
+      status: "failed",
+      error,
+      completedAt: new Date(),
+      processState: "orphaned",
+    });
+
+    const job = await this.adapter.get(jobId);
+    if (this.events && job) {
+      await this.events.emit("job.watchdog.killed", {
+        jobId,
+        name: job.name,
+        reason: error,
+      });
+    }
+
+    await this.cleanupExternalJob(jobId);
+
+    if (this.events && job) {
+      await this.events.emit("job.failed", {
+        jobId,
+        name: job.name,
+        error,
+      });
+    }
+  }
+
   private streamProcessOutput(
     jobId: string,
     jobName: string,
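One detail in `terminateExternalProcess` above worth noting: `process.kill(pid, 0)` delivers no signal at all; it only throws if the pid no longer exists (or cannot be signaled), so the forced SIGKILL is skipped for workers that already exited during the grace period. The same probe in isolation:

```ts
// Signal 0 performs an existence/permission check only; nothing is delivered to the process.
function isAlive(pid: number): boolean {
  try {
    process.kill(pid, 0);
    return true;
  } catch {
    return false;
  }
}
```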
package/src/core/process-socket.ts
CHANGED

@@ -207,6 +207,23 @@ export class ProcessSocketServerImpl implements ProcessSocketServer {
 
     let buffer = "";
 
+    const queue: ProcessMessage[] = [];
+    let processing = false;
+
+    const processQueue = async () => {
+      if (processing) return;
+      processing = true;
+      while (queue.length > 0) {
+        const message = queue.shift()!;
+        try {
+          await this.onMessage(message);
+        } catch (err) {
+          this.onError?.(err instanceof Error ? err : new Error(String(err)), processId);
+        }
+      }
+      processing = false;
+    };
+
     socket.on("data", (data) => {
       buffer += data.toString();
 

@@ -219,11 +236,13 @@ export class ProcessSocketServerImpl implements ProcessSocketServer {
 
         const message = this.parseMessage(line);
         if (message) {
-
+          queue.push(message);
         } else {
           this.onError?.(new Error(`Invalid message: ${line}`), processId);
         }
       }
+
+      processQueue().catch(() => undefined);
     });
 
     socket.on("error", (err) => {
|
package/src/core/processes.ts
CHANGED
|
@@ -61,6 +61,15 @@ export interface ProcessConfig {
|
|
|
61
61
|
/** Timeout before considering unhealthy in ms (default: 60000) */
|
|
62
62
|
timeoutMs?: number;
|
|
63
63
|
};
|
|
64
|
+
/** Hard limits for the process (optional) */
|
|
65
|
+
limits?: {
|
|
66
|
+
/** Max runtime in ms before termination */
|
|
67
|
+
maxRuntimeMs?: number;
|
|
68
|
+
/** Max memory (RSS) in MB before termination (requires stats enabled) */
|
|
69
|
+
maxMemoryMb?: number;
|
|
70
|
+
/** Max CPU percent before termination (requires stats enabled) */
|
|
71
|
+
maxCpuPercent?: number;
|
|
72
|
+
};
|
|
64
73
|
}
|
|
65
74
|
|
|
66
75
|
export interface ManagedProcess {
|
|
@@ -171,6 +180,8 @@ export interface ProcessesConfig {
|
|
|
171
180
|
heartbeatCheckInterval?: number;
|
|
172
181
|
/** Enable auto-reconnect to orphaned processes on startup (default: true) */
|
|
173
182
|
autoRecoverOrphans?: boolean;
|
|
183
|
+
/** Grace period before SIGKILL when stopping/killing (ms, default: 5000) */
|
|
184
|
+
killGraceMs?: number;
|
|
174
185
|
}
|
|
175
186
|
|
|
176
187
|
// ============================================
|
|
@@ -251,6 +262,8 @@ export class ProcessesImpl implements Processes {
|
|
|
251
262
|
private events?: Events;
|
|
252
263
|
private heartbeatCheckInterval: number;
|
|
253
264
|
private autoRecoverOrphans: boolean;
|
|
265
|
+
private killGraceMs: number;
|
|
266
|
+
private runtimeLimitTimers = new Map<string, ReturnType<typeof setTimeout>>();
|
|
254
267
|
|
|
255
268
|
// Track running Bun subprocesses
|
|
256
269
|
private subprocesses = new Map<string, Subprocess>();
|
|
@@ -266,6 +279,7 @@ export class ProcessesImpl implements Processes {
|
|
|
266
279
|
this.events = config.events;
|
|
267
280
|
this.heartbeatCheckInterval = config.heartbeatCheckInterval ?? 10000;
|
|
268
281
|
this.autoRecoverOrphans = config.autoRecoverOrphans ?? true;
|
|
282
|
+
this.killGraceMs = config.killGraceMs ?? 5000;
|
|
269
283
|
|
|
270
284
|
// Create socket server with callbacks
|
|
271
285
|
this.socketServer = createProcessSocketServer(config.socket ?? {}, {
|
|
@@ -361,6 +375,21 @@ export class ProcessesImpl implements Processes {
|
|
|
361
375
|
// Set up exit handler for crash detection
|
|
362
376
|
proc.exited.then((exitCode) => this.handleExit(process.id, exitCode));
|
|
363
377
|
|
|
378
|
+
const maxRuntimeMs = config.limits?.maxRuntimeMs;
|
|
379
|
+
if (maxRuntimeMs && maxRuntimeMs > 0) {
|
|
380
|
+
const timer = setTimeout(async () => {
|
|
381
|
+
console.warn(`[Processes] Max runtime exceeded for ${name} (${process.id})`);
|
|
382
|
+
await this.emitEvent("process.limits_exceeded", {
|
|
383
|
+
processId: process.id,
|
|
384
|
+
name,
|
|
385
|
+
reason: "maxRuntimeMs",
|
|
386
|
+
limit: maxRuntimeMs,
|
|
387
|
+
});
|
|
388
|
+
await this.stop(process.id);
|
|
389
|
+
}, maxRuntimeMs);
|
|
390
|
+
this.runtimeLimitTimers.set(process.id, timer);
|
|
391
|
+
}
|
|
392
|
+
|
|
364
393
|
console.log(`[Processes] Spawned ${name} (${process.id}) with PID ${proc.pid}`);
|
|
365
394
|
return process.id;
|
|
366
395
|
} catch (error) {
|
|
@@ -395,7 +424,7 @@ export class ProcessesImpl implements Processes {
|
|
|
395
424
|
// Wait for process to exit (with timeout)
|
|
396
425
|
const exitPromise = subprocess.exited;
|
|
397
426
|
const timeoutPromise = new Promise<null>((resolve) =>
|
|
398
|
-
setTimeout(() => resolve(null),
|
|
427
|
+
setTimeout(() => resolve(null), this.killGraceMs)
|
|
399
428
|
);
|
|
400
429
|
|
|
401
430
|
const result = await Promise.race([exitPromise, timeoutPromise]);
|
|
@@ -412,6 +441,11 @@ export class ProcessesImpl implements Processes {
|
|
|
412
441
|
// Cleanup
|
|
413
442
|
await this.socketServer.closeSocket(processId);
|
|
414
443
|
this.subprocesses.delete(processId);
|
|
444
|
+
const runtimeTimer = this.runtimeLimitTimers.get(processId);
|
|
445
|
+
if (runtimeTimer) {
|
|
446
|
+
clearTimeout(runtimeTimer);
|
|
447
|
+
this.runtimeLimitTimers.delete(processId);
|
|
448
|
+
}
|
|
415
449
|
|
|
416
450
|
await this.adapter.update(processId, {
|
|
417
451
|
status: "stopped",
|
|
@@ -443,6 +477,11 @@ export class ProcessesImpl implements Processes {
|
|
|
443
477
|
// Cleanup
|
|
444
478
|
await this.socketServer.closeSocket(processId);
|
|
445
479
|
this.subprocesses.delete(processId);
|
|
480
|
+
const runtimeTimer = this.runtimeLimitTimers.get(processId);
|
|
481
|
+
if (runtimeTimer) {
|
|
482
|
+
clearTimeout(runtimeTimer);
|
|
483
|
+
this.runtimeLimitTimers.delete(processId);
|
|
484
|
+
}
|
|
446
485
|
|
|
447
486
|
await this.adapter.update(processId, {
|
|
448
487
|
status: "stopped",
|
|
@@ -590,6 +629,47 @@ export class ProcessesImpl implements Processes {
|
|
|
590
629
|
await definition.onStats(proc, stats);
|
|
591
630
|
}
|
|
592
631
|
|
|
632
|
+
const limits = proc.config.limits;
|
|
633
|
+
if (limits) {
|
|
634
|
+
if (limits.maxMemoryMb && stats.memory.rss / 1e6 > limits.maxMemoryMb) {
|
|
635
|
+
console.warn(`[Processes] Memory limit exceeded for ${proc.name} (${proc.id})`);
|
|
636
|
+
await this.emitEvent("process.limits_exceeded", {
|
|
637
|
+
processId,
|
|
638
|
+
name: proc.name,
|
|
639
|
+
reason: "maxMemoryMb",
|
|
640
|
+
limit: limits.maxMemoryMb,
|
|
641
|
+
value: stats.memory.rss / 1e6,
|
|
642
|
+
});
|
|
643
|
+
await this.emitEvent("process.watchdog.killed", {
|
|
644
|
+
processId,
|
|
645
|
+
name: proc.name,
|
|
646
|
+
reason: "maxMemoryMb",
|
|
647
|
+
value: stats.memory.rss / 1e6,
|
|
648
|
+
});
|
|
649
|
+
await this.stop(proc.id);
|
|
650
|
+
return;
|
|
651
|
+
}
|
|
652
|
+
|
|
653
|
+
if (limits.maxCpuPercent && stats.cpu.percent > limits.maxCpuPercent) {
|
|
654
|
+
console.warn(`[Processes] CPU limit exceeded for ${proc.name} (${proc.id})`);
|
|
655
|
+
await this.emitEvent("process.limits_exceeded", {
|
|
656
|
+
processId,
|
|
657
|
+
name: proc.name,
|
|
658
|
+
reason: "maxCpuPercent",
|
|
659
|
+
limit: limits.maxCpuPercent,
|
|
660
|
+
value: stats.cpu.percent,
|
|
661
|
+
});
|
|
662
|
+
await this.emitEvent("process.watchdog.killed", {
|
|
663
|
+
processId,
|
|
664
|
+
name: proc.name,
|
|
665
|
+
reason: "maxCpuPercent",
|
|
666
|
+
value: stats.cpu.percent,
|
|
667
|
+
});
|
|
668
|
+
await this.stop(proc.id);
|
|
669
|
+
return;
|
|
670
|
+
}
|
|
671
|
+
}
|
|
672
|
+
|
|
593
673
|
return;
|
|
594
674
|
}
|
|
595
675
|
|
|
@@ -835,16 +915,27 @@ export class ProcessesImpl implements Processes {
|
|
|
835
915
|
processId: proc.id,
|
|
836
916
|
name: proc.name,
|
|
837
917
|
});
|
|
918
|
+
await this.emitEvent("process.watchdog.stale", {
|
|
919
|
+
processId: proc.id,
|
|
920
|
+
name: proc.name,
|
|
921
|
+
reason: "heartbeat",
|
|
922
|
+
timeoutMs,
|
|
923
|
+
});
|
|
838
924
|
|
|
839
925
|
const definition = this.definitions.get(proc.name);
|
|
840
926
|
if (definition?.onUnhealthy) {
|
|
841
927
|
await definition.onUnhealthy(proc);
|
|
842
928
|
}
|
|
843
929
|
|
|
844
|
-
// If heartbeat is way overdue (2x timeout),
|
|
930
|
+
// If heartbeat is way overdue (2x timeout), stop and restart
|
|
845
931
|
if (now - lastHeartbeat > timeoutMs * 2) {
|
|
846
|
-
console.warn(`[Processes]
|
|
847
|
-
await this.
|
|
932
|
+
console.warn(`[Processes] Stopping unresponsive process ${proc.name} (${proc.id})`);
|
|
933
|
+
await this.stop(proc.id);
|
|
934
|
+
await this.emitEvent("process.watchdog.killed", {
|
|
935
|
+
processId: proc.id,
|
|
936
|
+
name: proc.name,
|
|
937
|
+
reason: "heartbeat",
|
|
938
|
+
});
|
|
848
939
|
// handleExit will trigger auto-restart if configured
|
|
849
940
|
}
|
|
850
941
|
}
|