@donkeylabs/server 2.0.29 → 2.0.31
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/docs/jobs.md +7 -0
- package/docs/processes.md +2 -0
- package/docs/workflows.md +21 -0
- package/package.json +1 -1
- package/src/core/external-jobs.ts +2 -0
- package/src/core/jobs.ts +14 -2
- package/src/core/processes.ts +6 -1
- package/src/core/watchdog-executor.ts +80 -0
- package/src/core/watchdog-runner.ts +276 -0
- package/src/core/workflows.ts +49 -1
- package/src/server.ts +84 -1
package/docs/jobs.md
CHANGED
@@ -204,6 +204,13 @@ ctx.core.jobs.registerExternal("batchWorker", {
   timeout: 10 * 60 * 1000,
   killGraceMs: 5000,
 });
+
+// Disable in-process timers when using the watchdog subprocess
+const server = new AppServer({
+  db,
+  watchdog: { enabled: true },
+  jobs: { external: { useWatchdog: true } },
+});
 ```
 
 Watchdog events:
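For context, the external-job registration documented above could also carry per-job watchdog tuning. A minimal sketch, assuming `heartbeatTimeout` is accepted by `registerExternal` the same way it appears in the watchdog's per-job config shape (that field name is an assumption; `timeout` and `killGraceMs` come from the excerpt above):

```ts
// Sketch only: "batchWorker" and timeout/killGraceMs follow the docs excerpt above;
// heartbeatTimeout is assumed from the watchdog-runner job config shape in this diff.
ctx.core.jobs.registerExternal("batchWorker", {
  timeout: 10 * 60 * 1000,  // hard runtime limit enforced by the watchdog
  heartbeatTimeout: 30_000, // assumed: stale-heartbeat threshold for this job
  killGraceMs: 5000,        // SIGTERM -> SIGKILL grace period
});
```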
package/docs/processes.md
CHANGED
package/docs/workflows.md
CHANGED
@@ -89,6 +89,19 @@ const instanceId = await ctx.core.workflows.start("process-order", {
 });
 ```
 
+### Concurrency Guard
+
+Limit concurrent instances per workflow name:
+
+```ts
+const server = new AppServer({
+  db,
+  workflows: {
+    concurrentWorkflows: 1, // 0 = unlimited
+  },
+});
+```
+
 ### 3. Track Progress
 
 ```typescript

@@ -318,6 +331,12 @@ You can tune subprocess termination and SQLite pragmas used by isolated workflow
 ```ts
 const server = new AppServer({
   db,
+  watchdog: {
+    enabled: true,
+    intervalMs: 5000,
+    services: ["workflows", "jobs", "processes"],
+    killGraceMs: 5000,
+  },
   workflows: {
     killGraceMs: 5000,
     sqlitePragmas: {

@@ -329,6 +348,8 @@ const server = new AppServer({
 });
 ```
 
+When `watchdog.enabled` is true, workflow heartbeat timers run in the watchdog subprocess instead of the main server.
+
 Watchdog events:
 - `workflow.watchdog.stale` (heartbeat missed)
 - `workflow.watchdog.killed` (process terminated)
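The concurrency guard documented above is enforced by throwing from `start()` once the limit is reached (see the workflows.ts hunk later in this diff), so callers may want to handle that case. A minimal sketch using the `ctx.core.workflows.start("process-order", ...)` call from the docs; the input object is a placeholder:

```ts
// Sketch: "process-order" follows the docs example; the "concurrency limit"
// message text comes from the workflows.ts change in this diff.
try {
  const instanceId = await ctx.core.workflows.start("process-order", { orderId: "order-123" });
  console.log("started", instanceId);
} catch (err) {
  if (err instanceof Error && err.message.includes("concurrency limit")) {
    // Another "process-order" instance is already running or pending; retry later.
  } else {
    throw err;
  }
}
```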
package/package.json
CHANGED
package/src/core/external-jobs.ts
CHANGED

@@ -124,6 +124,8 @@ export interface ExternalJobsConfig {
   heartbeatCheckInterval?: number;
   /** Default grace period before SIGKILL when terminating (ms, default: 5000) */
   killGraceMs?: number;
+  /** Disable in-process watchdog timers (use external watchdog instead) */
+  useWatchdog?: boolean;
 }
 
 // ============================================
package/src/core/jobs.ts
CHANGED
@@ -141,6 +141,8 @@ export interface Jobs {
   getRunningExternal(): Promise<Job[]>;
   /** Get all jobs with optional filtering (for admin dashboard) */
   getAll(options?: GetAllJobsOptions): Promise<Job[]>;
+  /** Get external job config snapshot for watchdog */
+  getExternalJobConfigs(): Record<string, ExternalJobConfig>;
   /** Start the job processing loop */
   start(): void;
   /** Stop the job processing and cleanup */

@@ -325,6 +327,14 @@ class JobsImpl implements Jobs {
     this.externalConfigs.set(name, config);
   }
 
+  getExternalJobConfigs(): Record<string, ExternalJobConfig> {
+    const snapshot: Record<string, ExternalJobConfig> = {};
+    for (const [name, config] of this.externalConfigs.entries()) {
+      snapshot[name] = { ...config };
+    }
+    return snapshot;
+  }
+
   private isExternalJob(name: string): boolean {
     return this.externalConfigs.has(name);
   }

@@ -422,7 +432,9 @@ class JobsImpl implements Jobs {
     // Initialize socket server for external jobs
     if (this.externalConfigs.size > 0) {
      this.initializeSocketServer();
-      this.startHeartbeatMonitor();
+      if (!this.externalConfig.useWatchdog) {
+        this.startHeartbeatMonitor();
+      }
       // Attempt to reconnect to orphaned jobs from previous run
       this.reconnectOrphanedJobs();
     }

@@ -891,7 +903,7 @@ class JobsImpl implements Jobs {
     proc.stdin.end();
 
     // Set up process timeout if configured
-    if (config.timeout) {
+    if (config.timeout && !this.externalConfig.useWatchdog) {
       const timeout = setTimeout(async () => {
         console.warn(`[Jobs] External job ${job.id} timed out after ${config.timeout}ms`);
         const killGraceMs = config.killGraceMs ?? this.externalConfig.killGraceMs ?? 5000;
package/src/core/processes.ts
CHANGED
@@ -182,6 +182,8 @@ export interface ProcessesConfig {
   autoRecoverOrphans?: boolean;
   /** Grace period before SIGKILL when stopping/killing (ms, default: 5000) */
   killGraceMs?: number;
+  /** Disable in-process watchdog timers (use external watchdog instead) */
+  useWatchdog?: boolean;
 }
 
 // ============================================

@@ -264,6 +266,7 @@ export class ProcessesImpl implements Processes {
   private autoRecoverOrphans: boolean;
   private killGraceMs: number;
   private runtimeLimitTimers = new Map<string, ReturnType<typeof setTimeout>>();
+  private useWatchdog: boolean;
 
   // Track running Bun subprocesses
   private subprocesses = new Map<string, Subprocess>();

@@ -280,6 +283,7 @@ export class ProcessesImpl implements Processes {
     this.heartbeatCheckInterval = config.heartbeatCheckInterval ?? 10000;
     this.autoRecoverOrphans = config.autoRecoverOrphans ?? true;
     this.killGraceMs = config.killGraceMs ?? 5000;
+    this.useWatchdog = config.useWatchdog ?? false;
 
     // Create socket server with callbacks
     this.socketServer = createProcessSocketServer(config.socket ?? {}, {

@@ -376,7 +380,7 @@ export class ProcessesImpl implements Processes {
     proc.exited.then((exitCode) => this.handleExit(process.id, exitCode));
 
     const maxRuntimeMs = config.limits?.maxRuntimeMs;
-    if (maxRuntimeMs && maxRuntimeMs > 0) {
+    if (!this.useWatchdog && maxRuntimeMs && maxRuntimeMs > 0) {
       const timer = setTimeout(async () => {
         console.warn(`[Processes] Max runtime exceeded for ${name} (${process.id})`);
         await this.emitEvent("process.limits_exceeded", {

@@ -895,6 +899,7 @@ export class ProcessesImpl implements Processes {
   }
 
   private startHeartbeatMonitor(): void {
+    if (this.useWatchdog) return;
     this.heartbeatMonitor = setInterval(async () => {
       if (this.isShuttingDown) return;
 
package/src/core/watchdog-executor.ts
ADDED

@@ -0,0 +1,80 @@
+import { Kysely } from "kysely";
+import { BunSqliteDialect } from "kysely-bun-sqlite";
+import Database from "bun:sqlite";
+import { ProcessClient } from "./process-client";
+import { KyselyWorkflowAdapter } from "./workflow-adapter-kysely";
+import { KyselyJobAdapter } from "./job-adapter-kysely";
+import { SqliteProcessAdapter } from "./process-adapter-sqlite";
+import { WatchdogRunner, type WatchdogRunnerConfig } from "./watchdog-runner";
+
+interface WatchdogConfig extends WatchdogRunnerConfig {
+  intervalMs: number;
+  workflows?: { dbPath?: string };
+  jobs?: { dbPath?: string };
+  processes?: { dbPath?: string };
+  sqlitePragmas?: {
+    busyTimeout?: number;
+    synchronous?: "OFF" | "NORMAL" | "FULL" | "EXTRA";
+    journalMode?: "DELETE" | "TRUNCATE" | "PERSIST" | "MEMORY" | "WAL" | "OFF";
+  };
+}
+
+const raw = process.env.DONKEYLABS_WATCHDOG_CONFIG;
+if (!raw) {
+  throw new Error("Missing DONKEYLABS_WATCHDOG_CONFIG");
+}
+
+const config: WatchdogConfig = JSON.parse(raw);
+const client = await ProcessClient.connect();
+
+const workflowAdapter = config.workflows?.dbPath
+  ? new KyselyWorkflowAdapter(createDb(config.workflows.dbPath, config.sqlitePragmas), {
+      cleanupDays: 0,
+    })
+  : undefined;
+const jobAdapter = config.jobs?.dbPath
+  ? new KyselyJobAdapter(createDb(config.jobs.dbPath, config.sqlitePragmas), {
+      cleanupDays: 0,
+    })
+  : undefined;
+const processAdapter = config.processes?.dbPath
+  ? new SqliteProcessAdapter({ path: config.processes.dbPath, cleanupDays: 0 })
+  : undefined;
+
+const runner = new WatchdogRunner(config, {
+  workflowsAdapter: workflowAdapter,
+  jobsAdapter: jobAdapter,
+  processesAdapter: processAdapter,
+  emit: async (event, data) => {
+    await client.emit(event, data);
+  },
+});
+
+const interval = Math.max(1000, config.intervalMs);
+const timer = setInterval(() => {
+  runner.runOnce().catch(() => undefined);
+}, interval);
+
+process.on("SIGTERM", async () => {
+  clearInterval(timer);
+  client.disconnect();
+});
+
+function createDb(
+  dbPath: string,
+  pragmas?: { busyTimeout?: number; synchronous?: string; journalMode?: string }
+): Kysely<any> {
+  const sqlite = new Database(dbPath);
+  const busyTimeout = pragmas?.busyTimeout ?? 5000;
+  sqlite.run(`PRAGMA busy_timeout = ${busyTimeout}`);
+  if (pragmas?.journalMode) {
+    sqlite.run(`PRAGMA journal_mode = ${pragmas.journalMode}`);
+  }
+  if (pragmas?.synchronous) {
+    sqlite.run(`PRAGMA synchronous = ${pragmas.synchronous}`);
+  }
+
+  return new Kysely<any>({
+    dialect: new BunSqliteDialect({ database: sqlite }),
+  });
+}
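The executor reads its configuration as JSON from the `DONKEYLABS_WATCHDOG_CONFIG` environment variable, which the server serializes into the subprocess env (see the server.ts hunk later in this diff). A sketch of a value matching the `WatchdogConfig` shape above; the `core.db` path is a placeholder, the other default paths come from server.ts:

```ts
// Sketch of the JSON payload; field names follow WatchdogConfig/WatchdogRunnerConfig above.
const exampleConfig = {
  intervalMs: 5000,
  services: ["workflows", "jobs", "processes"],
  killGraceMs: 5000,
  workflowHeartbeatTimeoutMs: 60000,
  jobDefaults: { heartbeatTimeoutMs: 30000, killGraceMs: 5000 },
  jobConfigs: { batchWorker: { timeout: 600000, killGraceMs: 5000 } },
  workflows: { dbPath: ".donkeylabs/core.db" }, // placeholder path
  jobs: { dbPath: ".donkeylabs/jobs.db" },
  processes: { dbPath: ".donkeylabs/processes.db" },
  sqlitePragmas: { busyTimeout: 5000, journalMode: "WAL" },
};
// The server passes this to the spawned executor via the subprocess env:
// env: { DONKEYLABS_WATCHDOG_CONFIG: JSON.stringify(exampleConfig) }
```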
package/src/core/watchdog-runner.ts
ADDED

@@ -0,0 +1,276 @@
+import type { WorkflowAdapter, WorkflowInstance } from "./workflows";
+import type { JobAdapter, Job } from "./jobs";
+import type { ProcessAdapter } from "./process-adapter-sqlite";
+
+export type WatchdogService = "workflows" | "jobs" | "processes";
+
+export interface WatchdogRunnerConfig {
+  services: WatchdogService[];
+  killGraceMs: number;
+  workflowHeartbeatTimeoutMs: number;
+  jobDefaults: {
+    heartbeatTimeoutMs: number;
+    killGraceMs: number;
+  };
+  jobConfigs: Record<string, { heartbeatTimeout?: number; timeout?: number; killGraceMs?: number }>;
+}
+
+export interface WatchdogRunnerDeps {
+  workflowsAdapter?: WorkflowAdapter;
+  jobsAdapter?: JobAdapter;
+  processesAdapter?: ProcessAdapter;
+  now?: () => number;
+  killProcess?: (pid: number, signal: NodeJS.Signals) => void;
+  isProcessAlive?: (pid: number) => boolean;
+  emit?: (event: string, data: Record<string, any>) => Promise<void>;
+}
+
+export class WatchdogRunner {
+  private config: WatchdogRunnerConfig;
+  private deps: WatchdogRunnerDeps;
+
+  constructor(config: WatchdogRunnerConfig, deps: WatchdogRunnerDeps) {
+    this.config = config;
+    this.deps = deps;
+  }
+
+  async runOnce(): Promise<void> {
+    if (this.config.services.includes("workflows") && this.deps.workflowsAdapter) {
+      await this.checkWorkflows();
+    }
+    if (this.config.services.includes("jobs") && this.deps.jobsAdapter) {
+      await this.checkJobs();
+    }
+    if (this.config.services.includes("processes") && this.deps.processesAdapter) {
+      await this.checkProcesses();
+    }
+  }
+
+  private async checkWorkflows(): Promise<void> {
+    const adapter = this.deps.workflowsAdapter!;
+    const now = this.now();
+    const instances = await adapter.getRunningInstances();
+
+    for (const instance of instances) {
+      const info = this.getWatchdogMetadata(instance);
+      if (!info?.pid) continue;
+
+      const last = info.lastHeartbeat ?? instance.startedAt?.getTime() ?? 0;
+      if (now - last <= this.config.workflowHeartbeatTimeoutMs) continue;
+
+      await this.emit("workflow.watchdog.stale", {
+        instanceId: instance.id,
+        pid: info.pid,
+        timeoutMs: this.config.workflowHeartbeatTimeoutMs,
+      });
+
+      await this.killProcessWithGrace(info.pid, this.config.killGraceMs);
+
+      await adapter.updateInstance(instance.id, {
+        status: "failed",
+        error: "Watchdog killed unresponsive workflow",
+        completedAt: new Date(),
+      });
+
+      await this.emit("workflow.watchdog.killed", {
+        instanceId: instance.id,
+        pid: info.pid,
+        reason: "heartbeat",
+      });
+    }
+  }
+
+  private async checkJobs(): Promise<void> {
+    const adapter = this.deps.jobsAdapter!;
+    const now = this.now();
+    const jobs = await adapter.getRunningExternal();
+
+    for (const job of jobs) {
+      if (!job.pid) continue;
+      const config = this.config.jobConfigs[job.name] ?? {};
+      const heartbeatTimeout =
+        config.heartbeatTimeout ?? this.config.jobDefaults.heartbeatTimeoutMs;
+      const killGraceMs = config.killGraceMs ?? this.config.jobDefaults.killGraceMs;
+      const lastHeartbeat = job.lastHeartbeat?.getTime() ?? job.startedAt?.getTime() ?? 0;
+
+      if (now - lastHeartbeat > heartbeatTimeout) {
+        await this.emit("job.watchdog.stale", {
+          jobId: job.id,
+          name: job.name,
+          pid: job.pid,
+          timeoutMs: heartbeatTimeout,
+        });
+
+        await this.killProcessWithGrace(job.pid, killGraceMs);
+
+        await adapter.update(job.id, {
+          status: "failed",
+          error: "Watchdog killed unresponsive job",
+          completedAt: new Date(),
+          processState: "orphaned",
+        });
+
+        await this.emit("job.watchdog.killed", {
+          jobId: job.id,
+          name: job.name,
+          pid: job.pid,
+          reason: "heartbeat",
+        });
+        continue;
+      }
+
+      if (config.timeout && job.startedAt) {
+        if (now - job.startedAt.getTime() > config.timeout) {
+          await this.emit("job.watchdog.stale", {
+            jobId: job.id,
+            name: job.name,
+            pid: job.pid,
+            timeoutMs: config.timeout,
+            reason: "timeout",
+          });
+
+          await this.killProcessWithGrace(job.pid, killGraceMs);
+
+          await adapter.update(job.id, {
+            status: "failed",
+            error: "Watchdog killed job after timeout",
+            completedAt: new Date(),
+            processState: "orphaned",
+          });
+
+          await this.emit("job.watchdog.killed", {
+            jobId: job.id,
+            name: job.name,
+            pid: job.pid,
+            reason: "timeout",
+          });
+        }
+      }
+    }
+  }
+
+  private async checkProcesses(): Promise<void> {
+    const adapter = this.deps.processesAdapter!;
+    const now = this.now();
+    const running = await adapter.getRunning();
+
+    for (const proc of running) {
+      if (!proc.pid) continue;
+      const heartbeatTimeout = proc.config.heartbeat?.timeoutMs;
+      const lastHeartbeat = proc.lastHeartbeat?.getTime() ?? proc.startedAt?.getTime() ?? 0;
+
+      if (heartbeatTimeout && now - lastHeartbeat > heartbeatTimeout) {
+        await this.emit("process.watchdog.stale", {
+          processId: proc.id,
+          name: proc.name,
+          pid: proc.pid,
+          timeoutMs: heartbeatTimeout,
+        });
+
+        await this.killProcessWithGrace(proc.pid, this.config.killGraceMs);
+
+        await adapter.update(proc.id, {
+          status: "crashed",
+          error: "Watchdog killed unresponsive process",
+          stoppedAt: new Date(),
+        });
+
+        await this.emit("process.watchdog.killed", {
+          processId: proc.id,
+          name: proc.name,
+          pid: proc.pid,
+          reason: "heartbeat",
+        });
+        continue;
+      }
+
+      const maxRuntimeMs = proc.config.limits?.maxRuntimeMs;
+      if (maxRuntimeMs && proc.startedAt) {
+        if (now - proc.startedAt.getTime() > maxRuntimeMs) {
+          await this.emit("process.watchdog.stale", {
+            processId: proc.id,
+            name: proc.name,
+            pid: proc.pid,
+            timeoutMs: maxRuntimeMs,
+            reason: "maxRuntimeMs",
+          });
+
+          await this.killProcessWithGrace(proc.pid, this.config.killGraceMs);
+
+          await adapter.update(proc.id, {
+            status: "crashed",
+            error: "Watchdog killed process after max runtime",
+            stoppedAt: new Date(),
+          });
+
+          await this.emit("process.watchdog.killed", {
+            processId: proc.id,
+            name: proc.name,
+            pid: proc.pid,
+            reason: "maxRuntimeMs",
+          });
+        }
+      }
+    }
+  }
+
+  private getWatchdogMetadata(instance: WorkflowInstance): { pid?: number; lastHeartbeat?: number } | null {
+    const meta = instance.metadata as any;
+    if (!meta || typeof meta !== "object") return null;
+    const info = meta.__watchdog;
+    if (!info || typeof info !== "object") return null;
+    return {
+      pid: typeof info.pid === "number" ? info.pid : undefined,
+      lastHeartbeat: info.lastHeartbeat ? new Date(info.lastHeartbeat).getTime() : undefined,
+    };
+  }
+
+  private now(): number {
+    return this.deps.now ? this.deps.now() : Date.now();
+  }
+
+  private async emit(event: string, data: Record<string, any>): Promise<void> {
+    if (this.deps.emit) {
+      await this.deps.emit(event, data);
+    }
+  }
+
+  private async killProcessWithGrace(pid: number, graceMs: number): Promise<void> {
+    const kill = this.deps.killProcess ?? process.kill;
+    try {
+      kill(pid, "SIGTERM");
+    } catch {
+      return;
+    }
+
+    if (graceMs <= 0) {
+      try {
+        kill(pid, "SIGKILL");
+      } catch {
+        // ignore
+      }
+      return;
+    }
+
+    await new Promise((resolve) => setTimeout(resolve, graceMs));
+
+    try {
+      const isAlive = this.deps.isProcessAlive
+        ? this.deps.isProcessAlive(pid)
+        : (() => {
+            try {
+              process.kill(pid, 0);
+              return true;
+            } catch {
+              return false;
+            }
+          })();
+
+      if (isAlive) {
+        kill(pid, "SIGKILL");
+      }
+    } catch {
+      // ignore
+    }
+  }
+}
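Because every side effect goes through `WatchdogRunnerDeps`, the runner can be driven without real processes or databases. A minimal sketch exercising the jobs check with fake dependencies; the fake adapter implements only the methods `checkJobs()` calls and is cast loosely since the real `JobAdapter` has a wider surface:

```ts
import { WatchdogRunner } from "./watchdog-runner";

// Sketch: fake jobs adapter with just getRunningExternal/update.
const fakeJobsAdapter: any = {
  getRunningExternal: async () => [
    { id: "job-1", name: "batchWorker", pid: 4242, startedAt: new Date(Date.now() - 120_000) },
  ],
  update: async (id: string, patch: unknown) => console.log("update", id, patch),
};

const killed: number[] = [];
const runner = new WatchdogRunner(
  {
    services: ["jobs"],
    killGraceMs: 0,
    workflowHeartbeatTimeoutMs: 60_000,
    jobDefaults: { heartbeatTimeoutMs: 30_000, killGraceMs: 0 },
    jobConfigs: {},
  },
  {
    jobsAdapter: fakeJobsAdapter,
    killProcess: (pid) => { killed.push(pid); }, // no real signals sent
    isProcessAlive: () => false,
    emit: async (event, data) => console.log(event, data),
  }
);

await runner.runOnce(); // job-1 started 120s ago with no heartbeat, so it is reported stale and "killed"
```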
package/src/core/workflows.ts
CHANGED
@@ -764,6 +764,10 @@ export interface WorkflowsConfig {
   killGraceMs?: number;
   /** SQLite pragmas for isolated subprocess connections */
   sqlitePragmas?: SqlitePragmaConfig;
+  /** Disable in-process watchdog timers (use external watchdog instead) */
+  useWatchdog?: boolean;
+  /** Max concurrent instances per workflow name (0 = unlimited, default: 0) */
+  concurrentWorkflows?: number;
   /** Resume strategy for orphaned workflows (default: "blocking") */
   resumeStrategy?: WorkflowResumeStrategy;
 }

@@ -825,6 +829,8 @@ export interface Workflows {
   updateMetadata(instanceId: string, key: string, value: any): Promise<void>;
   /** Set plugin metadata for local instantiation in isolated workflows */
   setPluginMetadata(metadata: PluginMetadata): void;
+  /** Get resolved SQLite db path (for watchdog) */
+  getDbPath(): string | undefined;
 }
 
 export interface PluginMetadata {

@@ -866,6 +872,8 @@ class WorkflowsImpl implements Workflows {
   private readyTimeoutMs: number;
   private killGraceMs: number;
   private sqlitePragmas?: SqlitePragmaConfig;
+  private useWatchdog: boolean;
+  private concurrentWorkflows: number;
   private resumeStrategy!: WorkflowResumeStrategy;
   private workflowModulePaths = new Map<string, string>();
   private isolatedProcesses = new Map<string, IsolatedProcessInfo>();

@@ -902,6 +910,8 @@ class WorkflowsImpl implements Workflows {
     this.readyTimeoutMs = config.readyTimeout ?? 10000;
     this.killGraceMs = config.killGraceMs ?? 5000;
     this.sqlitePragmas = config.sqlitePragmas;
+    this.useWatchdog = config.useWatchdog ?? false;
+    this.concurrentWorkflows = config.concurrentWorkflows ?? 0;
     this.resumeStrategy = config.resumeStrategy ?? "blocking";
   }
 
@@ -962,6 +972,10 @@ class WorkflowsImpl implements Workflows {
     this.pluginCustomErrors = metadata.customErrors;
   }
 
+  getDbPath(): string | undefined {
+    return this.dbPath;
+  }
+
   async updateMetadata(instanceId: string, key: string, value: any): Promise<void> {
     const instance = await this.adapter.getInstance(instanceId);
     if (!instance) return;

@@ -996,6 +1010,16 @@ class WorkflowsImpl implements Workflows {
       throw new Error(`Workflow "${workflowName}" is not registered`);
     }
 
+    if (this.concurrentWorkflows > 0) {
+      const running = await this.adapter.getInstancesByWorkflow(workflowName, "running");
+      const pending = await this.adapter.getInstancesByWorkflow(workflowName, "pending");
+      if (running.length + pending.length >= this.concurrentWorkflows) {
+        throw new Error(
+          `Workflow "${workflowName}" has reached its concurrency limit (${this.concurrentWorkflows})`
+        );
+      }
+    }
+
     const instance = await this.adapter.createInstance({
       workflowName,
       status: "pending",

@@ -1517,6 +1541,17 @@ class WorkflowsImpl implements Workflows {
     // Set up heartbeat timeout
     this.resetHeartbeatTimeout(instanceId, proc.pid);
 
+    const instance = await this.adapter.getInstance(instanceId);
+    const metadata = { ...(instance?.metadata ?? {}) } as Record<string, any>;
+    metadata.__watchdog = {
+      ...(metadata.__watchdog ?? {}),
+      pid: proc.pid,
+      socketPath,
+      tcpPort,
+      lastHeartbeat: new Date().toISOString(),
+    };
+    await this.adapter.updateInstance(instanceId, { metadata });
+
     const exitBeforeReady = proc.exited.then((exitCode) => {
       throw new Error(`Subprocess exited before ready (code ${exitCode})`);
     });

@@ -1590,7 +1625,8 @@ class WorkflowsImpl implements Workflows {
 
       case "started":
       case "heartbeat":
-        //
+        // Update heartbeat tracking metadata
+        await this.updateWatchdogHeartbeat(instanceId);
         break;
 
       case "step.started": {

@@ -1958,6 +1994,17 @@ class WorkflowsImpl implements Workflows {
     this.rejectIsolatedReady(instanceId, new Error("Isolated workflow cleaned up"));
   }
 
+  private async updateWatchdogHeartbeat(instanceId: string): Promise<void> {
+    const instance = await this.adapter.getInstance(instanceId);
+    if (!instance) return;
+    const metadata = { ...(instance.metadata ?? {}) } as Record<string, any>;
+    metadata.__watchdog = {
+      ...(metadata.__watchdog ?? {}),
+      lastHeartbeat: new Date().toISOString(),
+    };
+    await this.adapter.updateInstance(instanceId, { metadata });
+  }
+
   private async markOrphanedAsFailed(
     instances: WorkflowInstance[],
     reason: string

@@ -1990,6 +2037,7 @@ class WorkflowsImpl implements Workflows {
    * Reset heartbeat timeout for an isolated workflow
    */
   private resetHeartbeatTimeout(instanceId: string, pid: number): void {
+    if (this.useWatchdog) return;
     const info = this.isolatedProcesses.get(instanceId);
     if (!info) return;
 
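The `metadata.__watchdog` record written above is the same structure the runner's `getWatchdogMetadata` reads back from the database. A sketch of its implied shape; this is not an exported type in the diff, and the writer's enclosing method is not shown here:

```ts
// Shape implied by the workflows.ts writers and WatchdogRunner.getWatchdogMetadata (reader).
interface WatchdogInstanceMetadata {
  pid?: number;
  socketPath?: string;    // from the isolated-subprocess setup
  tcpPort?: number;
  lastHeartbeat?: string; // ISO timestamp, parsed with new Date(...).getTime()
}
```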
package/src/server.ts
CHANGED
@@ -1,6 +1,7 @@
 import { z } from "zod";
 import { mkdir, writeFile } from "node:fs/promises";
-import { dirname } from "node:path";
+import { dirname, join } from "node:path";
+import { fileURLToPath } from "node:url";
 import { PluginManager, type CoreServices, type ConfiguredPlugin } from "./core";
 import { type IRouter, type RouteDefinition, type ServerContext, type HandlerRegistry } from "./router";
 import { Handlers } from "./handlers";

@@ -87,6 +88,13 @@ export interface ServerConfig {
    */
   workflowsResumeStrategy?: "blocking" | "background" | "skip";
   processes?: ProcessesConfig;
+  /** Watchdog subprocess configuration */
+  watchdog?: {
+    enabled?: boolean;
+    intervalMs?: number;
+    services?: ("workflows" | "jobs" | "processes")[];
+    killGraceMs?: number;
+  };
   audit?: AuditConfig;
   websocket?: WebSocketConfig;
   storage?: StorageConfig;

@@ -221,6 +229,9 @@ export class AppServer {
   private generateModeSetup = false;
   private initMode: "adapter" | "server" = "server";
   private workflowsResumeStrategy?: "blocking" | "background" | "skip";
+  private watchdogConfig?: ServerConfig["watchdog"];
+  private watchdogStarted = false;
+  private options: ServerConfig;
 
   // Custom services registry
   private serviceFactories = new Map<string, ServiceFactory<any>>();

@@ -228,11 +239,13 @@ export class AppServer {
   private generateModeTimer?: ReturnType<typeof setTimeout>;
 
   constructor(options: ServerConfig) {
+    this.options = options;
     // Port priority: explicit config > PORT env var > default 3000
     const envPort = process.env.PORT ? parseInt(process.env.PORT, 10) : undefined;
     this.port = options.port ?? envPort ?? 3000;
     this.maxPortAttempts = options.maxPortAttempts ?? 5;
     this.workflowsResumeStrategy = options.workflowsResumeStrategy ?? options.workflows?.resumeStrategy;
+    this.watchdogConfig = options.watchdog;
 
     // Determine if we should use legacy databases
     const useLegacy = options.useLegacyCoreDatabases ?? false;

@@ -286,6 +299,10 @@ export class AppServer {
       events,
       logger,
       adapter: jobAdapter,
+      external: {
+        ...options.jobs?.external,
+        useWatchdog: options.watchdog?.enabled ? true : options.jobs?.external?.useWatchdog,
+      },
       // Disable built-in persistence when using Kysely adapter
       persist: useLegacy ? options.jobs?.persist : false,
     });

@@ -297,12 +314,14 @@ export class AppServer {
       jobs,
       sse,
       adapter: workflowAdapter,
+      useWatchdog: options.watchdog?.enabled ? true : options.workflows?.useWatchdog,
     });
 
     // Processes - still uses its own adapter pattern but can use Kysely
     const processes = createProcesses({
       ...options.processes,
       events,
+      useWatchdog: options.watchdog?.enabled ? true : options.processes?.useWatchdog,
     });
 
     // New services

@@ -1055,6 +1074,7 @@ ${factoryFunction}
       await this.coreServices.workflows.resume();
     }
     this.coreServices.processes.start();
+    await this.startWatchdog();
     logger.info("Background services started (cron, jobs, workflows, processes)");
 
     for (const router of this.routers) {

@@ -1073,6 +1093,69 @@ ${factoryFunction}
     await this.runReadyHandlers();
   }
 
+  private async startWatchdog(): Promise<void> {
+    if (!this.watchdogConfig?.enabled) return;
+    if (this.watchdogStarted) return;
+
+    const executorPath = join(dirname(fileURLToPath(import.meta.url)), "core", "watchdog-executor.ts");
+    const services = this.watchdogConfig.services ?? ["workflows", "jobs", "processes"];
+    const workflowsDbPath = this.coreServices.workflows.getDbPath?.();
+    const jobsDbPath = (this.options.jobs?.dbPath ?? workflowsDbPath ?? ".donkeylabs/jobs.db") as string;
+    const processesDbPath = (this.options.processes?.adapter?.path ?? ".donkeylabs/processes.db") as string;
+
+    const config = {
+      intervalMs: this.watchdogConfig.intervalMs ?? 5000,
+      services,
+      killGraceMs: this.watchdogConfig.killGraceMs ?? 5000,
+      workflowHeartbeatTimeoutMs: this.options.workflows?.heartbeatTimeout ?? 60000,
+      jobDefaults: {
+        heartbeatTimeoutMs: this.options.jobs?.external?.defaultHeartbeatTimeout ?? 30000,
+        killGraceMs: this.options.jobs?.external?.killGraceMs ?? this.watchdogConfig.killGraceMs ?? 5000,
+      },
+      jobConfigs: this.coreServices.jobs.getExternalJobConfigs(),
+      workflows: workflowsDbPath ? { dbPath: workflowsDbPath } : undefined,
+      jobs: jobsDbPath ? { dbPath: jobsDbPath } : undefined,
+      processes: processesDbPath ? { dbPath: processesDbPath } : undefined,
+      sqlitePragmas: this.options.workflows?.sqlitePragmas,
+    };
+
+    try {
+      this.coreServices.processes.register({
+        name: "__watchdog",
+        config: {
+          command: "bun",
+          args: ["run", executorPath],
+          env: {
+            DONKEYLABS_WATCHDOG_CONFIG: JSON.stringify(config),
+          },
+          heartbeat: { intervalMs: 5000, timeoutMs: 30000 },
+        },
+      });
+    } catch {
+      // Already registered
+    }
+
+    await this.coreServices.processes.spawn("__watchdog", {
+      metadata: { role: "watchdog" },
+    });
+
+    this.coreServices.events.on("process.event", async (data: any) => {
+      if (data?.name !== "__watchdog") return;
+      if (!data.event) return;
+
+      await this.coreServices.events.emit(data.event, data.data ?? {});
+
+      if (data.event.startsWith("workflow.watchdog")) {
+        const instanceId = data.data?.instanceId;
+        if (instanceId && this.coreServices.sse) {
+          this.coreServices.sse.broadcast(`workflow:${instanceId}`, data.event, data.data ?? {});
+        }
+      }
+    });
+
+    this.watchdogStarted = true;
+  }
+
   /**
    * Handle a single API request. Used by adapters.
    * Returns null if the route is not found.