@donkeylabs/server 2.0.29 → 2.0.30
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/docs/jobs.md +7 -0
- package/docs/processes.md +2 -0
- package/docs/workflows.md +8 -0
- package/package.json +1 -1
- package/src/core/external-jobs.ts +2 -0
- package/src/core/jobs.ts +14 -2
- package/src/core/processes.ts +6 -1
- package/src/core/watchdog-executor.ts +80 -0
- package/src/core/watchdog-runner.ts +276 -0
- package/src/core/workflows.ts +35 -1
- package/src/server.ts +84 -1
package/docs/jobs.md
CHANGED
|
@@ -204,6 +204,13 @@ ctx.core.jobs.registerExternal("batchWorker", {
|
|
|
204
204
|
timeout: 10 * 60 * 1000,
|
|
205
205
|
killGraceMs: 5000,
|
|
206
206
|
});
|
|
207
|
+
|
|
208
|
+
// Disable in-process timers when using the watchdog subprocess
|
|
209
|
+
const server = new AppServer({
|
|
210
|
+
db,
|
|
211
|
+
watchdog: { enabled: true },
|
|
212
|
+
jobs: { external: { useWatchdog: true } },
|
|
213
|
+
});
|
|
207
214
|
```
|
|
208
215
|
|
|
209
216
|
Watchdog events:
|
package/docs/processes.md
CHANGED
package/docs/workflows.md
CHANGED
|
@@ -318,6 +318,12 @@ You can tune subprocess termination and SQLite pragmas used by isolated workflow
|
|
|
318
318
|
```ts
|
|
319
319
|
const server = new AppServer({
|
|
320
320
|
db,
|
|
321
|
+
watchdog: {
|
|
322
|
+
enabled: true,
|
|
323
|
+
intervalMs: 5000,
|
|
324
|
+
services: ["workflows", "jobs", "processes"],
|
|
325
|
+
killGraceMs: 5000,
|
|
326
|
+
},
|
|
321
327
|
workflows: {
|
|
322
328
|
killGraceMs: 5000,
|
|
323
329
|
sqlitePragmas: {
|
|
@@ -329,6 +335,8 @@ const server = new AppServer({
|
|
|
329
335
|
});
|
|
330
336
|
```
|
|
331
337
|
|
|
338
|
+
When `watchdog.enabled` is true, workflow heartbeat timers run in the watchdog subprocess instead of the main server.
|
|
339
|
+
|
|
332
340
|
Watchdog events:
|
|
333
341
|
- `workflow.watchdog.stale` (heartbeat missed)
|
|
334
342
|
- `workflow.watchdog.killed` (process terminated)
|
package/package.json
CHANGED
package/src/core/external-jobs.ts
CHANGED
|
@@ -124,6 +124,8 @@ export interface ExternalJobsConfig {
|
|
|
124
124
|
heartbeatCheckInterval?: number;
|
|
125
125
|
/** Default grace period before SIGKILL when terminating (ms, default: 5000) */
|
|
126
126
|
killGraceMs?: number;
|
|
127
|
+
/** Disable in-process watchdog timers (use external watchdog instead) */
|
|
128
|
+
useWatchdog?: boolean;
|
|
127
129
|
}
|
|
128
130
|
|
|
129
131
|
// ============================================
|
package/src/core/jobs.ts
CHANGED
|
@@ -141,6 +141,8 @@ export interface Jobs {
|
|
|
141
141
|
getRunningExternal(): Promise<Job[]>;
|
|
142
142
|
/** Get all jobs with optional filtering (for admin dashboard) */
|
|
143
143
|
getAll(options?: GetAllJobsOptions): Promise<Job[]>;
|
|
144
|
+
/** Get external job config snapshot for watchdog */
|
|
145
|
+
getExternalJobConfigs(): Record<string, ExternalJobConfig>;
|
|
144
146
|
/** Start the job processing loop */
|
|
145
147
|
start(): void;
|
|
146
148
|
/** Stop the job processing and cleanup */
|
|
@@ -325,6 +327,14 @@ class JobsImpl implements Jobs {
|
|
|
325
327
|
this.externalConfigs.set(name, config);
|
|
326
328
|
}
|
|
327
329
|
|
|
330
|
+
getExternalJobConfigs(): Record<string, ExternalJobConfig> {
|
|
331
|
+
const snapshot: Record<string, ExternalJobConfig> = {};
|
|
332
|
+
for (const [name, config] of this.externalConfigs.entries()) {
|
|
333
|
+
snapshot[name] = { ...config };
|
|
334
|
+
}
|
|
335
|
+
return snapshot;
|
|
336
|
+
}
|
|
337
|
+
|
|
328
338
|
private isExternalJob(name: string): boolean {
|
|
329
339
|
return this.externalConfigs.has(name);
|
|
330
340
|
}
|
|
@@ -422,7 +432,9 @@ class JobsImpl implements Jobs {
|
|
|
422
432
|
// Initialize socket server for external jobs
|
|
423
433
|
if (this.externalConfigs.size > 0) {
|
|
424
434
|
this.initializeSocketServer();
|
|
425
|
-
this.startHeartbeatMonitor();
|
|
435
|
+
if (!this.externalConfig.useWatchdog) {
|
|
436
|
+
this.startHeartbeatMonitor();
|
|
437
|
+
}
|
|
426
438
|
// Attempt to reconnect to orphaned jobs from previous run
|
|
427
439
|
this.reconnectOrphanedJobs();
|
|
428
440
|
}
|
|
@@ -891,7 +903,7 @@ class JobsImpl implements Jobs {
|
|
|
891
903
|
proc.stdin.end();
|
|
892
904
|
|
|
893
905
|
// Set up process timeout if configured
|
|
894
|
-
if (config.timeout) {
|
|
906
|
+
if (config.timeout && !this.externalConfig.useWatchdog) {
|
|
895
907
|
const timeout = setTimeout(async () => {
|
|
896
908
|
console.warn(`[Jobs] External job ${job.id} timed out after ${config.timeout}ms`);
|
|
897
909
|
const killGraceMs = config.killGraceMs ?? this.externalConfig.killGraceMs ?? 5000;
|
package/src/core/processes.ts
CHANGED
|
@@ -182,6 +182,8 @@ export interface ProcessesConfig {
|
|
|
182
182
|
autoRecoverOrphans?: boolean;
|
|
183
183
|
/** Grace period before SIGKILL when stopping/killing (ms, default: 5000) */
|
|
184
184
|
killGraceMs?: number;
|
|
185
|
+
/** Disable in-process watchdog timers (use external watchdog instead) */
|
|
186
|
+
useWatchdog?: boolean;
|
|
185
187
|
}
|
|
186
188
|
|
|
187
189
|
// ============================================
|
|
@@ -264,6 +266,7 @@ export class ProcessesImpl implements Processes {
|
|
|
264
266
|
private autoRecoverOrphans: boolean;
|
|
265
267
|
private killGraceMs: number;
|
|
266
268
|
private runtimeLimitTimers = new Map<string, ReturnType<typeof setTimeout>>();
|
|
269
|
+
private useWatchdog: boolean;
|
|
267
270
|
|
|
268
271
|
// Track running Bun subprocesses
|
|
269
272
|
private subprocesses = new Map<string, Subprocess>();
|
|
@@ -280,6 +283,7 @@ export class ProcessesImpl implements Processes {
|
|
|
280
283
|
this.heartbeatCheckInterval = config.heartbeatCheckInterval ?? 10000;
|
|
281
284
|
this.autoRecoverOrphans = config.autoRecoverOrphans ?? true;
|
|
282
285
|
this.killGraceMs = config.killGraceMs ?? 5000;
|
|
286
|
+
this.useWatchdog = config.useWatchdog ?? false;
|
|
283
287
|
|
|
284
288
|
// Create socket server with callbacks
|
|
285
289
|
this.socketServer = createProcessSocketServer(config.socket ?? {}, {
|
|
@@ -376,7 +380,7 @@ export class ProcessesImpl implements Processes {
|
|
|
376
380
|
proc.exited.then((exitCode) => this.handleExit(process.id, exitCode));
|
|
377
381
|
|
|
378
382
|
const maxRuntimeMs = config.limits?.maxRuntimeMs;
|
|
379
|
-
if (maxRuntimeMs && maxRuntimeMs > 0) {
|
|
383
|
+
if (!this.useWatchdog && maxRuntimeMs && maxRuntimeMs > 0) {
|
|
380
384
|
const timer = setTimeout(async () => {
|
|
381
385
|
console.warn(`[Processes] Max runtime exceeded for ${name} (${process.id})`);
|
|
382
386
|
await this.emitEvent("process.limits_exceeded", {
|
|
@@ -895,6 +899,7 @@ export class ProcessesImpl implements Processes {
|
|
|
895
899
|
}
|
|
896
900
|
|
|
897
901
|
private startHeartbeatMonitor(): void {
|
|
902
|
+
if (this.useWatchdog) return;
|
|
898
903
|
this.heartbeatMonitor = setInterval(async () => {
|
|
899
904
|
if (this.isShuttingDown) return;
|
|
900
905
|
|
|
package/src/core/watchdog-executor.ts
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import { Kysely } from "kysely";
|
|
2
|
+
import { BunSqliteDialect } from "kysely-bun-sqlite";
|
|
3
|
+
import Database from "bun:sqlite";
|
|
4
|
+
import { ProcessClient } from "./process-client";
|
|
5
|
+
import { KyselyWorkflowAdapter } from "./workflow-adapter-kysely";
|
|
6
|
+
import { KyselyJobAdapter } from "./job-adapter-kysely";
|
|
7
|
+
import { SqliteProcessAdapter } from "./process-adapter-sqlite";
|
|
8
|
+
import { WatchdogRunner, type WatchdogRunnerConfig } from "./watchdog-runner";
|
|
9
|
+
|
|
10
|
+
/**
 * Shape of the JSON document read from `DONKEYLABS_WATCHDOG_CONFIG`.
 *
 * Extends the runner settings with the polling interval and the SQLite
 * locations the executor needs to build its adapters.
 */
interface WatchdogConfig extends WatchdogRunnerConfig {
  /** Delay between sweeps in milliseconds; the executor enforces a 1000 ms floor. */
  intervalMs: number;
  /** Workflow database location; no workflow adapter is built when `dbPath` is absent. */
  workflows?: { dbPath?: string };
  /** Job database location; no job adapter is built when `dbPath` is absent. */
  jobs?: { dbPath?: string };
  /** Process database location; no process adapter is built when `dbPath` is absent. */
  processes?: { dbPath?: string };
  /** PRAGMA overrides applied to the connections opened through `createDb`. */
  sqlitePragmas?: {
    busyTimeout?: number;
    synchronous?: "OFF" | "NORMAL" | "FULL" | "EXTRA";
    journalMode?: "DELETE" | "TRUNCATE" | "PERSIST" | "MEMORY" | "WAL" | "OFF";
  };
}
|
|
21
|
+
|
|
22
|
+
// Watchdog subprocess entry point: reads its configuration from the
// environment, builds one adapter per configured database, then sweeps on a
// fixed interval until SIGTERM is received.
const raw = process.env.DONKEYLABS_WATCHDOG_CONFIG;
if (!raw) {
  throw new Error("Missing DONKEYLABS_WATCHDOG_CONFIG");
}

const config: WatchdogConfig = JSON.parse(raw);
const client = await ProcessClient.connect();

// An adapter is only created when its database path is configured; the
// runner skips any service whose adapter is undefined.
const workflowAdapter = config.workflows?.dbPath
  ? new KyselyWorkflowAdapter(createDb(config.workflows.dbPath, config.sqlitePragmas), {
      cleanupDays: 0,
    })
  : undefined;
const jobAdapter = config.jobs?.dbPath
  ? new KyselyJobAdapter(createDb(config.jobs.dbPath, config.sqlitePragmas), {
      cleanupDays: 0,
    })
  : undefined;
const processAdapter = config.processes?.dbPath
  ? new SqliteProcessAdapter({ path: config.processes.dbPath, cleanupDays: 0 })
  : undefined;

const runner = new WatchdogRunner(config, {
  workflowsAdapter: workflowAdapter,
  jobsAdapter: jobAdapter,
  processesAdapter: processAdapter,
  // Watchdog events are forwarded through the process client connection.
  emit: async (event, data) => {
    await client.emit(event, data);
  },
});

// `Math.max(1000, undefined)` is NaN, which setInterval treats as a minimal
// delay, so fall back to a sane interval when the value is missing or invalid.
const fallbackIntervalMs = 5000;
const interval = Number.isFinite(config.intervalMs)
  ? Math.max(1000, config.intervalMs)
  : fallbackIntervalMs;

// A sweep may await kill grace periods that exceed the interval; skip ticks
// while one is still running so the same target is never handled twice.
let sweepInFlight = false;
const timer = setInterval(() => {
  if (sweepInFlight) return;
  sweepInFlight = true;
  runner
    .runOnce()
    .catch((error: unknown) => {
      // Keep the loop alive, but leave a trace instead of discarding the failure.
      console.error("[Watchdog] Sweep failed:", error);
    })
    .finally(() => {
      sweepInFlight = false;
    });
}, interval);

process.on("SIGTERM", async () => {
  clearInterval(timer);
  client.disconnect();
});
|
|
62
|
+
|
|
63
|
+
/**
 * Opens a SQLite database with Bun's driver, applies the requested PRAGMAs and
 * wraps the connection in a Kysely instance.
 *
 * PRAGMA statements cannot take bound parameters, and the values originate
 * from an unvalidated JSON environment variable, so each one is checked
 * against an allow-list before being interpolated into SQL.
 *
 * @param dbPath  Path of the SQLite database file.
 * @param pragmas Optional overrides; `busyTimeout` defaults to 5000 ms, the
 *                other PRAGMAs are only issued when provided.
 * @returns A Kysely instance backed by the opened database.
 * @throws Error when a pragma value is not an accepted value.
 */
function createDb(
  dbPath: string,
  pragmas?: { busyTimeout?: number; synchronous?: string; journalMode?: string }
): Kysely<any> {
  // Kept local: this function is invoked above its declaration during module
  // evaluation, so module-level constants placed here would not be initialised yet.
  const journalModes = ["DELETE", "TRUNCATE", "PERSIST", "MEMORY", "WAL", "OFF"];
  const synchronousModes = ["OFF", "NORMAL", "FULL", "EXTRA"];

  const busyTimeout = pragmas?.busyTimeout ?? 5000;
  if (!Number.isInteger(busyTimeout) || busyTimeout < 0) {
    throw new Error(`Invalid sqlitePragmas.busyTimeout: ${String(busyTimeout)}`);
  }

  const journalMode = pragmas?.journalMode ? String(pragmas.journalMode).toUpperCase() : undefined;
  if (journalMode !== undefined && !journalModes.includes(journalMode)) {
    throw new Error(`Invalid sqlitePragmas.journalMode: ${String(pragmas?.journalMode)}`);
  }

  const synchronous = pragmas?.synchronous ? String(pragmas.synchronous).toUpperCase() : undefined;
  if (synchronous !== undefined && !synchronousModes.includes(synchronous)) {
    throw new Error(`Invalid sqlitePragmas.synchronous: ${String(pragmas?.synchronous)}`);
  }

  // Validation happens before the handle is opened so a bad config leaks nothing.
  const sqlite = new Database(dbPath);
  sqlite.run(`PRAGMA busy_timeout = ${busyTimeout}`);
  if (journalMode) {
    sqlite.run(`PRAGMA journal_mode = ${journalMode}`);
  }
  if (synchronous) {
    sqlite.run(`PRAGMA synchronous = ${synchronous}`);
  }

  return new Kysely<any>({
    dialect: new BunSqliteDialect({ database: sqlite }),
  });
}
|
|
package/src/core/watchdog-runner.ts
ADDED
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
import type { WorkflowAdapter, WorkflowInstance } from "./workflows";
|
|
2
|
+
import type { JobAdapter, Job } from "./jobs";
|
|
3
|
+
import type { ProcessAdapter } from "./process-adapter-sqlite";
|
|
4
|
+
|
|
5
|
+
/** Services a watchdog sweep can inspect. */
export type WatchdogService = "workflows" | "jobs" | "processes";

/** Settings that control what a sweep inspects and when it terminates a target. */
export interface WatchdogRunnerConfig {
  /** Only the services listed here are inspected. */
  services: Array<WatchdogService>;
  /** Delay between SIGTERM and SIGKILL for workflows and processes (ms). */
  killGraceMs: number;
  /** Maximum silence tolerated from a workflow subprocess (ms). */
  workflowHeartbeatTimeoutMs: number;
  /** Values used when a job has no entry, or no matching field, in `jobConfigs`. */
  jobDefaults: {
    heartbeatTimeoutMs: number;
    killGraceMs: number;
  };
  /** Per-job overrides, keyed by job name. */
  jobConfigs: Record<
    string,
    { heartbeatTimeout?: number; timeout?: number; killGraceMs?: number }
  >;
}

/** Collaborators of the runner; each one is optional. */
export interface WatchdogRunnerDeps {
  /** A service is skipped when its adapter is not supplied. */
  workflowsAdapter?: WorkflowAdapter;
  jobsAdapter?: JobAdapter;
  processesAdapter?: ProcessAdapter;
  /** Clock override; `Date.now()` is used when omitted. */
  now?: () => number;
  /** Signal sender override; `process.kill` is used when omitted. */
  killProcess?: (pid: number, signal: NodeJS.Signals) => void;
  /** Liveness probe override; a signal-0 probe is used when omitted. */
  isProcessAlive?: (pid: number) => boolean;
  /** Event sink; events are dropped when omitted. */
  emit?: (event: string, data: Record<string, any>) => Promise<void>;
}
|
|
27
|
+
|
|
28
|
+
/**
 * Sweeps the configured services for subprocesses that stopped sending
 * heartbeats or exceeded their runtime budget. Each offender is signalled
 * (SIGTERM, then SIGKILL after a grace period), marked as failed or crashed
 * through the matching adapter, and reported via `*.watchdog.stale` and
 * `*.watchdog.killed` events.
 */
export class WatchdogRunner {
  private config: WatchdogRunnerConfig;
  private deps: WatchdogRunnerDeps;
  /** True while a sweep is running; used to reject overlapping sweeps. */
  private sweepInProgress = false;

  constructor(config: WatchdogRunnerConfig, deps: WatchdogRunnerDeps) {
    this.config = config;
    this.deps = deps;
  }

  /**
   * Runs one sweep over every enabled service that has an adapter.
   *
   * A sweep awaits kill grace periods and can therefore outlast the caller's
   * polling interval. A call made while a sweep is still running returns
   * immediately, so the same target is never signalled or reported twice.
   */
  async runOnce(): Promise<void> {
    if (this.sweepInProgress) return;
    this.sweepInProgress = true;
    try {
      const { services } = this.config;
      if (services.includes("workflows") && this.deps.workflowsAdapter) {
        await this.checkWorkflows();
      }
      if (services.includes("jobs") && this.deps.jobsAdapter) {
        await this.checkJobs();
      }
      if (services.includes("processes") && this.deps.processesAdapter) {
        await this.checkProcesses();
      }
    } finally {
      this.sweepInProgress = false;
    }
  }

  /** Fails running workflow instances whose recorded heartbeat is too old. */
  private async checkWorkflows(): Promise<void> {
    const adapter = this.deps.workflowsAdapter!;
    const now = this.now();
    const instances = await adapter.getRunningInstances();

    for (const instance of instances) {
      const info = this.getWatchdogMetadata(instance);
      // Instances without a recorded pid cannot be signalled; leave them alone.
      if (!info?.pid) continue;

      const last = info.lastHeartbeat ?? instance.startedAt?.getTime() ?? 0;
      if (now - last <= this.config.workflowHeartbeatTimeoutMs) continue;

      await this.emit("workflow.watchdog.stale", {
        instanceId: instance.id,
        pid: info.pid,
        timeoutMs: this.config.workflowHeartbeatTimeoutMs,
      });

      await this.killProcessWithGrace(info.pid, this.config.killGraceMs);

      await adapter.updateInstance(instance.id, {
        status: "failed",
        error: "Watchdog killed unresponsive workflow",
        completedAt: new Date(),
      });

      await this.emit("workflow.watchdog.killed", {
        instanceId: instance.id,
        pid: info.pid,
        reason: "heartbeat",
      });
    }
  }

  /**
   * Fails running external jobs that missed their heartbeat window or, when a
   * per-job `timeout` is configured, ran longer than that timeout.
   */
  private async checkJobs(): Promise<void> {
    const adapter = this.deps.jobsAdapter!;
    const now = this.now();
    const jobs = await adapter.getRunningExternal();

    for (const job of jobs) {
      if (!job.pid) continue;
      const config = this.config.jobConfigs[job.name] ?? {};
      const heartbeatTimeout =
        config.heartbeatTimeout ?? this.config.jobDefaults.heartbeatTimeoutMs;
      const killGraceMs = config.killGraceMs ?? this.config.jobDefaults.killGraceMs;
      const lastHeartbeat = job.lastHeartbeat?.getTime() ?? job.startedAt?.getTime() ?? 0;

      if (now - lastHeartbeat > heartbeatTimeout) {
        await this.emit("job.watchdog.stale", {
          jobId: job.id,
          name: job.name,
          pid: job.pid,
          timeoutMs: heartbeatTimeout,
        });

        await this.killProcessWithGrace(job.pid, killGraceMs);

        await adapter.update(job.id, {
          status: "failed",
          error: "Watchdog killed unresponsive job",
          completedAt: new Date(),
          processState: "orphaned",
        });

        await this.emit("job.watchdog.killed", {
          jobId: job.id,
          name: job.name,
          pid: job.pid,
          reason: "heartbeat",
        });
        // Already handled; do not evaluate the runtime timeout as well.
        continue;
      }

      if (config.timeout && job.startedAt) {
        if (now - job.startedAt.getTime() > config.timeout) {
          await this.emit("job.watchdog.stale", {
            jobId: job.id,
            name: job.name,
            pid: job.pid,
            timeoutMs: config.timeout,
            reason: "timeout",
          });

          await this.killProcessWithGrace(job.pid, killGraceMs);

          await adapter.update(job.id, {
            status: "failed",
            error: "Watchdog killed job after timeout",
            completedAt: new Date(),
            processState: "orphaned",
          });

          await this.emit("job.watchdog.killed", {
            jobId: job.id,
            name: job.name,
            pid: job.pid,
            reason: "timeout",
          });
        }
      }
    }
  }

  /**
   * Marks running processes as crashed when they missed their configured
   * heartbeat timeout or exceeded `limits.maxRuntimeMs`.
   */
  private async checkProcesses(): Promise<void> {
    const adapter = this.deps.processesAdapter!;
    const now = this.now();
    const running = await adapter.getRunning();

    for (const proc of running) {
      if (!proc.pid) continue;
      const heartbeatTimeout = proc.config.heartbeat?.timeoutMs;
      const lastHeartbeat = proc.lastHeartbeat?.getTime() ?? proc.startedAt?.getTime() ?? 0;

      // Processes without a heartbeat timeout are only subject to the runtime limit.
      if (heartbeatTimeout && now - lastHeartbeat > heartbeatTimeout) {
        await this.emit("process.watchdog.stale", {
          processId: proc.id,
          name: proc.name,
          pid: proc.pid,
          timeoutMs: heartbeatTimeout,
        });

        await this.killProcessWithGrace(proc.pid, this.config.killGraceMs);

        await adapter.update(proc.id, {
          status: "crashed",
          error: "Watchdog killed unresponsive process",
          stoppedAt: new Date(),
        });

        await this.emit("process.watchdog.killed", {
          processId: proc.id,
          name: proc.name,
          pid: proc.pid,
          reason: "heartbeat",
        });
        continue;
      }

      const maxRuntimeMs = proc.config.limits?.maxRuntimeMs;
      if (maxRuntimeMs && proc.startedAt) {
        if (now - proc.startedAt.getTime() > maxRuntimeMs) {
          await this.emit("process.watchdog.stale", {
            processId: proc.id,
            name: proc.name,
            pid: proc.pid,
            timeoutMs: maxRuntimeMs,
            reason: "maxRuntimeMs",
          });

          await this.killProcessWithGrace(proc.pid, this.config.killGraceMs);

          await adapter.update(proc.id, {
            status: "crashed",
            error: "Watchdog killed process after max runtime",
            stoppedAt: new Date(),
          });

          await this.emit("process.watchdog.killed", {
            processId: proc.id,
            name: proc.name,
            pid: proc.pid,
            reason: "maxRuntimeMs",
          });
        }
      }
    }
  }

  /**
   * Extracts the `__watchdog` entry from a workflow instance's metadata.
   *
   * @returns `null` when the metadata or the entry is missing or not an
   *   object. Otherwise `pid` (only when stored as a number) and
   *   `lastHeartbeat` as epoch milliseconds. A heartbeat value that does not
   *   parse to a valid date is reported as `undefined`, so callers use their
   *   fallback instead of comparing against NaN, which would classify a
   *   healthy workflow as stale.
   */
  private getWatchdogMetadata(instance: WorkflowInstance): { pid?: number; lastHeartbeat?: number } | null {
    const meta: unknown = instance.metadata;
    if (!meta || typeof meta !== "object") return null;
    const info: unknown = (meta as Record<string, unknown>).__watchdog;
    if (!info || typeof info !== "object") return null;

    const { pid, lastHeartbeat } = info as { pid?: unknown; lastHeartbeat?: unknown };
    const heartbeatMs = lastHeartbeat
      ? new Date(lastHeartbeat as string | number | Date).getTime()
      : undefined;

    return {
      pid: typeof pid === "number" ? pid : undefined,
      lastHeartbeat:
        heartbeatMs !== undefined && Number.isFinite(heartbeatMs) ? heartbeatMs : undefined,
    };
  }

  /** Current time in epoch milliseconds, from the injected clock when present. */
  private now(): number {
    return this.deps.now ? this.deps.now() : Date.now();
  }

  /** Forwards an event to the injected sink; a no-op when none was provided. */
  private async emit(event: string, data: Record<string, any>): Promise<void> {
    if (this.deps.emit) {
      await this.deps.emit(event, data);
    }
  }

  /**
   * Sends SIGTERM to `pid`, waits `graceMs`, then sends SIGKILL if the process
   * is still alive. With `graceMs <= 0` SIGKILL follows immediately.
   *
   * Signalling is best-effort: a failing SIGTERM (for example because the
   * process is already gone) ends the attempt, and SIGKILL errors are ignored.
   * Pids that are not positive integers are never signalled, because a pid of
   * zero or below addresses a process group rather than a single process.
   */
  private async killProcessWithGrace(pid: number, graceMs: number): Promise<void> {
    if (!Number.isInteger(pid) || pid <= 0) return;

    const kill =
      this.deps.killProcess ??
      ((target: number, signal: NodeJS.Signals): void => {
        process.kill(target, signal);
      });

    try {
      kill(pid, "SIGTERM");
    } catch {
      return;
    }

    if (graceMs <= 0) {
      try {
        kill(pid, "SIGKILL");
      } catch {
        // ignore
      }
      return;
    }

    await new Promise<void>((resolve) => setTimeout(resolve, graceMs));

    try {
      if (this.isAlive(pid)) {
        kill(pid, "SIGKILL");
      }
    } catch {
      // ignore
    }
  }

  /** Liveness probe: the injected check when present, otherwise a signal-0 probe. */
  private isAlive(pid: number): boolean {
    if (this.deps.isProcessAlive) return this.deps.isProcessAlive(pid);
    try {
      process.kill(pid, 0);
      return true;
    } catch {
      return false;
    }
  }
}
|
package/src/core/workflows.ts
CHANGED
|
@@ -764,6 +764,8 @@ export interface WorkflowsConfig {
|
|
|
764
764
|
killGraceMs?: number;
|
|
765
765
|
/** SQLite pragmas for isolated subprocess connections */
|
|
766
766
|
sqlitePragmas?: SqlitePragmaConfig;
|
|
767
|
+
/** Disable in-process watchdog timers (use external watchdog instead) */
|
|
768
|
+
useWatchdog?: boolean;
|
|
767
769
|
/** Resume strategy for orphaned workflows (default: "blocking") */
|
|
768
770
|
resumeStrategy?: WorkflowResumeStrategy;
|
|
769
771
|
}
|
|
@@ -825,6 +827,8 @@ export interface Workflows {
|
|
|
825
827
|
updateMetadata(instanceId: string, key: string, value: any): Promise<void>;
|
|
826
828
|
/** Set plugin metadata for local instantiation in isolated workflows */
|
|
827
829
|
setPluginMetadata(metadata: PluginMetadata): void;
|
|
830
|
+
/** Get resolved SQLite db path (for watchdog) */
|
|
831
|
+
getDbPath(): string | undefined;
|
|
828
832
|
}
|
|
829
833
|
|
|
830
834
|
export interface PluginMetadata {
|
|
@@ -866,6 +870,7 @@ class WorkflowsImpl implements Workflows {
|
|
|
866
870
|
private readyTimeoutMs: number;
|
|
867
871
|
private killGraceMs: number;
|
|
868
872
|
private sqlitePragmas?: SqlitePragmaConfig;
|
|
873
|
+
private useWatchdog: boolean;
|
|
869
874
|
private resumeStrategy!: WorkflowResumeStrategy;
|
|
870
875
|
private workflowModulePaths = new Map<string, string>();
|
|
871
876
|
private isolatedProcesses = new Map<string, IsolatedProcessInfo>();
|
|
@@ -902,6 +907,7 @@ class WorkflowsImpl implements Workflows {
|
|
|
902
907
|
this.readyTimeoutMs = config.readyTimeout ?? 10000;
|
|
903
908
|
this.killGraceMs = config.killGraceMs ?? 5000;
|
|
904
909
|
this.sqlitePragmas = config.sqlitePragmas;
|
|
910
|
+
this.useWatchdog = config.useWatchdog ?? false;
|
|
905
911
|
this.resumeStrategy = config.resumeStrategy ?? "blocking";
|
|
906
912
|
}
|
|
907
913
|
|
|
@@ -962,6 +968,10 @@ class WorkflowsImpl implements Workflows {
|
|
|
962
968
|
this.pluginCustomErrors = metadata.customErrors;
|
|
963
969
|
}
|
|
964
970
|
|
|
971
|
+
getDbPath(): string | undefined {
|
|
972
|
+
return this.dbPath;
|
|
973
|
+
}
|
|
974
|
+
|
|
965
975
|
async updateMetadata(instanceId: string, key: string, value: any): Promise<void> {
|
|
966
976
|
const instance = await this.adapter.getInstance(instanceId);
|
|
967
977
|
if (!instance) return;
|
|
@@ -1517,6 +1527,17 @@ class WorkflowsImpl implements Workflows {
|
|
|
1517
1527
|
// Set up heartbeat timeout
|
|
1518
1528
|
this.resetHeartbeatTimeout(instanceId, proc.pid);
|
|
1519
1529
|
|
|
1530
|
+
const instance = await this.adapter.getInstance(instanceId);
|
|
1531
|
+
const metadata = { ...(instance?.metadata ?? {}) } as Record<string, any>;
|
|
1532
|
+
metadata.__watchdog = {
|
|
1533
|
+
...(metadata.__watchdog ?? {}),
|
|
1534
|
+
pid: proc.pid,
|
|
1535
|
+
socketPath,
|
|
1536
|
+
tcpPort,
|
|
1537
|
+
lastHeartbeat: new Date().toISOString(),
|
|
1538
|
+
};
|
|
1539
|
+
await this.adapter.updateInstance(instanceId, { metadata });
|
|
1540
|
+
|
|
1520
1541
|
const exitBeforeReady = proc.exited.then((exitCode) => {
|
|
1521
1542
|
throw new Error(`Subprocess exited before ready (code ${exitCode})`);
|
|
1522
1543
|
});
|
|
@@ -1590,7 +1611,8 @@ class WorkflowsImpl implements Workflows {
|
|
|
1590
1611
|
|
|
1591
1612
|
case "started":
|
|
1592
1613
|
case "heartbeat":
|
|
1593
|
-
//
|
|
1614
|
+
// Update heartbeat tracking metadata
|
|
1615
|
+
await this.updateWatchdogHeartbeat(instanceId);
|
|
1594
1616
|
break;
|
|
1595
1617
|
|
|
1596
1618
|
case "step.started": {
|
|
@@ -1958,6 +1980,17 @@ class WorkflowsImpl implements Workflows {
|
|
|
1958
1980
|
this.rejectIsolatedReady(instanceId, new Error("Isolated workflow cleaned up"));
|
|
1959
1981
|
}
|
|
1960
1982
|
|
|
1983
|
+
private async updateWatchdogHeartbeat(instanceId: string): Promise<void> {
|
|
1984
|
+
const instance = await this.adapter.getInstance(instanceId);
|
|
1985
|
+
if (!instance) return;
|
|
1986
|
+
const metadata = { ...(instance.metadata ?? {}) } as Record<string, any>;
|
|
1987
|
+
metadata.__watchdog = {
|
|
1988
|
+
...(metadata.__watchdog ?? {}),
|
|
1989
|
+
lastHeartbeat: new Date().toISOString(),
|
|
1990
|
+
};
|
|
1991
|
+
await this.adapter.updateInstance(instanceId, { metadata });
|
|
1992
|
+
}
|
|
1993
|
+
|
|
1961
1994
|
private async markOrphanedAsFailed(
|
|
1962
1995
|
instances: WorkflowInstance[],
|
|
1963
1996
|
reason: string
|
|
@@ -1990,6 +2023,7 @@ class WorkflowsImpl implements Workflows {
|
|
|
1990
2023
|
* Reset heartbeat timeout for an isolated workflow
|
|
1991
2024
|
*/
|
|
1992
2025
|
private resetHeartbeatTimeout(instanceId: string, pid: number): void {
|
|
2026
|
+
if (this.useWatchdog) return;
|
|
1993
2027
|
const info = this.isolatedProcesses.get(instanceId);
|
|
1994
2028
|
if (!info) return;
|
|
1995
2029
|
|
package/src/server.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import { z } from "zod";
|
|
2
2
|
import { mkdir, writeFile } from "node:fs/promises";
|
|
3
|
-
import { dirname } from "node:path";
|
|
3
|
+
import { dirname, join } from "node:path";
|
|
4
|
+
import { fileURLToPath } from "node:url";
|
|
4
5
|
import { PluginManager, type CoreServices, type ConfiguredPlugin } from "./core";
|
|
5
6
|
import { type IRouter, type RouteDefinition, type ServerContext, type HandlerRegistry } from "./router";
|
|
6
7
|
import { Handlers } from "./handlers";
|
|
@@ -87,6 +88,13 @@ export interface ServerConfig {
|
|
|
87
88
|
*/
|
|
88
89
|
workflowsResumeStrategy?: "blocking" | "background" | "skip";
|
|
89
90
|
processes?: ProcessesConfig;
|
|
91
|
+
/** Watchdog subprocess configuration */
|
|
92
|
+
watchdog?: {
|
|
93
|
+
enabled?: boolean;
|
|
94
|
+
intervalMs?: number;
|
|
95
|
+
services?: ("workflows" | "jobs" | "processes")[];
|
|
96
|
+
killGraceMs?: number;
|
|
97
|
+
};
|
|
90
98
|
audit?: AuditConfig;
|
|
91
99
|
websocket?: WebSocketConfig;
|
|
92
100
|
storage?: StorageConfig;
|
|
@@ -221,6 +229,9 @@ export class AppServer {
|
|
|
221
229
|
private generateModeSetup = false;
|
|
222
230
|
private initMode: "adapter" | "server" = "server";
|
|
223
231
|
private workflowsResumeStrategy?: "blocking" | "background" | "skip";
|
|
232
|
+
private watchdogConfig?: ServerConfig["watchdog"];
|
|
233
|
+
private watchdogStarted = false;
|
|
234
|
+
private options: ServerConfig;
|
|
224
235
|
|
|
225
236
|
// Custom services registry
|
|
226
237
|
private serviceFactories = new Map<string, ServiceFactory<any>>();
|
|
@@ -228,11 +239,13 @@ export class AppServer {
|
|
|
228
239
|
private generateModeTimer?: ReturnType<typeof setTimeout>;
|
|
229
240
|
|
|
230
241
|
constructor(options: ServerConfig) {
|
|
242
|
+
this.options = options;
|
|
231
243
|
// Port priority: explicit config > PORT env var > default 3000
|
|
232
244
|
const envPort = process.env.PORT ? parseInt(process.env.PORT, 10) : undefined;
|
|
233
245
|
this.port = options.port ?? envPort ?? 3000;
|
|
234
246
|
this.maxPortAttempts = options.maxPortAttempts ?? 5;
|
|
235
247
|
this.workflowsResumeStrategy = options.workflowsResumeStrategy ?? options.workflows?.resumeStrategy;
|
|
248
|
+
this.watchdogConfig = options.watchdog;
|
|
236
249
|
|
|
237
250
|
// Determine if we should use legacy databases
|
|
238
251
|
const useLegacy = options.useLegacyCoreDatabases ?? false;
|
|
@@ -286,6 +299,10 @@ export class AppServer {
|
|
|
286
299
|
events,
|
|
287
300
|
logger,
|
|
288
301
|
adapter: jobAdapter,
|
|
302
|
+
external: {
|
|
303
|
+
...options.jobs?.external,
|
|
304
|
+
useWatchdog: options.watchdog?.enabled ? true : options.jobs?.external?.useWatchdog,
|
|
305
|
+
},
|
|
289
306
|
// Disable built-in persistence when using Kysely adapter
|
|
290
307
|
persist: useLegacy ? options.jobs?.persist : false,
|
|
291
308
|
});
|
|
@@ -297,12 +314,14 @@ export class AppServer {
|
|
|
297
314
|
jobs,
|
|
298
315
|
sse,
|
|
299
316
|
adapter: workflowAdapter,
|
|
317
|
+
useWatchdog: options.watchdog?.enabled ? true : options.workflows?.useWatchdog,
|
|
300
318
|
});
|
|
301
319
|
|
|
302
320
|
// Processes - still uses its own adapter pattern but can use Kysely
|
|
303
321
|
const processes = createProcesses({
|
|
304
322
|
...options.processes,
|
|
305
323
|
events,
|
|
324
|
+
useWatchdog: options.watchdog?.enabled ? true : options.processes?.useWatchdog,
|
|
306
325
|
});
|
|
307
326
|
|
|
308
327
|
// New services
|
|
@@ -1055,6 +1074,7 @@ ${factoryFunction}
|
|
|
1055
1074
|
await this.coreServices.workflows.resume();
|
|
1056
1075
|
}
|
|
1057
1076
|
this.coreServices.processes.start();
|
|
1077
|
+
await this.startWatchdog();
|
|
1058
1078
|
logger.info("Background services started (cron, jobs, workflows, processes)");
|
|
1059
1079
|
|
|
1060
1080
|
for (const router of this.routers) {
|
|
@@ -1073,6 +1093,69 @@ ${factoryFunction}
|
|
|
1073
1093
|
await this.runReadyHandlers();
|
|
1074
1094
|
}
|
|
1075
1095
|
|
|
1096
|
+
private async startWatchdog(): Promise<void> {
|
|
1097
|
+
if (!this.watchdogConfig?.enabled) return;
|
|
1098
|
+
if (this.watchdogStarted) return;
|
|
1099
|
+
|
|
1100
|
+
const executorPath = join(dirname(fileURLToPath(import.meta.url)), "core", "watchdog-executor.ts");
|
|
1101
|
+
const services = this.watchdogConfig.services ?? ["workflows", "jobs", "processes"];
|
|
1102
|
+
const workflowsDbPath = this.coreServices.workflows.getDbPath?.();
|
|
1103
|
+
const jobsDbPath = (this.options.jobs?.dbPath ?? workflowsDbPath ?? ".donkeylabs/jobs.db") as string;
|
|
1104
|
+
const processesDbPath = (this.options.processes?.adapter?.path ?? ".donkeylabs/processes.db") as string;
|
|
1105
|
+
|
|
1106
|
+
const config = {
|
|
1107
|
+
intervalMs: this.watchdogConfig.intervalMs ?? 5000,
|
|
1108
|
+
services,
|
|
1109
|
+
killGraceMs: this.watchdogConfig.killGraceMs ?? 5000,
|
|
1110
|
+
workflowHeartbeatTimeoutMs: this.options.workflows?.heartbeatTimeout ?? 60000,
|
|
1111
|
+
jobDefaults: {
|
|
1112
|
+
heartbeatTimeoutMs: this.options.jobs?.external?.defaultHeartbeatTimeout ?? 30000,
|
|
1113
|
+
killGraceMs: this.options.jobs?.external?.killGraceMs ?? this.watchdogConfig.killGraceMs ?? 5000,
|
|
1114
|
+
},
|
|
1115
|
+
jobConfigs: this.coreServices.jobs.getExternalJobConfigs(),
|
|
1116
|
+
workflows: workflowsDbPath ? { dbPath: workflowsDbPath } : undefined,
|
|
1117
|
+
jobs: jobsDbPath ? { dbPath: jobsDbPath } : undefined,
|
|
1118
|
+
processes: processesDbPath ? { dbPath: processesDbPath } : undefined,
|
|
1119
|
+
sqlitePragmas: this.options.workflows?.sqlitePragmas,
|
|
1120
|
+
};
|
|
1121
|
+
|
|
1122
|
+
try {
|
|
1123
|
+
this.coreServices.processes.register({
|
|
1124
|
+
name: "__watchdog",
|
|
1125
|
+
config: {
|
|
1126
|
+
command: "bun",
|
|
1127
|
+
args: ["run", executorPath],
|
|
1128
|
+
env: {
|
|
1129
|
+
DONKEYLABS_WATCHDOG_CONFIG: JSON.stringify(config),
|
|
1130
|
+
},
|
|
1131
|
+
heartbeat: { intervalMs: 5000, timeoutMs: 30000 },
|
|
1132
|
+
},
|
|
1133
|
+
});
|
|
1134
|
+
} catch {
|
|
1135
|
+
// Already registered
|
|
1136
|
+
}
|
|
1137
|
+
|
|
1138
|
+
await this.coreServices.processes.spawn("__watchdog", {
|
|
1139
|
+
metadata: { role: "watchdog" },
|
|
1140
|
+
});
|
|
1141
|
+
|
|
1142
|
+
this.coreServices.events.on("process.event", async (data: any) => {
|
|
1143
|
+
if (data?.name !== "__watchdog") return;
|
|
1144
|
+
if (!data.event) return;
|
|
1145
|
+
|
|
1146
|
+
await this.coreServices.events.emit(data.event, data.data ?? {});
|
|
1147
|
+
|
|
1148
|
+
if (data.event.startsWith("workflow.watchdog")) {
|
|
1149
|
+
const instanceId = data.data?.instanceId;
|
|
1150
|
+
if (instanceId && this.coreServices.sse) {
|
|
1151
|
+
this.coreServices.sse.broadcast(`workflow:${instanceId}`, data.event, data.data ?? {});
|
|
1152
|
+
}
|
|
1153
|
+
}
|
|
1154
|
+
});
|
|
1155
|
+
|
|
1156
|
+
this.watchdogStarted = true;
|
|
1157
|
+
}
|
|
1158
|
+
|
|
1076
1159
|
/**
|
|
1077
1160
|
* Handle a single API request. Used by adapters.
|
|
1078
1161
|
* Returns null if the route is not found.
|