@donkeylabs/server 2.0.29 → 2.0.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/docs/jobs.md CHANGED
@@ -204,6 +204,13 @@ ctx.core.jobs.registerExternal("batchWorker", {
204
204
  timeout: 10 * 60 * 1000,
205
205
  killGraceMs: 5000,
206
206
  });
207
+
208
+ // Disable in-process timers when using the watchdog subprocess
209
+ const server = new AppServer({
210
+ db,
211
+ watchdog: { enabled: true },
212
+ jobs: { external: { useWatchdog: true } },
213
+ });
207
214
  ```
208
215
 
209
216
  Watchdog events:
package/docs/processes.md CHANGED
@@ -248,6 +248,8 @@ Watchdog events:
248
248
  - `process.watchdog.stale`
249
249
  - `process.watchdog.killed`
250
250
 
251
+ When `watchdog.enabled` is true, heartbeat monitoring and `limits.maxRuntimeMs` enforcement run in the watchdog subprocess instead of in-process timers.
252
+
251
253
  ### Properties
252
254
 
253
255
  ```typescript
package/docs/workflows.md CHANGED
@@ -318,6 +318,12 @@ You can tune subprocess termination and SQLite pragmas used by isolated workflow
318
318
  ```ts
319
319
  const server = new AppServer({
320
320
  db,
321
+ watchdog: {
322
+ enabled: true,
323
+ intervalMs: 5000,
324
+ services: ["workflows", "jobs", "processes"],
325
+ killGraceMs: 5000,
326
+ },
321
327
  workflows: {
322
328
  killGraceMs: 5000,
323
329
  sqlitePragmas: {
@@ -329,6 +335,8 @@ const server = new AppServer({
329
335
  });
330
336
  ```
331
337
 
338
+ When `watchdog.enabled` is true, workflow heartbeat timers run in the watchdog subprocess instead of the main server.
339
+
332
340
  Watchdog events:
333
341
  - `workflow.watchdog.stale` (heartbeat missed)
334
342
  - `workflow.watchdog.killed` (process terminated)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@donkeylabs/server",
3
- "version": "2.0.29",
3
+ "version": "2.0.30",
4
4
  "type": "module",
5
5
  "description": "Type-safe plugin system for building RPC-style APIs with Bun",
6
6
  "main": "./src/index.ts",
@@ -124,6 +124,8 @@ export interface ExternalJobsConfig {
124
124
  heartbeatCheckInterval?: number;
125
125
  /** Default grace period before SIGKILL when terminating (ms, default: 5000) */
126
126
  killGraceMs?: number;
127
+ /** Disable in-process watchdog timers (use external watchdog instead) */
128
+ useWatchdog?: boolean;
127
129
  }
128
130
 
129
131
  // ============================================
package/src/core/jobs.ts CHANGED
@@ -141,6 +141,8 @@ export interface Jobs {
141
141
  getRunningExternal(): Promise<Job[]>;
142
142
  /** Get all jobs with optional filtering (for admin dashboard) */
143
143
  getAll(options?: GetAllJobsOptions): Promise<Job[]>;
144
+ /** Get external job config snapshot for watchdog */
145
+ getExternalJobConfigs(): Record<string, ExternalJobConfig>;
144
146
  /** Start the job processing loop */
145
147
  start(): void;
146
148
  /** Stop the job processing and cleanup */
@@ -325,6 +327,14 @@ class JobsImpl implements Jobs {
325
327
  this.externalConfigs.set(name, config);
326
328
  }
327
329
 
330
/**
 * Return a snapshot of every registered external job config, keyed by job name.
 *
 * Each config is shallow-copied, so adding or replacing top-level fields on the
 * returned objects does not affect the registry held by this instance.
 */
getExternalJobConfigs(): Record<string, ExternalJobConfig> {
  const byName: Record<string, ExternalJobConfig> = {};
  this.externalConfigs.forEach((jobConfig, jobName) => {
    byName[jobName] = { ...jobConfig };
  });
  return byName;
}
337
+
328
338
  private isExternalJob(name: string): boolean {
329
339
  return this.externalConfigs.has(name);
330
340
  }
@@ -422,7 +432,9 @@ class JobsImpl implements Jobs {
422
432
  // Initialize socket server for external jobs
423
433
  if (this.externalConfigs.size > 0) {
424
434
  this.initializeSocketServer();
425
- this.startHeartbeatMonitor();
435
+ if (!this.externalConfig.useWatchdog) {
436
+ this.startHeartbeatMonitor();
437
+ }
426
438
  // Attempt to reconnect to orphaned jobs from previous run
427
439
  this.reconnectOrphanedJobs();
428
440
  }
@@ -891,7 +903,7 @@ class JobsImpl implements Jobs {
891
903
  proc.stdin.end();
892
904
 
893
905
  // Set up process timeout if configured
894
- if (config.timeout) {
906
+ if (config.timeout && !this.externalConfig.useWatchdog) {
895
907
  const timeout = setTimeout(async () => {
896
908
  console.warn(`[Jobs] External job ${job.id} timed out after ${config.timeout}ms`);
897
909
  const killGraceMs = config.killGraceMs ?? this.externalConfig.killGraceMs ?? 5000;
@@ -182,6 +182,8 @@ export interface ProcessesConfig {
182
182
  autoRecoverOrphans?: boolean;
183
183
  /** Grace period before SIGKILL when stopping/killing (ms, default: 5000) */
184
184
  killGraceMs?: number;
185
+ /** Disable in-process watchdog timers (use external watchdog instead) */
186
+ useWatchdog?: boolean;
185
187
  }
186
188
 
187
189
  // ============================================
@@ -264,6 +266,7 @@ export class ProcessesImpl implements Processes {
264
266
  private autoRecoverOrphans: boolean;
265
267
  private killGraceMs: number;
266
268
  private runtimeLimitTimers = new Map<string, ReturnType<typeof setTimeout>>();
269
+ private useWatchdog: boolean;
267
270
 
268
271
  // Track running Bun subprocesses
269
272
  private subprocesses = new Map<string, Subprocess>();
@@ -280,6 +283,7 @@ export class ProcessesImpl implements Processes {
280
283
  this.heartbeatCheckInterval = config.heartbeatCheckInterval ?? 10000;
281
284
  this.autoRecoverOrphans = config.autoRecoverOrphans ?? true;
282
285
  this.killGraceMs = config.killGraceMs ?? 5000;
286
+ this.useWatchdog = config.useWatchdog ?? false;
283
287
 
284
288
  // Create socket server with callbacks
285
289
  this.socketServer = createProcessSocketServer(config.socket ?? {}, {
@@ -376,7 +380,7 @@ export class ProcessesImpl implements Processes {
376
380
  proc.exited.then((exitCode) => this.handleExit(process.id, exitCode));
377
381
 
378
382
  const maxRuntimeMs = config.limits?.maxRuntimeMs;
379
- if (maxRuntimeMs && maxRuntimeMs > 0) {
383
+ if (!this.useWatchdog && maxRuntimeMs && maxRuntimeMs > 0) {
380
384
  const timer = setTimeout(async () => {
381
385
  console.warn(`[Processes] Max runtime exceeded for ${name} (${process.id})`);
382
386
  await this.emitEvent("process.limits_exceeded", {
@@ -895,6 +899,7 @@ export class ProcessesImpl implements Processes {
895
899
  }
896
900
 
897
901
  private startHeartbeatMonitor(): void {
902
+ if (this.useWatchdog) return;
898
903
  this.heartbeatMonitor = setInterval(async () => {
899
904
  if (this.isShuttingDown) return;
900
905
 
@@ -0,0 +1,80 @@
1
+ import { Kysely } from "kysely";
2
+ import { BunSqliteDialect } from "kysely-bun-sqlite";
3
+ import Database from "bun:sqlite";
4
+ import { ProcessClient } from "./process-client";
5
+ import { KyselyWorkflowAdapter } from "./workflow-adapter-kysely";
6
+ import { KyselyJobAdapter } from "./job-adapter-kysely";
7
+ import { SqliteProcessAdapter } from "./process-adapter-sqlite";
8
+ import { WatchdogRunner, type WatchdogRunnerConfig } from "./watchdog-runner";
9
+
10
/** Where a service's SQLite database file lives; omitted when the service is not watched. */
type ServiceDbLocation = { dbPath?: string };

/**
 * Full configuration of the watchdog subprocess as parsed from the
 * DONKEYLABS_WATCHDOG_CONFIG environment variable: the runner settings plus the
 * polling interval, per-service database locations, and optional SQLite pragmas.
 */
interface WatchdogConfig extends WatchdogRunnerConfig {
  /** Delay between watchdog passes, in milliseconds. */
  intervalMs: number;
  /** Workflow database location. */
  workflows?: ServiceDbLocation;
  /** Job database location. */
  jobs?: ServiceDbLocation;
  /** Process database location. */
  processes?: ServiceDbLocation;
  /** Pragmas applied to the SQLite connections opened via createDb. */
  sqlitePragmas?: {
    busyTimeout?: number;
    synchronous?: "OFF" | "NORMAL" | "FULL" | "EXTRA";
    journalMode?: "DELETE" | "TRUNCATE" | "PERSIST" | "MEMORY" | "WAL" | "OFF";
  };
}
21
+
22
// Entry point of the watchdog subprocess.
//
// The parent server passes the whole configuration as JSON in the
// DONKEYLABS_WATCHDOG_CONFIG environment variable. This script opens the
// configured databases, builds a WatchdogRunner, and runs one pass per interval
// until SIGTERM is received.
const raw = process.env.DONKEYLABS_WATCHDOG_CONFIG;
if (!raw) {
  throw new Error("Missing DONKEYLABS_WATCHDOG_CONFIG");
}

const config: WatchdogConfig = JSON.parse(raw);
const client = await ProcessClient.connect();

// An adapter is only created for a service whose database path was provided.
const workflowAdapter = config.workflows?.dbPath
  ? new KyselyWorkflowAdapter(createDb(config.workflows.dbPath, config.sqlitePragmas), {
      cleanupDays: 0,
    })
  : undefined;
const jobAdapter = config.jobs?.dbPath
  ? new KyselyJobAdapter(createDb(config.jobs.dbPath, config.sqlitePragmas), {
      cleanupDays: 0,
    })
  : undefined;
const processAdapter = config.processes?.dbPath
  ? new SqliteProcessAdapter({ path: config.processes.dbPath, cleanupDays: 0 })
  : undefined;

const runner = new WatchdogRunner(config, {
  workflowsAdapter: workflowAdapter,
  jobsAdapter: jobAdapter,
  processesAdapter: processAdapter,
  // Watchdog events are forwarded to the parent through the process client.
  emit: async (event, data) => {
    await client.emit(event, data);
  },
});

// Never poll faster than once per second. If intervalMs is missing or not a
// finite number (the JSON is not validated), use 5000 ms so that NaN is never
// handed to setInterval.
const FALLBACK_INTERVAL_MS = 5000;
const interval = Number.isFinite(config.intervalMs)
  ? Math.max(1000, config.intervalMs)
  : FALLBACK_INTERVAL_MS;

// A pass can outlast the interval because killing a process waits for its
// grace period. Skip ticks while a pass is still running so the same stale
// record is not handled twice.
let passInProgress = false;
const timer = setInterval(() => {
  if (passInProgress) return;
  passInProgress = true;
  runner
    .runOnce()
    .catch((error: unknown) => {
      // Keep the watchdog alive, but make the failure visible instead of dropping it.
      console.error("[Watchdog] Check pass failed:", error);
    })
    .finally(() => {
      passInProgress = false;
    });
}, interval);

process.on("SIGTERM", async () => {
  clearInterval(timer);
  client.disconnect();
});
62
+
63
/**
 * Open a SQLite database with bun:sqlite, apply the requested pragmas, and wrap
 * the connection in a Kysely instance.
 *
 * Pragma values come from an unvalidated JSON environment variable, so they are
 * checked before being interpolated into SQL. An unusable `busyTimeout` falls
 * back to 5000 ms, and an unrecognised `journalMode` / `synchronous` keyword is
 * skipped with a warning.
 *
 * @param dbPath  Path of the SQLite database file.
 * @param pragmas Optional pragma overrides.
 * @returns A Kysely instance bound to the opened database.
 */
function createDb(
  dbPath: string,
  pragmas?: { busyTimeout?: number; synchronous?: string; journalMode?: string }
): Kysely<any> {
  // The allowlists live inside the function: createDb is hoisted and called
  // before the end of the module is evaluated, so module-level constants
  // declared next to it would not be initialised yet.
  const journalModes = ["DELETE", "TRUNCATE", "PERSIST", "MEMORY", "WAL", "OFF"];
  const synchronousModes = ["OFF", "NORMAL", "FULL", "EXTRA"];

  const pickKeyword = (
    pragmaName: string,
    value: string | undefined,
    allowed: string[]
  ): string | undefined => {
    if (!value) return undefined;
    const keyword = String(value).toUpperCase();
    if (allowed.includes(keyword)) return keyword;
    console.warn(`[Watchdog] Ignoring unsupported ${pragmaName} value: ${JSON.stringify(value)}`);
    return undefined;
  };

  const sqlite = new Database(dbPath);

  const requestedTimeout = pragmas?.busyTimeout;
  const busyTimeout =
    typeof requestedTimeout === "number" && Number.isFinite(requestedTimeout) && requestedTimeout >= 0
      ? Math.floor(requestedTimeout)
      : 5000;
  sqlite.run(`PRAGMA busy_timeout = ${busyTimeout}`);

  const journalMode = pickKeyword("journal_mode", pragmas?.journalMode, journalModes);
  if (journalMode) {
    sqlite.run(`PRAGMA journal_mode = ${journalMode}`);
  }

  const synchronous = pickKeyword("synchronous", pragmas?.synchronous, synchronousModes);
  if (synchronous) {
    sqlite.run(`PRAGMA synchronous = ${synchronous}`);
  }

  return new Kysely<any>({
    dialect: new BunSqliteDialect({ database: sqlite }),
  });
}
@@ -0,0 +1,276 @@
1
+ import type { WorkflowAdapter, WorkflowInstance } from "./workflows";
2
+ import type { JobAdapter, Job } from "./jobs";
3
+ import type { ProcessAdapter } from "./process-adapter-sqlite";
4
+
5
/** Names of the core services the watchdog can supervise. */
export type WatchdogService =
  | "workflows"
  | "jobs"
  | "processes";
6
+
7
/** Per-job-name overrides read by the watchdog when checking external jobs. */
type WatchdogJobOverrides = {
  heartbeatTimeout?: number;
  timeout?: number;
  killGraceMs?: number;
};

/** Settings that drive a WatchdogRunner pass. */
export interface WatchdogRunnerConfig {
  /** Services to check; a service is only checked when its adapter is also supplied. */
  services: WatchdogService[];
  /** Grace period (ms) between SIGTERM and SIGKILL for workflows and processes. */
  killGraceMs: number;
  /** Maximum age (ms) of a workflow heartbeat before the workflow is considered stale. */
  workflowHeartbeatTimeoutMs: number;
  /** Values used for external jobs that have no per-job override. */
  jobDefaults: {
    heartbeatTimeoutMs: number;
    killGraceMs: number;
  };
  /** Overrides keyed by external job name. */
  jobConfigs: Record<string, WatchdogJobOverrides>;
}
17
+
18
/**
 * Collaborators injected into WatchdogRunner. All members are optional: a
 * missing adapter disables the matching service check, and the remaining hooks
 * fall back to Date.now / process.kill when absent.
 */
export interface WatchdogRunnerDeps {
  /** Storage adapter used to list and update running workflow instances. */
  workflowsAdapter?: WorkflowAdapter;
  /** Storage adapter used to list and update running external jobs. */
  jobsAdapter?: JobAdapter;
  /** Storage adapter used to list and update running managed processes. */
  processesAdapter?: ProcessAdapter;
  /** Clock override returning epoch milliseconds. */
  now?: () => number;
  /** Signal-delivery override. */
  killProcess?: (pid: number, signal: NodeJS.Signals) => void;
  /** Liveness-probe override consulted before escalating to SIGKILL. */
  isProcessAlive?: (pid: number) => boolean;
  /** Sink for watchdog events; events are dropped when not provided. */
  emit?: (event: string, data: Record<string, any>) => Promise<void>;
}
27
+
28
/**
 * Finds running workflows, external jobs, and managed processes whose heartbeat
 * or runtime limit has been exceeded, terminates the owning OS process, records
 * the failure through the storage adapter, and emits `*.watchdog.stale` and
 * `*.watchdog.killed` events.
 *
 * The runner owns no timers; callers invoke {@link WatchdogRunner.runOnce}
 * on whatever schedule they need.
 */
export class WatchdogRunner {
  private config: WatchdogRunnerConfig;
  private deps: WatchdogRunnerDeps;
  /** The pass currently executing, if any; used to coalesce overlapping calls. */
  private inFlight: Promise<void> | undefined;

  /**
   * @param config Timeouts, grace periods, and the list of services to check.
   * @param deps   Storage adapters plus optional clock/kill/emit overrides.
   */
  constructor(config: WatchdogRunnerConfig, deps: WatchdogRunnerDeps) {
    this.config = config;
    this.deps = deps;
  }

  /**
   * Run one watchdog pass over every enabled service that has an adapter.
   *
   * A pass can take longer than the caller's polling interval because each
   * kill waits for its grace period. If a pass is already running, this call
   * joins it instead of starting a second one, so a stale record is never
   * killed, updated, or reported twice by overlapping passes.
   *
   * @throws The first error raised by a service check, after all enabled
   *         services have been attempted.
   */
  async runOnce(): Promise<void> {
    if (this.inFlight) return this.inFlight;

    const pass = this.runEnabledChecks().finally(() => {
      this.inFlight = undefined;
    });
    this.inFlight = pass;
    return pass;
  }

  /**
   * Run the enabled service checks in order: workflows, jobs, processes.
   * A failure in one service does not stop the others from being checked.
   */
  private async runEnabledChecks(): Promise<void> {
    const { services } = this.config;
    const checks: Array<() => Promise<void>> = [];

    if (services.includes("workflows") && this.deps.workflowsAdapter) {
      checks.push(() => this.checkWorkflows());
    }
    if (services.includes("jobs") && this.deps.jobsAdapter) {
      checks.push(() => this.checkJobs());
    }
    if (services.includes("processes") && this.deps.processesAdapter) {
      checks.push(() => this.checkProcesses());
    }

    let failed = false;
    let firstError: unknown;
    for (const check of checks) {
      try {
        await check();
      } catch (error: unknown) {
        // Remember the first failure and keep going so the remaining services
        // are still supervised.
        if (!failed) {
          failed = true;
          firstError = error;
        }
      }
    }
    if (failed) throw firstError;
  }

  /** Kill and fail workflow instances whose watchdog heartbeat is too old. */
  private async checkWorkflows(): Promise<void> {
    const adapter = this.deps.workflowsAdapter!;
    const now = this.now();
    const instances = await adapter.getRunningInstances();

    for (const instance of instances) {
      const info = this.getWatchdogMetadata(instance);
      // Only instances with a recorded subprocess pid can be supervised.
      if (!info?.pid) continue;

      const last = info.lastHeartbeat ?? instance.startedAt?.getTime() ?? 0;
      if (now - last <= this.config.workflowHeartbeatTimeoutMs) continue;

      await this.emit("workflow.watchdog.stale", {
        instanceId: instance.id,
        pid: info.pid,
        timeoutMs: this.config.workflowHeartbeatTimeoutMs,
      });

      await this.killProcessWithGrace(info.pid, this.config.killGraceMs);

      await adapter.updateInstance(instance.id, {
        status: "failed",
        error: "Watchdog killed unresponsive workflow",
        completedAt: new Date(),
      });

      await this.emit("workflow.watchdog.killed", {
        instanceId: instance.id,
        pid: info.pid,
        reason: "heartbeat",
      });
    }
  }

  /**
   * Kill and fail external jobs that missed their heartbeat or exceeded their
   * configured total timeout. The heartbeat check takes precedence.
   */
  private async checkJobs(): Promise<void> {
    const adapter = this.deps.jobsAdapter!;
    const now = this.now();
    const jobs = await adapter.getRunningExternal();

    for (const job of jobs) {
      if (!job.pid) continue;
      const config = this.config.jobConfigs[job.name] ?? {};
      const heartbeatTimeout =
        config.heartbeatTimeout ?? this.config.jobDefaults.heartbeatTimeoutMs;
      const killGraceMs = config.killGraceMs ?? this.config.jobDefaults.killGraceMs;
      const lastHeartbeat = job.lastHeartbeat?.getTime() ?? job.startedAt?.getTime() ?? 0;

      if (now - lastHeartbeat > heartbeatTimeout) {
        await this.terminateJob(job, job.pid, killGraceMs, {
          timeoutMs: heartbeatTimeout,
          killedReason: "heartbeat",
          error: "Watchdog killed unresponsive job",
        });
        continue;
      }

      if (config.timeout && job.startedAt && now - job.startedAt.getTime() > config.timeout) {
        await this.terminateJob(job, job.pid, killGraceMs, {
          timeoutMs: config.timeout,
          staleReason: "timeout",
          killedReason: "timeout",
          error: "Watchdog killed job after timeout",
        });
      }
    }
  }

  /** Emit stale, kill the job's process, mark the job failed, then emit killed. */
  private async terminateJob(
    job: { id: string; name: string },
    pid: number,
    killGraceMs: number,
    outcome: { timeoutMs: number; staleReason?: string; killedReason: string; error: string }
  ): Promise<void> {
    await this.emit("job.watchdog.stale", {
      jobId: job.id,
      name: job.name,
      pid,
      timeoutMs: outcome.timeoutMs,
      // The heartbeat variant of this event carries no `reason` field.
      ...(outcome.staleReason ? { reason: outcome.staleReason } : {}),
    });

    await this.killProcessWithGrace(pid, killGraceMs);

    await this.deps.jobsAdapter!.update(job.id, {
      status: "failed",
      error: outcome.error,
      completedAt: new Date(),
      processState: "orphaned",
    });

    await this.emit("job.watchdog.killed", {
      jobId: job.id,
      name: job.name,
      pid,
      reason: outcome.killedReason,
    });
  }

  /**
   * Kill and mark as crashed managed processes that missed their heartbeat or
   * exceeded `limits.maxRuntimeMs`. The heartbeat check takes precedence, and
   * it only applies to processes that configure `heartbeat.timeoutMs`.
   */
  private async checkProcesses(): Promise<void> {
    const adapter = this.deps.processesAdapter!;
    const now = this.now();
    const running = await adapter.getRunning();

    for (const proc of running) {
      if (!proc.pid) continue;
      const heartbeatTimeout = proc.config.heartbeat?.timeoutMs;
      const lastHeartbeat = proc.lastHeartbeat?.getTime() ?? proc.startedAt?.getTime() ?? 0;

      if (heartbeatTimeout && now - lastHeartbeat > heartbeatTimeout) {
        await this.terminateProcess(proc, proc.pid, {
          timeoutMs: heartbeatTimeout,
          killedReason: "heartbeat",
          error: "Watchdog killed unresponsive process",
        });
        continue;
      }

      const maxRuntimeMs = proc.config.limits?.maxRuntimeMs;
      if (maxRuntimeMs && proc.startedAt && now - proc.startedAt.getTime() > maxRuntimeMs) {
        await this.terminateProcess(proc, proc.pid, {
          timeoutMs: maxRuntimeMs,
          staleReason: "maxRuntimeMs",
          killedReason: "maxRuntimeMs",
          error: "Watchdog killed process after max runtime",
        });
      }
    }
  }

  /** Emit stale, kill the managed process, mark it crashed, then emit killed. */
  private async terminateProcess(
    proc: { id: string; name: string },
    pid: number,
    outcome: { timeoutMs: number; staleReason?: string; killedReason: string; error: string }
  ): Promise<void> {
    await this.emit("process.watchdog.stale", {
      processId: proc.id,
      name: proc.name,
      pid,
      timeoutMs: outcome.timeoutMs,
      // The heartbeat variant of this event carries no `reason` field.
      ...(outcome.staleReason ? { reason: outcome.staleReason } : {}),
    });

    await this.killProcessWithGrace(pid, this.config.killGraceMs);

    await this.deps.processesAdapter!.update(proc.id, {
      status: "crashed",
      error: outcome.error,
      stoppedAt: new Date(),
    });

    await this.emit("process.watchdog.killed", {
      processId: proc.id,
      name: proc.name,
      pid,
      reason: outcome.killedReason,
    });
  }

  /**
   * Read the `__watchdog` entry from a workflow instance's metadata.
   *
   * @returns The recorded pid and last heartbeat (epoch ms), or null when the
   *          instance carries no watchdog metadata. A heartbeat value that
   *          cannot be parsed as a date is reported as undefined so the caller
   *          falls back to `startedAt`; otherwise NaN would skip the freshness
   *          comparison and a healthy workflow would be killed.
   */
  private getWatchdogMetadata(instance: WorkflowInstance): { pid?: number; lastHeartbeat?: number } | null {
    const meta = instance.metadata as any;
    if (!meta || typeof meta !== "object") return null;
    const info = meta.__watchdog;
    if (!info || typeof info !== "object") return null;

    let lastHeartbeat: number | undefined;
    if (info.lastHeartbeat) {
      const parsed = new Date(info.lastHeartbeat).getTime();
      if (!Number.isNaN(parsed)) lastHeartbeat = parsed;
    }

    return {
      pid: typeof info.pid === "number" ? info.pid : undefined,
      lastHeartbeat,
    };
  }

  /** Current time in epoch milliseconds, from the injected clock if present. */
  private now(): number {
    return this.deps.now ? this.deps.now() : Date.now();
  }

  /** Forward an event to the injected emitter; does nothing when none is configured. */
  private async emit(event: string, data: Record<string, any>): Promise<void> {
    if (this.deps.emit) {
      await this.deps.emit(event, data);
    }
  }

  /**
   * Send SIGTERM, wait `graceMs`, then send SIGKILL if the process is still alive.
   *
   * Signal errors are ignored: a failed SIGTERM (for example because the
   * process has already exited) ends the attempt early. With a non-positive
   * grace period SIGKILL follows SIGTERM immediately.
   */
  private async killProcessWithGrace(pid: number, graceMs: number): Promise<void> {
    const kill = this.deps.killProcess ?? process.kill;
    try {
      kill(pid, "SIGTERM");
    } catch {
      return;
    }

    if (graceMs <= 0) {
      try {
        kill(pid, "SIGKILL");
      } catch {
        // ignore
      }
      return;
    }

    await new Promise((resolve) => setTimeout(resolve, graceMs));

    try {
      const isAlive = this.deps.isProcessAlive
        ? this.deps.isProcessAlive(pid)
        : (() => {
            try {
              // Signal 0 delivers nothing; it only tests whether the pid exists.
              process.kill(pid, 0);
              return true;
            } catch {
              return false;
            }
          })();

      if (isAlive) {
        kill(pid, "SIGKILL");
      }
    } catch {
      // ignore
    }
  }
}
@@ -764,6 +764,8 @@ export interface WorkflowsConfig {
764
764
  killGraceMs?: number;
765
765
  /** SQLite pragmas for isolated subprocess connections */
766
766
  sqlitePragmas?: SqlitePragmaConfig;
767
+ /** Disable in-process watchdog timers (use external watchdog instead) */
768
+ useWatchdog?: boolean;
767
769
  /** Resume strategy for orphaned workflows (default: "blocking") */
768
770
  resumeStrategy?: WorkflowResumeStrategy;
769
771
  }
@@ -825,6 +827,8 @@ export interface Workflows {
825
827
  updateMetadata(instanceId: string, key: string, value: any): Promise<void>;
826
828
  /** Set plugin metadata for local instantiation in isolated workflows */
827
829
  setPluginMetadata(metadata: PluginMetadata): void;
830
+ /** Get resolved SQLite db path (for watchdog) */
831
+ getDbPath(): string | undefined;
828
832
  }
829
833
 
830
834
  export interface PluginMetadata {
@@ -866,6 +870,7 @@ class WorkflowsImpl implements Workflows {
866
870
  private readyTimeoutMs: number;
867
871
  private killGraceMs: number;
868
872
  private sqlitePragmas?: SqlitePragmaConfig;
873
+ private useWatchdog: boolean;
869
874
  private resumeStrategy!: WorkflowResumeStrategy;
870
875
  private workflowModulePaths = new Map<string, string>();
871
876
  private isolatedProcesses = new Map<string, IsolatedProcessInfo>();
@@ -902,6 +907,7 @@ class WorkflowsImpl implements Workflows {
902
907
  this.readyTimeoutMs = config.readyTimeout ?? 10000;
903
908
  this.killGraceMs = config.killGraceMs ?? 5000;
904
909
  this.sqlitePragmas = config.sqlitePragmas;
910
+ this.useWatchdog = config.useWatchdog ?? false;
905
911
  this.resumeStrategy = config.resumeStrategy ?? "blocking";
906
912
  }
907
913
 
@@ -962,6 +968,10 @@ class WorkflowsImpl implements Workflows {
962
968
  this.pluginCustomErrors = metadata.customErrors;
963
969
  }
964
970
 
971
/**
 * Expose the SQLite database path stored on this instance so the watchdog
 * subprocess can open the same file; undefined when no path is set.
 */
getDbPath(): string | undefined {
  const { dbPath } = this;
  return dbPath;
}
974
+
965
975
  async updateMetadata(instanceId: string, key: string, value: any): Promise<void> {
966
976
  const instance = await this.adapter.getInstance(instanceId);
967
977
  if (!instance) return;
@@ -1517,6 +1527,17 @@ class WorkflowsImpl implements Workflows {
1517
1527
  // Set up heartbeat timeout
1518
1528
  this.resetHeartbeatTimeout(instanceId, proc.pid);
1519
1529
 
1530
+ const instance = await this.adapter.getInstance(instanceId);
1531
+ const metadata = { ...(instance?.metadata ?? {}) } as Record<string, any>;
1532
+ metadata.__watchdog = {
1533
+ ...(metadata.__watchdog ?? {}),
1534
+ pid: proc.pid,
1535
+ socketPath,
1536
+ tcpPort,
1537
+ lastHeartbeat: new Date().toISOString(),
1538
+ };
1539
+ await this.adapter.updateInstance(instanceId, { metadata });
1540
+
1520
1541
  const exitBeforeReady = proc.exited.then((exitCode) => {
1521
1542
  throw new Error(`Subprocess exited before ready (code ${exitCode})`);
1522
1543
  });
@@ -1590,7 +1611,8 @@ class WorkflowsImpl implements Workflows {
1590
1611
 
1591
1612
  case "started":
1592
1613
  case "heartbeat":
1593
- // No-op: heartbeat handled above, started is handled by executeIsolatedWorkflow
1614
+ // Update heartbeat tracking metadata
1615
+ await this.updateWatchdogHeartbeat(instanceId);
1594
1616
  break;
1595
1617
 
1596
1618
  case "step.started": {
@@ -1958,6 +1980,17 @@ class WorkflowsImpl implements Workflows {
1958
1980
  this.rejectIsolatedReady(instanceId, new Error("Isolated workflow cleaned up"));
1959
1981
  }
1960
1982
 
1983
/**
 * Stamp the current time into the instance's `__watchdog.lastHeartbeat`
 * metadata. Other metadata keys and other `__watchdog` fields are preserved.
 * Does nothing when the instance cannot be found.
 */
private async updateWatchdogHeartbeat(instanceId: string): Promise<void> {
  const current = await this.adapter.getInstance(instanceId);
  if (!current) return;

  const existing = { ...(current.metadata ?? {}) } as Record<string, any>;
  const metadata: Record<string, any> = {
    ...existing,
    __watchdog: {
      ...(existing.__watchdog ?? {}),
      lastHeartbeat: new Date().toISOString(),
    },
  };

  await this.adapter.updateInstance(instanceId, { metadata });
}
1993
+
1961
1994
  private async markOrphanedAsFailed(
1962
1995
  instances: WorkflowInstance[],
1963
1996
  reason: string
@@ -1990,6 +2023,7 @@ class WorkflowsImpl implements Workflows {
1990
2023
  * Reset heartbeat timeout for an isolated workflow
1991
2024
  */
1992
2025
  private resetHeartbeatTimeout(instanceId: string, pid: number): void {
2026
+ if (this.useWatchdog) return;
1993
2027
  const info = this.isolatedProcesses.get(instanceId);
1994
2028
  if (!info) return;
1995
2029
 
package/src/server.ts CHANGED
@@ -1,6 +1,7 @@
1
1
  import { z } from "zod";
2
2
  import { mkdir, writeFile } from "node:fs/promises";
3
- import { dirname } from "node:path";
3
+ import { dirname, join } from "node:path";
4
+ import { fileURLToPath } from "node:url";
4
5
  import { PluginManager, type CoreServices, type ConfiguredPlugin } from "./core";
5
6
  import { type IRouter, type RouteDefinition, type ServerContext, type HandlerRegistry } from "./router";
6
7
  import { Handlers } from "./handlers";
@@ -87,6 +88,13 @@ export interface ServerConfig {
87
88
  */
88
89
  workflowsResumeStrategy?: "blocking" | "background" | "skip";
89
90
  processes?: ProcessesConfig;
91
+ /** Watchdog subprocess configuration */
92
+ watchdog?: {
93
+ enabled?: boolean;
94
+ intervalMs?: number;
95
+ services?: ("workflows" | "jobs" | "processes")[];
96
+ killGraceMs?: number;
97
+ };
90
98
  audit?: AuditConfig;
91
99
  websocket?: WebSocketConfig;
92
100
  storage?: StorageConfig;
@@ -221,6 +229,9 @@ export class AppServer {
221
229
  private generateModeSetup = false;
222
230
  private initMode: "adapter" | "server" = "server";
223
231
  private workflowsResumeStrategy?: "blocking" | "background" | "skip";
232
+ private watchdogConfig?: ServerConfig["watchdog"];
233
+ private watchdogStarted = false;
234
+ private options: ServerConfig;
224
235
 
225
236
  // Custom services registry
226
237
  private serviceFactories = new Map<string, ServiceFactory<any>>();
@@ -228,11 +239,13 @@ export class AppServer {
228
239
  private generateModeTimer?: ReturnType<typeof setTimeout>;
229
240
 
230
241
  constructor(options: ServerConfig) {
242
+ this.options = options;
231
243
  // Port priority: explicit config > PORT env var > default 3000
232
244
  const envPort = process.env.PORT ? parseInt(process.env.PORT, 10) : undefined;
233
245
  this.port = options.port ?? envPort ?? 3000;
234
246
  this.maxPortAttempts = options.maxPortAttempts ?? 5;
235
247
  this.workflowsResumeStrategy = options.workflowsResumeStrategy ?? options.workflows?.resumeStrategy;
248
+ this.watchdogConfig = options.watchdog;
236
249
 
237
250
  // Determine if we should use legacy databases
238
251
  const useLegacy = options.useLegacyCoreDatabases ?? false;
@@ -286,6 +299,10 @@ export class AppServer {
286
299
  events,
287
300
  logger,
288
301
  adapter: jobAdapter,
302
+ external: {
303
+ ...options.jobs?.external,
304
+ useWatchdog: options.watchdog?.enabled ? true : options.jobs?.external?.useWatchdog,
305
+ },
289
306
  // Disable built-in persistence when using Kysely adapter
290
307
  persist: useLegacy ? options.jobs?.persist : false,
291
308
  });
@@ -297,12 +314,14 @@ export class AppServer {
297
314
  jobs,
298
315
  sse,
299
316
  adapter: workflowAdapter,
317
+ useWatchdog: options.watchdog?.enabled ? true : options.workflows?.useWatchdog,
300
318
  });
301
319
 
302
320
  // Processes - still uses its own adapter pattern but can use Kysely
303
321
  const processes = createProcesses({
304
322
  ...options.processes,
305
323
  events,
324
+ useWatchdog: options.watchdog?.enabled ? true : options.processes?.useWatchdog,
306
325
  });
307
326
 
308
327
  // New services
@@ -1055,6 +1074,7 @@ ${factoryFunction}
1055
1074
  await this.coreServices.workflows.resume();
1056
1075
  }
1057
1076
  this.coreServices.processes.start();
1077
+ await this.startWatchdog();
1058
1078
  logger.info("Background services started (cron, jobs, workflows, processes)");
1059
1079
 
1060
1080
  for (const router of this.routers) {
@@ -1073,6 +1093,69 @@ ${factoryFunction}
1073
1093
  await this.runReadyHandlers();
1074
1094
  }
1075
1095
 
1096
/**
 * Spawn the watchdog subprocess (once) and relay the events it reports.
 *
 * Does nothing unless `watchdog.enabled` is set, or if the watchdog was already
 * started. The subprocess receives its whole configuration as JSON in the
 * DONKEYLABS_WATCHDOG_CONFIG environment variable. Events it reports arrive as
 * `process.event` and are re-emitted on the server's event bus;
 * `workflow.watchdog*` events are additionally broadcast over SSE on the
 * `workflow:<instanceId>` channel.
 */
private async startWatchdog(): Promise<void> {
  const watchdog = this.watchdogConfig;
  if (!watchdog?.enabled) return;
  if (this.watchdogStarted) return;

  const PROCESS_NAME = "__watchdog";
  const DEFAULT_GRACE_MS = 5000;

  const hereDir = dirname(fileURLToPath(import.meta.url));
  const executorPath = join(hereDir, "core", "watchdog-executor.ts");
  const services = watchdog.services ?? ["workflows", "jobs", "processes"];

  // Resolve the database file for each supervised service.
  const workflowsDbPath = this.coreServices.workflows.getDbPath?.();
  const jobsDbPath = (this.options.jobs?.dbPath ?? workflowsDbPath ?? ".donkeylabs/jobs.db") as string;
  const processesDbPath = (this.options.processes?.adapter?.path ?? ".donkeylabs/processes.db") as string;

  const externalJobs = this.options.jobs?.external;
  const config = {
    intervalMs: watchdog.intervalMs ?? 5000,
    services,
    killGraceMs: watchdog.killGraceMs ?? DEFAULT_GRACE_MS,
    workflowHeartbeatTimeoutMs: this.options.workflows?.heartbeatTimeout ?? 60000,
    jobDefaults: {
      heartbeatTimeoutMs: externalJobs?.defaultHeartbeatTimeout ?? 30000,
      killGraceMs: externalJobs?.killGraceMs ?? watchdog.killGraceMs ?? DEFAULT_GRACE_MS,
    },
    jobConfigs: this.coreServices.jobs.getExternalJobConfigs(),
    workflows: workflowsDbPath ? { dbPath: workflowsDbPath } : undefined,
    jobs: jobsDbPath ? { dbPath: jobsDbPath } : undefined,
    processes: processesDbPath ? { dbPath: processesDbPath } : undefined,
    sqlitePragmas: this.options.workflows?.sqlitePragmas,
  };

  try {
    this.coreServices.processes.register({
      name: PROCESS_NAME,
      config: {
        command: "bun",
        args: ["run", executorPath],
        env: {
          DONKEYLABS_WATCHDOG_CONFIG: JSON.stringify(config),
        },
        heartbeat: { intervalMs: 5000, timeoutMs: 30000 },
      },
    });
  } catch {
    // Already registered
  }

  await this.coreServices.processes.spawn(PROCESS_NAME, {
    metadata: { role: "watchdog" },
  });

  this.coreServices.events.on("process.event", async (payload: any) => {
    // Only relay events reported by the watchdog subprocess itself.
    if (payload?.name !== PROCESS_NAME || !payload.event) return;

    await this.coreServices.events.emit(payload.event, payload.data ?? {});

    if (!payload.event.startsWith("workflow.watchdog")) return;

    const instanceId = payload.data?.instanceId;
    if (instanceId && this.coreServices.sse) {
      this.coreServices.sse.broadcast(`workflow:${instanceId}`, payload.event, payload.data ?? {});
    }
  });

  this.watchdogStarted = true;
}
1158
+
1076
1159
  /**
1077
1160
  * Handle a single API request. Used by adapters.
1078
1161
  * Returns null if the route is not found.