@donkeylabs/server 2.0.28 → 2.0.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,80 @@
1
+ import { Kysely } from "kysely";
2
+ import { BunSqliteDialect } from "kysely-bun-sqlite";
3
+ import Database from "bun:sqlite";
4
+ import { ProcessClient } from "./process-client";
5
+ import { KyselyWorkflowAdapter } from "./workflow-adapter-kysely";
6
+ import { KyselyJobAdapter } from "./job-adapter-kysely";
7
+ import { SqliteProcessAdapter } from "./process-adapter-sqlite";
8
+ import { WatchdogRunner, type WatchdogRunnerConfig } from "./watchdog-runner";
9
+
10
/**
 * Shape of the JSON document read from `DONKEYLABS_WATCHDOG_CONFIG`.
 *
 * Extends the runner configuration with the settings the watchdog entry
 * script itself needs: how often to run and which SQLite files to open.
 */
interface WatchdogConfig extends WatchdogRunnerConfig {
  /** Delay between watchdog passes in milliseconds; the script enforces a 1000 ms floor. */
  intervalMs: number;

  /** When `dbPath` is set, a Kysely workflow adapter is opened on that file. */
  workflows?: { dbPath?: string };

  /** When `dbPath` is set, a Kysely job adapter is opened on that file. */
  jobs?: { dbPath?: string };

  /** When `dbPath` is set, a SQLite process adapter is opened on that file. */
  processes?: { dbPath?: string };

  /** Pragmas applied to the workflow and job database connections. */
  sqlitePragmas?: {
    /** Value for `PRAGMA busy_timeout` (the script defaults to 5000 when omitted). */
    busyTimeout?: number;
    /** Value for `PRAGMA synchronous`. */
    synchronous?: "OFF" | "NORMAL" | "FULL" | "EXTRA";
    /** Value for `PRAGMA journal_mode`. */
    journalMode?: "DELETE" | "TRUNCATE" | "PERSIST" | "MEMORY" | "WAL" | "OFF";
  };
}
21
+
22
// Watchdog subprocess entry point.
//
// Reads its configuration from the DONKEYLABS_WATCHDOG_CONFIG environment
// variable, opens whichever adapters have a database path configured, and
// then runs the watchdog on a fixed interval until SIGTERM is received.
const raw = process.env.DONKEYLABS_WATCHDOG_CONFIG;
if (!raw) {
  throw new Error("Missing DONKEYLABS_WATCHDOG_CONFIG");
}

const config: WatchdogConfig = JSON.parse(raw);
const client = await ProcessClient.connect();

// Each adapter is optional: a service without a dbPath is simply not opened
// (the runner skips services whose adapter is undefined).
const workflowAdapter = config.workflows?.dbPath
  ? new KyselyWorkflowAdapter(createDb(config.workflows.dbPath, config.sqlitePragmas), {
      cleanupDays: 0,
    })
  : undefined;
const jobAdapter = config.jobs?.dbPath
  ? new KyselyJobAdapter(createDb(config.jobs.dbPath, config.sqlitePragmas), {
      cleanupDays: 0,
    })
  : undefined;
const processAdapter = config.processes?.dbPath
  ? new SqliteProcessAdapter({ path: config.processes.dbPath, cleanupDays: 0 })
  : undefined;

const runner = new WatchdogRunner(config, {
  workflowsAdapter: workflowAdapter,
  jobsAdapter: jobAdapter,
  processesAdapter: processAdapter,
  // Forward watchdog events to the parent through the process client.
  emit: async (event, data) => {
    await client.emit(event, data);
  },
});

// `Math.max(1000, undefined)` is NaN, which would make the timer fire almost
// continuously. Coerce first so a missing or non-numeric intervalMs falls
// back to the 1000 ms floor instead.
const requestedInterval = Number(config.intervalMs);
const interval = Number.isFinite(requestedInterval) ? Math.max(1000, requestedInterval) : 1000;

// A single pass can outlast the interval (it waits out kill grace periods),
// so skip ticks while one is in flight. Overlapping passes would signal the
// same pid and emit the same events twice.
let passInFlight = false;
const timer = setInterval(() => {
  if (passInFlight) return;
  passInFlight = true;
  runner
    .runOnce()
    .catch((err) => {
      // Keep the watchdog alive, but leave a trace of what went wrong.
      console.error("[Watchdog] run failed:", err);
    })
    .finally(() => {
      passInFlight = false;
    });
}, interval);

// Stop scheduling new passes and release the client connection on shutdown.
process.on("SIGTERM", () => {
  clearInterval(timer);
  client.disconnect();
});
62
+
63
/**
 * Opens a SQLite database with Bun's driver, applies the requested pragmas,
 * and wraps the connection in a Kysely instance.
 *
 * Pragma values come from an environment-supplied JSON document and are
 * interpolated into SQL, so they are validated against fixed allow-lists
 * before the database is opened.
 *
 * @param dbPath  Path of the SQLite file to open.
 * @param pragmas Optional pragma overrides; `busyTimeout` defaults to 5000.
 * @returns A Kysely instance bound to the opened database.
 * @throws Error when a pragma value is not a recognised keyword, or when
 *         `busyTimeout` is not a finite, non-negative number.
 */
function createDb(
  dbPath: string,
  pragmas?: { busyTimeout?: number; synchronous?: string; journalMode?: string }
): Kysely<any> {
  // NOTE: these tables live inside the function on purpose. The function is
  // hoisted and called from top-level code that runs before any module-level
  // `const` declared further down the file would be initialised.
  const journalModes = ["DELETE", "TRUNCATE", "PERSIST", "MEMORY", "WAL", "OFF"];
  const synchronousModes = ["OFF", "NORMAL", "FULL", "EXTRA"];

  // Returns the upper-cased keyword, or undefined when the pragma is unset.
  const toKeyword = (
    label: string,
    value: string | undefined,
    allowed: string[]
  ): string | undefined => {
    if (!value) return undefined;
    const keyword = String(value).toUpperCase();
    if (!allowed.includes(keyword)) {
      throw new Error(`Invalid sqlitePragmas.${label}: ${JSON.stringify(value)}`);
    }
    return keyword;
  };

  const requestedTimeout = pragmas?.busyTimeout ?? 5000;
  if (
    typeof requestedTimeout !== "number" ||
    !Number.isFinite(requestedTimeout) ||
    requestedTimeout < 0
  ) {
    throw new Error(`Invalid sqlitePragmas.busyTimeout: ${JSON.stringify(requestedTimeout)}`);
  }
  const busyTimeout = Math.trunc(requestedTimeout);
  const journalMode = toKeyword("journalMode", pragmas?.journalMode, journalModes);
  const synchronous = toKeyword("synchronous", pragmas?.synchronous, synchronousModes);

  // Validation happens before this point so a bad config never opens a handle.
  const sqlite = new Database(dbPath);
  try {
    sqlite.run(`PRAGMA busy_timeout = ${busyTimeout}`);
    if (journalMode) {
      sqlite.run(`PRAGMA journal_mode = ${journalMode}`);
    }
    if (synchronous) {
      sqlite.run(`PRAGMA synchronous = ${synchronous}`);
    }
  } catch (err) {
    // Do not leak the handle when a pragma is rejected.
    sqlite.close();
    throw err;
  }

  return new Kysely<any>({
    dialect: new BunSqliteDialect({ database: sqlite }),
  });
}
@@ -0,0 +1,276 @@
1
+ import type { WorkflowAdapter, WorkflowInstance } from "./workflows";
2
+ import type { JobAdapter, Job } from "./jobs";
3
+ import type { ProcessAdapter } from "./process-adapter-sqlite";
4
+
5
/** Names of the subsystems the watchdog runner knows how to check. */
export type WatchdogService =
  | "workflows"
  | "jobs"
  | "processes";
6
+
7
/** Static settings that control what a {@link WatchdogRunner} checks and how it kills. */
export interface WatchdogRunnerConfig {
  /** Services to inspect on each pass; a service is also skipped when its adapter is missing. */
  services: WatchdogService[];

  /** Delay in ms between SIGTERM and SIGKILL for workflows and processes. */
  killGraceMs: number;

  /** Maximum age in ms of a workflow's last heartbeat before it is considered stale. */
  workflowHeartbeatTimeoutMs: number;

  /** Fallback values for jobs that have no entry (or a partial entry) in `jobConfigs`. */
  jobDefaults: {
    /** Maximum heartbeat age in ms. */
    heartbeatTimeoutMs: number;
    /** Delay in ms between SIGTERM and SIGKILL. */
    killGraceMs: number;
  };

  /**
   * Per-job overrides, looked up by job name.
   * `heartbeatTimeout` and `timeout` are compared against elapsed milliseconds;
   * `timeout` is measured from the job's start time.
   */
  jobConfigs: Record<string, { heartbeatTimeout?: number; timeout?: number; killGraceMs?: number }>;
}
17
+
18
/**
 * Collaborators injected into a {@link WatchdogRunner}.
 * Every member is optional; the runner falls back to built-in behaviour
 * (or skips the related work) when one is absent.
 */
export interface WatchdogRunnerDeps {
  /** Storage for workflow instances; required for the "workflows" check to run. */
  workflowsAdapter?: WorkflowAdapter;

  /** Storage for jobs; required for the "jobs" check to run. */
  jobsAdapter?: JobAdapter;

  /** Storage for managed processes; required for the "processes" check to run. */
  processesAdapter?: ProcessAdapter;

  /** Clock override returning epoch milliseconds; defaults to `Date.now`. */
  now?: () => number;

  /** Signal sender override; defaults to `process.kill`. */
  killProcess?: (pid: number, signal: NodeJS.Signals) => void;

  /** Liveness probe override; defaults to a `process.kill(pid, 0)` check. */
  isProcessAlive?: (pid: number) => boolean;

  /** Event sink; when omitted, watchdog events are dropped. */
  emit?: (event: string, data: Record<string, any>) => Promise<void>;
}
27
+
28
/**
 * Finds workflows, jobs and processes whose owning OS process has stopped
 * sending heartbeats (or has exceeded its runtime limit), terminates that
 * process, records the failure through the matching adapter, and emits
 * `*.watchdog.stale` / `*.watchdog.killed` events around the kill.
 */
export class WatchdogRunner {
  private config: WatchdogRunnerConfig;
  private deps: WatchdogRunnerDeps;

  /**
   * @param config Timeouts, grace periods and the list of services to check.
   * @param deps   Adapters plus optional clock / kill / emit overrides.
   */
  constructor(config: WatchdogRunnerConfig, deps: WatchdogRunnerDeps) {
    this.config = config;
    this.deps = deps;
  }

  /**
   * Performs one pass over every enabled service, in the fixed order
   * workflows, jobs, processes. A service is checked only when it is listed
   * in `config.services` and its adapter was supplied.
   */
  async runOnce(): Promise<void> {
    const { services } = this.config;
    if (services.includes("workflows") && this.deps.workflowsAdapter) {
      await this.checkWorkflows();
    }
    if (services.includes("jobs") && this.deps.jobsAdapter) {
      await this.checkJobs();
    }
    if (services.includes("processes") && this.deps.processesAdapter) {
      await this.checkProcesses();
    }
  }

  /** Kills and fails running workflow instances whose heartbeat is too old. */
  private async checkWorkflows(): Promise<void> {
    const adapter = this.deps.workflowsAdapter!;
    const now = this.now();
    const instances = await adapter.getRunningInstances();
    const timeoutMs = this.config.workflowHeartbeatTimeoutMs;

    for (const instance of instances) {
      const info = this.getWatchdogMetadata(instance);
      const pid = info?.pid;
      if (!WatchdogRunner.isSignalablePid(pid)) continue;

      // Prefer the recorded heartbeat; otherwise measure from the start time.
      const last = info?.lastHeartbeat ?? WatchdogRunner.lastSeenMs(undefined, instance.startedAt);
      if (now - last <= timeoutMs) continue;

      await this.emit("workflow.watchdog.stale", {
        instanceId: instance.id,
        pid,
        timeoutMs,
      });

      await this.killProcessWithGrace(pid, this.config.killGraceMs);

      await adapter.updateInstance(instance.id, {
        status: "failed",
        error: "Watchdog killed unresponsive workflow",
        completedAt: new Date(),
      });

      await this.emit("workflow.watchdog.killed", {
        instanceId: instance.id,
        pid,
        reason: "heartbeat",
      });
    }
  }

  /**
   * Kills and fails externally running jobs that either stopped sending
   * heartbeats or exceeded their configured total `timeout`.
   */
  private async checkJobs(): Promise<void> {
    const adapter = this.deps.jobsAdapter!;
    const now = this.now();
    const jobs = await adapter.getRunningExternal();

    for (const job of jobs) {
      const pid = job.pid;
      if (!WatchdogRunner.isSignalablePid(pid)) continue;

      const overrides = this.config.jobConfigs[job.name] ?? {};
      const heartbeatTimeout =
        overrides.heartbeatTimeout ?? this.config.jobDefaults.heartbeatTimeoutMs;
      const killGraceMs = overrides.killGraceMs ?? this.config.jobDefaults.killGraceMs;
      const lastHeartbeat = WatchdogRunner.lastSeenMs(job.lastHeartbeat, job.startedAt);

      // Shared stale -> kill -> persist -> killed sequence for both triggers.
      // The heartbeat "stale" event historically carries no `reason` field.
      const failJob = async (
        reason: "heartbeat" | "timeout",
        timeoutMs: number,
        error: string
      ): Promise<void> => {
        const ident = { jobId: job.id, name: job.name, pid };
        await this.emit(
          "job.watchdog.stale",
          reason === "heartbeat" ? { ...ident, timeoutMs } : { ...ident, timeoutMs, reason }
        );
        await this.killProcessWithGrace(pid, killGraceMs);
        await adapter.update(job.id, {
          status: "failed",
          error,
          completedAt: new Date(),
          processState: "orphaned",
        });
        await this.emit("job.watchdog.killed", { ...ident, reason });
      };

      if (now - lastHeartbeat > heartbeatTimeout) {
        await failJob("heartbeat", heartbeatTimeout, "Watchdog killed unresponsive job");
        continue;
      }

      const timeout = overrides.timeout;
      if (timeout && job.startedAt && now - job.startedAt.getTime() > timeout) {
        await failJob("timeout", timeout, "Watchdog killed job after timeout");
      }
    }
  }

  /**
   * Kills and marks as crashed managed processes that either stopped sending
   * heartbeats (when a heartbeat timeout is configured for the process) or
   * exceeded `limits.maxRuntimeMs`.
   */
  private async checkProcesses(): Promise<void> {
    const adapter = this.deps.processesAdapter!;
    const now = this.now();
    const running = await adapter.getRunning();

    for (const proc of running) {
      const pid = proc.pid;
      if (!WatchdogRunner.isSignalablePid(pid)) continue;

      const heartbeatTimeout = proc.config.heartbeat?.timeoutMs;
      const lastHeartbeat = WatchdogRunner.lastSeenMs(proc.lastHeartbeat, proc.startedAt);

      // Shared stale -> kill -> persist -> killed sequence for both triggers.
      // The heartbeat "stale" event historically carries no `reason` field.
      const crashProcess = async (
        reason: "heartbeat" | "maxRuntimeMs",
        timeoutMs: number,
        error: string
      ): Promise<void> => {
        const ident = { processId: proc.id, name: proc.name, pid };
        await this.emit(
          "process.watchdog.stale",
          reason === "heartbeat" ? { ...ident, timeoutMs } : { ...ident, timeoutMs, reason }
        );
        await this.killProcessWithGrace(pid, this.config.killGraceMs);
        await adapter.update(proc.id, {
          status: "crashed",
          error,
          stoppedAt: new Date(),
        });
        await this.emit("process.watchdog.killed", { ...ident, reason });
      };

      if (heartbeatTimeout && now - lastHeartbeat > heartbeatTimeout) {
        await crashProcess("heartbeat", heartbeatTimeout, "Watchdog killed unresponsive process");
        continue;
      }

      const maxRuntimeMs = proc.config.limits?.maxRuntimeMs;
      if (maxRuntimeMs && proc.startedAt && now - proc.startedAt.getTime() > maxRuntimeMs) {
        await crashProcess("maxRuntimeMs", maxRuntimeMs, "Watchdog killed process after max runtime");
      }
    }
  }

  /**
   * Extracts the `__watchdog` record from a workflow instance's metadata.
   *
   * @returns `null` when the metadata or the record is absent or not an
   *          object; otherwise the numeric pid (if any) and the last
   *          heartbeat as epoch milliseconds (if it parses to a valid date).
   */
  private getWatchdogMetadata(instance: WorkflowInstance): { pid?: number; lastHeartbeat?: number } | null {
    const meta = instance.metadata as any;
    if (!meta || typeof meta !== "object") return null;
    const info = meta.__watchdog;
    if (!info || typeof info !== "object") return null;

    // An unparseable timestamp yields NaN; treating it as "unknown" lets the
    // caller fall back to the start time instead of killing a healthy process.
    const heartbeatMs = info.lastHeartbeat ? new Date(info.lastHeartbeat).getTime() : NaN;
    return {
      pid: typeof info.pid === "number" ? info.pid : undefined,
      lastHeartbeat: Number.isFinite(heartbeatMs) ? heartbeatMs : undefined,
    };
  }

  /** Current time in epoch milliseconds, from the injected clock when present. */
  private now(): number {
    return this.deps.now ? this.deps.now() : Date.now();
  }

  /** Forwards an event to the injected sink; a no-op when none was supplied. */
  private async emit(event: string, data: Record<string, any>): Promise<void> {
    if (this.deps.emit) {
      await this.deps.emit(event, data);
    }
  }

  /**
   * Sends SIGTERM, waits `graceMs`, then sends SIGKILL if the process is
   * still alive. With a non-positive grace period SIGKILL follows at once.
   * Returns without doing anything further when SIGTERM cannot be delivered.
   * Signal delivery errors are ignored throughout.
   */
  private async killProcessWithGrace(pid: number, graceMs: number): Promise<void> {
    const kill = this.deps.killProcess ?? process.kill;
    try {
      kill(pid, "SIGTERM");
    } catch {
      return;
    }

    if (graceMs <= 0) {
      try {
        kill(pid, "SIGKILL");
      } catch {
        // ignore
      }
      return;
    }

    await new Promise((resolve) => setTimeout(resolve, graceMs));

    try {
      const isAlive = this.deps.isProcessAlive
        ? this.deps.isProcessAlive(pid)
        : (() => {
            try {
              // Signal 0 only tests for existence; it throws when the pid is gone.
              process.kill(pid, 0);
              return true;
            } catch {
              return false;
            }
          })();

      if (isAlive) {
        kill(pid, "SIGKILL");
      }
    } catch {
      // ignore
    }
  }

  /**
   * True only for positive integer pids. Zero and negative values address
   * process groups when passed to kill, so they must never be signalled.
   */
  private static isSignalablePid(pid: unknown): pid is number {
    return typeof pid === "number" && Number.isInteger(pid) && pid > 0;
  }

  /**
   * Picks the first usable timestamp (heartbeat, then start time) as epoch
   * milliseconds, ignoring missing or invalid dates. Returns 0 when neither
   * is usable, matching the original fallback.
   */
  private static lastSeenMs(heartbeat?: Date | null, startedAt?: Date | null): number {
    for (const stamp of [heartbeat, startedAt]) {
      const ms = stamp?.getTime();
      if (ms !== undefined && Number.isFinite(ms)) return ms;
    }
    return 0;
  }
}
@@ -25,6 +25,11 @@ interface ExecutorConfig {
25
25
  pluginModulePaths: Record<string, string>;
26
26
  pluginConfigs: Record<string, any>;
27
27
  coreConfig?: Record<string, any>;
28
+ sqlitePragmas?: {
29
+ busyTimeout?: number;
30
+ synchronous?: "OFF" | "NORMAL" | "FULL" | "EXTRA";
31
+ journalMode?: "DELETE" | "TRUNCATE" | "PERSIST" | "MEMORY" | "WAL" | "OFF";
32
+ };
28
33
  }
29
34
 
30
35
  // ============================================
@@ -47,6 +52,7 @@ async function main(): Promise<void> {
47
52
  pluginModulePaths,
48
53
  pluginConfigs,
49
54
  coreConfig,
55
+ sqlitePragmas,
50
56
  } = config;
51
57
 
52
58
  const socket = await connectToSocket(socketPath, tcpPort);
@@ -71,6 +77,7 @@ async function main(): Promise<void> {
71
77
  const bootstrap = await bootstrapSubprocess({
72
78
  dbPath,
73
79
  coreConfig,
80
+ sqlitePragmas,
74
81
  pluginMetadata: {
75
82
  names: pluginNames,
76
83
  modulePaths: pluginModulePaths,
@@ -248,7 +248,24 @@ export class WorkflowSocketServerImpl implements WorkflowSocketServer {
248
248
 
249
249
  let buffer = "";
250
250
 
251
- socket.on("data", async (data) => {
251
+ const queue: WorkflowMessage[] = [];
252
+ let processing = false;
253
+
254
// Drains the per-socket message queue one message at a time so handlers for
// the same connection never run concurrently. Re-entrant calls return
// immediately while a drain is already in progress.
const processQueue = async () => {
  if (processing) return;
  processing = true;
  try {
    while (queue.length > 0) {
      const message = queue.shift()!;
      try {
        await this.handleMessage(instanceId, message);
      } catch (err) {
        this.onError?.(err instanceof Error ? err : new Error(String(err)), instanceId);
      }
    }
  } finally {
    // Always release the flag: if the error callback itself throws, leaving
    // `processing` set would block every later message on this socket.
    processing = false;
  }
};
267
+
268
+ socket.on("data", (data) => {
252
269
  buffer += data.toString();
253
270
 
254
271
  // Process complete messages (newline-delimited JSON)
@@ -260,11 +277,13 @@ export class WorkflowSocketServerImpl implements WorkflowSocketServer {
260
277
 
261
278
  try {
262
279
  const message = JSON.parse(line) as WorkflowMessage;
263
- await this.handleMessage(instanceId, message);
280
+ queue.push(message);
264
281
  } catch (err) {
265
282
  this.onError?.(new Error(`Invalid message: ${line}`), instanceId);
266
283
  }
267
284
  }
285
+
286
+ processQueue().catch(() => undefined);
268
287
  });
269
288
 
270
289
  socket.on("error", (err) => {
@@ -760,12 +760,24 @@ export interface WorkflowsConfig {
760
760
  heartbeatTimeout?: number;
761
761
  /** Timeout waiting for isolated subprocess readiness (ms, default: 10000) */
762
762
  readyTimeout?: number;
763
+ /** Grace period before SIGKILL when terminating isolated subprocesses (ms, default: 5000) */
764
+ killGraceMs?: number;
765
+ /** SQLite pragmas for isolated subprocess connections */
766
+ sqlitePragmas?: SqlitePragmaConfig;
767
+ /** Disable in-process watchdog timers (use external watchdog instead) */
768
+ useWatchdog?: boolean;
763
769
  /** Resume strategy for orphaned workflows (default: "blocking") */
764
770
  resumeStrategy?: WorkflowResumeStrategy;
765
771
  }
766
772
 
767
773
  export type WorkflowResumeStrategy = "blocking" | "background" | "skip";
768
774
 
775
/** Optional SQLite pragma overrides; every field may be omitted. */
export interface SqlitePragmaConfig {
  /** Busy-timeout setting, as a number. */
  busyTimeout?: number;

  /** Synchronous mode keyword. */
  synchronous?: "OFF" | "NORMAL" | "FULL" | "EXTRA";

  /** Journal mode keyword. */
  journalMode?: "DELETE" | "TRUNCATE" | "PERSIST" | "MEMORY" | "WAL" | "OFF";
}
780
+
769
781
  /** Options for registering a workflow */
770
782
  export interface WorkflowRegisterOptions {
771
783
  /**
@@ -815,6 +827,8 @@ export interface Workflows {
815
827
  updateMetadata(instanceId: string, key: string, value: any): Promise<void>;
816
828
  /** Set plugin metadata for local instantiation in isolated workflows */
817
829
  setPluginMetadata(metadata: PluginMetadata): void;
830
+ /** Get resolved SQLite db path (for watchdog) */
831
+ getDbPath(): string | undefined;
818
832
  }
819
833
 
820
834
  export interface PluginMetadata {
@@ -854,6 +868,9 @@ class WorkflowsImpl implements Workflows {
854
868
  private dbPath?: string;
855
869
  private heartbeatTimeoutMs: number;
856
870
  private readyTimeoutMs: number;
871
+ private killGraceMs: number;
872
+ private sqlitePragmas?: SqlitePragmaConfig;
873
+ private useWatchdog: boolean;
857
874
  private resumeStrategy!: WorkflowResumeStrategy;
858
875
  private workflowModulePaths = new Map<string, string>();
859
876
  private isolatedProcesses = new Map<string, IsolatedProcessInfo>();
@@ -888,6 +905,9 @@ class WorkflowsImpl implements Workflows {
888
905
  this.dbPath = config.dbPath;
889
906
  this.heartbeatTimeoutMs = config.heartbeatTimeout ?? 60000;
890
907
  this.readyTimeoutMs = config.readyTimeout ?? 10000;
908
+ this.killGraceMs = config.killGraceMs ?? 5000;
909
+ this.sqlitePragmas = config.sqlitePragmas;
910
+ this.useWatchdog = config.useWatchdog ?? false;
891
911
  this.resumeStrategy = config.resumeStrategy ?? "blocking";
892
912
  }
893
913
 
@@ -948,6 +968,10 @@ class WorkflowsImpl implements Workflows {
948
968
  this.pluginCustomErrors = metadata.customErrors;
949
969
  }
950
970
 
971
/** Returns the database path stored on this instance, or `undefined` when none was configured. */
getDbPath(): string | undefined {
  const { dbPath } = this;
  return dbPath;
}
974
+
951
975
  async updateMetadata(instanceId: string, key: string, value: any): Promise<void> {
952
976
  const instance = await this.adapter.getInstance(instanceId);
953
977
  if (!instance) return;
@@ -1049,11 +1073,7 @@ class WorkflowsImpl implements Workflows {
1049
1073
  // Kill isolated process if running
1050
1074
  const isolatedInfo = this.isolatedProcesses.get(instanceId);
1051
1075
  if (isolatedInfo) {
1052
- try {
1053
- process.kill(isolatedInfo.pid, "SIGTERM");
1054
- } catch {
1055
- // Process might already be dead
1056
- }
1076
+ await killProcessWithGrace(isolatedInfo.pid, this.killGraceMs);
1057
1077
  if (isolatedInfo.timeout) clearTimeout(isolatedInfo.timeout);
1058
1078
  if (isolatedInfo.heartbeatTimeout) clearTimeout(isolatedInfo.heartbeatTimeout);
1059
1079
  this.isolatedProcesses.delete(instanceId);
@@ -1470,6 +1490,7 @@ class WorkflowsImpl implements Workflows {
1470
1490
  pluginModulePaths: this.pluginModulePaths,
1471
1491
  pluginConfigs,
1472
1492
  coreConfig,
1493
+ sqlitePragmas: this.sqlitePragmas,
1473
1494
  };
1474
1495
 
1475
1496
  // Spawn the subprocess
@@ -1506,6 +1527,17 @@ class WorkflowsImpl implements Workflows {
1506
1527
  // Set up heartbeat timeout
1507
1528
  this.resetHeartbeatTimeout(instanceId, proc.pid);
1508
1529
 
1530
+ const instance = await this.adapter.getInstance(instanceId);
1531
+ const metadata = { ...(instance?.metadata ?? {}) } as Record<string, any>;
1532
+ metadata.__watchdog = {
1533
+ ...(metadata.__watchdog ?? {}),
1534
+ pid: proc.pid,
1535
+ socketPath,
1536
+ tcpPort,
1537
+ lastHeartbeat: new Date().toISOString(),
1538
+ };
1539
+ await this.adapter.updateInstance(instanceId, { metadata });
1540
+
1509
1541
  const exitBeforeReady = proc.exited.then((exitCode) => {
1510
1542
  throw new Error(`Subprocess exited before ready (code ${exitCode})`);
1511
1543
  });
@@ -1579,7 +1611,8 @@ class WorkflowsImpl implements Workflows {
1579
1611
 
1580
1612
  case "started":
1581
1613
  case "heartbeat":
1582
- // No-op: heartbeat handled above, started is handled by executeIsolatedWorkflow
1614
+ // Update heartbeat tracking metadata
1615
+ await this.updateWatchdogHeartbeat(instanceId);
1583
1616
  break;
1584
1617
 
1585
1618
  case "step.started": {
@@ -1947,6 +1980,17 @@ class WorkflowsImpl implements Workflows {
1947
1980
  this.rejectIsolatedReady(instanceId, new Error("Isolated workflow cleaned up"));
1948
1981
  }
1949
1982
 
1983
/**
 * Stamps the current time into `metadata.__watchdog.lastHeartbeat` for the
 * given workflow instance, keeping every other metadata key and every other
 * `__watchdog` field. Does nothing when the instance cannot be found.
 */
private async updateWatchdogHeartbeat(instanceId: string): Promise<void> {
  const current = await this.adapter.getInstance(instanceId);
  if (!current) return;

  const previous = (current.metadata ?? {}) as Record<string, any>;
  const metadata: Record<string, any> = {
    ...previous,
    __watchdog: {
      ...(previous.__watchdog ?? {}),
      lastHeartbeat: new Date().toISOString(),
    },
  };

  await this.adapter.updateInstance(instanceId, { metadata });
}
1993
+
1950
1994
  private async markOrphanedAsFailed(
1951
1995
  instances: WorkflowInstance[],
1952
1996
  reason: string
@@ -1979,6 +2023,7 @@ class WorkflowsImpl implements Workflows {
1979
2023
  * Reset heartbeat timeout for an isolated workflow
1980
2024
  */
1981
2025
  private resetHeartbeatTimeout(instanceId: string, pid: number): void {
2026
+ if (this.useWatchdog) return;
1982
2027
  const info = this.isolatedProcesses.get(instanceId);
1983
2028
  if (!info) return;
1984
2029
 
@@ -1995,6 +2040,11 @@ class WorkflowsImpl implements Workflows {
1995
2040
  }
1996
2041
 
1997
2042
  console.error(`[Workflows] No heartbeat from isolated workflow ${instanceId} for ${this.heartbeatTimeoutMs}ms`);
2043
+ await this.emitEvent("workflow.watchdog.stale", {
2044
+ instanceId,
2045
+ reason: "heartbeat",
2046
+ timeoutMs: this.heartbeatTimeoutMs,
2047
+ });
1998
2048
  await this.handleIsolatedTimeout(instanceId, pid);
1999
2049
  }, this.heartbeatTimeoutMs);
2000
2050
  }
@@ -2006,12 +2056,12 @@ class WorkflowsImpl implements Workflows {
2006
2056
  const info = this.isolatedProcesses.get(instanceId);
2007
2057
  if (!info) return;
2008
2058
 
2009
- // Kill the process
2010
- try {
2011
- process.kill(pid, "SIGKILL");
2012
- } catch {
2013
- // Process might already be dead
2014
- }
2059
+ await killProcessWithGrace(pid, this.killGraceMs);
2060
+ await this.emitEvent("workflow.watchdog.killed", {
2061
+ instanceId,
2062
+ reason: "timeout",
2063
+ timeoutMs: this.heartbeatTimeoutMs,
2064
+ });
2015
2065
 
2016
2066
  // Clean up
2017
2067
  if (info.timeout) clearTimeout(info.timeout);
@@ -2148,3 +2198,29 @@ function isPlainObject(value: Record<string, any>): boolean {
2148
2198
  export function createWorkflows(config?: WorkflowsConfig): Workflows {
2149
2199
  return new WorkflowsImpl(config);
2150
2200
  }
2201
+
2202
/**
 * Terminates a process politely, then forcefully.
 *
 * Sends SIGTERM first. With a non-positive grace period SIGKILL follows
 * immediately; otherwise the function waits `graceMs` and sends SIGKILL only
 * if the pid still exists. Every signalling error is swallowed, and nothing
 * further is attempted when the initial SIGTERM cannot be delivered.
 *
 * @param pid     Process id to terminate.
 * @param graceMs Milliseconds to wait between SIGTERM and SIGKILL.
 */
async function killProcessWithGrace(pid: number, graceMs: number): Promise<void> {
  // Reports whether the signal was delivered instead of throwing.
  const trySignal = (signal: string | number): boolean => {
    try {
      process.kill(pid, signal);
      return true;
    } catch {
      return false;
    }
  };

  if (!trySignal("SIGTERM")) return;

  if (graceMs <= 0) {
    trySignal("SIGKILL");
    return;
  }

  await new Promise<void>((resolve) => {
    setTimeout(resolve, graceMs);
  });

  // Signal 0 is an existence probe; only escalate when the pid is still there.
  if (trySignal(0)) {
    trySignal("SIGKILL");
  }
}