@donkeylabs/server 2.0.28 → 2.0.30
- package/docs/jobs.md +27 -0
- package/docs/processes.md +29 -0
- package/docs/workflows.md +32 -0
- package/package.json +1 -1
- package/src/core/external-job-socket.ts +20 -1
- package/src/core/external-jobs.ts +6 -0
- package/src/core/index.ts +1 -0
- package/src/core/jobs.ts +110 -52
- package/src/core/process-socket.ts +20 -1
- package/src/core/processes.ts +100 -4
- package/src/core/subprocess-bootstrap.ts +14 -1
- package/src/core/watchdog-executor.ts +80 -0
- package/src/core/watchdog-runner.ts +276 -0
- package/src/core/workflow-executor.ts +7 -0
- package/src/core/workflow-socket.ts +21 -2
- package/src/core/workflows.ts +88 -12
- package/src/server.ts +84 -1

package/src/core/watchdog-executor.ts
ADDED

@@ -0,0 +1,80 @@
+import { Kysely } from "kysely";
+import { BunSqliteDialect } from "kysely-bun-sqlite";
+import Database from "bun:sqlite";
+import { ProcessClient } from "./process-client";
+import { KyselyWorkflowAdapter } from "./workflow-adapter-kysely";
+import { KyselyJobAdapter } from "./job-adapter-kysely";
+import { SqliteProcessAdapter } from "./process-adapter-sqlite";
+import { WatchdogRunner, type WatchdogRunnerConfig } from "./watchdog-runner";
+
+interface WatchdogConfig extends WatchdogRunnerConfig {
+  intervalMs: number;
+  workflows?: { dbPath?: string };
+  jobs?: { dbPath?: string };
+  processes?: { dbPath?: string };
+  sqlitePragmas?: {
+    busyTimeout?: number;
+    synchronous?: "OFF" | "NORMAL" | "FULL" | "EXTRA";
+    journalMode?: "DELETE" | "TRUNCATE" | "PERSIST" | "MEMORY" | "WAL" | "OFF";
+  };
+}
+
+const raw = process.env.DONKEYLABS_WATCHDOG_CONFIG;
+if (!raw) {
+  throw new Error("Missing DONKEYLABS_WATCHDOG_CONFIG");
+}
+
+const config: WatchdogConfig = JSON.parse(raw);
+const client = await ProcessClient.connect();
+
+const workflowAdapter = config.workflows?.dbPath
+  ? new KyselyWorkflowAdapter(createDb(config.workflows.dbPath, config.sqlitePragmas), {
+      cleanupDays: 0,
+    })
+  : undefined;
+const jobAdapter = config.jobs?.dbPath
+  ? new KyselyJobAdapter(createDb(config.jobs.dbPath, config.sqlitePragmas), {
+      cleanupDays: 0,
+    })
+  : undefined;
+const processAdapter = config.processes?.dbPath
+  ? new SqliteProcessAdapter({ path: config.processes.dbPath, cleanupDays: 0 })
+  : undefined;
+
+const runner = new WatchdogRunner(config, {
+  workflowsAdapter: workflowAdapter,
+  jobsAdapter: jobAdapter,
+  processesAdapter: processAdapter,
+  emit: async (event, data) => {
+    await client.emit(event, data);
+  },
+});
+
+const interval = Math.max(1000, config.intervalMs);
+const timer = setInterval(() => {
+  runner.runOnce().catch(() => undefined);
+}, interval);
+
+process.on("SIGTERM", async () => {
+  clearInterval(timer);
+  client.disconnect();
+});
+
+function createDb(
+  dbPath: string,
+  pragmas?: { busyTimeout?: number; synchronous?: string; journalMode?: string }
+): Kysely<any> {
+  const sqlite = new Database(dbPath);
+  const busyTimeout = pragmas?.busyTimeout ?? 5000;
+  sqlite.run(`PRAGMA busy_timeout = ${busyTimeout}`);
+  if (pragmas?.journalMode) {
+    sqlite.run(`PRAGMA journal_mode = ${pragmas.journalMode}`);
+  }
+  if (pragmas?.synchronous) {
+    sqlite.run(`PRAGMA synchronous = ${pragmas.synchronous}`);
+  }
+
+  return new Kysely<any>({
+    dialect: new BunSqliteDialect({ database: sqlite }),
+  });
+}
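
The executor above is a standalone entry point: it reads its whole configuration from the `DONKEYLABS_WATCHDOG_CONFIG` environment variable, connects back to the parent via `ProcessClient`, and polls `WatchdogRunner.runOnce()` on an interval. Below is a minimal sketch of how a parent could launch it with Bun; the file path and config values are illustrative, and the package's actual wiring lives in `server.ts`, which is not shown in this section.

```ts
// Hypothetical launcher sketch (not the package's actual server.ts wiring).
// The config shape mirrors WatchdogConfig above; paths and values are illustrative.
const watchdogConfig = {
  intervalMs: 5000,
  services: ["workflows", "jobs", "processes"],
  killGraceMs: 5000,
  workflowHeartbeatTimeoutMs: 60000,
  jobDefaults: { heartbeatTimeoutMs: 60000, killGraceMs: 5000 },
  jobConfigs: {},
  workflows: { dbPath: "./data/workflows.db" },
  jobs: { dbPath: "./data/jobs.db" },
  processes: { dbPath: "./data/processes.db" },
  sqlitePragmas: { busyTimeout: 5000, journalMode: "WAL" },
};

// The executor throws if DONKEYLABS_WATCHDOG_CONFIG is missing, so it is passed here.
const watchdog = Bun.spawn(
  ["bun", "./node_modules/@donkeylabs/server/src/core/watchdog-executor.ts"],
  {
    env: { ...process.env, DONKEYLABS_WATCHDOG_CONFIG: JSON.stringify(watchdogConfig) },
  }
);
```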

package/src/core/watchdog-runner.ts
ADDED

@@ -0,0 +1,276 @@
+import type { WorkflowAdapter, WorkflowInstance } from "./workflows";
+import type { JobAdapter, Job } from "./jobs";
+import type { ProcessAdapter } from "./process-adapter-sqlite";
+
+export type WatchdogService = "workflows" | "jobs" | "processes";
+
+export interface WatchdogRunnerConfig {
+  services: WatchdogService[];
+  killGraceMs: number;
+  workflowHeartbeatTimeoutMs: number;
+  jobDefaults: {
+    heartbeatTimeoutMs: number;
+    killGraceMs: number;
+  };
+  jobConfigs: Record<string, { heartbeatTimeout?: number; timeout?: number; killGraceMs?: number }>;
+}
+
+export interface WatchdogRunnerDeps {
+  workflowsAdapter?: WorkflowAdapter;
+  jobsAdapter?: JobAdapter;
+  processesAdapter?: ProcessAdapter;
+  now?: () => number;
+  killProcess?: (pid: number, signal: NodeJS.Signals) => void;
+  isProcessAlive?: (pid: number) => boolean;
+  emit?: (event: string, data: Record<string, any>) => Promise<void>;
+}
+
+export class WatchdogRunner {
+  private config: WatchdogRunnerConfig;
+  private deps: WatchdogRunnerDeps;
+
+  constructor(config: WatchdogRunnerConfig, deps: WatchdogRunnerDeps) {
+    this.config = config;
+    this.deps = deps;
+  }
+
+  async runOnce(): Promise<void> {
+    if (this.config.services.includes("workflows") && this.deps.workflowsAdapter) {
+      await this.checkWorkflows();
+    }
+    if (this.config.services.includes("jobs") && this.deps.jobsAdapter) {
+      await this.checkJobs();
+    }
+    if (this.config.services.includes("processes") && this.deps.processesAdapter) {
+      await this.checkProcesses();
+    }
+  }
+
+  private async checkWorkflows(): Promise<void> {
+    const adapter = this.deps.workflowsAdapter!;
+    const now = this.now();
+    const instances = await adapter.getRunningInstances();
+
+    for (const instance of instances) {
+      const info = this.getWatchdogMetadata(instance);
+      if (!info?.pid) continue;
+
+      const last = info.lastHeartbeat ?? instance.startedAt?.getTime() ?? 0;
+      if (now - last <= this.config.workflowHeartbeatTimeoutMs) continue;
+
+      await this.emit("workflow.watchdog.stale", {
+        instanceId: instance.id,
+        pid: info.pid,
+        timeoutMs: this.config.workflowHeartbeatTimeoutMs,
+      });
+
+      await this.killProcessWithGrace(info.pid, this.config.killGraceMs);
+
+      await adapter.updateInstance(instance.id, {
+        status: "failed",
+        error: "Watchdog killed unresponsive workflow",
+        completedAt: new Date(),
+      });
+
+      await this.emit("workflow.watchdog.killed", {
+        instanceId: instance.id,
+        pid: info.pid,
+        reason: "heartbeat",
+      });
+    }
+  }
+
+  private async checkJobs(): Promise<void> {
+    const adapter = this.deps.jobsAdapter!;
+    const now = this.now();
+    const jobs = await adapter.getRunningExternal();
+
+    for (const job of jobs) {
+      if (!job.pid) continue;
+      const config = this.config.jobConfigs[job.name] ?? {};
+      const heartbeatTimeout =
+        config.heartbeatTimeout ?? this.config.jobDefaults.heartbeatTimeoutMs;
+      const killGraceMs = config.killGraceMs ?? this.config.jobDefaults.killGraceMs;
+      const lastHeartbeat = job.lastHeartbeat?.getTime() ?? job.startedAt?.getTime() ?? 0;
+
+      if (now - lastHeartbeat > heartbeatTimeout) {
+        await this.emit("job.watchdog.stale", {
+          jobId: job.id,
+          name: job.name,
+          pid: job.pid,
+          timeoutMs: heartbeatTimeout,
+        });
+
+        await this.killProcessWithGrace(job.pid, killGraceMs);
+
+        await adapter.update(job.id, {
+          status: "failed",
+          error: "Watchdog killed unresponsive job",
+          completedAt: new Date(),
+          processState: "orphaned",
+        });
+
+        await this.emit("job.watchdog.killed", {
+          jobId: job.id,
+          name: job.name,
+          pid: job.pid,
+          reason: "heartbeat",
+        });
+        continue;
+      }
+
+      if (config.timeout && job.startedAt) {
+        if (now - job.startedAt.getTime() > config.timeout) {
+          await this.emit("job.watchdog.stale", {
+            jobId: job.id,
+            name: job.name,
+            pid: job.pid,
+            timeoutMs: config.timeout,
+            reason: "timeout",
+          });
+
+          await this.killProcessWithGrace(job.pid, killGraceMs);
+
+          await adapter.update(job.id, {
+            status: "failed",
+            error: "Watchdog killed job after timeout",
+            completedAt: new Date(),
+            processState: "orphaned",
+          });
+
+          await this.emit("job.watchdog.killed", {
+            jobId: job.id,
+            name: job.name,
+            pid: job.pid,
+            reason: "timeout",
+          });
+        }
+      }
+    }
+  }
+
+  private async checkProcesses(): Promise<void> {
+    const adapter = this.deps.processesAdapter!;
+    const now = this.now();
+    const running = await adapter.getRunning();
+
+    for (const proc of running) {
+      if (!proc.pid) continue;
+      const heartbeatTimeout = proc.config.heartbeat?.timeoutMs;
+      const lastHeartbeat = proc.lastHeartbeat?.getTime() ?? proc.startedAt?.getTime() ?? 0;
+
+      if (heartbeatTimeout && now - lastHeartbeat > heartbeatTimeout) {
+        await this.emit("process.watchdog.stale", {
+          processId: proc.id,
+          name: proc.name,
+          pid: proc.pid,
+          timeoutMs: heartbeatTimeout,
+        });
+
+        await this.killProcessWithGrace(proc.pid, this.config.killGraceMs);
+
+        await adapter.update(proc.id, {
+          status: "crashed",
+          error: "Watchdog killed unresponsive process",
+          stoppedAt: new Date(),
+        });
+
+        await this.emit("process.watchdog.killed", {
+          processId: proc.id,
+          name: proc.name,
+          pid: proc.pid,
+          reason: "heartbeat",
+        });
+        continue;
+      }
+
+      const maxRuntimeMs = proc.config.limits?.maxRuntimeMs;
+      if (maxRuntimeMs && proc.startedAt) {
+        if (now - proc.startedAt.getTime() > maxRuntimeMs) {
+          await this.emit("process.watchdog.stale", {
+            processId: proc.id,
+            name: proc.name,
+            pid: proc.pid,
+            timeoutMs: maxRuntimeMs,
+            reason: "maxRuntimeMs",
+          });
+
+          await this.killProcessWithGrace(proc.pid, this.config.killGraceMs);
+
+          await adapter.update(proc.id, {
+            status: "crashed",
+            error: "Watchdog killed process after max runtime",
+            stoppedAt: new Date(),
+          });
+
+          await this.emit("process.watchdog.killed", {
+            processId: proc.id,
+            name: proc.name,
+            pid: proc.pid,
+            reason: "maxRuntimeMs",
+          });
+        }
+      }
+    }
+  }
+
+  private getWatchdogMetadata(instance: WorkflowInstance): { pid?: number; lastHeartbeat?: number } | null {
+    const meta = instance.metadata as any;
+    if (!meta || typeof meta !== "object") return null;
+    const info = meta.__watchdog;
+    if (!info || typeof info !== "object") return null;
+    return {
+      pid: typeof info.pid === "number" ? info.pid : undefined,
+      lastHeartbeat: info.lastHeartbeat ? new Date(info.lastHeartbeat).getTime() : undefined,
+    };
+  }
+
+  private now(): number {
+    return this.deps.now ? this.deps.now() : Date.now();
+  }
+
+  private async emit(event: string, data: Record<string, any>): Promise<void> {
+    if (this.deps.emit) {
+      await this.deps.emit(event, data);
+    }
+  }
+
+  private async killProcessWithGrace(pid: number, graceMs: number): Promise<void> {
+    const kill = this.deps.killProcess ?? process.kill;
+    try {
+      kill(pid, "SIGTERM");
+    } catch {
+      return;
+    }
+
+    if (graceMs <= 0) {
+      try {
+        kill(pid, "SIGKILL");
+      } catch {
+        // ignore
+      }
+      return;
+    }
+
+    await new Promise((resolve) => setTimeout(resolve, graceMs));
+
+    try {
+      const isAlive = this.deps.isProcessAlive
+        ? this.deps.isProcessAlive(pid)
+        : (() => {
+            try {
+              process.kill(pid, 0);
+              return true;
+            } catch {
+              return false;
+            }
+          })();
+
+      if (isAlive) {
+        kill(pid, "SIGKILL");
+      }
+    } catch {
+      // ignore
+    }
+  }
+}
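
`WatchdogRunner` keeps its whole environment behind `WatchdogRunnerDeps` (clock, kill, liveness probe, event emitter, storage adapters), so a single sweep can be exercised in isolation. A small sketch with injected stand-ins; the import path assumes code sitting next to `package/src/core`, and the values are illustrative.

```ts
import { WatchdogRunner } from "./watchdog-runner";

// No adapters are supplied, so each check inside runOnce() is skipped; the point
// is only to show how the config and dependencies are injected.
const runner = new WatchdogRunner(
  {
    services: ["workflows", "jobs", "processes"],
    killGraceMs: 0,
    workflowHeartbeatTimeoutMs: 60_000,
    jobDefaults: { heartbeatTimeoutMs: 60_000, killGraceMs: 0 },
    jobConfigs: {},
  },
  {
    now: () => Date.now(),
    killProcess: (pid, signal) => console.log(`would send ${signal} to ${pid}`),
    isProcessAlive: () => false,
    emit: async (event, data) => console.log(event, data),
  }
);

await runner.runOnce(); // a single watchdog sweep
```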

package/src/core/workflow-executor.ts
CHANGED

@@ -25,6 +25,11 @@ interface ExecutorConfig {
   pluginModulePaths: Record<string, string>;
   pluginConfigs: Record<string, any>;
   coreConfig?: Record<string, any>;
+  sqlitePragmas?: {
+    busyTimeout?: number;
+    synchronous?: "OFF" | "NORMAL" | "FULL" | "EXTRA";
+    journalMode?: "DELETE" | "TRUNCATE" | "PERSIST" | "MEMORY" | "WAL" | "OFF";
+  };
 }

 // ============================================

@@ -47,6 +52,7 @@ async function main(): Promise<void> {
     pluginModulePaths,
     pluginConfigs,
     coreConfig,
+    sqlitePragmas,
   } = config;

   const socket = await connectToSocket(socketPath, tcpPort);

@@ -71,6 +77,7 @@ async function main(): Promise<void> {
   const bootstrap = await bootstrapSubprocess({
     dbPath,
     coreConfig,
+    sqlitePragmas,
     pluginMetadata: {
       names: pluginNames,
       modulePaths: pluginModulePaths,

package/src/core/workflow-socket.ts
CHANGED

@@ -248,7 +248,24 @@ export class WorkflowSocketServerImpl implements WorkflowSocketServer {

     let buffer = "";

-
+    const queue: WorkflowMessage[] = [];
+    let processing = false;
+
+    const processQueue = async () => {
+      if (processing) return;
+      processing = true;
+      while (queue.length > 0) {
+        const message = queue.shift()!;
+        try {
+          await this.handleMessage(instanceId, message);
+        } catch (err) {
+          this.onError?.(err instanceof Error ? err : new Error(String(err)), instanceId);
+        }
+      }
+      processing = false;
+    };
+
+    socket.on("data", (data) => {
       buffer += data.toString();

       // Process complete messages (newline-delimited JSON)

@@ -260,11 +277,13 @@ export class WorkflowSocketServerImpl implements WorkflowSocketServer {

         try {
           const message = JSON.parse(line) as WorkflowMessage;
-
+          queue.push(message);
         } catch (err) {
           this.onError?.(new Error(`Invalid message: ${line}`), instanceId);
         }
       }
+
+      processQueue().catch(() => undefined);
     });

     socket.on("error", (err) => {
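
The socket change above replaces direct handling of each parsed message with a queue drained by `processQueue`, so async `handleMessage` calls run one at a time in arrival order instead of interleaving. A standalone sketch of that pattern with generic names (not the package's API):

```ts
type Message = { type: string };

const queue: Message[] = [];
let processing = false;

async function handle(msg: Message): Promise<void> {
  // Simulated async work; in the diff this is this.handleMessage(instanceId, message).
  await new Promise((resolve) => setTimeout(resolve, 10));
  console.log("handled", msg.type);
}

async function processQueue(): Promise<void> {
  if (processing) return; // a drain loop is already running
  processing = true;
  while (queue.length > 0) {
    const msg = queue.shift()!;
    try {
      await handle(msg);
    } catch (err) {
      console.error(err);
    }
  }
  processing = false;
}

// Messages may arrive faster than they are handled; ordering is still preserved.
queue.push({ type: "started" });
queue.push({ type: "heartbeat" });
void processQueue();
```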

package/src/core/workflows.ts
CHANGED

@@ -760,12 +760,24 @@ export interface WorkflowsConfig {
   heartbeatTimeout?: number;
   /** Timeout waiting for isolated subprocess readiness (ms, default: 10000) */
   readyTimeout?: number;
+  /** Grace period before SIGKILL when terminating isolated subprocesses (ms, default: 5000) */
+  killGraceMs?: number;
+  /** SQLite pragmas for isolated subprocess connections */
+  sqlitePragmas?: SqlitePragmaConfig;
+  /** Disable in-process watchdog timers (use external watchdog instead) */
+  useWatchdog?: boolean;
   /** Resume strategy for orphaned workflows (default: "blocking") */
   resumeStrategy?: WorkflowResumeStrategy;
 }

 export type WorkflowResumeStrategy = "blocking" | "background" | "skip";

+export interface SqlitePragmaConfig {
+  busyTimeout?: number;
+  synchronous?: "OFF" | "NORMAL" | "FULL" | "EXTRA";
+  journalMode?: "DELETE" | "TRUNCATE" | "PERSIST" | "MEMORY" | "WAL" | "OFF";
+}
+
 /** Options for registering a workflow */
 export interface WorkflowRegisterOptions {
   /**
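
The new `WorkflowsConfig` fields can be seen together in an illustrative config. The import path is package-internal, other fields the interface may define are omitted, and the values are examples rather than defaults taken from the package.

```ts
import { createWorkflows, type WorkflowsConfig } from "./workflows";

const config: WorkflowsConfig = {
  dbPath: "./data/workflows.db",
  heartbeatTimeout: 60_000,
  readyTimeout: 10_000,
  killGraceMs: 5_000,                 // grace period before SIGKILL on isolated subprocesses
  sqlitePragmas: { busyTimeout: 5_000, journalMode: "WAL", synchronous: "NORMAL" },
  useWatchdog: true,                  // defer to the external watchdog, skip in-process timers
  resumeStrategy: "blocking",
};

const workflows = createWorkflows(config);
console.log(workflows.getDbPath()); // "./data/workflows.db", as exposed for the watchdog
```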

@@ -815,6 +827,8 @@ export interface Workflows {
   updateMetadata(instanceId: string, key: string, value: any): Promise<void>;
   /** Set plugin metadata for local instantiation in isolated workflows */
   setPluginMetadata(metadata: PluginMetadata): void;
+  /** Get resolved SQLite db path (for watchdog) */
+  getDbPath(): string | undefined;
 }

 export interface PluginMetadata {

@@ -854,6 +868,9 @@ class WorkflowsImpl implements Workflows {
   private dbPath?: string;
   private heartbeatTimeoutMs: number;
   private readyTimeoutMs: number;
+  private killGraceMs: number;
+  private sqlitePragmas?: SqlitePragmaConfig;
+  private useWatchdog: boolean;
   private resumeStrategy!: WorkflowResumeStrategy;
   private workflowModulePaths = new Map<string, string>();
   private isolatedProcesses = new Map<string, IsolatedProcessInfo>();

@@ -888,6 +905,9 @@ class WorkflowsImpl implements Workflows {
     this.dbPath = config.dbPath;
     this.heartbeatTimeoutMs = config.heartbeatTimeout ?? 60000;
     this.readyTimeoutMs = config.readyTimeout ?? 10000;
+    this.killGraceMs = config.killGraceMs ?? 5000;
+    this.sqlitePragmas = config.sqlitePragmas;
+    this.useWatchdog = config.useWatchdog ?? false;
     this.resumeStrategy = config.resumeStrategy ?? "blocking";
   }

@@ -948,6 +968,10 @@ class WorkflowsImpl implements Workflows {
     this.pluginCustomErrors = metadata.customErrors;
   }

+  getDbPath(): string | undefined {
+    return this.dbPath;
+  }
+
   async updateMetadata(instanceId: string, key: string, value: any): Promise<void> {
     const instance = await this.adapter.getInstance(instanceId);
     if (!instance) return;

@@ -1049,11 +1073,7 @@ class WorkflowsImpl implements Workflows {
     // Kill isolated process if running
     const isolatedInfo = this.isolatedProcesses.get(instanceId);
     if (isolatedInfo) {
-      try {
-        process.kill(isolatedInfo.pid, "SIGTERM");
-      } catch {
-        // Process might already be dead
-      }
+      await killProcessWithGrace(isolatedInfo.pid, this.killGraceMs);
       if (isolatedInfo.timeout) clearTimeout(isolatedInfo.timeout);
       if (isolatedInfo.heartbeatTimeout) clearTimeout(isolatedInfo.heartbeatTimeout);
       this.isolatedProcesses.delete(instanceId);

@@ -1470,6 +1490,7 @@ class WorkflowsImpl implements Workflows {
       pluginModulePaths: this.pluginModulePaths,
       pluginConfigs,
       coreConfig,
+      sqlitePragmas: this.sqlitePragmas,
     };

     // Spawn the subprocess

@@ -1506,6 +1527,17 @@ class WorkflowsImpl implements Workflows {
     // Set up heartbeat timeout
     this.resetHeartbeatTimeout(instanceId, proc.pid);

+    const instance = await this.adapter.getInstance(instanceId);
+    const metadata = { ...(instance?.metadata ?? {}) } as Record<string, any>;
+    metadata.__watchdog = {
+      ...(metadata.__watchdog ?? {}),
+      pid: proc.pid,
+      socketPath,
+      tcpPort,
+      lastHeartbeat: new Date().toISOString(),
+    };
+    await this.adapter.updateInstance(instanceId, { metadata });
+
     const exitBeforeReady = proc.exited.then((exitCode) => {
       throw new Error(`Subprocess exited before ready (code ${exitCode})`);
     });

@@ -1579,7 +1611,8 @@ class WorkflowsImpl implements Workflows {

       case "started":
       case "heartbeat":
-        //
+        // Update heartbeat tracking metadata
+        await this.updateWatchdogHeartbeat(instanceId);
         break;

       case "step.started": {

@@ -1947,6 +1980,17 @@ class WorkflowsImpl implements Workflows {
     this.rejectIsolatedReady(instanceId, new Error("Isolated workflow cleaned up"));
   }

+  private async updateWatchdogHeartbeat(instanceId: string): Promise<void> {
+    const instance = await this.adapter.getInstance(instanceId);
+    if (!instance) return;
+    const metadata = { ...(instance.metadata ?? {}) } as Record<string, any>;
+    metadata.__watchdog = {
+      ...(metadata.__watchdog ?? {}),
+      lastHeartbeat: new Date().toISOString(),
+    };
+    await this.adapter.updateInstance(instanceId, { metadata });
+  }
+
   private async markOrphanedAsFailed(
     instances: WorkflowInstance[],
     reason: string

@@ -1979,6 +2023,7 @@ class WorkflowsImpl implements Workflows {
    * Reset heartbeat timeout for an isolated workflow
    */
   private resetHeartbeatTimeout(instanceId: string, pid: number): void {
+    if (this.useWatchdog) return;
     const info = this.isolatedProcesses.get(instanceId);
     if (!info) return;

@@ -1995,6 +2040,11 @@ class WorkflowsImpl implements Workflows {
       }

       console.error(`[Workflows] No heartbeat from isolated workflow ${instanceId} for ${this.heartbeatTimeoutMs}ms`);
+      await this.emitEvent("workflow.watchdog.stale", {
+        instanceId,
+        reason: "heartbeat",
+        timeoutMs: this.heartbeatTimeoutMs,
+      });
       await this.handleIsolatedTimeout(instanceId, pid);
     }, this.heartbeatTimeoutMs);
   }

@@ -2006,12 +2056,12 @@ class WorkflowsImpl implements Workflows {
     const info = this.isolatedProcesses.get(instanceId);
     if (!info) return;

-
-
-
-
-
-    }
+    await killProcessWithGrace(pid, this.killGraceMs);
+    await this.emitEvent("workflow.watchdog.killed", {
+      instanceId,
+      reason: "timeout",
+      timeoutMs: this.heartbeatTimeoutMs,
+    });

     // Clean up
     if (info.timeout) clearTimeout(info.timeout);

@@ -2148,3 +2198,29 @@ function isPlainObject(value: Record<string, any>): boolean {
 export function createWorkflows(config?: WorkflowsConfig): Workflows {
   return new WorkflowsImpl(config);
 }
+
+async function killProcessWithGrace(pid: number, graceMs: number): Promise<void> {
+  try {
+    process.kill(pid, "SIGTERM");
+  } catch {
+    return;
+  }
+
+  if (graceMs <= 0) {
+    try {
+      process.kill(pid, "SIGKILL");
+    } catch {
+      return;
+    }
+    return;
+  }
+
+  await new Promise((resolve) => setTimeout(resolve, graceMs));
+
+  try {
+    process.kill(pid, 0);
+    process.kill(pid, "SIGKILL");
+  } catch {
+    // Process already exited
+  }
+}
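
Both sides of the watchdog agree on a `__watchdog` record kept in workflow instance metadata: `WorkflowsImpl` writes `pid`, `socketPath`, `tcpPort`, and `lastHeartbeat` when it spawns an isolated subprocess and refreshes `lastHeartbeat` on every heartbeat message, while `WatchdogRunner.getWatchdogMetadata()` reads `pid` and `lastHeartbeat` back to decide when to kill. A sketch of that record with illustrative values; the interface name is introduced here for clarity and is not exported by the package.

```ts
// Illustrative shape only; the package stores this as plain JSON inside
// WorkflowInstance.metadata.__watchdog rather than exporting a named type.
interface WatchdogMetadata {
  pid: number;             // isolated subprocess pid
  socketPath?: string;     // IPC socket the subprocess connected through
  tcpPort?: number;        // TCP fallback port, when used
  lastHeartbeat: string;   // ISO timestamp, refreshed on each heartbeat message
}

const metadata: { __watchdog: WatchdogMetadata } = {
  __watchdog: {
    pid: 12345,
    socketPath: "/tmp/workflow-abc123.sock",
    lastHeartbeat: new Date().toISOString(),
  },
};

// The external runner flags the instance once lastHeartbeat is older than
// workflowHeartbeatTimeoutMs, then escalates SIGTERM to SIGKILL after killGraceMs.
const ageMs = Date.now() - new Date(metadata.__watchdog.lastHeartbeat).getTime();
console.log(`heartbeat age: ${ageMs}ms`);
```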