elasticdash-sdk 0.2.7-beta → 0.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +48 -9
- package/dist/cli.js +36 -17
- package/dist/cli.js.map +1 -1
- package/dist/execution/tool-runner.d.ts.map +1 -1
- package/dist/execution/tool-runner.js +58 -14
- package/dist/execution/tool-runner.js.map +1 -1
- package/dist/index.cjs +37 -14
- package/dist/tool-runner-worker.js +114 -1
- package/dist/tool-runner-worker.js.map +1 -1
- package/dist/workflow-runner-worker.js +87 -1
- package/dist/workflow-runner-worker.js.map +1 -1
- package/docs/agent-integration-guide.md +30 -8
- package/docs/partial-mocking.md +10 -4
- package/docs/workflow-modes.md +6 -7
- package/package.json +1 -1
- package/src/cli.ts +36 -17
- package/src/execution/tool-runner.ts +52 -8
- package/src/tool-runner-worker.ts +115 -1
- package/src/workflow-runner-worker.ts +91 -1
|
@@ -132,13 +132,48 @@ export function runToolInSubprocess(
|
|
|
132
132
|
const runtimeArgs = denoProject ? ['run', '--allow-all', workerScript] : [workerScript]
|
|
133
133
|
debugLog(`[elasticdash portal] Runtime for tool subprocess: ${runtime} ${runtimeArgs.join(' ')}`, { isDenoProject: denoProject })
|
|
134
134
|
|
|
135
|
+
// detached:true on POSIX puts the child in its OWN process group, so
|
|
136
|
+
// process.kill(-pid, sig) reaches the whole subtree (worker + any
|
|
137
|
+
// grandchildren the tool spawned — HTTP keepalive sockets, etc.).
|
|
138
|
+
// We still own stdio pipes so the IPC protocol is unchanged. On
|
|
139
|
+
// Windows we fall back to plain child.kill — process-group semantics
|
|
140
|
+
// differ and detached:true there would let the child outlive us.
|
|
141
|
+
const useProcessGroup = process.platform !== 'win32'
|
|
135
142
|
const child = spawn(runtime, runtimeArgs, {
|
|
136
143
|
env: childEnv,
|
|
137
144
|
cwd: projectDir,
|
|
138
145
|
stdio: ['pipe', 'pipe', 'pipe'],
|
|
146
|
+
...(useProcessGroup ? { detached: true } : {}),
|
|
139
147
|
})
|
|
140
148
|
const pid = child.pid ?? -1
|
|
141
|
-
debugLog(`[elasticdash portal] tool subprocess stage=spawned pid=${pid} elapsedMs=${elapsed()} tool=${toolName}`)
|
|
149
|
+
debugLog(`[elasticdash portal] tool subprocess stage=spawned pid=${pid} elapsedMs=${elapsed()} tool=${toolName} pgroup=${useProcessGroup}`)
|
|
150
|
+
|
|
151
|
+
// Tree-kill helper: signal the process group on POSIX so the worker
|
|
152
|
+
// AND any subprocess the tool spawned die together. Falls back to
|
|
153
|
+
// direct child.kill on Windows / when the group send fails (e.g.
|
|
154
|
+
// child already exited).
|
|
155
|
+
const killTree = (signal: NodeJS.Signals) => {
|
|
156
|
+
if (useProcessGroup && pid > 0) {
|
|
157
|
+
try { process.kill(-pid, signal); return } catch { /* fall through */ }
|
|
158
|
+
}
|
|
159
|
+
try { child.kill(signal) } catch { /* already gone */ }
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
// Forward SIGTERM/SIGINT/SIGHUP received by THIS process to the child
|
|
163
|
+
// group, so when the MCP / npm shim / agent kills us, the worker
|
|
164
|
+
// tree dies too instead of orphaning. Without this, the inner node
|
|
165
|
+
// process can sit alive after the shim has exited.
|
|
166
|
+
const forwarded: NodeJS.Signals[] = ['SIGTERM', 'SIGINT', 'SIGHUP']
|
|
167
|
+
const forwardHandlers: Partial<Record<NodeJS.Signals, () => void>> = {}
|
|
168
|
+
for (const sig of forwarded) {
|
|
169
|
+
const h = () => {
|
|
170
|
+
debugLog(`[elasticdash portal] parent received ${sig} — forwarding to child group pid=${pid}`)
|
|
171
|
+
killTree(sig)
|
|
172
|
+
setTimeout(() => killTree('SIGKILL'), 2000)
|
|
173
|
+
}
|
|
174
|
+
forwardHandlers[sig] = h
|
|
175
|
+
process.on(sig, h)
|
|
176
|
+
}
|
|
142
177
|
|
|
143
178
|
// Heartbeat: prove the parent is still waiting on a live child. 0 disables.
|
|
144
179
|
const heartbeatMs = Number(process.env.ELASTICDASH_HEARTBEAT_MS ?? 5000)
|
|
@@ -148,23 +183,32 @@ export function runToolInSubprocess(
|
|
|
148
183
|
}, heartbeatMs)
|
|
149
184
|
: null
|
|
150
185
|
|
|
151
|
-
//
|
|
186
|
+
// Kill switch. Default 300_000ms (5 min) — covers most legitimate tool
|
|
187
|
+
// runs while still preventing the "child sits alive forever" hang the
|
|
188
|
+
// research_bot_js agent ran into. Set ELASTICDASH_TOOL_TIMEOUT_MS=0 to
|
|
189
|
+
// restore the old behavior, or any positive number to tune it.
|
|
152
190
|
let timedOut = false
|
|
153
|
-
const
|
|
191
|
+
const timeoutMsRaw = process.env.ELASTICDASH_TOOL_TIMEOUT_MS
|
|
192
|
+
const timeoutMs = timeoutMsRaw === undefined ? 300_000 : Number(timeoutMsRaw)
|
|
154
193
|
const timeout = timeoutMs > 0
|
|
155
194
|
? setTimeout(() => {
|
|
156
195
|
timedOut = true
|
|
157
|
-
debugLog(`[elasticdash portal] tool subprocess TIMEOUT pid=${pid} after ${timeoutMs}ms — sending SIGTERM`)
|
|
158
|
-
|
|
159
|
-
setTimeout(() =>
|
|
160
|
-
try { child.kill('SIGKILL') } catch { /* already dead */ }
|
|
161
|
-
}, 2000)
|
|
196
|
+
debugLog(`[elasticdash portal] tool subprocess TIMEOUT pid=${pid} after ${timeoutMs}ms — sending SIGTERM to group`)
|
|
197
|
+
killTree('SIGTERM')
|
|
198
|
+
setTimeout(() => killTree('SIGKILL'), 2000)
|
|
162
199
|
}, timeoutMs)
|
|
163
200
|
: null
|
|
164
201
|
|
|
165
202
|
const cleanup = () => {
|
|
166
203
|
if (heartbeat) clearInterval(heartbeat)
|
|
167
204
|
if (timeout) clearTimeout(timeout)
|
|
205
|
+
// Remove the parent-side signal forwarders we registered above so
|
|
206
|
+
// when the parent process eventually receives its own SIGTERM after
|
|
207
|
+
// legit exit, it doesn't double-handle.
|
|
208
|
+
for (const sig of forwarded) {
|
|
209
|
+
const h = forwardHandlers[sig]
|
|
210
|
+
if (h) process.removeListener(sig, h)
|
|
211
|
+
}
|
|
168
212
|
}
|
|
169
213
|
|
|
170
214
|
const RESULT_PREFIX = '__ELASTICDASH_RESULT__:'
|
|
@@ -164,8 +164,116 @@ function installFrozenFetchFallback(frozenEvents: FrozenEvent[]): void {
|
|
|
164
164
|
}
|
|
165
165
|
}
|
|
166
166
|
|
|
167
|
+
// ─── Signal-driven shutdown + diagnostic dump ─────────────────────────
|
|
168
|
+
//
|
|
169
|
+
// Pre-fix behavior: this worker had ZERO signal handlers. When the parent
|
|
170
|
+
// CLI / MCP / agent sent SIGTERM (e.g. on timeout), the inner tool's
|
|
171
|
+
// blocking call (typically a hung HTTP fetch) just kept running. The
|
|
172
|
+
// process never exited and no diagnostic landed anywhere.
|
|
173
|
+
//
|
|
174
|
+
// Post-fix:
|
|
175
|
+
// 1. Install SIGTERM / SIGINT / SIGHUP handlers that flip a shared
|
|
176
|
+
// AbortSignal and start a 2s grace timer before forced exit.
|
|
177
|
+
// 2. Before forced exit, dump a structured snapshot to stderr listing
|
|
178
|
+
// the current tool name, elapsed time, time since last heartbeat,
|
|
179
|
+
// and process.getActiveResourcesInfo() — tells the user "stuck on
|
|
180
|
+
// TCPSocketWrap / TLSWrap / Timeout" without needing a real stack trace.
|
|
181
|
+
// 3. While a tool is running, emit a 10s heartbeat (default — tunable
|
|
182
|
+
// via ELASTICDASH_WORKER_HEARTBEAT_MS; set it to 0 to disable) so a watchdog
|
|
183
|
+
// upstream sees continuous output instead of declaring the worker
|
|
184
|
+
// stalled.
|
|
185
|
+
const workerAbortController = new AbortController()
|
|
186
|
+
let currentToolName: string | undefined
|
|
187
|
+
let toolStartedAtMs: number | undefined
|
|
188
|
+
let lastHeartbeatAtMs: number = Date.now()
|
|
189
|
+
let shuttingDown = false
|
|
190
|
+
|
|
191
|
+
/**
 * Best-effort snapshot of the resource type names that
 * `process.getActiveResourcesInfo()` reports as keeping the event loop alive.
 *
 * The API is looked up dynamically, so a runtime that does not provide it
 * yields `[]` instead of throwing; any error raised by the call is swallowed
 * for the same reason. An empty result therefore means either "nothing was
 * reported" or "the API is unavailable" — callers cannot tell the two apart.
 *
 * @returns Resource type names (for example `Timeout`), or `[]` when the API
 *   is missing or throws.
 */
function activeResourcesSnapshot(): string[] {
  try {
    const probe = (process as unknown as {
      getActiveResourcesInfo?: () => string[]
    }).getActiveResourcesInfo
    if (typeof probe !== 'function') return []
    // Call with `process` as the receiver rather than invoking the detached
    // function, so the result does not depend on the implementation ignoring `this`.
    return probe.call(process)
  } catch {
    return []
  }
}
|
|
206
|
+
|
|
207
|
+
/**
 * Turns a snapshot of active resource type names into a one-line,
 * human-readable guess at what the tool is stuck on.
 *
 * Precedence: any TCP/TLS resource wins, then an empty snapshot, then a
 * snapshot made up solely of `Timeout` entries; anything else is listed
 * verbatim so no claim is made that the data does not support.
 *
 * @param resources Resource type names, e.g. from `process.getActiveResourcesInfo()`.
 * @returns The hint text to print in the diagnostic dump.
 */
function diagnosticHint(resources: string[]): string {
  // Node reports mixed-case names (its documentation shows `TTYWrap`; sockets
  // appear as e.g. `TCPSocketWrap` / `TLSWrap`), so an exact match on the
  // upper-case `TCPWRAP` / `TLSWRAP` spelling would miss them. Match the
  // TCP/TLS prefix case-insensitively; the upper-case spellings still match.
  const isNetworkResource = (name: string): boolean => /^(tcp|tls)/i.test(name)
  if (resources.some(isNetworkResource)) {
    return 'TCPWRAP/TLSWRAP present — likely a hung HTTP / network request. Add per-request timeouts (e.g. AbortSignal.timeout(...)) inside the tool implementation.'
  }
  if (resources.length === 0) {
    return 'No active async resources — likely a sync infinite loop or sync I/O. Inspect tight loops in the tool.'
  }
  // Only claim "Timeout resources only" when that is literally true; a
  // Timeout mixed with other resource types falls through to the listing.
  if (resources.every((name) => name === 'Timeout')) {
    return 'Timeout resources only — the tool may be waiting on a setTimeout / sleep. Check for missing clear or long backoff.'
  }
  return `Active: ${resources.join(', ')}`
}
|
|
220
|
+
|
|
221
|
+
/**
 * Writes a multi-line diagnostic snapshot to stderr: the tool currently
 * recorded as running, how long it has run, the time since the last heartbeat
 * tick, the active-resource list, and a hint derived from that list.
 *
 * @param reason Short label placed in the header line of the snapshot.
 */
function dumpDiagnostic(reason: string): void {
  // No recorded start time means no tool is running; report 0 elapsed.
  let elapsedMs = 0
  if (toolStartedAtMs) elapsedMs = Date.now() - toolStartedAtMs
  const sinceHeartbeatMs = Date.now() - lastHeartbeatAtMs
  const active = activeResourcesSnapshot()

  let report = `[elasticdash-worker] ${reason} — diagnostic snapshot:` + '\n'
  report += ` tool: ${currentToolName ?? '<no tool running>'}` + '\n'
  report += ` elapsed_ms: ${elapsedMs}` + '\n'
  report += ` since_last_heartbeat_ms: ${sinceHeartbeatMs}` + '\n'
  report += ` active_async_resources: ${JSON.stringify(active)}` + '\n'
  report += ` hint: ${diagnosticHint(active)}` + '\n'
  process.stderr.write(report)
}
|
|
235
|
+
|
|
236
|
+
/**
 * Registers SIGTERM / SIGINT / SIGHUP listeners on this process.
 *
 * The first signal received logs a notice, dumps the diagnostic snapshot,
 * aborts `workerAbortController`, and arms a 2s timer that force-exits with
 * code 1. Any later signal is ignored.
 *
 * NOTE(review): `main()` captures `process.exit` into `originalExit`, which
 * suggests `process.exit` may be replaced later — confirm the forced exit
 * below still terminates the process in that case.
 */
function installSignalHandlers(): void {
  const forceExit = (): void => {
    process.stderr.write(`[elasticdash-worker] grace expired — force exit\n`)
    process.exit(1)
  }

  const onSignal = (sig: NodeJS.Signals): void => {
    if (!shuttingDown) {
      shuttingDown = true
      process.stderr.write(`[elasticdash-worker] received ${sig} — aborting in-flight work\n`)
      dumpDiagnostic(`${sig} received`)
      try {
        workerAbortController.abort(new Error(`worker received ${sig}`))
      } catch {
        /* may not be wired */
      }
      // Grace period for in-flight cleanup. The timer is unref'd so it never
      // keeps the process alive by itself if everything else winds down first.
      const graceTimer = setTimeout(forceExit, 2_000)
      graceTimer.unref()
    }
  }

  const watched: NodeJS.Signals[] = ['SIGTERM', 'SIGINT', 'SIGHUP']
  for (const sig of watched) {
    process.on(sig, () => onSignal(sig))
  }
}
|
|
255
|
+
|
|
256
|
+
/**
 * Records `toolName` as the running tool and, unless disabled, starts a
 * periodic "alive" line on stderr.
 *
 * The period is read from ELASTICDASH_WORKER_HEARTBEAT_MS (default 10s). Any
 * value that is not a number greater than zero disables the interval.
 *
 * @param toolName Name of the tool about to run.
 * @returns A stop function that clears the interval (if one was started) and
 *   resets the recorded tool name and start time.
 */
function startToolHeartbeat(toolName: string): () => void {
  currentToolName = toolName
  toolStartedAtMs = Date.now()
  lastHeartbeatAtMs = Date.now()

  const resetToolState = (): void => {
    currentToolName = undefined
    toolStartedAtMs = undefined
  }

  const periodMs = Number(process.env.ELASTICDASH_WORKER_HEARTBEAT_MS ?? 10_000)
  const enabled = periodMs > 0
  if (!enabled) return resetToolState

  const ticker = setInterval(() => {
    lastHeartbeatAtMs = Date.now()
    const nowMs = Date.now()
    const startedMs = toolStartedAtMs ?? Date.now()
    const elapsedSec = Math.round((nowMs - startedMs) / 1000)
    process.stderr.write(`[elasticdash-worker] alive tool=${toolName} elapsed=${elapsedSec}s\n`)
  }, periodMs)

  return () => {
    clearInterval(ticker)
    resetToolState()
  }
}
|
|
273
|
+
|
|
167
274
|
async function main() {
|
|
168
275
|
stage('boot')
|
|
276
|
+
installSignalHandlers()
|
|
169
277
|
const originalExit = process.exit.bind(process)
|
|
170
278
|
|
|
171
279
|
// Prevent the SDK's tryAutoInitHttpContext from triggering full observability
|
|
@@ -244,7 +352,13 @@ async function main() {
|
|
|
244
352
|
stage('tool-resolved', { tool: toolName, via: resolvedVia })
|
|
245
353
|
|
|
246
354
|
stage('tool-call-start', { tool: toolName })
|
|
247
|
-
const
|
|
355
|
+
const stopHeartbeat = startToolHeartbeat(toolName)
|
|
356
|
+
let currentOutput: unknown
|
|
357
|
+
try {
|
|
358
|
+
currentOutput = await fn(...args)
|
|
359
|
+
} finally {
|
|
360
|
+
stopHeartbeat()
|
|
361
|
+
}
|
|
248
362
|
stage('tool-call-end', { tool: toolName })
|
|
249
363
|
await writeResult({ ok: true, currentOutput })
|
|
250
364
|
stage('result-written')
|
|
@@ -221,8 +221,93 @@ async function loadAndWrapTools(
|
|
|
221
221
|
}
|
|
222
222
|
}
|
|
223
223
|
|
|
224
|
+
// ─── Signal-driven shutdown + diagnostic dump ─────────────────────────
|
|
225
|
+
// See tool-runner-worker.ts for the rationale — same pattern, workflow-
|
|
226
|
+
// scoped. When SIGTERM lands here, we dump a snapshot to stderr so the
|
|
227
|
+
// user can see exactly what the workflow was stuck on (tool name,
|
|
228
|
+
// elapsed, active resources) before the 2s grace and forced exit.
|
|
229
|
+
let currentWorkflowName: string | undefined
|
|
230
|
+
let workflowStartedAtMs: number | undefined
|
|
231
|
+
let lastWfHeartbeatAtMs: number = Date.now()
|
|
232
|
+
let workflowShuttingDown = false
|
|
233
|
+
|
|
234
|
+
/**
 * Best-effort snapshot of the resource type names that
 * `process.getActiveResourcesInfo()` reports as keeping the event loop alive.
 *
 * The API is looked up dynamically, so a runtime without it yields `[]`
 * instead of throwing; errors from the call are swallowed as well. An empty
 * result does not distinguish "nothing reported" from "API unavailable".
 *
 * @returns Resource type names (for example `Timeout`), or `[]` when the API
 *   is missing or throws.
 */
function activeResourcesSnapshotWf(): string[] {
  try {
    const probe = (process as unknown as {
      getActiveResourcesInfo?: () => string[]
    }).getActiveResourcesInfo
    if (typeof probe !== 'function') return []
    // Call with `process` as the receiver rather than invoking the detached
    // function, so the result does not depend on the implementation ignoring `this`.
    return probe.call(process)
  } catch {
    return []
  }
}
|
|
244
|
+
|
|
245
|
+
/**
 * Turns a snapshot of active resource type names into a one-line,
 * human-readable guess at what the workflow is stuck on.
 *
 * Precedence: any TCP/TLS resource wins, then an empty snapshot, then a
 * snapshot made up solely of `Timeout` entries; anything else is listed
 * verbatim.
 *
 * @param resources Resource type names, e.g. from `process.getActiveResourcesInfo()`.
 * @returns The hint text to print in the diagnostic dump.
 */
function workflowDiagnosticHint(resources: string[]): string {
  // Node reports mixed-case names (its documentation shows `TTYWrap`; sockets
  // appear as e.g. `TCPSocketWrap` / `TLSWrap`), so an exact match on the
  // upper-case `TCPWRAP` / `TLSWRAP` spelling would miss them. Match the
  // TCP/TLS prefix case-insensitively; the upper-case spellings still match.
  const isNetworkResource = (name: string): boolean => /^(tcp|tls)/i.test(name)
  if (resources.some(isNetworkResource)) {
    return 'TCPWRAP/TLSWRAP present — likely a hung HTTP / AI provider request. Check per-request timeouts in workflow tools.'
  }
  if (resources.length === 0) {
    return 'No active async resources — likely a sync infinite loop or sync I/O in user code.'
  }
  // Only claim "Timeout resources only" when that is literally true; a
  // Timeout mixed with other resource types falls through to the listing.
  if (resources.every((name) => name === 'Timeout')) {
    return 'Timeout resources only — workflow may be waiting on a sleep / backoff.'
  }
  return `Active: ${resources.join(', ')}`
}
|
|
258
|
+
|
|
259
|
+
/**
 * Writes a multi-line diagnostic snapshot to stderr: the workflow currently
 * recorded as running, how long it has run, the time since the last heartbeat
 * tick, the active-resource list, and a hint derived from that list.
 *
 * @param reason Short label placed in the header line of the snapshot.
 */
function dumpWorkflowDiagnostic(reason: string): void {
  // No recorded start time means no workflow is running; report 0 elapsed.
  let elapsedMs = 0
  if (workflowStartedAtMs) elapsedMs = Date.now() - workflowStartedAtMs
  const sinceHeartbeatMs = Date.now() - lastWfHeartbeatAtMs
  const active = activeResourcesSnapshotWf()

  let report = `[elasticdash-worker workflow] ${reason} — diagnostic snapshot:` + '\n'
  report += ` workflow: ${currentWorkflowName ?? '<not started>'}` + '\n'
  report += ` elapsed_ms: ${elapsedMs}` + '\n'
  report += ` since_last_heartbeat_ms: ${sinceHeartbeatMs}` + '\n'
  report += ` active_async_resources: ${JSON.stringify(active)}` + '\n'
  report += ` hint: ${workflowDiagnosticHint(active)}` + '\n'
  process.stderr.write(report)
}
|
|
273
|
+
|
|
274
|
+
/**
 * Registers SIGTERM / SIGINT / SIGHUP listeners on this process.
 *
 * The first signal received logs a notice, dumps the workflow diagnostic
 * snapshot, and arms a 2s timer that force-exits with code 1. Any later
 * signal is ignored.
 *
 * NOTE(review): `main()` keeps a reference to the real `process.exit`, which
 * suggests `process.exit` may be replaced later — confirm the forced exit
 * below still terminates the process in that case.
 */
function installWorkflowSignalHandlers(): void {
  const forceExit = (): void => {
    process.stderr.write(`[elasticdash-worker workflow] grace expired — force exit\n`)
    process.exit(1)
  }

  const onSignal = (sig: NodeJS.Signals): void => {
    if (!workflowShuttingDown) {
      workflowShuttingDown = true
      process.stderr.write(`[elasticdash-worker workflow] received ${sig} — aborting in-flight work\n`)
      dumpWorkflowDiagnostic(`${sig} received`)
      // The timer is unref'd so it never keeps the process alive by itself
      // if everything else winds down before the grace period ends.
      const graceTimer = setTimeout(forceExit, 2_000)
      graceTimer.unref()
    }
  }

  const watched: NodeJS.Signals[] = ['SIGTERM', 'SIGINT', 'SIGHUP']
  for (const sig of watched) {
    process.on(sig, () => onSignal(sig))
  }
}
|
|
289
|
+
|
|
290
|
+
/**
 * Records `workflowName` as the running workflow and, unless disabled, starts
 * a periodic "alive" line on stderr.
 *
 * The period is read from ELASTICDASH_WORKER_HEARTBEAT_MS (default 10s). Any
 * value that is not a number greater than zero disables the interval.
 *
 * @param workflowName Name of the workflow about to run.
 * @returns A stop function that clears the interval (if one was started) and
 *   resets the recorded workflow name and start time.
 */
function startWorkflowHeartbeat(workflowName: string): () => void {
  currentWorkflowName = workflowName
  workflowStartedAtMs = Date.now()
  lastWfHeartbeatAtMs = Date.now()

  const resetWorkflowState = (): void => {
    currentWorkflowName = undefined
    workflowStartedAtMs = undefined
  }

  const periodMs = Number(process.env.ELASTICDASH_WORKER_HEARTBEAT_MS ?? 10_000)
  const enabled = periodMs > 0
  if (!enabled) return resetWorkflowState

  const ticker = setInterval(() => {
    lastWfHeartbeatAtMs = Date.now()
    const nowMs = Date.now()
    const startedMs = workflowStartedAtMs ?? Date.now()
    const elapsedSec = Math.round((nowMs - startedMs) / 1000)
    process.stderr.write(`[elasticdash-worker workflow] alive workflow=${workflowName} elapsed=${elapsedSec}s\n`)
  }, periodMs)

  return () => {
    clearInterval(ticker)
    resetWorkflowState()
  }
}
|
|
307
|
+
|
|
224
308
|
async function main() {
|
|
225
309
|
stage('boot')
|
|
310
|
+
installWorkflowSignalHandlers()
|
|
226
311
|
// Keep a reference to the real process.exit so we can call it after flushing stdout.
|
|
227
312
|
const originalExit = process.exit.bind(process)
|
|
228
313
|
|
|
@@ -347,7 +432,12 @@ async function main() {
|
|
|
347
432
|
// Standardize workflow argument resolution: always pass [input] if args is empty
|
|
348
433
|
const callArgs = args.length ? args : [input]
|
|
349
434
|
stage('workflow-call-start', { workflow: workflowName })
|
|
350
|
-
|
|
435
|
+
const stopWfHeartbeat = startWorkflowHeartbeat(workflowName)
|
|
436
|
+
try {
|
|
437
|
+
currentOutput = await (workflowFn as (...a: unknown[]) => unknown)(...callArgs)
|
|
438
|
+
} finally {
|
|
439
|
+
stopWfHeartbeat()
|
|
440
|
+
}
|
|
351
441
|
stage('workflow-call-end', { workflow: workflowName })
|
|
352
442
|
console.error('[worker] workflowFn resolved, currentOutput:', currentOutput) // stderr so it's visible
|
|
353
443
|
}
|