elasticdash-sdk 0.2.7-beta-2 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,6 +12,7 @@ const AI_URL_PATTERNS = [
12
12
  /https?:\/\/generativelanguage\.googleapis\.com\/.*\/models\/[^/:]+:(generateContent|streamGenerateContent)/,
13
13
  /https?:\/\/api\.x\.ai\/v1\/(chat\/)?completions/,
14
14
  /https?:\/\/api\.moonshot\.ai\/v1\/(chat\/)?completions/,
15
+ /https?:\/\/bedrock-runtime\.[^./]+\.amazonaws\.com\/model\/[^/]+\/(invoke|invoke-with-response-stream|converse|converse-stream)/,
15
16
  ]
16
17
 
17
18
  function isAIProviderUrl(url: string): boolean {
@@ -336,6 +337,22 @@ export function interceptFetch(): void {
336
337
  const httpCtx = getHttpRunContext()
337
338
  const obsCtx = getObservabilityContext()
338
339
 
340
+ const url =
341
+ typeof input === 'string'
342
+ ? input
343
+ : input instanceof URL
344
+ ? input.href
345
+ : (input as Request).url
346
+
347
+ // Let ai-interceptor handle AI provider URLs — it assigns its own event IDs.
348
+ // Short-circuit BEFORE attaching the mock-config header: AI provider servers
349
+ // don't run the SDK so they can't honor the header, and on Bedrock the
350
+ // gzipped header can exceed the edge proxy's header-size cap (yields a
351
+ // generic 400 "Request Header Or Cookie Too Large" page).
352
+ if (isAIProviderUrl(url)) {
353
+ return originalFetch!(input, init)
354
+ }
355
+
339
356
  // Option-A passthrough: even when there is no SDK context active, if the
340
357
  // CLI seeded mock globals we still attach the header so downstream Node
341
358
  // processes (e.g. a Next.js dev server hit by an HTTP-mode workflow) can
@@ -348,21 +365,10 @@ export function interceptFetch(): void {
348
365
  }
349
366
  if (mockHeader) init = attachMockConfigHeader(init, mockHeader)
350
367
 
351
- const url =
352
- typeof input === 'string'
353
- ? input
354
- : input instanceof URL
355
- ? input.href
356
- : (input as Request).url
357
368
  const method = (init?.method ?? (input instanceof Request ? input.method : 'GET')).toUpperCase()
358
369
  const rawHeaders = init?.headers ?? (input instanceof Request ? input.headers : undefined)
359
370
  const rawBody = init?.body ?? (input instanceof Request ? input.body : undefined)
360
371
 
361
- // Let ai-interceptor handle AI provider URLs — it assigns its own event IDs
362
- if (isAIProviderUrl(url)) {
363
- return originalFetch!(input, init)
364
- }
365
-
366
372
 
367
373
  // --- Observability-only mode: record and push, no mocks/replay ---
368
374
  if (!ctx && !httpCtx && obsCtx) {
@@ -96,7 +96,11 @@ function enrichFromLLMCapture(
96
96
  return { input: enrichedInput, usage }
97
97
  }
98
98
 
99
- // Interceptor capture failed (e.g. OpenAI SDK uses native fetch not globalThis.fetch).
99
+ // Interceptor capture failed (e.g. OpenAI SDK uses native fetch not globalThis.fetch;
100
+ // @aws-sdk/client-bedrock-runtime uses its own HTTP signer/handler on Node and likewise
101
+ // bypasses globalThis.fetch — when wrapping a Bedrock call, pass
102
+ // wrapAI(..., { provider: 'bedrock', model: '<modelId>' }) so this fallback enrichment
103
+ // still tags the event so a mocked rerun can match it).
100
104
  // Fall back to model/provider from wrapAI options or function arguments.
101
105
  const resolvedModel = fallbackModel
102
106
  || (input && typeof input === 'object' ? (input as Record<string, unknown>).model as string | undefined : undefined)
@@ -164,8 +164,116 @@ function installFrozenFetchFallback(frozenEvents: FrozenEvent[]): void {
164
164
  }
165
165
  }
166
166
 
167
+ // ─── Signal-driven shutdown + diagnostic dump ─────────────────────────
168
+ //
169
+ // Pre-fix behavior: this worker had ZERO signal handlers. When the parent
170
+ // CLI / MCP / agent sent SIGTERM (e.g. on timeout), the inner tool's
171
+ // blocking call (typically a hung HTTP fetch) just kept running. The
172
+ // process never exited and no diagnostic landed anywhere.
173
+ //
174
+ // Post-fix:
175
+ // 1. Install SIGTERM / SIGINT / SIGHUP handlers that flip a shared
176
+ // AbortSignal and start a 2s grace timer before forced exit.
177
+ // 2. Before forced exit, dump a structured snapshot to stderr listing
178
+ // the current tool name, elapsed time, time since last heartbeat,
179
+ // and process.getActiveResourcesInfo() — tells the user "stuck on
180
+ // TCPWRAP / TLSWRAP / Timeout" without needing a real stack trace.
181
+ // 3. While a tool is running, emit a heartbeat every 10s by default
182
+ // (tunable via ELASTICDASH_WORKER_HEARTBEAT_MS; set to 0 to disable) so a watchdog
183
+ // upstream sees continuous output instead of declaring the worker
184
+ // stalled.
185
+ const workerAbortController = new AbortController()
186
+ let currentToolName: string | undefined
187
+ let toolStartedAtMs: number | undefined
188
+ let lastHeartbeatAtMs: number = Date.now()
189
+ let shuttingDown = false
190
+
191
+ function activeResourcesSnapshot(): string[] {
192
+ // process.getActiveResourcesInfo() is available in Node ≥17.3 (still marked experimental in the Node docs). It returns
193
+ // a list of resource names (TCPWRAP, TLSWRAP, Timeout, etc.) keeping
194
+ // the event loop alive. That alone tells a user whether the hang is
195
+ // network-bound, timer-bound, or CPU-bound (empty array = likely a
196
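+ // sync infinite loop, which getActiveResourcesInfo wouldn't catch). NOTE(review): Node runs signal handlers on the event loop, so a loop that never yields may never reach this dump — confirm.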
197
+ try {
198
+ const getActiveResourcesInfo = (process as unknown as {
199
+ getActiveResourcesInfo?: () => string[]
200
+ }).getActiveResourcesInfo
201
+ return typeof getActiveResourcesInfo === 'function' ? getActiveResourcesInfo() : []
202
+ } catch {
203
+ return []
204
+ }
205
+ }
206
+
207
+ function diagnosticHint(resources: string[]): string {
208
+ const set = new Set(resources)
209
+ if (set.has('TCPWRAP') || set.has('TLSWRAP')) {
210
+ return 'TCPWRAP/TLSWRAP present — likely a hung HTTP / network request. Add per-request timeouts (e.g. AbortSignal.timeout(...)) inside the tool implementation.'
211
+ }
212
+ if (set.has('Timeout')) {
213
+ return 'Timeout resources only — the tool may be waiting on a setTimeout / sleep. Check for missing clear or long backoff.'
214
+ }
215
+ if (resources.length === 0) {
216
+ return 'No active async resources — likely a sync infinite loop or sync I/O. Inspect tight loops in the tool.'
217
+ }
218
+ return `Active: ${resources.join(', ')}`
219
+ }
220
+
221
+ function dumpDiagnostic(reason: string): void {
222
+ const elapsedMs = toolStartedAtMs ? Date.now() - toolStartedAtMs : 0
223
+ const sinceHeartbeatMs = Date.now() - lastHeartbeatAtMs
224
+ const resources = activeResourcesSnapshot()
225
+ const lines = [
226
+ `[elasticdash-worker] ${reason} — diagnostic snapshot:`,
227
+ ` tool: ${currentToolName ?? '<no tool running>'}`,
228
+ ` elapsed_ms: ${elapsedMs}`,
229
+ ` since_last_heartbeat_ms: ${sinceHeartbeatMs}`,
230
+ ` active_async_resources: ${JSON.stringify(resources)}`,
231
+ ` hint: ${diagnosticHint(resources)}`,
232
+ ]
233
+ process.stderr.write(lines.join('\n') + '\n')
234
+ }
235
+
236
+ function installSignalHandlers(): void {
237
+ const handle = (sig: NodeJS.Signals) => {
238
+ if (shuttingDown) return
239
+ shuttingDown = true
240
+ process.stderr.write(`[elasticdash-worker] received ${sig} — aborting in-flight work\n`)
241
+ dumpDiagnostic(`${sig} received`)
242
+ try { workerAbortController.abort(new Error(`worker received ${sig}`)) } catch { /* may not be wired */ }
243
+ // 2s grace for in-flight async cleanup (flush stdout, close sockets);
244
+ // then force exit. .unref() so the timer itself doesn't keep us alive
245
+ // past natural completion if the abort actually lets cleanup happen.
246
+ setTimeout(() => {
247
+ process.stderr.write(`[elasticdash-worker] grace expired — force exit\n`)
248
+ process.exit(1)
249
+ }, 2_000).unref()
250
+ }
251
+ process.on('SIGTERM', () => handle('SIGTERM'))
252
+ process.on('SIGINT', () => handle('SIGINT'))
253
+ process.on('SIGHUP', () => handle('SIGHUP'))
254
+ }
255
+
256
+ function startToolHeartbeat(toolName: string): () => void {
257
+ currentToolName = toolName
258
+ toolStartedAtMs = Date.now()
259
+ lastHeartbeatAtMs = Date.now()
260
+ const heartbeatMs = Number(process.env.ELASTICDASH_WORKER_HEARTBEAT_MS ?? 10_000)
261
+ if (!(heartbeatMs > 0)) return () => { currentToolName = undefined; toolStartedAtMs = undefined }
262
+ const interval = setInterval(() => {
263
+ lastHeartbeatAtMs = Date.now()
264
+ const elapsedSec = Math.round((Date.now() - (toolStartedAtMs ?? Date.now())) / 1000)
265
+ process.stderr.write(`[elasticdash-worker] alive tool=${toolName} elapsed=${elapsedSec}s\n`)
266
+ }, heartbeatMs)
267
+ return () => {
268
+ clearInterval(interval)
269
+ currentToolName = undefined
270
+ toolStartedAtMs = undefined
271
+ }
272
+ }
273
+
167
274
  async function main() {
168
275
  stage('boot')
276
+ installSignalHandlers()
169
277
  const originalExit = process.exit.bind(process)
170
278
 
171
279
  // Prevent the SDK's tryAutoInitHttpContext from triggering full observability
@@ -244,7 +352,13 @@ async function main() {
244
352
  stage('tool-resolved', { tool: toolName, via: resolvedVia })
245
353
 
246
354
  stage('tool-call-start', { tool: toolName })
247
- const currentOutput = await fn(...args)
355
+ const stopHeartbeat = startToolHeartbeat(toolName)
356
+ let currentOutput: unknown
357
+ try {
358
+ currentOutput = await fn(...args)
359
+ } finally {
360
+ stopHeartbeat()
361
+ }
248
362
  stage('tool-call-end', { tool: toolName })
249
363
  await writeResult({ ok: true, currentOutput })
250
364
  stage('result-written')
@@ -221,8 +221,93 @@ async function loadAndWrapTools(
221
221
  }
222
222
  }
223
223
 
224
+ // ─── Signal-driven shutdown + diagnostic dump ─────────────────────────
225
+ // See tool-runner-worker.ts for the rationale — same pattern, workflow-
226
+ // scoped. When SIGTERM lands here, we dump a snapshot to stderr so the
227
+ // user can see exactly what the workflow was stuck on (workflow name,
228
+ // elapsed, active resources) before the 2s grace and forced exit.
229
+ let currentWorkflowName: string | undefined
230
+ let workflowStartedAtMs: number | undefined
231
+ let lastWfHeartbeatAtMs: number = Date.now()
232
+ let workflowShuttingDown = false
233
+
234
+ function activeResourcesSnapshotWf(): string[] {
235
+ try {
236
+ const getActiveResourcesInfo = (process as unknown as {
237
+ getActiveResourcesInfo?: () => string[]
238
+ }).getActiveResourcesInfo
239
+ return typeof getActiveResourcesInfo === 'function' ? getActiveResourcesInfo() : []
240
+ } catch {
241
+ return []
242
+ }
243
+ }
244
+
245
+ function workflowDiagnosticHint(resources: string[]): string {
246
+ const set = new Set(resources)
247
+ if (set.has('TCPWRAP') || set.has('TLSWRAP')) {
248
+ return 'TCPWRAP/TLSWRAP present — likely a hung HTTP / AI provider request. Check per-request timeouts in workflow tools.'
249
+ }
250
+ if (set.has('Timeout')) {
251
+ return 'Timeout resources only — workflow may be waiting on a sleep / backoff.'
252
+ }
253
+ if (resources.length === 0) {
254
+ return 'No active async resources — likely a sync infinite loop or sync I/O in user code.'
255
+ }
256
+ return `Active: ${resources.join(', ')}`
257
+ }
258
+
259
+ function dumpWorkflowDiagnostic(reason: string): void {
260
+ const elapsedMs = workflowStartedAtMs ? Date.now() - workflowStartedAtMs : 0
261
+ const sinceHeartbeatMs = Date.now() - lastWfHeartbeatAtMs
262
+ const resources = activeResourcesSnapshotWf()
263
+ const lines = [
264
+ `[elasticdash-worker workflow] ${reason} — diagnostic snapshot:`,
265
+ ` workflow: ${currentWorkflowName ?? '<not started>'}`,
266
+ ` elapsed_ms: ${elapsedMs}`,
267
+ ` since_last_heartbeat_ms: ${sinceHeartbeatMs}`,
268
+ ` active_async_resources: ${JSON.stringify(resources)}`,
269
+ ` hint: ${workflowDiagnosticHint(resources)}`,
270
+ ]
271
+ process.stderr.write(lines.join('\n') + '\n')
272
+ }
273
+
274
+ function installWorkflowSignalHandlers(): void {
275
+ const handle = (sig: NodeJS.Signals) => {
276
+ if (workflowShuttingDown) return
277
+ workflowShuttingDown = true
278
+ process.stderr.write(`[elasticdash-worker workflow] received ${sig} — aborting in-flight work\n`)
279
+ dumpWorkflowDiagnostic(`${sig} received`)
280
+ setTimeout(() => {
281
+ process.stderr.write(`[elasticdash-worker workflow] grace expired — force exit\n`)
282
+ process.exit(1)
283
+ }, 2_000).unref()
284
+ }
285
+ process.on('SIGTERM', () => handle('SIGTERM'))
286
+ process.on('SIGINT', () => handle('SIGINT'))
287
+ process.on('SIGHUP', () => handle('SIGHUP'))
288
+ }
289
+
290
+ function startWorkflowHeartbeat(workflowName: string): () => void {
291
+ currentWorkflowName = workflowName
292
+ workflowStartedAtMs = Date.now()
293
+ lastWfHeartbeatAtMs = Date.now()
294
+ const heartbeatMs = Number(process.env.ELASTICDASH_WORKER_HEARTBEAT_MS ?? 10_000)
295
+ if (!(heartbeatMs > 0)) return () => { currentWorkflowName = undefined; workflowStartedAtMs = undefined }
296
+ const interval = setInterval(() => {
297
+ lastWfHeartbeatAtMs = Date.now()
298
+ const elapsedSec = Math.round((Date.now() - (workflowStartedAtMs ?? Date.now())) / 1000)
299
+ process.stderr.write(`[elasticdash-worker workflow] alive workflow=${workflowName} elapsed=${elapsedSec}s\n`)
300
+ }, heartbeatMs)
301
+ return () => {
302
+ clearInterval(interval)
303
+ currentWorkflowName = undefined
304
+ workflowStartedAtMs = undefined
305
+ }
306
+ }
307
+
224
308
  async function main() {
225
309
  stage('boot')
310
+ installWorkflowSignalHandlers()
226
311
  // Keep a reference to the real process.exit so we can call it after flushing stdout.
227
312
  const originalExit = process.exit.bind(process)
228
313
 
@@ -347,7 +432,12 @@ async function main() {
347
432
  // Standardize workflow argument resolution: always pass [input] if args is empty
348
433
  const callArgs = args.length ? args : [input]
349
434
  stage('workflow-call-start', { workflow: workflowName })
350
- currentOutput = await (workflowFn as (...a: unknown[]) => unknown)(...callArgs)
435
+ const stopWfHeartbeat = startWorkflowHeartbeat(workflowName)
436
+ try {
437
+ currentOutput = await (workflowFn as (...a: unknown[]) => unknown)(...callArgs)
438
+ } finally {
439
+ stopWfHeartbeat()
440
+ }
351
441
  stage('workflow-call-end', { workflow: workflowName })
352
442
  console.error('[worker] workflowFn resolved, currentOutput:', currentOutput) // stderr so it's visible
353
443
  }