elasticdash-sdk 0.2.6 → 0.2.7-beta

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -49,11 +49,13 @@ Add to `.gitignore`:
49
49
  .ed_traces/
50
50
  ```
51
51
 
52
+ > **Do not stop here — Step 1 is not a complete integration.** Without Steps 2–4, no tool calls are wrapped and no traces are produced. Continue to Step 2 before reporting "done" to the user.
53
+
52
54
  ---
53
55
 
54
56
  ## Step 2: Create `ed_tools.ts`
55
57
 
56
- Create `ed_tools.ts` in the project root. This file wraps each tool function with `wrapTool()` for automatic tracing, mocking, and telemetry.
58
+ Create `ed_tools.ts` in the project root. This file wraps each tool function with `edTool()` for automatic tracing, mocking, telemetry, and CLI/MCP rerun discovery. (`edTool` is `wrapTool` + global registry registration — prefer it as the default. Drop down to `wrapTool` only for inline closures that should NOT be discoverable by name.)
57
59
 
58
60
  ### Template
59
61
 
@@ -70,14 +72,14 @@ import { originalTool2 } from './utils/YOUR_SOURCE_2'
70
72
  // ---------------------------------------------------------------------------
71
73
 
72
74
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
73
- type WrapToolFn = <T extends (...args: any[]) => any>(name: string, fn: T) => T
75
+ type EdToolFn = <T extends (...args: any[]) => any>(name: string, fn: T) => T
74
76
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
75
- let wrapTool: WrapToolFn = (_name: string, fn: any) => fn
77
+ let edTool: EdToolFn = (_name: string, fn: any) => fn
76
78
 
77
79
  try {
78
80
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
79
81
  const _edModule = (eval('require') as (id: string) => any)('elasticdash-sdk')
80
- wrapTool = _edModule.wrapTool ?? wrapTool
82
+ edTool = _edModule.edTool ?? _edModule.wrapTool ?? edTool
81
83
  // Share the module instance with ed_workflows.ts so trace hooks use the same context
82
84
  setElasticDashModule(_edModule)
83
85
  } catch {
@@ -88,11 +90,11 @@ try {
88
90
  // Wrapped tools — one export per tool
89
91
  // ---------------------------------------------------------------------------
90
92
 
91
- export const myTool1 = wrapTool('myTool1', async (input: any) => {
93
+ export const myTool1 = edTool('myTool1', async (input: any) => {
92
94
  return await originalTool1(input)
93
95
  })
94
96
 
95
- export const myTool2 = wrapTool('myTool2', async (input: any) => {
97
+ export const myTool2 = edTool('myTool2', async (input: any) => {
96
98
  const { someField } = input as { someField: string }
97
99
  return await originalTool2(someField)
98
100
  })
@@ -100,14 +102,14 @@ export const myTool2 = wrapTool('myTool2', async (input: any) => {
100
102
 
101
103
  ### Key patterns
102
104
 
103
- - **`wrapTool(name, fn)`** wraps the function with automatic tracing, mocking, and telemetry. Falls back to a passthrough if `elasticdash-sdk` is not installed.
105
+ - **`edTool(name, fn)`** wraps the function with automatic tracing, mocking, telemetry, and global registry registration so the CLI `run-tool <name>` and MCP `run_tool` can rerun it by name. Falls back to a passthrough if `elasticdash-sdk` is not installed.
104
106
  - **`eval('require')`** is used instead of `import()` to share the same module instance across `ed_tools.ts` and `ed_workflows.ts`. This avoids ESM/CJS dual-instance issues.
105
- - **`setElasticDashModule`** shares the loaded module with `ed_workflows.ts` so `edStartTrace`/`edEndTrace` use the same tracing context as `wrapTool`.
107
+ - **`setElasticDashModule`** shares the loaded module with `ed_workflows.ts` so `edStartTrace`/`edEndTrace` use the same tracing context as `edTool`.
106
108
  - The exported name (e.g., `myTool1`) can differ from the original function name (e.g., `originalTool1`). The call sites in existing source files will be updated to use the new name in Step 4.
107
109
 
108
110
  ### Important rules
109
111
 
110
- - The string name passed to `wrapTool()` **must match** the exported function name exactly.
112
+ - The string name passed to `edTool()` (or `wrapTool()`) **must match** the exported function name exactly.
111
113
  - Each tool function must accept a single input object and return a plain value (JSON-serializable).
112
114
  - Tool functions must not close over HTTP context, framework state, or database clients — extract pure logic first.
113
115
 
@@ -141,15 +143,43 @@ Every `ed_workflows.ts` should export `edStartTrace` and `edEndTrace`. These are
141
143
 
142
144
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
143
145
  let _ed: any = null
146
+ let _obsInitialised = false
144
147
 
145
148
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
146
149
  export function setElasticDashModule(mod: any): void {
147
150
  _ed = mod
148
151
  }
149
152
 
153
+ /**
154
+ * Initialise observability through the SHARED SDK module instance.
155
+ * Call this once at process startup (e.g. from main.ts or your server
156
+ * entry point) BEFORE any workflow runs. If env vars are set this is
157
+ * also called lazily from edStartTrace, so this explicit form is for
158
+ * projects that want predictable, fail-fast init.
159
+ */
160
+ export function edInitObservability(opts?: { serverUrl?: string; apiKey?: string }): void {
161
+ if (!_ed || _obsInitialised) return
162
+ const serverUrl = opts?.serverUrl
163
+ ?? process.env.ELASTICDASH_API_URL
164
+ ?? process.env.ELASTICDASH_SERVER_URL
165
+ ?? process.env.ELASTICDASH_SERVER
166
+ const apiKey = opts?.apiKey ?? process.env.ELASTICDASH_API_KEY
167
+ if (!serverUrl || !apiKey) return
168
+ try {
169
+ _ed.initObservability({ serverUrl, apiKey })
170
+ _obsInitialised = true
171
+ } catch (err) {
172
+ console.error('[ed_workflows] edInitObservability error:', err)
173
+ }
174
+ }
175
+
150
176
  export const edStartTrace = async (workflowName: string): Promise<void> => {
151
177
  if (!_ed) return
152
178
  try {
179
+ // Lazy init from env vars on first trace — keeps the simple case
180
+ // ("set ELASTICDASH_API_URL + ELASTICDASH_API_KEY, just run") working
181
+ // without an explicit init call.
182
+ if (!_obsInitialised) edInitObservability()
153
183
  await _ed.tryAutoInitHttpContext()
154
184
  _ed.startTrace(workflowName)
155
185
  } catch (err) {
@@ -165,8 +195,32 @@ export const edEndTrace = (): void => {
165
195
  console.error('[ed_workflows] edEndTrace error:', err)
166
196
  }
167
197
  }
198
+
199
+ /**
200
+ * Flush remaining trace events and close the backend connection.
201
+ * Call from a `finally` block at the end of your process lifecycle
202
+ * (CLI: in main() finally; HTTP server: rarely needed — the SDK
203
+ * auto-registers SIGTERM/SIGINT handlers that call this).
204
+ *
205
+ * The SDK's auto-exit hooks (registered by initObservability) are
206
+ * async; for short-lived CLI scripts the process can terminate BEFORE
207
+ * those hooks complete and drop the final event batch. Explicit
208
+ * shutdown via this helper is the only guarantee that the last batch
209
+ * lands.
210
+ */
211
+ export const edShutdownObservability = async (): Promise<void> => {
212
+ if (!_ed || !_obsInitialised) return
213
+ try {
214
+ await _ed.shutdownObservability()
215
+ _obsInitialised = false
216
+ } catch (err) {
217
+ console.error('[ed_workflows] edShutdownObservability error:', err)
218
+ }
219
+ }
168
220
  ```
169
221
 
222
+ > **Why route init through `_ed` instead of importing `initObservability` directly?** The SDK uses AsyncLocalStorage to correlate events. Both `ed_tools.ts` and `ed_workflows.ts` must share the same SDK module instance — that's why `ed_tools.ts` loads the SDK via `eval('require')` and passes it through `setElasticDashModule`. If `main.ts` does `import { initObservability } from 'elasticdash-sdk'` directly, the ESM-loaded copy is a **different module instance** from the CJS-loaded copy that `_ed` references — init writes to one store, `startTrace` reads from another, and you get `[elasticdash] startTrace: observability not initialised` at runtime. Always init through `edInitObservability` from `ed_workflows.ts`.
223
+
170
224
  ### Workflow exports — simple case
171
225
 
172
226
  For non-framework projects where the workflow can be imported directly:
@@ -221,7 +275,7 @@ export const YOUR_WORKFLOW = async (input: {
221
275
  ```
222
276
  ed_tools.ts
223
277
  ├── imports original functions from services/utils
224
- ├── wraps each with wrapTool() for tracing
278
+ ├── wraps each with edTool() for tracing + rerun registration
225
279
  └── exports wrapped versions with the SAME or similar names
226
280
 
227
281
  ed_workflows.ts
@@ -239,6 +293,32 @@ Existing source files (MODIFIED):
239
293
 
240
294
  ### What to do
241
295
 
296
+ **0. Add `edInitObservability` to your entry point.**
297
+
298
+ Call `edInitObservability` once at process startup so observability is wired up through the SAME SDK module instance that `ed_tools.ts` and `ed_workflows.ts` share. Do this BEFORE any workflow runs. Skipping this is the #1 cause of `[elasticdash] startTrace: observability not initialised` errors at runtime.
299
+
300
+ For a CLI / standalone Node script — init at the top, shutdown in a `finally` block:
301
+
302
+ ```ts
303
+ // src/main.ts
304
+ import 'dotenv/config'
305
+ import { edInitObservability, edShutdownObservability } from '../ed_workflows.js'
306
+ import { researchManager } from './manager.js'
307
+
308
+ async function main() {
309
+ edInitObservability() // env vars: ELASTICDASH_API_URL + ELASTICDASH_API_KEY
310
+ try {
311
+ // ... rest of your main() ...
312
+ } finally {
313
+ await edShutdownObservability() // guarantees the final batch lands
314
+ }
315
+ }
316
+ ```
317
+
318
+ The `finally + edShutdownObservability` block is **required for CLIs**. The SDK auto-registers `beforeExit`/`SIGTERM`/`SIGINT` handlers, but those are async; for short-lived scripts the process can tear down before they complete, dropping the final batch.
319
+
320
+ For Next.js / Remix / SvelteKit / Express, call `edInitObservability()` in your framework's instrumentation hook OR at the very top of your server entry file before any route handler is registered. Explicit shutdown is rarely needed — the server stays up; the auto-registered SIGTERM handler covers graceful restarts. Do NOT replace `edInitObservability` with `import { initObservability } from 'elasticdash-sdk'` — that hits a different module instance and the error returns. See Step 3's "Why route init through `_ed`?" callout.
321
+
242
322
  **1. Find every file that calls a tool function and update its imports:**
243
323
 
244
324
  For each tool exported from `ed_tools.ts`, search the codebase for files that import the original function. Update the import to come from `ed_tools` instead.
@@ -433,6 +513,46 @@ This confirms:
433
513
 
434
514
  **If it fails:** Check that `.env` has valid `ELASTICDASH_API_URL` and `ELASTICDASH_API_KEY` values. If the API key is rejected, the user needs to get a new one from https://app.elasticdash.com.
435
515
 
516
+ ### Verifying programmatic init from the integrated app
517
+
518
+ `npx elasticdash observe` is the *CLI* path — it runs `initObservability` in its own process. The actual integrated app (via `edInitObservability` in `ed_workflows.ts`) also calls `initObservability` programmatically, opens the same socket, installs the same AI interceptor, and pushes events on the same batcher. **You do not need `npx elasticdash observe` running for the integrated app to produce traces.**
519
+
520
+ To verify the programmatic path is working, set `ELASTICDASH_DEBUG=1` in the user's `.env` and run the user's app once. You should see these lines on stderr on the first trace:
521
+
522
+ ```
523
+ [elasticdash] Observability initialised — sessionId=<uuid> workflow=<name> server=<url>
524
+ [elasticdash] Socket connected: <socket-id>
525
+ [elasticdash] startTrace: <workflow-name>, capture=false
526
+ [elasticdash] Flushed N events (status 200)
527
+ ```
528
+
529
+ If you see NONE of them, init is not running. The most common causes are the dual-module-instance trap (someone imported `initObservability` directly instead of using `edInitObservability`), missing env vars, or `ed_tools.ts` never being imported by the running code path (so `setElasticDashModule` was never called and `_ed` is null — `edStartTrace` / `edInitObservability` silently return).
530
+
531
+ If you see the first three lines but not the "Flushed" line, the final batch is being dropped on exit — confirm that the entry point calls `edShutdownObservability()` in a `finally` block.
532
+
533
+ ### Where to view the traces
534
+
535
+ Once `[elasticdash] Flushed N events` appears on stderr, the events are live on the backend. Two ways to view them:
536
+
537
+ 1. **Cloud dashboard** — open `https://app.elasticdash.com`, sign in with the project the API key belongs to, and the run shows up in the traces list. This is the simplest path for users who already have a cloud project.
538
+ 2. **Local dashboard** — run `npx elasticdash dashboard` (opens `http://localhost:4573` and proxies the same backend). Useful for debugging since it lets the agent rerun individual workflow steps with mocked tools, browse the captured tool/AI events, and edit prompts in place. See [Workflows Dashboard](dashboard.md) for the full feature set.
539
+
540
+ For framework projects (Next.js / Remix / etc.), the dashboard can also **trigger workflow runs directly against the running dev server** via HTTP-mode reruns — see [HTTP Workflow Mode](../README.md#http-workflow-mode) in the README. Confirm by running `npm run dev` in the user's project, then `npx elasticdash dashboard` in a second terminal, and triggering a rerun from the dashboard UI.
541
+
542
+ **End-to-end flow recap** for the agent reporting "done" to the user:
543
+
544
+ ```
545
+ 1. npm install elasticdash-sdk ← Step 1
546
+ 2. ed_tools.ts wraps tool functions ← Step 2
547
+ 3. ed_workflows.ts has init/start/end/shutdown helpers + workflow exports ← Step 3
548
+ 4. Entry point calls edInitObservability() then runs the workflow, finally edShutdownObservability() ← Step 4
549
+ 5. .env has ELASTICDASH_API_URL + ELASTICDASH_API_KEY ← Step 6
550
+ 6. User runs their app → sees [elasticdash] ... logs on stderr ← this section
551
+ 7. User opens https://app.elasticdash.com or `npx elasticdash dashboard` → sees the trace
552
+ ```
553
+
554
+ Only after step 7 has been confirmed is the integration complete end-to-end. If step 7 fails (the logs say "Flushed" but the trace doesn't appear), the most likely cause is that the API key belongs to a different project than the one the user is viewing — check the project picker in the dashboard.
555
+
436
556
  After validation, stop the observe process (Ctrl+C) and inform the user that ElasticDash is integrated. Provide these commands for ongoing use:
437
557
 
438
558
  ```bash
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "elasticdash-sdk",
3
- "version": "0.2.6",
3
+ "version": "0.2.7-beta",
4
4
  "description": "AI-native SDK for ElasticDash workflow testing, tracing, and observability",
5
5
  "type": "module",
6
6
  "bin": {
@@ -453,6 +453,11 @@ function runWorkflowInSubprocess(
453
453
  },
454
454
  ): Promise<WorkflowSubprocessResult> {
455
455
  return new Promise((resolve) => {
456
+ const startMs = Date.now()
457
+ const elapsed = () => Date.now() - startMs
458
+ const debug = (...a: unknown[]) => {
459
+ if (process.env.ELASTICDASH_DEBUG === '1') console.error(...a)
460
+ }
456
461
  const workerScript = new URL('./workflow-runner-worker.js', import.meta.url).pathname
457
462
  const projectDir = path.dirname(workflowsModulePath)
458
463
  const denoProject = isDenoProject(projectDir)
@@ -472,15 +477,52 @@ function runWorkflowInSubprocess(
472
477
  cwd: projectDir,
473
478
  stdio: ['pipe', 'pipe', 'pipe', 'pipe'],
474
479
  })
480
+ const pid = child.pid ?? -1
481
+ debug(`[elasticdash dashboard] workflow subprocess stage=spawned pid=${pid} elapsedMs=${elapsed()} workflow=${workflowName}`)
482
+
483
+ // Heartbeat — workflows can be long; without this the dashboard is blind.
484
+ // 0 disables. Default 5s.
485
+ const heartbeatMs = Number(process.env.ELASTICDASH_HEARTBEAT_MS ?? 5000)
486
+ const heartbeat = heartbeatMs > 0
487
+ ? setInterval(() => {
488
+ debug(`[elasticdash dashboard] workflow subprocess heartbeat pid=${pid} elapsedMs=${elapsed()} workflow=${workflowName}`)
489
+ }, heartbeatMs)
490
+ : null
491
+
492
+ // Optional kill switch. Default unset = no timeout (preserves prior behavior).
493
+ let timedOut = false
494
+ const timeoutMs = Number(process.env.ELASTICDASH_WORKFLOW_TIMEOUT_MS ?? 0)
495
+ const timeout = timeoutMs > 0
496
+ ? setTimeout(() => {
497
+ timedOut = true
498
+ debug(`[elasticdash dashboard] workflow subprocess TIMEOUT pid=${pid} after ${timeoutMs}ms — sending SIGTERM`)
499
+ try { child.kill('SIGTERM') } catch { /* already dead */ }
500
+ setTimeout(() => {
501
+ try { child.kill('SIGKILL') } catch { /* already dead */ }
502
+ }, 2000)
503
+ }, timeoutMs)
504
+ : null
505
+
506
+ const cleanup = () => {
507
+ if (heartbeat) clearInterval(heartbeat)
508
+ if (timeout) clearTimeout(timeout)
509
+ }
475
510
 
476
511
  let fd3Data = ''
477
512
  let stderr = ''
513
+ let sawFd3 = false
514
+ let sawStdout = false
515
+ let sawStderr = false
478
516
 
479
517
  // Line-buffer stdout so that large result JSON lines split across multiple
480
518
  // data events are reassembled before processing.
481
519
  const WORKFLOW_RESULT_PREFIX = '__ELASTICDASH_RESULT__:'
482
520
  let stdoutBuf = ''
483
521
  child.stdout.on('data', (chunk) => {
522
+ if (!sawStdout) {
523
+ sawStdout = true
524
+ debug(`[elasticdash dashboard] workflow subprocess stage=first-stdout pid=${pid} elapsedMs=${elapsed()}`)
525
+ }
484
526
  stdoutBuf += chunk.toString()
485
527
  const lines = stdoutBuf.split('\n')
486
528
  stdoutBuf = lines.pop() ?? '' // keep last (possibly incomplete) line
@@ -494,15 +536,27 @@ function runWorkflowInSubprocess(
494
536
  }
495
537
  })
496
538
  child.stderr.on('data', (chunk) => {
539
+ if (!sawStderr) {
540
+ sawStderr = true
541
+ debug(`[elasticdash dashboard] workflow subprocess stage=first-stderr pid=${pid} elapsedMs=${elapsed()}`)
542
+ }
497
543
  stderr += chunk.toString()
498
544
  process.stderr.write(chunk)
499
545
  })
500
546
  const fd3 = child.stdio[3] as import('stream').Readable | null
501
547
  fd3?.on('data', (chunk: Buffer | string) => {
548
+ if (!sawFd3) {
549
+ sawFd3 = true
550
+ debug(`[elasticdash dashboard] workflow subprocess stage=first-fd3 pid=${pid} elapsedMs=${elapsed()}`)
551
+ }
502
552
  fd3Data += chunk.toString()
503
553
  })
504
554
 
505
- child.on('close', () => {
555
+ child.on('close', (code, signal) => {
556
+ cleanup()
557
+ const elapsedMs = elapsed()
558
+ debug(`[elasticdash dashboard] workflow subprocess stage=closed pid=${pid} code=${code} signal=${signal ?? 'none'} elapsedMs=${elapsedMs} stderrBytes=${stderr.length} fd3Bytes=${fd3Data.length}`)
559
+
506
560
  // Flush any remaining buffered stdout line (e.g. result with no trailing newline)
507
561
  if (stdoutBuf.startsWith(WORKFLOW_RESULT_PREFIX)) {
508
562
  fd3Data += stdoutBuf.slice(WORKFLOW_RESULT_PREFIX.length)
@@ -514,12 +568,25 @@ function runWorkflowInSubprocess(
514
568
  try {
515
569
  resolve(JSON.parse(fd3Data))
516
570
  return
517
- } catch { /* fall through */ }
571
+ } catch (parseErr) {
572
+ const detail = `[exit=${code} signal=${signal ?? 'none'} elapsedMs=${elapsedMs} pid=${pid}] fd3 payload failed to parse: ${(parseErr as Error).message}`
573
+ resolve({ ok: false, error: detail })
574
+ return
575
+ }
518
576
  }
519
- resolve({ ok: false, error: stderr.trim() || 'Workflow subprocess produced no output.' })
577
+ const stderrExcerpt = stderr.length > 1024 ? `…${stderr.slice(-1024)}` : stderr
578
+ const detail = `[exit=${code} signal=${signal ?? 'none'} elapsedMs=${elapsedMs} pid=${pid} stderrBytes=${stderr.length}]`
579
+ const baseError = timedOut
580
+ ? `Workflow subprocess timed out after ${timeoutMs}ms`
581
+ : (stderr.trim() || 'Workflow subprocess produced no output.')
582
+ const errorMsg = stderr.trim()
583
+ ? `${baseError} ${detail}`
584
+ : `${baseError} ${detail}${stderrExcerpt ? `\nLast stderr: ${stderrExcerpt}` : ''}`
585
+ resolve({ ok: false, error: errorMsg })
520
586
  })
521
587
 
522
588
  child.on('error', (err) => {
589
+ cleanup()
523
590
  const hint = denoProject && (err as NodeJS.ErrnoException).code === 'ENOENT'
524
591
  ? ' (Deno project detected — ensure "deno" is installed and available in PATH)'
525
592
  : ''
@@ -544,6 +611,7 @@ function runWorkflowInSubprocess(
544
611
  })
545
612
  child.stdin.write(payload)
546
613
  child.stdin.end() // Always close stdin to avoid subprocess hang
614
+ debug(`[elasticdash dashboard] workflow subprocess stage=payload-written pid=${pid} elapsedMs=${elapsed()} payloadBytes=${payload.length}`)
547
615
  })
548
616
  }
549
617
 
@@ -118,6 +118,7 @@ export function runToolInSubprocess(
118
118
  return new Promise((resolve) => {
119
119
  debugLog('[elasticdash portal] Spawning tool subprocess', { toolsModulePath, toolName, args, frozenEventsCount: frozenEvents?.length ?? 0 })
120
120
  const startMs = Date.now()
121
+ const elapsed = () => Date.now() - startMs
121
122
  const workerScript = resolveWorkerScript('../tool-runner-worker.js')
122
123
  const projectDir = path.dirname(toolsModulePath)
123
124
  const denoProject = isDenoProject(projectDir)
@@ -136,15 +137,50 @@ export function runToolInSubprocess(
136
137
  cwd: projectDir,
137
138
  stdio: ['pipe', 'pipe', 'pipe'],
138
139
  })
140
+ const pid = child.pid ?? -1
141
+ debugLog(`[elasticdash portal] tool subprocess stage=spawned pid=${pid} elapsedMs=${elapsed()} tool=${toolName}`)
142
+
143
+ // Heartbeat: prove the parent is still waiting on a live child. 0 disables.
144
+ const heartbeatMs = Number(process.env.ELASTICDASH_HEARTBEAT_MS ?? 5000)
145
+ const heartbeat = heartbeatMs > 0
146
+ ? setInterval(() => {
147
+ debugLog(`[elasticdash portal] tool subprocess heartbeat pid=${pid} elapsedMs=${elapsed()} tool=${toolName}`)
148
+ }, heartbeatMs)
149
+ : null
150
+
151
+ // Optional kill switch. Default unset = no timeout (preserves prior behavior).
152
+ let timedOut = false
153
+ const timeoutMs = Number(process.env.ELASTICDASH_TOOL_TIMEOUT_MS ?? 0)
154
+ const timeout = timeoutMs > 0
155
+ ? setTimeout(() => {
156
+ timedOut = true
157
+ debugLog(`[elasticdash portal] tool subprocess TIMEOUT pid=${pid} after ${timeoutMs}ms — sending SIGTERM`)
158
+ try { child.kill('SIGTERM') } catch { /* already dead */ }
159
+ setTimeout(() => {
160
+ try { child.kill('SIGKILL') } catch { /* already dead */ }
161
+ }, 2000)
162
+ }, timeoutMs)
163
+ : null
164
+
165
+ const cleanup = () => {
166
+ if (heartbeat) clearInterval(heartbeat)
167
+ if (timeout) clearTimeout(timeout)
168
+ }
139
169
 
140
170
  const RESULT_PREFIX = '__ELASTICDASH_RESULT__:'
141
171
  let resultLine = ''
142
172
  let stderr = ''
173
+ let sawStdout = false
174
+ let sawStderr = false
143
175
 
144
176
  // Line-buffer stdout so that large result JSON lines split across multiple
145
177
  // data events are reassembled before processing.
146
178
  let stdoutBuf = ''
147
179
  child.stdout.on('data', (chunk: Buffer) => {
180
+ if (!sawStdout) {
181
+ sawStdout = true
182
+ debugLog(`[elasticdash portal] tool subprocess stage=first-stdout pid=${pid} elapsedMs=${elapsed()}`)
183
+ }
148
184
  stdoutBuf += chunk.toString()
149
185
  const lines = stdoutBuf.split('\n')
150
186
  stdoutBuf = lines.pop() ?? '' // keep last (possibly incomplete) line
@@ -157,12 +193,18 @@ export function runToolInSubprocess(
157
193
  }
158
194
  })
159
195
  child.stderr.on('data', (chunk: Buffer) => {
196
+ if (!sawStderr) {
197
+ sawStderr = true
198
+ debugLog(`[elasticdash portal] tool subprocess stage=first-stderr pid=${pid} elapsedMs=${elapsed()}`)
199
+ }
160
200
  stderr += chunk.toString()
161
201
  process.stderr.write(chunk)
162
202
  })
163
203
 
164
- child.on('close', () => {
165
- const currentDurationMs = Date.now() - startMs
204
+ child.on('close', (code, signal) => {
205
+ cleanup()
206
+ const currentDurationMs = elapsed()
207
+ debugLog(`[elasticdash portal] tool subprocess stage=closed pid=${pid} code=${code} signal=${signal ?? 'none'} elapsedMs=${currentDurationMs} stderrBytes=${stderr.length}`)
166
208
 
167
209
  // Flush any remaining buffered stdout line (e.g. result with no trailing newline)
168
210
  if (stdoutBuf.startsWith(RESULT_PREFIX)) {
@@ -175,17 +217,31 @@ export function runToolInSubprocess(
175
217
  try {
176
218
  resolve({ ...JSON.parse(resultLine), currentDurationMs })
177
219
  return
178
- } catch { /* fall through */ }
220
+ } catch (parseErr) {
221
+ const detail = `[exit=${code} signal=${signal ?? 'none'} elapsedMs=${currentDurationMs} pid=${pid}] resultLine failed to parse: ${(parseErr as Error).message}`
222
+ resolve({ ok: false, error: detail, currentDurationMs })
223
+ return
224
+ }
179
225
  }
180
- resolve({ ok: false, error: stderr.trim() || 'Tool subprocess produced no output.', currentDurationMs })
226
+
227
+ const stderrExcerpt = stderr.length > 1024 ? `…${stderr.slice(-1024)}` : stderr
228
+ const detail = `[exit=${code} signal=${signal ?? 'none'} elapsedMs=${currentDurationMs} pid=${pid} stderrBytes=${stderr.length}]`
229
+ const baseError = timedOut
230
+ ? `Tool subprocess timed out after ${timeoutMs}ms`
231
+ : (stderr.trim() || 'Tool subprocess produced no output.')
232
+ const errorMsg = stderr.trim()
233
+ ? `${baseError} ${detail}`
234
+ : `${baseError} ${detail}${stderrExcerpt ? `\nLast stderr: ${stderrExcerpt}` : ''}`
235
+ resolve({ ok: false, error: errorMsg, currentDurationMs })
181
236
  })
182
237
 
183
238
  child.on('error', (err) => {
239
+ cleanup()
184
240
  const hint = denoProject && (err as NodeJS.ErrnoException).code === 'ENOENT'
185
241
  ? ' (Deno project detected — ensure "deno" is installed and available in PATH)'
186
242
  : ''
187
243
  debugLog(`[elasticdash portal] Failed to spawn tool subprocess: ${err.message}${hint}`)
188
- resolve({ ok: false, error: `Failed to spawn tool subprocess: ${err.message}${hint}`, currentDurationMs: Date.now() - startMs })
244
+ resolve({ ok: false, error: `Failed to spawn tool subprocess: ${err.message}${hint}`, currentDurationMs: elapsed() })
189
245
  })
190
246
 
191
247
  const payload = JSON.stringify({
@@ -196,6 +252,7 @@ export function runToolInSubprocess(
196
252
  })
197
253
  child.stdin.write(payload)
198
254
  child.stdin.end()
255
+ debugLog(`[elasticdash portal] tool subprocess stage=payload-written pid=${pid} elapsedMs=${elapsed()} payloadBytes=${payload.length}`)
199
256
  })
200
257
  }
201
258
 
@@ -23,6 +23,13 @@ import { pathToFileURL } from 'node:url'
23
23
 
24
24
  const RESULT_PREFIX = '__ELASTICDASH_RESULT__:'
25
25
 
26
+ const WORKER_START_MS = Date.now()
27
+ function stage(name: string, extra?: Record<string, unknown>): void {
28
+ if (process.env.ELASTICDASH_DEBUG !== '1') return
29
+ const tail = extra ? ' ' + Object.entries(extra).map(([k, v]) => `${k}=${typeof v === 'string' ? v : JSON.stringify(v)}`).join(' ') : ''
30
+ process.stderr.write(`[elasticdash-worker tool] stage=${name} pid=${process.pid} elapsedMs=${Date.now() - WORKER_START_MS}${tail}\n`)
31
+ }
32
+
26
33
  function writeResult(result: unknown): Promise<void> {
27
34
  return new Promise((resolve, reject) => {
28
35
  process.stdout.write(RESULT_PREFIX + JSON.stringify(result) + '\n', (err) =>
@@ -158,6 +165,7 @@ function installFrozenFetchFallback(frozenEvents: FrozenEvent[]): void {
158
165
  }
159
166
 
160
167
  async function main() {
168
+ stage('boot')
161
169
  const originalExit = process.exit.bind(process)
162
170
 
163
171
  // Prevent the SDK's tryAutoInitHttpContext from triggering full observability
@@ -175,6 +183,7 @@ async function main() {
175
183
  for await (const chunk of process.stdin) {
176
184
  raw += chunk
177
185
  }
186
+ stage('stdin-eof', { bytes: raw.length })
178
187
 
179
188
  let payload: { toolsModulePath: string; toolName: string; args: unknown[]; frozenEvents?: FrozenEvent[] }
180
189
  try {
@@ -184,6 +193,7 @@ async function main() {
184
193
  originalExit(1)
185
194
  return
186
195
  }
196
+ stage('payload-parsed')
187
197
 
188
198
  const { toolsModulePath, toolName, args, frozenEvents } = payload
189
199
 
@@ -193,12 +203,16 @@ async function main() {
193
203
  const hasFrozen = frozenEvents && frozenEvents.length > 0
194
204
  if (hasFrozen) {
195
205
  await setupFrozenContext(frozenEvents)
206
+ stage('frozen-context-ready', { count: frozenEvents.length })
207
+ } else {
208
+ stage('frozen-context-skipped')
196
209
  }
197
210
 
198
211
  try {
199
212
  let mod: any
200
213
  try {
201
214
  mod = await import(pathToFileURL(toolsModulePath).href)
215
+ stage('tool-module-imported')
202
216
  } catch (importErr) {
203
217
  const ie = importErr as Error
204
218
  await writeResult({ ok: false, error: `Failed to import tool module: ${ie.stack || ie.message}` })
@@ -210,31 +224,37 @@ async function main() {
210
224
  // as long as their containing module is reachable from toolsModulePath's
211
225
  // import graph. Falls back to ed_tools-style module export lookup.
212
226
  let fn: ((...a: unknown[]) => unknown) | undefined
227
+ let resolvedVia = 'none'
213
228
  try {
214
229
  const reg = await import('./tool-registry.js')
215
230
  const registered = reg.getRegisteredTool(toolName)
216
- if (registered) fn = registered.wrapped
231
+ if (registered) { fn = registered.wrapped; resolvedVia = 'registry' }
217
232
  } catch {
218
233
  // Registry module not available (older SDK build); fall through to export lookup.
219
234
  }
220
235
  if (!fn) {
221
236
  const exported = mod[toolName]
222
- if (typeof exported === 'function') fn = exported
237
+ if (typeof exported === 'function') { fn = exported; resolvedVia = 'module-export' }
223
238
  }
224
239
  if (typeof fn !== 'function') {
225
240
  await writeResult({ ok: false, error: `"${toolName}" not found via edTool() registry or as an exported function in the module.` })
226
241
  originalExit(1)
227
242
  return
228
243
  }
244
+ stage('tool-resolved', { tool: toolName, via: resolvedVia })
229
245
 
246
+ stage('tool-call-start', { tool: toolName })
230
247
  const currentOutput = await fn(...args)
248
+ stage('tool-call-end', { tool: toolName })
231
249
  await writeResult({ ok: true, currentOutput })
250
+ stage('result-written')
232
251
  originalExit(0)
233
252
  } catch (e) {
234
253
  const err = e as Error
235
254
  const errorMsg = err.stack || err.message || String(e)
236
255
  process.stderr.write(`[elasticdash-worker] Tool execution failed:\n${errorMsg}\n`)
237
256
  await writeResult({ ok: false, error: errorMsg })
257
+ stage('result-written', { ok: false })
238
258
  originalExit(1)
239
259
  } finally {
240
260
  if (hasFrozen) restoreFrozenFetch()
@@ -101,6 +101,8 @@ export async function executeTrigger(
101
101
  const runs: StepRunResult[] = []
102
102
 
103
103
  for (let i = 0; i < trigger.runCount; i++) {
104
+ const runStart = Date.now()
105
+ debugLog(`[elasticdash] Trigger ${trigger.triggerId} step=${stepIndex + 1}/${totalSteps} name=${step.eventName} run=${i + 1}/${trigger.runCount} phase=start`)
104
106
  const result = await executePortalTask(
105
107
  {
106
108
  taskId: `trigger-${trigger.triggerId}-${step.eventName}-${i}`,
@@ -130,7 +132,7 @@ export async function executeTrigger(
130
132
  usageTotalTokens: result.usage?.totalTokens,
131
133
  })
132
134
 
133
- debugLog(`[elasticdash] Trigger ${trigger.triggerId} step=${step.eventName} run=${i} ok=${result.ok}`)
135
+ debugLog(`[elasticdash] Trigger ${trigger.triggerId} step=${stepIndex + 1}/${totalSteps} name=${step.eventName} run=${i + 1}/${trigger.runCount} phase=done ok=${result.ok} elapsedMs=${Date.now() - runStart}`)
134
136
  }
135
137
 
136
138
  stepResult = {