elasticdash-sdk 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (349) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +775 -0
  3. package/dist/browser-ui.d.ts +43 -0
  4. package/dist/browser-ui.d.ts.map +1 -0
  5. package/dist/browser-ui.js +246 -0
  6. package/dist/browser-ui.js.map +1 -0
  7. package/dist/capture/event.d.ts +33 -0
  8. package/dist/capture/event.d.ts.map +1 -0
  9. package/dist/capture/event.js +2 -0
  10. package/dist/capture/event.js.map +1 -0
  11. package/dist/capture/index.d.ts +4 -0
  12. package/dist/capture/index.d.ts.map +1 -0
  13. package/dist/capture/index.js +4 -0
  14. package/dist/capture/index.js.map +1 -0
  15. package/dist/capture/recorder.d.ts +24 -0
  16. package/dist/capture/recorder.d.ts.map +1 -0
  17. package/dist/capture/recorder.js +46 -0
  18. package/dist/capture/recorder.js.map +1 -0
  19. package/dist/capture/replay.d.ts +20 -0
  20. package/dist/capture/replay.d.ts.map +1 -0
  21. package/dist/capture/replay.js +47 -0
  22. package/dist/capture/replay.js.map +1 -0
  23. package/dist/ci/api-client.d.ts +38 -0
  24. package/dist/ci/api-client.d.ts.map +1 -0
  25. package/dist/ci/api-client.js +96 -0
  26. package/dist/ci/api-client.js.map +1 -0
  27. package/dist/ci/benchmark.d.ts +33 -0
  28. package/dist/ci/benchmark.d.ts.map +1 -0
  29. package/dist/ci/benchmark.js +213 -0
  30. package/dist/ci/benchmark.js.map +1 -0
  31. package/dist/ci/ed-runner.d.ts +48 -0
  32. package/dist/ci/ed-runner.d.ts.map +1 -0
  33. package/dist/ci/ed-runner.js +260 -0
  34. package/dist/ci/ed-runner.js.map +1 -0
  35. package/dist/ci/executor.d.ts +13 -0
  36. package/dist/ci/executor.d.ts.map +1 -0
  37. package/dist/ci/executor.js +542 -0
  38. package/dist/ci/executor.js.map +1 -0
  39. package/dist/ci/git-info.d.ts +17 -0
  40. package/dist/ci/git-info.d.ts.map +1 -0
  41. package/dist/ci/git-info.js +102 -0
  42. package/dist/ci/git-info.js.map +1 -0
  43. package/dist/ci/index.d.ts +6 -0
  44. package/dist/ci/index.d.ts.map +1 -0
  45. package/dist/ci/index.js +4 -0
  46. package/dist/ci/index.js.map +1 -0
  47. package/dist/ci/measurement.d.ts +9 -0
  48. package/dist/ci/measurement.d.ts.map +1 -0
  49. package/dist/ci/measurement.js +15 -0
  50. package/dist/ci/measurement.js.map +1 -0
  51. package/dist/ci/replay.d.ts +31 -0
  52. package/dist/ci/replay.d.ts.map +1 -0
  53. package/dist/ci/replay.js +96 -0
  54. package/dist/ci/replay.js.map +1 -0
  55. package/dist/ci/reporters/default.d.ts +8 -0
  56. package/dist/ci/reporters/default.d.ts.map +1 -0
  57. package/dist/ci/reporters/default.js +46 -0
  58. package/dist/ci/reporters/default.js.map +1 -0
  59. package/dist/ci/reporters/index.d.ts +8 -0
  60. package/dist/ci/reporters/index.d.ts.map +1 -0
  61. package/dist/ci/reporters/index.js +14 -0
  62. package/dist/ci/reporters/index.js.map +1 -0
  63. package/dist/ci/reporters/json.d.ts +8 -0
  64. package/dist/ci/reporters/json.d.ts.map +1 -0
  65. package/dist/ci/reporters/json.js +14 -0
  66. package/dist/ci/reporters/json.js.map +1 -0
  67. package/dist/ci/reporters/junit.d.ts +8 -0
  68. package/dist/ci/reporters/junit.d.ts.map +1 -0
  69. package/dist/ci/reporters/junit.js +48 -0
  70. package/dist/ci/reporters/junit.js.map +1 -0
  71. package/dist/ci/runner.d.ts +3 -0
  72. package/dist/ci/runner.d.ts.map +1 -0
  73. package/dist/ci/runner.js +187 -0
  74. package/dist/ci/runner.js.map +1 -0
  75. package/dist/ci/test-discovery.d.ts +5 -0
  76. package/dist/ci/test-discovery.d.ts.map +1 -0
  77. package/dist/ci/test-discovery.js +11 -0
  78. package/dist/ci/test-discovery.js.map +1 -0
  79. package/dist/ci/test-loader.d.ts +19 -0
  80. package/dist/ci/test-loader.d.ts.map +1 -0
  81. package/dist/ci/test-loader.js +149 -0
  82. package/dist/ci/test-loader.js.map +1 -0
  83. package/dist/ci/test-registry.d.ts +42 -0
  84. package/dist/ci/test-registry.d.ts.map +1 -0
  85. package/dist/ci/test-registry.js +18 -0
  86. package/dist/ci/test-registry.js.map +1 -0
  87. package/dist/ci/trace-schema.d.ts +30 -0
  88. package/dist/ci/trace-schema.d.ts.map +1 -0
  89. package/dist/ci/trace-schema.js +66 -0
  90. package/dist/ci/trace-schema.js.map +1 -0
  91. package/dist/ci/trace-writer.d.ts +16 -0
  92. package/dist/ci/trace-writer.d.ts.map +1 -0
  93. package/dist/ci/trace-writer.js +108 -0
  94. package/dist/ci/trace-writer.js.map +1 -0
  95. package/dist/ci/types.d.ts +108 -0
  96. package/dist/ci/types.d.ts.map +1 -0
  97. package/dist/ci/types.js +3 -0
  98. package/dist/ci/types.js.map +1 -0
  99. package/dist/ci/upload-client.d.ts +74 -0
  100. package/dist/ci/upload-client.d.ts.map +1 -0
  101. package/dist/ci/upload-client.js +195 -0
  102. package/dist/ci/upload-client.js.map +1 -0
  103. package/dist/cli.d.ts +3 -0
  104. package/dist/cli.d.ts.map +1 -0
  105. package/dist/cli.js +716 -0
  106. package/dist/cli.js.map +1 -0
  107. package/dist/core/agent-state.d.ts +47 -0
  108. package/dist/core/agent-state.d.ts.map +1 -0
  109. package/dist/core/agent-state.js +137 -0
  110. package/dist/core/agent-state.js.map +1 -0
  111. package/dist/core/judge-utils.d.ts +22 -0
  112. package/dist/core/judge-utils.d.ts.map +1 -0
  113. package/dist/core/judge-utils.js +211 -0
  114. package/dist/core/judge-utils.js.map +1 -0
  115. package/dist/core/registry.d.ts +28 -0
  116. package/dist/core/registry.d.ts.map +1 -0
  117. package/dist/core/registry.js +52 -0
  118. package/dist/core/registry.js.map +1 -0
  119. package/dist/dashboard-server.d.ts +65 -0
  120. package/dist/dashboard-server.d.ts.map +1 -0
  121. package/dist/dashboard-server.js +3940 -0
  122. package/dist/dashboard-server.js.map +1 -0
  123. package/dist/execution/tool-runner.d.ts +26 -0
  124. package/dist/execution/tool-runner.d.ts.map +1 -0
  125. package/dist/execution/tool-runner.js +316 -0
  126. package/dist/execution/tool-runner.js.map +1 -0
  127. package/dist/html/dashboard.html +2218 -0
  128. package/dist/http.d.ts +14 -0
  129. package/dist/http.d.ts.map +1 -0
  130. package/dist/http.js +13 -0
  131. package/dist/http.js.map +1 -0
  132. package/dist/index.cjs +8102 -0
  133. package/dist/index.d.ts +61 -0
  134. package/dist/index.d.ts.map +1 -0
  135. package/dist/index.js +67 -0
  136. package/dist/index.js.map +1 -0
  137. package/dist/interceptors/ai-interceptor.d.ts +26 -0
  138. package/dist/interceptors/ai-interceptor.d.ts.map +1 -0
  139. package/dist/interceptors/ai-interceptor.js +756 -0
  140. package/dist/interceptors/ai-interceptor.js.map +1 -0
  141. package/dist/interceptors/db-auto.d.ts +8 -0
  142. package/dist/interceptors/db-auto.d.ts.map +1 -0
  143. package/dist/interceptors/db-auto.js +217 -0
  144. package/dist/interceptors/db-auto.js.map +1 -0
  145. package/dist/interceptors/db.d.ts +23 -0
  146. package/dist/interceptors/db.d.ts.map +1 -0
  147. package/dist/interceptors/db.js +137 -0
  148. package/dist/interceptors/db.js.map +1 -0
  149. package/dist/interceptors/http.d.ts +28 -0
  150. package/dist/interceptors/http.d.ts.map +1 -0
  151. package/dist/interceptors/http.js +356 -0
  152. package/dist/interceptors/http.js.map +1 -0
  153. package/dist/interceptors/side-effects.d.ts +7 -0
  154. package/dist/interceptors/side-effects.d.ts.map +1 -0
  155. package/dist/interceptors/side-effects.js +72 -0
  156. package/dist/interceptors/side-effects.js.map +1 -0
  157. package/dist/interceptors/telemetry-push.d.ts +142 -0
  158. package/dist/interceptors/telemetry-push.d.ts.map +1 -0
  159. package/dist/interceptors/telemetry-push.js +463 -0
  160. package/dist/interceptors/telemetry-push.js.map +1 -0
  161. package/dist/interceptors/tool.d.ts +2 -0
  162. package/dist/interceptors/tool.d.ts.map +1 -0
  163. package/dist/interceptors/tool.js +274 -0
  164. package/dist/interceptors/tool.js.map +1 -0
  165. package/dist/interceptors/workflow-ai.d.ts +5 -0
  166. package/dist/interceptors/workflow-ai.d.ts.map +1 -0
  167. package/dist/interceptors/workflow-ai.js +382 -0
  168. package/dist/interceptors/workflow-ai.js.map +1 -0
  169. package/dist/internals/conditional-recorder.d.ts +21 -0
  170. package/dist/internals/conditional-recorder.d.ts.map +1 -0
  171. package/dist/internals/conditional-recorder.js +54 -0
  172. package/dist/internals/conditional-recorder.js.map +1 -0
  173. package/dist/internals/mock-resolver.d.ts +146 -0
  174. package/dist/internals/mock-resolver.d.ts.map +1 -0
  175. package/dist/internals/mock-resolver.js +427 -0
  176. package/dist/internals/mock-resolver.js.map +1 -0
  177. package/dist/matchers/index.d.ts +96 -0
  178. package/dist/matchers/index.d.ts.map +1 -0
  179. package/dist/matchers/index.js +668 -0
  180. package/dist/matchers/index.js.map +1 -0
  181. package/dist/observability.d.ts +82 -0
  182. package/dist/observability.d.ts.map +1 -0
  183. package/dist/observability.js +471 -0
  184. package/dist/observability.js.map +1 -0
  185. package/dist/portal-executor.d.ts +30 -0
  186. package/dist/portal-executor.d.ts.map +1 -0
  187. package/dist/portal-executor.js +324 -0
  188. package/dist/portal-executor.js.map +1 -0
  189. package/dist/portal-server.d.ts +3 -0
  190. package/dist/portal-server.d.ts.map +1 -0
  191. package/dist/portal-server.js +279 -0
  192. package/dist/portal-server.js.map +1 -0
  193. package/dist/proxy/llm-capture.d.ts +14 -0
  194. package/dist/proxy/llm-capture.d.ts.map +1 -0
  195. package/dist/proxy/llm-capture.js +264 -0
  196. package/dist/proxy/llm-capture.js.map +1 -0
  197. package/dist/reporter.d.ts +3 -0
  198. package/dist/reporter.d.ts.map +1 -0
  199. package/dist/reporter.js +72 -0
  200. package/dist/reporter.js.map +1 -0
  201. package/dist/runWorkflowSubprocess.d.ts +14 -0
  202. package/dist/runWorkflowSubprocess.d.ts.map +1 -0
  203. package/dist/runWorkflowSubprocess.js +66 -0
  204. package/dist/runWorkflowSubprocess.js.map +1 -0
  205. package/dist/runner.d.ts +16 -0
  206. package/dist/runner.d.ts.map +1 -0
  207. package/dist/runner.js +138 -0
  208. package/dist/runner.js.map +1 -0
  209. package/dist/socket-connector.d.ts +22 -0
  210. package/dist/socket-connector.d.ts.map +1 -0
  211. package/dist/socket-connector.js +104 -0
  212. package/dist/socket-connector.js.map +1 -0
  213. package/dist/telemetry-batcher.d.ts +56 -0
  214. package/dist/telemetry-batcher.d.ts.map +1 -0
  215. package/dist/telemetry-batcher.js +143 -0
  216. package/dist/telemetry-batcher.js.map +1 -0
  217. package/dist/test-setup.d.ts +12 -0
  218. package/dist/test-setup.d.ts.map +1 -0
  219. package/dist/test-setup.js +13 -0
  220. package/dist/test-setup.js.map +1 -0
  221. package/dist/tool-registry.d.ts +31 -0
  222. package/dist/tool-registry.d.ts.map +1 -0
  223. package/dist/tool-registry.js +73 -0
  224. package/dist/tool-registry.js.map +1 -0
  225. package/dist/tool-runner-worker.d.ts +2 -0
  226. package/dist/tool-runner-worker.d.ts.map +1 -0
  227. package/dist/tool-runner-worker.js +215 -0
  228. package/dist/tool-runner-worker.js.map +1 -0
  229. package/dist/trace-adapter/context.d.ts +72 -0
  230. package/dist/trace-adapter/context.d.ts.map +1 -0
  231. package/dist/trace-adapter/context.js +80 -0
  232. package/dist/trace-adapter/context.js.map +1 -0
  233. package/dist/tracing.d.ts +2 -0
  234. package/dist/tracing.d.ts.map +1 -0
  235. package/dist/tracing.js +59 -0
  236. package/dist/tracing.js.map +1 -0
  237. package/dist/trigger-executor.d.ts +12 -0
  238. package/dist/trigger-executor.d.ts.map +1 -0
  239. package/dist/trigger-executor.js +130 -0
  240. package/dist/trigger-executor.js.map +1 -0
  241. package/dist/types/portal.d.ts +76 -0
  242. package/dist/types/portal.d.ts.map +1 -0
  243. package/dist/types/portal.js +2 -0
  244. package/dist/types/portal.js.map +1 -0
  245. package/dist/utils/debug.d.ts +3 -0
  246. package/dist/utils/debug.d.ts.map +1 -0
  247. package/dist/utils/debug.js +8 -0
  248. package/dist/utils/debug.js.map +1 -0
  249. package/dist/utils/license-error.d.ts +23 -0
  250. package/dist/utils/license-error.d.ts.map +1 -0
  251. package/dist/utils/license-error.js +42 -0
  252. package/dist/utils/license-error.js.map +1 -0
  253. package/dist/utils/redact.d.ts +7 -0
  254. package/dist/utils/redact.d.ts.map +1 -0
  255. package/dist/utils/redact.js +26 -0
  256. package/dist/utils/redact.js.map +1 -0
  257. package/dist/workflow-runner-worker.d.ts +2 -0
  258. package/dist/workflow-runner-worker.d.ts.map +1 -0
  259. package/dist/workflow-runner-worker.js +329 -0
  260. package/dist/workflow-runner-worker.js.map +1 -0
  261. package/dist/workflow-runner.d.ts +14 -0
  262. package/dist/workflow-runner.d.ts.map +1 -0
  263. package/dist/workflow-runner.js +34 -0
  264. package/dist/workflow-runner.js.map +1 -0
  265. package/docs/agent-coding-instructions.md +138 -0
  266. package/docs/agent-integration-guide.md +564 -0
  267. package/docs/agents.md +140 -0
  268. package/docs/dashboard.md +394 -0
  269. package/docs/deno.md +69 -0
  270. package/docs/instrumentation.md +424 -0
  271. package/docs/langfuse-trace-structure.md +145 -0
  272. package/docs/matchers.md +173 -0
  273. package/docs/observability_contract.md +192 -0
  274. package/docs/observability_mode.md +195 -0
  275. package/docs/quickstart.md +621 -0
  276. package/docs/security-compliance.md +566 -0
  277. package/docs/test-writing-guidelines.md +444 -0
  278. package/docs/tools.md +165 -0
  279. package/docs/workflow-modes.md +253 -0
  280. package/package.json +76 -0
  281. package/src/browser-ui.ts +281 -0
  282. package/src/capture/event.ts +30 -0
  283. package/src/capture/index.ts +3 -0
  284. package/src/capture/recorder.ts +62 -0
  285. package/src/capture/replay.ts +55 -0
  286. package/src/ci/api-client.ts +136 -0
  287. package/src/ci/benchmark.ts +257 -0
  288. package/src/ci/ed-runner.ts +351 -0
  289. package/src/ci/executor.ts +671 -0
  290. package/src/ci/git-info.ts +127 -0
  291. package/src/ci/index.ts +5 -0
  292. package/src/ci/measurement.ts +25 -0
  293. package/src/ci/replay.ts +127 -0
  294. package/src/ci/reporters/default.ts +50 -0
  295. package/src/ci/reporters/index.ts +21 -0
  296. package/src/ci/reporters/json.ts +18 -0
  297. package/src/ci/reporters/junit.ts +61 -0
  298. package/src/ci/runner.ts +208 -0
  299. package/src/ci/test-discovery.ts +16 -0
  300. package/src/ci/test-loader.ts +187 -0
  301. package/src/ci/test-registry.ts +62 -0
  302. package/src/ci/trace-schema.ts +96 -0
  303. package/src/ci/trace-writer.ts +107 -0
  304. package/src/ci/types.ts +115 -0
  305. package/src/ci/upload-client.ts +300 -0
  306. package/src/cli.ts +811 -0
  307. package/src/core/agent-state.ts +162 -0
  308. package/src/core/judge-utils.ts +232 -0
  309. package/src/core/registry.ts +92 -0
  310. package/src/dashboard-server.ts +2047 -0
  311. package/src/execution/tool-runner.ts +352 -0
  312. package/src/html/dashboard.html +2218 -0
  313. package/src/http.ts +13 -0
  314. package/src/index.ts +138 -0
  315. package/src/interceptors/ai-interceptor.ts +798 -0
  316. package/src/interceptors/db-auto.ts +243 -0
  317. package/src/interceptors/db.ts +156 -0
  318. package/src/interceptors/http.ts +393 -0
  319. package/src/interceptors/side-effects.ts +83 -0
  320. package/src/interceptors/telemetry-push.ts +537 -0
  321. package/src/interceptors/tool.ts +287 -0
  322. package/src/interceptors/workflow-ai.ts +419 -0
  323. package/src/internals/conditional-recorder.ts +63 -0
  324. package/src/internals/mock-resolver.ts +492 -0
  325. package/src/matchers/index.ts +824 -0
  326. package/src/observability.ts +501 -0
  327. package/src/portal-executor.ts +355 -0
  328. package/src/portal-server.ts +304 -0
  329. package/src/proxy/llm-capture.ts +301 -0
  330. package/src/reporter.ts +81 -0
  331. package/src/runWorkflowSubprocess.ts +74 -0
  332. package/src/runner.ts +178 -0
  333. package/src/socket-connector.ts +117 -0
  334. package/src/telemetry-batcher.ts +191 -0
  335. package/src/test-setup.ts +16 -0
  336. package/src/tool-registry.ts +94 -0
  337. package/src/tool-runner-worker.ts +244 -0
  338. package/src/trace-adapter/context.ts +156 -0
  339. package/src/tracing.ts +62 -0
  340. package/src/trigger-executor.ts +171 -0
  341. package/src/types/agent.d.ts +63 -0
  342. package/src/types/expect.d.ts +81 -0
  343. package/src/types/modules.d.ts +2 -0
  344. package/src/types/portal.ts +69 -0
  345. package/src/utils/debug.ts +8 -0
  346. package/src/utils/license-error.ts +43 -0
  347. package/src/utils/redact.ts +25 -0
  348. package/src/workflow-runner-worker.ts +386 -0
  349. package/src/workflow-runner.ts +58 -0
@@ -0,0 +1,444 @@
1
+ # ElasticDash SDK Test Writing Guidelines
2
+
3
+ This guide covers both testing approaches available in `elasticdash-sdk`:
4
+
5
+ 1. **`defineTest` + benchmarks** — VCR-style CI testing with recorded fixtures (recommended for CI/CD)
6
+ 2. **`aiTest` + matchers** — live workflow testing with the existing runner
7
+
8
+ ## Prerequisites
9
+
10
+ - Node.js >= 20
11
+ - `elasticdash-sdk` installed as a dev dependency
12
+ - For `defineTest`: a TypeScript toolchain (e.g. `tsx`) if writing `.ts` test files
13
+ - Optional: `ELASTICDASH_API_KEY` and `ELASTICDASH_API_URL` for uploading results to the dashboard
14
+
15
+ ---
16
+
17
+ ## Part 1: CI/CD Testing with `defineTest` (Record & Replay)
18
+
19
+ ### Overview
20
+
21
+ ElasticDash CI testing follows a VCR-style record-and-replay pattern:
22
+
23
+ 1. **Record** — run your workflow locally with `ELASTICDASH_CAPTURE_TRACE=1` to capture a trace fixture
24
+ 2. **Write** — create an `ed_tests.ts` file with `defineTest()` calls that reference the fixture
25
+ 3. **Run** — execute `npx ed ed-test` to replay the fixture, measure performance, and compare against benchmarks
26
+ 4. **Upload** — when `ELASTICDASH_API_KEY` and `ELASTICDASH_API_URL` are set, results are automatically sent to the ElasticDash backend for historical analysis
27
+
28
+ ### Quick Start
29
+
30
+ ```bash
31
+ # 1. Record a trace
32
+ ELASTICDASH_CAPTURE_TRACE=1 node your-workflow.js
33
+
34
+ # 2. Write a test (see below)
35
+
36
+ # 3. Run tests locally
37
+ npx ed ed-test --no-upload
38
+
39
+ # 4. Run in CI with upload
40
+ ELASTICDASH_API_KEY=your-key ELASTICDASH_API_URL=https://server.elasticdash.com npx ed ed-test
41
+ ```
42
+
43
+ ### Recording Traces
44
+
45
+ Set the `ELASTICDASH_CAPTURE_TRACE=1` environment variable before running any workflow that uses the SDK's `wrapTool`/`wrapAI` interceptors or `runWorkflow()`.
46
+
47
+ ```bash
48
+ ELASTICDASH_CAPTURE_TRACE=1 tsx your-workflow.ts
49
+ ```
50
+
51
+ This writes a JSON trace file to `.ed_traces/` in the current directory:
52
+
53
+ ```
54
+ .ed_traces/2026-04-19T14-23-07_a7f3.json
55
+ ```
56
+
57
+ The trace contains every tool call and AI call the workflow made, including inputs, outputs, timing, and token usage. Sensitive fields (`authorization`, `api_key`, `password`, `secret`, `token`) are automatically scrubbed to `[REDACTED]`.
58
+
59
+ Add `.ed_traces/` to your `.gitignore` if you don't want fixtures committed, or commit them if you want deterministic CI replay.
60
+
61
+ #### Trace file structure
62
+
63
+ A trace file captures the full workflow execution. Here's what one looks like (simplified from a real trace):
64
+
65
+ ```json
66
+ {
67
+ "trace_id": "2026-04-20T00-57-44_800f",
68
+ "created_at": "2026-04-20T00:57:44.856Z",
69
+ "sdk_version": "unknown",
70
+ "workflow": {
71
+ "name": "chatStreamHandler",
72
+ "input": null,
73
+ "output": null
74
+ },
75
+ "steps": [
76
+ {
77
+ "step_id": "ai_call_0",
78
+ "type": "ai_call",
79
+ "name": "gpt-4o",
80
+ "input": {
81
+ "messages": [
82
+ { "role": "system", "content": "You are an expert that refines user queries..." },
83
+ { "role": "user", "content": "What's the attack of Groudon?" }
84
+ ],
85
+ "temperature": 0.5,
86
+ "max_tokens": 4096
87
+ },
88
+ "output": "Refined Query: \"What is the attack stat of Groudon?\"\nLanguage: \"en\"\nConcepts: [\"Groudon\", \"attack stat\"]\n...",
89
+ "started_at": "2026-04-20T00:57:26.425Z",
90
+ "ended_at": "2026-04-20T00:57:28.626Z",
91
+ "duration_ms": 2201,
92
+ "tokens": { "input": 1726, "output": 57, "total": 1783 }
93
+ },
94
+ {
95
+ "step_id": "tool_call_0",
96
+ "type": "tool_call",
97
+ "name": "queryRefinement",
98
+ "input": {
99
+ "userInput": "What's the attack of Groudon?",
100
+ "userToken": ""
101
+ },
102
+ "output": {
103
+ "refinedQuery": "What is the attack stat of Groudon?",
104
+ "entities": ["groudon details", "pokemon stats"],
105
+ "intentType": "FETCH"
106
+ },
107
+ "started_at": "2026-04-20T00:57:26.424Z",
108
+ "ended_at": "2026-04-20T00:57:28.627Z",
109
+ "duration_ms": 2203,
110
+ "tokens": null
111
+ }
112
+ ]
113
+ }
114
+ ```
115
+
116
+ Key fields to note:
117
+ - **`step_id`** — Used in `defineTest` to target a specific step (e.g. `"ai_call_0"`, `"tool_call_0"`)
118
+ - **`type`** — Either `"ai_call"` or `"tool_call"`; it must match the `target.type` in your test
119
+ - **`input`** — The recorded input/prompt. This is what gets overridden when you use the custom `input` field in `defineTest`
120
+ - **`output`** — The recorded response, used for benchmark comparisons (`output_contains`, `llm_judge`, etc.)
121
+ - **`duration_ms`** / **`tokens`** — The measurements compared against `max_duration_ms` and `max_tokens_total` benchmarks
122
+
123
+ ### Writing ed_tests
124
+
125
+ Create a file named `ed_tests.ts` (or `ed_tests.js`) anywhere in your project. Multiple test files in different directories are all discovered automatically.
126
+
127
+ ```ts
128
+ import { defineTest } from "elasticdash-sdk";
129
+ import { runCheckoutWorkflow } from "./workflows";
130
+
131
+ defineTest({
132
+ name: "checkout_happy_path_latency",
133
+ trace: "../.ed_traces/2026-04-19T14-23-07_a7f3.json",
134
+ target: { type: "tool_call", step_id: "tool_call_2" },
135
+ benchmarks: { max_duration_ms: 2000 },
136
+ run: async () => {
137
+ await runCheckoutWorkflow({ userId: "premium_123" });
138
+ },
139
+ });
140
+
141
+ defineTest({
142
+ name: "checkout_token_budget",
143
+ trace: "../.ed_traces/2026-04-19T14-23-07_a7f3.json",
144
+ target: { type: "ai_call", step_id: "ai_call_0" },
145
+ benchmarks: { max_tokens_total: 5000 },
146
+ run: async () => {
147
+ await runCheckoutWorkflow({ userId: "premium_123" });
148
+ },
149
+ });
150
+ ```
151
+
152
+ #### Field reference
153
+
154
+ | Field | Type | Required | Description |
155
+ |-------|------|----------|-------------|
156
+ | `name` | string | yes | Unique test ID, used for historical comparison |
157
+ | `trace` | string | yes | Path to trace file, relative to the test file's directory |
158
+ | `target` | object | yes | `{ type: "tool_call" \| "ai_call", step_id: "tool_call_0" }` |
159
+ | `benchmarks` | object | yes | At least one of `max_duration_ms` or `max_tokens_total` |
160
+ | `input` | unknown or function | no | Custom input that overrides the trace's recorded input. Can be a static value or an async function for dynamic resolution |
161
+ | `run` | function | yes* | Async function that invokes the workflow. Receives the resolved input (custom or from trace) as its argument. *Required for execution |
162
+ | `timeout_ms` | number | no | Per-test timeout (default: 60000ms) |
163
+
164
+ The `step_id` values (`tool_call_0`, `ai_call_1`, etc.) are found in the trace file's `steps` array. Open the trace JSON to find the step you want to target.
165
+
166
+ #### Custom input
167
+
168
+ By default, the test's input is read from the trace fixture. You can override it with a static value or an async function to source the prompt from anywhere:
169
+
170
+ ```ts
171
+ // Static override — input is passed to run()
172
+ defineTest({
173
+ name: "checkout_custom_prompt",
174
+ trace: "../.ed_traces/trace.json",
175
+ target: { type: "ai_call", step_id: "ai_call_0" },
176
+ benchmarks: { max_tokens_total: 5000 },
177
+ input: { prompt: "Custom prompt text" },
178
+ run: async (input) => { await runCheckoutWorkflow(input); },
179
+ });
180
+
181
+ // Dynamic resolution (database, API, environment, etc.)
182
+ defineTest({
183
+ name: "checkout_dynamic_prompt",
184
+ trace: "../.ed_traces/trace.json",
185
+ target: { type: "ai_call", step_id: "ai_call_0" },
186
+ benchmarks: { max_tokens_total: 5000 },
187
+ input: async () => {
188
+ const row = await db.query("SELECT prompt FROM prompts WHERE id = 42");
189
+ return { prompt: row.prompt };
190
+ },
191
+ run: async (input) => { await runCheckoutWorkflow(input); },
192
+ });
193
+ ```
194
+
195
+ The custom input is passed to `run()` and is what test results and uploads report as the test's input. It does not affect replay matching, which still uses the trace fixture's recorded data.
196
+
197
+ ### Running Tests Locally
198
+
199
+ ```bash
200
+ # Run all tests
201
+ npx ed ed-test
202
+
203
+ # Skip upload (local iteration)
204
+ npx ed ed-test --no-upload
205
+
206
+ # Filter by test name pattern
207
+ npx ed ed-test --filter "checkout_*"
208
+
209
+ # Stop on first failure
210
+ npx ed ed-test --fail-fast
211
+
212
+ # JSON output (for scripting)
213
+ npx ed ed-test --reporter json
214
+
215
+ # JUnit XML (for CI systems)
216
+ npx ed ed-test --reporter junit
217
+ ```
218
+
219
+ #### Exit codes
220
+
221
+ | Code | Meaning |
222
+ |------|---------|
223
+ | 0 | All tests passed |
224
+ | 1 | One or more tests failed |
225
+ | 3 | Configuration error (no tests found, bad files, missing fixtures) |
226
+ | 4 | Upload failed, but tests completed successfully |
227
+
228
+ ### CI Integration
229
+
230
+ #### GitHub Actions
231
+
232
+ ```yaml
233
+ - name: Run ElasticDash tests
234
+ env:
235
+ ELASTICDASH_API_KEY: ${{ secrets.ELASTICDASH_API_KEY }}
236
+ ELASTICDASH_API_URL: https://server.elasticdash.com
237
+ run: npx ed ed-test
238
+ ```
239
+
240
+ #### GitLab CI
241
+
242
+ ```yaml
243
+ ed-test:
244
+ script:
245
+ - npx ed ed-test
246
+ variables:
247
+ ELASTICDASH_API_KEY: $ELASTICDASH_API_KEY
248
+ ELASTICDASH_API_URL: https://server.elasticdash.com
249
+ ```
250
+
251
+ #### CircleCI
252
+
253
+ ```yaml
254
+ - run:
255
+ name: Run ElasticDash tests
256
+ command: npx ed ed-test
257
+ environment:
258
+ ELASTICDASH_API_KEY: ${ELASTICDASH_API_KEY}
259
+ ELASTICDASH_API_URL: https://server.elasticdash.com
260
+ ```
261
+
262
+ Git metadata (commit SHA, branch, PR number) is auto-detected from CI environment variables for GitHub Actions, GitLab CI, CircleCI, and Buildkite. For local runs, `git rev-parse` is used as a fallback.
263
+
264
+ ### Environment Variables
265
+
266
+ | Variable | Description | Default |
267
+ |----------|-------------|---------|
268
+ | `ELASTICDASH_API_KEY` | Project API key for authentication | — |
269
+ | `ELASTICDASH_API_URL` | Backend server URL | — |
270
+ | `ELASTICDASH_CAPTURE_TRACE` | Set to `1` to record a trace fixture | — |
271
+ | `ELASTICDASH_ACCEPT_RERUNS` | Set to `false`, `0`, or `no` to reject rerun/trigger requests from the server. Useful when the SDK is installed across multiple projects and only one should process reruns. | Enabled (accepts reruns) |
272
+ | `ELASTICDASH_PORTAL_PORT` | Port for the portal server | `4574` |
273
+ | `ELASTICDASH_ALLOWED_ORIGINS` | Comma-separated allowed CORS origins for the portal | — |
274
+
275
+ ### How Replay Works
276
+
277
+ During test execution, every `wrapTool` and `wrapAI` call is intercepted. Instead of calling the real service, the SDK looks up the matching step in the trace fixture and returns the recorded output.
278
+
279
+ **Match key:** Each call is matched by `<type>::<name>::<hash(input)>` where the input is canonicalized (sorted keys, normalized whitespace, rounded floats). Calls are matched in order of occurrence.
280
+
281
+ **Replay miss:** If the workflow makes a call that doesn't match any step in the fixture, the test fails immediately with a clear diagnostic:
282
+
283
+ ```
284
+ FAIL checkout_happy_path_latency
285
+ → replay miss: tool_call::stripe.customers.retrieve
286
+ ```
287
+
288
+ This means the fixture is stale or the workflow has changed. Re-record the trace with `ELASTICDASH_CAPTURE_TRACE=1`.
289
+
290
+ ### Benchmarks
291
+
292
+ Phase 3 supports absolute threshold benchmarks:
293
+
294
+ - `max_duration_ms` — the step's recorded duration must not exceed this value
295
+ - `max_tokens_total` — the step's total token usage must not exceed this value (ai_call only)
296
+
297
+ Measurements come from the fixture's recorded values, not live timing. This makes tests deterministic — they always produce the same result for the same fixture.
298
+
299
+ ### Upload & Dashboard
300
+
301
+ When `ELASTICDASH_API_KEY` and `ELASTICDASH_API_URL` are set, results are uploaded to `POST /api/v1/test-runs/create` after all tests complete.
302
+
303
+ If the upload fails after retries, the payload is saved to `.ed_traces/failed_uploads/<run_id>.json` for manual inspection or retry. An upload failure never masks a test failure: if any test failed, the exit code remains 1; if all tests passed but the upload failed, the exit code is 4.
304
+
305
+ ### Troubleshooting
306
+
307
+ **Replay miss errors:**
308
+ The fixture doesn't contain a step matching what the workflow called. Common causes:
309
+ - Workflow code changed since the trace was recorded
310
+ - Non-deterministic inputs (timestamps, UUIDs) in tool call arguments
311
+
312
+ Fix: re-record the trace with `ELASTICDASH_CAPTURE_TRACE=1`.
313
+
314
+ **"test has no run function":**
315
+ The `run` field is required for test execution. Add an async function that invokes your workflow.
316
+
317
+ **Git detection shows "unknown":**
318
+ No `.git` directory or `git` binary found. In CI, ensure the repo is checked out. Locally, run from within a git repository.
319
+
320
+ **Upload auth errors:**
321
+ Check that `ELASTICDASH_API_KEY` is set and valid. The API key must be scoped to the correct project.
322
+
323
+ ---
324
+
325
+ ## Part 2: Live Workflow Testing with `aiTest` (Existing Approach)
326
+
327
+ The `aiTest` + matchers approach runs workflows live and asserts on the trace using custom matchers. This is useful for development-time testing but not for deterministic CI replay.
328
+
329
+ ### Test anatomy
330
+
331
+ - Import test setup once per file: `import 'elasticdash-sdk/dist/test-setup.js'`
332
+ - Each test receives `ctx` with `ctx.trace` to inspect recorded LLM/tool/custom steps.
333
+ - Matchers live on `expect` (already registered by `test-setup`).
334
+ - Files end with `.ai.test.ts` and use the global `aiTest(name, fn)`.
335
+
336
+ ### Hooks
337
+
338
+ - `beforeAll(fn)` / `afterAll(fn)`: run once per file before/after all tests.
339
+ - `beforeEach(fn)` / `afterEach(fn)`: run before/after every test in the file. `afterEach` still runs when the test fails.
340
+
341
+ ### Useful matchers (quick reference)
342
+
343
+ - `toHaveLLMStep(config)`: Assert LLM calls match model/provider/prompt/output filters.
344
+ - `toCallTool(name)`: Assert a tool call occurred.
345
+ - `toHaveCustomStep(config)`: Assert custom (RAG/code/fixed/custom) steps.
346
+ - `toHavePromptWhere(config)`: Filter prompts by substring, then require/include/exclude content (with optional nth/index positional checks).
347
+ - `toMatchSemanticOutput(expected, options?)`: LLM-judged semantic match over combined LLM outputs.
348
+ - `toEvaluateOutputMetric(config)`: LLM-scored numeric metric (0-1) on a specific LLM step's prompt or result, with threshold comparisons.
349
+
350
+ ### Patterns and examples
351
+
352
+ #### Validate the order of steps in a workflow
353
+
354
+ ```ts
355
+ aiTest('prompts occur in order', async (ctx) => {
356
+ await runWorkflow()
357
+
358
+ expect(ctx.trace).toHavePromptWhere({
359
+ filterContains: 'Goal Completion Validator',
360
+ nth: 1,
361
+ })
362
+
363
+ expect(ctx.trace).toHavePromptWhere({
364
+ filterContains: "User's Ultimate Goal:",
365
+ nth: 2,
366
+ })
367
+ })
368
+ ```
369
+
370
+ #### Validate fetched data from RAG/APIs
371
+
372
+ ```ts
373
+ aiTest('RAG includes required source', async (ctx) => {
374
+ await runWorkflow()
375
+
376
+ expect(ctx.trace).toHaveCustomStep({
377
+ kind: 'rag',
378
+ contains: 'pokemon_stats',
379
+ resultContains: 'base_stat',
380
+ })
381
+ })
382
+ ```
383
+
384
+ #### Check LLM output
385
+
386
+ ```ts
387
+ aiTest('planner output returns attack stat', async (ctx) => {
388
+ await runWorkflow()
389
+
390
+ expect(ctx.trace).toHaveLLMStep({
391
+ outputContains: 'attack stat of Pikachu',
392
+ })
393
+ })
394
+ ```
395
+
396
+ #### LLM-as-judge metric scoring
397
+
398
+ ```ts
399
+ aiTest('plan is actionable', async (ctx) => {
400
+ await runWorkflow()
401
+
402
+ await expect(ctx.trace).toEvaluateOutputMetric({
403
+ evaluationPrompt:
404
+ 'Score 0-1: is this execution plan concrete and directly executable? '
405
+ + '1.0 = concrete SQL with specific tables/columns; 0.0 = vague/placeholder.',
406
+ target: 'result',
407
+ nth: 2,
408
+ condition: { atLeast: 0.7 },
409
+ provider: 'claude',
410
+ model: 'claude-3-opus-20240229',
411
+ })
412
+ })
413
+ ```
414
+
415
+ ### Tips
416
+
417
+ - Always `await` async matchers (`toMatchSemanticOutput`, `toEvaluateOutputMetric`).
418
+ - Use `nth`/`index` in `toHavePromptWhere` to avoid false positives when multiple prompts contain similar text.
419
+ - Keep prompts/results concise in tests; log the trace when debugging: `console.log(ctx.trace.getLLMSteps())`.
420
+
421
+ ### Running aiTest tests
422
+
423
+ ```bash
424
+ # Run all *.ai.test.ts files
425
+ elasticdash test
426
+
427
+ # Run in a subdir
428
+ elasticdash test examples/
429
+
430
+ # Single file
431
+ elasticdash run path/to/file.ai.test.ts
432
+ ```
433
+
434
+ ### Minimal scaffold
435
+
436
+ ```ts
437
+ import 'elasticdash-sdk/dist/test-setup.js'
438
+ import { expect } from 'expect'
439
+
440
+ aiTest('example', async (ctx) => {
441
+ await runWorkflow()
442
+ expect(ctx.trace).toHaveLLMStep({ provider: 'openai' })
443
+ })
444
+ ```
package/docs/tools.md ADDED
@@ -0,0 +1,165 @@
1
+ # Tool Recording and Replay
2
+
3
+ ElasticDash automatically records and traces tool calls during workflow execution, providing replay and debugging capabilities.
4
+
5
+ For HTTP response streaming capture (SSE/NDJSON fetch flows), see `README.md` and `docs/quickstart.md#capture-streaming-flows`. That behavior is handled by the HTTP interceptor and is separate from manual tool instrumentation in this document.
6
+
7
+ ## Manual Tool Recording
8
+
9
+ For tools outside the normal import flow, or if you need explicit success/error logging control, use a resilient `recordToolCall` pattern where tracing is isolated from the main service path:
10
+
11
+ ```ts
12
+ import { runSelectQuery } from 'path/to/tool/calls'
13
+
14
+ export const dataService = async (input: any) => {
15
+ const { query } = input as { query: string }
16
+
17
+ return await runSelectQuery(query)
18
+ .then(async (result: any) => {
19
+ try {
20
+ const { recordToolCall } = await import('elasticdash-sdk')
21
+ recordToolCall('dataService', input, result)
22
+ } catch {
23
+ // trace logging errors must not break the main service
24
+ }
25
+ return result
26
+ })
27
+ .catch(async (error: any) => {
28
+ try {
29
+ const { recordToolCall } = await import('elasticdash-sdk')
30
+ recordToolCall('dataService', input, error)
31
+ } catch {
32
+ // trace logging errors must not break the main service
33
+ }
34
+ throw error
35
+ })
36
+ }
37
+ ```
38
+
39
+ Why this pattern is recommended for manual instrumentation:
40
+
41
+ - Build-time or ordinary application runtime contexts (outside the ElasticDash worker) may not have tracing APIs available.
42
+ - Dynamic import plus nested `try/catch` keeps `recordToolCall` best-effort.
43
+ - Service success/failure behavior is preserved even when trace logging fails.
44
+
45
+ If your code may run outside an ElasticDash worker context, the dynamic import keeps it safe:
46
+
47
+ ```ts
48
+ try {
49
+ const { recordToolCall } = await import('elasticdash-sdk')
50
+ recordToolCall('dataService', input, result)
51
+ } catch {
52
+ // no-op outside elasticdash runtime
53
+ }
54
+ ```
55
+
56
+ **Note:** Manual recording is best-effort trace logging. Keep the same resilient pattern (dynamic import + nested `try/catch`) across all tools so trace logging never interrupts core service execution.
57
+
58
+ ## Calling Tools from Workflows
59
+
60
+ **Always call tool functions from `ed_tools.ts` (or `ed_tools.js`), not from their source code locations.**
61
+
62
+ In your workflows, import and use tools through the instrumented export:
63
+
64
+ ```ts
65
+ // ✅ Correct - calls the traced version from ed_tools.ts
66
+ import { dataService } from './ed_tools'
67
+
68
+ export const checkoutWorkflow = async (orderId: string) => {
69
+ const orderData = await dataService({ query: `SELECT * FROM orders WHERE id = ${orderId}` })
70
+ // ... rest of workflow
71
+ }
72
+ ```
73
+
74
+ Not directly from the source file:
75
+ ```ts
76
+ // ❌ Wrong - bypasses tracing instrumentation
77
+ import { runSelectQuery } from './services/dataService'
78
+
79
+ export const checkoutWorkflow = async (orderId: string) => {
80
+ const orderData = await runSelectQuery(`SELECT * FROM orders WHERE id = ${orderId}`)
81
+ // ... rest of workflow
82
+ }
83
+ ```
84
+
85
+ **Why this matters:**
86
+ - Tool calls through `ed_tools.ts` are automatically traced and recorded
87
+ - Direct imports bypass the `recordToolCall` instrumentation
88
+ - Dashboard trace replay requires tools to be called through `ed_tools.ts`
89
+ - LLM agents record each tool call under the `name` exported from `ed_tools.ts`, so importing from the same file keeps the recorded names consistent
90
+
91
+ ## Tool Function Compatibility (`ed_tools.ts/js`)
92
+
93
+ Exports in `ed_tools.ts/js` should be plain callable functions that take serializable input and return serializable output.
94
+
95
+ - Export directly callable functions
96
+ - Use JSON-serializable args/results (object, array, string, number, boolean, or `null`)
97
+ - Avoid exporting framework request/response handlers directly (for example Next.js `NextRequest`/`NextResponse` route handlers)
98
+
99
+ Compatible export example:
100
+
101
+ ```ts
102
+ export async function chargeCard(input: { amount: number; token: string }) {
103
+ return { success: true, transactionId: 'txn-123' }
104
+ }
105
+ ```
106
+
107
+ Not directly compatible as a tool export:
108
+
109
+ ```ts
110
+ // Next.js route handler style
111
+ export async function POST(req: NextRequest): Promise<NextResponse> {
112
+ return NextResponse.json({ ok: true })
113
+ }
114
+ ```
115
+
116
+ If your app uses framework handlers, keep `ed_tools.ts/js` as a plain callable boundary and invoke your framework-specific code behind that boundary.
117
+
118
+ ## Recording Without Passing `ctx.trace`
119
+
120
+ Use Node's `AsyncLocalStorage` to record steps without threading `ctx.trace` through every function:
121
+
122
+ ```ts
123
+ // In your test
124
+ import { setCurrentTrace } from 'elasticdash-sdk'
125
+
126
+ aiTest('flow test', async (ctx) => {
127
+ setCurrentTrace(ctx.trace) // bind the trace to the current async context
128
+ await runFlowWithoutTraceArg() // your existing code
129
+ expect(ctx.trace).toHaveCustomStep({ kind: 'rag', name: 'pokemon-search' })
130
+ })
131
+
132
+ // In your app/flow code (called during the test)
133
+ import { getCurrentTrace } from 'elasticdash-sdk'
134
+
135
+ function runFlowWithoutTraceArg() {
136
+ const trace = getCurrentTrace()
137
+ trace?.recordCustomStep({
138
+ kind: 'rag',
139
+ name: 'pokemon-search',
140
+ payload: { query: 'pikachu attack' },
141
+ result: { ids: [25] },
142
+ tags: ['source:db', 'sort:asc'],
143
+ })
144
+ }
145
+ ```
146
+
147
+ **Notes:**
148
+ - Works per async context; if you spawn detached work (child processes/independent workers), pass `trace` explicitly there.
149
+ - Still compatible with manual DI: you can continue passing `ctx.trace` explicitly if you prefer.
150
+
151
+ ## Optional LLM Capture Proxy (for Supabase Edge / Deno)
152
+
153
+ For environments where Node fetch interception doesn't work (like Supabase Edge Functions or Deno Deploy):
154
+
155
+ 1. Set `ELASTICDASH_LLM_PROXY=1` (optional: `ELASTICDASH_LLM_PROXY_PORT`, default `8787`)
156
+ 2. The runner starts a local proxy and generates a per-test `ELASTICDASH_TRACE_ID`
157
+ 3. Point your LLM client at the proxy via base URL envs:
158
+ ```bash
159
+ OPENAI_BASE_URL=http://localhost:8787/v1
160
+ ANTHROPIC_API_URL=http://localhost:8787
161
+ ```
162
+ 4. Forward the trace ID to your Edge/Deno code (e.g., send an `x-trace-id` header whose value is `ELASTICDASH_TRACE_ID` from the environment)
163
+ 5. The proxy records model/prompt/completion and folds captured steps back into `ctx.trace`
164
+
165
+ When `ELASTICDASH_LLM_PROXY` is unset, the existing Node fetch interceptor remains the default.