npm - elasticdash-sdk - Versions diffs - 0.2.0 - Mend

elasticdash-sdk 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (349) hide show

package/LICENSE +21 -0
package/README.md +775 -0
package/dist/browser-ui.d.ts +43 -0
package/dist/browser-ui.d.ts.map +1 -0
package/dist/browser-ui.js +246 -0
package/dist/browser-ui.js.map +1 -0
package/dist/capture/event.d.ts +33 -0
package/dist/capture/event.d.ts.map +1 -0
package/dist/capture/event.js +2 -0
package/dist/capture/event.js.map +1 -0
package/dist/capture/index.d.ts +4 -0
package/dist/capture/index.d.ts.map +1 -0
package/dist/capture/index.js +4 -0
package/dist/capture/index.js.map +1 -0
package/dist/capture/recorder.d.ts +24 -0
package/dist/capture/recorder.d.ts.map +1 -0
package/dist/capture/recorder.js +46 -0
package/dist/capture/recorder.js.map +1 -0
package/dist/capture/replay.d.ts +20 -0
package/dist/capture/replay.d.ts.map +1 -0
package/dist/capture/replay.js +47 -0
package/dist/capture/replay.js.map +1 -0
package/dist/ci/api-client.d.ts +38 -0
package/dist/ci/api-client.d.ts.map +1 -0
package/dist/ci/api-client.js +96 -0
package/dist/ci/api-client.js.map +1 -0
package/dist/ci/benchmark.d.ts +33 -0
package/dist/ci/benchmark.d.ts.map +1 -0
package/dist/ci/benchmark.js +213 -0
package/dist/ci/benchmark.js.map +1 -0
package/dist/ci/ed-runner.d.ts +48 -0
package/dist/ci/ed-runner.d.ts.map +1 -0
package/dist/ci/ed-runner.js +260 -0
package/dist/ci/ed-runner.js.map +1 -0
package/dist/ci/executor.d.ts +13 -0
package/dist/ci/executor.d.ts.map +1 -0
package/dist/ci/executor.js +542 -0
package/dist/ci/executor.js.map +1 -0
package/dist/ci/git-info.d.ts +17 -0
package/dist/ci/git-info.d.ts.map +1 -0
package/dist/ci/git-info.js +102 -0
package/dist/ci/git-info.js.map +1 -0
package/dist/ci/index.d.ts +6 -0
package/dist/ci/index.d.ts.map +1 -0
package/dist/ci/index.js +4 -0
package/dist/ci/index.js.map +1 -0
package/dist/ci/measurement.d.ts +9 -0
package/dist/ci/measurement.d.ts.map +1 -0
package/dist/ci/measurement.js +15 -0
package/dist/ci/measurement.js.map +1 -0
package/dist/ci/replay.d.ts +31 -0
package/dist/ci/replay.d.ts.map +1 -0
package/dist/ci/replay.js +96 -0
package/dist/ci/replay.js.map +1 -0
package/dist/ci/reporters/default.d.ts +8 -0
package/dist/ci/reporters/default.d.ts.map +1 -0
package/dist/ci/reporters/default.js +46 -0
package/dist/ci/reporters/default.js.map +1 -0
package/dist/ci/reporters/index.d.ts +8 -0
package/dist/ci/reporters/index.d.ts.map +1 -0
package/dist/ci/reporters/index.js +14 -0
package/dist/ci/reporters/index.js.map +1 -0
package/dist/ci/reporters/json.d.ts +8 -0
package/dist/ci/reporters/json.d.ts.map +1 -0
package/dist/ci/reporters/json.js +14 -0
package/dist/ci/reporters/json.js.map +1 -0
package/dist/ci/reporters/junit.d.ts +8 -0
package/dist/ci/reporters/junit.d.ts.map +1 -0
package/dist/ci/reporters/junit.js +48 -0
package/dist/ci/reporters/junit.js.map +1 -0
package/dist/ci/runner.d.ts +3 -0
package/dist/ci/runner.d.ts.map +1 -0
package/dist/ci/runner.js +187 -0
package/dist/ci/runner.js.map +1 -0
package/dist/ci/test-discovery.d.ts +5 -0
package/dist/ci/test-discovery.d.ts.map +1 -0
package/dist/ci/test-discovery.js +11 -0
package/dist/ci/test-discovery.js.map +1 -0
package/dist/ci/test-loader.d.ts +19 -0
package/dist/ci/test-loader.d.ts.map +1 -0
package/dist/ci/test-loader.js +149 -0
package/dist/ci/test-loader.js.map +1 -0
package/dist/ci/test-registry.d.ts +42 -0
package/dist/ci/test-registry.d.ts.map +1 -0
package/dist/ci/test-registry.js +18 -0
package/dist/ci/test-registry.js.map +1 -0
package/dist/ci/trace-schema.d.ts +30 -0
package/dist/ci/trace-schema.d.ts.map +1 -0
package/dist/ci/trace-schema.js +66 -0
package/dist/ci/trace-schema.js.map +1 -0
package/dist/ci/trace-writer.d.ts +16 -0
package/dist/ci/trace-writer.d.ts.map +1 -0
package/dist/ci/trace-writer.js +108 -0
package/dist/ci/trace-writer.js.map +1 -0
package/dist/ci/types.d.ts +108 -0
package/dist/ci/types.d.ts.map +1 -0
package/dist/ci/types.js +3 -0
package/dist/ci/types.js.map +1 -0
package/dist/ci/upload-client.d.ts +74 -0
package/dist/ci/upload-client.d.ts.map +1 -0
package/dist/ci/upload-client.js +195 -0
package/dist/ci/upload-client.js.map +1 -0
package/dist/cli.d.ts +3 -0
package/dist/cli.d.ts.map +1 -0
package/dist/cli.js +716 -0
package/dist/cli.js.map +1 -0
package/dist/core/agent-state.d.ts +47 -0
package/dist/core/agent-state.d.ts.map +1 -0
package/dist/core/agent-state.js +137 -0
package/dist/core/agent-state.js.map +1 -0
package/dist/core/judge-utils.d.ts +22 -0
package/dist/core/judge-utils.d.ts.map +1 -0
package/dist/core/judge-utils.js +211 -0
package/dist/core/judge-utils.js.map +1 -0
package/dist/core/registry.d.ts +28 -0
package/dist/core/registry.d.ts.map +1 -0
package/dist/core/registry.js +52 -0
package/dist/core/registry.js.map +1 -0
package/dist/dashboard-server.d.ts +65 -0
package/dist/dashboard-server.d.ts.map +1 -0
package/dist/dashboard-server.js +3940 -0
package/dist/dashboard-server.js.map +1 -0
package/dist/execution/tool-runner.d.ts +26 -0
package/dist/execution/tool-runner.d.ts.map +1 -0
package/dist/execution/tool-runner.js +316 -0
package/dist/execution/tool-runner.js.map +1 -0
package/dist/html/dashboard.html +2218 -0
package/dist/http.d.ts +14 -0
package/dist/http.d.ts.map +1 -0
package/dist/http.js +13 -0
package/dist/http.js.map +1 -0
package/dist/index.cjs +8102 -0
package/dist/index.d.ts +61 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +67 -0
package/dist/index.js.map +1 -0
package/dist/interceptors/ai-interceptor.d.ts +26 -0
package/dist/interceptors/ai-interceptor.d.ts.map +1 -0
package/dist/interceptors/ai-interceptor.js +756 -0
package/dist/interceptors/ai-interceptor.js.map +1 -0
package/dist/interceptors/db-auto.d.ts +8 -0
package/dist/interceptors/db-auto.d.ts.map +1 -0
package/dist/interceptors/db-auto.js +217 -0
package/dist/interceptors/db-auto.js.map +1 -0
package/dist/interceptors/db.d.ts +23 -0
package/dist/interceptors/db.d.ts.map +1 -0
package/dist/interceptors/db.js +137 -0
package/dist/interceptors/db.js.map +1 -0
package/dist/interceptors/http.d.ts +28 -0
package/dist/interceptors/http.d.ts.map +1 -0
package/dist/interceptors/http.js +356 -0
package/dist/interceptors/http.js.map +1 -0
package/dist/interceptors/side-effects.d.ts +7 -0
package/dist/interceptors/side-effects.d.ts.map +1 -0
package/dist/interceptors/side-effects.js +72 -0
package/dist/interceptors/side-effects.js.map +1 -0
package/dist/interceptors/telemetry-push.d.ts +142 -0
package/dist/interceptors/telemetry-push.d.ts.map +1 -0
package/dist/interceptors/telemetry-push.js +463 -0
package/dist/interceptors/telemetry-push.js.map +1 -0
package/dist/interceptors/tool.d.ts +2 -0
package/dist/interceptors/tool.d.ts.map +1 -0
package/dist/interceptors/tool.js +274 -0
package/dist/interceptors/tool.js.map +1 -0
package/dist/interceptors/workflow-ai.d.ts +5 -0
package/dist/interceptors/workflow-ai.d.ts.map +1 -0
package/dist/interceptors/workflow-ai.js +382 -0
package/dist/interceptors/workflow-ai.js.map +1 -0
package/dist/internals/conditional-recorder.d.ts +21 -0
package/dist/internals/conditional-recorder.d.ts.map +1 -0
package/dist/internals/conditional-recorder.js +54 -0
package/dist/internals/conditional-recorder.js.map +1 -0
package/dist/internals/mock-resolver.d.ts +146 -0
package/dist/internals/mock-resolver.d.ts.map +1 -0
package/dist/internals/mock-resolver.js +427 -0
package/dist/internals/mock-resolver.js.map +1 -0
package/dist/matchers/index.d.ts +96 -0
package/dist/matchers/index.d.ts.map +1 -0
package/dist/matchers/index.js +668 -0
package/dist/matchers/index.js.map +1 -0
package/dist/observability.d.ts +82 -0
package/dist/observability.d.ts.map +1 -0
package/dist/observability.js +471 -0
package/dist/observability.js.map +1 -0
package/dist/portal-executor.d.ts +30 -0
package/dist/portal-executor.d.ts.map +1 -0
package/dist/portal-executor.js +324 -0
package/dist/portal-executor.js.map +1 -0
package/dist/portal-server.d.ts +3 -0
package/dist/portal-server.d.ts.map +1 -0
package/dist/portal-server.js +279 -0
package/dist/portal-server.js.map +1 -0
package/dist/proxy/llm-capture.d.ts +14 -0
package/dist/proxy/llm-capture.d.ts.map +1 -0
package/dist/proxy/llm-capture.js +264 -0
package/dist/proxy/llm-capture.js.map +1 -0
package/dist/reporter.d.ts +3 -0
package/dist/reporter.d.ts.map +1 -0
package/dist/reporter.js +72 -0
package/dist/reporter.js.map +1 -0
package/dist/runWorkflowSubprocess.d.ts +14 -0
package/dist/runWorkflowSubprocess.d.ts.map +1 -0
package/dist/runWorkflowSubprocess.js +66 -0
package/dist/runWorkflowSubprocess.js.map +1 -0
package/dist/runner.d.ts +16 -0
package/dist/runner.d.ts.map +1 -0
package/dist/runner.js +138 -0
package/dist/runner.js.map +1 -0
package/dist/socket-connector.d.ts +22 -0
package/dist/socket-connector.d.ts.map +1 -0
package/dist/socket-connector.js +104 -0
package/dist/socket-connector.js.map +1 -0
package/dist/telemetry-batcher.d.ts +56 -0
package/dist/telemetry-batcher.d.ts.map +1 -0
package/dist/telemetry-batcher.js +143 -0
package/dist/telemetry-batcher.js.map +1 -0
package/dist/test-setup.d.ts +12 -0
package/dist/test-setup.d.ts.map +1 -0
package/dist/test-setup.js +13 -0
package/dist/test-setup.js.map +1 -0
package/dist/tool-registry.d.ts +31 -0
package/dist/tool-registry.d.ts.map +1 -0
package/dist/tool-registry.js +73 -0
package/dist/tool-registry.js.map +1 -0
package/dist/tool-runner-worker.d.ts +2 -0
package/dist/tool-runner-worker.d.ts.map +1 -0
package/dist/tool-runner-worker.js +215 -0
package/dist/tool-runner-worker.js.map +1 -0
package/dist/trace-adapter/context.d.ts +72 -0
package/dist/trace-adapter/context.d.ts.map +1 -0
package/dist/trace-adapter/context.js +80 -0
package/dist/trace-adapter/context.js.map +1 -0
package/dist/tracing.d.ts +2 -0
package/dist/tracing.d.ts.map +1 -0
package/dist/tracing.js +59 -0
package/dist/tracing.js.map +1 -0
package/dist/trigger-executor.d.ts +12 -0
package/dist/trigger-executor.d.ts.map +1 -0
package/dist/trigger-executor.js +130 -0
package/dist/trigger-executor.js.map +1 -0
package/dist/types/portal.d.ts +76 -0
package/dist/types/portal.d.ts.map +1 -0
package/dist/types/portal.js +2 -0
package/dist/types/portal.js.map +1 -0
package/dist/utils/debug.d.ts +3 -0
package/dist/utils/debug.d.ts.map +1 -0
package/dist/utils/debug.js +8 -0
package/dist/utils/debug.js.map +1 -0
package/dist/utils/license-error.d.ts +23 -0
package/dist/utils/license-error.d.ts.map +1 -0
package/dist/utils/license-error.js +42 -0
package/dist/utils/license-error.js.map +1 -0
package/dist/utils/redact.d.ts +7 -0
package/dist/utils/redact.d.ts.map +1 -0
package/dist/utils/redact.js +26 -0
package/dist/utils/redact.js.map +1 -0
package/dist/workflow-runner-worker.d.ts +2 -0
package/dist/workflow-runner-worker.d.ts.map +1 -0
package/dist/workflow-runner-worker.js +329 -0
package/dist/workflow-runner-worker.js.map +1 -0
package/dist/workflow-runner.d.ts +14 -0
package/dist/workflow-runner.d.ts.map +1 -0
package/dist/workflow-runner.js +34 -0
package/dist/workflow-runner.js.map +1 -0
package/docs/agent-coding-instructions.md +138 -0
package/docs/agent-integration-guide.md +564 -0
package/docs/agents.md +140 -0
package/docs/dashboard.md +394 -0
package/docs/deno.md +69 -0
package/docs/instrumentation.md +424 -0
package/docs/langfuse-trace-structure.md +145 -0
package/docs/matchers.md +173 -0
package/docs/observability_contract.md +192 -0
package/docs/observability_mode.md +195 -0
package/docs/quickstart.md +621 -0
package/docs/security-compliance.md +566 -0
package/docs/test-writing-guidelines.md +444 -0
package/docs/tools.md +165 -0
package/docs/workflow-modes.md +253 -0
package/package.json +76 -0
package/src/browser-ui.ts +281 -0
package/src/capture/event.ts +30 -0
package/src/capture/index.ts +3 -0
package/src/capture/recorder.ts +62 -0
package/src/capture/replay.ts +55 -0
package/src/ci/api-client.ts +136 -0
package/src/ci/benchmark.ts +257 -0
package/src/ci/ed-runner.ts +351 -0
package/src/ci/executor.ts +671 -0
package/src/ci/git-info.ts +127 -0
package/src/ci/index.ts +5 -0
package/src/ci/measurement.ts +25 -0
package/src/ci/replay.ts +127 -0
package/src/ci/reporters/default.ts +50 -0
package/src/ci/reporters/index.ts +21 -0
package/src/ci/reporters/json.ts +18 -0
package/src/ci/reporters/junit.ts +61 -0
package/src/ci/runner.ts +208 -0
package/src/ci/test-discovery.ts +16 -0
package/src/ci/test-loader.ts +187 -0
package/src/ci/test-registry.ts +62 -0
package/src/ci/trace-schema.ts +96 -0
package/src/ci/trace-writer.ts +107 -0
package/src/ci/types.ts +115 -0
package/src/ci/upload-client.ts +300 -0
package/src/cli.ts +811 -0
package/src/core/agent-state.ts +162 -0
package/src/core/judge-utils.ts +232 -0
package/src/core/registry.ts +92 -0
package/src/dashboard-server.ts +2047 -0
package/src/execution/tool-runner.ts +352 -0
package/src/html/dashboard.html +2218 -0
package/src/http.ts +13 -0
package/src/index.ts +138 -0
package/src/interceptors/ai-interceptor.ts +798 -0
package/src/interceptors/db-auto.ts +243 -0
package/src/interceptors/db.ts +156 -0
package/src/interceptors/http.ts +393 -0
package/src/interceptors/side-effects.ts +83 -0
package/src/interceptors/telemetry-push.ts +537 -0
package/src/interceptors/tool.ts +287 -0
package/src/interceptors/workflow-ai.ts +419 -0
package/src/internals/conditional-recorder.ts +63 -0
package/src/internals/mock-resolver.ts +492 -0
package/src/matchers/index.ts +824 -0
package/src/observability.ts +501 -0
package/src/portal-executor.ts +355 -0
package/src/portal-server.ts +304 -0
package/src/proxy/llm-capture.ts +301 -0
package/src/reporter.ts +81 -0
package/src/runWorkflowSubprocess.ts +74 -0
package/src/runner.ts +178 -0
package/src/socket-connector.ts +117 -0
package/src/telemetry-batcher.ts +191 -0
package/src/test-setup.ts +16 -0
package/src/tool-registry.ts +94 -0
package/src/tool-runner-worker.ts +244 -0
package/src/trace-adapter/context.ts +156 -0
package/src/tracing.ts +62 -0
package/src/trigger-executor.ts +171 -0
package/src/types/agent.d.ts +63 -0
package/src/types/expect.d.ts +81 -0
package/src/types/modules.d.ts +2 -0
package/src/types/portal.ts +69 -0
package/src/utils/debug.ts +8 -0
package/src/utils/license-error.ts +43 -0
package/src/utils/redact.ts +25 -0
package/src/workflow-runner-worker.ts +386 -0
package/src/workflow-runner.ts +58 -0

package/docs/test-writing-guidelines.md ADDED Viewed

@@ -0,0 +1,444 @@
+# ElasticDash SDK Test Writing Guidelines
+This guide covers both testing approaches available in `elasticdash-sdk`:
+1. **`defineTest` + benchmarks** — VCR-style CI testing with recorded fixtures (recommended for CI/CD)
+2. **`aiTest` + matchers** — live workflow testing with the existing runner
+## Prerequisites
+- Node.js >= 20
+- `elasticdash-sdk` installed as a dev dependency
+- For `defineTest`: a TypeScript toolchain (e.g. `tsx`) if writing `.ts` test files
+- Optional: `ELASTICDASH_API_KEY` for uploading results to the dashboard
+---
+## Part 1: CI/CD Testing with `defineTest` (Record & Replay)
+### Overview
+ElasticDash CI testing follows a VCR-style record-and-replay pattern:
+1. **Record** — run your workflow locally with `ELASTICDASH_CAPTURE_TRACE=1` to capture a trace fixture
+2. **Write** — create an `ed_tests.ts` file with `defineTest()` calls that reference the fixture
+3. **Run** — execute `npx ed ed-test` to replay the fixture, measure performance, and compare against benchmarks
+4. **Upload** — results are automatically sent to the ElasticDash backend for historical analysis
+### Quick Start
+```bash
+# 1. Record a trace
+ELASTICDASH_CAPTURE_TRACE=1 node your-workflow.js
+# 2. Write a test (see below)
+# 3. Run tests locally
+npx ed ed-test --no-upload
+# 4. Run in CI with upload
+ELASTICDASH_API_KEY=your-key ELASTICDASH_API_URL=https://server.elasticdash.com npx ed ed-test
+```
+### Recording Traces
+Set the `ELASTICDASH_CAPTURE_TRACE=1` environment variable before running any workflow that uses the SDK's `wrapTool`/`wrapAI` interceptors or `runWorkflow()`.
+```bash
+ELASTICDASH_CAPTURE_TRACE=1 tsx your-workflow.ts
+```
+This writes a JSON trace file to `.ed_traces/` in the current directory:
+```
+.ed_traces/2026-04-19T14-23-07_a7f3.json
+```
+The trace contains every tool call and AI call the workflow made, including inputs, outputs, timing, and token usage. Sensitive fields (`authorization`, `api_key`, `password`, `secret`, `token`) are automatically scrubbed to `[REDACTED]`.
+Add `.ed_traces/` to your `.gitignore` if you don't want fixtures committed, or commit them if you want deterministic CI replay.
+#### Trace file structure
+A trace file captures the full workflow execution. Here's what one looks like (simplified from a real trace):
+```json
+{
+  "trace_id": "2026-04-20T00-57-44_800f",
+  "created_at": "2026-04-20T00:57:44.856Z",
+  "sdk_version": "unknown",
+  "workflow": {
+    "name": "chatStreamHandler",
+    "input": null,
+    "output": null
+  },
+  "steps": [
+    {
+      "step_id": "ai_call_0",
+      "type": "ai_call",
+      "name": "gpt-4o",
+      "input": {
+        "messages": [
+          { "role": "system", "content": "You are an expert that refines user queries..." },
+          { "role": "user", "content": "What's the attack of Groudon?" }
+        ],
+        "temperature": 0.5,
+        "max_tokens": 4096
+      },
+      "output": "Refined Query: \"What is the attack stat of Groudon?\"\nLanguage: \"en\"\nConcepts: [\"Groudon\", \"attack stat\"]\n...",
+      "started_at": "2026-04-20T00:57:26.425Z",
+      "ended_at": "2026-04-20T00:57:28.626Z",
+      "duration_ms": 2201,
+      "tokens": { "input": 1726, "output": 57, "total": 1783 }
+    },
+    {
+      "step_id": "tool_call_0",
+      "type": "tool_call",
+      "name": "queryRefinement",
+      "input": {
+        "userInput": "What's the attack of Groudon?",
+        "userToken": ""
+      },
+      "output": {
+        "refinedQuery": "What is the attack stat of Groudon?",
+        "entities": ["groudon details", "pokemon stats"],
+        "intentType": "FETCH"
+      },
+      "started_at": "2026-04-20T00:57:26.424Z",
+      "ended_at": "2026-04-20T00:57:28.627Z",
+      "duration_ms": 2203,
+      "tokens": null
+    }
+  ]
+}
+```
+Key fields to note:
+- **`step_id`** — Used in `defineTest` to target a specific step (e.g. `"ai_call_0"`, `"tool_call_0"`)
+- **`type`** — Either `"ai_call"` or `"tool_call"`, must match the `target.type` in your test
+- **`input`** — The recorded input/prompt. This is what gets overridden when you use the custom `input` field in `defineTest`
+- **`output`** — The recorded response, used for benchmark comparisons (`output_contains`, `llm_judge`, etc.)
+- **`duration_ms`** / **`tokens`** — The measurements compared against `max_duration_ms` and `max_tokens_total` benchmarks
+### Writing ed_tests
+Create a file named `ed_tests.ts` (or `ed_tests.js`) anywhere in your project. Multiple test files in different directories are all discovered automatically.
+```ts
+import { defineTest } from "elasticdash-sdk";
+import { runCheckoutWorkflow } from "./workflows";
+defineTest({
+  name: "checkout_happy_path_latency",
+  trace: "../.ed_traces/2026-04-19T14-23-07_a7f3.json",
+  target: { type: "tool_call", step_id: "tool_call_2" },
+  benchmarks: { max_duration_ms: 2000 },
+  run: async () => {
+    await runCheckoutWorkflow({ userId: "premium_123" });
+  },
+});
+defineTest({
+  name: "checkout_token_budget",
+  trace: "../.ed_traces/2026-04-19T14-23-07_a7f3.json",
+  target: { type: "ai_call", step_id: "ai_call_0" },
+  benchmarks: { max_tokens_total: 5000 },
+  run: async () => {
+    await runCheckoutWorkflow({ userId: "premium_123" });
+  },
+});
+```
+#### Field reference
+| Field | Type | Required | Description |
+|-------|------|----------|-------------|
+| `name` | string | yes | Unique test ID, used for historical comparison |
+| `trace` | string | yes | Path to trace file, relative to the test file's directory |
+| `target` | object | yes | `{ type: "tool_call" \| "ai_call", step_id: "tool_call_0" }` |
+| `benchmarks` | object | yes | At least one of `max_duration_ms` or `max_tokens_total` |
+| `input` | unknown or function | no | Custom input that overrides the trace's recorded input. Can be a static value or an async function for dynamic resolution |
+| `run` | function | yes* | Async function that invokes the workflow. Receives the resolved input (custom or from trace) as its argument. *Required for execution |
+| `timeout_ms` | number | no | Per-test timeout (default: 60000ms) |
+The `step_id` values (`tool_call_0`, `ai_call_1`, etc.) are found in the trace file's `steps` array. Open the trace JSON to find the step you want to target.
+#### Custom input
+By default, the test's input is read from the trace fixture. You can override it with a static value or an async function to source the prompt from anywhere:
+```ts
+// Static override — input is passed to run()
+defineTest({
+  name: "checkout_custom_prompt",
+  trace: "../.ed_traces/trace.json",
+  target: { type: "ai_call", step_id: "ai_call_0" },
+  benchmarks: { max_tokens_total: 5000 },
+  input: { prompt: "Custom prompt text" },
+  run: async (input) => { await runCheckoutWorkflow(input); },
+});
+// Dynamic resolution (database, API, environment, etc.)
+defineTest({
+  name: "checkout_dynamic_prompt",
+  trace: "../.ed_traces/trace.json",
+  target: { type: "ai_call", step_id: "ai_call_0" },
+  benchmarks: { max_tokens_total: 5000 },
+  input: async () => {
+    const row = await db.query("SELECT prompt FROM prompts WHERE id = 42");
+    return { prompt: row.prompt };
+  },
+  run: async (input) => { await runCheckoutWorkflow(input); },
+});
+```
+The custom input is passed to `run()` and is recorded as the reported input in test results and uploads. It does not change replay matching, which still uses the trace fixture's recorded data.
+### Running Tests Locally
+```bash
+# Run all tests
+npx ed ed-test
+# Skip upload (local iteration)
+npx ed ed-test --no-upload
+# Filter by test name pattern
+npx ed ed-test --filter "checkout_*"
+# Stop on first failure
+npx ed ed-test --fail-fast
+# JSON output (for scripting)
+npx ed ed-test --reporter json
+# JUnit XML (for CI systems)
+npx ed ed-test --reporter junit
+```
+#### Exit codes
+| Code | Meaning |
+|------|---------|
+| 0 | All tests passed |
+| 1 | One or more tests failed |
+| 3 | Configuration error (no tests found, bad files, missing fixtures) |
+| 4 | Upload failed, but tests completed successfully |
+### CI Integration
+#### GitHub Actions
+```yaml
+- name: Run ElasticDash tests
+  env:
+    ELASTICDASH_API_KEY: ${{ secrets.ELASTICDASH_API_KEY }}
+    ELASTICDASH_API_URL: https://server.elasticdash.com
+  run: npx ed ed-test
+```
+#### GitLab CI
+```yaml
+ed-test:
+  script:
+    - npx ed ed-test
+  variables:
+    ELASTICDASH_API_KEY: $ELASTICDASH_API_KEY
+    ELASTICDASH_API_URL: https://server.elasticdash.com
+```
+#### CircleCI
+```yaml
+- run:
+    name: Run ElasticDash tests
+    command: npx ed ed-test
+    environment:
+      ELASTICDASH_API_KEY: ${ELASTICDASH_API_KEY}
+      ELASTICDASH_API_URL: https://server.elasticdash.com
+```
+Git metadata (commit SHA, branch, PR number) is auto-detected from CI environment variables for GitHub Actions, GitLab CI, CircleCI, and Buildkite. For local runs, `git rev-parse` is used as a fallback.
+### Environment Variables
+| Variable | Description | Default |
+|----------|-------------|---------|
+| `ELASTICDASH_API_KEY` | Project API key for authentication | — |
+| `ELASTICDASH_API_URL` | Backend server URL | — |
+| `ELASTICDASH_CAPTURE_TRACE` | Set to `1` to record a trace fixture | — |
+| `ELASTICDASH_ACCEPT_RERUNS` | Set to `false`, `0`, or `no` to reject rerun/trigger requests from the server. Useful when the SDK is installed across multiple projects and only one should process reruns. | Enabled (accepts reruns) |
+| `ELASTICDASH_PORTAL_PORT` | Port for the portal server | `4574` |
+| `ELASTICDASH_ALLOWED_ORIGINS` | Comma-separated allowed CORS origins for the portal | — |
+### How Replay Works
+During test execution, every `wrapTool` and `wrapAI` call is intercepted. Instead of calling the real service, the SDK looks up the matching step in the trace fixture and returns the recorded output.
+**Match key:** Each call is matched by `<type>::<name>::<hash(input)>` where the input is canonicalized (sorted keys, normalized whitespace, rounded floats). Calls are matched in order of occurrence.
+**Replay miss:** If the workflow makes a call that doesn't match any step in the fixture, the test fails immediately with a clear diagnostic:
+```
+FAIL  checkout_happy_path_latency
+      → replay miss: tool_call::stripe.customers.retrieve
+```
+This means the fixture is stale or the workflow has changed. Re-record the trace with `ELASTICDASH_CAPTURE_TRACE=1`.
+### Benchmarks
+Phase 3 supports absolute threshold benchmarks:
+- `max_duration_ms` — the step's recorded duration must not exceed this value
+- `max_tokens_total` — the step's total token usage must not exceed this value (ai_call only)
+Measurements come from the fixture's recorded values, not live timing. This makes tests deterministic — they always produce the same result for the same fixture.
+### Upload & Dashboard
+When `ELASTICDASH_API_KEY` and `ELASTICDASH_API_URL` are set, results are uploaded to `POST /api/v1/test-runs/create` after all tests complete.
+If the upload fails after retries, the payload is saved to `.ed_traces/failed_uploads/<run_id>.json` for manual inspection or retry. An upload failure never masks a test failure: if any test failed, the exit code stays 1; if all tests passed but the upload failed, the exit code is 4.
+### Troubleshooting
+**Replay miss errors:**
+The fixture doesn't contain a step matching what the workflow called. Common causes:
+- Workflow code changed since the trace was recorded
+- Non-deterministic inputs (timestamps, UUIDs) in tool call arguments
+Fix: re-record the trace with `ELASTICDASH_CAPTURE_TRACE=1`.
+**"test has no run function":**
+The `run` field is required for test execution. Add an async function that invokes your workflow.
+**Git detection shows "unknown":**
+No `.git` directory or `git` binary found. In CI, ensure the repo is checked out. Locally, run from within a git repository.
+**Upload auth errors:**
+Check that `ELASTICDASH_API_KEY` is set and valid. The API key must be scoped to the correct project.
+---
+## Part 2: Live Workflow Testing with `aiTest` (Existing Approach)
+The `aiTest` + matchers approach runs workflows live and asserts on the trace using custom matchers. This is useful for development-time testing but not for deterministic CI replay.
+### Test anatomy
+- Import test setup once per file: `import 'elasticdash-sdk/dist/test-setup.js'`
+- Each test receives `ctx` with `ctx.trace` to inspect recorded LLM/tool/custom steps.
+- Matchers live on `expect` (already registered by `test-setup`).
+- Files end with `.ai.test.ts` and use the global `aiTest(name, fn)`.
+### Hooks
+- `beforeAll(fn)` / `afterAll(fn)`: run once per file before/after all tests.
+- `beforeEach(fn)` / `afterEach(fn)`: run before/after every test in the file. `afterEach` still runs when the test fails.
+### Useful matchers (quick reference)
+- `toHaveLLMStep(config)`: Assert LLM calls match model/provider/prompt/output filters.
+- `toCallTool(name)`: Assert a tool call occurred.
+- `toHaveCustomStep(config)`: Assert custom (RAG/code/fixed/custom) steps.
+- `toHavePromptWhere(config)`: Filter prompts by substring, then require/include/exclude content (with optional nth/index positional checks).
+- `toMatchSemanticOutput(expected, options?)`: LLM-judged semantic match over combined LLM outputs.
+- `toEvaluateOutputMetric(config)`: LLM-scored numeric metric (0-1) on a specific LLM step's prompt or result, with threshold comparisons.
+### Patterns and examples
+#### Validate the order of steps in a workflow
+```ts
+aiTest('prompts occur in order', async (ctx) => {
+  await runWorkflow()
+  expect(ctx.trace).toHavePromptWhere({
+    filterContains: 'Goal Completion Validator',
+    nth: 1,
+  })
+  expect(ctx.trace).toHavePromptWhere({
+    filterContains: "User's Ultimate Goal:",
+    nth: 2,
+  })
+})
+```
+#### Validate fetched data from RAG/APIs
+```ts
+aiTest('RAG includes required source', async (ctx) => {
+  await runWorkflow()
+  expect(ctx.trace).toHaveCustomStep({
+    kind: 'rag',
+    contains: 'pokemon_stats',
+    resultContains: 'base_stat',
+  })
+})
+```
+#### Check LLM output
+```ts
+aiTest('planner output returns attack stat', async (ctx) => {
+  await runWorkflow()
+  expect(ctx.trace).toHaveLLMStep({
+    outputContains: 'attack stat of Pikachu',
+  })
+})
+```
+#### LLM-as-judge metric scoring
+```ts
+aiTest('plan is actionable', async (ctx) => {
+  await runWorkflow()
+  await expect(ctx.trace).toEvaluateOutputMetric({
+    evaluationPrompt:
+      'Score 0-1: is this execution plan concrete and directly executable? '
+      + '1.0 = concrete SQL with specific tables/columns; 0.0 = vague/placeholder.',
+    target: 'result',
+    nth: 2,
+    condition: { atLeast: 0.7 },
+    provider: 'claude',
+    model: 'claude-3-opus-20240229',
+  })
+})
+```
+### Tips
+- Always `await` async matchers (`toMatchSemanticOutput`, `toEvaluateOutputMetric`).
+- Use `nth`/`index` in `toHavePromptWhere` to avoid false positives when multiple prompts contain similar text.
+- Keep prompts/results concise in tests; log the trace when debugging: `console.log(ctx.trace.getLLMSteps())`.
+### Running aiTest tests
+```bash
+# Run all *.ai.test.ts files
+elasticdash test
+# Run in a subdir
+elasticdash test examples/
+# Single file
+elasticdash run path/to/file.ai.test.ts
+```
+### Minimal scaffold
+```ts
+import 'elasticdash-sdk/dist/test-setup.js'
+import { expect } from 'expect'
+aiTest('example', async (ctx) => {
+  await runWorkflow()
+  expect(ctx.trace).toHaveLLMStep({ provider: 'openai' })
+})
+```

package/docs/tools.md ADDED Viewed

@@ -0,0 +1,165 @@
+# Tool Recording and Replay
+ElasticDash automatically records and traces tool calls during workflow execution, providing replay and debugging capabilities.
+For HTTP response streaming capture (SSE/NDJSON fetch flows), see `README.md` and `docs/quickstart.md#capture-streaming-flows`. That behavior is handled by the HTTP interceptor and is separate from manual tool instrumentation in this document.
+## Manual Tool Recording
+For tools outside the normal import flow, or if you need explicit success/error logging control, use a resilient `recordToolCall` pattern where tracing is isolated from the main service path:
+```ts
+import { runSelectQuery } from 'path/to/tool/calls'
+export const dataService = async (input: any) => {
+  const { query } = input as { query: string }
+  return await runSelectQuery(query)
+    .then(async (result: any) => {
+      try {
+        const { recordToolCall } = await import('elasticdash-sdk')
+        recordToolCall('dataService', input, result)
+      } catch {
+        // trace logging errors must not break the main service
+      }
+      return result
+    })
+    .catch(async (error: any) => {
+      try {
+        const { recordToolCall } = await import('elasticdash-sdk')
+        recordToolCall('dataService', input, error)
+      } catch {
+        // trace logging errors must not break the main service
+      }
+      throw error
+    })
+}
+```
+Why this pattern is recommended for manual instrumentation:
+- Build-time or regular application runtime contexts (outside the ElasticDash worker) may not have tracing APIs available.
+- Dynamic import plus nested `try/catch` keeps `recordToolCall` best-effort.
+- Service success/failure behavior is preserved even when trace logging fails.
+If your code may run outside the ElasticDash worker context, the dynamic import keeps behavior safe:
+```ts
+try {
+  const { recordToolCall } = await import('elasticdash-sdk')
+  recordToolCall('dataService', input, result)
+} catch {
+  // no-op outside elasticdash runtime
+}
+```
+**Note:** Manual recording is best-effort trace logging. Keep the same resilient pattern (dynamic import + nested `try/catch`) across all tools so trace logging never interrupts core service execution.
+## Calling Tools from Workflows
+**Always call tool functions from `ed_tools.ts` (or `ed_tools.js`), not from their source code locations.**
+In your workflows, import and use tools through the instrumented export:
+```ts
+// ✅ Correct - calls the traced version from ed_tools.ts
+import { dataService } from './ed_tools'
+export const checkoutWorkflow = async (orderId: string) => {
+  const orderData = await dataService({ query: `SELECT * FROM orders WHERE id = ${orderId}` })
+  // ... rest of workflow
+}
+```
+Not directly from the source file:
+```ts
+// ❌ Wrong - bypasses tracing instrumentation
+import { runSelectQuery } from './services/dataService'
+export const checkoutWorkflow = async (orderId: string) => {
+  const orderData = await runSelectQuery(`SELECT * FROM orders WHERE id = ${orderId}`)
+  // ... rest of workflow
+}
+```
+**Why this matters:**
+- Tool calls through `ed_tools.ts` are automatically traced and recorded
+- Direct imports bypass the `recordToolCall` instrumentation
+- Dashboard trace replay requires tools to be called through `ed_tools.ts`
+- When an LLM agent calls a tool, the call is recorded under the `name` from `ed_tools.ts`, so importing from the same file ensures the names match
+## Tool Function Compatibility (`ed_tools.ts/js`)
+Exports in `ed_tools.ts/js` should be plain callable functions that take serializable input and return serializable output.
+- Export directly callable functions
+- Use JSON-serializable args/results (object, array, string, number, boolean, or `null`)
+- Avoid exporting framework request/response handlers directly (for example Next.js `NextRequest`/`NextResponse` route handlers)
+Compatible export example:
+```ts
+export async function chargeCard(input: { amount: number; token: string }) {
+  return { success: true, transactionId: 'txn-123' }
+}
+```
+Not directly compatible as a tool export:
+```ts
+// Next.js route handler style
+export async function POST(req: NextRequest): Promise<NextResponse> {
+  return NextResponse.json({ ok: true })
+}
+```
+If your app uses framework handlers, keep `ed_tools.ts/js` as a plain callable boundary and invoke your framework-specific code behind that boundary.
+## Recording Without Passing `ctx.trace`
+Use `setCurrentTrace`/`getCurrentTrace` (built on Node's `AsyncLocalStorage`) to record steps without threading `ctx.trace` through every function:
+```ts
+// In your test
+import { setCurrentTrace } from 'elasticdash-sdk'
+aiTest('flow test', async (ctx) => {
+  setCurrentTrace(ctx.trace)          // bind the trace to the current async context
+  await runFlowWithoutTraceArg()      // your existing code
+  expect(ctx.trace).toHaveCustomStep({ kind: 'rag', name: 'pokemon-search' })
+})
+// In your app/flow code (called during the test)
+import { getCurrentTrace } from 'elasticdash-sdk'
+function runFlowWithoutTraceArg() {
+  const trace = getCurrentTrace()
+  trace?.recordCustomStep({
+    kind: 'rag',
+    name: 'pokemon-search',
+    payload: { query: 'pikachu attack' },
+    result: { ids: [25] },
+    tags: ['source:db', 'sort:asc'],
+  })
+}
+```
+**Notes:**
+- Works per async context; if you spawn detached work (child processes/independent workers), pass `trace` explicitly there.
+- Still compatible with manual DI: you can continue passing `ctx.trace` explicitly if you prefer.
+## Optional LLM Capture Proxy (for Supabase Edge / Deno)
+For environments where Node fetch interception doesn't work (like Supabase Edge Functions or Deno Deploy):
+1. Set `ELASTICDASH_LLM_PROXY=1` (optional: `ELASTICDASH_LLM_PROXY_PORT`, default `8787`)
+2. The runner starts a local proxy and generates a per-test `ELASTICDASH_TRACE_ID`
+3. Point your LLM client at the proxy via base URL envs:
+   ```bash
+   OPENAI_BASE_URL=http://localhost:8787/v1
+   ANTHROPIC_API_URL=http://localhost:8787
+   ```
+4. Forward the trace ID to your Edge/Deno code (e.g., add an `x-trace-id` header whose value is `process.env.ELASTICDASH_TRACE_ID`)
+5. The proxy records model/prompt/completion and folds captured steps back into `ctx.trace`
+When `ELASTICDASH_LLM_PROXY` is unset, the existing Node fetch interceptor remains the default.