elasticdash-sdk 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (349) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +775 -0
  3. package/dist/browser-ui.d.ts +43 -0
  4. package/dist/browser-ui.d.ts.map +1 -0
  5. package/dist/browser-ui.js +246 -0
  6. package/dist/browser-ui.js.map +1 -0
  7. package/dist/capture/event.d.ts +33 -0
  8. package/dist/capture/event.d.ts.map +1 -0
  9. package/dist/capture/event.js +2 -0
  10. package/dist/capture/event.js.map +1 -0
  11. package/dist/capture/index.d.ts +4 -0
  12. package/dist/capture/index.d.ts.map +1 -0
  13. package/dist/capture/index.js +4 -0
  14. package/dist/capture/index.js.map +1 -0
  15. package/dist/capture/recorder.d.ts +24 -0
  16. package/dist/capture/recorder.d.ts.map +1 -0
  17. package/dist/capture/recorder.js +46 -0
  18. package/dist/capture/recorder.js.map +1 -0
  19. package/dist/capture/replay.d.ts +20 -0
  20. package/dist/capture/replay.d.ts.map +1 -0
  21. package/dist/capture/replay.js +47 -0
  22. package/dist/capture/replay.js.map +1 -0
  23. package/dist/ci/api-client.d.ts +38 -0
  24. package/dist/ci/api-client.d.ts.map +1 -0
  25. package/dist/ci/api-client.js +96 -0
  26. package/dist/ci/api-client.js.map +1 -0
  27. package/dist/ci/benchmark.d.ts +33 -0
  28. package/dist/ci/benchmark.d.ts.map +1 -0
  29. package/dist/ci/benchmark.js +213 -0
  30. package/dist/ci/benchmark.js.map +1 -0
  31. package/dist/ci/ed-runner.d.ts +48 -0
  32. package/dist/ci/ed-runner.d.ts.map +1 -0
  33. package/dist/ci/ed-runner.js +260 -0
  34. package/dist/ci/ed-runner.js.map +1 -0
  35. package/dist/ci/executor.d.ts +13 -0
  36. package/dist/ci/executor.d.ts.map +1 -0
  37. package/dist/ci/executor.js +542 -0
  38. package/dist/ci/executor.js.map +1 -0
  39. package/dist/ci/git-info.d.ts +17 -0
  40. package/dist/ci/git-info.d.ts.map +1 -0
  41. package/dist/ci/git-info.js +102 -0
  42. package/dist/ci/git-info.js.map +1 -0
  43. package/dist/ci/index.d.ts +6 -0
  44. package/dist/ci/index.d.ts.map +1 -0
  45. package/dist/ci/index.js +4 -0
  46. package/dist/ci/index.js.map +1 -0
  47. package/dist/ci/measurement.d.ts +9 -0
  48. package/dist/ci/measurement.d.ts.map +1 -0
  49. package/dist/ci/measurement.js +15 -0
  50. package/dist/ci/measurement.js.map +1 -0
  51. package/dist/ci/replay.d.ts +31 -0
  52. package/dist/ci/replay.d.ts.map +1 -0
  53. package/dist/ci/replay.js +96 -0
  54. package/dist/ci/replay.js.map +1 -0
  55. package/dist/ci/reporters/default.d.ts +8 -0
  56. package/dist/ci/reporters/default.d.ts.map +1 -0
  57. package/dist/ci/reporters/default.js +46 -0
  58. package/dist/ci/reporters/default.js.map +1 -0
  59. package/dist/ci/reporters/index.d.ts +8 -0
  60. package/dist/ci/reporters/index.d.ts.map +1 -0
  61. package/dist/ci/reporters/index.js +14 -0
  62. package/dist/ci/reporters/index.js.map +1 -0
  63. package/dist/ci/reporters/json.d.ts +8 -0
  64. package/dist/ci/reporters/json.d.ts.map +1 -0
  65. package/dist/ci/reporters/json.js +14 -0
  66. package/dist/ci/reporters/json.js.map +1 -0
  67. package/dist/ci/reporters/junit.d.ts +8 -0
  68. package/dist/ci/reporters/junit.d.ts.map +1 -0
  69. package/dist/ci/reporters/junit.js +48 -0
  70. package/dist/ci/reporters/junit.js.map +1 -0
  71. package/dist/ci/runner.d.ts +3 -0
  72. package/dist/ci/runner.d.ts.map +1 -0
  73. package/dist/ci/runner.js +187 -0
  74. package/dist/ci/runner.js.map +1 -0
  75. package/dist/ci/test-discovery.d.ts +5 -0
  76. package/dist/ci/test-discovery.d.ts.map +1 -0
  77. package/dist/ci/test-discovery.js +11 -0
  78. package/dist/ci/test-discovery.js.map +1 -0
  79. package/dist/ci/test-loader.d.ts +19 -0
  80. package/dist/ci/test-loader.d.ts.map +1 -0
  81. package/dist/ci/test-loader.js +149 -0
  82. package/dist/ci/test-loader.js.map +1 -0
  83. package/dist/ci/test-registry.d.ts +42 -0
  84. package/dist/ci/test-registry.d.ts.map +1 -0
  85. package/dist/ci/test-registry.js +18 -0
  86. package/dist/ci/test-registry.js.map +1 -0
  87. package/dist/ci/trace-schema.d.ts +30 -0
  88. package/dist/ci/trace-schema.d.ts.map +1 -0
  89. package/dist/ci/trace-schema.js +66 -0
  90. package/dist/ci/trace-schema.js.map +1 -0
  91. package/dist/ci/trace-writer.d.ts +16 -0
  92. package/dist/ci/trace-writer.d.ts.map +1 -0
  93. package/dist/ci/trace-writer.js +108 -0
  94. package/dist/ci/trace-writer.js.map +1 -0
  95. package/dist/ci/types.d.ts +108 -0
  96. package/dist/ci/types.d.ts.map +1 -0
  97. package/dist/ci/types.js +3 -0
  98. package/dist/ci/types.js.map +1 -0
  99. package/dist/ci/upload-client.d.ts +74 -0
  100. package/dist/ci/upload-client.d.ts.map +1 -0
  101. package/dist/ci/upload-client.js +195 -0
  102. package/dist/ci/upload-client.js.map +1 -0
  103. package/dist/cli.d.ts +3 -0
  104. package/dist/cli.d.ts.map +1 -0
  105. package/dist/cli.js +716 -0
  106. package/dist/cli.js.map +1 -0
  107. package/dist/core/agent-state.d.ts +47 -0
  108. package/dist/core/agent-state.d.ts.map +1 -0
  109. package/dist/core/agent-state.js +137 -0
  110. package/dist/core/agent-state.js.map +1 -0
  111. package/dist/core/judge-utils.d.ts +22 -0
  112. package/dist/core/judge-utils.d.ts.map +1 -0
  113. package/dist/core/judge-utils.js +211 -0
  114. package/dist/core/judge-utils.js.map +1 -0
  115. package/dist/core/registry.d.ts +28 -0
  116. package/dist/core/registry.d.ts.map +1 -0
  117. package/dist/core/registry.js +52 -0
  118. package/dist/core/registry.js.map +1 -0
  119. package/dist/dashboard-server.d.ts +65 -0
  120. package/dist/dashboard-server.d.ts.map +1 -0
  121. package/dist/dashboard-server.js +3940 -0
  122. package/dist/dashboard-server.js.map +1 -0
  123. package/dist/execution/tool-runner.d.ts +26 -0
  124. package/dist/execution/tool-runner.d.ts.map +1 -0
  125. package/dist/execution/tool-runner.js +316 -0
  126. package/dist/execution/tool-runner.js.map +1 -0
  127. package/dist/html/dashboard.html +2218 -0
  128. package/dist/http.d.ts +14 -0
  129. package/dist/http.d.ts.map +1 -0
  130. package/dist/http.js +13 -0
  131. package/dist/http.js.map +1 -0
  132. package/dist/index.cjs +8102 -0
  133. package/dist/index.d.ts +61 -0
  134. package/dist/index.d.ts.map +1 -0
  135. package/dist/index.js +67 -0
  136. package/dist/index.js.map +1 -0
  137. package/dist/interceptors/ai-interceptor.d.ts +26 -0
  138. package/dist/interceptors/ai-interceptor.d.ts.map +1 -0
  139. package/dist/interceptors/ai-interceptor.js +756 -0
  140. package/dist/interceptors/ai-interceptor.js.map +1 -0
  141. package/dist/interceptors/db-auto.d.ts +8 -0
  142. package/dist/interceptors/db-auto.d.ts.map +1 -0
  143. package/dist/interceptors/db-auto.js +217 -0
  144. package/dist/interceptors/db-auto.js.map +1 -0
  145. package/dist/interceptors/db.d.ts +23 -0
  146. package/dist/interceptors/db.d.ts.map +1 -0
  147. package/dist/interceptors/db.js +137 -0
  148. package/dist/interceptors/db.js.map +1 -0
  149. package/dist/interceptors/http.d.ts +28 -0
  150. package/dist/interceptors/http.d.ts.map +1 -0
  151. package/dist/interceptors/http.js +356 -0
  152. package/dist/interceptors/http.js.map +1 -0
  153. package/dist/interceptors/side-effects.d.ts +7 -0
  154. package/dist/interceptors/side-effects.d.ts.map +1 -0
  155. package/dist/interceptors/side-effects.js +72 -0
  156. package/dist/interceptors/side-effects.js.map +1 -0
  157. package/dist/interceptors/telemetry-push.d.ts +142 -0
  158. package/dist/interceptors/telemetry-push.d.ts.map +1 -0
  159. package/dist/interceptors/telemetry-push.js +463 -0
  160. package/dist/interceptors/telemetry-push.js.map +1 -0
  161. package/dist/interceptors/tool.d.ts +2 -0
  162. package/dist/interceptors/tool.d.ts.map +1 -0
  163. package/dist/interceptors/tool.js +274 -0
  164. package/dist/interceptors/tool.js.map +1 -0
  165. package/dist/interceptors/workflow-ai.d.ts +5 -0
  166. package/dist/interceptors/workflow-ai.d.ts.map +1 -0
  167. package/dist/interceptors/workflow-ai.js +382 -0
  168. package/dist/interceptors/workflow-ai.js.map +1 -0
  169. package/dist/internals/conditional-recorder.d.ts +21 -0
  170. package/dist/internals/conditional-recorder.d.ts.map +1 -0
  171. package/dist/internals/conditional-recorder.js +54 -0
  172. package/dist/internals/conditional-recorder.js.map +1 -0
  173. package/dist/internals/mock-resolver.d.ts +146 -0
  174. package/dist/internals/mock-resolver.d.ts.map +1 -0
  175. package/dist/internals/mock-resolver.js +427 -0
  176. package/dist/internals/mock-resolver.js.map +1 -0
  177. package/dist/matchers/index.d.ts +96 -0
  178. package/dist/matchers/index.d.ts.map +1 -0
  179. package/dist/matchers/index.js +668 -0
  180. package/dist/matchers/index.js.map +1 -0
  181. package/dist/observability.d.ts +82 -0
  182. package/dist/observability.d.ts.map +1 -0
  183. package/dist/observability.js +471 -0
  184. package/dist/observability.js.map +1 -0
  185. package/dist/portal-executor.d.ts +30 -0
  186. package/dist/portal-executor.d.ts.map +1 -0
  187. package/dist/portal-executor.js +324 -0
  188. package/dist/portal-executor.js.map +1 -0
  189. package/dist/portal-server.d.ts +3 -0
  190. package/dist/portal-server.d.ts.map +1 -0
  191. package/dist/portal-server.js +279 -0
  192. package/dist/portal-server.js.map +1 -0
  193. package/dist/proxy/llm-capture.d.ts +14 -0
  194. package/dist/proxy/llm-capture.d.ts.map +1 -0
  195. package/dist/proxy/llm-capture.js +264 -0
  196. package/dist/proxy/llm-capture.js.map +1 -0
  197. package/dist/reporter.d.ts +3 -0
  198. package/dist/reporter.d.ts.map +1 -0
  199. package/dist/reporter.js +72 -0
  200. package/dist/reporter.js.map +1 -0
  201. package/dist/runWorkflowSubprocess.d.ts +14 -0
  202. package/dist/runWorkflowSubprocess.d.ts.map +1 -0
  203. package/dist/runWorkflowSubprocess.js +66 -0
  204. package/dist/runWorkflowSubprocess.js.map +1 -0
  205. package/dist/runner.d.ts +16 -0
  206. package/dist/runner.d.ts.map +1 -0
  207. package/dist/runner.js +138 -0
  208. package/dist/runner.js.map +1 -0
  209. package/dist/socket-connector.d.ts +22 -0
  210. package/dist/socket-connector.d.ts.map +1 -0
  211. package/dist/socket-connector.js +104 -0
  212. package/dist/socket-connector.js.map +1 -0
  213. package/dist/telemetry-batcher.d.ts +56 -0
  214. package/dist/telemetry-batcher.d.ts.map +1 -0
  215. package/dist/telemetry-batcher.js +143 -0
  216. package/dist/telemetry-batcher.js.map +1 -0
  217. package/dist/test-setup.d.ts +12 -0
  218. package/dist/test-setup.d.ts.map +1 -0
  219. package/dist/test-setup.js +13 -0
  220. package/dist/test-setup.js.map +1 -0
  221. package/dist/tool-registry.d.ts +31 -0
  222. package/dist/tool-registry.d.ts.map +1 -0
  223. package/dist/tool-registry.js +73 -0
  224. package/dist/tool-registry.js.map +1 -0
  225. package/dist/tool-runner-worker.d.ts +2 -0
  226. package/dist/tool-runner-worker.d.ts.map +1 -0
  227. package/dist/tool-runner-worker.js +215 -0
  228. package/dist/tool-runner-worker.js.map +1 -0
  229. package/dist/trace-adapter/context.d.ts +72 -0
  230. package/dist/trace-adapter/context.d.ts.map +1 -0
  231. package/dist/trace-adapter/context.js +80 -0
  232. package/dist/trace-adapter/context.js.map +1 -0
  233. package/dist/tracing.d.ts +2 -0
  234. package/dist/tracing.d.ts.map +1 -0
  235. package/dist/tracing.js +59 -0
  236. package/dist/tracing.js.map +1 -0
  237. package/dist/trigger-executor.d.ts +12 -0
  238. package/dist/trigger-executor.d.ts.map +1 -0
  239. package/dist/trigger-executor.js +130 -0
  240. package/dist/trigger-executor.js.map +1 -0
  241. package/dist/types/portal.d.ts +76 -0
  242. package/dist/types/portal.d.ts.map +1 -0
  243. package/dist/types/portal.js +2 -0
  244. package/dist/types/portal.js.map +1 -0
  245. package/dist/utils/debug.d.ts +3 -0
  246. package/dist/utils/debug.d.ts.map +1 -0
  247. package/dist/utils/debug.js +8 -0
  248. package/dist/utils/debug.js.map +1 -0
  249. package/dist/utils/license-error.d.ts +23 -0
  250. package/dist/utils/license-error.d.ts.map +1 -0
  251. package/dist/utils/license-error.js +42 -0
  252. package/dist/utils/license-error.js.map +1 -0
  253. package/dist/utils/redact.d.ts +7 -0
  254. package/dist/utils/redact.d.ts.map +1 -0
  255. package/dist/utils/redact.js +26 -0
  256. package/dist/utils/redact.js.map +1 -0
  257. package/dist/workflow-runner-worker.d.ts +2 -0
  258. package/dist/workflow-runner-worker.d.ts.map +1 -0
  259. package/dist/workflow-runner-worker.js +329 -0
  260. package/dist/workflow-runner-worker.js.map +1 -0
  261. package/dist/workflow-runner.d.ts +14 -0
  262. package/dist/workflow-runner.d.ts.map +1 -0
  263. package/dist/workflow-runner.js +34 -0
  264. package/dist/workflow-runner.js.map +1 -0
  265. package/docs/agent-coding-instructions.md +138 -0
  266. package/docs/agent-integration-guide.md +564 -0
  267. package/docs/agents.md +140 -0
  268. package/docs/dashboard.md +394 -0
  269. package/docs/deno.md +69 -0
  270. package/docs/instrumentation.md +424 -0
  271. package/docs/langfuse-trace-structure.md +145 -0
  272. package/docs/matchers.md +173 -0
  273. package/docs/observability_contract.md +192 -0
  274. package/docs/observability_mode.md +195 -0
  275. package/docs/quickstart.md +621 -0
  276. package/docs/security-compliance.md +566 -0
  277. package/docs/test-writing-guidelines.md +444 -0
  278. package/docs/tools.md +165 -0
  279. package/docs/workflow-modes.md +253 -0
  280. package/package.json +76 -0
  281. package/src/browser-ui.ts +281 -0
  282. package/src/capture/event.ts +30 -0
  283. package/src/capture/index.ts +3 -0
  284. package/src/capture/recorder.ts +62 -0
  285. package/src/capture/replay.ts +55 -0
  286. package/src/ci/api-client.ts +136 -0
  287. package/src/ci/benchmark.ts +257 -0
  288. package/src/ci/ed-runner.ts +351 -0
  289. package/src/ci/executor.ts +671 -0
  290. package/src/ci/git-info.ts +127 -0
  291. package/src/ci/index.ts +5 -0
  292. package/src/ci/measurement.ts +25 -0
  293. package/src/ci/replay.ts +127 -0
  294. package/src/ci/reporters/default.ts +50 -0
  295. package/src/ci/reporters/index.ts +21 -0
  296. package/src/ci/reporters/json.ts +18 -0
  297. package/src/ci/reporters/junit.ts +61 -0
  298. package/src/ci/runner.ts +208 -0
  299. package/src/ci/test-discovery.ts +16 -0
  300. package/src/ci/test-loader.ts +187 -0
  301. package/src/ci/test-registry.ts +62 -0
  302. package/src/ci/trace-schema.ts +96 -0
  303. package/src/ci/trace-writer.ts +107 -0
  304. package/src/ci/types.ts +115 -0
  305. package/src/ci/upload-client.ts +300 -0
  306. package/src/cli.ts +811 -0
  307. package/src/core/agent-state.ts +162 -0
  308. package/src/core/judge-utils.ts +232 -0
  309. package/src/core/registry.ts +92 -0
  310. package/src/dashboard-server.ts +2047 -0
  311. package/src/execution/tool-runner.ts +352 -0
  312. package/src/html/dashboard.html +2218 -0
  313. package/src/http.ts +13 -0
  314. package/src/index.ts +138 -0
  315. package/src/interceptors/ai-interceptor.ts +798 -0
  316. package/src/interceptors/db-auto.ts +243 -0
  317. package/src/interceptors/db.ts +156 -0
  318. package/src/interceptors/http.ts +393 -0
  319. package/src/interceptors/side-effects.ts +83 -0
  320. package/src/interceptors/telemetry-push.ts +537 -0
  321. package/src/interceptors/tool.ts +287 -0
  322. package/src/interceptors/workflow-ai.ts +419 -0
  323. package/src/internals/conditional-recorder.ts +63 -0
  324. package/src/internals/mock-resolver.ts +492 -0
  325. package/src/matchers/index.ts +824 -0
  326. package/src/observability.ts +501 -0
  327. package/src/portal-executor.ts +355 -0
  328. package/src/portal-server.ts +304 -0
  329. package/src/proxy/llm-capture.ts +301 -0
  330. package/src/reporter.ts +81 -0
  331. package/src/runWorkflowSubprocess.ts +74 -0
  332. package/src/runner.ts +178 -0
  333. package/src/socket-connector.ts +117 -0
  334. package/src/telemetry-batcher.ts +191 -0
  335. package/src/test-setup.ts +16 -0
  336. package/src/tool-registry.ts +94 -0
  337. package/src/tool-runner-worker.ts +244 -0
  338. package/src/trace-adapter/context.ts +156 -0
  339. package/src/tracing.ts +62 -0
  340. package/src/trigger-executor.ts +171 -0
  341. package/src/types/agent.d.ts +63 -0
  342. package/src/types/expect.d.ts +81 -0
  343. package/src/types/modules.d.ts +2 -0
  344. package/src/types/portal.ts +69 -0
  345. package/src/utils/debug.ts +8 -0
  346. package/src/utils/license-error.ts +43 -0
  347. package/src/utils/redact.ts +25 -0
  348. package/src/workflow-runner-worker.ts +386 -0
  349. package/src/workflow-runner.ts +58 -0
@@ -0,0 +1,351 @@
1
+ import { randomUUID } from 'node:crypto'
2
+ import { loadTests } from './test-loader.js'
3
+ import { createReplayContext, installReplay, uninstallReplay, ReplayMissError } from './replay.js'
4
+ import { collectMeasurement } from './measurement.js'
5
+ import { SDK_VERSION } from './trace-schema.js'
6
+ import { compareBenchmarks } from './benchmark.js'
7
+ import { fetchEvaluatorConfig } from './api-client.js'
8
+ import type { EvaluatorConfig } from './api-client.js'
9
+ import type { TestMeasurement } from './measurement.js'
10
+ import type { BenchmarkResult } from './benchmark.js'
11
+ import type { ValidatedTest } from './test-loader.js'
12
+
13
+ // SDK_VERSION imported from trace-schema.ts (CJS-safe)
14
+
15
+ // ─── Types ──────────────────────────────────────────────────
16
+
17
/** Options accepted by `runEdTests`. Every field is optional. */
export interface EdTestRunOptions {
  /** Directory handed to the test loader. Defaults to `process.cwd()`. */
  cwd?: string
  /**
   * Glob matched against each test name; only matching tests are run.
   * `*` matches any run of characters and `?` matches a single character.
   */
  filter?: string
  /** Stop running further tests after the first test whose aggregate status is `fail`. */
  failFast?: boolean
  /** NOTE(review): not read by `runEdTests` itself — presumably consumed by the caller that uploads results. */
  noUpload?: boolean
  /** NOTE(review): not read by `runEdTests` itself — presumably selects the output format in the caller. */
  reporter?: 'default' | 'json' | 'junit'
  /**
   * Number of times to run each test. Values below 1 are treated as 1.
   * The test is reported as failed if ANY of its runs fails; it passes only
   * when every run passes.
   */
  runs?: number
}
26
+
27
/** Outcome of one individual execution (attempt) of a test. */
export interface EdSingleRunResult {
  /** Whether this attempt passed or failed. */
  status: 'pass' | 'fail'
  /** Explanation of the failure, when one is available. */
  failureReason?: string
  /** Measurement collected for the target step during this attempt. */
  measurement?: TestMeasurement
  /** Result of comparing the measurement against the test's benchmarks. */
  benchmarkResult?: BenchmarkResult
  /** Output value reported for this attempt. */
  output?: unknown
  /** Duration of the attempt in milliseconds. */
  durationMs: number
  /** ISO-8601 timestamp taken just before the attempt started. */
  startedAt: string
  /** ISO-8601 timestamp taken just after the attempt finished. */
  finishedAt: string
}
37
+
38
/** Aggregated outcome of one test across all of its runs. */
export interface EdTestResult {
  /** Identifier of the test (the runner currently uses the test name). */
  testId: string
  /** Human-readable test name. */
  testName: string
  /** Aggregate status of the test. */
  status: 'pass' | 'fail'
  /** Explanation of the failure, when the test failed. */
  failureReason?: string
  /** Measurement collected for the target step. */
  measurement?: TestMeasurement
  /** Result of comparing the measurement against the test's benchmarks. */
  benchmarkResult?: BenchmarkResult
  /** Reference to the trace the test replays. */
  traceRef?: string
  /** The step inside the trace that the test targets. */
  target?: { type: string; step_id: string }
  /** Input handed to the test's run function. */
  input?: unknown
  /** Output associated with the target step. */
  output?: unknown
  /** Duration in milliseconds. */
  durationMs: number
  /** Every individual run of this test (empty for validation errors). */
  singleRuns?: Array<EdSingleRunResult>
}
53
+
54
/** Summary returned by `runEdTests` for one complete invocation. */
export interface EdTestRunResult {
  /** Unique identifier generated for this invocation. */
  runId: string
  /** ISO-8601 timestamp taken when the invocation started. */
  startedAt: string
  /** ISO-8601 timestamp taken after the last test finished. */
  finishedAt: string
  /** One entry per loader validation error and per executed test. */
  results: Array<EdTestResult>
  /** Version string of the SDK that produced this result. */
  sdkVersion: string
}
61
+
62
+ // ─── Runner ─────────────────────────────────────────────────
63
+
64
/**
 * Loads the tests found under `options.cwd`, runs each of them (optionally
 * several times) and returns the collected results.
 *
 * - Loader validation errors are reported as failed results with no runs.
 * - When `options.filter` is set, only tests whose name matches the glob run.
 * - Each test is executed `options.runs` times (at least once). The test is
 *   reported as `fail` if ANY run failed.
 * - With `options.failFast`, no further tests are started after the first
 *   test whose aggregate status is `fail`.
 *
 * @param options Run options; see {@link EdTestRunOptions}.
 * @returns Run id, start/finish timestamps, per-test results and SDK version.
 */
export async function runEdTests(options?: EdTestRunOptions): Promise<EdTestRunResult> {
  const cwd = options?.cwd ?? process.cwd()
  const runId = randomUUID()
  const startedAt = new Date().toISOString()
  const results: EdTestResult[] = []

  const { tests, errors } = await loadTests({ cwd })

  // Report validation errors as failed tests
  for (const err of errors) {
    results.push({
      testId: err.testName ?? 'unknown',
      testName: err.testName ?? 'unknown',
      status: 'fail',
      failureReason: `validation error: ${err.message}`,
      durationMs: 0,
      singleRuns: [],
    })
  }

  // Filter tests if pattern provided
  let testsToRun: ValidatedTest[] = tests
  if (options?.filter) {
    const pattern = options.filter
    testsToRun = tests.filter(t => matchGlob(t.name, pattern))
  }

  // Normalise the run count to a finite integer >= 1. A bare
  // Math.max(1, runs) yields NaN for NaN input (zero attempts, producing a
  // bogus "pass" result without any test identity) and never terminates for
  // Infinity, so non-finite values fall back to a single run.
  const requestedRuns = options?.runs ?? 1
  const maxRuns = Number.isFinite(requestedRuns) ? Math.max(1, Math.floor(requestedRuns)) : 1

  // Fetch evaluator config from backend if any test uses llm_judge without
  // explicit provider/model. Cached for the entire run to avoid repeated calls.
  let evaluatorConfig: EvaluatorConfig | null = null
  const needsEvaluatorConfig = testsToRun.some(
    t => t.benchmarks.llm_judge && (!t.benchmarks.llm_judge.judge_provider || !t.benchmarks.llm_judge.judge_model)
  )
  if (needsEvaluatorConfig) {
    const serverUrl = process.env.ELASTICDASH_API_URL ?? process.env.ELASTICDASH_SERVER ?? ''
    const apiKey = process.env.ELASTICDASH_API_KEY ?? ''
    if (serverUrl && apiKey) {
      try {
        evaluatorConfig = await fetchEvaluatorConfig(serverUrl, apiKey)
        console.log(`[ed-test] Evaluator config: provider=${evaluatorConfig.provider}, model=${evaluatorConfig.model}, hasKey=${!!evaluatorConfig.apiKey}`)
      } catch (err) {
        // Best effort: the run continues without a backend evaluator config.
        console.warn(`[ed-test] Could not fetch evaluator config: ${err instanceof Error ? err.message : String(err)}`)
      }
    }
  }

  for (const test of testsToRun) {
    const allRuns: EdSingleRunResult[] = []
    let representative: EdTestResult | undefined

    for (let attempt = 1; attempt <= maxRuns; attempt++) {
      const runStartedAt = new Date().toISOString()
      const result = await runSingleTest(test, evaluatorConfig)
      const runFinishedAt = new Date().toISOString()

      if (attempt > 1) {
        console.log(` [ed-test] ${test.name}: run ${attempt}/${maxRuns} — ${result.status}`)
      }

      // Collect every run for upload
      allRuns.push({
        status: result.status,
        failureReason: result.failureReason,
        measurement: result.measurement,
        benchmarkResult: result.benchmarkResult,
        output: result.output,
        durationMs: result.durationMs,
        startedAt: runStartedAt,
        finishedAt: runFinishedAt,
      })

      // The representative result starts as the first run and is replaced by
      // every later passing run, i.e. it ends up being the LAST passing run,
      // or the FIRST run when no run passed.
      if (!representative || result.status === 'pass') {
        representative = result
      }
    }

    // maxRuns >= 1 guarantees at least one attempt; this guard only narrows the type.
    if (!representative) {
      continue
    }

    // Aggregate: fail if ANY run failed. The failure reason comes from the
    // first failed run, falling back to the representative result's reason.
    const failedRun = allRuns.find(r => r.status === 'fail')
    const anyFailed = failedRun !== undefined

    results.push({
      ...representative,
      status: anyFailed ? 'fail' : 'pass',
      failureReason: anyFailed ? (failedRun?.failureReason || representative.failureReason) : undefined,
      singleRuns: allRuns,
    })

    if (options?.failFast && anyFailed) {
      break
    }
  }

  const finishedAt = new Date().toISOString()

  return {
    runId,
    startedAt,
    finishedAt,
    results,
    sdkVersion: SDK_VERSION,
  }
}
170
+
171
+ // ─── Single test execution ──────────────────────────────────
172
+
173
/**
 * Resolves a test's custom input. A function is treated as a (possibly async)
 * factory: it is called with no arguments and its awaited result is returned.
 * Any other value is returned as-is.
 */
async function resolveCustomInput(input: unknown | (() => Promise<unknown> | unknown)): Promise<unknown> {
  if (typeof input !== 'function') {
    return input
  }
  const factory = input as () => Promise<unknown> | unknown
  return await factory()
}
176
+
177
/**
 * Executes one test once under replay and scores it against its benchmarks.
 *
 * Flow:
 *  1. Resolve the input (custom `test.input`, else the target step's recorded input).
 *  2. Install the replay context and race `test.run(input)` against a timeout
 *     (`test.timeout_ms`, default 60000 ms).
 *  3. Take the measurement from the replay context, or fall back to the
 *     measurement recorded in the trace for the target step.
 *  4. Compare the measurement against `test.benchmarks`.
 *
 * Errors from input resolution and from `run()` are converted into a failed
 * result. The replay context is always uninstalled and the timeout timer is
 * always cleared before returning.
 *
 * @param test The validated test to execute.
 * @param evaluatorConfig Optional evaluator config forwarded to the benchmark comparison.
 * @returns The result of this single execution.
 */
async function runSingleTest(test: ValidatedTest, evaluatorConfig?: EvaluatorConfig | null): Promise<EdTestResult> {
  const startMs = Date.now()
  const targetStep = test.traceData.steps.find(s => s.step_id === test.target.step_id)

  const identity = {
    testId: test.name,
    testName: test.name,
    traceRef: test.trace,
    target: { type: test.target.type, step_id: test.target.step_id },
  }

  // A throwing input factory must fail this test only, not reject the whole run.
  let resolvedInput: unknown
  try {
    resolvedInput = test.input !== undefined
      ? await resolveCustomInput(test.input)
      : targetStep?.input
  } catch (err) {
    return {
      ...identity,
      output: targetStep?.output,
      status: 'fail',
      failureReason: `input resolution error: ${err instanceof Error ? err.message : String(err)}`,
      durationMs: Date.now() - startMs,
    }
  }

  const base = {
    ...identity,
    input: resolvedInput,
    output: targetStep?.output,
  }

  // Builds a failed result; the duration is measured at the time of the call.
  const failed = (failureReason: string): EdTestResult => ({
    ...base,
    status: 'fail',
    failureReason,
    durationMs: Date.now() - startMs,
  })

  // Scores a measurement against the benchmarks (async to support llm_judge).
  const scored = async (measurement: TestMeasurement): Promise<EdTestResult> => {
    const benchmarkResult = await compareBenchmarks(measurement, test.benchmarks, targetStep?.output, evaluatorConfig)
    return {
      ...base,
      status: benchmarkResult.passed ? 'pass' : 'fail',
      failureReason: benchmarkResult.failure_reason,
      measurement,
      benchmarkResult,
      durationMs: Date.now() - startMs,
    }
  }

  // Check run function exists
  if (!test.run || typeof test.run !== 'function') {
    return failed('test has no run function')
  }

  const replayCtx = createReplayContext(test.traceData, test.target.step_id)
  installReplay(replayCtx)

  try {
    const timeoutMs = test.timeout_ms ?? 60000

    let timeoutHandle: ReturnType<typeof setTimeout> | undefined
    try {
      await Promise.race([
        test.run(resolvedInput),
        new Promise<never>((_, reject) => {
          timeoutHandle = setTimeout(() => reject(new TimeoutError(timeoutMs)), timeoutMs)
        }),
      ])
    } finally {
      // Without this the pending timer keeps the event loop (and thus the
      // process) alive for up to timeoutMs after a test that finished early.
      clearTimeout(timeoutHandle)
    }

    // Collect measurement from the target step.
    // If replay captured it (in-process wrapTool/wrapAI), use that.
    // Otherwise fall back to extracting the measurement directly from the
    // trace data. This handles HTTP-mode workflows where wrapTool/wrapAI
    // calls happen on a remote server and the replay ALS is not accessed.
    let measurement = collectMeasurement(replayCtx)
    if (!measurement) {
      measurement = extractMeasurementFromTrace(test) ?? null
      if (measurement) {
        console.log(` [ed-test] ${test.name}: extracted measurement from trace (HTTP-mode fallback)`)
      }
    }
    if (!measurement) {
      return failed(`target step "${test.target.step_id}" was not replayed during execution`)
    }

    // `return await` keeps benchmark errors inside this try so they reach the catch below.
    return await scored(measurement)
  } catch (err) {
    // For HTTP-mode workflows, run() may fail (e.g. server not running) but the
    // measurement can still be extracted from the trace. The benchmarks compare
    // against recorded data, not live performance.
    // NOTE(review): this fallback runs before the error is classified, so the
    // replay-miss / timeout / execution-error reasons below are only reported
    // when the target step is missing from the trace. Confirm this is intended.
    const traceMeasurement = extractMeasurementFromTrace(test)
    if (traceMeasurement) {
      console.log(` [ed-test] ${test.name}: run() failed (${err instanceof Error ? err.message : String(err)}), using trace measurement fallback`)
      return await scored(traceMeasurement)
    }

    if (err instanceof ReplayMissError) {
      return failed(`replay miss: ${err.callType}::${err.callName}`)
    }
    if (err instanceof TimeoutError) {
      return failed(`test timed out after ${err.timeoutMs}ms`)
    }
    return failed(`execution error: ${err instanceof Error ? err.message : String(err)}`)
  } finally {
    uninstallReplay()
  }
}
309
+
310
+ // ─── Trace-direct measurement extraction ────────────────────
311
+
312
/**
 * Builds a measurement for the test's target step straight from the recorded
 * trace, without relying on the replay mechanism. Serves as the fallback for
 * cases where replay did not capture the step (e.g. HTTP-mode workflows where
 * wrapTool/wrapAI run on a remote server).
 *
 * The result always carries `duration_ms`; the token counts are copied only
 * when the step recorded them.
 *
 * @returns The measurement, or `undefined` when no step in the trace has the
 *   target step id.
 */
function extractMeasurementFromTrace(test: ValidatedTest): TestMeasurement | undefined {
  for (const candidate of test.traceData.steps) {
    if (candidate.step_id !== test.target.step_id) {
      continue
    }

    const measurement: TestMeasurement = { duration_ms: candidate.duration_ms }

    const usage = candidate.tokens
    if (usage) {
      measurement.tokens_input = usage.input
      measurement.tokens_output = usage.output
      measurement.tokens_total = usage.total
    }

    return measurement
  }

  return undefined
}
335
+
336
+ // ─── Helpers ────────────────────────────────────────────────
337
+
338
/** Error raised when a test's run function does not settle within its timeout. */
class TimeoutError extends Error {
  /** The timeout, in milliseconds, that was exceeded. */
  public timeoutMs: number

  constructor(timeoutMs: number) {
    super(`Test timed out after ${timeoutMs}ms`)
    this.timeoutMs = timeoutMs
    this.name = 'TimeoutError'
  }
}
344
+
345
/**
 * Tests `name` against a minimal glob `pattern`. The whole name must match.
 * `*` stands for any run of characters (including none), `?` for exactly one
 * character; every other regex metacharacter in the pattern is taken literally.
 */
function matchGlob(name: string, pattern: string): boolean {
  // Neutralise regex metacharacters first; `*` and `?` are deliberately left
  // alone so they can be translated afterwards.
  const literal = pattern.replace(/[.+^${}()|[\]\\]/g, '\\$&')
  const translated = literal.replace(/\*/g, '.*').replace(/\?/g, '.')
  const matcher = new RegExp('^' + translated + '$')
  return matcher.test(name)
}