elasticdash-sdk 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (349) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +775 -0
  3. package/dist/browser-ui.d.ts +43 -0
  4. package/dist/browser-ui.d.ts.map +1 -0
  5. package/dist/browser-ui.js +246 -0
  6. package/dist/browser-ui.js.map +1 -0
  7. package/dist/capture/event.d.ts +33 -0
  8. package/dist/capture/event.d.ts.map +1 -0
  9. package/dist/capture/event.js +2 -0
  10. package/dist/capture/event.js.map +1 -0
  11. package/dist/capture/index.d.ts +4 -0
  12. package/dist/capture/index.d.ts.map +1 -0
  13. package/dist/capture/index.js +4 -0
  14. package/dist/capture/index.js.map +1 -0
  15. package/dist/capture/recorder.d.ts +24 -0
  16. package/dist/capture/recorder.d.ts.map +1 -0
  17. package/dist/capture/recorder.js +46 -0
  18. package/dist/capture/recorder.js.map +1 -0
  19. package/dist/capture/replay.d.ts +20 -0
  20. package/dist/capture/replay.d.ts.map +1 -0
  21. package/dist/capture/replay.js +47 -0
  22. package/dist/capture/replay.js.map +1 -0
  23. package/dist/ci/api-client.d.ts +38 -0
  24. package/dist/ci/api-client.d.ts.map +1 -0
  25. package/dist/ci/api-client.js +96 -0
  26. package/dist/ci/api-client.js.map +1 -0
  27. package/dist/ci/benchmark.d.ts +33 -0
  28. package/dist/ci/benchmark.d.ts.map +1 -0
  29. package/dist/ci/benchmark.js +213 -0
  30. package/dist/ci/benchmark.js.map +1 -0
  31. package/dist/ci/ed-runner.d.ts +48 -0
  32. package/dist/ci/ed-runner.d.ts.map +1 -0
  33. package/dist/ci/ed-runner.js +260 -0
  34. package/dist/ci/ed-runner.js.map +1 -0
  35. package/dist/ci/executor.d.ts +13 -0
  36. package/dist/ci/executor.d.ts.map +1 -0
  37. package/dist/ci/executor.js +542 -0
  38. package/dist/ci/executor.js.map +1 -0
  39. package/dist/ci/git-info.d.ts +17 -0
  40. package/dist/ci/git-info.d.ts.map +1 -0
  41. package/dist/ci/git-info.js +102 -0
  42. package/dist/ci/git-info.js.map +1 -0
  43. package/dist/ci/index.d.ts +6 -0
  44. package/dist/ci/index.d.ts.map +1 -0
  45. package/dist/ci/index.js +4 -0
  46. package/dist/ci/index.js.map +1 -0
  47. package/dist/ci/measurement.d.ts +9 -0
  48. package/dist/ci/measurement.d.ts.map +1 -0
  49. package/dist/ci/measurement.js +15 -0
  50. package/dist/ci/measurement.js.map +1 -0
  51. package/dist/ci/replay.d.ts +31 -0
  52. package/dist/ci/replay.d.ts.map +1 -0
  53. package/dist/ci/replay.js +96 -0
  54. package/dist/ci/replay.js.map +1 -0
  55. package/dist/ci/reporters/default.d.ts +8 -0
  56. package/dist/ci/reporters/default.d.ts.map +1 -0
  57. package/dist/ci/reporters/default.js +46 -0
  58. package/dist/ci/reporters/default.js.map +1 -0
  59. package/dist/ci/reporters/index.d.ts +8 -0
  60. package/dist/ci/reporters/index.d.ts.map +1 -0
  61. package/dist/ci/reporters/index.js +14 -0
  62. package/dist/ci/reporters/index.js.map +1 -0
  63. package/dist/ci/reporters/json.d.ts +8 -0
  64. package/dist/ci/reporters/json.d.ts.map +1 -0
  65. package/dist/ci/reporters/json.js +14 -0
  66. package/dist/ci/reporters/json.js.map +1 -0
  67. package/dist/ci/reporters/junit.d.ts +8 -0
  68. package/dist/ci/reporters/junit.d.ts.map +1 -0
  69. package/dist/ci/reporters/junit.js +48 -0
  70. package/dist/ci/reporters/junit.js.map +1 -0
  71. package/dist/ci/runner.d.ts +3 -0
  72. package/dist/ci/runner.d.ts.map +1 -0
  73. package/dist/ci/runner.js +187 -0
  74. package/dist/ci/runner.js.map +1 -0
  75. package/dist/ci/test-discovery.d.ts +5 -0
  76. package/dist/ci/test-discovery.d.ts.map +1 -0
  77. package/dist/ci/test-discovery.js +11 -0
  78. package/dist/ci/test-discovery.js.map +1 -0
  79. package/dist/ci/test-loader.d.ts +19 -0
  80. package/dist/ci/test-loader.d.ts.map +1 -0
  81. package/dist/ci/test-loader.js +149 -0
  82. package/dist/ci/test-loader.js.map +1 -0
  83. package/dist/ci/test-registry.d.ts +42 -0
  84. package/dist/ci/test-registry.d.ts.map +1 -0
  85. package/dist/ci/test-registry.js +18 -0
  86. package/dist/ci/test-registry.js.map +1 -0
  87. package/dist/ci/trace-schema.d.ts +30 -0
  88. package/dist/ci/trace-schema.d.ts.map +1 -0
  89. package/dist/ci/trace-schema.js +66 -0
  90. package/dist/ci/trace-schema.js.map +1 -0
  91. package/dist/ci/trace-writer.d.ts +16 -0
  92. package/dist/ci/trace-writer.d.ts.map +1 -0
  93. package/dist/ci/trace-writer.js +108 -0
  94. package/dist/ci/trace-writer.js.map +1 -0
  95. package/dist/ci/types.d.ts +108 -0
  96. package/dist/ci/types.d.ts.map +1 -0
  97. package/dist/ci/types.js +3 -0
  98. package/dist/ci/types.js.map +1 -0
  99. package/dist/ci/upload-client.d.ts +74 -0
  100. package/dist/ci/upload-client.d.ts.map +1 -0
  101. package/dist/ci/upload-client.js +195 -0
  102. package/dist/ci/upload-client.js.map +1 -0
  103. package/dist/cli.d.ts +3 -0
  104. package/dist/cli.d.ts.map +1 -0
  105. package/dist/cli.js +716 -0
  106. package/dist/cli.js.map +1 -0
  107. package/dist/core/agent-state.d.ts +47 -0
  108. package/dist/core/agent-state.d.ts.map +1 -0
  109. package/dist/core/agent-state.js +137 -0
  110. package/dist/core/agent-state.js.map +1 -0
  111. package/dist/core/judge-utils.d.ts +22 -0
  112. package/dist/core/judge-utils.d.ts.map +1 -0
  113. package/dist/core/judge-utils.js +211 -0
  114. package/dist/core/judge-utils.js.map +1 -0
  115. package/dist/core/registry.d.ts +28 -0
  116. package/dist/core/registry.d.ts.map +1 -0
  117. package/dist/core/registry.js +52 -0
  118. package/dist/core/registry.js.map +1 -0
  119. package/dist/dashboard-server.d.ts +65 -0
  120. package/dist/dashboard-server.d.ts.map +1 -0
  121. package/dist/dashboard-server.js +3940 -0
  122. package/dist/dashboard-server.js.map +1 -0
  123. package/dist/execution/tool-runner.d.ts +26 -0
  124. package/dist/execution/tool-runner.d.ts.map +1 -0
  125. package/dist/execution/tool-runner.js +316 -0
  126. package/dist/execution/tool-runner.js.map +1 -0
  127. package/dist/html/dashboard.html +2218 -0
  128. package/dist/http.d.ts +14 -0
  129. package/dist/http.d.ts.map +1 -0
  130. package/dist/http.js +13 -0
  131. package/dist/http.js.map +1 -0
  132. package/dist/index.cjs +8102 -0
  133. package/dist/index.d.ts +61 -0
  134. package/dist/index.d.ts.map +1 -0
  135. package/dist/index.js +67 -0
  136. package/dist/index.js.map +1 -0
  137. package/dist/interceptors/ai-interceptor.d.ts +26 -0
  138. package/dist/interceptors/ai-interceptor.d.ts.map +1 -0
  139. package/dist/interceptors/ai-interceptor.js +756 -0
  140. package/dist/interceptors/ai-interceptor.js.map +1 -0
  141. package/dist/interceptors/db-auto.d.ts +8 -0
  142. package/dist/interceptors/db-auto.d.ts.map +1 -0
  143. package/dist/interceptors/db-auto.js +217 -0
  144. package/dist/interceptors/db-auto.js.map +1 -0
  145. package/dist/interceptors/db.d.ts +23 -0
  146. package/dist/interceptors/db.d.ts.map +1 -0
  147. package/dist/interceptors/db.js +137 -0
  148. package/dist/interceptors/db.js.map +1 -0
  149. package/dist/interceptors/http.d.ts +28 -0
  150. package/dist/interceptors/http.d.ts.map +1 -0
  151. package/dist/interceptors/http.js +356 -0
  152. package/dist/interceptors/http.js.map +1 -0
  153. package/dist/interceptors/side-effects.d.ts +7 -0
  154. package/dist/interceptors/side-effects.d.ts.map +1 -0
  155. package/dist/interceptors/side-effects.js +72 -0
  156. package/dist/interceptors/side-effects.js.map +1 -0
  157. package/dist/interceptors/telemetry-push.d.ts +142 -0
  158. package/dist/interceptors/telemetry-push.d.ts.map +1 -0
  159. package/dist/interceptors/telemetry-push.js +463 -0
  160. package/dist/interceptors/telemetry-push.js.map +1 -0
  161. package/dist/interceptors/tool.d.ts +2 -0
  162. package/dist/interceptors/tool.d.ts.map +1 -0
  163. package/dist/interceptors/tool.js +274 -0
  164. package/dist/interceptors/tool.js.map +1 -0
  165. package/dist/interceptors/workflow-ai.d.ts +5 -0
  166. package/dist/interceptors/workflow-ai.d.ts.map +1 -0
  167. package/dist/interceptors/workflow-ai.js +382 -0
  168. package/dist/interceptors/workflow-ai.js.map +1 -0
  169. package/dist/internals/conditional-recorder.d.ts +21 -0
  170. package/dist/internals/conditional-recorder.d.ts.map +1 -0
  171. package/dist/internals/conditional-recorder.js +54 -0
  172. package/dist/internals/conditional-recorder.js.map +1 -0
  173. package/dist/internals/mock-resolver.d.ts +146 -0
  174. package/dist/internals/mock-resolver.d.ts.map +1 -0
  175. package/dist/internals/mock-resolver.js +427 -0
  176. package/dist/internals/mock-resolver.js.map +1 -0
  177. package/dist/matchers/index.d.ts +96 -0
  178. package/dist/matchers/index.d.ts.map +1 -0
  179. package/dist/matchers/index.js +668 -0
  180. package/dist/matchers/index.js.map +1 -0
  181. package/dist/observability.d.ts +82 -0
  182. package/dist/observability.d.ts.map +1 -0
  183. package/dist/observability.js +471 -0
  184. package/dist/observability.js.map +1 -0
  185. package/dist/portal-executor.d.ts +30 -0
  186. package/dist/portal-executor.d.ts.map +1 -0
  187. package/dist/portal-executor.js +324 -0
  188. package/dist/portal-executor.js.map +1 -0
  189. package/dist/portal-server.d.ts +3 -0
  190. package/dist/portal-server.d.ts.map +1 -0
  191. package/dist/portal-server.js +279 -0
  192. package/dist/portal-server.js.map +1 -0
  193. package/dist/proxy/llm-capture.d.ts +14 -0
  194. package/dist/proxy/llm-capture.d.ts.map +1 -0
  195. package/dist/proxy/llm-capture.js +264 -0
  196. package/dist/proxy/llm-capture.js.map +1 -0
  197. package/dist/reporter.d.ts +3 -0
  198. package/dist/reporter.d.ts.map +1 -0
  199. package/dist/reporter.js +72 -0
  200. package/dist/reporter.js.map +1 -0
  201. package/dist/runWorkflowSubprocess.d.ts +14 -0
  202. package/dist/runWorkflowSubprocess.d.ts.map +1 -0
  203. package/dist/runWorkflowSubprocess.js +66 -0
  204. package/dist/runWorkflowSubprocess.js.map +1 -0
  205. package/dist/runner.d.ts +16 -0
  206. package/dist/runner.d.ts.map +1 -0
  207. package/dist/runner.js +138 -0
  208. package/dist/runner.js.map +1 -0
  209. package/dist/socket-connector.d.ts +22 -0
  210. package/dist/socket-connector.d.ts.map +1 -0
  211. package/dist/socket-connector.js +104 -0
  212. package/dist/socket-connector.js.map +1 -0
  213. package/dist/telemetry-batcher.d.ts +56 -0
  214. package/dist/telemetry-batcher.d.ts.map +1 -0
  215. package/dist/telemetry-batcher.js +143 -0
  216. package/dist/telemetry-batcher.js.map +1 -0
  217. package/dist/test-setup.d.ts +12 -0
  218. package/dist/test-setup.d.ts.map +1 -0
  219. package/dist/test-setup.js +13 -0
  220. package/dist/test-setup.js.map +1 -0
  221. package/dist/tool-registry.d.ts +31 -0
  222. package/dist/tool-registry.d.ts.map +1 -0
  223. package/dist/tool-registry.js +73 -0
  224. package/dist/tool-registry.js.map +1 -0
  225. package/dist/tool-runner-worker.d.ts +2 -0
  226. package/dist/tool-runner-worker.d.ts.map +1 -0
  227. package/dist/tool-runner-worker.js +215 -0
  228. package/dist/tool-runner-worker.js.map +1 -0
  229. package/dist/trace-adapter/context.d.ts +72 -0
  230. package/dist/trace-adapter/context.d.ts.map +1 -0
  231. package/dist/trace-adapter/context.js +80 -0
  232. package/dist/trace-adapter/context.js.map +1 -0
  233. package/dist/tracing.d.ts +2 -0
  234. package/dist/tracing.d.ts.map +1 -0
  235. package/dist/tracing.js +59 -0
  236. package/dist/tracing.js.map +1 -0
  237. package/dist/trigger-executor.d.ts +12 -0
  238. package/dist/trigger-executor.d.ts.map +1 -0
  239. package/dist/trigger-executor.js +130 -0
  240. package/dist/trigger-executor.js.map +1 -0
  241. package/dist/types/portal.d.ts +76 -0
  242. package/dist/types/portal.d.ts.map +1 -0
  243. package/dist/types/portal.js +2 -0
  244. package/dist/types/portal.js.map +1 -0
  245. package/dist/utils/debug.d.ts +3 -0
  246. package/dist/utils/debug.d.ts.map +1 -0
  247. package/dist/utils/debug.js +8 -0
  248. package/dist/utils/debug.js.map +1 -0
  249. package/dist/utils/license-error.d.ts +23 -0
  250. package/dist/utils/license-error.d.ts.map +1 -0
  251. package/dist/utils/license-error.js +42 -0
  252. package/dist/utils/license-error.js.map +1 -0
  253. package/dist/utils/redact.d.ts +7 -0
  254. package/dist/utils/redact.d.ts.map +1 -0
  255. package/dist/utils/redact.js +26 -0
  256. package/dist/utils/redact.js.map +1 -0
  257. package/dist/workflow-runner-worker.d.ts +2 -0
  258. package/dist/workflow-runner-worker.d.ts.map +1 -0
  259. package/dist/workflow-runner-worker.js +329 -0
  260. package/dist/workflow-runner-worker.js.map +1 -0
  261. package/dist/workflow-runner.d.ts +14 -0
  262. package/dist/workflow-runner.d.ts.map +1 -0
  263. package/dist/workflow-runner.js +34 -0
  264. package/dist/workflow-runner.js.map +1 -0
  265. package/docs/agent-coding-instructions.md +138 -0
  266. package/docs/agent-integration-guide.md +564 -0
  267. package/docs/agents.md +140 -0
  268. package/docs/dashboard.md +394 -0
  269. package/docs/deno.md +69 -0
  270. package/docs/instrumentation.md +424 -0
  271. package/docs/langfuse-trace-structure.md +145 -0
  272. package/docs/matchers.md +173 -0
  273. package/docs/observability_contract.md +192 -0
  274. package/docs/observability_mode.md +195 -0
  275. package/docs/quickstart.md +621 -0
  276. package/docs/security-compliance.md +566 -0
  277. package/docs/test-writing-guidelines.md +444 -0
  278. package/docs/tools.md +165 -0
  279. package/docs/workflow-modes.md +253 -0
  280. package/package.json +76 -0
  281. package/src/browser-ui.ts +281 -0
  282. package/src/capture/event.ts +30 -0
  283. package/src/capture/index.ts +3 -0
  284. package/src/capture/recorder.ts +62 -0
  285. package/src/capture/replay.ts +55 -0
  286. package/src/ci/api-client.ts +136 -0
  287. package/src/ci/benchmark.ts +257 -0
  288. package/src/ci/ed-runner.ts +351 -0
  289. package/src/ci/executor.ts +671 -0
  290. package/src/ci/git-info.ts +127 -0
  291. package/src/ci/index.ts +5 -0
  292. package/src/ci/measurement.ts +25 -0
  293. package/src/ci/replay.ts +127 -0
  294. package/src/ci/reporters/default.ts +50 -0
  295. package/src/ci/reporters/index.ts +21 -0
  296. package/src/ci/reporters/json.ts +18 -0
  297. package/src/ci/reporters/junit.ts +61 -0
  298. package/src/ci/runner.ts +208 -0
  299. package/src/ci/test-discovery.ts +16 -0
  300. package/src/ci/test-loader.ts +187 -0
  301. package/src/ci/test-registry.ts +62 -0
  302. package/src/ci/trace-schema.ts +96 -0
  303. package/src/ci/trace-writer.ts +107 -0
  304. package/src/ci/types.ts +115 -0
  305. package/src/ci/upload-client.ts +300 -0
  306. package/src/cli.ts +811 -0
  307. package/src/core/agent-state.ts +162 -0
  308. package/src/core/judge-utils.ts +232 -0
  309. package/src/core/registry.ts +92 -0
  310. package/src/dashboard-server.ts +2047 -0
  311. package/src/execution/tool-runner.ts +352 -0
  312. package/src/html/dashboard.html +2218 -0
  313. package/src/http.ts +13 -0
  314. package/src/index.ts +138 -0
  315. package/src/interceptors/ai-interceptor.ts +798 -0
  316. package/src/interceptors/db-auto.ts +243 -0
  317. package/src/interceptors/db.ts +156 -0
  318. package/src/interceptors/http.ts +393 -0
  319. package/src/interceptors/side-effects.ts +83 -0
  320. package/src/interceptors/telemetry-push.ts +537 -0
  321. package/src/interceptors/tool.ts +287 -0
  322. package/src/interceptors/workflow-ai.ts +419 -0
  323. package/src/internals/conditional-recorder.ts +63 -0
  324. package/src/internals/mock-resolver.ts +492 -0
  325. package/src/matchers/index.ts +824 -0
  326. package/src/observability.ts +501 -0
  327. package/src/portal-executor.ts +355 -0
  328. package/src/portal-server.ts +304 -0
  329. package/src/proxy/llm-capture.ts +301 -0
  330. package/src/reporter.ts +81 -0
  331. package/src/runWorkflowSubprocess.ts +74 -0
  332. package/src/runner.ts +178 -0
  333. package/src/socket-connector.ts +117 -0
  334. package/src/telemetry-batcher.ts +191 -0
  335. package/src/test-setup.ts +16 -0
  336. package/src/tool-registry.ts +94 -0
  337. package/src/tool-runner-worker.ts +244 -0
  338. package/src/trace-adapter/context.ts +156 -0
  339. package/src/tracing.ts +62 -0
  340. package/src/trigger-executor.ts +171 -0
  341. package/src/types/agent.d.ts +63 -0
  342. package/src/types/expect.d.ts +81 -0
  343. package/src/types/modules.d.ts +2 -0
  344. package/src/types/portal.ts +69 -0
  345. package/src/utils/debug.ts +8 -0
  346. package/src/utils/license-error.ts +43 -0
  347. package/src/utils/redact.ts +25 -0
  348. package/src/workflow-runner-worker.ts +386 -0
  349. package/src/workflow-runner.ts +58 -0
@@ -0,0 +1,671 @@
1
+ import { executePortalTask, checkToolAvailability, checkAIAvailability } from '../portal-executor.js'
2
+ import { scanTools } from '../execution/tool-runner.js'
3
+ import { callProviderLLM } from '../matchers/index.js'
4
+ import { prepareOutputForJudge } from '../core/judge-utils.js'
5
+ import type { ToolInfo } from '../execution/tool-runner.js'
6
+ import type {
7
+ APITestGroupTest,
8
+ APIExpectation,
9
+ CISingleRunResult,
10
+ CIExpectationResult,
11
+ } from './types.js'
12
+
13
+ // ─── Test Executor ───────────────────────────────────────────
14
+ // Executes a single TestGroupTest: runs it N times, evaluates expectations,
15
+ // determines pass/fail based on pass_threshold.
16
+
17
/** Aggregate outcome of executing one test across all of its runs. */
interface ExecutionResult {
  /** Overall verdict for the test. */
  passed: boolean
  /** One entry per executed run, in execution order. */
  singleRuns: CISingleRunResult[]
  /** One entry per evaluated expectation. */
  expectationResults: CIExpectationResult[]
  /** Elapsed wall-clock time for the whole test, in milliseconds. */
  durationMs: number
}
23
+
24
/**
 * Execute a test (single-step or full-flow) according to its configuration.
 *
 * The test is run `run_count` times (default 1). Each run is raced against a
 * `timeout_ms` deadline (default 30000 ms); a run that throws or misses the
 * deadline is recorded as a failed run carrying the error message instead of
 * aborting the remaining runs. Once all runs are collected, the test's
 * expectations are evaluated and the overall verdict is derived from
 * `pass_threshold`.
 *
 * @param test - Test definition (run count, timeout, expectations, threshold).
 * @param cwd - Workspace directory used to scan tools and execute the runs.
 * @returns The overall verdict, every per-run result, every expectation result
 *   and the total elapsed time in milliseconds.
 */
export async function executeTest(
  test: APITestGroupTest,
  cwd: string,
): Promise<ExecutionResult> {
  const tools = scanTools(cwd)
  const runCount = test.run_count || 1
  const timeoutMs = test.timeout_ms || 30000
  const startTime = Date.now()

  const singleRuns: CISingleRunResult[] = []

  for (let i = 0; i < runCount; i++) {
    const runStart = Date.now()
    let result: CISingleRunResult
    let timer: ReturnType<typeof setTimeout> | undefined

    try {
      const runPromise = executeSingleRun(test, cwd, tools, i)
      const timeoutPromise = new Promise<never>((_, reject) => {
        timer = setTimeout(
          () => reject(new Error(`Test timed out after ${timeoutMs}ms`)),
          timeoutMs,
        )
      })
      // NOTE: losing the race does not cancel the run itself; it only stops
      // this loop from waiting on it.
      result = await Promise.race([runPromise, timeoutPromise])
    } catch (err) {
      result = {
        runIndex: i + 1,
        passed: false,
        durationMs: Date.now() - runStart,
        inputTokens: 0,
        outputTokens: 0,
        totalTokens: 0,
        output: null,
        trace: null,
        error: err instanceof Error ? err.message : String(err),
      }
    } finally {
      // Always disarm the deadline. A pending timer would otherwise keep the
      // event loop alive for up to `timeoutMs` after a fast run, and one such
      // timer would accumulate per run.
      clearTimeout(timer)
    }

    singleRuns.push(result)
  }

  // Evaluate expectations against the single runs
  const expectationResults = await evaluateExpectations(test.expectations, singleRuns)

  // Determine overall pass/fail
  const passed = determinePassFail(test.pass_threshold, singleRuns, expectationResults)

  return {
    passed,
    singleRuns,
    expectationResults,
    durationMs: Date.now() - startTime,
  }
}
78
+
79
+ // ─── Single Run Execution ────────────────────────────────────
80
+
81
/**
 * Dispatch one run to the executor that matches the test's `test_type`.
 *
 * `'single-step'` tests go to `executeSingleStep`; every other type is
 * treated as a full-flow test and goes to `executeFullFlow`.
 *
 * @param test - Test definition being executed.
 * @param cwd - Workspace directory forwarded to the executor.
 * @param tools - Tools discovered in the workspace.
 * @param runIndex - Zero-based index of this run.
 * @returns The result produced by the selected executor.
 */
async function executeSingleRun(
  test: APITestGroupTest,
  cwd: string,
  tools: ToolInfo[],
  runIndex: number,
): Promise<CISingleRunResult> {
  // Captured before dispatch so the executor can report elapsed time from here.
  const startedAt = Date.now()
  const executor = test.test_type === 'single-step' ? executeSingleStep : executeFullFlow
  return executor(test, cwd, tools, runIndex, startedAt)
}
95
+
96
/**
 * Run a single AI or tool step in isolation through the portal executor.
 *
 * The step is identified by `target_step_type` and `target_step_name`. When
 * either is missing, or when the availability pre-check reports the step as
 * unavailable, a failed result is returned without executing anything.
 *
 * @param test - Test definition; supplies the target step, `mock_input` and
 *   `frozen_events`.
 * @param cwd - Workspace directory passed to the availability check and executor.
 * @param tools - Tools discovered in the workspace.
 * @param runIndex - Zero-based run index; reported one-based in the result.
 * @param start - Epoch milliseconds at which the run began; used for the
 *   duration of early failures.
 * @returns The run result, with token usage defaulting to 0 when the executor
 *   reports none.
 */
async function executeSingleStep(
  test: APITestGroupTest,
  cwd: string,
  tools: ToolInfo[],
  runIndex: number,
  start: number,
): Promise<CISingleRunResult> {
  const { target_step_type: stepType, target_step_name: stepName } = test // stepType: 'ai' or 'tool'
  const reportedIndex = runIndex + 1

  // Result for a step that could not be attempted at all.
  const notExecuted = (error: string): CISingleRunResult => ({
    runIndex: reportedIndex,
    passed: false,
    durationMs: Date.now() - start,
    inputTokens: 0,
    outputTokens: 0,
    totalTokens: 0,
    output: null,
    trace: null,
    error,
  })

  if (!stepType || !stepName) {
    return notExecuted('Single-step test requires target_step_type and target_step_name.')
  }

  const isAiStep = stepType === 'ai'

  // Pre-validate availability before spending time on execution.
  const availability = isAiStep
    ? checkAIAvailability(undefined, stepName)
    : checkToolAvailability(stepName, cwd, tools)

  if (!availability.available) {
    return notExecuted(`Step unavailable: ${availability.reason}`)
  }

  // Execute via the portal executor (reuses existing infrastructure).
  const outcome = await executePortalTask(
    {
      taskId: `ci-${test.id}-run-${runIndex}`,
      type: isAiStep ? 'ai' : 'tool',
      name: stepName,
      input: test.mock_input,
      frozenEvents: test.frozen_events as any[],
    },
    cwd,
    tools,
  )

  const usage = outcome.usage
  return {
    runIndex: reportedIndex,
    passed: outcome.ok,
    durationMs: outcome.durationMs,
    inputTokens: usage?.inputTokens ?? 0,
    outputTokens: usage?.outputTokens ?? 0,
    totalTokens: usage?.totalTokens ?? 0,
    output: outcome.output,
    trace: null,
    error: outcome.error,
  }
}
158
+
159
/**
 * Run an entire workflow exported from the workspace's `ed_workflows` module.
 *
 * The module is located with `resolveRuntimeModule(cwd, 'ed_workflows')` and
 * imported dynamically. The export whose name equals `test.target_step_name`
 * is used when one exists; otherwise the first exported function is run. It is
 * called with `test.workflow_input` inside `runWorkflow`, with HTTP and
 * side-effect interception enabled and replay mode switched on whenever the
 * test carries frozen events.
 *
 * Errors thrown anywhere in this process are converted into a failed run
 * result instead of propagating. A workflow that completes without throwing
 * is reported as passed.
 *
 * @param test - Test definition; supplies the workflow input, optional target
 *   name and frozen events.
 * @param cwd - Workspace root in which `ed_workflows` is resolved.
 * @param tools - Tools discovered in the workspace (not used by this function).
 * @param runIndex - Zero-based run index; reported one-based in the result.
 * @param start - Epoch milliseconds at which the run began.
 * @returns The run result, including token usage summed over the trace's AI events.
 */
async function executeFullFlow(
  test: APITestGroupTest,
  cwd: string,
  tools: ToolInfo[],
  runIndex: number,
  start: number,
): Promise<CISingleRunResult> {
  // Shared shape for every way this run can fail.
  const failedRun = (error: string): CISingleRunResult => ({
    runIndex: runIndex + 1,
    passed: false,
    durationMs: Date.now() - start,
    inputTokens: 0,
    outputTokens: 0,
    totalTokens: 0,
    output: null,
    trace: null,
    error,
  })

  try {
    const { runWorkflow } = await import('../workflow-runner.js')
    const { resolveRuntimeModule } = await import('../execution/tool-runner.js')
    const { pathToFileURL } = await import('node:url')

    const modulePath = resolveRuntimeModule(cwd, 'ed_workflows')
    if (!modulePath) {
      return failedRun('Cannot find ed_workflows.ts/js in workspace root.')
    }

    // Load the user's workflow module from disk.
    const loadedModule = await import(pathToFileURL(modulePath).href)

    // Every exported function is a candidate workflow.
    const candidates = Object.entries(loadedModule).filter(
      ([, exported]) => typeof exported === 'function'
    ) as [string, (...args: unknown[]) => Promise<unknown>][]

    if (candidates.length === 0) {
      return failedRun('No workflow functions found in ed_workflows.')
    }

    // Prefer the export named by target_step_name; otherwise (or when no
    // export has that name) fall back to the first exported function.
    const firstFn = candidates[0][1]
    const namedFn = test.target_step_name
      ? candidates.find(([exportName]) => exportName === test.target_step_name)?.[1]
      : undefined
    const workflowFn = namedFn ?? firstFn

    const frozenEvents = Array.isArray(test.frozen_events) ? test.frozen_events : []

    const { result, trace } = await runWorkflow(
      () => workflowFn(test.workflow_input) as Promise<unknown>,
      {
        replayMode: frozenEvents.length > 0,
        history: frozenEvents as any[],
        interceptHttp: true,
        interceptSideEffects: true,
      },
    )

    // Sum token usage over the AI events recorded in the trace. Each counter
    // is read from the long key first, then the short one.
    const tokens = { input: 0, output: 0, total: 0 }
    if (trace?.events) {
      for (const event of trace.events) {
        if (event.type !== 'ai' || !event.usage) continue
        const usage = event.usage as any
        tokens.input += usage.inputTokens ?? usage.input ?? 0
        tokens.output += usage.outputTokens ?? usage.output ?? 0
        tokens.total += usage.totalTokens ?? usage.total ?? 0
      }
    }

    return {
      runIndex: runIndex + 1,
      passed: true,
      durationMs: Date.now() - start,
      inputTokens: tokens.input,
      outputTokens: tokens.output,
      totalTokens: tokens.total,
      output: result,
      trace: trace ?? null,
    }
  } catch (err) {
    return failedRun(err instanceof Error ? err.message : String(err))
  }
}
260
+
261
+ // ─── Expectation Evaluation ──────────────────────────────────
262
+
263
/**
 * Evaluate every expectation against the collected runs.
 *
 * Expectations are evaluated one at a time, in the order given, and the
 * results are returned in that same order.
 *
 * @param expectations - Expectations configured on the test.
 * @param singleRuns - Results of all executed runs.
 * @returns One result per expectation.
 */
async function evaluateExpectations(
  expectations: APIExpectation[],
  singleRuns: CISingleRunResult[],
): Promise<CIExpectationResult[]> {
  const collected: CIExpectationResult[] = []

  for (const expectation of expectations) {
    // Awaited inside the loop on purpose: evaluation stays sequential.
    collected.push(await evaluateExpectation(expectation, singleRuns))
  }

  return collected
}
276
+
277
/**
 * Evaluate one expectation by routing it to the evaluator for its `type`.
 *
 * @param exp - Expectation to evaluate.
 * @param singleRuns - Results of all executed runs.
 * @returns The evaluator's result, or a failed result whose detail names the
 *   type when no evaluator is registered for it.
 */
async function evaluateExpectation(
  exp: APIExpectation,
  singleRuns: CISingleRunResult[],
): Promise<CIExpectationResult> {
  type Evaluation = CIExpectationResult | Promise<CIExpectationResult>

  // Thunks keep the lookup lazy: only the selected evaluator ever runs.
  const evaluators = new Map<string, () => Evaluation>([
    ['token-budget', () => evaluateTokenBudget(exp, singleRuns)],
    ['latency-budget', () => evaluateLatencyBudget(exp, singleRuns)],
    ['output-contains', () => evaluateOutputContains(exp, singleRuns)],
    ['output-schema', () => evaluateOutputSchema(exp, singleRuns)],
    ['tool-called', () => evaluateToolCalled(exp, singleRuns)],
    ['determinism', () => evaluateDeterminism(exp, singleRuns)],
    ['llm-judge', () => evaluateLLMJudge(exp, singleRuns)],
  ])

  const evaluate = evaluators.get(exp.type)
  if (evaluate) {
    return evaluate()
  }

  return {
    expectationId: exp.id,
    type: exp.type,
    passed: false,
    detail: `Unknown expectation type: ${exp.type}`,
  }
}
305
+
306
/**
 * Check token consumption against the expectation's limits.
 *
 * Two independent limits apply, each only when it is set:
 * `max_tokens_per_run` caps the `totalTokens` of every individual run, and
 * `max_total_tokens` caps the sum of `totalTokens` over all runs.
 *
 * @param exp - Expectation carrying the token limits.
 * @param runs - Results of all executed runs.
 * @returns A result that passes only when neither limit is exceeded, with a
 *   per-run verdict keyed by `runIndex`.
 */
function evaluateTokenBudget(exp: APIExpectation, runs: CISingleRunResult[]): CIExpectationResult {
  const perRunLimit = exp.max_tokens_per_run
  const verdicts: Record<number, { passed: boolean; detail?: string }> = {}
  let withinBudget = true

  for (const { runIndex, totalTokens } of runs) {
    const overLimit = perRunLimit != null && totalTokens > perRunLimit
    verdicts[runIndex] = {
      passed: !overLimit,
      detail: overLimit ? `total ${totalTokens} > max ${perRunLimit}` : undefined,
    }
    if (overLimit) {
      withinBudget = false
    }
  }

  // The aggregate limit can fail the expectation even when every run passed.
  const overallLimit = exp.max_total_tokens
  if (overallLimit != null) {
    let consumed = 0
    for (const run of runs) {
      consumed += run.totalTokens
    }
    if (consumed > overallLimit) {
      withinBudget = false
    }
  }

  return {
    expectationId: exp.id,
    type: 'token-budget',
    passed: withinBudget,
    detail: withinBudget ? undefined : 'Token budget exceeded.',
    perRun: verdicts,
  }
}
337
+
338
/**
 * Check run durations against the expectation's limits.
 *
 * Two independent limits apply, each only when it is set: `max_duration_ms`
 * caps the `durationMs` of every individual run, and `max_total_duration_ms`
 * caps the sum of `durationMs` over all runs.
 *
 * @param exp - Expectation carrying the latency limits.
 * @param runs - Results of all executed runs.
 * @returns A result that passes only when neither limit is exceeded, with a
 *   per-run verdict keyed by `runIndex`.
 */
function evaluateLatencyBudget(exp: APIExpectation, runs: CISingleRunResult[]): CIExpectationResult {
  const perRunCap = exp.max_duration_ms
  const verdicts: Record<number, { passed: boolean; detail?: string }> = {}
  let slowRunCount = 0

  for (const { runIndex, durationMs } of runs) {
    const tooSlow = perRunCap != null && durationMs > perRunCap
    if (tooSlow) {
      slowRunCount += 1
    }
    verdicts[runIndex] = {
      passed: !tooSlow,
      detail: tooSlow ? `${durationMs}ms > ${perRunCap}ms` : undefined,
    }
  }

  // The aggregate cap can fail the expectation even when every run passed.
  const totalCap = exp.max_total_duration_ms
  const totalExceeded =
    totalCap != null && runs.reduce((elapsed, run) => elapsed + run.durationMs, 0) > totalCap

  const withinBudget = slowRunCount === 0 && !totalExceeded

  return {
    expectationId: exp.id,
    type: 'latency-budget',
    passed: withinBudget,
    detail: withinBudget ? undefined : 'Latency budget exceeded.',
    perRun: verdicts,
  }
}
366
+
367
/**
 * Check each run's output for required and forbidden text.
 *
 * String outputs are searched as-is; any other output is searched in its
 * `JSON.stringify` form (with a missing output treated as an empty string).
 * When `case_insensitive` is set, both the output and the search texts are
 * lower-cased before comparison.
 *
 * @param exp - Expectation carrying `contains_text`, `not_contains_text` and
 *   `case_insensitive`.
 * @param runs - Results of all executed runs.
 * @returns A result that passes only when every run contains the required
 *   text (if any) and none contains the forbidden text (if any).
 */
function evaluateOutputContains(exp: APIExpectation, runs: CISingleRunResult[]): CIExpectationResult {
  // Applies the expectation's case policy to one piece of text.
  const normalize = (text: string): string => (exp.case_insensitive ? text.toLowerCase() : text)

  const verdicts: Record<number, { passed: boolean; detail?: string }> = {}
  let failures = 0

  for (const run of runs) {
    const rendered = typeof run.output === 'string'
      ? run.output
      : JSON.stringify(run.output ?? '')
    const haystack = normalize(rendered)

    const missingRequired = exp.contains_text
      ? !haystack.includes(normalize(exp.contains_text))
      : false
    const hasForbidden = exp.not_contains_text
      ? haystack.includes(normalize(exp.not_contains_text))
      : false

    const ok = !missingRequired && !hasForbidden
    if (!ok) {
      failures += 1
    }
    verdicts[run.runIndex] = { passed: ok, detail: ok ? undefined : 'Output text check failed.' }
  }

  const everyRunOk = failures === 0
  return {
    expectationId: exp.id,
    type: 'output-contains',
    passed: everyRunOk,
    detail: everyRunOk ? undefined : 'Output contains check failed.',
    perRun: verdicts,
  }
}
400
+
401
/**
 * Report whether `value` satisfies a JSON Schema `type` keyword.
 *
 * Only single primitive type names are checked. Any other `type` value
 * (absent, an array of types, an unrecognised name) is not validated and is
 * treated as a match.
 */
function matchesJsonSchemaType(schemaType: unknown, value: unknown): boolean {
  switch (schemaType) {
    case 'object':
      return typeof value === 'object' && value !== null && !Array.isArray(value)
    case 'array':
      return Array.isArray(value)
    case 'string':
      return typeof value === 'string'
    case 'number':
      return typeof value === 'number'
    case 'integer':
      return typeof value === 'number' && Number.isInteger(value)
    case 'boolean':
      return typeof value === 'boolean'
    case 'null':
      return value === null
    default:
      return true
  }
}

/**
 * Basic JSON schema validation of each run's output.
 *
 * Only the schema's top-level `type` and, for objects, its `required` list are
 * checked; nested schemas are not validated. Runs pass unconditionally when
 * the expectation has no `json_schema`.
 *
 * String outputs are parsed as JSON before checking, except when the schema's
 * type is `'string'`: there the raw text itself is the value under test, so
 * plain (non-JSON) text satisfies a string schema.
 *
 * @param exp - Expectation carrying `json_schema`.
 * @param runs - Results of all executed runs.
 * @returns A result that passes only when every run's output matches, with a
 *   per-run verdict keyed by `runIndex`.
 */
function evaluateOutputSchema(exp: APIExpectation, runs: CISingleRunResult[]): CIExpectationResult {
  const perRun: Record<number, { passed: boolean; detail?: string }> = {}
  let allPassed = true

  const schema = exp.json_schema as Record<string, unknown> | null

  for (const run of runs) {
    if (!schema) {
      perRun[run.runIndex] = { passed: true }
      continue
    }

    const schemaType = schema.type

    let output: unknown = run.output
    // A string output is normally serialized JSON and must be decoded first.
    // For a string schema, decoding would wrongly reject ordinary text.
    if (typeof output === 'string' && schemaType !== 'string') {
      try {
        output = JSON.parse(output)
      } catch {
        perRun[run.runIndex] = { passed: false, detail: 'Output is not valid JSON.' }
        allPassed = false
        continue
      }
    }

    // Check type
    let passed = matchesJsonSchemaType(schemaType, output)

    // Check required fields (only meaningful once the value is known to be an object)
    if (passed && schemaType === 'object' && Array.isArray(schema.required)) {
      const candidate = output as Record<string, unknown>
      passed = (schema.required as string[]).every((key) => key in candidate)
    }

    perRun[run.runIndex] = { passed, detail: passed ? undefined : 'Output does not match schema.' }
    if (!passed) allPassed = false
  }

  return {
    expectationId: exp.id,
    type: 'output-schema',
    passed: allPassed,
    detail: allPassed ? undefined : 'Output schema check failed.',
    perRun,
  }
}
457
+
458
/**
 * Check which tools each run invoked, based on its trace.
 *
 * The invoked tools are the `name`s of the trace events whose `type` is
 * `'tool'`; a run without a trace counts as having invoked none. A run passes
 * when it invoked every entry of `required_tools` and no entry of
 * `forbidden_tools`.
 *
 * @param exp - Expectation carrying `required_tools` and `forbidden_tools`.
 * @param runs - Results of all executed runs.
 * @returns A result that passes only when every run passes; failing runs list
 *   the tools they invoked in their detail.
 */
function evaluateToolCalled(exp: APIExpectation, runs: CISingleRunResult[]): CIExpectationResult {
  const verdicts: Record<number, { passed: boolean; detail?: string }> = {}
  let failures = 0

  for (const run of runs) {
    // Collect the names of all tool events recorded in the trace.
    const trace = run.trace as { events?: Array<{ type: string; name: string }> } | null
    const invoked: string[] = []
    for (const event of trace?.events ?? []) {
      if (event.type === 'tool') {
        invoked.push(event.name)
      }
    }

    const wasInvoked = (toolName: string): boolean => invoked.includes(toolName)

    const missingRequired = exp.required_tools?.length
      ? !exp.required_tools.every(wasInvoked)
      : false
    const usedForbidden = exp.forbidden_tools?.length
      ? exp.forbidden_tools.some(wasInvoked)
      : false

    const ok = !missingRequired && !usedForbidden
    if (!ok) {
      failures += 1
    }
    verdicts[run.runIndex] = {
      passed: ok,
      detail: ok ? undefined : `Tools called: [${invoked.join(', ')}]`,
    }
  }

  const everyRunOk = failures === 0
  return {
    expectationId: exp.id,
    type: 'tool-called',
    passed: everyRunOk,
    detail: everyRunOk ? undefined : 'Tool call check failed.',
    perRun: verdicts,
  }
}
503
+
504
/**
 * Evaluates a 'determinism' expectation: do repeated runs yield the same output?
 *
 * Each output is compared as a string (non-string outputs are JSON-serialized)
 * with the first run's output. If they are not all identical and
 * `exp.similarity_threshold` is set, every later run is instead scored against
 * the first run with `computeStringSimilarity` and fails only when its score
 * is below the threshold. With fewer than two runs the check is skipped and
 * reported as passed.
 *
 * @param exp - Expectation supplying `id` and the optional `similarity_threshold`.
 * @param runs - Runs to compare; each verdict is keyed by the run's `runIndex`.
 * @returns The aggregate verdict plus per-run verdicts (no `perRun` when skipped).
 */
function evaluateDeterminism(exp: APIExpectation, runs: CISingleRunResult[]): CIExpectationResult {
  if (runs.length < 2) {
    return {
      expectationId: exp.id,
      type: 'determinism',
      passed: true,
      detail: 'Only one run — determinism check skipped.',
    }
  }

  // Normalize every output to a string so runs can be compared directly.
  const serialized = runs.map((run) => {
    if (typeof run.output === 'string') return run.output
    return JSON.stringify(run.output ?? '')
  })
  const baseline = serialized[0]

  // First pass: exact equality with the first run.
  const perRun: Record<number, { passed: boolean; detail?: string }> = {}
  let mismatches = 0
  runs.forEach((run, idx) => {
    const identical = serialized[idx] === baseline
    if (!identical) mismatches += 1
    perRun[run.runIndex] = {
      passed: identical,
      detail: identical ? undefined : 'Output differs from run 1.',
    }
  })
  const allSame = mismatches === 0

  // Without a threshold (or when everything matched) the exact verdict stands.
  if (allSame || exp.similarity_threshold == null) {
    return {
      expectationId: exp.id,
      type: 'determinism',
      passed: allSame,
      detail: allSame ? undefined : 'Outputs are not identical across runs.',
      perRun,
    }
  }

  // Second pass: re-judge every run after the first by similarity to the baseline.
  const threshold = exp.similarity_threshold
  let allAboveThreshold = true
  for (let idx = 1; idx < serialized.length; idx++) {
    const similarity = computeStringSimilarity(baseline, serialized[idx])
    const tooDifferent = similarity < threshold
    if (tooDifferent) allAboveThreshold = false
    perRun[runs[idx].runIndex] = tooDifferent
      ? {
          passed: false,
          detail: `Similarity ${(similarity * 100).toFixed(1)}% < ${(threshold * 100).toFixed(1)}%`,
        }
      : { passed: true }
  }

  return {
    expectationId: exp.id,
    type: 'determinism',
    passed: allAboveThreshold,
    detail: allAboveThreshold ? undefined : 'Outputs are not sufficiently similar.',
    perRun,
  }
}
562
+
563
/**
 * Scores how alike two strings are as a ratio between 0 and 1.
 *
 * Characters are compared position by position over the length of the shorter
 * string; the number of matching positions is divided by the length of the
 * longer string. Identical strings score 1; if exactly one string is empty
 * the score is 0.
 *
 * @param a - First string.
 * @param b - Second string.
 * @returns Matching positions divided by the longer string's length.
 */
function computeStringSimilarity(a: string, b: string): number {
  if (a === b) return 1
  if (a.length === 0 || b.length === 0) return 0

  const comparable = Math.min(a.length, b.length)
  const span = Math.max(a.length, b.length)

  let hits = 0
  for (let pos = 0; pos < comparable; pos++) {
    if (a[pos] === b[pos]) hits += 1
  }
  return hits / span
}
575
+
576
/**
 * Evaluates an 'llm-judge' expectation by asking an LLM to score each run's output.
 *
 * For every run the output (JSON-serialized unless already a string) is passed
 * through `prepareOutputForJudge`, wrapped in `<output>` tags below
 * `exp.judge_prompt`, and sent via `callProviderLLM`. The first number found in
 * the reply is the score; a run passes when that score is at least
 * `exp.judge_score_threshold` (default 7). An unparseable reply or a thrown
 * error fails that run only. Runs are judged one after another.
 *
 * @param exp - Expectation supplying `id`, `judge_prompt`, and the optional
 *   `judge_provider`, `judge_model` and `judge_score_threshold`.
 * @param runs - Runs to judge; each verdict is keyed by the run's `runIndex`.
 * @returns The aggregate verdict (fails immediately when `judge_prompt` is
 *   missing) plus per-run verdicts.
 */
async function evaluateLLMJudge(exp: APIExpectation, runs: CISingleRunResult[]): Promise<CIExpectationResult> {
  const judgePrompt = exp.judge_prompt
  if (!judgePrompt) {
    return {
      expectationId: exp.id,
      type: 'llm-judge',
      passed: false,
      detail: 'LLM judge expectation requires judge_prompt.',
    }
  }

  const threshold = exp.judge_score_threshold ?? 7
  const perRun: Record<number, { passed: boolean; detail?: string }> = {}
  let failedRuns = 0

  for (const run of runs) {
    let outputStr: string
    if (typeof run.output === 'string') {
      outputStr = run.output
    } else {
      outputStr = JSON.stringify(run.output ?? '')
    }

    const preparedOutput = prepareOutputForJudge(outputStr, judgePrompt)
    const evalPrompt =
      `${judgePrompt}\n\n` +
      `<output>\n${preparedOutput}\n</output>\n\n` +
      'Based on the evaluation criteria above, score this output on a scale of 0-10. Respond with only the number.'

    try {
      const provider = (exp.judge_provider as 'openai' | 'claude' | 'gemini' | 'grok' | 'kimi') || 'openai'
      // NOTE(review): 4096 and 0 are presumably max tokens and temperature — confirm against callProviderLLM.
      const result = await callProviderLLM(
        evalPrompt,
        { provider, model: exp.judge_model ?? undefined },
        'You are an expert test judge. Return only a number between 0 and 10.',
        4096,
        0,
      )

      // Take the first (optionally signed, optionally decimal) number in the reply.
      const numeric = result.content.match(/-?\d+(?:\.\d+)?/)
      const score = numeric ? parseFloat(numeric[0]) : NaN

      if (Number.isNaN(score)) {
        failedRuns += 1
        perRun[run.runIndex] = { passed: false, detail: `Could not parse score from: "${result.content}"` }
        continue
      }

      const passed = score >= threshold
      if (!passed) failedRuns += 1
      perRun[run.runIndex] = { passed, detail: `Score: ${score}/${threshold}` }
    } catch (err) {
      failedRuns += 1
      const reason = err instanceof Error ? err.message : String(err)
      perRun[run.runIndex] = {
        passed: false,
        detail: `LLM judge error: ${reason}`,
      }
    }
  }

  const allPassed = failedRuns === 0
  return {
    expectationId: exp.id,
    type: 'llm-judge',
    passed: allPassed,
    detail: allPassed ? undefined : 'LLM judge check failed.',
    perRun,
  }
}
640
+
641
+ // ─── Pass/Fail Threshold ─────────────────────────────────────
642
+
643
/**
 * Decides the overall pass/fail verdict for a set of runs and expectations.
 *
 * The run-level rule depends on `passThreshold`:
 * - 'majority': strictly more than half of the runs passed;
 * - 'any': at least one run passed;
 * - 'all' or any other value: every run passed.
 *
 * Independently of that rule, every expectation result must have passed.
 *
 * @param passThreshold - Name of the run-level rule to apply.
 * @param singleRuns - Individual run results; only `passed` is read.
 * @param expectationResults - Expectation verdicts; only `passed` is read.
 * @returns `true` when both the run-level rule and all expectations are satisfied.
 */
function determinePassFail(
  passThreshold: string,
  singleRuns: CISingleRunResult[],
  expectationResults: CIExpectationResult[],
): boolean {
  const totalRuns = singleRuns.length
  let runsPassed = 0
  for (const run of singleRuns) {
    if (run.passed) runsPassed += 1
  }

  let runsPass: boolean
  if (passThreshold === 'majority') {
    runsPass = runsPassed > totalRuns / 2
  } else if (passThreshold === 'any') {
    runsPass = runsPassed > 0
  } else {
    // 'all' and unrecognised thresholds both require every run to pass.
    runsPass = runsPassed === totalRuns
  }

  // Expectations are never relaxed by the threshold.
  const expectationsPass = expectationResults.every((result) => result.passed)

  return runsPass && expectationsPass
}