elasticdash-sdk 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (349) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +775 -0
  3. package/dist/browser-ui.d.ts +43 -0
  4. package/dist/browser-ui.d.ts.map +1 -0
  5. package/dist/browser-ui.js +246 -0
  6. package/dist/browser-ui.js.map +1 -0
  7. package/dist/capture/event.d.ts +33 -0
  8. package/dist/capture/event.d.ts.map +1 -0
  9. package/dist/capture/event.js +2 -0
  10. package/dist/capture/event.js.map +1 -0
  11. package/dist/capture/index.d.ts +4 -0
  12. package/dist/capture/index.d.ts.map +1 -0
  13. package/dist/capture/index.js +4 -0
  14. package/dist/capture/index.js.map +1 -0
  15. package/dist/capture/recorder.d.ts +24 -0
  16. package/dist/capture/recorder.d.ts.map +1 -0
  17. package/dist/capture/recorder.js +46 -0
  18. package/dist/capture/recorder.js.map +1 -0
  19. package/dist/capture/replay.d.ts +20 -0
  20. package/dist/capture/replay.d.ts.map +1 -0
  21. package/dist/capture/replay.js +47 -0
  22. package/dist/capture/replay.js.map +1 -0
  23. package/dist/ci/api-client.d.ts +38 -0
  24. package/dist/ci/api-client.d.ts.map +1 -0
  25. package/dist/ci/api-client.js +96 -0
  26. package/dist/ci/api-client.js.map +1 -0
  27. package/dist/ci/benchmark.d.ts +33 -0
  28. package/dist/ci/benchmark.d.ts.map +1 -0
  29. package/dist/ci/benchmark.js +213 -0
  30. package/dist/ci/benchmark.js.map +1 -0
  31. package/dist/ci/ed-runner.d.ts +48 -0
  32. package/dist/ci/ed-runner.d.ts.map +1 -0
  33. package/dist/ci/ed-runner.js +260 -0
  34. package/dist/ci/ed-runner.js.map +1 -0
  35. package/dist/ci/executor.d.ts +13 -0
  36. package/dist/ci/executor.d.ts.map +1 -0
  37. package/dist/ci/executor.js +542 -0
  38. package/dist/ci/executor.js.map +1 -0
  39. package/dist/ci/git-info.d.ts +17 -0
  40. package/dist/ci/git-info.d.ts.map +1 -0
  41. package/dist/ci/git-info.js +102 -0
  42. package/dist/ci/git-info.js.map +1 -0
  43. package/dist/ci/index.d.ts +6 -0
  44. package/dist/ci/index.d.ts.map +1 -0
  45. package/dist/ci/index.js +4 -0
  46. package/dist/ci/index.js.map +1 -0
  47. package/dist/ci/measurement.d.ts +9 -0
  48. package/dist/ci/measurement.d.ts.map +1 -0
  49. package/dist/ci/measurement.js +15 -0
  50. package/dist/ci/measurement.js.map +1 -0
  51. package/dist/ci/replay.d.ts +31 -0
  52. package/dist/ci/replay.d.ts.map +1 -0
  53. package/dist/ci/replay.js +96 -0
  54. package/dist/ci/replay.js.map +1 -0
  55. package/dist/ci/reporters/default.d.ts +8 -0
  56. package/dist/ci/reporters/default.d.ts.map +1 -0
  57. package/dist/ci/reporters/default.js +46 -0
  58. package/dist/ci/reporters/default.js.map +1 -0
  59. package/dist/ci/reporters/index.d.ts +8 -0
  60. package/dist/ci/reporters/index.d.ts.map +1 -0
  61. package/dist/ci/reporters/index.js +14 -0
  62. package/dist/ci/reporters/index.js.map +1 -0
  63. package/dist/ci/reporters/json.d.ts +8 -0
  64. package/dist/ci/reporters/json.d.ts.map +1 -0
  65. package/dist/ci/reporters/json.js +14 -0
  66. package/dist/ci/reporters/json.js.map +1 -0
  67. package/dist/ci/reporters/junit.d.ts +8 -0
  68. package/dist/ci/reporters/junit.d.ts.map +1 -0
  69. package/dist/ci/reporters/junit.js +48 -0
  70. package/dist/ci/reporters/junit.js.map +1 -0
  71. package/dist/ci/runner.d.ts +3 -0
  72. package/dist/ci/runner.d.ts.map +1 -0
  73. package/dist/ci/runner.js +187 -0
  74. package/dist/ci/runner.js.map +1 -0
  75. package/dist/ci/test-discovery.d.ts +5 -0
  76. package/dist/ci/test-discovery.d.ts.map +1 -0
  77. package/dist/ci/test-discovery.js +11 -0
  78. package/dist/ci/test-discovery.js.map +1 -0
  79. package/dist/ci/test-loader.d.ts +19 -0
  80. package/dist/ci/test-loader.d.ts.map +1 -0
  81. package/dist/ci/test-loader.js +149 -0
  82. package/dist/ci/test-loader.js.map +1 -0
  83. package/dist/ci/test-registry.d.ts +42 -0
  84. package/dist/ci/test-registry.d.ts.map +1 -0
  85. package/dist/ci/test-registry.js +18 -0
  86. package/dist/ci/test-registry.js.map +1 -0
  87. package/dist/ci/trace-schema.d.ts +30 -0
  88. package/dist/ci/trace-schema.d.ts.map +1 -0
  89. package/dist/ci/trace-schema.js +66 -0
  90. package/dist/ci/trace-schema.js.map +1 -0
  91. package/dist/ci/trace-writer.d.ts +16 -0
  92. package/dist/ci/trace-writer.d.ts.map +1 -0
  93. package/dist/ci/trace-writer.js +108 -0
  94. package/dist/ci/trace-writer.js.map +1 -0
  95. package/dist/ci/types.d.ts +108 -0
  96. package/dist/ci/types.d.ts.map +1 -0
  97. package/dist/ci/types.js +3 -0
  98. package/dist/ci/types.js.map +1 -0
  99. package/dist/ci/upload-client.d.ts +74 -0
  100. package/dist/ci/upload-client.d.ts.map +1 -0
  101. package/dist/ci/upload-client.js +195 -0
  102. package/dist/ci/upload-client.js.map +1 -0
  103. package/dist/cli.d.ts +3 -0
  104. package/dist/cli.d.ts.map +1 -0
  105. package/dist/cli.js +716 -0
  106. package/dist/cli.js.map +1 -0
  107. package/dist/core/agent-state.d.ts +47 -0
  108. package/dist/core/agent-state.d.ts.map +1 -0
  109. package/dist/core/agent-state.js +137 -0
  110. package/dist/core/agent-state.js.map +1 -0
  111. package/dist/core/judge-utils.d.ts +22 -0
  112. package/dist/core/judge-utils.d.ts.map +1 -0
  113. package/dist/core/judge-utils.js +211 -0
  114. package/dist/core/judge-utils.js.map +1 -0
  115. package/dist/core/registry.d.ts +28 -0
  116. package/dist/core/registry.d.ts.map +1 -0
  117. package/dist/core/registry.js +52 -0
  118. package/dist/core/registry.js.map +1 -0
  119. package/dist/dashboard-server.d.ts +65 -0
  120. package/dist/dashboard-server.d.ts.map +1 -0
  121. package/dist/dashboard-server.js +3940 -0
  122. package/dist/dashboard-server.js.map +1 -0
  123. package/dist/execution/tool-runner.d.ts +26 -0
  124. package/dist/execution/tool-runner.d.ts.map +1 -0
  125. package/dist/execution/tool-runner.js +316 -0
  126. package/dist/execution/tool-runner.js.map +1 -0
  127. package/dist/html/dashboard.html +2218 -0
  128. package/dist/http.d.ts +14 -0
  129. package/dist/http.d.ts.map +1 -0
  130. package/dist/http.js +13 -0
  131. package/dist/http.js.map +1 -0
  132. package/dist/index.cjs +8102 -0
  133. package/dist/index.d.ts +61 -0
  134. package/dist/index.d.ts.map +1 -0
  135. package/dist/index.js +67 -0
  136. package/dist/index.js.map +1 -0
  137. package/dist/interceptors/ai-interceptor.d.ts +26 -0
  138. package/dist/interceptors/ai-interceptor.d.ts.map +1 -0
  139. package/dist/interceptors/ai-interceptor.js +756 -0
  140. package/dist/interceptors/ai-interceptor.js.map +1 -0
  141. package/dist/interceptors/db-auto.d.ts +8 -0
  142. package/dist/interceptors/db-auto.d.ts.map +1 -0
  143. package/dist/interceptors/db-auto.js +217 -0
  144. package/dist/interceptors/db-auto.js.map +1 -0
  145. package/dist/interceptors/db.d.ts +23 -0
  146. package/dist/interceptors/db.d.ts.map +1 -0
  147. package/dist/interceptors/db.js +137 -0
  148. package/dist/interceptors/db.js.map +1 -0
  149. package/dist/interceptors/http.d.ts +28 -0
  150. package/dist/interceptors/http.d.ts.map +1 -0
  151. package/dist/interceptors/http.js +356 -0
  152. package/dist/interceptors/http.js.map +1 -0
  153. package/dist/interceptors/side-effects.d.ts +7 -0
  154. package/dist/interceptors/side-effects.d.ts.map +1 -0
  155. package/dist/interceptors/side-effects.js +72 -0
  156. package/dist/interceptors/side-effects.js.map +1 -0
  157. package/dist/interceptors/telemetry-push.d.ts +142 -0
  158. package/dist/interceptors/telemetry-push.d.ts.map +1 -0
  159. package/dist/interceptors/telemetry-push.js +463 -0
  160. package/dist/interceptors/telemetry-push.js.map +1 -0
  161. package/dist/interceptors/tool.d.ts +2 -0
  162. package/dist/interceptors/tool.d.ts.map +1 -0
  163. package/dist/interceptors/tool.js +274 -0
  164. package/dist/interceptors/tool.js.map +1 -0
  165. package/dist/interceptors/workflow-ai.d.ts +5 -0
  166. package/dist/interceptors/workflow-ai.d.ts.map +1 -0
  167. package/dist/interceptors/workflow-ai.js +382 -0
  168. package/dist/interceptors/workflow-ai.js.map +1 -0
  169. package/dist/internals/conditional-recorder.d.ts +21 -0
  170. package/dist/internals/conditional-recorder.d.ts.map +1 -0
  171. package/dist/internals/conditional-recorder.js +54 -0
  172. package/dist/internals/conditional-recorder.js.map +1 -0
  173. package/dist/internals/mock-resolver.d.ts +146 -0
  174. package/dist/internals/mock-resolver.d.ts.map +1 -0
  175. package/dist/internals/mock-resolver.js +427 -0
  176. package/dist/internals/mock-resolver.js.map +1 -0
  177. package/dist/matchers/index.d.ts +96 -0
  178. package/dist/matchers/index.d.ts.map +1 -0
  179. package/dist/matchers/index.js +668 -0
  180. package/dist/matchers/index.js.map +1 -0
  181. package/dist/observability.d.ts +82 -0
  182. package/dist/observability.d.ts.map +1 -0
  183. package/dist/observability.js +471 -0
  184. package/dist/observability.js.map +1 -0
  185. package/dist/portal-executor.d.ts +30 -0
  186. package/dist/portal-executor.d.ts.map +1 -0
  187. package/dist/portal-executor.js +324 -0
  188. package/dist/portal-executor.js.map +1 -0
  189. package/dist/portal-server.d.ts +3 -0
  190. package/dist/portal-server.d.ts.map +1 -0
  191. package/dist/portal-server.js +279 -0
  192. package/dist/portal-server.js.map +1 -0
  193. package/dist/proxy/llm-capture.d.ts +14 -0
  194. package/dist/proxy/llm-capture.d.ts.map +1 -0
  195. package/dist/proxy/llm-capture.js +264 -0
  196. package/dist/proxy/llm-capture.js.map +1 -0
  197. package/dist/reporter.d.ts +3 -0
  198. package/dist/reporter.d.ts.map +1 -0
  199. package/dist/reporter.js +72 -0
  200. package/dist/reporter.js.map +1 -0
  201. package/dist/runWorkflowSubprocess.d.ts +14 -0
  202. package/dist/runWorkflowSubprocess.d.ts.map +1 -0
  203. package/dist/runWorkflowSubprocess.js +66 -0
  204. package/dist/runWorkflowSubprocess.js.map +1 -0
  205. package/dist/runner.d.ts +16 -0
  206. package/dist/runner.d.ts.map +1 -0
  207. package/dist/runner.js +138 -0
  208. package/dist/runner.js.map +1 -0
  209. package/dist/socket-connector.d.ts +22 -0
  210. package/dist/socket-connector.d.ts.map +1 -0
  211. package/dist/socket-connector.js +104 -0
  212. package/dist/socket-connector.js.map +1 -0
  213. package/dist/telemetry-batcher.d.ts +56 -0
  214. package/dist/telemetry-batcher.d.ts.map +1 -0
  215. package/dist/telemetry-batcher.js +143 -0
  216. package/dist/telemetry-batcher.js.map +1 -0
  217. package/dist/test-setup.d.ts +12 -0
  218. package/dist/test-setup.d.ts.map +1 -0
  219. package/dist/test-setup.js +13 -0
  220. package/dist/test-setup.js.map +1 -0
  221. package/dist/tool-registry.d.ts +31 -0
  222. package/dist/tool-registry.d.ts.map +1 -0
  223. package/dist/tool-registry.js +73 -0
  224. package/dist/tool-registry.js.map +1 -0
  225. package/dist/tool-runner-worker.d.ts +2 -0
  226. package/dist/tool-runner-worker.d.ts.map +1 -0
  227. package/dist/tool-runner-worker.js +215 -0
  228. package/dist/tool-runner-worker.js.map +1 -0
  229. package/dist/trace-adapter/context.d.ts +72 -0
  230. package/dist/trace-adapter/context.d.ts.map +1 -0
  231. package/dist/trace-adapter/context.js +80 -0
  232. package/dist/trace-adapter/context.js.map +1 -0
  233. package/dist/tracing.d.ts +2 -0
  234. package/dist/tracing.d.ts.map +1 -0
  235. package/dist/tracing.js +59 -0
  236. package/dist/tracing.js.map +1 -0
  237. package/dist/trigger-executor.d.ts +12 -0
  238. package/dist/trigger-executor.d.ts.map +1 -0
  239. package/dist/trigger-executor.js +130 -0
  240. package/dist/trigger-executor.js.map +1 -0
  241. package/dist/types/portal.d.ts +76 -0
  242. package/dist/types/portal.d.ts.map +1 -0
  243. package/dist/types/portal.js +2 -0
  244. package/dist/types/portal.js.map +1 -0
  245. package/dist/utils/debug.d.ts +3 -0
  246. package/dist/utils/debug.d.ts.map +1 -0
  247. package/dist/utils/debug.js +8 -0
  248. package/dist/utils/debug.js.map +1 -0
  249. package/dist/utils/license-error.d.ts +23 -0
  250. package/dist/utils/license-error.d.ts.map +1 -0
  251. package/dist/utils/license-error.js +42 -0
  252. package/dist/utils/license-error.js.map +1 -0
  253. package/dist/utils/redact.d.ts +7 -0
  254. package/dist/utils/redact.d.ts.map +1 -0
  255. package/dist/utils/redact.js +26 -0
  256. package/dist/utils/redact.js.map +1 -0
  257. package/dist/workflow-runner-worker.d.ts +2 -0
  258. package/dist/workflow-runner-worker.d.ts.map +1 -0
  259. package/dist/workflow-runner-worker.js +329 -0
  260. package/dist/workflow-runner-worker.js.map +1 -0
  261. package/dist/workflow-runner.d.ts +14 -0
  262. package/dist/workflow-runner.d.ts.map +1 -0
  263. package/dist/workflow-runner.js +34 -0
  264. package/dist/workflow-runner.js.map +1 -0
  265. package/docs/agent-coding-instructions.md +138 -0
  266. package/docs/agent-integration-guide.md +564 -0
  267. package/docs/agents.md +140 -0
  268. package/docs/dashboard.md +394 -0
  269. package/docs/deno.md +69 -0
  270. package/docs/instrumentation.md +424 -0
  271. package/docs/langfuse-trace-structure.md +145 -0
  272. package/docs/matchers.md +173 -0
  273. package/docs/observability_contract.md +192 -0
  274. package/docs/observability_mode.md +195 -0
  275. package/docs/quickstart.md +621 -0
  276. package/docs/security-compliance.md +566 -0
  277. package/docs/test-writing-guidelines.md +444 -0
  278. package/docs/tools.md +165 -0
  279. package/docs/workflow-modes.md +253 -0
  280. package/package.json +76 -0
  281. package/src/browser-ui.ts +281 -0
  282. package/src/capture/event.ts +30 -0
  283. package/src/capture/index.ts +3 -0
  284. package/src/capture/recorder.ts +62 -0
  285. package/src/capture/replay.ts +55 -0
  286. package/src/ci/api-client.ts +136 -0
  287. package/src/ci/benchmark.ts +257 -0
  288. package/src/ci/ed-runner.ts +351 -0
  289. package/src/ci/executor.ts +671 -0
  290. package/src/ci/git-info.ts +127 -0
  291. package/src/ci/index.ts +5 -0
  292. package/src/ci/measurement.ts +25 -0
  293. package/src/ci/replay.ts +127 -0
  294. package/src/ci/reporters/default.ts +50 -0
  295. package/src/ci/reporters/index.ts +21 -0
  296. package/src/ci/reporters/json.ts +18 -0
  297. package/src/ci/reporters/junit.ts +61 -0
  298. package/src/ci/runner.ts +208 -0
  299. package/src/ci/test-discovery.ts +16 -0
  300. package/src/ci/test-loader.ts +187 -0
  301. package/src/ci/test-registry.ts +62 -0
  302. package/src/ci/trace-schema.ts +96 -0
  303. package/src/ci/trace-writer.ts +107 -0
  304. package/src/ci/types.ts +115 -0
  305. package/src/ci/upload-client.ts +300 -0
  306. package/src/cli.ts +811 -0
  307. package/src/core/agent-state.ts +162 -0
  308. package/src/core/judge-utils.ts +232 -0
  309. package/src/core/registry.ts +92 -0
  310. package/src/dashboard-server.ts +2047 -0
  311. package/src/execution/tool-runner.ts +352 -0
  312. package/src/html/dashboard.html +2218 -0
  313. package/src/http.ts +13 -0
  314. package/src/index.ts +138 -0
  315. package/src/interceptors/ai-interceptor.ts +798 -0
  316. package/src/interceptors/db-auto.ts +243 -0
  317. package/src/interceptors/db.ts +156 -0
  318. package/src/interceptors/http.ts +393 -0
  319. package/src/interceptors/side-effects.ts +83 -0
  320. package/src/interceptors/telemetry-push.ts +537 -0
  321. package/src/interceptors/tool.ts +287 -0
  322. package/src/interceptors/workflow-ai.ts +419 -0
  323. package/src/internals/conditional-recorder.ts +63 -0
  324. package/src/internals/mock-resolver.ts +492 -0
  325. package/src/matchers/index.ts +824 -0
  326. package/src/observability.ts +501 -0
  327. package/src/portal-executor.ts +355 -0
  328. package/src/portal-server.ts +304 -0
  329. package/src/proxy/llm-capture.ts +301 -0
  330. package/src/reporter.ts +81 -0
  331. package/src/runWorkflowSubprocess.ts +74 -0
  332. package/src/runner.ts +178 -0
  333. package/src/socket-connector.ts +117 -0
  334. package/src/telemetry-batcher.ts +191 -0
  335. package/src/test-setup.ts +16 -0
  336. package/src/tool-registry.ts +94 -0
  337. package/src/tool-runner-worker.ts +244 -0
  338. package/src/trace-adapter/context.ts +156 -0
  339. package/src/tracing.ts +62 -0
  340. package/src/trigger-executor.ts +171 -0
  341. package/src/types/agent.d.ts +63 -0
  342. package/src/types/expect.d.ts +81 -0
  343. package/src/types/modules.d.ts +2 -0
  344. package/src/types/portal.ts +69 -0
  345. package/src/utils/debug.ts +8 -0
  346. package/src/utils/license-error.ts +43 -0
  347. package/src/utils/redact.ts +25 -0
  348. package/src/workflow-runner-worker.ts +386 -0
  349. package/src/workflow-runner.ts +58 -0
@@ -0,0 +1,668 @@
1
+ import { expect } from 'expect';
2
+ import { prepareOutputForJudge } from '../core/judge-utils.js';
3
/**
 * Duck-type check for a TraceHandle.
 *
 * A value qualifies when it is a non-null object that exposes both
 * `getLLMSteps` and `getToolCalls` as functions. The trace-aware matchers use
 * this to reject non-trace values (for example a plain string) with a clear
 * message instead of failing on a missing method.
 *
 * @param {unknown} value - Candidate value received by a matcher.
 * @returns {boolean} `true` when `value` looks like a TraceHandle.
 */
function isTraceHandle(value) {
  // `typeof null` is 'object', so null has to be excluded explicitly.
  if (value === null || typeof value !== 'object') {
    return false;
  }
  const requiredMethods = ['getLLMSteps', 'getToolCalls'];
  return requiredMethods.every((method) => typeof value[method] === 'function');
}
14
/**
 * Model used for each provider when `options.model` is not supplied.
 */
const defaultModels = {
  openai: 'gpt-4.1',
  claude: 'claude-3-opus-20240229',
  gemini: 'gemini-1.5-pro',
  grok: 'grok-beta',
  kimi: 'kimi-k2-turbo-preview',
};

/**
 * Send one system + user prompt to an LLM provider and return its reply.
 *
 * For `openai`, `claude`, `gemini` and `grok`, a caller-supplied `options.sdk`
 * is used when it exposes the expected method; otherwise the provider's HTTP
 * API is called with `fetch`. `kimi` always uses HTTP.
 *
 * @param {string} prompt - User prompt.
 * @param {object} [options]
 * @param {string} [options.provider='openai'] - One of `openai`, `claude`,
 *   `gemini`, `grok`, `kimi`.
 * @param {string} [options.model] - Model name; defaults to the provider's
 *   entry in `defaultModels`.
 * @param {object} [options.sdk] - Pre-configured provider SDK client.
 * @param {string} [options.apiKey] - API key; read by the `openai` HTTP path
 *   only (falls back to `OPENAI_API_KEY`).
 * @param {string} [options.baseURL] - Base URL; read by the `openai` HTTP path
 *   only (a single trailing slash is removed).
 * @param {string} [systemPrompt] - System prompt. For `claude` and `gemini` it
 *   is prepended to the user prompt, separated by a blank line.
 * @param {number} [maxTokens=32] - Output token limit sent to the provider.
 * @param {number} [temperature=0] - Accepted for signature compatibility; it
 *   is not currently forwarded to any provider.
 * @returns {Promise<{content: string, durationMs: number, usage: ({inputTokens: number, outputTokens: number, totalTokens: number}|undefined)}>}
 *   Trimmed reply text (`''` when absent), elapsed wall-clock milliseconds,
 *   and token usage when the provider reported it.
 * @throws {Error} When the provider is unsupported, the required API key is
 *   missing, or the HTTP response is not OK.
 */
export async function callProviderLLM(prompt, options = {}, systemPrompt = 'You are an expert test judge.', maxTokens = 32, temperature = 0) {
  const provider = options.provider ?? 'openai';
  const sdk = options.sdk;
  const resolvedModel = options.model ?? defaultModels[provider];
  const t0 = Date.now();

  // Request body shared by the OpenAI-compatible chat APIs (openai, grok, kimi).
  const chatRequest = {
    model: resolvedModel,
    messages: [
      { role: 'system', content: systemPrompt },
      { role: 'user', content: prompt },
    ],
    max_tokens: maxTokens,
  };
  // Claude and Gemini receive the system prompt folded into the user turn.
  const mergedPrompt = `${systemPrompt}\n\n${prompt}`;
  const claudeRequest = {
    model: resolvedModel,
    max_tokens: maxTokens,
    messages: [{ role: 'user', content: mergedPrompt }],
  };
  const geminiRequest = {
    contents: [{ role: 'user', parts: [{ text: mergedPrompt }] }],
    generationConfig: { maxOutputTokens: maxTokens },
  };

  const buildResult = (text, usage) => ({
    content: text?.trim() ?? '',
    durationMs: Date.now() - t0,
    usage,
  });
  const chatUsage = (u) => (u
    ? { inputTokens: u.prompt_tokens ?? 0, outputTokens: u.completion_tokens ?? 0, totalTokens: u.total_tokens ?? 0 }
    : undefined);
  const claudeUsage = (u) => (u
    ? { inputTokens: u.input_tokens ?? 0, outputTokens: u.output_tokens ?? 0, totalTokens: (u.input_tokens ?? 0) + (u.output_tokens ?? 0) }
    : undefined);
  const geminiUsage = (u) => (u
    ? { inputTokens: u.promptTokenCount ?? 0, outputTokens: u.candidatesTokenCount ?? 0, totalTokens: u.totalTokenCount ?? 0 }
    : undefined);

  const chatResult = (payload) => buildResult(payload?.choices?.[0]?.message?.content, chatUsage(payload?.usage));
  const claudeResult = (payload) => buildResult(payload?.content?.[0]?.text, claudeUsage(payload?.usage));
  const geminiResult = (payload) => buildResult(payload?.candidates?.[0]?.content?.parts?.[0]?.text, geminiUsage(payload?.usageMetadata));

  const postJson = (url, headers, body) => fetch(url, { method: 'POST', headers, body: JSON.stringify(body) });
  const bearerHeaders = (apiKey) => ({
    Authorization: `Bearer ${apiKey}`,
    'Content-Type': 'application/json',
  });

  switch (provider) {
    case 'openai': {
      if (sdk && sdk.chat?.completions?.create) {
        return chatResult(await sdk.chat.completions.create(chatRequest));
      }
      const apiKey = options.apiKey ?? process.env.OPENAI_API_KEY;
      if (!apiKey) {
        throw new Error('Provide apiKey or set OPENAI_API_KEY for OpenAI-compatible endpoint.');
      }
      const baseURL = (options.baseURL ?? 'https://api.openai.com/v1').replace(/\/$/, '');
      const response = await postJson(`${baseURL}/chat/completions`, bearerHeaders(apiKey), chatRequest);
      if (!response.ok) {
        throw new Error(`OpenAI API error: ${response.status} ${response.statusText}`);
      }
      return chatResult(await response.json());
    }
    case 'claude': {
      if (sdk && sdk.messages?.create) {
        return claudeResult(await sdk.messages.create(claudeRequest));
      }
      const apiKey = process.env.ANTHROPIC_API_KEY;
      if (!apiKey) {
        throw new Error('ANTHROPIC_API_KEY is not set in environment.');
      }
      const response = await postJson('https://api.anthropic.com/v1/messages', {
        'x-api-key': apiKey,
        'anthropic-version': '2023-06-01',
        'content-type': 'application/json',
      }, claudeRequest);
      if (!response.ok) {
        const errBody = await response.text().catch(() => '');
        throw new Error(`Claude API error: ${response.status} ${response.statusText} (model=${resolvedModel}): ${errBody.substring(0, 200)}`);
      }
      return claudeResult(await response.json());
    }
    case 'gemini': {
      if (sdk && sdk.models?.generateContent) {
        const resp = await sdk.models.generateContent({ model: resolvedModel, ...geminiRequest });
        // Accept both result shapes: payload wrapped in `response`, or the
        // candidates/usageMetadata returned directly on the result object.
        return geminiResult(resp?.response ?? resp);
      }
      const apiKey = process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY;
      if (!apiKey) {
        throw new Error('GEMINI_API_KEY (or GOOGLE_API_KEY) is not set in environment.');
      }
      const response = await postJson(
        `https://generativelanguage.googleapis.com/v1beta/models/${resolvedModel}:generateContent?key=${apiKey}`,
        { 'Content-Type': 'application/json' },
        geminiRequest,
      );
      if (!response.ok) {
        throw new Error(`Gemini API error: ${response.status} ${response.statusText}`);
      }
      return geminiResult(await response.json());
    }
    case 'grok': {
      if (sdk && sdk.chat?.completions?.create) {
        return chatResult(await sdk.chat.completions.create(chatRequest));
      }
      const apiKey = process.env.GROK_API_KEY;
      if (!apiKey) {
        throw new Error('GROK_API_KEY is not set in environment.');
      }
      const response = await postJson('https://api.x.ai/v1/chat/completions', bearerHeaders(apiKey), chatRequest);
      if (!response.ok) {
        throw new Error(`Grok API error: ${response.status} ${response.statusText}`);
      }
      return chatResult(await response.json());
    }
    case 'kimi': {
      const apiKey = process.env.KIMI_API_KEY;
      if (!apiKey) {
        throw new Error('KIMI_API_KEY is not set in environment.');
      }
      const response = await postJson('https://api.moonshot.ai/v1/chat/completions', bearerHeaders(apiKey), chatRequest);
      if (!response.ok) {
        // Read the body as text first: an error page that is not JSON must not
        // replace the HTTP status with a JSON parse failure.
        const raw = await response.text().catch(() => '');
        let detail = raw;
        try {
          detail = JSON.stringify(JSON.parse(raw));
        } catch {
          // Non-JSON error body: report it verbatim.
        }
        throw new Error(`Kimi API error: ${response.status} ${response.statusText} — ${detail}`);
      }
      return chatResult(await response.json());
    }
    default:
      throw new Error(`Unsupported provider: ${provider}`);
  }
}
245
/**
 * Ask an LLM judge whether a trace output semantically matches an expectation.
 *
 * The judge is instructed to answer only "YES" or "NO". The reply is
 * upper-cased and counts as a match when its first word is YES, ignoring any
 * leading quotes, markdown markers or other punctuation.
 *
 * @param {string} traceOutput - Text produced by the trace under test.
 * @param {string} expected - Description of the expected semantic result.
 * @param {object} [options] - Provider/model/sdk settings passed through to
 *   `callProviderLLM`.
 * @returns {Promise<boolean>} `true` when the judge answered YES.
 */
async function llmJudgeSemanticMatch(traceOutput, expected, options = {}) {
  const prompt = `
You are an expert test judge. Given the following AI trace output and an expected semantic result, answer "YES" if the trace output semantically matches the expectation, otherwise answer "NO".

Trace Output:
${traceOutput}

Expected:
${expected}

Answer only "YES" or "NO".
`.trim();
  const { content } = await callProviderLLM(prompt, options, 'You are an expert test judge.', 8, 0);
  const verdict = content.trim().toUpperCase();
  // The prompt itself shows the answer in quotes, so the judge may echo them
  // (or wrap the word in markdown). Skip leading non-alphanumerics, and require
  // YES to be a whole word so e.g. "YESTERDAY" is not read as a YES verdict.
  return /^[^A-Z0-9]*YES(?![A-Z0-9])/.test(verdict);
}
261
/**
 * Extract the first decimal number that appears in free-form text.
 *
 * Recognises an optional leading minus sign followed by either digits with an
 * optional fractional part (`1`, `0.85`) or a bare fractional part (`.85`).
 *
 * @param {string} text - Text to scan, e.g. an LLM reply containing a score.
 * @returns {number|null} The parsed number, or `null` when `text` is not a
 *   string or contains no finite number.
 */
function parseFirstNumber(text) {
  if (typeof text !== 'string') {
    return null;
  }
  // The `\.\d+` alternative keeps ".85" from being read as the integer 85.
  const match = text.match(/-?(?:\d+(?:\.\d+)?|\.\d+)/);
  if (!match) {
    return null;
  }
  const num = Number.parseFloat(match[0]);
  return Number.isFinite(num) ? num : null;
}
268
/**
 * Normalise a metric-condition config into a single `{ kind, value }` pair.
 *
 * Only entries whose value is a finite number are considered. With no such
 * entry the default condition `atLeast 0.7` is returned.
 *
 * @param {object} [config] - Object holding at most one of `greaterThan`,
 *   `lessThan`, `atLeast`, `atMost`, `equals` mapped to a finite number.
 * @returns {{kind: string, value: number}} The resolved condition.
 * @throws {Error} When more than one numeric condition is supplied, or when
 *   the supplied key is not a supported condition kind.
 */
function resolveCondition(config) {
  const supportedKinds = ['greaterThan', 'lessThan', 'atLeast', 'atMost', 'equals'];
  const entries = Object.entries(config || {}).filter(([, v]) => typeof v === 'number' && Number.isFinite(v));
  if (entries.length === 0) {
    return { kind: 'atLeast', value: 0.7 };
  }
  if (entries.length > 1) {
    throw new Error('Provide only one metric condition (greaterThan, lessThan, atLeast, atMost, equals).');
  }
  const [[kind, value]] = entries;
  // Reject unknown keys here: an unrecognised kind would otherwise make every
  // later comparison evaluate to false without explaining why.
  if (!supportedKinds.includes(kind)) {
    throw new Error(`Unknown metric condition "${kind}". Use one of: ${supportedKinds.join(', ')}.`);
  }
  return { kind, value };
}
277
/**
 * Test a numeric score against a resolved metric condition.
 *
 * @param {number} score - Score to test.
 * @param {{kind: string, value: number}} condition - Comparison kind
 *   (`greaterThan`, `lessThan`, `atLeast`, `atMost`, `equals`) and its bound.
 * @returns {boolean} Result of the comparison; `false` for any other kind.
 */
function checkCondition(score, condition) {
  const { kind, value: bound } = condition;
  if (kind === 'greaterThan') {
    return score > bound;
  }
  if (kind === 'lessThan') {
    return score < bound;
  }
  if (kind === 'atLeast') {
    return score >= bound;
  }
  if (kind === 'atMost') {
    return score <= bound;
  }
  if (kind === 'equals') {
    return score === bound;
  }
  // Unrecognised kinds never satisfy the condition.
  return false;
}
293
+ /**
294
+ * Register all AI-specific custom matchers onto the `expect` instance.
295
+ * Call this once on runner startup.
296
+ */
297
+ export function registerMatchers() {
298
+ expect.extend({
299
+ toHaveLLMStep(trace, config = {}) {
300
+ if (!isTraceHandle(trace)) {
301
+ return {
302
+ pass: false,
303
+ message: () => `Expected a TraceHandle (ctx.trace) but received ${typeof trace}.\nUse: expect(ctx.trace).toHaveLLMStep(...)`,
304
+ };
305
+ }
306
+ const steps = trace.getLLMSteps();
307
+ const matching = steps.filter((step) => {
308
+ if (config.model && step.model !== config.model)
309
+ return false;
310
+ if (config.provider && step.provider !== config.provider)
311
+ return false;
312
+ if (config.contains) {
313
+ const haystack = [step.completion, step.prompt, step.contains]
314
+ .filter(Boolean)
315
+ .join(' ')
316
+ .toLowerCase();
317
+ if (!haystack.includes(config.contains.toLowerCase()))
318
+ return false;
319
+ }
320
+ if (config.promptContains) {
321
+ const promptHaystack = (step.prompt ?? '').toLowerCase();
322
+ if (!promptHaystack.includes(config.promptContains.toLowerCase()))
323
+ return false;
324
+ }
325
+ if (config.outputContains) {
326
+ const outputHaystack = (step.completion ?? '').toLowerCase();
327
+ if (!outputHaystack.includes(config.outputContains.toLowerCase()))
328
+ return false;
329
+ }
330
+ return true;
331
+ });
332
+ const count = matching.length;
333
+ let pass;
334
+ if (config.times !== undefined) {
335
+ pass = count === config.times;
336
+ }
337
+ else if (config.minTimes !== undefined || config.maxTimes !== undefined) {
338
+ const min = config.minTimes ?? 0;
339
+ const max = config.maxTimes ?? Infinity;
340
+ pass = count >= min && count <= max;
341
+ }
342
+ else {
343
+ pass = count > 0;
344
+ }
345
+ return {
346
+ pass,
347
+ message: () => {
348
+ if (pass) {
349
+ return `Expected trace NOT to have LLM step matching ${JSON.stringify(config)}`;
350
+ }
351
+ const stepSummary = steps.length === 0
352
+ ? 'no LLM steps were recorded'
353
+ : `${count} matching step(s) found; recorded steps: ${JSON.stringify(steps)}`;
354
+ return `Expected trace to have LLM step matching ${JSON.stringify(config)}, but ${stepSummary}`;
355
+ },
356
+ };
357
+ },
358
+ toCallTool(trace, toolName) {
359
+ if (!isTraceHandle(trace)) {
360
+ return {
361
+ pass: false,
362
+ message: () => `Expected a TraceHandle (ctx.trace) but received ${typeof trace}.\nUse: expect(ctx.trace).toCallTool(...)`,
363
+ };
364
+ }
365
+ const calls = trace.getToolCalls();
366
+ const pass = calls.some((c) => c.name === toolName);
367
+ return {
368
+ pass,
369
+ message: () => {
370
+ if (pass) {
371
+ return `Expected trace NOT to call tool "${toolName}"`;
372
+ }
373
+ const names = calls.map((c) => c.name);
374
+ const recorded = names.length === 0 ? 'no tool calls were recorded' : `recorded: [${names.join(', ')}]`;
375
+ return `Expected tool "${toolName}" to be called, but ${recorded}`;
376
+ },
377
+ };
378
+ },
379
+ async toMatchSemanticOutput(trace, expected, options) {
380
+ if (!isTraceHandle(trace)) {
381
+ return {
382
+ pass: false,
383
+ message: () => `Expected a TraceHandle (ctx.trace) but received ${typeof trace}.\nUse: expect(ctx.trace).toMatchSemanticOutput(...)`,
384
+ };
385
+ }
386
+ const steps = trace.getLLMSteps();
387
+ const fullOutput = steps
388
+ .map((s) => [s.completion, s.contains].filter(Boolean).join(' '))
389
+ .join(' ')
390
+ .trim();
391
+ try {
392
+ const pass = await llmJudgeSemanticMatch(fullOutput, expected, options);
393
+ return {
394
+ pass,
395
+ message: () => {
396
+ if (pass) {
397
+ return `Expected trace output NOT to semantically match "${expected}" (LLM judged YES)`;
398
+ }
399
+ return `Expected trace output to semantically match "${expected}", but LLM judged NO. Trace output: "${fullOutput || '(empty)'}"`;
400
+ },
401
+ };
402
+ }
403
+ catch (err) {
404
+ return {
405
+ pass: false,
406
+ message: () => `LLM semantic match failed: ${err.message}`,
407
+ };
408
+ }
409
+ },
410
+ async toEvaluateOutputMetric(trace, config) {
411
+ if (!isTraceHandle(trace)) {
412
+ return {
413
+ pass: false,
414
+ message: () => `Expected a TraceHandle (ctx.trace) but received ${typeof trace}.
415
+ Use: expect(ctx.trace).toEvaluateOutputMetric(...)`,
416
+ };
417
+ }
418
+ if (!config || !config.evaluationPrompt) {
419
+ return {
420
+ pass: false,
421
+ message: () => 'toEvaluateOutputMetric requires evaluationPrompt',
422
+ };
423
+ }
424
+ const steps = trace.getLLMSteps();
425
+ if (steps.length === 0) {
426
+ return {
427
+ pass: false,
428
+ message: () => 'No LLM steps recorded; cannot evaluate output metric.',
429
+ };
430
+ }
431
+ const targetIdx = config.index ?? (config.nth !== undefined ? config.nth - 1 : steps.length - 1);
432
+ if (targetIdx < 0 || targetIdx >= steps.length) {
433
+ return {
434
+ pass: false,
435
+ message: () => `LLM steps length ${steps.length}, but index/nth points to ${targetIdx}.`,
436
+ };
437
+ }
438
+ const targetStep = steps[targetIdx];
439
+ const targetField = config.target ?? 'result';
440
+ const targetText = targetField === 'prompt' ? targetStep.prompt ?? '' : targetStep.completion ?? '';
441
+ if (!targetText) {
442
+ return {
443
+ pass: false,
444
+ message: () => `Selected LLM step has empty ${targetField}; cannot evaluate.`,
445
+ };
446
+ }
447
+ const condition = (() => {
448
+ try {
449
+ return resolveCondition(config.condition);
450
+ }
451
+ catch (err) {
452
+ return err;
453
+ }
454
+ })();
455
+ if (condition instanceof Error) {
456
+ return {
457
+ pass: false,
458
+ message: () => condition.message,
459
+ };
460
+ }
461
+ const preparedText = prepareOutputForJudge(targetText, config.evaluationPrompt);
462
+ const evalPrompt = `
463
+ Evaluation prompt (from user):
464
+ ${config.evaluationPrompt}
465
+
466
+ Score the following text strictly between 0 and 1 (inclusive). Respond with only the number.
467
+
468
+ <output>
469
+ ${preparedText}
470
+ </output>
471
+ `.trim();
472
+ try {
473
+ const raw = (await callProviderLLM(evalPrompt, { provider: config.provider, model: config.model, sdk: config.sdk, apiKey: config.apiKey, baseURL: config.baseURL }, 'You are an evaluation assistant. Return only a number between 0 and 1.', 16, 0)).content;
474
+ const score = parseFirstNumber(raw);
475
+ if (score === null) {
476
+ return {
477
+ pass: false,
478
+ message: () => `Could not parse numeric metric from model response: "${raw}"`,
479
+ };
480
+ }
481
+ if (score < 0 || score > 1) {
482
+ return {
483
+ pass: false,
484
+ message: () => `Metric ${score} is out of allowed range 0.0–1.0 (raw: "${raw}")`,
485
+ };
486
+ }
487
+ const pass = checkCondition(score, condition);
488
+ return {
489
+ pass,
490
+ message: () => {
491
+ if (pass) {
492
+ return `Expected metric NOT to satisfy ${condition.kind} ${condition.value} (score ${score})`;
493
+ }
494
+ return `Metric check failed: score ${score} did not satisfy ${condition.kind} ${condition.value}. Raw response: "${raw}"`;
495
+ },
496
+ };
497
+ }
498
+ catch (err) {
499
+ return {
500
+ pass: false,
501
+ message: () => `LLM evaluation failed: ${err.message}`,
502
+ };
503
+ }
504
+ },
505
+ toHaveCustomStep(trace, config = {}) {
506
+ if (!isTraceHandle(trace) || typeof trace.getCustomSteps !== 'function') {
507
+ return {
508
+ pass: false,
509
+ message: () => `Expected a TraceHandle (ctx.trace with getCustomSteps) but received ${typeof trace}.\nUse: expect(ctx.trace).toHaveCustomStep(...)`,
510
+ };
511
+ }
512
+ const steps = trace.getCustomSteps();
513
+ const matchString = (val) => {
514
+ if (val === undefined || val === null)
515
+ return '';
516
+ if (typeof val === 'string')
517
+ return val;
518
+ try {
519
+ return JSON.stringify(val);
520
+ }
521
+ catch {
522
+ return String(val);
523
+ }
524
+ };
525
+ const matching = steps.filter((step) => {
526
+ if (config.kind && step.kind !== config.kind)
527
+ return false;
528
+ if (config.name && step.name !== config.name)
529
+ return false;
530
+ if (config.tag && !(step.tags || []).includes(config.tag))
531
+ return false;
532
+ const payloadStr = matchString(step.payload).toLowerCase();
533
+ const resultStr = matchString(step.result).toLowerCase();
534
+ const metaStr = matchString(step.metadata).toLowerCase();
535
+ const combined = [payloadStr, resultStr, metaStr].filter(Boolean).join(' ');
536
+ if (config.contains && !combined.includes(config.contains.toLowerCase()))
537
+ return false;
538
+ if (config.payloadContains && !payloadStr.includes(config.payloadContains.toLowerCase()))
539
+ return false;
540
+ if (config.resultContains && !resultStr.includes(config.resultContains.toLowerCase()))
541
+ return false;
542
+ if (config.metadataContains && !metaStr.includes(config.metadataContains.toLowerCase()))
543
+ return false;
544
+ return true;
545
+ });
546
+ const count = matching.length;
547
+ let pass;
548
+ if (config.times !== undefined) {
549
+ pass = count === config.times;
550
+ }
551
+ else if (config.minTimes !== undefined || config.maxTimes !== undefined) {
552
+ const min = config.minTimes ?? 0;
553
+ const max = config.maxTimes ?? Infinity;
554
+ pass = count >= min && count <= max;
555
+ }
556
+ else {
557
+ pass = count > 0;
558
+ }
559
+ return {
560
+ pass,
561
+ message: () => {
562
+ if (pass) {
563
+ return `Expected trace NOT to have custom step matching ${JSON.stringify(config)}`;
564
+ }
565
+ const stepSummary = steps.length === 0
566
+ ? 'no custom steps were recorded'
567
+ : `${count} matching step(s) found; recorded custom steps: ${JSON.stringify(steps)}`;
568
+ return `Expected trace to have custom step matching ${JSON.stringify(config)}, but ${stepSummary}`;
569
+ },
570
+ };
571
+ },
572
+ toHavePromptWhere(trace, config) {
573
+ if (!isTraceHandle(trace)) {
574
+ return {
575
+ pass: false,
576
+ message: () => `Expected a TraceHandle (ctx.trace) but received ${typeof trace}.\nUse: expect(ctx.trace).toHavePromptWhere(...)`,
577
+ };
578
+ }
579
+ if (!config || !config.filterContains) {
580
+ return {
581
+ pass: false,
582
+ message: () => 'toHavePromptWhere requires filterContains',
583
+ };
584
+ }
585
+ const filterNeedle = config.filterContains.toLowerCase();
586
+ const requireNeedle = config.requireContains?.toLowerCase();
587
+ const forbidNeedle = config.requireNotContains?.toLowerCase();
588
+ const prompts = trace.getLLMSteps().map((s) => s.prompt ?? '');
589
+ const filtered = prompts.filter((p) => p.toLowerCase().includes(filterNeedle));
590
+ // Optional positional check (index or nth)
591
+ const targetIdx = config.index ?? (config.nth !== undefined ? config.nth - 1 : undefined);
592
+ let checked = [];
593
+ let count = 0;
594
+ let pass = true;
595
+ if (targetIdx !== undefined) {
596
+ if (targetIdx < 0 || targetIdx >= filtered.length) {
597
+ return {
598
+ pass: false,
599
+ message: () => `Filtered prompts length ${filtered.length}, but index/nth points to ${targetIdx}. Config: ${JSON.stringify(config)}`,
600
+ };
601
+ }
602
+ const p = filtered[targetIdx];
603
+ const lower = p.toLowerCase();
604
+ const okRequire = requireNeedle ? lower.includes(requireNeedle) : true;
605
+ const okForbid = forbidNeedle ? !lower.includes(forbidNeedle) : true;
606
+ pass = okRequire && okForbid;
607
+ checked = okRequire && okForbid ? [p] : [];
608
+ count = checked.length;
609
+ }
610
+ else {
611
+ checked = filtered.filter((p) => {
612
+ const lower = p.toLowerCase();
613
+ if (requireNeedle && !lower.includes(requireNeedle))
614
+ return false;
615
+ if (forbidNeedle && lower.includes(forbidNeedle))
616
+ return false;
617
+ return true;
618
+ });
619
+ count = checked.length;
620
+ if (config.times !== undefined) {
621
+ pass = count === config.times;
622
+ }
623
+ else {
624
+ const min = config.minTimes ?? 0;
625
+ const max = config.maxTimes ?? Infinity;
626
+ pass = count >= min && count <= max;
627
+ }
628
+ // Also ensure that if requireContains is set, no filtered prompt violates it
629
+ if (requireNeedle) {
630
+ const violating = filtered.filter((p) => !p.toLowerCase().includes(requireNeedle));
631
+ if (violating.length > 0)
632
+ pass = false;
633
+ }
634
+ if (forbidNeedle) {
635
+ const violating = filtered.filter((p) => p.toLowerCase().includes(forbidNeedle));
636
+ if (violating.length > 0)
637
+ pass = false;
638
+ }
639
+ }
640
+ return {
641
+ pass,
642
+ message: () => {
643
+ if (pass) {
644
+ return `Expected prompts NOT to satisfy filter/require combo: ${JSON.stringify(config)}`;
645
+ }
646
+ const base = [`Expected prompts filtered by "${config.filterContains}" to satisfy requirements`];
647
+ if (config.requireContains)
648
+ base.push(`requireContains: "${config.requireContains}"`);
649
+ if (config.requireNotContains)
650
+ base.push(`requireNotContains: "${config.requireNotContains}"`);
651
+ if (targetIdx !== undefined) {
652
+ base.push(`checked index: ${targetIdx}`, `filtered count: ${filtered.length}`);
653
+ }
654
+ else {
655
+ base.push(`filtered count: ${filtered.length}, passing count: ${checked.length}`);
656
+ base.push(config.times !== undefined
657
+ ? `expected exactly ${config.times}`
658
+ : `expected between ${config.minTimes ?? 0} and ${config.maxTimes ?? Infinity}`);
659
+ }
660
+ return base.filter(Boolean).join('; ');
661
+ },
662
+ };
663
+ },
664
+ });
665
+ }
666
+ // Export our patched expect so users can import it and get the correct type and runtime matchers
667
+ export { expect };
668
+ //# sourceMappingURL=index.js.map