elasticdash-sdk 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (349) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +775 -0
  3. package/dist/browser-ui.d.ts +43 -0
  4. package/dist/browser-ui.d.ts.map +1 -0
  5. package/dist/browser-ui.js +246 -0
  6. package/dist/browser-ui.js.map +1 -0
  7. package/dist/capture/event.d.ts +33 -0
  8. package/dist/capture/event.d.ts.map +1 -0
  9. package/dist/capture/event.js +2 -0
  10. package/dist/capture/event.js.map +1 -0
  11. package/dist/capture/index.d.ts +4 -0
  12. package/dist/capture/index.d.ts.map +1 -0
  13. package/dist/capture/index.js +4 -0
  14. package/dist/capture/index.js.map +1 -0
  15. package/dist/capture/recorder.d.ts +24 -0
  16. package/dist/capture/recorder.d.ts.map +1 -0
  17. package/dist/capture/recorder.js +46 -0
  18. package/dist/capture/recorder.js.map +1 -0
  19. package/dist/capture/replay.d.ts +20 -0
  20. package/dist/capture/replay.d.ts.map +1 -0
  21. package/dist/capture/replay.js +47 -0
  22. package/dist/capture/replay.js.map +1 -0
  23. package/dist/ci/api-client.d.ts +38 -0
  24. package/dist/ci/api-client.d.ts.map +1 -0
  25. package/dist/ci/api-client.js +96 -0
  26. package/dist/ci/api-client.js.map +1 -0
  27. package/dist/ci/benchmark.d.ts +33 -0
  28. package/dist/ci/benchmark.d.ts.map +1 -0
  29. package/dist/ci/benchmark.js +213 -0
  30. package/dist/ci/benchmark.js.map +1 -0
  31. package/dist/ci/ed-runner.d.ts +48 -0
  32. package/dist/ci/ed-runner.d.ts.map +1 -0
  33. package/dist/ci/ed-runner.js +260 -0
  34. package/dist/ci/ed-runner.js.map +1 -0
  35. package/dist/ci/executor.d.ts +13 -0
  36. package/dist/ci/executor.d.ts.map +1 -0
  37. package/dist/ci/executor.js +542 -0
  38. package/dist/ci/executor.js.map +1 -0
  39. package/dist/ci/git-info.d.ts +17 -0
  40. package/dist/ci/git-info.d.ts.map +1 -0
  41. package/dist/ci/git-info.js +102 -0
  42. package/dist/ci/git-info.js.map +1 -0
  43. package/dist/ci/index.d.ts +6 -0
  44. package/dist/ci/index.d.ts.map +1 -0
  45. package/dist/ci/index.js +4 -0
  46. package/dist/ci/index.js.map +1 -0
  47. package/dist/ci/measurement.d.ts +9 -0
  48. package/dist/ci/measurement.d.ts.map +1 -0
  49. package/dist/ci/measurement.js +15 -0
  50. package/dist/ci/measurement.js.map +1 -0
  51. package/dist/ci/replay.d.ts +31 -0
  52. package/dist/ci/replay.d.ts.map +1 -0
  53. package/dist/ci/replay.js +96 -0
  54. package/dist/ci/replay.js.map +1 -0
  55. package/dist/ci/reporters/default.d.ts +8 -0
  56. package/dist/ci/reporters/default.d.ts.map +1 -0
  57. package/dist/ci/reporters/default.js +46 -0
  58. package/dist/ci/reporters/default.js.map +1 -0
  59. package/dist/ci/reporters/index.d.ts +8 -0
  60. package/dist/ci/reporters/index.d.ts.map +1 -0
  61. package/dist/ci/reporters/index.js +14 -0
  62. package/dist/ci/reporters/index.js.map +1 -0
  63. package/dist/ci/reporters/json.d.ts +8 -0
  64. package/dist/ci/reporters/json.d.ts.map +1 -0
  65. package/dist/ci/reporters/json.js +14 -0
  66. package/dist/ci/reporters/json.js.map +1 -0
  67. package/dist/ci/reporters/junit.d.ts +8 -0
  68. package/dist/ci/reporters/junit.d.ts.map +1 -0
  69. package/dist/ci/reporters/junit.js +48 -0
  70. package/dist/ci/reporters/junit.js.map +1 -0
  71. package/dist/ci/runner.d.ts +3 -0
  72. package/dist/ci/runner.d.ts.map +1 -0
  73. package/dist/ci/runner.js +187 -0
  74. package/dist/ci/runner.js.map +1 -0
  75. package/dist/ci/test-discovery.d.ts +5 -0
  76. package/dist/ci/test-discovery.d.ts.map +1 -0
  77. package/dist/ci/test-discovery.js +11 -0
  78. package/dist/ci/test-discovery.js.map +1 -0
  79. package/dist/ci/test-loader.d.ts +19 -0
  80. package/dist/ci/test-loader.d.ts.map +1 -0
  81. package/dist/ci/test-loader.js +149 -0
  82. package/dist/ci/test-loader.js.map +1 -0
  83. package/dist/ci/test-registry.d.ts +42 -0
  84. package/dist/ci/test-registry.d.ts.map +1 -0
  85. package/dist/ci/test-registry.js +18 -0
  86. package/dist/ci/test-registry.js.map +1 -0
  87. package/dist/ci/trace-schema.d.ts +30 -0
  88. package/dist/ci/trace-schema.d.ts.map +1 -0
  89. package/dist/ci/trace-schema.js +66 -0
  90. package/dist/ci/trace-schema.js.map +1 -0
  91. package/dist/ci/trace-writer.d.ts +16 -0
  92. package/dist/ci/trace-writer.d.ts.map +1 -0
  93. package/dist/ci/trace-writer.js +108 -0
  94. package/dist/ci/trace-writer.js.map +1 -0
  95. package/dist/ci/types.d.ts +108 -0
  96. package/dist/ci/types.d.ts.map +1 -0
  97. package/dist/ci/types.js +3 -0
  98. package/dist/ci/types.js.map +1 -0
  99. package/dist/ci/upload-client.d.ts +74 -0
  100. package/dist/ci/upload-client.d.ts.map +1 -0
  101. package/dist/ci/upload-client.js +195 -0
  102. package/dist/ci/upload-client.js.map +1 -0
  103. package/dist/cli.d.ts +3 -0
  104. package/dist/cli.d.ts.map +1 -0
  105. package/dist/cli.js +716 -0
  106. package/dist/cli.js.map +1 -0
  107. package/dist/core/agent-state.d.ts +47 -0
  108. package/dist/core/agent-state.d.ts.map +1 -0
  109. package/dist/core/agent-state.js +137 -0
  110. package/dist/core/agent-state.js.map +1 -0
  111. package/dist/core/judge-utils.d.ts +22 -0
  112. package/dist/core/judge-utils.d.ts.map +1 -0
  113. package/dist/core/judge-utils.js +211 -0
  114. package/dist/core/judge-utils.js.map +1 -0
  115. package/dist/core/registry.d.ts +28 -0
  116. package/dist/core/registry.d.ts.map +1 -0
  117. package/dist/core/registry.js +52 -0
  118. package/dist/core/registry.js.map +1 -0
  119. package/dist/dashboard-server.d.ts +65 -0
  120. package/dist/dashboard-server.d.ts.map +1 -0
  121. package/dist/dashboard-server.js +3940 -0
  122. package/dist/dashboard-server.js.map +1 -0
  123. package/dist/execution/tool-runner.d.ts +26 -0
  124. package/dist/execution/tool-runner.d.ts.map +1 -0
  125. package/dist/execution/tool-runner.js +316 -0
  126. package/dist/execution/tool-runner.js.map +1 -0
  127. package/dist/html/dashboard.html +2218 -0
  128. package/dist/http.d.ts +14 -0
  129. package/dist/http.d.ts.map +1 -0
  130. package/dist/http.js +13 -0
  131. package/dist/http.js.map +1 -0
  132. package/dist/index.cjs +8102 -0
  133. package/dist/index.d.ts +61 -0
  134. package/dist/index.d.ts.map +1 -0
  135. package/dist/index.js +67 -0
  136. package/dist/index.js.map +1 -0
  137. package/dist/interceptors/ai-interceptor.d.ts +26 -0
  138. package/dist/interceptors/ai-interceptor.d.ts.map +1 -0
  139. package/dist/interceptors/ai-interceptor.js +756 -0
  140. package/dist/interceptors/ai-interceptor.js.map +1 -0
  141. package/dist/interceptors/db-auto.d.ts +8 -0
  142. package/dist/interceptors/db-auto.d.ts.map +1 -0
  143. package/dist/interceptors/db-auto.js +217 -0
  144. package/dist/interceptors/db-auto.js.map +1 -0
  145. package/dist/interceptors/db.d.ts +23 -0
  146. package/dist/interceptors/db.d.ts.map +1 -0
  147. package/dist/interceptors/db.js +137 -0
  148. package/dist/interceptors/db.js.map +1 -0
  149. package/dist/interceptors/http.d.ts +28 -0
  150. package/dist/interceptors/http.d.ts.map +1 -0
  151. package/dist/interceptors/http.js +356 -0
  152. package/dist/interceptors/http.js.map +1 -0
  153. package/dist/interceptors/side-effects.d.ts +7 -0
  154. package/dist/interceptors/side-effects.d.ts.map +1 -0
  155. package/dist/interceptors/side-effects.js +72 -0
  156. package/dist/interceptors/side-effects.js.map +1 -0
  157. package/dist/interceptors/telemetry-push.d.ts +142 -0
  158. package/dist/interceptors/telemetry-push.d.ts.map +1 -0
  159. package/dist/interceptors/telemetry-push.js +463 -0
  160. package/dist/interceptors/telemetry-push.js.map +1 -0
  161. package/dist/interceptors/tool.d.ts +2 -0
  162. package/dist/interceptors/tool.d.ts.map +1 -0
  163. package/dist/interceptors/tool.js +274 -0
  164. package/dist/interceptors/tool.js.map +1 -0
  165. package/dist/interceptors/workflow-ai.d.ts +5 -0
  166. package/dist/interceptors/workflow-ai.d.ts.map +1 -0
  167. package/dist/interceptors/workflow-ai.js +382 -0
  168. package/dist/interceptors/workflow-ai.js.map +1 -0
  169. package/dist/internals/conditional-recorder.d.ts +21 -0
  170. package/dist/internals/conditional-recorder.d.ts.map +1 -0
  171. package/dist/internals/conditional-recorder.js +54 -0
  172. package/dist/internals/conditional-recorder.js.map +1 -0
  173. package/dist/internals/mock-resolver.d.ts +146 -0
  174. package/dist/internals/mock-resolver.d.ts.map +1 -0
  175. package/dist/internals/mock-resolver.js +427 -0
  176. package/dist/internals/mock-resolver.js.map +1 -0
  177. package/dist/matchers/index.d.ts +96 -0
  178. package/dist/matchers/index.d.ts.map +1 -0
  179. package/dist/matchers/index.js +668 -0
  180. package/dist/matchers/index.js.map +1 -0
  181. package/dist/observability.d.ts +82 -0
  182. package/dist/observability.d.ts.map +1 -0
  183. package/dist/observability.js +471 -0
  184. package/dist/observability.js.map +1 -0
  185. package/dist/portal-executor.d.ts +30 -0
  186. package/dist/portal-executor.d.ts.map +1 -0
  187. package/dist/portal-executor.js +324 -0
  188. package/dist/portal-executor.js.map +1 -0
  189. package/dist/portal-server.d.ts +3 -0
  190. package/dist/portal-server.d.ts.map +1 -0
  191. package/dist/portal-server.js +279 -0
  192. package/dist/portal-server.js.map +1 -0
  193. package/dist/proxy/llm-capture.d.ts +14 -0
  194. package/dist/proxy/llm-capture.d.ts.map +1 -0
  195. package/dist/proxy/llm-capture.js +264 -0
  196. package/dist/proxy/llm-capture.js.map +1 -0
  197. package/dist/reporter.d.ts +3 -0
  198. package/dist/reporter.d.ts.map +1 -0
  199. package/dist/reporter.js +72 -0
  200. package/dist/reporter.js.map +1 -0
  201. package/dist/runWorkflowSubprocess.d.ts +14 -0
  202. package/dist/runWorkflowSubprocess.d.ts.map +1 -0
  203. package/dist/runWorkflowSubprocess.js +66 -0
  204. package/dist/runWorkflowSubprocess.js.map +1 -0
  205. package/dist/runner.d.ts +16 -0
  206. package/dist/runner.d.ts.map +1 -0
  207. package/dist/runner.js +138 -0
  208. package/dist/runner.js.map +1 -0
  209. package/dist/socket-connector.d.ts +22 -0
  210. package/dist/socket-connector.d.ts.map +1 -0
  211. package/dist/socket-connector.js +104 -0
  212. package/dist/socket-connector.js.map +1 -0
  213. package/dist/telemetry-batcher.d.ts +56 -0
  214. package/dist/telemetry-batcher.d.ts.map +1 -0
  215. package/dist/telemetry-batcher.js +143 -0
  216. package/dist/telemetry-batcher.js.map +1 -0
  217. package/dist/test-setup.d.ts +12 -0
  218. package/dist/test-setup.d.ts.map +1 -0
  219. package/dist/test-setup.js +13 -0
  220. package/dist/test-setup.js.map +1 -0
  221. package/dist/tool-registry.d.ts +31 -0
  222. package/dist/tool-registry.d.ts.map +1 -0
  223. package/dist/tool-registry.js +73 -0
  224. package/dist/tool-registry.js.map +1 -0
  225. package/dist/tool-runner-worker.d.ts +2 -0
  226. package/dist/tool-runner-worker.d.ts.map +1 -0
  227. package/dist/tool-runner-worker.js +215 -0
  228. package/dist/tool-runner-worker.js.map +1 -0
  229. package/dist/trace-adapter/context.d.ts +72 -0
  230. package/dist/trace-adapter/context.d.ts.map +1 -0
  231. package/dist/trace-adapter/context.js +80 -0
  232. package/dist/trace-adapter/context.js.map +1 -0
  233. package/dist/tracing.d.ts +2 -0
  234. package/dist/tracing.d.ts.map +1 -0
  235. package/dist/tracing.js +59 -0
  236. package/dist/tracing.js.map +1 -0
  237. package/dist/trigger-executor.d.ts +12 -0
  238. package/dist/trigger-executor.d.ts.map +1 -0
  239. package/dist/trigger-executor.js +130 -0
  240. package/dist/trigger-executor.js.map +1 -0
  241. package/dist/types/portal.d.ts +76 -0
  242. package/dist/types/portal.d.ts.map +1 -0
  243. package/dist/types/portal.js +2 -0
  244. package/dist/types/portal.js.map +1 -0
  245. package/dist/utils/debug.d.ts +3 -0
  246. package/dist/utils/debug.d.ts.map +1 -0
  247. package/dist/utils/debug.js +8 -0
  248. package/dist/utils/debug.js.map +1 -0
  249. package/dist/utils/license-error.d.ts +23 -0
  250. package/dist/utils/license-error.d.ts.map +1 -0
  251. package/dist/utils/license-error.js +42 -0
  252. package/dist/utils/license-error.js.map +1 -0
  253. package/dist/utils/redact.d.ts +7 -0
  254. package/dist/utils/redact.d.ts.map +1 -0
  255. package/dist/utils/redact.js +26 -0
  256. package/dist/utils/redact.js.map +1 -0
  257. package/dist/workflow-runner-worker.d.ts +2 -0
  258. package/dist/workflow-runner-worker.d.ts.map +1 -0
  259. package/dist/workflow-runner-worker.js +329 -0
  260. package/dist/workflow-runner-worker.js.map +1 -0
  261. package/dist/workflow-runner.d.ts +14 -0
  262. package/dist/workflow-runner.d.ts.map +1 -0
  263. package/dist/workflow-runner.js +34 -0
  264. package/dist/workflow-runner.js.map +1 -0
  265. package/docs/agent-coding-instructions.md +138 -0
  266. package/docs/agent-integration-guide.md +564 -0
  267. package/docs/agents.md +140 -0
  268. package/docs/dashboard.md +394 -0
  269. package/docs/deno.md +69 -0
  270. package/docs/instrumentation.md +424 -0
  271. package/docs/langfuse-trace-structure.md +145 -0
  272. package/docs/matchers.md +173 -0
  273. package/docs/observability_contract.md +192 -0
  274. package/docs/observability_mode.md +195 -0
  275. package/docs/quickstart.md +621 -0
  276. package/docs/security-compliance.md +566 -0
  277. package/docs/test-writing-guidelines.md +444 -0
  278. package/docs/tools.md +165 -0
  279. package/docs/workflow-modes.md +253 -0
  280. package/package.json +76 -0
  281. package/src/browser-ui.ts +281 -0
  282. package/src/capture/event.ts +30 -0
  283. package/src/capture/index.ts +3 -0
  284. package/src/capture/recorder.ts +62 -0
  285. package/src/capture/replay.ts +55 -0
  286. package/src/ci/api-client.ts +136 -0
  287. package/src/ci/benchmark.ts +257 -0
  288. package/src/ci/ed-runner.ts +351 -0
  289. package/src/ci/executor.ts +671 -0
  290. package/src/ci/git-info.ts +127 -0
  291. package/src/ci/index.ts +5 -0
  292. package/src/ci/measurement.ts +25 -0
  293. package/src/ci/replay.ts +127 -0
  294. package/src/ci/reporters/default.ts +50 -0
  295. package/src/ci/reporters/index.ts +21 -0
  296. package/src/ci/reporters/json.ts +18 -0
  297. package/src/ci/reporters/junit.ts +61 -0
  298. package/src/ci/runner.ts +208 -0
  299. package/src/ci/test-discovery.ts +16 -0
  300. package/src/ci/test-loader.ts +187 -0
  301. package/src/ci/test-registry.ts +62 -0
  302. package/src/ci/trace-schema.ts +96 -0
  303. package/src/ci/trace-writer.ts +107 -0
  304. package/src/ci/types.ts +115 -0
  305. package/src/ci/upload-client.ts +300 -0
  306. package/src/cli.ts +811 -0
  307. package/src/core/agent-state.ts +162 -0
  308. package/src/core/judge-utils.ts +232 -0
  309. package/src/core/registry.ts +92 -0
  310. package/src/dashboard-server.ts +2047 -0
  311. package/src/execution/tool-runner.ts +352 -0
  312. package/src/html/dashboard.html +2218 -0
  313. package/src/http.ts +13 -0
  314. package/src/index.ts +138 -0
  315. package/src/interceptors/ai-interceptor.ts +798 -0
  316. package/src/interceptors/db-auto.ts +243 -0
  317. package/src/interceptors/db.ts +156 -0
  318. package/src/interceptors/http.ts +393 -0
  319. package/src/interceptors/side-effects.ts +83 -0
  320. package/src/interceptors/telemetry-push.ts +537 -0
  321. package/src/interceptors/tool.ts +287 -0
  322. package/src/interceptors/workflow-ai.ts +419 -0
  323. package/src/internals/conditional-recorder.ts +63 -0
  324. package/src/internals/mock-resolver.ts +492 -0
  325. package/src/matchers/index.ts +824 -0
  326. package/src/observability.ts +501 -0
  327. package/src/portal-executor.ts +355 -0
  328. package/src/portal-server.ts +304 -0
  329. package/src/proxy/llm-capture.ts +301 -0
  330. package/src/reporter.ts +81 -0
  331. package/src/runWorkflowSubprocess.ts +74 -0
  332. package/src/runner.ts +178 -0
  333. package/src/socket-connector.ts +117 -0
  334. package/src/telemetry-batcher.ts +191 -0
  335. package/src/test-setup.ts +16 -0
  336. package/src/tool-registry.ts +94 -0
  337. package/src/tool-runner-worker.ts +244 -0
  338. package/src/trace-adapter/context.ts +156 -0
  339. package/src/tracing.ts +62 -0
  340. package/src/trigger-executor.ts +171 -0
  341. package/src/types/agent.d.ts +63 -0
  342. package/src/types/expect.d.ts +81 -0
  343. package/src/types/modules.d.ts +2 -0
  344. package/src/types/portal.ts +69 -0
  345. package/src/utils/debug.ts +8 -0
  346. package/src/utils/license-error.ts +43 -0
  347. package/src/utils/redact.ts +25 -0
  348. package/src/workflow-runner-worker.ts +386 -0
  349. package/src/workflow-runner.ts +58 -0
@@ -0,0 +1,824 @@
1
+ import { expect } from 'expect'
2
+ import type { TraceHandle, LLMStep, CustomStep, CustomStepKind } from '../trace-adapter/context.js'
3
+ import { prepareOutputForJudge } from '../core/judge-utils.js'
4
+
5
/** Filter and count constraints accepted by the `toHaveLLMStep` matcher. */
interface LLMStepConfig {
  /** Model identifier the step must match exactly. */
  model?: string
  /** Substring searched (case-insensitively) across the step's completion, prompt and `contains` fields. */
  contains?: string
  /** Substring searched (case-insensitively) in `step.prompt` only. */
  promptContains?: string
  /** Substring searched (case-insensitively) in `step.completion` only. */
  outputContains?: string
  /** Provider the step must match, e.g. 'openai' | 'claude' | 'gemini' | 'grok'. */
  provider?: string
  /** Match count must equal exactly this value. */
  times?: number
  /** Match count must be >= this value. */
  minTimes?: number
  /** Match count must be <= this value. */
  maxTimes?: number
}
15
+
16
/** Filter and count constraints accepted by the `toHaveCustomStep` matcher. */
interface CustomStepConfig {
  /** Kind of custom step to match. */
  kind?: CustomStepKind
  /** Name of the custom step to match. */
  name?: string
  /** Tag of the custom step to match. */
  tag?: string
  /** Substring searched in the stringified payload, result and metadata. */
  contains?: string
  /** Substring searched in the result only. */
  resultContains?: string
  /** Substring searched in the payload only. */
  payloadContains?: string
  /** Substring searched in the metadata only. */
  metadataContains?: string
  /** Exact number of matching steps. */
  times?: number
  /** Minimum number of matching steps. */
  minTimes?: number
  /** Maximum number of matching steps. */
  maxTimes?: number
}
28
+
29
/**
 * Configuration for `toHavePromptWhere`: prompts are first narrowed with
 * `filterContains`, then the remaining fields are asserted on that subset.
 */
interface PromptWhereConfig {
  /** First filter: keep prompts that contain this substring. */
  filterContains: string
  /** Assertion: the filtered prompts must also contain this substring. */
  requireContains?: string
  /** Assertion: the filtered prompts must NOT contain this substring. */
  requireNotContains?: string
  /** Exact count of filtered prompts. */
  times?: number
  /** Minimum count of filtered prompts. */
  minTimes?: number
  /** Maximum count of filtered prompts. */
  maxTimes?: number
  /** Optional 0-based index into the filtered prompts to check specifically. */
  index?: number
  /** Optional 1-based alias for `index`. */
  nth?: number
}
39
+
40
/** Judge LLM providers that can be selected via `provider`. */
type SupportedProvider =
  | 'openai'
  | 'claude'
  | 'gemini'
  | 'grok'
  | 'kimi'
41
+
42
/** Options selecting which judge LLM is called and how it is reached. */
interface SemanticMatchOptions {
  /** Judge provider to call. */
  provider?: SupportedProvider
  /** Model identifier override. */
  model?: string
  /** Optional user-supplied SDK instance. */
  sdk?: unknown
  /** Optional API key override (useful for OpenAI-compatible endpoints). */
  apiKey?: string
  /** Optional base URL override for OpenAI-compatible APIs. */
  baseURL?: string
}
49
+
50
/** Which side of an LLM step is evaluated: its prompt or its result. */
type EvaluationTarget =
  | 'prompt'
  | 'result'
51
+
52
/** Numeric comparison applied to an evaluation score; each field names one comparison operator. */
interface EvaluationCondition {
  /** Score must be strictly greater than this value. */
  greaterThan?: number
  /** Score must be strictly less than this value. */
  lessThan?: number
  /** Score must be greater than or equal to this value. */
  atLeast?: number
  /** Score must be less than or equal to this value. */
  atMost?: number
  /** Score must be exactly equal to this value. */
  equals?: number
}
59
+
60
/** Configuration for the `toEvaluateOutputMetric` matcher. */
interface EvaluateOutputMetricConfig {
  /** Instruction given to the judge LLM describing what to score. */
  evaluationPrompt: string
  /** 'prompt' or 'result'; default 'result'. */
  target?: EvaluationTarget
  /** 0-based index into LLM steps. */
  index?: number
  /** 1-based alias for `index`. */
  nth?: number
  /** Optional; default is atLeast 0.7. */
  condition?: EvaluationCondition
  /** Judge provider to call. */
  provider?: SupportedProvider
  /** Model identifier override. */
  model?: string
  /** Optional SDK instance. */
  sdk?: unknown
  /** Optional API key override (useful for OpenAI-compatible endpoints). */
  apiKey?: string
  /** Optional base URL override for OpenAI-compatible APIs. */
  baseURL?: string
}
72
+
73
/**
 * Type guard for trace-aware matchers.
 *
 * Reports whether `value` is a non-null object exposing both `getLLMSteps`
 * and `getToolCalls` as functions, so a matcher can emit a clear error when
 * it receives something else (for example a plain string).
 */
function isTraceHandle(value: unknown): value is TraceHandle {
  if (typeof value !== 'object' || value === null) return false
  const candidate = value as Record<string, unknown>
  if (typeof candidate.getLLMSteps !== 'function') return false
  return typeof candidate.getToolCalls === 'function'
}
86
+
87
/**
 * Model used for each provider when the caller does not pass `model`.
 *
 * NOTE(review): these identifiers are fixed strings; confirm each is still
 * served by its provider before relying on the defaults.
 */
const defaultModels: Record<SupportedProvider, string> = {
  openai: 'gpt-4.1',
  claude: 'claude-3-opus-20240229',
  gemini: 'gemini-1.5-pro',
  grok: 'grok-beta',
  kimi: 'kimi-k2-turbo-preview',
}
94
+
95
/** Outcome of one judge LLM call. */
export interface LLMCallResult {
  /** Text content of the model's reply. */
  content: string
  /** Elapsed time of the call in milliseconds. */
  durationMs: number
  /** Token counts, present only when usage information is available. */
  usage?: {
    inputTokens: number
    outputTokens: number
    totalTokens: number
  }
}
101
+
102
/**
 * Sends a single prompt to the selected LLM provider and returns the trimmed
 * text reply together with timing and token usage.
 *
 * When `options.sdk` exposes the provider's client method
 * (`chat.completions.create` for openai/grok, `messages.create` for claude,
 * `models.generateContent` for gemini) that SDK is used. Otherwise the
 * provider's HTTP API is called with `fetch`, authenticated from environment
 * variables (or `options.apiKey` for openai). The kimi provider always uses
 * HTTP.
 *
 * @param prompt - User message sent to the model.
 * @param options - Provider, model, SDK and endpoint overrides. `apiKey` and
 *   `baseURL` are read on the openai HTTP path only.
 * @param systemPrompt - System instruction. claude and gemini receive it
 *   prepended to `prompt` inside a single user message.
 * @param maxTokens - Upper bound on generated tokens.
 * @param temperature - Kept for signature compatibility; it is currently not
 *   forwarded to any provider.
 * @returns Reply text ('' when the response carries none), elapsed
 *   milliseconds, and token usage when the provider reports it.
 * @throws Error when the required API key is missing, the HTTP response is
 *   not OK, or the provider is unsupported.
 */
export async function callProviderLLM(
  prompt: string,
  options: SemanticMatchOptions = {},
  systemPrompt = 'You are an expert test judge.',
  maxTokens = 32,
  temperature = 0
): Promise<LLMCallResult> {
  const provider: SupportedProvider = options.provider ?? 'openai'
  const sdk = options.sdk as any | undefined
  const resolvedModel = options.model ?? defaultModels[provider]
  const t0 = Date.now()

  // Providers without a separate system role get both prompts in one user message.
  const mergedPrompt = `${systemPrompt}\n\n${prompt}`

  // Request shape shared by the OpenAI-compatible providers (openai, grok, kimi).
  const chatRequest = {
    model: resolvedModel,
    messages: [
      { role: 'system', content: systemPrompt },
      { role: 'user', content: prompt },
    ],
    max_tokens: maxTokens,
  }

  // Normalizes an OpenAI-style chat completion payload.
  const fromChatCompletion = (payload: any): LLMCallResult => {
    const u = payload?.usage
    return {
      content: payload?.choices?.[0]?.message?.content?.trim() ?? '',
      durationMs: Date.now() - t0,
      usage: u
        ? {
            inputTokens: u.prompt_tokens ?? 0,
            outputTokens: u.completion_tokens ?? 0,
            totalTokens: u.total_tokens ?? 0,
          }
        : undefined,
    }
  }

  // Normalizes an Anthropic messages payload; the total is derived because only input/output counts are read.
  const fromMessages = (payload: any): LLMCallResult => {
    const u = payload?.usage
    return {
      content: payload?.content?.[0]?.text?.trim() ?? '',
      durationMs: Date.now() - t0,
      usage: u
        ? {
            inputTokens: u.input_tokens ?? 0,
            outputTokens: u.output_tokens ?? 0,
            totalTokens: (u.input_tokens ?? 0) + (u.output_tokens ?? 0),
          }
        : undefined,
    }
  }

  // Normalizes a Gemini generateContent payload.
  const fromGenerateContent = (payload: any): LLMCallResult => {
    const u = payload?.usageMetadata
    return {
      content: payload?.candidates?.[0]?.content?.parts?.[0]?.text?.trim() ?? '',
      durationMs: Date.now() - t0,
      usage: u
        ? {
            inputTokens: u.promptTokenCount ?? 0,
            outputTokens: u.candidatesTokenCount ?? 0,
            totalTokens: u.totalTokenCount ?? 0,
          }
        : undefined,
    }
  }

  // POSTs `chatRequest` to an OpenAI-compatible endpoint using bearer auth.
  const postChatCompletions = async (
    label: string,
    url: string,
    apiKey: string,
    includeErrorBody: boolean
  ): Promise<LLMCallResult> => {
    const response = await fetch(url, {
      method: 'POST',
      headers: {
        Authorization: `Bearer ${apiKey}`,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify(chatRequest),
    })

    // Check the status before parsing: error bodies are not guaranteed to be JSON.
    if (!response.ok) {
      const summary = `${label} API error: ${response.status} ${response.statusText}`
      if (!includeErrorBody) throw new Error(summary)
      const detail = await response.text().catch(() => '')
      throw new Error(`${summary} — ${detail}`)
    }
    return fromChatCompletion(await response.json())
  }

  switch (provider) {
    case 'openai': {
      if (sdk && sdk.chat?.completions?.create) {
        return fromChatCompletion(await sdk.chat.completions.create(chatRequest))
      }

      const apiKey = options.apiKey ?? process.env.OPENAI_API_KEY
      if (!apiKey) throw new Error('Provide apiKey or set OPENAI_API_KEY for OpenAI-compatible endpoint.')

      // Drop one trailing slash so the joined URL has no double slash.
      const baseURL = (options.baseURL ?? 'https://api.openai.com/v1').replace(/\/$/, '')
      return postChatCompletions('OpenAI', `${baseURL}/chat/completions`, apiKey, false)
    }

    case 'claude': {
      const request = {
        model: resolvedModel,
        max_tokens: maxTokens,
        messages: [{ role: 'user', content: mergedPrompt }],
      }

      if (sdk && sdk.messages?.create) {
        return fromMessages(await sdk.messages.create(request))
      }

      const apiKey = process.env.ANTHROPIC_API_KEY
      if (!apiKey) throw new Error('ANTHROPIC_API_KEY is not set in environment.')

      const response = await fetch('https://api.anthropic.com/v1/messages', {
        method: 'POST',
        headers: {
          'x-api-key': apiKey,
          'anthropic-version': '2023-06-01',
          'content-type': 'application/json',
        },
        body: JSON.stringify(request),
      })

      if (!response.ok) {
        const errBody = await response.text().catch(() => '')
        throw new Error(`Claude API error: ${response.status} ${response.statusText} (model=${resolvedModel}): ${errBody.substring(0, 200)}`)
      }
      return fromMessages(await response.json())
    }

    case 'gemini': {
      const request = {
        contents: [{ role: 'user', parts: [{ text: mergedPrompt }] }],
        generationConfig: {
          maxOutputTokens: maxTokens,
        },
      }

      if (sdk && sdk.models?.generateContent) {
        const resp = await sdk.models.generateContent({ model: resolvedModel, ...request })
        return fromGenerateContent(resp?.response)
      }

      const apiKey = process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY
      if (!apiKey) throw new Error('GEMINI_API_KEY (or GOOGLE_API_KEY) is not set in environment.')

      // The key travels in a header rather than the query string so it does
      // not end up in URLs that get logged or echoed in errors.
      const response = await fetch(
        `https://generativelanguage.googleapis.com/v1beta/models/${resolvedModel}:generateContent`,
        {
          method: 'POST',
          headers: {
            'Content-Type': 'application/json',
            'x-goog-api-key': apiKey,
          },
          body: JSON.stringify(request),
        }
      )

      if (!response.ok) {
        throw new Error(`Gemini API error: ${response.status} ${response.statusText}`)
      }
      return fromGenerateContent(await response.json())
    }

    case 'grok': {
      if (sdk && sdk.chat?.completions?.create) {
        return fromChatCompletion(await sdk.chat.completions.create(chatRequest))
      }

      const apiKey = process.env.GROK_API_KEY
      if (!apiKey) throw new Error('GROK_API_KEY is not set in environment.')

      return postChatCompletions('Grok', 'https://api.x.ai/v1/chat/completions', apiKey, false)
    }

    case 'kimi': {
      const apiKey = process.env.KIMI_API_KEY
      if (!apiKey) throw new Error('KIMI_API_KEY is not set in environment.')

      return postChatCompletions('Kimi', 'https://api.moonshot.ai/v1/chat/completions', apiKey, true)
    }

    default:
      throw new Error(`Unsupported provider: ${provider}`)
  }
}
350
+
351
/**
 * Asks a judge LLM (provider/model/sdk taken from `options`) whether
 * `traceOutput` semantically matches `expected`.
 *
 * @returns true when the upper-cased, trimmed reply starts with "YES".
 */
async function llmJudgeSemanticMatch(
  traceOutput: string,
  expected: string,
  options: SemanticMatchOptions = {}
): Promise<boolean> {
  const prompt = `
You are an expert test judge. Given the following AI trace output and an expected semantic result, answer "YES" if the trace output semantically matches the expectation, otherwise answer "NO".

Trace Output:
${traceOutput}

Expected:
${expected}

Answer only "YES" or "NO".
`.trim()

  // A YES/NO reply needs very few tokens, hence the small budget of 8.
  const reply = await callProviderLLM(prompt, options, 'You are an expert test judge.', 8, 0)
  const verdict = reply.content.trim().toUpperCase()
  return verdict.startsWith('YES')
}
372
+
373
/**
 * Extracts the first decimal number that appears in `text`.
 *
 * Accepts an optional leading minus sign and either `12`, `12.5` or a
 * leading-dot decimal such as `.85`. The leading-dot form is only taken when
 * the dot is not preceded by a digit or another dot, so "1.2.3" and an
 * ellipsis like "...0.7" keep resolving to 1.2 and 0.7 respectively.
 *
 * @param text - Free-form text, typically a judge LLM reply.
 * @returns The parsed number, or null when no finite number is found.
 */
function parseFirstNumber(text: string): number | null {
  const match = text.match(/-?(?:\d+(?:\.\d+)?|(?<![\d.])\.\d+)/)
  if (!match) return null
  const num = Number.parseFloat(match[0])
  // Very long digit runs overflow to Infinity; treat them as "no number".
  return Number.isFinite(num) ? num : null
}
379
+
380
/**
 * Reduces an `EvaluationCondition` to a single `{ kind, value }` pair.
 *
 * Only entries whose value is a finite number are considered. With none
 * present the default `{ kind: 'atLeast', value: 0.7 }` is returned.
 *
 * @throws Error when more than one condition is supplied.
 */
function resolveCondition(config?: EvaluationCondition): { kind: keyof EvaluationCondition; value: number } {
  const supplied: Array<[keyof EvaluationCondition, number]> = []
  for (const [key, raw] of Object.entries(config || {})) {
    if (typeof raw === 'number' && Number.isFinite(raw)) {
      supplied.push([key as keyof EvaluationCondition, raw])
    }
  }

  if (supplied.length > 1) {
    throw new Error('Provide only one metric condition (greaterThan, lessThan, atLeast, atMost, equals).')
  }
  if (supplied.length === 0) {
    return { kind: 'atLeast', value: 0.7 }
  }

  const [kind, value] = supplied[0]
  return { kind, value }
}
390
+
391
/**
 * Tests `score` against a resolved condition.
 *
 * @returns The result of the comparison named by `condition.kind`; false for
 *   an unrecognized kind.
 */
function checkCondition(score: number, condition: { kind: keyof EvaluationCondition; value: number }): boolean {
  const { kind, value } = condition
  if (kind === 'greaterThan') return score > value
  if (kind === 'lessThan') return score < value
  if (kind === 'atLeast') return score >= value
  if (kind === 'atMost') return score <= value
  if (kind === 'equals') return score === value
  return false
}
407
+
408
// Module augmentation: makes the custom matchers visible to TypeScript on `expect(...)`.
declare module 'expect' {
  interface Matchers<R> {
    /** Assert on the trace's LLM steps, filtered by the optional `config`. */
    toHaveLLMStep(config?: LLMStepConfig): R

    toCallTool(toolName: string): R

    toMatchSemanticOutput(expected: string, options?: SemanticMatchOptions): R

    toHaveCustomStep(config?: CustomStepConfig): R

    /**
     * Filter prompts that contain `filterContains`, then assert additional requirements.
     * Example: prompts containing "A" must also contain "B".
     */
    toHavePromptWhere(config: PromptWhereConfig): R

    /**
     * Evaluate a specific LLM step's prompt or result via an LLM and assert a numeric metric condition (0.0–1.0).
     */
    toEvaluateOutputMetric(config: EvaluateOutputMetricConfig): Promise<R>
  }
}
426
+
427
/**
 * Register all AI-specific custom matchers onto the `expect` instance.
 * Call this once on runner startup.
 *
 * Matchers registered: `toHaveLLMStep`, `toCallTool`, `toMatchSemanticOutput`,
 * `toEvaluateOutputMetric`, `toHaveCustomStep`, `toHavePromptWhere`.
 * `toMatchSemanticOutput` and `toEvaluateOutputMetric` call an LLM and are
 * asynchronous; their result must be awaited.
 *
 * NOTE(review): invalid usage and LLM/provider failures are reported as
 * `pass: false` rather than thrown, so a negated assertion (`.not`) would be
 * satisfied by such a failure — confirm this is intended.
 */
export function registerMatchers(): void {
  // Shared occurrence check: an exact `times` wins; otherwise an inclusive
  // [minTimes, maxTimes] range; with no bound given, at least one match is required.
  const isCountAccepted = (count: number, times?: number, minTimes?: number, maxTimes?: number): boolean => {
    if (times !== undefined) return count === times
    if (minTimes !== undefined || maxTimes !== undefined) {
      return count >= (minTimes ?? 0) && count <= (maxTimes ?? Infinity)
    }
    return count > 0
  }

  // Turn any caught value into readable text; thrown values are not guaranteed to be Error instances.
  const describeError = (err: unknown): string => {
    if (err instanceof Error) return err.message
    if (typeof err === 'object' && err !== null && typeof (err as { message?: unknown }).message === 'string') {
      return (err as { message: string }).message
    }
    return String(err)
  }

  expect.extend({
    /**
     * Passes when the number of LLM steps matching every provided filter
     * (`model`, `provider`, `contains`, `promptContains`, `outputContains`)
     * satisfies `times` / `minTimes` / `maxTimes`. Text filters are case-insensitive.
     */
    toHaveLLMStep(trace: TraceHandle, config: LLMStepConfig = {}) {
      if (!isTraceHandle(trace)) {
        return {
          pass: false,
          message: () =>
            `Expected a TraceHandle (ctx.trace) but received ${typeof trace}.\nUse: expect(ctx.trace).toHaveLLMStep(...)`,
        }
      }
      const steps = trace.getLLMSteps()

      const matching = steps.filter((step: LLMStep) => {
        if (config.model && step.model !== config.model) return false
        if (config.provider && step.provider !== config.provider) return false
        if (config.contains) {
          // `contains` searches completion, prompt and the step's own `contains` field together.
          const haystack = [step.completion, step.prompt, step.contains]
            .filter(Boolean)
            .join(' ')
            .toLowerCase()
          if (!haystack.includes(config.contains.toLowerCase())) return false
        }
        if (config.promptContains) {
          const promptHaystack = (step.prompt ?? '').toLowerCase()
          if (!promptHaystack.includes(config.promptContains.toLowerCase())) return false
        }
        if (config.outputContains) {
          const outputHaystack = (step.completion ?? '').toLowerCase()
          if (!outputHaystack.includes(config.outputContains.toLowerCase())) return false
        }
        return true
      })

      const count = matching.length
      const pass = isCountAccepted(count, config.times, config.minTimes, config.maxTimes)

      return {
        pass,
        message: () => {
          if (pass) {
            return `Expected trace NOT to have LLM step matching ${JSON.stringify(config)}`
          }
          const stepSummary =
            steps.length === 0
              ? 'no LLM steps were recorded'
              : `${count} matching step(s) found; recorded steps: ${JSON.stringify(steps)}`
          return `Expected trace to have LLM step matching ${JSON.stringify(config)}, but ${stepSummary}`
        },
      }
    },

    /**
     * Passes when at least one recorded tool call has a name strictly equal to `toolName`.
     */
    toCallTool(trace: TraceHandle, toolName: string) {
      if (!isTraceHandle(trace)) {
        return {
          pass: false,
          message: () =>
            `Expected a TraceHandle (ctx.trace) but received ${typeof trace}.\nUse: expect(ctx.trace).toCallTool(...)`,
        }
      }
      const calls = trace.getToolCalls()
      const pass = calls.some((c) => c.name === toolName)

      return {
        pass,
        message: () => {
          if (pass) {
            return `Expected trace NOT to call tool "${toolName}"`
          }
          const names = calls.map((c) => c.name)
          const recorded = names.length === 0 ? 'no tool calls were recorded' : `recorded: [${names.join(', ')}]`
          return `Expected tool "${toolName}" to be called, but ${recorded}`
        },
      }
    },

    /**
     * Joins the `completion` and `contains` text of every LLM step and asks the
     * LLM judge whether it semantically matches `expected`. A judge failure is
     * reported as a failing result carrying the error text.
     */
    async toMatchSemanticOutput(trace: TraceHandle, expected: string, options?: SemanticMatchOptions) {
      if (!isTraceHandle(trace)) {
        return {
          pass: false,
          message: () =>
            `Expected a TraceHandle (ctx.trace) but received ${typeof trace}.\nUse: expect(ctx.trace).toMatchSemanticOutput(...)`,
        }
      }
      const steps = trace.getLLMSteps()
      const fullOutput = steps
        .map((s: LLMStep) => [s.completion, s.contains].filter(Boolean).join(' '))
        .join(' ')
        .trim()

      try {
        const pass = await llmJudgeSemanticMatch(fullOutput, expected, options)
        return {
          pass,
          message: () => {
            if (pass) {
              return `Expected trace output NOT to semantically match "${expected}" (LLM judged YES)`
            }
            return `Expected trace output to semantically match "${expected}", but LLM judged NO. Trace output: "${fullOutput || '(empty)'}"`
          },
        }
      } catch (err) {
        return {
          pass: false,
          message: () =>
            `LLM semantic match failed: ${describeError(err)}`,
        }
      }
    },

    /**
     * Selects one LLM step (`index`, else 1-based `nth`, else the last step),
     * takes its prompt or completion (`target`, default `'result'`), asks an LLM
     * to score it between 0 and 1 using `evaluationPrompt`, and checks the score
     * against `condition`.
     */
    async toEvaluateOutputMetric(trace: TraceHandle, config: EvaluateOutputMetricConfig) {
      if (!isTraceHandle(trace)) {
        return {
          pass: false,
          message: () =>
            `Expected a TraceHandle (ctx.trace) but received ${typeof trace}.\nUse: expect(ctx.trace).toEvaluateOutputMetric(...)`,
        }
      }
      if (!config || !config.evaluationPrompt) {
        return {
          pass: false,
          message: () => 'toEvaluateOutputMetric requires evaluationPrompt',
        }
      }

      const steps = trace.getLLMSteps()
      if (steps.length === 0) {
        return {
          pass: false,
          message: () => 'No LLM steps recorded; cannot evaluate output metric.',
        }
      }

      const targetIdx = config.index ?? (config.nth !== undefined ? config.nth - 1 : steps.length - 1)
      // A fractional position would slip past the range check and select no step at all.
      if (!Number.isInteger(targetIdx) || targetIdx < 0 || targetIdx >= steps.length) {
        return {
          pass: false,
          message: () => `LLM steps length ${steps.length}, but index/nth points to ${targetIdx}.`,
        }
      }

      const targetStep = steps[targetIdx]
      const targetField: EvaluationTarget = config.target ?? 'result'
      const targetText = targetField === 'prompt' ? targetStep.prompt ?? '' : targetStep.completion ?? ''
      if (!targetText) {
        return {
          pass: false,
          message: () => `Selected LLM step has empty ${targetField}; cannot evaluate.`,
        }
      }

      // resolveCondition throws on an ambiguous config; surface that as a failing result instead.
      const condition = (() => {
        try {
          return resolveCondition(config.condition)
        } catch (err) {
          return err as Error
        }
      })()
      if (condition instanceof Error) {
        return {
          pass: false,
          message: () => condition.message,
        }
      }

      const preparedText = prepareOutputForJudge(targetText, config.evaluationPrompt)
      // Built line by line so the prompt text does not depend on source indentation.
      const evalPrompt = [
        'Evaluation prompt (from user):',
        `${config.evaluationPrompt}`,
        '',
        'Score the following text strictly between 0 and 1 (inclusive). Respond with only the number.',
        '',
        '<output>',
        `${preparedText}`,
        '</output>',
      ]
        .join('\n')
        .trim()

      try {
        const raw = (await callProviderLLM(
          evalPrompt,
          { provider: config.provider, model: config.model, sdk: config.sdk, apiKey: config.apiKey, baseURL: config.baseURL },
          'You are an evaluation assistant. Return only a number between 0 and 1.',
          16,
          0
        )).content
        const score = parseFirstNumber(raw)
        if (score === null) {
          return {
            pass: false,
            message: () => `Could not parse numeric metric from model response: "${raw}"`,
          }
        }
        if (score < 0 || score > 1) {
          return {
            pass: false,
            message: () => `Metric ${score} is out of allowed range 0.0–1.0 (raw: "${raw}")`,
          }
        }

        const pass = checkCondition(score, condition)
        return {
          pass,
          message: () => {
            if (pass) {
              return `Expected metric NOT to satisfy ${condition.kind} ${condition.value} (score ${score})`
            }
            return `Metric check failed: score ${score} did not satisfy ${condition.kind} ${condition.value}. Raw response: "${raw}"`
          },
        }
      } catch (err) {
        return {
          pass: false,
          message: () => `LLM evaluation failed: ${describeError(err)}`,
        }
      }
    },

    /**
     * Passes when the number of custom steps matching every provided filter
     * (`kind`, `name`, `tag`, and case-insensitive text filters over payload,
     * result and metadata) satisfies `times` / `minTimes` / `maxTimes`.
     */
    toHaveCustomStep(trace: TraceHandle, config: CustomStepConfig = {}) {
      if (!isTraceHandle(trace) || typeof (trace as any).getCustomSteps !== 'function') {
        return {
          pass: false,
          message: () =>
            `Expected a TraceHandle (ctx.trace with getCustomSteps) but received ${typeof trace}.\nUse: expect(ctx.trace).toHaveCustomStep(...)`,
        }
      }

      const steps = (trace as any).getCustomSteps() as CustomStep[]

      // Render any value as searchable text; strings pass through, everything else is serialised.
      const matchString = (val: unknown): string => {
        if (val === undefined || val === null) return ''
        if (typeof val === 'string') return val
        try {
          // JSON.stringify yields undefined for functions and symbols; fall back to String().
          return JSON.stringify(val) ?? String(val)
        } catch {
          return String(val)
        }
      }

      const matching = steps.filter((step) => {
        if (config.kind && step.kind !== config.kind) return false
        if (config.name && step.name !== config.name) return false
        if (config.tag && !(step.tags || []).includes(config.tag)) return false

        const payloadStr = matchString(step.payload).toLowerCase()
        const resultStr = matchString(step.result).toLowerCase()
        const metaStr = matchString(step.metadata).toLowerCase()
        const combined = [payloadStr, resultStr, metaStr].filter(Boolean).join(' ')

        if (config.contains && !combined.includes(config.contains.toLowerCase())) return false
        if (config.payloadContains && !payloadStr.includes(config.payloadContains.toLowerCase())) return false
        if (config.resultContains && !resultStr.includes(config.resultContains.toLowerCase())) return false
        if (config.metadataContains && !metaStr.includes(config.metadataContains.toLowerCase())) return false

        return true
      })

      const count = matching.length
      const pass = isCountAccepted(count, config.times, config.minTimes, config.maxTimes)

      return {
        pass,
        message: () => {
          if (pass) {
            return `Expected trace NOT to have custom step matching ${JSON.stringify(config)}`
          }
          const stepSummary =
            steps.length === 0
              ? 'no custom steps were recorded'
              : `${count} matching step(s) found; recorded custom steps: ${JSON.stringify(steps)}`
          return `Expected trace to have custom step matching ${JSON.stringify(config)}, but ${stepSummary}`
        },
      }
    },

    /**
     * Keeps the prompts containing `filterContains` (case-insensitive) and checks
     * them against `requireContains` / `requireNotContains`. With `index` or
     * `nth`, only that one filtered prompt is checked. Otherwise the number of
     * compliant prompts must satisfy `times` or the `minTimes`..`maxTimes` range
     * (default 0..Infinity), and no filtered prompt may violate a requirement.
     */
    toHavePromptWhere(trace: TraceHandle, config: PromptWhereConfig) {
      if (!isTraceHandle(trace)) {
        return {
          pass: false,
          message: () =>
            `Expected a TraceHandle (ctx.trace) but received ${typeof trace}.\nUse: expect(ctx.trace).toHavePromptWhere(...)`,
        }
      }
      if (!config || !config.filterContains) {
        return {
          pass: false,
          message: () => 'toHavePromptWhere requires filterContains',
        }
      }

      const filterNeedle = config.filterContains.toLowerCase()
      const requireNeedle = config.requireContains?.toLowerCase()
      const forbidNeedle = config.requireNotContains?.toLowerCase()

      const prompts = trace.getLLMSteps().map((s) => s.prompt ?? '')

      const filtered = prompts.filter((p) => p.toLowerCase().includes(filterNeedle))

      // Optional positional check (index or nth)
      const targetIdx = config.index ?? (config.nth !== undefined ? config.nth - 1 : undefined)

      let checked: string[] = []
      let count = 0
      let pass = true

      if (targetIdx !== undefined) {
        // A fractional position would slip past the range check and select no prompt at all.
        if (!Number.isInteger(targetIdx) || targetIdx < 0 || targetIdx >= filtered.length) {
          return {
            pass: false,
            message: () =>
              `Filtered prompts length ${filtered.length}, but index/nth points to ${targetIdx}. Config: ${JSON.stringify(config)}`,
          }
        }
        const p = filtered[targetIdx]
        const lower = p.toLowerCase()
        const okRequire = requireNeedle ? lower.includes(requireNeedle) : true
        const okForbid = forbidNeedle ? !lower.includes(forbidNeedle) : true
        pass = okRequire && okForbid
        checked = okRequire && okForbid ? [p] : []
        count = checked.length
      } else {
        checked = filtered.filter((p) => {
          const lower = p.toLowerCase()
          if (requireNeedle && !lower.includes(requireNeedle)) return false
          if (forbidNeedle && lower.includes(forbidNeedle)) return false
          return true
        })

        count = checked.length

        if (config.times !== undefined) {
          pass = count === config.times
        } else {
          const min = config.minTimes ?? 0
          const max = config.maxTimes ?? Infinity
          pass = count >= min && count <= max
        }

        // Also ensure that if requireContains is set, no filtered prompt violates it
        if (requireNeedle) {
          const violating = filtered.filter((p) => !p.toLowerCase().includes(requireNeedle))
          if (violating.length > 0) pass = false
        }
        if (forbidNeedle) {
          const violating = filtered.filter((p) => p.toLowerCase().includes(forbidNeedle))
          if (violating.length > 0) pass = false
        }
      }

      return {
        pass,
        message: () => {
          if (pass) {
            return `Expected prompts NOT to satisfy filter/require combo: ${JSON.stringify(config)}`
          }
          const base = [`Expected prompts filtered by "${config.filterContains}" to satisfy requirements`]
          if (config.requireContains) base.push(`requireContains: "${config.requireContains}"`)
          if (config.requireNotContains) base.push(`requireNotContains: "${config.requireNotContains}"`)
          if (targetIdx !== undefined) {
            base.push(`checked index: ${targetIdx}`, `filtered count: ${filtered.length}`)
          } else {
            base.push(`filtered count: ${filtered.length}, passing count: ${checked.length}`)
            base.push(
              config.times !== undefined
                ? `expected exactly ${config.times}`
                : `expected between ${config.minTimes ?? 0} and ${config.maxTimes ?? Infinity}`,
            )
          }
          return base.filter(Boolean).join('; ')
        },
      }
    },
  })
}
822
+
823
+ // Export our patched expect so users can import it and get the correct type and runtime matchers
824
+ export { expect }