@rudderjs/ai 1.17.3 → 1.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (377) hide show
  1. package/README.md +19 -1274
  2. package/dist/budget-orm/index.d.ts +1 -95
  3. package/dist/budget-orm/index.d.ts.map +1 -1
  4. package/dist/budget-orm/index.js +4 -176
  5. package/dist/budget-orm/index.js.map +1 -1
  6. package/dist/chat-mentions.d.ts +1 -58
  7. package/dist/chat-mentions.d.ts.map +1 -1
  8. package/dist/chat-mentions.js +4 -80
  9. package/dist/chat-mentions.js.map +1 -1
  10. package/dist/commands/ai-eval.d.ts +1 -92
  11. package/dist/commands/ai-eval.d.ts.map +1 -1
  12. package/dist/commands/ai-eval.js +4 -377
  13. package/dist/commands/ai-eval.js.map +1 -1
  14. package/dist/commands/make-agent.d.ts +1 -2
  15. package/dist/commands/make-agent.d.ts.map +1 -1
  16. package/dist/commands/make-agent.js +4 -22
  17. package/dist/commands/make-agent.js.map +1 -1
  18. package/dist/computer-use/index.d.ts +1 -52
  19. package/dist/computer-use/index.d.ts.map +1 -1
  20. package/dist/computer-use/index.js +4 -50
  21. package/dist/computer-use/index.js.map +1 -1
  22. package/dist/conversation-orm/index.d.ts +1 -108
  23. package/dist/conversation-orm/index.d.ts.map +1 -1
  24. package/dist/conversation-orm/index.js +4 -214
  25. package/dist/conversation-orm/index.js.map +1 -1
  26. package/dist/doctor.d.ts +1 -1
  27. package/dist/doctor.d.ts.map +1 -1
  28. package/dist/doctor.js +4 -65
  29. package/dist/doctor.js.map +1 -1
  30. package/dist/eval/index.d.ts +1 -270
  31. package/dist/eval/index.d.ts.map +1 -1
  32. package/dist/eval/index.js +4 -509
  33. package/dist/eval/index.js.map +1 -1
  34. package/dist/gateway/index.d.ts +1 -10
  35. package/dist/gateway/index.d.ts.map +1 -1
  36. package/dist/gateway/index.js +4 -10
  37. package/dist/gateway/index.js.map +1 -1
  38. package/dist/index.d.ts +1 -66
  39. package/dist/index.d.ts.map +1 -1
  40. package/dist/index.js +4 -78
  41. package/dist/index.js.map +1 -1
  42. package/dist/mcp/index.d.ts +1 -15
  43. package/dist/mcp/index.d.ts.map +1 -1
  44. package/dist/mcp/index.js +4 -14
  45. package/dist/mcp/index.js.map +1 -1
  46. package/dist/memory-embedding/index.d.ts +1 -120
  47. package/dist/memory-embedding/index.d.ts.map +1 -1
  48. package/dist/memory-embedding/index.js +4 -228
  49. package/dist/memory-embedding/index.js.map +1 -1
  50. package/dist/memory-orm/index.d.ts +1 -117
  51. package/dist/memory-orm/index.d.ts.map +1 -1
  52. package/dist/memory-orm/index.js +4 -186
  53. package/dist/memory-orm/index.js.map +1 -1
  54. package/dist/node/index.d.ts +1 -2
  55. package/dist/node/index.d.ts.map +1 -1
  56. package/dist/node/index.js +4 -2
  57. package/dist/node/index.js.map +1 -1
  58. package/dist/observers.d.ts +1 -129
  59. package/dist/observers.d.ts.map +1 -1
  60. package/dist/observers.js +4 -39
  61. package/dist/observers.js.map +1 -1
  62. package/dist/react/index.d.ts +1 -15
  63. package/dist/react/index.d.ts.map +1 -1
  64. package/dist/react/index.js +4 -15
  65. package/dist/react/index.js.map +1 -1
  66. package/dist/server/index.d.ts +1 -1
  67. package/dist/server/index.d.ts.map +1 -1
  68. package/dist/server/index.js +4 -1
  69. package/dist/server/index.js.map +1 -1
  70. package/package.json +9 -13
  71. package/boost/guidelines.md +0 -260
  72. package/boost/skills/ai-agents/SKILL.md +0 -240
  73. package/boost/skills/ai-tools/SKILL.md +0 -260
  74. package/dist/agent-run-store.d.ts +0 -161
  75. package/dist/agent-run-store.d.ts.map +0 -1
  76. package/dist/agent-run-store.js +0 -98
  77. package/dist/agent-run-store.js.map +0 -1
  78. package/dist/agent-sse.d.ts +0 -153
  79. package/dist/agent-sse.d.ts.map +0 -1
  80. package/dist/agent-sse.js +0 -282
  81. package/dist/agent-sse.js.map +0 -1
  82. package/dist/agent.d.ts +0 -508
  83. package/dist/agent.d.ts.map +0 -1
  84. package/dist/agent.js +0 -1538
  85. package/dist/agent.js.map +0 -1
  86. package/dist/attachment.d.ts +0 -31
  87. package/dist/attachment.d.ts.map +0 -1
  88. package/dist/attachment.js +0 -89
  89. package/dist/attachment.js.map +0 -1
  90. package/dist/audio.d.ts +0 -45
  91. package/dist/audio.d.ts.map +0 -1
  92. package/dist/audio.js +0 -93
  93. package/dist/audio.js.map +0 -1
  94. package/dist/base64.d.ts +0 -7
  95. package/dist/base64.d.ts.map +0 -1
  96. package/dist/base64.js +0 -39
  97. package/dist/base64.js.map +0 -1
  98. package/dist/budget/pricing.d.ts +0 -124
  99. package/dist/budget/pricing.d.ts.map +0 -1
  100. package/dist/budget/pricing.js +0 -175
  101. package/dist/budget/pricing.js.map +0 -1
  102. package/dist/budget/storage.d.ts +0 -104
  103. package/dist/budget/storage.d.ts.map +0 -1
  104. package/dist/budget/storage.js +0 -0
  105. package/dist/budget/storage.js.map +0 -1
  106. package/dist/budget/with-budget.d.ts +0 -119
  107. package/dist/budget/with-budget.d.ts.map +0 -1
  108. package/dist/budget/with-budget.js +0 -175
  109. package/dist/budget/with-budget.js.map +0 -1
  110. package/dist/cached-embedding.d.ts +0 -14
  111. package/dist/cached-embedding.d.ts.map +0 -1
  112. package/dist/cached-embedding.js +0 -44
  113. package/dist/cached-embedding.js.map +0 -1
  114. package/dist/computer-use/actions.d.ts +0 -214
  115. package/dist/computer-use/actions.d.ts.map +0 -1
  116. package/dist/computer-use/actions.js +0 -48
  117. package/dist/computer-use/actions.js.map +0 -1
  118. package/dist/computer-use/errors.d.ts +0 -57
  119. package/dist/computer-use/errors.d.ts.map +0 -1
  120. package/dist/computer-use/errors.js +0 -76
  121. package/dist/computer-use/errors.js.map +0 -1
  122. package/dist/computer-use/playwright.d.ts +0 -76
  123. package/dist/computer-use/playwright.d.ts.map +0 -1
  124. package/dist/computer-use/playwright.js +0 -270
  125. package/dist/computer-use/playwright.js.map +0 -1
  126. package/dist/computer-use/tool.d.ts +0 -154
  127. package/dist/computer-use/tool.d.ts.map +0 -1
  128. package/dist/computer-use/tool.js +0 -210
  129. package/dist/computer-use/tool.js.map +0 -1
  130. package/dist/continuation-validation.d.ts +0 -85
  131. package/dist/continuation-validation.d.ts.map +0 -1
  132. package/dist/continuation-validation.js +0 -166
  133. package/dist/continuation-validation.js.map +0 -1
  134. package/dist/conversation-persistence.d.ts +0 -46
  135. package/dist/conversation-persistence.d.ts.map +0 -1
  136. package/dist/conversation-persistence.js +0 -176
  137. package/dist/conversation-persistence.js.map +0 -1
  138. package/dist/conversation.d.ts +0 -11
  139. package/dist/conversation.d.ts.map +0 -1
  140. package/dist/conversation.js +0 -55
  141. package/dist/conversation.js.map +0 -1
  142. package/dist/eval/fixtures.d.ts +0 -65
  143. package/dist/eval/fixtures.d.ts.map +0 -1
  144. package/dist/eval/fixtures.js +0 -110
  145. package/dist/eval/fixtures.js.map +0 -1
  146. package/dist/eval/html-reporter.d.ts +0 -25
  147. package/dist/eval/html-reporter.d.ts.map +0 -1
  148. package/dist/eval/html-reporter.js +0 -209
  149. package/dist/eval/html-reporter.js.map +0 -1
  150. package/dist/eval/json-reporter.d.ts +0 -43
  151. package/dist/eval/json-reporter.d.ts.map +0 -1
  152. package/dist/eval/json-reporter.js +0 -40
  153. package/dist/eval/json-reporter.js.map +0 -1
  154. package/dist/facade.d.ts +0 -96
  155. package/dist/facade.d.ts.map +0 -1
  156. package/dist/facade.js +0 -146
  157. package/dist/facade.js.map +0 -1
  158. package/dist/fake.d.ts +0 -201
  159. package/dist/fake.d.ts.map +0 -1
  160. package/dist/fake.js +0 -428
  161. package/dist/fake.js.map +0 -1
  162. package/dist/file-search.d.ts +0 -168
  163. package/dist/file-search.d.ts.map +0 -1
  164. package/dist/file-search.js +0 -158
  165. package/dist/file-search.js.map +0 -1
  166. package/dist/files.d.ts +0 -27
  167. package/dist/files.d.ts.map +0 -1
  168. package/dist/files.js +0 -44
  169. package/dist/files.js.map +0 -1
  170. package/dist/gateway/http-gateway-adapter.d.ts +0 -94
  171. package/dist/gateway/http-gateway-adapter.d.ts.map +0 -1
  172. package/dist/gateway/http-gateway-adapter.js +0 -106
  173. package/dist/gateway/http-gateway-adapter.js.map +0 -1
  174. package/dist/gateway/sse.d.ts +0 -28
  175. package/dist/gateway/sse.d.ts.map +0 -1
  176. package/dist/gateway/sse.js +0 -78
  177. package/dist/gateway/sse.js.map +0 -1
  178. package/dist/handoff.d.ts +0 -95
  179. package/dist/handoff.d.ts.map +0 -1
  180. package/dist/handoff.js +0 -78
  181. package/dist/handoff.js.map +0 -1
  182. package/dist/handoffs-driver.d.ts +0 -58
  183. package/dist/handoffs-driver.d.ts.map +0 -1
  184. package/dist/handoffs-driver.js +0 -103
  185. package/dist/handoffs-driver.js.map +0 -1
  186. package/dist/image.d.ts +0 -40
  187. package/dist/image.d.ts.map +0 -1
  188. package/dist/image.js +0 -109
  189. package/dist/image.js.map +0 -1
  190. package/dist/mcp/client-tools.d.ts +0 -39
  191. package/dist/mcp/client-tools.d.ts.map +0 -1
  192. package/dist/mcp/client-tools.js +0 -147
  193. package/dist/mcp/client-tools.js.map +0 -1
  194. package/dist/mcp/server-from-agent.d.ts +0 -24
  195. package/dist/mcp/server-from-agent.d.ts.map +0 -1
  196. package/dist/mcp/server-from-agent.js +0 -113
  197. package/dist/mcp/server-from-agent.js.map +0 -1
  198. package/dist/mcp/types.d.ts +0 -64
  199. package/dist/mcp/types.d.ts.map +0 -1
  200. package/dist/mcp/types.js +0 -6
  201. package/dist/mcp/types.js.map +0 -1
  202. package/dist/memory-extract.d.ts +0 -60
  203. package/dist/memory-extract.d.ts.map +0 -1
  204. package/dist/memory-extract.js +0 -163
  205. package/dist/memory-extract.js.map +0 -1
  206. package/dist/memory-inject.d.ts +0 -39
  207. package/dist/memory-inject.d.ts.map +0 -1
  208. package/dist/memory-inject.js +0 -135
  209. package/dist/memory-inject.js.map +0 -1
  210. package/dist/memory.d.ts +0 -55
  211. package/dist/memory.d.ts.map +0 -1
  212. package/dist/memory.js +0 -132
  213. package/dist/memory.js.map +0 -1
  214. package/dist/middleware.d.ts +0 -18
  215. package/dist/middleware.d.ts.map +0 -1
  216. package/dist/middleware.js +0 -72
  217. package/dist/middleware.js.map +0 -1
  218. package/dist/node/attachment.d.ts +0 -6
  219. package/dist/node/attachment.d.ts.map +0 -1
  220. package/dist/node/attachment.js +0 -35
  221. package/dist/node/attachment.js.map +0 -1
  222. package/dist/node/transcription.d.ts +0 -4
  223. package/dist/node/transcription.d.ts.map +0 -1
  224. package/dist/node/transcription.js +0 -8
  225. package/dist/node/transcription.js.map +0 -1
  226. package/dist/output.d.ts +0 -22
  227. package/dist/output.d.ts.map +0 -1
  228. package/dist/output.js +0 -60
  229. package/dist/output.js.map +0 -1
  230. package/dist/provider-tools.d.ts +0 -87
  231. package/dist/provider-tools.d.ts.map +0 -1
  232. package/dist/provider-tools.js +0 -189
  233. package/dist/provider-tools.js.map +0 -1
  234. package/dist/providers/anthropic.d.ts +0 -24
  235. package/dist/providers/anthropic.d.ts.map +0 -1
  236. package/dist/providers/anthropic.js +0 -405
  237. package/dist/providers/anthropic.js.map +0 -1
  238. package/dist/providers/azure.d.ts +0 -13
  239. package/dist/providers/azure.d.ts.map +0 -1
  240. package/dist/providers/azure.js +0 -15
  241. package/dist/providers/azure.js.map +0 -1
  242. package/dist/providers/bedrock.d.ts +0 -75
  243. package/dist/providers/bedrock.d.ts.map +0 -1
  244. package/dist/providers/bedrock.js +0 -181
  245. package/dist/providers/bedrock.js.map +0 -1
  246. package/dist/providers/cohere.d.ts +0 -13
  247. package/dist/providers/cohere.d.ts.map +0 -1
  248. package/dist/providers/cohere.js +0 -87
  249. package/dist/providers/cohere.js.map +0 -1
  250. package/dist/providers/deepseek.d.ts +0 -12
  251. package/dist/providers/deepseek.d.ts.map +0 -1
  252. package/dist/providers/deepseek.js +0 -15
  253. package/dist/providers/deepseek.js.map +0 -1
  254. package/dist/providers/elevenlabs.d.ts +0 -98
  255. package/dist/providers/elevenlabs.d.ts.map +0 -1
  256. package/dist/providers/elevenlabs.js +0 -229
  257. package/dist/providers/elevenlabs.js.map +0 -1
  258. package/dist/providers/google-cache-registry.d.ts +0 -132
  259. package/dist/providers/google-cache-registry.d.ts.map +0 -1
  260. package/dist/providers/google-cache-registry.js +0 -209
  261. package/dist/providers/google-cache-registry.js.map +0 -1
  262. package/dist/providers/google.d.ts +0 -38
  263. package/dist/providers/google.d.ts.map +0 -1
  264. package/dist/providers/google.js +0 -903
  265. package/dist/providers/google.js.map +0 -1
  266. package/dist/providers/groq.d.ts +0 -12
  267. package/dist/providers/groq.d.ts.map +0 -1
  268. package/dist/providers/groq.js +0 -15
  269. package/dist/providers/groq.js.map +0 -1
  270. package/dist/providers/jina.d.ts +0 -13
  271. package/dist/providers/jina.d.ts.map +0 -1
  272. package/dist/providers/jina.js +0 -90
  273. package/dist/providers/jina.js.map +0 -1
  274. package/dist/providers/mistral.d.ts +0 -13
  275. package/dist/providers/mistral.d.ts.map +0 -1
  276. package/dist/providers/mistral.js +0 -46
  277. package/dist/providers/mistral.js.map +0 -1
  278. package/dist/providers/ollama.d.ts +0 -11
  279. package/dist/providers/ollama.d.ts.map +0 -1
  280. package/dist/providers/ollama.js +0 -15
  281. package/dist/providers/ollama.js.map +0 -1
  282. package/dist/providers/openai.d.ts +0 -79
  283. package/dist/providers/openai.d.ts.map +0 -1
  284. package/dist/providers/openai.js +0 -792
  285. package/dist/providers/openai.js.map +0 -1
  286. package/dist/providers/openrouter.d.ts +0 -43
  287. package/dist/providers/openrouter.d.ts.map +0 -1
  288. package/dist/providers/openrouter.js +0 -21
  289. package/dist/providers/openrouter.js.map +0 -1
  290. package/dist/providers/voyage.d.ts +0 -91
  291. package/dist/providers/voyage.d.ts.map +0 -1
  292. package/dist/providers/voyage.js +0 -166
  293. package/dist/providers/voyage.js.map +0 -1
  294. package/dist/providers/xai.d.ts +0 -12
  295. package/dist/providers/xai.d.ts.map +0 -1
  296. package/dist/providers/xai.js +0 -15
  297. package/dist/providers/xai.js.map +0 -1
  298. package/dist/queue-job.d.ts +0 -100
  299. package/dist/queue-job.d.ts.map +0 -1
  300. package/dist/queue-job.js +0 -185
  301. package/dist/queue-job.js.map +0 -1
  302. package/dist/react/agent-run.d.ts +0 -111
  303. package/dist/react/agent-run.d.ts.map +0 -1
  304. package/dist/react/agent-run.js +0 -107
  305. package/dist/react/agent-run.js.map +0 -1
  306. package/dist/react/useAgentRun.d.ts +0 -68
  307. package/dist/react/useAgentRun.d.ts.map +0 -1
  308. package/dist/react/useAgentRun.js +0 -125
  309. package/dist/react/useAgentRun.js.map +0 -1
  310. package/dist/registry.d.ts +0 -45
  311. package/dist/registry.d.ts.map +0 -1
  312. package/dist/registry.js +0 -131
  313. package/dist/registry.js.map +0 -1
  314. package/dist/rerank.d.ts +0 -20
  315. package/dist/rerank.d.ts.map +0 -1
  316. package/dist/rerank.js +0 -40
  317. package/dist/rerank.js.map +0 -1
  318. package/dist/resume-approval.d.ts +0 -30
  319. package/dist/resume-approval.d.ts.map +0 -1
  320. package/dist/resume-approval.js +0 -147
  321. package/dist/resume-approval.js.map +0 -1
  322. package/dist/sanitize-conversation.d.ts +0 -43
  323. package/dist/sanitize-conversation.d.ts.map +0 -1
  324. package/dist/sanitize-conversation.js +0 -85
  325. package/dist/sanitize-conversation.js.map +0 -1
  326. package/dist/scoped-tool.d.ts +0 -98
  327. package/dist/scoped-tool.d.ts.map +0 -1
  328. package/dist/scoped-tool.js +0 -174
  329. package/dist/scoped-tool.js.map +0 -1
  330. package/dist/server/provider.d.ts +0 -22
  331. package/dist/server/provider.d.ts.map +0 -1
  332. package/dist/server/provider.js +0 -194
  333. package/dist/server/provider.js.map +0 -1
  334. package/dist/similarity-search.d.ts +0 -163
  335. package/dist/similarity-search.d.ts.map +0 -1
  336. package/dist/similarity-search.js +0 -147
  337. package/dist/similarity-search.js.map +0 -1
  338. package/dist/sub-agent-run-store.d.ts +0 -157
  339. package/dist/sub-agent-run-store.d.ts.map +0 -1
  340. package/dist/sub-agent-run-store.js +0 -87
  341. package/dist/sub-agent-run-store.js.map +0 -1
  342. package/dist/tool-execution.d.ts +0 -16
  343. package/dist/tool-execution.d.ts.map +0 -1
  344. package/dist/tool-execution.js +0 -498
  345. package/dist/tool-execution.js.map +0 -1
  346. package/dist/tool-helpers.d.ts +0 -77
  347. package/dist/tool-helpers.d.ts.map +0 -1
  348. package/dist/tool-helpers.js +0 -117
  349. package/dist/tool-helpers.js.map +0 -1
  350. package/dist/tool.d.ts +0 -216
  351. package/dist/tool.d.ts.map +0 -1
  352. package/dist/tool.js +0 -175
  353. package/dist/tool.js.map +0 -1
  354. package/dist/transcription.d.ts +0 -42
  355. package/dist/transcription.d.ts.map +0 -1
  356. package/dist/transcription.js +0 -77
  357. package/dist/transcription.js.map +0 -1
  358. package/dist/types.d.ts +0 -1020
  359. package/dist/types.d.ts.map +0 -1
  360. package/dist/types.js +0 -2
  361. package/dist/types.js.map +0 -1
  362. package/dist/util/hash.d.ts +0 -11
  363. package/dist/util/hash.d.ts.map +0 -1
  364. package/dist/util/hash.js +0 -23
  365. package/dist/util/hash.js.map +0 -1
  366. package/dist/vector-stores/index.d.ts +0 -96
  367. package/dist/vector-stores/index.d.ts.map +0 -1
  368. package/dist/vector-stores/index.js +0 -153
  369. package/dist/vector-stores/index.js.map +0 -1
  370. package/dist/vercel-protocol.d.ts +0 -18
  371. package/dist/vercel-protocol.d.ts.map +0 -1
  372. package/dist/vercel-protocol.js +0 -75
  373. package/dist/vercel-protocol.js.map +0 -1
  374. package/dist/zod-to-json-schema.d.ts +0 -16
  375. package/dist/zod-to-json-schema.d.ts.map +0 -1
  376. package/dist/zod-to-json-schema.js +0 -17
  377. package/dist/zod-to-json-schema.js.map +0 -1
@@ -1,271 +1,2 @@
1
- /**
2
- * `@rudderjs/ai/eval` — built-in eval framework for #A5 Phase 1.
3
- *
4
- * Define a suite of input cases + assertions, run them against any
5
- * `Agent`, get a console report with pass/fail + cost + tokens. Same
6
- * `Agent` instances as your app code — one source of truth.
7
- *
8
- * @example
9
- * ```ts
10
- * // evals/support-agent.eval.ts
11
- * import { evalSuite, llmJudge, exactMatch, regex } from '@rudderjs/ai/eval'
12
- * import { SupportAgent } from '../app/Agents/SupportAgent.js'
13
- *
14
- * export default evalSuite('SupportAgent', {
15
- * agent: () => new SupportAgent(),
16
- * cases: [
17
- * { name: 'password reset', input: 'How do I reset my password?',
18
- * assert: llmJudge('mentions a password reset link') },
19
- * { name: 'price', input: 'How much?', assert: exactMatch('$99/month') },
20
- * { name: 'support email', input: 'Contact?', assert: regex(/support@/) },
21
- * ],
22
- * })
23
- * ```
24
- *
25
- * Run programmatically via `runSuite(suite)` from this entry, or via
26
- * `pnpm rudder ai:eval` once Phase 2 lands.
27
- *
28
- * Built-in metrics: `exactMatch`, `regex`, `llmJudge`, `jsonShape`,
29
- * `semanticMatch`, `tokenCost`. Compose multiple via `compose(...)`.
30
- * User-defined metrics work today — any `(response, ctx) =>
31
- * MetricResult` qualifies.
32
- */
33
- import type { Agent } from '../agent.js';
34
- import type { AgentResponse } from '../types.js';
35
- import { z } from 'zod';
36
- export { reportJson } from './json-reporter.js';
37
- export type { SuiteJson, SuiteJsonCase } from './json-reporter.js';
38
- export { stepsFromResponse } from './fixtures.js';
39
- export type { EvalFixture } from './fixtures.js';
40
- export { reportHtml } from './html-reporter.js';
41
- export type { HtmlReportOptions } from './html-reporter.js';
42
- /**
43
- * Result of a single assertion. `pass` is the only required field;
44
- * `score` (0..1) and `reason` are surfaced in reports.
45
- */
46
- export interface MetricResult {
47
- pass: boolean;
48
- score?: number;
49
- reason?: string;
50
- }
51
- /**
52
- * Assertion signature. Sync or async; the runner awaits both.
53
- *
54
- * `ctx` carries the case context so user metrics can opt into the
55
- * input/case-name (e.g. for logging). The built-ins ignore it.
56
- */
57
- export type Metric = (response: AgentResponse, ctx: MetricContext) => MetricResult | Promise<MetricResult>;
58
- export interface MetricContext {
59
- /** The case's input string (the same passed to `agent.prompt`). */
60
- input: string;
61
- /** Optional case `name` if set on the spec. */
62
- caseName: string;
63
- }
64
- /** A single eval case. */
65
- export interface EvalCase {
66
- /** Stable identifier used in reports. Defaults to `case-<index>`. */
67
- name?: string;
68
- /** Input passed to `agent.prompt(input)`. */
69
- input: string;
70
- /** The assertion. Pass-fail + optional score/reason. */
71
- assert: Metric;
72
- /**
73
- * Per-case agent override. When set, replaces the suite-level
74
- * `agent` factory for this case (e.g. swap models for a stress
75
- * test).
76
- */
77
- agent?: () => Agent;
78
- /**
79
- * Per-case timeout in ms. Defaults to the suite-level timeout
80
- * (or no timeout if neither is set).
81
- */
82
- timeout?: number;
83
- /**
84
- * Skip this case. Pass `true` to silently skip, or a string for
85
- * a reason that surfaces in the report.
86
- */
87
- skip?: boolean | string;
88
- }
89
- export interface EvalSuiteSpec {
90
- /** Factory for the agent under test. Called once per case. */
91
- agent: () => Agent;
92
- /** The cases to run. */
93
- cases: EvalCase[];
94
- /**
95
- * Suite-wide timeout in ms applied to every case unless the case
96
- * overrides. Throws cause `pass: false` with the timeout message.
97
- */
98
- timeout?: number;
99
- /**
100
- * Optional ownership / context surfaced in the HTML report (#A5
101
- * Phase 5). Well-known keys (`owner`, `lastReviewed`, `ticket`)
102
- * get formatted headings; any extra string keys render as a
103
- * generic key/value row so teams can attach their own metadata.
104
- */
105
- metadata?: EvalMetadata;
106
- }
107
- export interface EvalMetadata {
108
- owner?: string;
109
- lastReviewed?: string;
110
- ticket?: string;
111
- [key: string]: string | undefined;
112
- }
113
- export interface EvalSuite {
114
- name: string;
115
- spec: EvalSuiteSpec;
116
- }
117
- /** Per-case run record collected by {@link runSuite}. */
118
- export interface CaseResult {
119
- name: string;
120
- /** Final result; `'skipped'` skips assertion + cost. */
121
- status: 'passed' | 'failed' | 'skipped';
122
- metric?: MetricResult;
123
- /** Skip reason (when `status === 'skipped'`). */
124
- reason?: string;
125
- /** Wall-clock ms for the agent call + assertion. */
126
- duration: number;
127
- /**
128
- * Token usage from the agent's `prompt()` (zero on skip / failure
129
- * before the call). Includes BOTH the agent under test AND any
130
- * judge-model calls the assertion made.
131
- */
132
- tokens: number;
133
- /** USD estimate (see {@link estimateCost}; zero on skip). */
134
- cost: number;
135
- /**
136
- * The case's input string, copied through from `EvalCase.input`
137
- * for reporters that want to render the prompt alongside the
138
- * response (#A5 Phase 5 HTML report). Always present — runners
139
- * always know the input.
140
- */
141
- input: string;
142
- /**
143
- * The agent's final assistant text. Absent when the case skipped
144
- * or the agent threw before producing a response. The HTML
145
- * reporter renders `<no response>` in that case.
146
- */
147
- responseText?: string;
148
- }
149
- /** Full report returned by {@link runSuite}. */
150
- export interface SuiteReport {
151
- suite: string;
152
- cases: CaseResult[];
153
- passed: number;
154
- failed: number;
155
- skipped: number;
156
- duration: number;
157
- cost: number;
158
- tokens: number;
159
- /** Suite-level metadata (#A5 Phase 5), copied through from the spec. */
160
- metadata?: EvalMetadata;
161
- }
162
- /**
163
- * Define an eval suite. Returns a frozen `EvalSuite` ready to pass
164
- * into {@link runSuite} or to default-export from an `evals/*.eval.ts`
165
- * file (Phase 2's CLI auto-discovers those).
166
- *
167
- * The shape is deliberately a function rather than a class — keeps the
168
- * file's default export trivially serializable (Phase 2 needs to load
169
- * suites via dynamic import) and avoids the "did you forget `new`?"
170
- * footgun.
171
- */
172
- export declare function evalSuite(name: string, spec: EvalSuiteSpec): EvalSuite;
173
- /** Exact string equality against `response.text`. */
174
- export declare function exactMatch(expected: string): Metric;
175
- /** Pattern match against `response.text`. */
176
- export declare function regex(pattern: RegExp): Metric;
177
- /**
178
- * LLM-as-judge: ask a small model whether the response satisfies a
179
- * natural-language criterion. Returns the judge's reasoning in
180
- * `reason` so failures are debuggable.
181
- *
182
- * Design: the judge runs as a one-shot anonymous agent (no recursion
183
- * concern — default `remembers()` is `false`). Output is shaped via
184
- * `Output.object({ schema })` for deterministic parsing. Failures
185
- * (network, parse, unhandled judge error) bubble as `pass: false`
186
- * with the error in `reason` — a broken judge is not a passing case.
187
- *
188
- * Pitfall: the judge model has the same biases as any LLM. Use it
189
- * for fuzzy "did the answer mention X?" assertions; for exact
190
- * structural checks prefer `jsonShape` (Phase 3) or `regex`.
191
- */
192
- export declare function llmJudge(criterion: string, opts?: {
193
- model?: string;
194
- }): Metric;
195
- /**
196
- * Strict structural assertion: parse `response.text` as JSON
197
- * (stripping ```json fences) and run it through a zod schema.
198
- *
199
- * Pairs naturally with `Output.object({ schema })` on the agent —
200
- * if the agent declares the same schema, this metric verifies the
201
- * output actually conforms. Failures surface the zod issue path
202
- * (e.g. `customer.email`) so debugging doesn't require a separate
203
- * console log.
204
- */
205
- export declare function jsonShape<T>(schema: z.ZodType<T>): Metric;
206
- /**
207
- * Embedding-based fuzzy match. Embeds both `reference` and
208
- * `response.text` via `AI.embed()`, computes cosine similarity,
209
- * passes when >= `threshold` (default `0.85` — tighter than
210
- * `EmbeddingUserMemory`'s 0.5 retrieval-rank floor since this is
211
- * an assertion, not a ranking).
212
- *
213
- * Uses ≤ 2 embedding calls per case; embed tokens roll into the
214
- * case's cost rollup via the same side-channel `llmJudge` uses.
215
- *
216
- * Pitfall: requires a provider that implements `createEmbedding()`
217
- * (openai / google / mistral / cohere / jina). Failures (no
218
- * provider, network, etc.) surface as `pass: false` with the
219
- * error in `reason` — a broken embed is not a passing case.
220
- */
221
- export declare function semanticMatch(reference: string, opts?: {
222
- threshold?: number;
223
- model?: string;
224
- }): Metric;
225
- /**
226
- * Token budget guard. Passes when `response.usage.totalTokens
227
- * <= threshold`. Pair with cost-conscious agents to detect prompt-
228
- * size regressions before they show up as a billing surprise.
229
- *
230
- * `response.usage` is the multi-step rollup, so it's meaningful
231
- * even when the agent runs tools across several provider calls.
232
- */
233
- export declare function tokenCost(threshold: number): Metric;
234
- /**
235
- * Compose multiple metrics into one assertion. Runs them in order
236
- * and short-circuits on the first failure — failure `reason` is
237
- * surfaced; success returns `{ pass: true, score: 1 }`.
238
- *
239
- * @example
240
- * { input: '…',
241
- * assert: compose(
242
- * jsonShape(SummarySchema),
243
- * tokenCost(800),
244
- * ),
245
- * }
246
- */
247
- export declare function compose(...metrics: Metric[]): Metric;
248
- /**
249
- * Run every case in the suite, in declaration order. Returns the
250
- * full report; never throws (assertion errors become `failed` cases,
251
- * not exceptions).
252
- *
253
- * Phase 1 runs serially. Parallel execution lands in a follow-up
254
- * once we understand the rate-limit shape of real-world judge
255
- * models — sequential is correct under any rate limit.
256
- */
257
- export declare function runSuite(suite: EvalSuite): Promise<SuiteReport>;
258
- export { estimateCost, ModelPricing } from '../budget/pricing.js';
259
- export type { ModelPriceEntry } from '../budget/pricing.js';
260
- /**
261
- * Default reporter — prints a colorless ANSI-aware table to a
262
- * caller-supplied `console`-like sink. Uses Unicode pass/fail glyphs
263
- * for visual scanning. JSON / HTML reporters land in Phase 2 / 5.
264
- *
265
- * Returns the report unchanged so chains compose: `await
266
- * reportConsole(await runSuite(suite))`.
267
- */
268
- export declare function reportConsole(report: SuiteReport, sink?: {
269
- log: (s: string) => void;
270
- }): SuiteReport;
1
+ export * from '@gemstack/ai-sdk/eval';
271
2
  //# sourceMappingURL=index.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA+BG;AAGH,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,aAAa,CAAA;AACxC,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,aAAa,CAAA;AAKhD,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAA;AAEvB,OAAO,EAAE,UAAU,EAAE,MAAM,oBAAoB,CAAA;AAC/C,YAAY,EAAE,SAAS,EAAE,aAAa,EAAE,MAAM,oBAAoB,CAAA;AAClE,OAAO,EAAE,iBAAiB,EAAE,MAAM,eAAe,CAAA;AACjD,YAAY,EAAE,WAAW,EAAE,MAAM,eAAe,CAAA;AAChD,OAAO,EAAE,UAAU,EAAE,MAAM,oBAAoB,CAAA;AAC/C,YAAY,EAAE,iBAAiB,EAAE,MAAM,oBAAoB,CAAA;AAI3D;;;GAGG;AACH,MAAM,WAAW,YAAY;IAC3B,IAAI,EAAK,OAAO,CAAA;IAChB,KAAK,CAAC,EAAG,MAAM,CAAA;IACf,MAAM,CAAC,EAAE,MAAM,CAAA;CAChB;AAED;;;;;GAKG;AACH,MAAM,MAAM,MAAM,GAAG,CAAC,QAAQ,EAAE,aAAa,EAAE,GAAG,EAAE,aAAa,KAAK,YAAY,GAAG,OAAO,CAAC,YAAY,CAAC,CAAA;AAE1G,MAAM,WAAW,aAAa;IAC5B,mEAAmE;IACnE,KAAK,EAAK,MAAM,CAAA;IAChB,+CAA+C;IAC/C,QAAQ,EAAE,MAAM,CAAA;CACjB;AAED,0BAA0B;AAC1B,MAAM,WAAW,QAAQ;IACvB,qEAAqE;IACrE,IAAI,CAAC,EAAE,MAAM,CAAA;IACb,6CAA6C;IAC7C,KAAK,EAAE,MAAM,CAAA;IACb,wDAAwD;IACxD,MAAM,EAAE,MAAM,CAAA;IACd;;;;OAIG;IACH,KAAK,CAAC,EAAE,MAAM,KAAK,CAAA;IACnB;;;OAGG;IACH,OAAO,CAAC,EAAE,MAAM,CAAA;IAChB;;;OAGG;IACH,IAAI,CAAC,EAAE,OAAO,GAAG,MAAM,CAAA;CACxB;AAED,MAAM,WAAW,aAAa;IAC5B,8DAA8D;IAC9D,KAAK,EAAE,MAAM,KAAK,CAAA;IAClB,wBAAwB;IACxB,KAAK,EAAE,QAAQ,EAAE,CAAA;IACjB;;;OAGG;IACH,OAAO,CAAC,EAAE,MAAM,CAAA;IAChB;;;;;OAKG;IACH,QAAQ,CAAC,EAAE,YAAY,CAAA;CACxB;AAED,MAAM,WAAW,YAAY;IAC3B,KAAK,CAAC,EAAS,MAAM,CAAA;IACrB,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB,MAAM,CAAC,EAAQ,MAAM,CAAA;IACrB,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS,CAAA;CAClC;AAED,MAAM,WAAW,SAAS;IACxB,IAAI,EAAE,MAAM,CAAA;IACZ,IAAI,EAAE,aAAa,CAAA;CACpB;AAED,yDAAyD;AACzD,MAAM,WAAW,UAAU;IACzB,IAAI,EAAK,MAAM,CAAA;IACf,wDAAwD;IACxD,MAAM,EAAG,QAAQ,GAAG,QAAQ,GAAG,SAAS,CAAA;IACxC,MAAM,CAAC,EAAE,YAAY,CAAA;IACrB,iDAAiD;IACjD,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,oDAAoD;IACpD,QAAQ,EAAE,MAAM,CAAA;IAChB;;;;OAIG;IACH,MAAM,EAAG,MAAM,CAAA;IACf,6DAA6D;IAC7D,IAAI,EAAK,MAAM,CAAA;IACf;;;;;OAKG;IACH,KAAK,EAAI,MAAM,CAAA;IACf;;;;OAIG;IACH,YAAY,CAAC,EAAE,MAAM,CAAA;CACtB;AAED,gDAAgD;AAChD,MAAM,WAAW,WAAW;IAC1B,KAAK,EAAK,MAAM,CAAA;IAChB,KAAK,EAAK,UAAU,EAAE,CAAA;IACtB,MAAM,EAAI,MAAM,CAAA;IAChB,MAAM,EAAI,MAAM,CAAA;IAChB,OAAO,EAAG,MAAM,CAAA;IAChB,QAAQ,EAAE,MAAM,CAAA;IAChB,IAAI,EAAM,MAAM,CAAA;IAChB,MAAM,EAAI,MAAM,CAAA;IAChB,wEAAwE;IACxE,QAAQ,CAAC,EAAE,YAAY,CAAA;CACxB;AAID;;;;;;;;;GASG;AACH,wBAAgB,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,aAAa,GAAG,SAAS,CAStE;AAID,qDAAqD;AACrD,wBAAgB,UAAU,CAAC,QAAQ,EAAE,MAAM,GAAG,MAAM,CAUnD;AAED,6CAA6C;AAC7C,wBAAgB,KAAK,CAAC,OAAO,EAAE,MAAM,GAAG,MAAM,CAS7C;AAED;;;;;;;;;;;;;;GAcG;AACH,wBAAgB,QAAQ,CAAC,SAAS,EAAE,MAAM,EAAE,IAAI,GAAE;IAAE,KAAK,CAAC,EAAE,MAAM,CAAA;CAAO,GAAG,MAAM,CA6CjF;AAQD;;;;;;;;;GASG;AACH,wBAAgB,SAAS,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,MAAM,CAuBzD;AAED;;;;;;;;;;;;;;GAcG;AACH,wBAAgB,aAAa,CAC3B,SAAS,EAAE,MAAM,EACjB,IAAI,GAAE;IAAE,SAAS,CAAC,EAAE,MAAM,CAAC;IAAC,KAAK,CAAC,EAAE,MAAM,CAAA;CAAO,GAChD,MAAM,CA+BR;AAED;;;;;;;GAOG;AACH,wBAAgB,SAAS,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,CAYnD;AAED;;;;;;;;;;;;GAYG;AACH,wBAAgB,OAAO,CAAC,GAAG,OAAO,EAAE,MAAM,EAAE,GAAG,MAAM,CAQpD;AA4BD;;;;;;;;GAQG;AACH,wBAAsB,QAAQ,CAAC,KAAK,EAAE,SAAS,GAAG,OAAO,CAAC,WAAW,CAAC,CAgDrE;AA0FD,OAAO,EAAE,YAAY,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAA;AACjE,YAAY,EAAE,eAAe,EAAE,MAAM,sBAAsB,CAAA;AAI3D;;;;;;;GAOG;AACH,wBAAgB,aAAa,CAAC,MAAM,EAAE,WAAW,EAAE,IAAI,GAAE;IAAE,GAAG,EAAE,CAAC,CAAC,EAAE,MAAM,KAAK,IAAI,CAAA;CAAY,GAAG,WAAW,CAyB5G"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"AAGA,cAAc,uBAAuB,CAAA"}