ai-functions 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (400) hide show
  1. package/.turbo/turbo-build.log +5 -0
  2. package/.turbo/turbo-test.log +105 -0
  3. package/README.md +190 -86
  4. package/TODO.md +138 -0
  5. package/dist/ai-promise.d.ts +219 -0
  6. package/dist/ai-promise.d.ts.map +1 -0
  7. package/dist/ai-promise.js +610 -0
  8. package/dist/ai-promise.js.map +1 -0
  9. package/dist/ai.d.ts +285 -0
  10. package/dist/ai.d.ts.map +1 -0
  11. package/dist/ai.js +842 -0
  12. package/dist/ai.js.map +1 -0
  13. package/dist/batch/anthropic.d.ts +23 -0
  14. package/dist/batch/anthropic.d.ts.map +1 -0
  15. package/dist/batch/anthropic.js +257 -0
  16. package/dist/batch/anthropic.js.map +1 -0
  17. package/dist/batch/bedrock.d.ts +64 -0
  18. package/dist/batch/bedrock.d.ts.map +1 -0
  19. package/dist/batch/bedrock.js +586 -0
  20. package/dist/batch/bedrock.js.map +1 -0
  21. package/dist/batch/cloudflare.d.ts +37 -0
  22. package/dist/batch/cloudflare.d.ts.map +1 -0
  23. package/dist/batch/cloudflare.js +289 -0
  24. package/dist/batch/cloudflare.js.map +1 -0
  25. package/dist/batch/google.d.ts +41 -0
  26. package/dist/batch/google.d.ts.map +1 -0
  27. package/dist/batch/google.js +360 -0
  28. package/dist/batch/google.js.map +1 -0
  29. package/dist/batch/index.d.ts +31 -0
  30. package/dist/batch/index.d.ts.map +1 -0
  31. package/dist/batch/index.js +31 -0
  32. package/dist/batch/index.js.map +1 -0
  33. package/dist/batch/memory.d.ts +44 -0
  34. package/dist/batch/memory.d.ts.map +1 -0
  35. package/dist/batch/memory.js +188 -0
  36. package/dist/batch/memory.js.map +1 -0
  37. package/dist/batch/openai.d.ts +37 -0
  38. package/dist/batch/openai.d.ts.map +1 -0
  39. package/dist/batch/openai.js +403 -0
  40. package/dist/batch/openai.js.map +1 -0
  41. package/dist/batch-map.d.ts +125 -0
  42. package/dist/batch-map.d.ts.map +1 -0
  43. package/dist/batch-map.js +406 -0
  44. package/dist/batch-map.js.map +1 -0
  45. package/dist/batch-queue.d.ts +273 -0
  46. package/dist/batch-queue.d.ts.map +1 -0
  47. package/dist/batch-queue.js +271 -0
  48. package/dist/batch-queue.js.map +1 -0
  49. package/dist/context.d.ts +133 -0
  50. package/dist/context.d.ts.map +1 -0
  51. package/dist/context.js +267 -0
  52. package/dist/context.js.map +1 -0
  53. package/dist/embeddings.d.ts +123 -0
  54. package/dist/embeddings.d.ts.map +1 -0
  55. package/dist/embeddings.js +170 -0
  56. package/dist/embeddings.js.map +1 -0
  57. package/dist/eval/index.d.ts +8 -0
  58. package/dist/eval/index.d.ts.map +1 -0
  59. package/dist/eval/index.js +8 -0
  60. package/dist/eval/index.js.map +1 -0
  61. package/dist/eval/models.d.ts +66 -0
  62. package/dist/eval/models.d.ts.map +1 -0
  63. package/dist/eval/models.js +120 -0
  64. package/dist/eval/models.js.map +1 -0
  65. package/dist/eval/runner.d.ts +64 -0
  66. package/dist/eval/runner.d.ts.map +1 -0
  67. package/dist/eval/runner.js +148 -0
  68. package/dist/eval/runner.js.map +1 -0
  69. package/dist/generate.d.ts +168 -0
  70. package/dist/generate.d.ts.map +1 -0
  71. package/dist/generate.js +174 -0
  72. package/dist/generate.js.map +1 -0
  73. package/dist/index.d.ts +29 -4
  74. package/dist/index.d.ts.map +1 -1
  75. package/dist/index.js +53 -52
  76. package/dist/index.js.map +1 -1
  77. package/dist/primitives.d.ts +292 -0
  78. package/dist/primitives.d.ts.map +1 -0
  79. package/dist/primitives.js +471 -0
  80. package/dist/primitives.js.map +1 -0
  81. package/dist/providers/cloudflare.d.ts +9 -0
  82. package/dist/providers/cloudflare.d.ts.map +1 -0
  83. package/dist/providers/cloudflare.js +9 -0
  84. package/dist/providers/cloudflare.js.map +1 -0
  85. package/dist/providers/index.d.ts +9 -0
  86. package/dist/providers/index.d.ts.map +1 -0
  87. package/dist/providers/index.js +9 -0
  88. package/dist/providers/index.js.map +1 -0
  89. package/dist/schema.d.ts +54 -0
  90. package/dist/schema.d.ts.map +1 -0
  91. package/dist/schema.js +109 -0
  92. package/dist/schema.js.map +1 -0
  93. package/dist/template.d.ts +73 -0
  94. package/dist/template.d.ts.map +1 -0
  95. package/dist/template.js +129 -0
  96. package/dist/template.js.map +1 -0
  97. package/dist/types.d.ts +474 -106
  98. package/dist/types.d.ts.map +1 -1
  99. package/dist/types.js +4 -8
  100. package/dist/types.js.map +1 -1
  101. package/evalite.config.ts +19 -0
  102. package/evals/README.md +212 -0
  103. package/evals/classification.eval.ts +108 -0
  104. package/evals/marketing.eval.ts +370 -0
  105. package/evals/math.eval.ts +94 -0
  106. package/evals/run-evals.ts +166 -0
  107. package/evals/structured-output.eval.ts +143 -0
  108. package/evals/writing.eval.ts +117 -0
  109. package/examples/batch-blog-posts.ts +160 -0
  110. package/package.json +57 -57
  111. package/src/ai-promise.ts +784 -0
  112. package/src/ai.ts +1183 -0
  113. package/src/batch/anthropic.ts +375 -0
  114. package/src/batch/bedrock.ts +801 -0
  115. package/src/batch/cloudflare.ts +421 -0
  116. package/src/batch/google.ts +491 -0
  117. package/src/batch/index.ts +31 -0
  118. package/src/batch/memory.ts +253 -0
  119. package/src/batch/openai.ts +557 -0
  120. package/src/batch-map.ts +534 -0
  121. package/src/batch-queue.ts +493 -0
  122. package/src/context.ts +332 -0
  123. package/src/embeddings.ts +244 -0
  124. package/src/eval/index.ts +8 -0
  125. package/src/eval/models.ts +158 -0
  126. package/src/eval/runner.ts +217 -0
  127. package/src/generate.ts +245 -0
  128. package/src/index.ts +154 -0
  129. package/src/primitives.ts +612 -0
  130. package/src/providers/cloudflare.ts +15 -0
  131. package/src/providers/index.ts +14 -0
  132. package/src/schema.ts +147 -0
  133. package/src/template.ts +209 -0
  134. package/src/types.ts +540 -0
  135. package/test/README.md +105 -0
  136. package/test/ai-proxy.test.ts +192 -0
  137. package/test/async-iterators.test.ts +327 -0
  138. package/test/batch-background.test.ts +482 -0
  139. package/test/batch-blog-posts.test.ts +387 -0
  140. package/test/blog-generation.test.ts +510 -0
  141. package/test/browse-read.test.ts +611 -0
  142. package/test/core-functions.test.ts +694 -0
  143. package/test/decide.test.ts +393 -0
  144. package/test/define.test.ts +274 -0
  145. package/test/e2e-bedrock-manual.ts +163 -0
  146. package/test/e2e-bedrock.test.ts +191 -0
  147. package/test/e2e-flex-gateway.ts +157 -0
  148. package/test/e2e-flex-manual.ts +183 -0
  149. package/test/e2e-flex.test.ts +209 -0
  150. package/test/e2e-google-manual.ts +178 -0
  151. package/test/e2e-google.test.ts +216 -0
  152. package/test/embeddings.test.ts +284 -0
  153. package/test/evals/define-function.eval.test.ts +379 -0
  154. package/test/evals/primitives.eval.test.ts +384 -0
  155. package/test/function-types.test.ts +492 -0
  156. package/test/generate-core.test.ts +319 -0
  157. package/test/generate.test.ts +163 -0
  158. package/test/implicit-batch.test.ts +422 -0
  159. package/test/schema.test.ts +109 -0
  160. package/test/tagged-templates.test.ts +302 -0
  161. package/tsconfig.json +10 -0
  162. package/vitest.config.ts +42 -0
  163. package/LICENSE +0 -21
  164. package/bin/cli.js +0 -5
  165. package/dist/cli/index.d.ts +0 -10
  166. package/dist/cli/index.d.ts.map +0 -1
  167. package/dist/cli/index.js +0 -38
  168. package/dist/cli/index.js.map +0 -1
  169. package/dist/cli/index.test.d.ts +0 -2
  170. package/dist/cli/index.test.d.ts.map +0 -1
  171. package/dist/cli/index.test.js +0 -35
  172. package/dist/cli/index.test.js.map +0 -1
  173. package/dist/constants/models.d.ts +0 -10
  174. package/dist/constants/models.d.ts.map +0 -1
  175. package/dist/constants/models.js +0 -12
  176. package/dist/constants/models.js.map +0 -1
  177. package/dist/converters/index.d.ts +0 -3
  178. package/dist/converters/index.d.ts.map +0 -1
  179. package/dist/converters/index.js +0 -3
  180. package/dist/converters/index.js.map +0 -1
  181. package/dist/converters/model.d.ts +0 -4
  182. package/dist/converters/model.d.ts.map +0 -1
  183. package/dist/converters/model.js +0 -19
  184. package/dist/converters/model.js.map +0 -1
  185. package/dist/converters/schema.d.ts +0 -4
  186. package/dist/converters/schema.d.ts.map +0 -1
  187. package/dist/converters/schema.js +0 -25
  188. package/dist/converters/schema.js.map +0 -1
  189. package/dist/core/responses.d.ts +0 -5
  190. package/dist/core/responses.d.ts.map +0 -1
  191. package/dist/core/responses.js +0 -16
  192. package/dist/core/responses.js.map +0 -1
  193. package/dist/core/responses.test.d.ts +0 -2
  194. package/dist/core/responses.test.d.ts.map +0 -1
  195. package/dist/core/responses.test.js +0 -31
  196. package/dist/core/responses.test.js.map +0 -1
  197. package/dist/errors.d.ts +0 -6
  198. package/dist/errors.d.ts.map +0 -1
  199. package/dist/errors.js +0 -9
  200. package/dist/errors.js.map +0 -1
  201. package/dist/examples/streaming.test.d.ts +0 -2
  202. package/dist/examples/streaming.test.d.ts.map +0 -1
  203. package/dist/examples/streaming.test.js +0 -176
  204. package/dist/examples/streaming.test.js.map +0 -1
  205. package/dist/factory/__tests__/index.test.d.ts +0 -2
  206. package/dist/factory/__tests__/index.test.d.ts.map +0 -1
  207. package/dist/factory/__tests__/index.test.js +0 -430
  208. package/dist/factory/__tests__/index.test.js.map +0 -1
  209. package/dist/factory/__tests__/list.test.d.ts +0 -2
  210. package/dist/factory/__tests__/list.test.d.ts.map +0 -1
  211. package/dist/factory/__tests__/list.test.js +0 -92
  212. package/dist/factory/__tests__/list.test.js.map +0 -1
  213. package/dist/factory/index.d.ts +0 -20
  214. package/dist/factory/index.d.ts.map +0 -1
  215. package/dist/factory/index.js +0 -287
  216. package/dist/factory/index.js.map +0 -1
  217. package/dist/factory/index.test.d.ts +0 -2
  218. package/dist/factory/index.test.d.ts.map +0 -1
  219. package/dist/factory/index.test.js +0 -287
  220. package/dist/factory/index.test.js.map +0 -1
  221. package/dist/factory/list.d.ts +0 -3
  222. package/dist/factory/list.d.ts.map +0 -1
  223. package/dist/factory/list.js +0 -221
  224. package/dist/factory/list.js.map +0 -1
  225. package/dist/factory/list.test.d.ts +0 -2
  226. package/dist/factory/list.test.d.ts.map +0 -1
  227. package/dist/factory/list.test.js +0 -84
  228. package/dist/factory/list.test.js.map +0 -1
  229. package/dist/generate/index.d.ts +0 -5
  230. package/dist/generate/index.d.ts.map +0 -1
  231. package/dist/generate/index.js +0 -17
  232. package/dist/generate/index.js.map +0 -1
  233. package/dist/index.test.d.ts +0 -2
  234. package/dist/index.test.d.ts.map +0 -1
  235. package/dist/index.test.js +0 -59
  236. package/dist/index.test.js.map +0 -1
  237. package/dist/list/await.d.ts +0 -3
  238. package/dist/list/await.d.ts.map +0 -1
  239. package/dist/list/await.js +0 -28
  240. package/dist/list/await.js.map +0 -1
  241. package/dist/list/constants.d.ts +0 -4
  242. package/dist/list/constants.d.ts.map +0 -1
  243. package/dist/list/constants.js +0 -5
  244. package/dist/list/constants.js.map +0 -1
  245. package/dist/list/create-function.d.ts +0 -3
  246. package/dist/list/create-function.d.ts.map +0 -1
  247. package/dist/list/create-function.js +0 -11
  248. package/dist/list/create-function.js.map +0 -1
  249. package/dist/list/index.d.ts +0 -4
  250. package/dist/list/index.d.ts.map +0 -1
  251. package/dist/list/index.js +0 -5
  252. package/dist/list/index.js.map +0 -1
  253. package/dist/list/prompt.d.ts +0 -3
  254. package/dist/list/prompt.d.ts.map +0 -1
  255. package/dist/list/prompt.js +0 -6
  256. package/dist/list/prompt.js.map +0 -1
  257. package/dist/list/schemas.d.ts +0 -4
  258. package/dist/list/schemas.d.ts.map +0 -1
  259. package/dist/list/schemas.js +0 -8
  260. package/dist/list/schemas.js.map +0 -1
  261. package/dist/list/stream.d.ts +0 -3
  262. package/dist/list/stream.d.ts.map +0 -1
  263. package/dist/list/stream.js +0 -33
  264. package/dist/list/stream.js.map +0 -1
  265. package/dist/list/types.d.ts +0 -11
  266. package/dist/list/types.d.ts.map +0 -1
  267. package/dist/list/types.js +0 -2
  268. package/dist/list/types.js.map +0 -1
  269. package/dist/list/validation.d.ts +0 -3
  270. package/dist/list/validation.d.ts.map +0 -1
  271. package/dist/list/validation.js +0 -12
  272. package/dist/list/validation.js.map +0 -1
  273. package/dist/providers/config.d.ts +0 -4
  274. package/dist/providers/config.d.ts.map +0 -1
  275. package/dist/providers/config.js +0 -21
  276. package/dist/providers/config.js.map +0 -1
  277. package/dist/providers/config.test.d.ts +0 -2
  278. package/dist/providers/config.test.d.ts.map +0 -1
  279. package/dist/providers/config.test.js +0 -37
  280. package/dist/providers/config.test.js.map +0 -1
  281. package/dist/proxy/constants.d.ts +0 -4
  282. package/dist/proxy/constants.d.ts.map +0 -1
  283. package/dist/proxy/constants.js +0 -5
  284. package/dist/proxy/constants.js.map +0 -1
  285. package/dist/proxy/create-function.d.ts +0 -4
  286. package/dist/proxy/create-function.d.ts.map +0 -1
  287. package/dist/proxy/create-function.js +0 -24
  288. package/dist/proxy/create-function.js.map +0 -1
  289. package/dist/proxy/create-proxy.d.ts +0 -2
  290. package/dist/proxy/create-proxy.d.ts.map +0 -1
  291. package/dist/proxy/create-proxy.js +0 -11
  292. package/dist/proxy/create-proxy.js.map +0 -1
  293. package/dist/proxy/function-generator.d.ts +0 -9
  294. package/dist/proxy/function-generator.d.ts.map +0 -1
  295. package/dist/proxy/function-generator.js +0 -29
  296. package/dist/proxy/function-generator.js.map +0 -1
  297. package/dist/proxy/index.d.ts +0 -4
  298. package/dist/proxy/index.d.ts.map +0 -1
  299. package/dist/proxy/index.js +0 -4
  300. package/dist/proxy/index.js.map +0 -1
  301. package/dist/proxy/prompt.d.ts +0 -2
  302. package/dist/proxy/prompt.d.ts.map +0 -1
  303. package/dist/proxy/prompt.js +0 -6
  304. package/dist/proxy/prompt.js.map +0 -1
  305. package/dist/proxy/types.d.ts +0 -7
  306. package/dist/proxy/types.d.ts.map +0 -1
  307. package/dist/proxy/types.js +0 -2
  308. package/dist/proxy/types.js.map +0 -1
  309. package/dist/queue/manager.d.ts +0 -5
  310. package/dist/queue/manager.d.ts.map +0 -1
  311. package/dist/queue/manager.js +0 -37
  312. package/dist/queue/manager.js.map +0 -1
  313. package/dist/queue/manager.test.d.ts +0 -2
  314. package/dist/queue/manager.test.d.ts.map +0 -1
  315. package/dist/queue/manager.test.js +0 -52
  316. package/dist/queue/manager.test.js.map +0 -1
  317. package/dist/schema-converter.d.ts +0 -4
  318. package/dist/schema-converter.d.ts.map +0 -1
  319. package/dist/schema-converter.js +0 -30
  320. package/dist/schema-converter.js.map +0 -1
  321. package/dist/stream/index.d.ts +0 -7
  322. package/dist/stream/index.d.ts.map +0 -1
  323. package/dist/stream/index.js +0 -23
  324. package/dist/stream/index.js.map +0 -1
  325. package/dist/streaming/utils.d.ts +0 -4
  326. package/dist/streaming/utils.d.ts.map +0 -1
  327. package/dist/streaming/utils.js +0 -131
  328. package/dist/streaming/utils.js.map +0 -1
  329. package/dist/streaming/utils.test.d.ts +0 -2
  330. package/dist/streaming/utils.test.d.ts.map +0 -1
  331. package/dist/streaming/utils.test.js +0 -84
  332. package/dist/streaming/utils.test.js.map +0 -1
  333. package/dist/templates/result.d.ts +0 -7
  334. package/dist/templates/result.d.ts.map +0 -1
  335. package/dist/templates/result.js +0 -40
  336. package/dist/templates/result.js.map +0 -1
  337. package/dist/templates/result.test.d.ts +0 -2
  338. package/dist/templates/result.test.d.ts.map +0 -1
  339. package/dist/templates/result.test.js +0 -75
  340. package/dist/templates/result.test.js.map +0 -1
  341. package/dist/test/setup.d.ts +0 -2
  342. package/dist/test/setup.d.ts.map +0 -1
  343. package/dist/test/setup.js +0 -21
  344. package/dist/test/setup.js.map +0 -1
  345. package/dist/test-types.d.ts +0 -13
  346. package/dist/test-types.d.ts.map +0 -1
  347. package/dist/test-types.js +0 -55
  348. package/dist/test-types.js.map +0 -1
  349. package/dist/types/index.d.ts +0 -4
  350. package/dist/types/index.d.ts.map +0 -1
  351. package/dist/types/index.js +0 -4
  352. package/dist/types/index.js.map +0 -1
  353. package/dist/types/list.d.ts +0 -10
  354. package/dist/types/list.d.ts.map +0 -1
  355. package/dist/types/list.js +0 -2
  356. package/dist/types/list.js.map +0 -1
  357. package/dist/types/model.d.ts +0 -7
  358. package/dist/types/model.d.ts.map +0 -1
  359. package/dist/types/model.js +0 -2
  360. package/dist/types/model.js.map +0 -1
  361. package/dist/types/options.d.ts +0 -25
  362. package/dist/types/options.d.ts.map +0 -1
  363. package/dist/types/options.js +0 -2
  364. package/dist/types/options.js.map +0 -1
  365. package/dist/types/schema.d.ts +0 -5
  366. package/dist/types/schema.d.ts.map +0 -1
  367. package/dist/types/schema.js +0 -2
  368. package/dist/types/schema.js.map +0 -1
  369. package/dist/utils/__tests__/request-handler.test.d.ts +0 -2
  370. package/dist/utils/__tests__/request-handler.test.d.ts.map +0 -1
  371. package/dist/utils/__tests__/request-handler.test.js +0 -134
  372. package/dist/utils/__tests__/request-handler.test.js.map +0 -1
  373. package/dist/utils/__tests__/schema.test.d.ts +0 -2
  374. package/dist/utils/__tests__/schema.test.d.ts.map +0 -1
  375. package/dist/utils/__tests__/schema.test.js +0 -49
  376. package/dist/utils/__tests__/schema.test.js.map +0 -1
  377. package/dist/utils/__tests__/stream-progress.test.d.ts +0 -2
  378. package/dist/utils/__tests__/stream-progress.test.d.ts.map +0 -1
  379. package/dist/utils/__tests__/stream-progress.test.js +0 -85
  380. package/dist/utils/__tests__/stream-progress.test.js.map +0 -1
  381. package/dist/utils/index.d.ts +0 -2
  382. package/dist/utils/index.d.ts.map +0 -1
  383. package/dist/utils/index.js +0 -2
  384. package/dist/utils/index.js.map +0 -1
  385. package/dist/utils/request-handler.d.ts +0 -17
  386. package/dist/utils/request-handler.d.ts.map +0 -1
  387. package/dist/utils/request-handler.js +0 -105
  388. package/dist/utils/request-handler.js.map +0 -1
  389. package/dist/utils/schema.d.ts +0 -11
  390. package/dist/utils/schema.d.ts.map +0 -1
  391. package/dist/utils/schema.js +0 -51
  392. package/dist/utils/schema.js.map +0 -1
  393. package/dist/utils/stream-progress.d.ts +0 -17
  394. package/dist/utils/stream-progress.d.ts.map +0 -1
  395. package/dist/utils/stream-progress.js +0 -86
  396. package/dist/utils/stream-progress.js.map +0 -1
  397. package/dist/utils/validation.d.ts +0 -3
  398. package/dist/utils/validation.d.ts.map +0 -1
  399. package/dist/utils/validation.js +0 -30
  400. package/dist/utils/validation.js.map +0 -1
@@ -0,0 +1,19 @@
1
+ import { defineConfig } from 'evalite/config'
2
+
/**
 * Evalite configuration for the eval suite.
 *
 * Built into a named constant first, then exported as the module default.
 */
const evalConfig = defineConfig({
  // One trial per eval unless an individual eval overrides it
  // (e.g. LLM-as-judge evals).
  trialCount: 1,

  // API calls can be slow, so give each test a generous timeout (ms).
  testTimeout: 60_000,

  // Cap parallel evals at 5 to stay friendly to provider rate limits.
  maxConcurrency: 5,

  // CI fails when the average score falls below 70%.
  scoreThreshold: 70,

  server: {
    port: 3006,
  },
})

export default evalConfig
@@ -0,0 +1,212 @@
1
+ # AI Functions Eval Suite
2
+
3
+ Evaluations for ai-functions using both vitest-based tests and a custom eval runner.
4
+
5
+ ## Quick Start
6
+
7
+ ### Vitest-Based Evals (Recommended)
8
+
9
+ These tests exercise the core AI primitives (`code`, `ai`, `list`, `is`, `defineFunction`, etc.) using real AI calls:
10
+
11
+ ```bash
12
+ # Run all eval tests
13
+ pnpm test:evals
14
+
15
+ # Run primitives eval (code, ai, list, is, etc.)
16
+ pnpm test:evals:primitives
17
+
18
+ # Run defineFunction eval
19
+ pnpm test:evals:define
20
+
21
+ # Run with specific model
22
+ MODEL=sonnet pnpm test:evals
23
+
24
+ # Run with specific tiers
25
+ EVAL_TIERS=best,fast pnpm test:evals
26
+ ```
27
+
28
+ ### Custom Runner Evals
29
+
30
+ Math and classification evals with detailed scoring:
31
+
32
+ ```bash
33
+ # Run all evals (math + classification)
34
+ pnpm eval
35
+
36
+ # Run specific eval
37
+ pnpm eval:math
38
+ pnpm eval:class
39
+
40
+ # Run all tiers (best, fast, cheap)
41
+ pnpm eval:all
42
+ ```
43
+
44
+ ## Eval Suites
45
+
46
+ ### Vitest Evals (test/evals/)
47
+
48
+ | Test Suite | Functions Tested | Test Cases |
49
+ |------------|------------------|------------|
50
+ | `primitives.eval.test.ts` | `code()`, `ai()`, `list()`, `is()`, `summarize()`, `extract()`, `write()`, `lists()` | Code generation, text generation, classification, extraction |
51
+ | `define-function.eval.test.ts` | `defineFunction()`, `define.generative()`, `define.code()` | Generative functions, code functions, structured outputs |
52
+
53
+ ### Custom Runner Evals (evals/)
54
+
55
+ | Eval | Tests | Scoring |
56
+ |------|-------|---------|
57
+ | `Math` | Arithmetic, word problems | Correct answer + shows work |
58
+ | `Classification` | Sentiment, support tickets | Accuracy + calibration |
59
+ | `Marketing` | Marketing copy generation | LLM-as-judge ELO ranking |
60
+
61
+ ### Marketing Copy Eval (LLM-as-Judge)
62
+
63
+ ```bash
64
+ # Run marketing eval (fast tier only)
65
+ pnpm eval:marketing
66
+
67
+ # Run with all tiers
68
+ pnpm eval:marketing:all
69
+
70
+ # Use different judge model
71
+ pnpm eval:marketing -- --judge=opus
72
+ ```
73
+
74
+ Generates marketing copy (title, description, hero headline/subhead, CTAs) for different scenarios and uses pairwise LLM-as-judge comparisons to create ELO rankings.
75
+
76
+ ## Latest Results (December 2025)
77
+
78
+ **Overall: 94.0%** | Cost: $0.06 | Time: 95s | 10 Models
79
+
80
+ ### Performance Summary
81
+
82
+ | Model | Math | Class | Overall | Avg Latency | Notes |
83
+ |-------|------|-------|---------|-------------|-------|
84
+ | Claude Sonnet 4.5 | 100% | 100% | **100%** | ~380ms | Best overall |
85
+ | GPT-5 Mini | 100% | 91.7% | 95.9% | ~1850ms | Slower but accurate |
86
+ | Gemini 2.5 Flash | 100% | 91.7% | 95.9% | ~200ms | **Fastest** |
87
+ | DeepSeek Chat | 100% | 91.7% | 95.9% | ~210ms | Great value |
88
+ | Mistral Medium 3.1 | 96% | 100% | 98.0% | ~850ms | Strong classify |
89
+ | Grok 4.1 Fast | 100% | 91.7% | 95.9% | ~2300ms | 2M context |
90
+ | Grok 4 Fast | 92% | 100% | 96.0% | ~1800ms | Good balance |
91
+ | Qwen3 30B | 96% | 91.7% | 93.9% | ~8900ms | Slowest |
92
+ | Llama 3.3 70B | 90% | 91.7% | 90.9% | ~185ms | Fast open model |
93
+ | GPT-oss 20B | 72% | 83.3% | 77.7% | ~1200ms | Open source |
94
+
95
+ ### Performance/$ Analysis (Fast Tier)
96
+
97
+ | Model | Score | Est $/1M tokens | Score/$ | Recommendation |
98
+ |-------|-------|-----------------|---------|----------------|
99
+ | DeepSeek Chat | 95.9% | $0.28 | **342** | Best value |
100
+ | Gemini 2.5 Flash | 95.9% | $0.30 | 320 | Fast + cheap |
101
+ | Llama 3.3 70B | 90.9% | $0.40 | 227 | Good OSS option |
102
+ | Claude Sonnet 4.5 | 100% | $3.00 | 33 | Best quality |
103
+ | Mistral Medium 3.1 | 98.0% | $2.50 | 39 | Strong balance |
104
+ | GPT-5 Mini | 95.9% | $1.00 | 96 | OpenAI ecosystem |
105
+ | Grok 4.1 Fast | 95.9% | $2.00 | 48 | 2M context |
106
+
107
+ ### Math Eval (94.6%)
108
+
109
+ | Model | Score | Avg Latency |
110
+ |-------|-------|-------------|
111
+ | Claude Sonnet 4.5 | 100% | ~380ms |
112
+ | GPT-5 Mini | 100% | ~200ms |
113
+ | Gemini 2.5 Flash | 100% | ~170ms |
114
+ | DeepSeek Chat | 100% | ~220ms |
115
+ | Grok 4.1 Fast | 100% | ~2600ms |
116
+ | Mistral Medium 3.1 | 96% | ~1040ms |
117
+ | Qwen3 30B | 96% | ~13000ms |
118
+ | Grok 4 Fast | 92% | ~2000ms |
119
+ | Llama 3.3 70B | 90% | ~170ms |
120
+ | GPT-oss 20B | 72% | ~180ms |
121
+
122
+ ### Classification Eval (93.3%)
123
+
124
+ | Model | Score | Avg Latency |
125
+ |-------|-------|-------------|
126
+ | Claude Sonnet 4.5 | 100% | ~205ms |
127
+ | Mistral Medium 3.1 | 100% | ~700ms |
128
+ | Grok 4 Fast | 100% | ~1670ms |
129
+ | GPT-5 Mini | 91.7% | ~3500ms |
130
+ | Gemini 2.5 Flash | 91.7% | ~235ms |
131
+ | Llama 3.3 70B | 91.7% | ~230ms |
132
+ | DeepSeek Chat | 91.7% | ~230ms |
133
+ | Qwen3 30B | 91.7% | ~3970ms |
134
+ | Grok 4.1 Fast | 91.7% | ~2170ms |
135
+ | GPT-oss 20B | 83.3% | ~2840ms |
136
+
137
+ ### Marketing Copy Eval (ELO Rankings)
138
+
139
+ Uses LLM-as-judge (Claude Sonnet) for pairwise comparisons across 4 test scenarios.
140
+
141
+ | Rank | Model | ELO | W | L | D | Notes |
142
+ |------|-------|-----|---|---|---|-------|
143
+ | 1 | Claude Sonnet 4.5 | **1745** | 31 | 3 | 0 | Dominant winner |
144
+ | 2 | Grok 4.1 Fast | 1595 | 22 | 12 | 0 | Strong creative |
145
+ | 3 | GPT-5 Mini | 1593 | 26 | 8 | 0 | Consistent quality |
146
+ | 4 | Grok 4 Fast | 1558 | 17 | 17 | 0 | Good balance |
147
+ | 5 | Gemini 2.5 Flash | 1503 | 14 | 20 | 0 | Middle tier |
148
+ | 6 | Mistral Medium 3.1 | 1481 | 16 | 18 | 0 | Solid performer |
149
+ | 7 | GPT-oss 20B | 1471 | 19 | 15 | 0 | OSS option |
150
+ | 8 | DeepSeek Chat | 1449 | 10 | 16 | 0 | Value option |
151
+ | 9 | Qwen3 30B | 1371 | 6 | 20 | 0 | Below average |
152
+ | 10 | Llama 3.3 70B | 1231 | 1 | 33 | 0 | Struggled |
153
+
154
+ **Key Insights:**
155
+ - Claude Sonnet 4.5 won 31 of 34 comparisons (91%)
156
+ - Grok models performed unexpectedly well on creative tasks
157
+ - Llama 3.3 70B, despite being strong on classification, struggled with marketing copy
158
+
159
+ ## Models
160
+
161
+ Model IDs come from the `language-models` package and are routed via `ai-providers`.
162
+
163
+ ### Model Tiers
164
+
165
+ | Tier | Description | Models |
166
+ |------|-------------|--------|
167
+ | `best` | Highest capability | opus, o3, gpt-5.1, gemini-pro, deepseek-v3.2, mistral-large-3, qwen3-coder, grok-4 |
168
+ | `fast` | Good balance | sonnet, gpt-5-mini, flash, llama-3.3-70b, mistral-medium-3.1, qwen3-30b, grok-4.1-fast |
169
+ | `cheap` | Cost-optimized | haiku, gpt-5-nano, ministral-14b |
170
+
171
+ ### Full Model List (December 2025)
172
+
173
+ - **Anthropic**: `opus`, `sonnet`, `haiku`
174
+ - **OpenAI**: `openai/gpt-5.1`, `openai/gpt-5-mini`, `openai/gpt-5-nano`, `openai/o3`
175
+ - **OpenAI OSS**: `openai/gpt-oss-120b`, `openai/gpt-oss-20b` (open source models)
176
+ - **Google**: `gemini-pro`, `flash`
177
+ - **Meta**: `meta-llama/llama-4-maverick`, `meta-llama/llama-3.3-70b-instruct`
178
+ - **DeepSeek**: `deepseek/deepseek-v3.2`, `deepseek/deepseek-v3.2-speciale`, `deepseek/deepseek-chat`
179
+ - **Mistral**: `mistralai/mistral-large-2512` (Mistral Large 3), `mistralai/mistral-medium-3.1`, `mistralai/ministral-14b-2512`
180
+ - **Qwen**: `qwen/qwen3-coder`, `qwen/qwen3-30b-a3b`, `qwen/qwen3-next-80b-a3b-instruct`
181
+ - **xAI**: `x-ai/grok-4`, `x-ai/grok-4.1-fast`, `x-ai/grok-4-fast`
182
+
183
+ ## Environment
184
+
185
+ ```bash
186
+ # Use AI Gateway (recommended)
187
+ AI_GATEWAY_URL=https://gateway.ai.cloudflare.com/v1/...
188
+ AI_GATEWAY_TOKEN=...
189
+
190
+ # Or direct API keys
191
+ ANTHROPIC_API_KEY=sk-ant-...
192
+ OPENAI_API_KEY=sk-...
193
+ ```
194
+
195
+ ## Adding Evals
196
+
197
+ ### Vitest-Based Evals
198
+
199
+ 1. Create a new test file in `test/evals/`
200
+ 2. Import functions and models:
201
+ ```typescript
202
+ import { code, ai, list } from '../../src/primitives.js'
203
+ import { EVAL_MODELS, type EvalModel } from '../../src/eval/models.js'
204
+ ```
205
+ 3. Use `describe.skipIf(!hasAPI)` to skip the suite when no API access is available
206
+ 4. Loop over models with `for (const model of models)`
207
+
208
+ ### Custom Runner Evals
209
+
210
+ 1. Add test cases to `evals/run-evals.ts`
211
+ 2. Use `runEval()` with `task` function and `scorers` array
212
+ 3. Use `createModelVariants({ tiers: ['fast'] })` to filter models
@@ -0,0 +1,108 @@
1
+ /**
2
+ * Classification Eval
3
+ *
4
+ * Tests model ability to classify inputs correctly.
5
+ * Includes sentiment analysis and support-ticket category classification.
6
+ */
7
+
8
+ import { evalite } from 'evalite'
9
+ import { generateObject } from '../src/generate.js'
10
+ import { schema } from '../src/schema.js'
11
+ import { createModelVariants, type EvalModel } from '../src/eval/models.js'
12
+
// Label sets offered to the model for each family of test cases.
const SENTIMENT_LABELS = ['positive', 'negative', 'neutral']
const TICKET_LABELS = ['account', 'billing', 'technical', 'shipping']

/**
 * Builds a single classification case.
 *
 * Each case receives its own copy of the label list so that cases never
 * share an `options` array.
 */
const makeCase = (text: string, expected: string, labels: string[]) => ({
  text,
  expected,
  options: [...labels],
})

// Classification test cases
const TEST_CASES = [
  // Sentiment
  makeCase('This product exceeded my expectations!', 'positive', SENTIMENT_LABELS),
  makeCase('The delivery was late and packaging damaged.', 'negative', SENTIMENT_LABELS),
  makeCase('The product arrived as described.', 'neutral', SENTIMENT_LABELS),

  // Support ticket classification
  makeCase('I need to reset my password', 'account', TICKET_LABELS),
  makeCase('When will my refund be processed?', 'billing', TICKET_LABELS),
  makeCase('The app crashes when uploading images', 'technical', TICKET_LABELS),
  makeCase('My package shows delivered but I never received it', 'shipping', TICKET_LABELS),
]
26
+
// Evaluate every model in the 'fast' and 'cheap' tiers.
const modelVariants = createModelVariants({ tiers: ['fast', 'cheap'] })

/**
 * Scores how well a stated confidence matches whether the prediction was right.
 *
 * Bands (confidence is coerced with `Number()`):
 * - correct:   >= 0.7 -> 1,   [0.5, 0.7) -> 0.8,  < 0.5 -> 0.6 (underconfident)
 * - incorrect: <= 0.5 -> 0.8, (0.5, 0.7] -> 0.5,  > 0.7 -> 0.2 (overconfident)
 * - confidence that is not a number -> 0.5 (neutral)
 *
 * The score never decreases as confidence rises on a correct answer, and
 * never increases as confidence rises on an incorrect one. A correct answer
 * with moderate confidence therefore cannot score below a correct answer
 * with low confidence.
 *
 * @param correct - Whether the predicted category equals the expected one.
 * @param confidence - Confidence reported by the model; expected in 0-1.
 * @returns A calibration score between 0 and 1.
 */
function scoreCalibration(correct: boolean, confidence: unknown): number {
  const conf = Number(confidence)

  // Missing or non-numeric confidence cannot be judged either way.
  if (Number.isNaN(conf)) return 0.5

  if (correct) {
    if (conf >= 0.7) return 1
    if (conf >= 0.5) return 0.8
    return 0.6 // Underconfident
  }

  if (conf <= 0.5) return 0.8
  if (conf <= 0.7) return 0.5
  return 0.2 // Overconfident
}

/**
 * Classification eval, run once per model variant.
 *
 * For each test case the model is asked to pick one of the case's options and
 * report a confidence. Outputs are scored on accuracy, on whether the answer
 * is one of the offered options, and on confidence calibration.
 */
evalite.each(modelVariants)('Classification', {
  data: TEST_CASES.map(tc => ({ input: tc, expected: tc.expected })),

  task: async (input, variant) => {
    const model = variant as EvalModel
    const startTime = Date.now()

    // Options joined with ' | ' form the `category` field definition.
    // NOTE(review): presumably the schema helper reads this as an enum - confirm.
    const enumStr = input.options.join(' | ')

    const { object, usage } = await generateObject({
      model: model.id,
      schema: schema({
        category: enumStr,
        confidence: 'Confidence 0-1 (number)',
      }),
      prompt: `Classify this text into one of: ${input.options.join(', ')}

Text: "${input.text}"`,
    })

    // Wall-clock time of the generateObject call.
    const latencyMs = Date.now() - startTime

    return {
      predicted: object.category,
      confidence: object.confidence,
      expected: input.expected,
      text: input.text,
      options: input.options,
      modelId: model.id,
      modelName: model.name,
      latencyMs,
      usage,
    }
  },

  scorers: [
    // Accuracy
    {
      name: 'Accuracy',
      description: 'Whether classification is correct',
      scorer: ({ output, expected }) => ({
        score: output.predicted === expected ? 1 : 0,
      }),
    },

    // Valid category
    {
      name: 'Valid Category',
      description: 'Whether output is a valid option',
      scorer: ({ output }) => ({
        score: (output.options as string[]).includes(output.predicted as string) ? 1 : 0,
      }),
    },

    // Calibration
    {
      name: 'Calibration',
      description: 'Confidence matches accuracy',
      // High confidence when correct, low when wrong = well calibrated
      scorer: ({ output, expected }) => ({
        score: scoreCalibration(output.predicted === expected, output.confidence),
      }),
    },
  ],

  columns: ({ output, expected }) => {
    const confidencePct = Number(output.confidence) * 100

    return [
      { label: 'Model', value: output.modelName },
      { label: 'Expected', value: expected },
      { label: 'Got', value: output.predicted },
      { label: 'Correct', value: output.predicted === expected ? 'Yes' : 'No' },
      // Show 'n/a' rather than 'NaN%' when the model gave no usable confidence.
      {
        label: 'Confidence',
        value: Number.isNaN(confidencePct) ? 'n/a' : `${confidencePct.toFixed(0)}%`,
      },
    ]
  },
})