ai-functions 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (400) hide show
  1. package/.turbo/turbo-build.log +5 -0
  2. package/.turbo/turbo-test.log +105 -0
  3. package/README.md +190 -86
  4. package/TODO.md +138 -0
  5. package/dist/ai-promise.d.ts +219 -0
  6. package/dist/ai-promise.d.ts.map +1 -0
  7. package/dist/ai-promise.js +610 -0
  8. package/dist/ai-promise.js.map +1 -0
  9. package/dist/ai.d.ts +285 -0
  10. package/dist/ai.d.ts.map +1 -0
  11. package/dist/ai.js +842 -0
  12. package/dist/ai.js.map +1 -0
  13. package/dist/batch/anthropic.d.ts +23 -0
  14. package/dist/batch/anthropic.d.ts.map +1 -0
  15. package/dist/batch/anthropic.js +257 -0
  16. package/dist/batch/anthropic.js.map +1 -0
  17. package/dist/batch/bedrock.d.ts +64 -0
  18. package/dist/batch/bedrock.d.ts.map +1 -0
  19. package/dist/batch/bedrock.js +586 -0
  20. package/dist/batch/bedrock.js.map +1 -0
  21. package/dist/batch/cloudflare.d.ts +37 -0
  22. package/dist/batch/cloudflare.d.ts.map +1 -0
  23. package/dist/batch/cloudflare.js +289 -0
  24. package/dist/batch/cloudflare.js.map +1 -0
  25. package/dist/batch/google.d.ts +41 -0
  26. package/dist/batch/google.d.ts.map +1 -0
  27. package/dist/batch/google.js +360 -0
  28. package/dist/batch/google.js.map +1 -0
  29. package/dist/batch/index.d.ts +31 -0
  30. package/dist/batch/index.d.ts.map +1 -0
  31. package/dist/batch/index.js +31 -0
  32. package/dist/batch/index.js.map +1 -0
  33. package/dist/batch/memory.d.ts +44 -0
  34. package/dist/batch/memory.d.ts.map +1 -0
  35. package/dist/batch/memory.js +188 -0
  36. package/dist/batch/memory.js.map +1 -0
  37. package/dist/batch/openai.d.ts +37 -0
  38. package/dist/batch/openai.d.ts.map +1 -0
  39. package/dist/batch/openai.js +403 -0
  40. package/dist/batch/openai.js.map +1 -0
  41. package/dist/batch-map.d.ts +125 -0
  42. package/dist/batch-map.d.ts.map +1 -0
  43. package/dist/batch-map.js +406 -0
  44. package/dist/batch-map.js.map +1 -0
  45. package/dist/batch-queue.d.ts +273 -0
  46. package/dist/batch-queue.d.ts.map +1 -0
  47. package/dist/batch-queue.js +271 -0
  48. package/dist/batch-queue.js.map +1 -0
  49. package/dist/context.d.ts +133 -0
  50. package/dist/context.d.ts.map +1 -0
  51. package/dist/context.js +267 -0
  52. package/dist/context.js.map +1 -0
  53. package/dist/embeddings.d.ts +123 -0
  54. package/dist/embeddings.d.ts.map +1 -0
  55. package/dist/embeddings.js +170 -0
  56. package/dist/embeddings.js.map +1 -0
  57. package/dist/eval/index.d.ts +8 -0
  58. package/dist/eval/index.d.ts.map +1 -0
  59. package/dist/eval/index.js +8 -0
  60. package/dist/eval/index.js.map +1 -0
  61. package/dist/eval/models.d.ts +66 -0
  62. package/dist/eval/models.d.ts.map +1 -0
  63. package/dist/eval/models.js +120 -0
  64. package/dist/eval/models.js.map +1 -0
  65. package/dist/eval/runner.d.ts +64 -0
  66. package/dist/eval/runner.d.ts.map +1 -0
  67. package/dist/eval/runner.js +148 -0
  68. package/dist/eval/runner.js.map +1 -0
  69. package/dist/generate.d.ts +168 -0
  70. package/dist/generate.d.ts.map +1 -0
  71. package/dist/generate.js +174 -0
  72. package/dist/generate.js.map +1 -0
  73. package/dist/index.d.ts +29 -4
  74. package/dist/index.d.ts.map +1 -1
  75. package/dist/index.js +53 -52
  76. package/dist/index.js.map +1 -1
  77. package/dist/primitives.d.ts +292 -0
  78. package/dist/primitives.d.ts.map +1 -0
  79. package/dist/primitives.js +471 -0
  80. package/dist/primitives.js.map +1 -0
  81. package/dist/providers/cloudflare.d.ts +9 -0
  82. package/dist/providers/cloudflare.d.ts.map +1 -0
  83. package/dist/providers/cloudflare.js +9 -0
  84. package/dist/providers/cloudflare.js.map +1 -0
  85. package/dist/providers/index.d.ts +9 -0
  86. package/dist/providers/index.d.ts.map +1 -0
  87. package/dist/providers/index.js +9 -0
  88. package/dist/providers/index.js.map +1 -0
  89. package/dist/schema.d.ts +54 -0
  90. package/dist/schema.d.ts.map +1 -0
  91. package/dist/schema.js +109 -0
  92. package/dist/schema.js.map +1 -0
  93. package/dist/template.d.ts +73 -0
  94. package/dist/template.d.ts.map +1 -0
  95. package/dist/template.js +129 -0
  96. package/dist/template.js.map +1 -0
  97. package/dist/types.d.ts +474 -106
  98. package/dist/types.d.ts.map +1 -1
  99. package/dist/types.js +4 -8
  100. package/dist/types.js.map +1 -1
  101. package/evalite.config.ts +19 -0
  102. package/evals/README.md +212 -0
  103. package/evals/classification.eval.ts +108 -0
  104. package/evals/marketing.eval.ts +370 -0
  105. package/evals/math.eval.ts +94 -0
  106. package/evals/run-evals.ts +166 -0
  107. package/evals/structured-output.eval.ts +143 -0
  108. package/evals/writing.eval.ts +117 -0
  109. package/examples/batch-blog-posts.ts +160 -0
  110. package/package.json +57 -57
  111. package/src/ai-promise.ts +784 -0
  112. package/src/ai.ts +1183 -0
  113. package/src/batch/anthropic.ts +375 -0
  114. package/src/batch/bedrock.ts +801 -0
  115. package/src/batch/cloudflare.ts +421 -0
  116. package/src/batch/google.ts +491 -0
  117. package/src/batch/index.ts +31 -0
  118. package/src/batch/memory.ts +253 -0
  119. package/src/batch/openai.ts +557 -0
  120. package/src/batch-map.ts +534 -0
  121. package/src/batch-queue.ts +493 -0
  122. package/src/context.ts +332 -0
  123. package/src/embeddings.ts +244 -0
  124. package/src/eval/index.ts +8 -0
  125. package/src/eval/models.ts +158 -0
  126. package/src/eval/runner.ts +217 -0
  127. package/src/generate.ts +245 -0
  128. package/src/index.ts +154 -0
  129. package/src/primitives.ts +612 -0
  130. package/src/providers/cloudflare.ts +15 -0
  131. package/src/providers/index.ts +14 -0
  132. package/src/schema.ts +147 -0
  133. package/src/template.ts +209 -0
  134. package/src/types.ts +540 -0
  135. package/test/README.md +105 -0
  136. package/test/ai-proxy.test.ts +192 -0
  137. package/test/async-iterators.test.ts +327 -0
  138. package/test/batch-background.test.ts +482 -0
  139. package/test/batch-blog-posts.test.ts +387 -0
  140. package/test/blog-generation.test.ts +510 -0
  141. package/test/browse-read.test.ts +611 -0
  142. package/test/core-functions.test.ts +694 -0
  143. package/test/decide.test.ts +393 -0
  144. package/test/define.test.ts +274 -0
  145. package/test/e2e-bedrock-manual.ts +163 -0
  146. package/test/e2e-bedrock.test.ts +191 -0
  147. package/test/e2e-flex-gateway.ts +157 -0
  148. package/test/e2e-flex-manual.ts +183 -0
  149. package/test/e2e-flex.test.ts +209 -0
  150. package/test/e2e-google-manual.ts +178 -0
  151. package/test/e2e-google.test.ts +216 -0
  152. package/test/embeddings.test.ts +284 -0
  153. package/test/evals/define-function.eval.test.ts +379 -0
  154. package/test/evals/primitives.eval.test.ts +384 -0
  155. package/test/function-types.test.ts +492 -0
  156. package/test/generate-core.test.ts +319 -0
  157. package/test/generate.test.ts +163 -0
  158. package/test/implicit-batch.test.ts +422 -0
  159. package/test/schema.test.ts +109 -0
  160. package/test/tagged-templates.test.ts +302 -0
  161. package/tsconfig.json +10 -0
  162. package/vitest.config.ts +42 -0
  163. package/LICENSE +0 -21
  164. package/bin/cli.js +0 -5
  165. package/dist/cli/index.d.ts +0 -10
  166. package/dist/cli/index.d.ts.map +0 -1
  167. package/dist/cli/index.js +0 -38
  168. package/dist/cli/index.js.map +0 -1
  169. package/dist/cli/index.test.d.ts +0 -2
  170. package/dist/cli/index.test.d.ts.map +0 -1
  171. package/dist/cli/index.test.js +0 -35
  172. package/dist/cli/index.test.js.map +0 -1
  173. package/dist/constants/models.d.ts +0 -10
  174. package/dist/constants/models.d.ts.map +0 -1
  175. package/dist/constants/models.js +0 -12
  176. package/dist/constants/models.js.map +0 -1
  177. package/dist/converters/index.d.ts +0 -3
  178. package/dist/converters/index.d.ts.map +0 -1
  179. package/dist/converters/index.js +0 -3
  180. package/dist/converters/index.js.map +0 -1
  181. package/dist/converters/model.d.ts +0 -4
  182. package/dist/converters/model.d.ts.map +0 -1
  183. package/dist/converters/model.js +0 -19
  184. package/dist/converters/model.js.map +0 -1
  185. package/dist/converters/schema.d.ts +0 -4
  186. package/dist/converters/schema.d.ts.map +0 -1
  187. package/dist/converters/schema.js +0 -25
  188. package/dist/converters/schema.js.map +0 -1
  189. package/dist/core/responses.d.ts +0 -5
  190. package/dist/core/responses.d.ts.map +0 -1
  191. package/dist/core/responses.js +0 -16
  192. package/dist/core/responses.js.map +0 -1
  193. package/dist/core/responses.test.d.ts +0 -2
  194. package/dist/core/responses.test.d.ts.map +0 -1
  195. package/dist/core/responses.test.js +0 -31
  196. package/dist/core/responses.test.js.map +0 -1
  197. package/dist/errors.d.ts +0 -6
  198. package/dist/errors.d.ts.map +0 -1
  199. package/dist/errors.js +0 -9
  200. package/dist/errors.js.map +0 -1
  201. package/dist/examples/streaming.test.d.ts +0 -2
  202. package/dist/examples/streaming.test.d.ts.map +0 -1
  203. package/dist/examples/streaming.test.js +0 -176
  204. package/dist/examples/streaming.test.js.map +0 -1
  205. package/dist/factory/__tests__/index.test.d.ts +0 -2
  206. package/dist/factory/__tests__/index.test.d.ts.map +0 -1
  207. package/dist/factory/__tests__/index.test.js +0 -430
  208. package/dist/factory/__tests__/index.test.js.map +0 -1
  209. package/dist/factory/__tests__/list.test.d.ts +0 -2
  210. package/dist/factory/__tests__/list.test.d.ts.map +0 -1
  211. package/dist/factory/__tests__/list.test.js +0 -92
  212. package/dist/factory/__tests__/list.test.js.map +0 -1
  213. package/dist/factory/index.d.ts +0 -20
  214. package/dist/factory/index.d.ts.map +0 -1
  215. package/dist/factory/index.js +0 -287
  216. package/dist/factory/index.js.map +0 -1
  217. package/dist/factory/index.test.d.ts +0 -2
  218. package/dist/factory/index.test.d.ts.map +0 -1
  219. package/dist/factory/index.test.js +0 -287
  220. package/dist/factory/index.test.js.map +0 -1
  221. package/dist/factory/list.d.ts +0 -3
  222. package/dist/factory/list.d.ts.map +0 -1
  223. package/dist/factory/list.js +0 -221
  224. package/dist/factory/list.js.map +0 -1
  225. package/dist/factory/list.test.d.ts +0 -2
  226. package/dist/factory/list.test.d.ts.map +0 -1
  227. package/dist/factory/list.test.js +0 -84
  228. package/dist/factory/list.test.js.map +0 -1
  229. package/dist/generate/index.d.ts +0 -5
  230. package/dist/generate/index.d.ts.map +0 -1
  231. package/dist/generate/index.js +0 -17
  232. package/dist/generate/index.js.map +0 -1
  233. package/dist/index.test.d.ts +0 -2
  234. package/dist/index.test.d.ts.map +0 -1
  235. package/dist/index.test.js +0 -59
  236. package/dist/index.test.js.map +0 -1
  237. package/dist/list/await.d.ts +0 -3
  238. package/dist/list/await.d.ts.map +0 -1
  239. package/dist/list/await.js +0 -28
  240. package/dist/list/await.js.map +0 -1
  241. package/dist/list/constants.d.ts +0 -4
  242. package/dist/list/constants.d.ts.map +0 -1
  243. package/dist/list/constants.js +0 -5
  244. package/dist/list/constants.js.map +0 -1
  245. package/dist/list/create-function.d.ts +0 -3
  246. package/dist/list/create-function.d.ts.map +0 -1
  247. package/dist/list/create-function.js +0 -11
  248. package/dist/list/create-function.js.map +0 -1
  249. package/dist/list/index.d.ts +0 -4
  250. package/dist/list/index.d.ts.map +0 -1
  251. package/dist/list/index.js +0 -5
  252. package/dist/list/index.js.map +0 -1
  253. package/dist/list/prompt.d.ts +0 -3
  254. package/dist/list/prompt.d.ts.map +0 -1
  255. package/dist/list/prompt.js +0 -6
  256. package/dist/list/prompt.js.map +0 -1
  257. package/dist/list/schemas.d.ts +0 -4
  258. package/dist/list/schemas.d.ts.map +0 -1
  259. package/dist/list/schemas.js +0 -8
  260. package/dist/list/schemas.js.map +0 -1
  261. package/dist/list/stream.d.ts +0 -3
  262. package/dist/list/stream.d.ts.map +0 -1
  263. package/dist/list/stream.js +0 -33
  264. package/dist/list/stream.js.map +0 -1
  265. package/dist/list/types.d.ts +0 -11
  266. package/dist/list/types.d.ts.map +0 -1
  267. package/dist/list/types.js +0 -2
  268. package/dist/list/types.js.map +0 -1
  269. package/dist/list/validation.d.ts +0 -3
  270. package/dist/list/validation.d.ts.map +0 -1
  271. package/dist/list/validation.js +0 -12
  272. package/dist/list/validation.js.map +0 -1
  273. package/dist/providers/config.d.ts +0 -4
  274. package/dist/providers/config.d.ts.map +0 -1
  275. package/dist/providers/config.js +0 -21
  276. package/dist/providers/config.js.map +0 -1
  277. package/dist/providers/config.test.d.ts +0 -2
  278. package/dist/providers/config.test.d.ts.map +0 -1
  279. package/dist/providers/config.test.js +0 -37
  280. package/dist/providers/config.test.js.map +0 -1
  281. package/dist/proxy/constants.d.ts +0 -4
  282. package/dist/proxy/constants.d.ts.map +0 -1
  283. package/dist/proxy/constants.js +0 -5
  284. package/dist/proxy/constants.js.map +0 -1
  285. package/dist/proxy/create-function.d.ts +0 -4
  286. package/dist/proxy/create-function.d.ts.map +0 -1
  287. package/dist/proxy/create-function.js +0 -24
  288. package/dist/proxy/create-function.js.map +0 -1
  289. package/dist/proxy/create-proxy.d.ts +0 -2
  290. package/dist/proxy/create-proxy.d.ts.map +0 -1
  291. package/dist/proxy/create-proxy.js +0 -11
  292. package/dist/proxy/create-proxy.js.map +0 -1
  293. package/dist/proxy/function-generator.d.ts +0 -9
  294. package/dist/proxy/function-generator.d.ts.map +0 -1
  295. package/dist/proxy/function-generator.js +0 -29
  296. package/dist/proxy/function-generator.js.map +0 -1
  297. package/dist/proxy/index.d.ts +0 -4
  298. package/dist/proxy/index.d.ts.map +0 -1
  299. package/dist/proxy/index.js +0 -4
  300. package/dist/proxy/index.js.map +0 -1
  301. package/dist/proxy/prompt.d.ts +0 -2
  302. package/dist/proxy/prompt.d.ts.map +0 -1
  303. package/dist/proxy/prompt.js +0 -6
  304. package/dist/proxy/prompt.js.map +0 -1
  305. package/dist/proxy/types.d.ts +0 -7
  306. package/dist/proxy/types.d.ts.map +0 -1
  307. package/dist/proxy/types.js +0 -2
  308. package/dist/proxy/types.js.map +0 -1
  309. package/dist/queue/manager.d.ts +0 -5
  310. package/dist/queue/manager.d.ts.map +0 -1
  311. package/dist/queue/manager.js +0 -37
  312. package/dist/queue/manager.js.map +0 -1
  313. package/dist/queue/manager.test.d.ts +0 -2
  314. package/dist/queue/manager.test.d.ts.map +0 -1
  315. package/dist/queue/manager.test.js +0 -52
  316. package/dist/queue/manager.test.js.map +0 -1
  317. package/dist/schema-converter.d.ts +0 -4
  318. package/dist/schema-converter.d.ts.map +0 -1
  319. package/dist/schema-converter.js +0 -30
  320. package/dist/schema-converter.js.map +0 -1
  321. package/dist/stream/index.d.ts +0 -7
  322. package/dist/stream/index.d.ts.map +0 -1
  323. package/dist/stream/index.js +0 -23
  324. package/dist/stream/index.js.map +0 -1
  325. package/dist/streaming/utils.d.ts +0 -4
  326. package/dist/streaming/utils.d.ts.map +0 -1
  327. package/dist/streaming/utils.js +0 -131
  328. package/dist/streaming/utils.js.map +0 -1
  329. package/dist/streaming/utils.test.d.ts +0 -2
  330. package/dist/streaming/utils.test.d.ts.map +0 -1
  331. package/dist/streaming/utils.test.js +0 -84
  332. package/dist/streaming/utils.test.js.map +0 -1
  333. package/dist/templates/result.d.ts +0 -7
  334. package/dist/templates/result.d.ts.map +0 -1
  335. package/dist/templates/result.js +0 -40
  336. package/dist/templates/result.js.map +0 -1
  337. package/dist/templates/result.test.d.ts +0 -2
  338. package/dist/templates/result.test.d.ts.map +0 -1
  339. package/dist/templates/result.test.js +0 -75
  340. package/dist/templates/result.test.js.map +0 -1
  341. package/dist/test/setup.d.ts +0 -2
  342. package/dist/test/setup.d.ts.map +0 -1
  343. package/dist/test/setup.js +0 -21
  344. package/dist/test/setup.js.map +0 -1
  345. package/dist/test-types.d.ts +0 -13
  346. package/dist/test-types.d.ts.map +0 -1
  347. package/dist/test-types.js +0 -55
  348. package/dist/test-types.js.map +0 -1
  349. package/dist/types/index.d.ts +0 -4
  350. package/dist/types/index.d.ts.map +0 -1
  351. package/dist/types/index.js +0 -4
  352. package/dist/types/index.js.map +0 -1
  353. package/dist/types/list.d.ts +0 -10
  354. package/dist/types/list.d.ts.map +0 -1
  355. package/dist/types/list.js +0 -2
  356. package/dist/types/list.js.map +0 -1
  357. package/dist/types/model.d.ts +0 -7
  358. package/dist/types/model.d.ts.map +0 -1
  359. package/dist/types/model.js +0 -2
  360. package/dist/types/model.js.map +0 -1
  361. package/dist/types/options.d.ts +0 -25
  362. package/dist/types/options.d.ts.map +0 -1
  363. package/dist/types/options.js +0 -2
  364. package/dist/types/options.js.map +0 -1
  365. package/dist/types/schema.d.ts +0 -5
  366. package/dist/types/schema.d.ts.map +0 -1
  367. package/dist/types/schema.js +0 -2
  368. package/dist/types/schema.js.map +0 -1
  369. package/dist/utils/__tests__/request-handler.test.d.ts +0 -2
  370. package/dist/utils/__tests__/request-handler.test.d.ts.map +0 -1
  371. package/dist/utils/__tests__/request-handler.test.js +0 -134
  372. package/dist/utils/__tests__/request-handler.test.js.map +0 -1
  373. package/dist/utils/__tests__/schema.test.d.ts +0 -2
  374. package/dist/utils/__tests__/schema.test.d.ts.map +0 -1
  375. package/dist/utils/__tests__/schema.test.js +0 -49
  376. package/dist/utils/__tests__/schema.test.js.map +0 -1
  377. package/dist/utils/__tests__/stream-progress.test.d.ts +0 -2
  378. package/dist/utils/__tests__/stream-progress.test.d.ts.map +0 -1
  379. package/dist/utils/__tests__/stream-progress.test.js +0 -85
  380. package/dist/utils/__tests__/stream-progress.test.js.map +0 -1
  381. package/dist/utils/index.d.ts +0 -2
  382. package/dist/utils/index.d.ts.map +0 -1
  383. package/dist/utils/index.js +0 -2
  384. package/dist/utils/index.js.map +0 -1
  385. package/dist/utils/request-handler.d.ts +0 -17
  386. package/dist/utils/request-handler.d.ts.map +0 -1
  387. package/dist/utils/request-handler.js +0 -105
  388. package/dist/utils/request-handler.js.map +0 -1
  389. package/dist/utils/schema.d.ts +0 -11
  390. package/dist/utils/schema.d.ts.map +0 -1
  391. package/dist/utils/schema.js +0 -51
  392. package/dist/utils/schema.js.map +0 -1
  393. package/dist/utils/stream-progress.d.ts +0 -17
  394. package/dist/utils/stream-progress.d.ts.map +0 -1
  395. package/dist/utils/stream-progress.js +0 -86
  396. package/dist/utils/stream-progress.js.map +0 -1
  397. package/dist/utils/validation.d.ts +0 -3
  398. package/dist/utils/validation.d.ts.map +0 -1
  399. package/dist/utils/validation.js +0 -30
  400. package/dist/utils/validation.js.map +0 -1
@@ -0,0 +1,370 @@
1
+ #!/usr/bin/env npx tsx
2
+ /**
3
+ * Marketing Copy Eval with LLM-as-Judge ELO Ranking
4
+ *
5
+ * Generates marketing copy (title, description, hero headline/subhead, CTAs)
6
+ * and uses pairwise comparison with an LLM judge to create ELO rankings.
7
+ *
8
+ * Usage:
9
+ * npx tsx evals/marketing.eval.ts
10
+ * npx tsx evals/marketing.eval.ts --judge=opus # Use specific judge model
11
+ * npx tsx evals/marketing.eval.ts --judge=haiku # Test cheap judge
12
+ * npx tsx evals/marketing.eval.ts --judge=flash # Test fast judge
13
+ * npx tsx evals/marketing.eval.ts --all # Run all tiers
14
+ * npx tsx evals/marketing.eval.ts --all --judge=haiku # All tiers + cheap judge
15
+ */
16
+
17
+ // Load .env from project root
18
+ import { config } from 'dotenv'
19
+ import { resolve } from 'path'
20
+ config({ path: resolve(import.meta.dirname, '../../../.env') })
21
+
22
+ import { generateObject } from '../src/generate.js'
23
+ import { schema } from '../src/schema.js'
24
+ import { EVAL_MODELS, type EvalModel, type ModelTier } from '../src/eval/models.js'
25
+
26
// Parse CLI args
// Supported flags (see the usage notes in the file header):
//   --judge=<model>  model used as the pairwise judge (default: 'sonnet')
//   --all            evaluate every tier instead of only 'fast'
const args = process.argv.slice(2)
const judgeArg = args.find(a => a.startsWith('--judge='))
// Slice off the prefix instead of split('=')[1] so a value that itself contains
// '=' is kept whole. `||` (not `??`) is deliberate: a bare `--judge=` yields ''
// and must fall back to the default rather than be passed on as the model id.
const JUDGE_MODEL = judgeArg?.slice('--judge='.length) || 'sonnet'
const runAll = args.includes('--all')

const tiers: ModelTier[] = runAll ? ['best', 'fast', 'cheap'] : ['fast']
33
+
34
// Hints for the hero section, one per field, in the order they are handed to schema().
const heroCopyHints = {
  headline: 'Hero headline (5-8 words, compelling)',
  subhead: 'Supporting subheadline (10-20 words)',
  primaryCTA: 'Primary call-to-action button text (2-4 words)',
  secondaryCTA: 'Secondary call-to-action link text (3-6 words)',
}

// Marketing copy schema: the structure every evaluated model is asked to fill in.
// Keys match the MarketingCopy interface; values are the per-field hint strings.
const marketingCopySchema = schema({
  title: 'Product/page title (5-10 words)',
  description: 'Meta description for SEO (150-160 characters)',
  hero: heroCopyHints,
})
45
+
46
// Test cases - one product/service brief per scenario.
// `name` labels the scenario in logs and in the judge prompt; `prompt` is the
// brief sent to each model being evaluated.
const TEST_CASES = [
  // B2B software
  {
    name: 'SaaS Analytics Platform',
    prompt: 'Create marketing copy for a B2B SaaS analytics platform called "InsightFlow" that helps companies understand their customer behavior with AI-powered insights. Target audience: Product managers and growth teams at mid-size tech companies.',
  },
  // Consumer retail
  {
    name: 'E-commerce Fashion Brand',
    prompt: 'Create marketing copy for a sustainable fashion e-commerce brand called "EcoThread" that sells organic, ethically-made clothing. Target audience: Environmentally conscious millennials aged 25-35.',
  },
  // Technical audience
  {
    name: 'Developer Tool',
    prompt: 'Create marketing copy for a CLI tool called "DeployFast" that simplifies Kubernetes deployments with one-command deploys. Target audience: DevOps engineers and backend developers.',
  },
  // Consumer mobile
  {
    name: 'Mobile Fitness App',
    prompt: 'Create marketing copy for a fitness app called "FitPulse" that uses AI to create personalized workout plans and tracks progress with smart watch integration. Target audience: Busy professionals aged 30-45.',
  },
]
65
+
66
/**
 * One piece of generated marketing copy.
 * Field names mirror the keys of `marketingCopySchema`.
 */
interface MarketingCopy {
  /** Product/page title. */
  title: string
  /** Meta description for SEO. */
  description: string
  /** Hero section of the page. */
  hero: {
    /** Main hero headline. */
    headline: string
    /** Supporting subheadline. */
    subhead: string
    /** Text of the primary call-to-action button. */
    primaryCTA: string
    /** Text of the secondary call-to-action link. */
    secondaryCTA: string
  }
}
76
+
77
/** Result of one generation run: a single model answering a single test case. */
interface GeneratedCopy {
  /** Model that produced the copy. */
  model: EvalModel
  /** Scenario the copy was written for (an element of TEST_CASES). */
  testCase: (typeof TEST_CASES)[number]
  /** The structured copy returned by the model. */
  copy: MarketingCopy
  /** Wall-clock duration of the generation call, in milliseconds. */
  latencyMs: number
}
83
+
84
/** Running tournament record for one model. */
interface ELORating {
  /** Model identifier; used as the key in the ratings map. */
  modelId: string
  /** Human-readable model name shown in the results table. */
  modelName: string
  /** Current ELO rating (starts at INITIAL_ELO, may be fractional). */
  rating: number
  /** Number of pairwise comparisons this model won. */
  wins: number
  /** Number of pairwise comparisons this model lost. */
  losses: number
  /** Number of comparisons that ended in a tie. */
  draws: number
}
92
+
93
// ELO calculation
/** Default K-factor: scales how far one comparison can move a rating. */
const K_FACTOR = 32
/** Rating assigned to every model before any comparison has been played. */
const INITIAL_ELO = 1500

/**
 * Compute the rating adjustments for a single pairwise result.
 *
 * @param ratingA - Current rating of contestant A.
 * @param ratingB - Current rating of contestant B.
 * @param scoreA - Outcome from A's point of view: 1 = A won, 0 = B won, 0.5 = tie.
 * @param kFactor - Scale of the adjustment; defaults to K_FACTOR so existing
 *   three-argument callers behave exactly as before.
 * @returns The amounts to add to A's and B's ratings respectively.
 */
function calculateEloChange(
  ratingA: number,
  ratingB: number,
  scoreA: number,
  kFactor: number = K_FACTOR
): { deltaA: number; deltaB: number } {
  // Expected score of A on the logistic curve with a 400-point scale;
  // B's expectation is the complement.
  const expectedA = 1 / (1 + Math.pow(10, (ratingB - ratingA) / 400))
  const expectedB = 1 - expectedA

  // Each side moves by kFactor * (actual score - expected score).
  const deltaA = kFactor * (scoreA - expectedA)
  const deltaB = kFactor * ((1 - scoreA) - expectedB)

  return { deltaA, deltaB }
}
106
+
107
// LLM Judge for pairwise comparison
/**
 * Ask the judge model which of two marketing copies is better for a scenario.
 *
 * @param copyA - Copy presented to the judge as "OPTION A".
 * @param copyB - Copy presented to the judge as "OPTION B".
 * @param testCase - Scenario both copies were written for; its name and prompt
 *   are included in the judge prompt as context.
 * @param judgeModel - Model id passed to generateObject for the judgment.
 * @returns 'A', 'B' or 'TIE'. Any other verdict, and any error raised while
 *   calling the judge, is reported as 'TIE' (errors are logged to stderr).
 */
async function judgePair(
  copyA: MarketingCopy,
  copyB: MarketingCopy,
  testCase: typeof TEST_CASES[0],
  judgeModel: string
): Promise<'A' | 'B' | 'TIE'> {
  // Renders one candidate's section of the judge prompt.
  const renderOption = (label: string, copy: MarketingCopy): string =>
    `=== OPTION ${label} ===
Title: ${copy.title}
Description: ${copy.description}
Hero Headline: ${copy.hero.headline}
Hero Subhead: ${copy.hero.subhead}
Primary CTA: ${copy.hero.primaryCTA}
Secondary CTA: ${copy.hero.secondaryCTA}`

  const prompt = `You are an expert marketing copywriter and brand strategist. Compare these two marketing copy options for: ${testCase.name}

Context: ${testCase.prompt}

${renderOption('A', copyA)}

${renderOption('B', copyB)}

Evaluate based on:
1. Clarity and impact of messaging
2. Target audience alignment
3. Emotional appeal and persuasiveness
4. CTA effectiveness
5. Overall brand voice consistency

Which option is better? Answer A, B, or TIE if they're roughly equal.`

  let verdict: string
  try {
    const response = await generateObject({
      model: judgeModel,
      schema: schema({
        reasoning: 'Brief explanation of your judgment (2-3 sentences)',
        winner: 'A | B | TIE',
      }),
      prompt,
      temperature: 0.3,
    })

    const judgment = response.object as { reasoning: string; winner: string }
    // Normalise case and surrounding whitespace before matching.
    verdict = judgment.winner.toUpperCase().trim()
  } catch (err) {
    // A failed judge call is logged and counted as a tie.
    console.error(` Judge error: ${err}`)
    return 'TIE'
  }

  switch (verdict) {
    case 'A':
    case 'B':
    case 'TIE':
      return verdict
    default:
      // Anything unrecognised is treated as a tie.
      return 'TIE'
  }
}
166
+
167
// Generate marketing copy for a model
/**
 * Run one model against one test case and time the call.
 *
 * @param model - Model under evaluation; its `id` is passed to generateObject.
 * @param testCase - Scenario whose `prompt` is sent to the model.
 * @returns The generated copy together with the model, the test case and the
 *   elapsed wall-clock time in milliseconds.
 *   Errors from generateObject are not caught here and propagate to the caller.
 */
async function generateCopy(model: EvalModel, testCase: typeof TEST_CASES[0]): Promise<GeneratedCopy> {
  const startedAt = Date.now()

  const response = await generateObject({
    model: model.id,
    schema: marketingCopySchema,
    prompt: testCase.prompt,
    temperature: 0.7,
  })
  const latencyMs = Date.now() - startedAt

  return { model, testCase, copy: response.object as MarketingCopy, latencyMs }
}
185
+
186
// Run pairwise comparisons and calculate ELO
/**
 * Judge every pair of copies written for the same test case and derive an
 * ELO ranking of the models that produced them.
 *
 * Ratings are updated after each comparison, so the final numbers depend on
 * the order in which pairs are judged (test-case order, then model order).
 *
 * @param copies - All generated copies, across models and test cases.
 * @param judgeModel - Model id used for every pairwise judgment.
 * @returns One rating record per model, sorted by rating, highest first.
 */
async function runEloTournament(
  copies: GeneratedCopy[],
  judgeModel: string
): Promise<ELORating[]> {
  // Initialize ELO ratings: one record per distinct model id.
  const ratings: Map<string, ELORating> = new Map()

  for (const copy of copies) {
    if (!ratings.has(copy.model.id)) {
      ratings.set(copy.model.id, {
        modelId: copy.model.id,
        modelName: copy.model.name,
        rating: INITIAL_ELO,
        wins: 0,
        losses: 0,
        draws: 0,
      })
    }
  }

  // Group copies by test case; only copies for the same scenario are compared.
  const byTestCase = new Map<string, GeneratedCopy[]>()
  for (const copy of copies) {
    const key = copy.testCase.name
    if (!byTestCase.has(key)) {
      byTestCase.set(key, [])
    }
    byTestCase.get(key)!.push(copy)
  }

  // Report the judge actually used for this tournament (the parameter), not
  // the module-level CLI default, so the log stays truthful for any caller.
  console.log(`\n⚖️ Running pairwise comparisons with ${judgeModel} as judge...\n`)

  let comparisonCount = 0
  // n * (n - 1) / 2 unordered pairs per test case.
  const totalComparisons = Array.from(byTestCase.values()).reduce(
    (sum, group) => sum + (group.length * (group.length - 1)) / 2,
    0
  )

  // Run pairwise comparisons within each test case
  for (const [testCaseName, testCaseCopies] of byTestCase) {
    console.log(` 📝 ${testCaseName}:`)

    for (let i = 0; i < testCaseCopies.length; i++) {
      for (let j = i + 1; j < testCaseCopies.length; j++) {
        const copyA = testCaseCopies[i]
        const copyB = testCaseCopies[j]

        comparisonCount++
        process.stdout.write(` ${comparisonCount}/${totalComparisons} ${copyA.model.name} vs ${copyB.model.name}... `)

        const winner = await judgePair(copyA.copy, copyB.copy, copyA.testCase, judgeModel)

        const ratingA = ratings.get(copyA.model.id)!
        const ratingB = ratings.get(copyB.model.id)!

        // scoreA is the result from A's perspective: 1 win, 0 loss, 0.5 tie.
        let scoreA: number
        if (winner === 'A') {
          scoreA = 1
          ratingA.wins++
          ratingB.losses++
          console.log(`${copyA.model.name} wins`)
        } else if (winner === 'B') {
          scoreA = 0
          ratingA.losses++
          ratingB.wins++
          console.log(`${copyB.model.name} wins`)
        } else {
          scoreA = 0.5
          ratingA.draws++
          ratingB.draws++
          console.log(`TIE`)
        }

        const { deltaA, deltaB } = calculateEloChange(ratingA.rating, ratingB.rating, scoreA)
        ratingA.rating += deltaA
        ratingB.rating += deltaB
      }
    }
  }

  // Sort by ELO rating, best first
  return Array.from(ratings.values()).sort((a, b) => b.rating - a.rating)
}
270
+
271
// Main
/**
 * Entry point of the eval: generate copy with every selected model for every
 * test case, rank the models with an LLM-judged ELO tournament, then print the
 * ranking table, timings and sample outputs from the top three models.
 */
async function main() {
  const BANNER_TOP = '╔════════════════════════════════════════════════════════════════╗'
  const BANNER_BOTTOM = '╚════════════════════════════════════════════════════════════════╝'

  // Prints a boxed three-line heading.
  const printBanner = (title: string): void => {
    console.log(BANNER_TOP)
    console.log(`║ ${title} ║`)
    console.log(BANNER_BOTTOM)
  }

  // Formats a millisecond duration as seconds with one decimal place.
  const toSeconds = (ms: number): string => (ms / 1000).toFixed(1)

  printBanner('Marketing Copy Eval (LLM-as-Judge)')
  console.log('')
  console.log(`Judge Model: ${JUDGE_MODEL}`)
  console.log(`Tiers: ${tiers.join(', ')}`)

  // Get models to test: every eval model whose tier was selected on the CLI.
  const models = EVAL_MODELS.filter(m => tiers.includes(m.tier))
  console.log(`Models: ${models.map(m => m.name).join(', ')}`)
  console.log(`Test Cases: ${TEST_CASES.length}`)
  console.log('')

  // Generate copy from each model for each test case
  console.log('🎨 Generating marketing copy...\n')

  const allCopies: GeneratedCopy[] = []
  const startTime = Date.now()

  // Runs one model on one test case; a failure is logged and reported as null
  // so a single failing model does not abort the whole run.
  const attempt = async (model: EvalModel, testCase: typeof TEST_CASES[0]): Promise<GeneratedCopy | null> => {
    try {
      const generated = await generateCopy(model, testCase)
      console.log(` ✓ ${model.name} (${generated.latencyMs}ms)`)
      return generated
    } catch (err) {
      console.log(` ✗ ${model.name}: ${err}`)
      return null
    }
  }

  // Test cases run one after another; the models within a test case run concurrently.
  for (const testCase of TEST_CASES) {
    console.log(` 📦 ${testCase.name}:`)

    const outcomes = await Promise.all(models.map(model => attempt(model, testCase)))
    for (const outcome of outcomes) {
      if (outcome !== null) {
        allCopies.push(outcome)
      }
    }
  }

  const generateTime = Date.now() - startTime
  console.log(`\n Generated ${allCopies.length} copies in ${toSeconds(generateTime)}s`)

  // Run ELO tournament
  const tournamentStart = Date.now()
  const eloRatings = await runEloTournament(allCopies, JUDGE_MODEL)
  const tournamentTime = Date.now() - tournamentStart

  // Display results
  console.log('')
  printBanner('ELO Rankings')
  console.log('')
  console.log(' Rank | Model | ELO | W | L | D |')
  console.log(' -----|------------------------|--------|-----|-----|-----|')

  for (const [idx, entry] of eloRatings.entries()) {
    const rank = String(idx + 1).padStart(4)
    const name = entry.modelName.padEnd(22)
    const elo = String(Math.round(entry.rating)).padStart(6)
    const wins = String(entry.wins).padStart(3)
    const losses = String(entry.losses).padStart(3)
    const draws = String(entry.draws).padStart(3)
    console.log(` ${rank} | ${name} | ${elo} | ${wins} | ${losses} | ${draws} |`)
  }

  console.log('')
  console.log(` Judge: ${JUDGE_MODEL}`)
  console.log(` Generation Time: ${toSeconds(generateTime)}s`)
  console.log(` Tournament Time: ${toSeconds(tournamentTime)}s`)
  console.log(` Total Time: ${toSeconds(generateTime + tournamentTime)}s`)

  // Show sample outputs from top 3
  console.log('')
  printBanner('Sample Outputs (Top 3)')

  // Samples are always taken from the first test case.
  const sampleTestCase = TEST_CASES[0]

  // Model ids are unique in eloRatings, so position + 1 is the model's rank.
  for (const [position, entry] of eloRatings.slice(0, 3).entries()) {
    const sample = allCopies.find(c => c.model.id === entry.modelId && c.testCase.name === sampleTestCase.name)
    if (!sample) {
      // The model produced no copy for the sample test case (generation failed).
      continue
    }

    const { title, description, hero } = sample.copy
    console.log(`\n #${position + 1} ${sample.model.name} (${sampleTestCase.name}):`)
    console.log(` ─────────────────────────────────────────`)
    console.log(` Title: ${title}`)
    console.log(` Description: ${description}`)
    console.log(` Headline: ${hero.headline}`)
    console.log(` Subhead: ${hero.subhead}`)
    console.log(` Primary CTA: [${hero.primaryCTA}]`)
    console.log(` Secondary CTA: ${hero.secondaryCTA}`)
  }

  console.log('')
}
369
+
370
// Run the eval. On a fatal error, log it and mark the process as failed so
// shells and CI see a non-zero exit status instead of a silent success.
main().catch(err => {
  console.error(err)
  process.exitCode = 1
})
@@ -0,0 +1,94 @@
1
+ /**
2
+ * Math Eval
3
+ *
4
+ * Tests model mathematical reasoning from simple arithmetic
5
+ * to word problems.
6
+ */
7
+
8
+ import { evalite } from 'evalite'
9
+ import { generateObject } from '../src/generate.js'
10
+ import { schema } from '../src/schema.js'
11
+ import { createModelVariants, type EvalModel } from '../src/eval/models.js'
12
+
13
/**
 * Math problems used by this eval, ordered from easy to hard.
 *
 * Each entry carries the prompt text (`problem`), the numeric answer the
 * model is expected to produce (`expected`) and a `difficulty` label.
 */
const TEST_CASES = [
  // --- Arithmetic ---
  {
    problem: 'What is 15 + 27?',
    expected: 42,
    difficulty: 'easy',
  },
  {
    problem: 'What is 144 / 12?',
    expected: 12,
    difficulty: 'easy',
  },
  {
    problem: 'What is 7 * 8?',
    expected: 56,
    difficulty: 'easy',
  },

  // --- Word problems ---
  {
    problem: 'A store sells 45 apples at $2 each. What is the total revenue?',
    expected: 90,
    difficulty: 'medium',
  },
  {
    problem: 'A train travels 240 miles in 4 hours. What is the average speed in mph?',
    expected: 60,
    difficulty: 'medium',
  },

  // --- Multi-step ---
  {
    problem: 'A company has 120 employees. 40% work in engineering, and 25% of engineers are senior. How many senior engineers?',
    expected: 12,
    difficulty: 'hard',
  },
]
27
+
28
// Variants are built from the 'fast' tier only.
const modelVariants = createModelVariants({ tiers: ['fast'] })

/**
 * True when `actual` is within 0.01 of `target`.
 * The small tolerance absorbs floating point noise in model answers.
 */
const matchesExpected = (actual: unknown, target: unknown): boolean =>
  Math.abs((actual as number) - (target as number)) < 0.01

evalite.each(modelVariants)('Math', {
  // One eval row per test case; the whole test case object is the task input.
  data: TEST_CASES.map((testCase) => ({
    input: testCase,
    expected: testCase.expected,
  })),

  // Ask the variant's model for a structured answer and time the call.
  task: async (input, variant) => {
    const { id: modelId, name: modelName } = variant as EvalModel
    const startedAt = Date.now()

    const generation = await generateObject({
      model: modelId,
      schema: schema({
        answer: 'The numeric answer (number)',
        reasoning: 'Step by step reasoning',
      }),
      prompt: `Solve this math problem:\n\n${input.problem}`,
    })

    // Measured immediately after the call resolves.
    const latencyMs = Date.now() - startedAt
    const { answer, reasoning } = generation.object

    return {
      answer,
      reasoning,
      expected: input.expected,
      problem: input.problem,
      difficulty: input.difficulty,
      modelId,
      modelName,
      latencyMs,
      usage: generation.usage,
    }
  },

  scorers: [
    // 1 when the numeric answer matches the expected value, otherwise 0.
    {
      name: 'Correct Answer',
      description: 'Whether the numeric answer is correct',
      scorer: ({ output, expected }) => ({
        score: matchesExpected(output.answer, expected) ? 1 : 0,
      }),
    },

    // Length-based proxy for how much reasoning was provided:
    // missing or under 20 chars -> 0.2, over 50 chars -> 1, otherwise 0.6.
    {
      name: 'Shows Work',
      description: 'Whether model explains reasoning',
      scorer: ({ output }) => {
        const reasoning = output.reasoning as string
        const length = reasoning ? reasoning.length : 0

        let score = 0.6
        if (length < 20) {
          score = 0.2
        } else if (length > 50) {
          score = 1
        }
        return { score }
      },
    },
  ],

  // Extra result-table columns; 'Correct' uses the same tolerance as the scorer.
  columns: ({ output, expected }) => {
    const verdict = matchesExpected(output.answer, expected) ? 'Yes' : 'No'
    return [
      { label: 'Model', value: output.modelName },
      { label: 'Difficulty', value: output.difficulty },
      { label: 'Expected', value: expected },
      { label: 'Got', value: output.answer },
      { label: 'Correct', value: verdict },
    ]
  },
})
@@ -0,0 +1,166 @@
1
+ #!/usr/bin/env npx tsx
2
+ /**
3
+ * Run AI Functions Eval Suite
4
+ *
5
+ * Usage:
6
+ * npx tsx evals/run-evals.ts [--fast] [--all] [--math] [--class]
7
+ *
8
+ * Options:
9
+ * --fast Only run fast-tier models (default)
10
+ * --all Run all models
11
+ * --math Run only math eval
12
+ * --class Run only classification eval
13
+ */
14
+
15
+ import { runEval, generateObject, generateText, schema } from '../src/eval/runner.js'
16
+ import type { EvalModel, ModelTier } from '../src/eval/models.js'
17
+
18
// Command-line flags: everything after the node binary and the script path.
const args = process.argv.slice(2)

/** Whether the given flag was passed on the command line. */
const hasFlag = (flag: string): boolean => args.includes(flag)

const runAll = hasFlag('--all')
const runMath = hasFlag('--math')
const runClass = hasFlag('--class')
// True when the user asked for at least one specific eval.
const runSingle = runMath || runClass

// --all widens the run to every tier; otherwise only the fast tier is used.
const tiers: ModelTier[] = runAll ? ['best', 'fast', 'cheap'] : ['fast']

// Startup banner followed by the selected tiers.
const bannerLines = [
  '╔════════════════════════════════════════════════════════════════╗',
  '║ AI Functions Eval Suite ║',
  '╚════════════════════════════════════════════════════════════════╝',
  '',
  `Tiers: ${tiers.join(', ')}`,
]
for (const line of bannerLines) {
  console.log(line)
}
32
+
33
/**
 * Math eval.
 *
 * Runs a handful of arithmetic and word problems through `runEval` for the
 * selected tiers. Each answer is scored on numeric correctness and on how
 * much reasoning text accompanies it.
 *
 * @returns whatever `runEval` resolves to for this eval
 */
async function runMathEval() {
  // Small factory so the case table below stays readable.
  const mathCase = (name: string, problem: string, expected: number) => ({
    name,
    input: { problem },
    expected,
  })

  const cases = [
    mathCase('Simple addition', 'What is 15 + 27?', 42),
    mathCase('Division', 'What is 144 / 12?', 12),
    mathCase('Multiplication', 'What is 7 * 8?', 56),
    mathCase('Word problem', 'A store sells 45 apples at $2 each. What is the total revenue?', 90),
    mathCase(
      'Multi-step',
      'A company has 120 employees. 40% work in engineering, and 25% of engineers are senior. How many senior engineers?',
      12,
    ),
  ]

  return runEval({
    name: 'Math',
    cases,
    tiers,
    // Ask the model for a structured { answer, reasoning } object.
    task: async (input, model) => {
      const generation = await generateObject({
        model: model.id,
        schema: schema({
          answer: 'The numeric answer (number)',
          reasoning: 'Step by step reasoning',
        }),
        prompt: `Solve this math problem:\n\n${input.problem}`,
      })
      return generation.object
    },
    scorers: [
      {
        name: 'Correct Answer',
        description: 'Whether the numeric answer is correct',
        // 1 when the answer is within 0.01 of the expected value, else 0.
        scorer: ({ output, expected }) => {
          const { answer } = output as { answer: number }
          const delta = Math.abs(answer - (expected as number))
          return delta < 0.01 ? 1 : 0
        },
      },
      {
        name: 'Shows Work',
        description: 'Whether model explains reasoning',
        // Length-based proxy: missing or under 20 chars -> 0.2, over 50 -> 1, else 0.6.
        scorer: ({ output }) => {
          const { reasoning } = output as { reasoning: string }
          const length = reasoning ? reasoning.length : 0

          if (length < 20) {
            return 0.2
          }
          return length > 50 ? 1 : 0.6
        },
      },
    ],
  })
}
81
+
82
/**
 * Classification eval.
 *
 * Covers sentiment labelling and support-ticket routing. Each case supplies
 * the text plus the allowed labels; the model's pick is scored for exact
 * correctness and for being one of the allowed labels at all.
 *
 * @returns whatever `runEval` resolves to for this eval
 */
async function runClassificationEval() {
  // Fresh arrays per case, so no two cases share an options instance.
  const sentimentLabels = () => ['positive', 'negative', 'neutral']
  const ticketLabels = () => ['account', 'billing', 'technical', 'shipping']

  const classificationCase = (name: string, text: string, options: string[], expected: string) => ({
    name,
    input: { text, options },
    expected,
  })

  const cases = [
    classificationCase('Positive sentiment', 'This product exceeded my expectations!', sentimentLabels(), 'positive'),
    classificationCase('Negative sentiment', 'The delivery was late and packaging damaged.', sentimentLabels(), 'negative'),
    classificationCase('Neutral sentiment', 'The product arrived as described.', sentimentLabels(), 'neutral'),
    classificationCase('Account ticket', 'I need to reset my password', ticketLabels(), 'account'),
    classificationCase('Billing ticket', 'When will my refund be processed?', ticketLabels(), 'billing'),
    classificationCase('Technical ticket', 'The app crashes when uploading images', ticketLabels(), 'technical'),
  ]

  return runEval({
    name: 'Classification',
    cases,
    tiers,
    // The allowed labels are passed both as the schema's category string and in the prompt.
    task: async (input, model) => {
      const enumStr = input.options.join(' | ')
      const generation = await generateObject({
        model: model.id,
        schema: schema({
          category: enumStr,
          confidence: 'Confidence 0-1 (number)',
        }),
        prompt: `Classify this text into one of: ${input.options.join(', ')}\n\nText: "${input.text}"`,
      })
      return generation.object
    },
    scorers: [
      {
        name: 'Accuracy',
        description: 'Whether classification is correct',
        // Strict equality against the expected label.
        scorer: ({ output, expected }) => {
          const { category } = output as { category: string }
          if (category === expected) {
            return 1
          }
          return 0
        },
      },
      {
        name: 'Valid Category',
        description: 'Whether output is a valid option',
        // Checks that the prediction is one of the labels offered for this case.
        scorer: ({ input, output }) => {
          const { category } = output as { category: string }
          const { options } = input as { options: string[] }
          if (options.includes(category)) {
            return 1
          }
          return 0
        },
      },
    ],
  })
}
130
+
131
/**
 * Run the selected evals one after another and print an overall summary.
 *
 * With neither --math nor --class, both evals run; otherwise only the
 * requested ones do. The summary prints each eval's average score, then the
 * mean of those averages, the summed cost and the summed time.
 *
 * Rejects if any eval rejects; the caller below handles that.
 */
async function main() {
  const results = []

  if (!runSingle || runMath) {
    results.push(await runMathEval())
  }

  if (!runSingle || runClass) {
    results.push(await runClassificationEval())
  }

  // Overall summary
  console.log('')
  console.log('╔════════════════════════════════════════════════════════════════╗')
  console.log('║ Summary ║')
  console.log('╚════════════════════════════════════════════════════════════════╝')

  let totalScore = 0
  let totalCost = 0
  let totalTime = 0

  for (const result of results) {
    console.log(`\n${result.name}: ${(result.avgScore * 100).toFixed(1)}%`)
    totalScore += result.avgScore
    totalCost += result.totalCost
    totalTime += result.totalTime
  }

  console.log('')
  // Unweighted mean of the per-eval average scores.
  console.log(`Overall: ${((totalScore / results.length) * 100).toFixed(1)}%`)
  console.log(`Total Cost: $${totalCost.toFixed(4)}`)
  // totalTime is divided by 1000 for display in seconds.
  console.log(`Total Time: ${(totalTime / 1000).toFixed(1)}s`)
}

// Entry point. A rejected run is still logged exactly as before, but the
// process is now also flagged with a non-zero exit status so CI jobs and
// shell callers can tell that the eval suite failed.
main().catch((error) => {
  console.error(error)
  process.exitCode = 1
})