ai-functions 0.2.19 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (227) hide show
  1. package/.turbo/turbo-build.log +5 -0
  2. package/.turbo/turbo-test.log +105 -0
  3. package/README.md +232 -37
  4. package/TODO.md +138 -0
  5. package/dist/ai-promise.d.ts +219 -0
  6. package/dist/ai-promise.d.ts.map +1 -0
  7. package/dist/ai-promise.js +610 -0
  8. package/dist/ai-promise.js.map +1 -0
  9. package/dist/ai.d.ts +285 -0
  10. package/dist/ai.d.ts.map +1 -0
  11. package/dist/ai.js +842 -0
  12. package/dist/ai.js.map +1 -0
  13. package/dist/batch/anthropic.d.ts +23 -0
  14. package/dist/batch/anthropic.d.ts.map +1 -0
  15. package/dist/batch/anthropic.js +257 -0
  16. package/dist/batch/anthropic.js.map +1 -0
  17. package/dist/batch/bedrock.d.ts +64 -0
  18. package/dist/batch/bedrock.d.ts.map +1 -0
  19. package/dist/batch/bedrock.js +586 -0
  20. package/dist/batch/bedrock.js.map +1 -0
  21. package/dist/batch/cloudflare.d.ts +37 -0
  22. package/dist/batch/cloudflare.d.ts.map +1 -0
  23. package/dist/batch/cloudflare.js +289 -0
  24. package/dist/batch/cloudflare.js.map +1 -0
  25. package/dist/batch/google.d.ts +41 -0
  26. package/dist/batch/google.d.ts.map +1 -0
  27. package/dist/batch/google.js +360 -0
  28. package/dist/batch/google.js.map +1 -0
  29. package/dist/batch/index.d.ts +31 -0
  30. package/dist/batch/index.d.ts.map +1 -0
  31. package/dist/batch/index.js +31 -0
  32. package/dist/batch/index.js.map +1 -0
  33. package/dist/batch/memory.d.ts +44 -0
  34. package/dist/batch/memory.d.ts.map +1 -0
  35. package/dist/batch/memory.js +188 -0
  36. package/dist/batch/memory.js.map +1 -0
  37. package/dist/batch/openai.d.ts +37 -0
  38. package/dist/batch/openai.d.ts.map +1 -0
  39. package/dist/batch/openai.js +403 -0
  40. package/dist/batch/openai.js.map +1 -0
  41. package/dist/batch-map.d.ts +125 -0
  42. package/dist/batch-map.d.ts.map +1 -0
  43. package/dist/batch-map.js +406 -0
  44. package/dist/batch-map.js.map +1 -0
  45. package/dist/batch-queue.d.ts +273 -0
  46. package/dist/batch-queue.d.ts.map +1 -0
  47. package/dist/batch-queue.js +271 -0
  48. package/dist/batch-queue.js.map +1 -0
  49. package/dist/context.d.ts +133 -0
  50. package/dist/context.d.ts.map +1 -0
  51. package/dist/context.js +267 -0
  52. package/dist/context.js.map +1 -0
  53. package/dist/embeddings.d.ts +123 -0
  54. package/dist/embeddings.d.ts.map +1 -0
  55. package/dist/embeddings.js +170 -0
  56. package/dist/embeddings.js.map +1 -0
  57. package/dist/eval/index.d.ts +8 -0
  58. package/dist/eval/index.d.ts.map +1 -0
  59. package/dist/eval/index.js +8 -0
  60. package/dist/eval/index.js.map +1 -0
  61. package/dist/eval/models.d.ts +66 -0
  62. package/dist/eval/models.d.ts.map +1 -0
  63. package/dist/eval/models.js +120 -0
  64. package/dist/eval/models.js.map +1 -0
  65. package/dist/eval/runner.d.ts +64 -0
  66. package/dist/eval/runner.d.ts.map +1 -0
  67. package/dist/eval/runner.js +148 -0
  68. package/dist/eval/runner.js.map +1 -0
  69. package/dist/generate.d.ts +168 -0
  70. package/dist/generate.d.ts.map +1 -0
  71. package/dist/generate.js +174 -0
  72. package/dist/generate.js.map +1 -0
  73. package/dist/index.d.ts +30 -0
  74. package/dist/index.d.ts.map +1 -0
  75. package/dist/index.js +54 -0
  76. package/dist/index.js.map +1 -0
  77. package/dist/primitives.d.ts +292 -0
  78. package/dist/primitives.d.ts.map +1 -0
  79. package/dist/primitives.js +471 -0
  80. package/dist/primitives.js.map +1 -0
  81. package/dist/providers/cloudflare.d.ts +9 -0
  82. package/dist/providers/cloudflare.d.ts.map +1 -0
  83. package/dist/providers/cloudflare.js +9 -0
  84. package/dist/providers/cloudflare.js.map +1 -0
  85. package/dist/providers/index.d.ts +9 -0
  86. package/dist/providers/index.d.ts.map +1 -0
  87. package/dist/providers/index.js +9 -0
  88. package/dist/providers/index.js.map +1 -0
  89. package/dist/schema.d.ts +54 -0
  90. package/dist/schema.d.ts.map +1 -0
  91. package/dist/schema.js +109 -0
  92. package/dist/schema.js.map +1 -0
  93. package/dist/template.d.ts +73 -0
  94. package/dist/template.d.ts.map +1 -0
  95. package/dist/template.js +129 -0
  96. package/dist/template.js.map +1 -0
  97. package/dist/types.d.ts +481 -0
  98. package/dist/types.d.ts.map +1 -0
  99. package/dist/types.js +5 -0
  100. package/dist/types.js.map +1 -0
  101. package/evalite.config.ts +19 -0
  102. package/evals/README.md +212 -0
  103. package/evals/classification.eval.ts +108 -0
  104. package/evals/marketing.eval.ts +370 -0
  105. package/evals/math.eval.ts +94 -0
  106. package/evals/run-evals.ts +166 -0
  107. package/evals/structured-output.eval.ts +143 -0
  108. package/evals/writing.eval.ts +117 -0
  109. package/examples/batch-blog-posts.ts +160 -0
  110. package/package.json +59 -43
  111. package/src/ai-promise.ts +784 -0
  112. package/src/ai.ts +1183 -0
  113. package/src/batch/anthropic.ts +375 -0
  114. package/src/batch/bedrock.ts +801 -0
  115. package/src/batch/cloudflare.ts +421 -0
  116. package/src/batch/google.ts +491 -0
  117. package/src/batch/index.ts +31 -0
  118. package/src/batch/memory.ts +253 -0
  119. package/src/batch/openai.ts +557 -0
  120. package/src/batch-map.ts +534 -0
  121. package/src/batch-queue.ts +493 -0
  122. package/src/context.ts +332 -0
  123. package/src/embeddings.ts +244 -0
  124. package/src/eval/index.ts +8 -0
  125. package/src/eval/models.ts +158 -0
  126. package/src/eval/runner.ts +217 -0
  127. package/src/generate.ts +245 -0
  128. package/src/index.ts +154 -0
  129. package/src/primitives.ts +612 -0
  130. package/src/providers/cloudflare.ts +15 -0
  131. package/src/providers/index.ts +14 -0
  132. package/src/schema.ts +147 -0
  133. package/src/template.ts +209 -0
  134. package/src/types.ts +540 -0
  135. package/test/README.md +105 -0
  136. package/test/ai-proxy.test.ts +192 -0
  137. package/test/async-iterators.test.ts +327 -0
  138. package/test/batch-background.test.ts +482 -0
  139. package/test/batch-blog-posts.test.ts +387 -0
  140. package/test/blog-generation.test.ts +510 -0
  141. package/test/browse-read.test.ts +611 -0
  142. package/test/core-functions.test.ts +694 -0
  143. package/test/decide.test.ts +393 -0
  144. package/test/define.test.ts +274 -0
  145. package/test/e2e-bedrock-manual.ts +163 -0
  146. package/test/e2e-bedrock.test.ts +191 -0
  147. package/test/e2e-flex-gateway.ts +157 -0
  148. package/test/e2e-flex-manual.ts +183 -0
  149. package/test/e2e-flex.test.ts +209 -0
  150. package/test/e2e-google-manual.ts +178 -0
  151. package/test/e2e-google.test.ts +216 -0
  152. package/test/embeddings.test.ts +284 -0
  153. package/test/evals/define-function.eval.test.ts +379 -0
  154. package/test/evals/primitives.eval.test.ts +384 -0
  155. package/test/function-types.test.ts +492 -0
  156. package/test/generate-core.test.ts +319 -0
  157. package/test/generate.test.ts +163 -0
  158. package/test/implicit-batch.test.ts +422 -0
  159. package/test/schema.test.ts +109 -0
  160. package/test/tagged-templates.test.ts +302 -0
  161. package/tsconfig.json +8 -6
  162. package/vitest.config.ts +42 -0
  163. package/LICENSE +0 -21
  164. package/db/cache.ts +0 -6
  165. package/db/mongo.ts +0 -75
  166. package/dist/mjs/db/cache.d.ts +0 -1
  167. package/dist/mjs/db/cache.js +0 -5
  168. package/dist/mjs/db/mongo.d.ts +0 -31
  169. package/dist/mjs/db/mongo.js +0 -48
  170. package/dist/mjs/examples/data.d.ts +0 -1105
  171. package/dist/mjs/examples/data.js +0 -1105
  172. package/dist/mjs/functions/ai.d.ts +0 -20
  173. package/dist/mjs/functions/ai.js +0 -83
  174. package/dist/mjs/functions/ai.test.d.ts +0 -1
  175. package/dist/mjs/functions/ai.test.js +0 -29
  176. package/dist/mjs/functions/gpt.d.ts +0 -4
  177. package/dist/mjs/functions/gpt.js +0 -10
  178. package/dist/mjs/functions/list.d.ts +0 -7
  179. package/dist/mjs/functions/list.js +0 -72
  180. package/dist/mjs/index.d.ts +0 -3
  181. package/dist/mjs/index.js +0 -3
  182. package/dist/mjs/queue/kafka.d.ts +0 -0
  183. package/dist/mjs/queue/kafka.js +0 -1
  184. package/dist/mjs/queue/memory.d.ts +0 -0
  185. package/dist/mjs/queue/memory.js +0 -1
  186. package/dist/mjs/queue/mongo.d.ts +0 -30
  187. package/dist/mjs/queue/mongo.js +0 -42
  188. package/dist/mjs/streams/kafka.d.ts +0 -0
  189. package/dist/mjs/streams/kafka.js +0 -1
  190. package/dist/mjs/streams/memory.d.ts +0 -0
  191. package/dist/mjs/streams/memory.js +0 -1
  192. package/dist/mjs/streams/mongo.d.ts +0 -0
  193. package/dist/mjs/streams/mongo.js +0 -1
  194. package/dist/mjs/streams/types.d.ts +0 -0
  195. package/dist/mjs/streams/types.js +0 -1
  196. package/dist/mjs/types.d.ts +0 -11
  197. package/dist/mjs/types.js +0 -1
  198. package/dist/mjs/utils/completion.d.ts +0 -9
  199. package/dist/mjs/utils/completion.js +0 -20
  200. package/dist/mjs/utils/schema.d.ts +0 -10
  201. package/dist/mjs/utils/schema.js +0 -72
  202. package/dist/mjs/utils/schema.test.d.ts +0 -1
  203. package/dist/mjs/utils/schema.test.js +0 -60
  204. package/dist/mjs/utils/state.d.ts +0 -1
  205. package/dist/mjs/utils/state.js +0 -19
  206. package/examples/data.ts +0 -1105
  207. package/fixup +0 -11
  208. package/functions/ai.test.ts +0 -41
  209. package/functions/ai.ts +0 -115
  210. package/functions/gpt.ts +0 -12
  211. package/functions/list.ts +0 -84
  212. package/index.ts +0 -3
  213. package/queue/kafka.ts +0 -0
  214. package/queue/memory.ts +0 -0
  215. package/queue/mongo.ts +0 -88
  216. package/streams/kafka.ts +0 -0
  217. package/streams/memory.ts +0 -0
  218. package/streams/mongo.ts +0 -0
  219. package/streams/types.ts +0 -0
  220. package/tsconfig-backup.json +0 -105
  221. package/tsconfig-base.json +0 -26
  222. package/tsconfig-cjs.json +0 -8
  223. package/types.ts +0 -12
  224. package/utils/completion.ts +0 -28
  225. package/utils/schema.test.ts +0 -69
  226. package/utils/schema.ts +0 -74
  227. package/utils/state.ts +0 -23
@@ -0,0 +1,212 @@
1
+ # AI Functions Eval Suite
2
+
3
+ Evaluations for ai-functions using both vitest-based tests and a custom eval runner.
4
+
5
+ ## Quick Start
6
+
7
+ ### Vitest-Based Evals (Recommended)
8
+
9
+ Tests the core AI primitives (`code`, `ai`, `list`, `is`, `defineFunction`, etc.) with real AI calls:
10
+
11
+ ```bash
12
+ # Run all eval tests
13
+ pnpm test:evals
14
+
15
+ # Run primitives eval (code, ai, list, is, etc.)
16
+ pnpm test:evals:primitives
17
+
18
+ # Run defineFunction eval
19
+ pnpm test:evals:define
20
+
21
+ # Run with specific model
22
+ MODEL=sonnet pnpm test:evals
23
+
24
+ # Run with specific tiers
25
+ EVAL_TIERS=best,fast pnpm test:evals
26
+ ```
27
+
28
+ ### Custom Runner Evals
29
+
30
+ Math and classification evals with detailed scoring:
31
+
32
+ ```bash
33
+ # Run all evals (math + classification)
34
+ pnpm eval
35
+
36
+ # Run specific eval
37
+ pnpm eval:math
38
+ pnpm eval:class
39
+
40
+ # Run all tiers (best, fast, cheap)
41
+ pnpm eval:all
42
+ ```
43
+
44
+ ## Eval Suites
45
+
46
+ ### Vitest Evals (test/evals/)
47
+
48
+ | Test Suite | Functions Tested | Test Cases |
49
+ |------------|------------------|------------|
50
+ | `primitives.eval.test.ts` | `code()`, `ai()`, `list()`, `is()`, `summarize()`, `extract()`, `write()`, `lists()` | Code generation, text generation, classification, extraction |
51
+ | `define-function.eval.test.ts` | `defineFunction()`, `define.generative()`, `define.code()` | Generative functions, code functions, structured outputs |
52
+
53
+ ### Custom Runner Evals (evals/)
54
+
55
+ | Eval | Tests | Scoring |
56
+ |------|-------|---------|
57
+ | `Math` | Arithmetic, word problems | Correct answer + shows work |
58
+ | `Classification` | Sentiment, support tickets | Accuracy + calibration |
59
+ | `Marketing` | Marketing copy generation | LLM-as-judge ELO ranking |
60
+
61
+ ### Marketing Copy Eval (LLM-as-Judge)
62
+
63
+ ```bash
64
+ # Run marketing eval (fast tier only)
65
+ pnpm eval:marketing
66
+
67
+ # Run with all tiers
68
+ pnpm eval:marketing:all
69
+
70
+ # Use different judge model
71
+ pnpm eval:marketing -- --judge=opus
72
+ ```
73
+
74
+ Generates marketing copy (title, description, hero headline/subhead, CTAs) for different scenarios and uses pairwise LLM-as-judge comparisons to create ELO rankings.
75
+
76
+ ## Latest Results (December 2025)
77
+
78
+ **Overall: 94.0%** | Cost: $0.06 | Time: 95s | 10 Models
79
+
80
+ ### Performance Summary
81
+
82
+ | Model | Math | Class | Overall | Avg Latency | Notes |
83
+ |-------|------|-------|---------|-------------|-------|
84
+ | Claude Sonnet 4.5 | 100% | 100% | **100%** | ~380ms | Best overall |
85
+ | GPT-5 Mini | 100% | 91.7% | 95.9% | ~1850ms | Slower but accurate |
86
+ | Gemini 2.5 Flash | 100% | 91.7% | 95.9% | ~200ms | **Fastest** |
87
+ | DeepSeek Chat | 100% | 91.7% | 95.9% | ~210ms | Great value |
88
+ | Mistral Medium 3.1 | 96% | 100% | 98.0% | ~850ms | Strong classify |
89
+ | Grok 4.1 Fast | 100% | 91.7% | 95.9% | ~2300ms | 2M context |
90
+ | Grok 4 Fast | 92% | 100% | 96.0% | ~1800ms | Good balance |
91
+ | Qwen3 30B | 96% | 91.7% | 93.9% | ~8900ms | Slowest |
92
+ | Llama 3.3 70B | 90% | 91.7% | 90.9% | ~185ms | Fast open model |
93
+ | GPT-oss 20B | 72% | 83.3% | 77.7% | ~1200ms | Open source |
94
+
95
+ ### Performance/$ Analysis (Fast Tier)
96
+
97
+ | Model | Score | Est $/1M tokens | Score/$ | Recommendation |
98
+ |-------|-------|-----------------|---------|----------------|
99
+ | DeepSeek Chat | 95.9% | $0.28 | **342** | Best value |
100
+ | Gemini 2.5 Flash | 95.9% | $0.30 | 320 | Fast + cheap |
101
+ | Llama 3.3 70B | 90.9% | $0.40 | 227 | Good OSS option |
102
+ | Claude Sonnet 4.5 | 100% | $3.00 | 33 | Best quality |
103
+ | Mistral Medium 3.1 | 98.0% | $2.50 | 39 | Strong balance |
104
+ | GPT-5 Mini | 95.9% | $1.00 | 96 | OpenAI ecosystem |
105
+ | Grok 4.1 Fast | 95.9% | $2.00 | 48 | 2M context |
106
+
107
+ ### Math Eval (94.6%)
108
+
109
+ | Model | Score | Avg Latency |
110
+ |-------|-------|-------------|
111
+ | Claude Sonnet 4.5 | 100% | ~380ms |
112
+ | GPT-5 Mini | 100% | ~200ms |
113
+ | Gemini 2.5 Flash | 100% | ~170ms |
114
+ | DeepSeek Chat | 100% | ~220ms |
115
+ | Grok 4.1 Fast | 100% | ~2600ms |
116
+ | Mistral Medium 3.1 | 96% | ~1040ms |
117
+ | Qwen3 30B | 96% | ~13000ms |
118
+ | Grok 4 Fast | 92% | ~2000ms |
119
+ | Llama 3.3 70B | 90% | ~170ms |
120
+ | GPT-oss 20B | 72% | ~180ms |
121
+
122
+ ### Classification Eval (93.3%)
123
+
124
+ | Model | Score | Avg Latency |
125
+ |-------|-------|-------------|
126
+ | Claude Sonnet 4.5 | 100% | ~205ms |
127
+ | Mistral Medium 3.1 | 100% | ~700ms |
128
+ | Grok 4 Fast | 100% | ~1670ms |
129
+ | GPT-5 Mini | 91.7% | ~3500ms |
130
+ | Gemini 2.5 Flash | 91.7% | ~235ms |
131
+ | Llama 3.3 70B | 91.7% | ~230ms |
132
+ | DeepSeek Chat | 91.7% | ~230ms |
133
+ | Qwen3 30B | 91.7% | ~3970ms |
134
+ | Grok 4.1 Fast | 91.7% | ~2170ms |
135
+ | GPT-oss 20B | 83.3% | ~2840ms |
136
+
137
+ ### Marketing Copy Eval (ELO Rankings)
138
+
139
+ Uses LLM-as-judge (Claude Sonnet) for pairwise comparisons across 4 test scenarios.
140
+
141
+ | Rank | Model | ELO | W | L | D | Notes |
142
+ |------|-------|-----|---|---|---|-------|
143
+ | 1 | Claude Sonnet 4.5 | **1745** | 31 | 3 | 0 | Dominant winner |
144
+ | 2 | Grok 4.1 Fast | 1595 | 22 | 12 | 0 | Strong creative |
145
+ | 3 | GPT-5 Mini | 1593 | 26 | 8 | 0 | Consistent quality |
146
+ | 4 | Grok 4 Fast | 1558 | 17 | 17 | 0 | Good balance |
147
+ | 5 | Gemini 2.5 Flash | 1503 | 14 | 20 | 0 | Middle tier |
148
+ | 6 | Mistral Medium 3.1 | 1481 | 16 | 18 | 0 | Solid performer |
149
+ | 7 | GPT-oss 20B | 1471 | 19 | 15 | 0 | OSS option |
150
+ | 8 | DeepSeek Chat | 1449 | 10 | 16 | 0 | Value option |
151
+ | 9 | Qwen3 30B | 1371 | 6 | 20 | 0 | Below average |
152
+ | 10 | Llama 3.3 70B | 1231 | 1 | 33 | 0 | Struggled |
153
+
154
+ **Key Insights:**
155
+ - Claude Sonnet 4.5 won 31 of 34 comparisons (91%)
156
+ - Grok models performed unexpectedly well on creative tasks
157
+ - Llama 3.3 70B, despite being strong on classification, struggled with marketing copy
158
+
159
+ ## Models
160
+
161
+ Uses model IDs from the `language-models` package, routed via `ai-providers`.
162
+
163
+ ### Model Tiers
164
+
165
+ | Tier | Description | Models |
166
+ |------|-------------|--------|
167
+ | `best` | Highest capability | opus, o3, gpt-5.1, gemini-pro, deepseek-v3.2, mistral-large-3, qwen3-coder, grok-4 |
168
+ | `fast` | Good balance | sonnet, gpt-5-mini, flash, llama-3.3-70b, mistral-medium-3.1, qwen3-30b, grok-4.1-fast |
169
+ | `cheap` | Cost-optimized | haiku, gpt-5-nano, ministral-14b |
170
+
171
+ ### Full Model List (December 2025)
172
+
173
+ - **Anthropic**: `opus`, `sonnet`, `haiku`
174
+ - **OpenAI**: `openai/gpt-5.1`, `openai/gpt-5-mini`, `openai/gpt-5-nano`, `openai/o3`
175
+ - **OpenAI OSS**: `openai/gpt-oss-120b`, `openai/gpt-oss-20b` (open source models)
176
+ - **Google**: `gemini-pro`, `flash`
177
+ - **Meta**: `meta-llama/llama-4-maverick`, `meta-llama/llama-3.3-70b-instruct`
178
+ - **DeepSeek**: `deepseek/deepseek-v3.2`, `deepseek/deepseek-v3.2-speciale`, `deepseek/deepseek-chat`
179
+ - **Mistral**: `mistralai/mistral-large-2512` (Mistral Large 3), `mistralai/mistral-medium-3.1`, `mistralai/ministral-14b-2512`
180
+ - **Qwen**: `qwen/qwen3-coder`, `qwen/qwen3-30b-a3b`, `qwen/qwen3-next-80b-a3b-instruct`
181
+ - **xAI**: `x-ai/grok-4`, `x-ai/grok-4.1-fast`, `x-ai/grok-4-fast`
182
+
183
+ ## Environment
184
+
185
+ ```bash
186
+ # Use AI Gateway (recommended)
187
+ AI_GATEWAY_URL=https://gateway.ai.cloudflare.com/v1/...
188
+ AI_GATEWAY_TOKEN=...
189
+
190
+ # Or direct API keys
191
+ ANTHROPIC_API_KEY=sk-ant-...
192
+ OPENAI_API_KEY=sk-...
193
+ ```
194
+
195
+ ## Adding Evals
196
+
197
+ ### Vitest-Based Evals
198
+
199
+ 1. Create a new test file in `test/evals/`
200
+ 2. Import functions and models:
201
+ ```typescript
202
+ import { code, ai, list } from '../../src/primitives.js'
203
+ import { EVAL_MODELS, type EvalModel } from '../../src/eval/models.js'
204
+ ```
205
+ 3. Use `describe.skipIf(!hasAPI)` to skip the suite when no API access is configured
206
+ 4. Loop over models with `for (const model of models)`
207
+
208
+ ### Custom Runner Evals
209
+
210
+ 1. Add test cases to `evals/run-evals.ts`
211
+ 2. Use `runEval()` with `task` function and `scorers` array
212
+ 3. Use `createModelVariants({ tiers: ['fast'] })` to filter models
@@ -0,0 +1,108 @@
1
+ /**
2
+ * Classification Eval
3
+ *
4
+ * Tests model ability to classify inputs correctly.
5
+ * Includes sentiment analysis, category classification, and boolean questions.
6
+ */
7
+
8
+ import { evalite } from 'evalite'
9
+ import { generateObject } from '../src/generate.js'
10
+ import { schema } from '../src/schema.js'
11
+ import { createModelVariants, type EvalModel } from '../src/eval/models.js'
12
+
// Classification test cases, grouped by the label set they are scored against.
// Every case ends up as { text, expected, options } with its own copy of the options array.

// Sentiment
const SENTIMENT_CASES = [
  { text: 'This product exceeded my expectations!', expected: 'positive' },
  { text: 'The delivery was late and packaging damaged.', expected: 'negative' },
  { text: 'The product arrived as described.', expected: 'neutral' },
].map(tc => ({ ...tc, options: ['positive', 'negative', 'neutral'] }))

// Support ticket classification
const SUPPORT_TICKET_CASES = [
  { text: 'I need to reset my password', expected: 'account' },
  { text: 'When will my refund be processed?', expected: 'billing' },
  { text: 'The app crashes when uploading images', expected: 'technical' },
  { text: 'My package shows delivered but I never received it', expected: 'shipping' },
].map(tc => ({ ...tc, options: ['account', 'billing', 'technical', 'shipping'] }))

const TEST_CASES = [...SENTIMENT_CASES, ...SUPPORT_TICKET_CASES]
26
+
// Models under evaluation: every entry in the 'fast' and 'cheap' tiers.
const modelVariants = createModelVariants({ tiers: ['fast', 'cheap'] })

evalite.each(modelVariants)('Classification', {
  // Each test case is handed to the task whole; its label doubles as the expected value.
  data: TEST_CASES.map(testCase => ({ input: testCase, expected: testCase.expected })),

  // Asks the variant's model for a category and a confidence, timing the call.
  task: async (input, variant) => {
    const { id: modelId, name: modelName } = variant as EvalModel
    const startedAt = Date.now()

    const result = await generateObject({
      model: modelId,
      schema: schema({
        category: input.options.join(' | '),
        confidence: 'Confidence 0-1 (number)',
      }),
      prompt: `Classify this text into one of: ${input.options.join(', ')}

Text: "${input.text}"`,
    })

    const latencyMs = Date.now() - startedAt
    const { category, confidence } = result.object

    return {
      predicted: category,
      confidence,
      expected: input.expected,
      text: input.text,
      options: input.options,
      modelId,
      modelName,
      latencyMs,
      usage: result.usage,
    }
  },

  scorers: [
    // Accuracy
    {
      name: 'Accuracy',
      description: 'Whether classification is correct',
      scorer: ({ output, expected }) => {
        const isMatch = output.predicted === expected
        return { score: isMatch ? 1 : 0 }
      },
    },

    // Valid category
    {
      name: 'Valid Category',
      description: 'Whether output is a valid option',
      scorer: ({ output }) => {
        const allowed = output.options as string[]
        return { score: allowed.includes(output.predicted as string) ? 1 : 0 }
      },
    },

    // Calibration
    {
      name: 'Calibration',
      description: 'Confidence matches accuracy',
      scorer: ({ output, expected }) => {
        const confidence = output.confidence as number

        // High confidence when correct, low when wrong = well calibrated.
        // Anything that matches no band (including a non-numeric confidence) scores 0.5.
        if (output.predicted === expected) {
          if (confidence >= 0.7) return { score: 1 }
          if (confidence < 0.5) return { score: 0.6 } // Underconfident
          return { score: 0.5 }
        }

        if (confidence <= 0.5) return { score: 0.8 }
        if (confidence > 0.7) return { score: 0.2 } // Overconfident
        return { score: 0.5 }
      },
    },
  ],

  // Per-row columns shown in the evalite report.
  columns: ({ output, expected }) => {
    const isCorrect = output.predicted === expected
    const confidencePct = ((output.confidence as number) * 100).toFixed(0)

    return [
      { label: 'Model', value: output.modelName },
      { label: 'Expected', value: expected },
      { label: 'Got', value: output.predicted },
      { label: 'Correct', value: isCorrect ? 'Yes' : 'No' },
      { label: 'Confidence', value: `${confidencePct}%` },
    ]
  },
})
@@ -0,0 +1,370 @@
1
+ #!/usr/bin/env npx tsx
2
+ /**
3
+ * Marketing Copy Eval with LLM-as-Judge ELO Ranking
4
+ *
5
+ * Generates marketing copy (title, description, hero headline/subhead, CTAs)
6
+ * and uses pairwise comparison with an LLM judge to create ELO rankings.
7
+ *
8
+ * Usage:
9
+ * npx tsx evals/marketing.eval.ts
10
+ * npx tsx evals/marketing.eval.ts --judge=opus # Use specific judge model
11
+ * npx tsx evals/marketing.eval.ts --judge=haiku # Test cheap judge
12
+ * npx tsx evals/marketing.eval.ts --judge=flash # Test fast judge
13
+ * npx tsx evals/marketing.eval.ts --all # Run all tiers
14
+ * npx tsx evals/marketing.eval.ts --all --judge=haiku # All tiers + cheap judge
15
+ */
16
+
17
+ // Load .env from project root
18
+ import { config } from 'dotenv'
19
+ import { resolve } from 'path'
20
+ config({ path: resolve(import.meta.dirname, '../../../.env') })
21
+
22
+ import { generateObject } from '../src/generate.js'
23
+ import { schema } from '../src/schema.js'
24
+ import { EVAL_MODELS, type EvalModel, type ModelTier } from '../src/eval/models.js'
25
+
// Parse CLI args
const args = process.argv.slice(2)

/**
 * Extracts the judge model from a `--judge=<model>` flag.
 *
 * Everything after the flag prefix is taken as the model id, so a value that
 * itself contains `=` is kept intact. A missing flag or an empty value
 * (`--judge=`) falls back to `fallback` instead of producing an empty model id.
 *
 * @param argv - CLI arguments (without the node/script entries)
 * @param fallback - model used when no usable flag is present
 * @returns the judge model id
 */
function parseJudgeModel(argv: string[], fallback = 'sonnet'): string {
  const flag = '--judge='
  const match = argv.find(a => a.startsWith(flag))
  // `||` on purpose: an empty string must also fall back.
  return match?.slice(flag.length) || fallback
}

const JUDGE_MODEL = parseJudgeModel(args)
const runAll = args.includes('--all')

// `--all` evaluates every tier; the default run covers only the fast tier.
const tiers: ModelTier[] = runAll ? ['best', 'fast', 'cheap'] : ['fast']
33
+
// Marketing copy schema
// Descriptor strings for the nested hero section, passed to schema() as part of the full shape below.
const heroSectionFields = {
  headline: 'Hero headline (5-8 words, compelling)',
  subhead: 'Supporting subheadline (10-20 words)',
  primaryCTA: 'Primary call-to-action button text (2-4 words)',
  secondaryCTA: 'Secondary call-to-action link text (3-6 words)',
}

const marketingCopySchema = schema({
  title: 'Product/page title (5-10 words)',
  description: 'Meta description for SEO (150-160 characters)',
  hero: heroSectionFields,
})
45
+
// Test cases - different product/service scenarios.
// Each scenario is written as a product brief plus its audience and assembled
// into the single prompt string that is sent to every model.
const TEST_CASES = [
  {
    name: 'SaaS Analytics Platform',
    brief:
      'Create marketing copy for a B2B SaaS analytics platform called "InsightFlow" that helps companies understand their customer behavior with AI-powered insights.',
    audience: 'Product managers and growth teams at mid-size tech companies',
  },
  {
    name: 'E-commerce Fashion Brand',
    brief:
      'Create marketing copy for a sustainable fashion e-commerce brand called "EcoThread" that sells organic, ethically-made clothing.',
    audience: 'Environmentally conscious millennials aged 25-35',
  },
  {
    name: 'Developer Tool',
    brief:
      'Create marketing copy for a CLI tool called "DeployFast" that simplifies Kubernetes deployments with one-command deploys.',
    audience: 'DevOps engineers and backend developers',
  },
  {
    name: 'Mobile Fitness App',
    brief:
      'Create marketing copy for a fitness app called "FitPulse" that uses AI to create personalized workout plans and tracks progress with smart watch integration.',
    audience: 'Busy professionals aged 30-45',
  },
].map(({ name, brief, audience }) => ({
  name,
  prompt: `${brief} Target audience: ${audience}.`,
}))
65
+
/** Hero-section copy: headline, subhead and the two calls to action. */
interface HeroCopy {
  headline: string
  subhead: string
  primaryCTA: string
  secondaryCTA: string
}

/** The marketing copy fields each model is asked to produce. */
interface MarketingCopy {
  title: string
  description: string
  hero: HeroCopy
}

/** One model's output for one test case, with the measured generation latency. */
interface GeneratedCopy {
  model: EvalModel
  testCase: (typeof TEST_CASES)[number]
  copy: MarketingCopy
  latencyMs: number
}

/** Running tournament record for a single model. */
interface ELORating {
  modelId: string
  modelName: string
  rating: number
  wins: number
  losses: number
  draws: number
}
92
+
// ELO calculation
const K_FACTOR = 32
const INITIAL_ELO = 1500

/**
 * Computes the rating adjustments for a single pairwise result.
 *
 * @param ratingA - current rating of side A
 * @param ratingB - current rating of side B
 * @param scoreA - result from A's point of view (the tournament passes 1, 0 or 0.5)
 * @returns the amounts to add to A's and B's ratings
 */
function calculateEloChange(ratingA: number, ratingB: number, scoreA: number): { deltaA: number; deltaB: number } {
  // Expected scores on the 400-point logistic curve; B's is the complement of A's.
  const ratingGap = (ratingB - ratingA) / 400
  const expectedA = 1 / (1 + Math.pow(10, ratingGap))
  const expectedB = 1 - expectedA
  const scoreB = 1 - scoreA

  return {
    deltaA: K_FACTOR * (scoreA - expectedA),
    deltaB: K_FACTOR * (scoreB - expectedB),
  }
}
106
+
// LLM Judge for pairwise comparison
/**
 * Asks the judge model which of two marketing copies is better for a test case.
 *
 * The judge's answer is upper-cased and trimmed; anything other than A, B or TIE
 * is treated as TIE. If the judge call throws, the error is logged and TIE is returned.
 *
 * @param copyA - copy presented to the judge as option A
 * @param copyB - copy presented to the judge as option B
 * @param testCase - scenario both copies were written for
 * @param judgeModel - model id used for the judgment
 * @returns 'A', 'B' or 'TIE'
 */
async function judgePair(
  copyA: MarketingCopy,
  copyB: MarketingCopy,
  testCase: typeof TEST_CASES[0],
  judgeModel: string
): Promise<'A' | 'B' | 'TIE'> {
  // Renders one option's section of the prompt.
  const renderOption = (label: 'A' | 'B', copy: MarketingCopy): string =>
    [
      `=== OPTION ${label} ===`,
      `Title: ${copy.title}`,
      `Description: ${copy.description}`,
      `Hero Headline: ${copy.hero.headline}`,
      `Hero Subhead: ${copy.hero.subhead}`,
      `Primary CTA: ${copy.hero.primaryCTA}`,
      `Secondary CTA: ${copy.hero.secondaryCTA}`,
    ].join('\n')

  const prompt = [
    `You are an expert marketing copywriter and brand strategist. Compare these two marketing copy options for: ${testCase.name}`,
    '',
    `Context: ${testCase.prompt}`,
    '',
    renderOption('A', copyA),
    '',
    renderOption('B', copyB),
    '',
    'Evaluate based on:',
    '1. Clarity and impact of messaging',
    '2. Target audience alignment',
    '3. Emotional appeal and persuasiveness',
    '4. CTA effectiveness',
    '5. Overall brand voice consistency',
    '',
    "Which option is better? Answer A, B, or TIE if they're roughly equal.",
  ].join('\n')

  const VERDICTS = ['A', 'B', 'TIE'] as const

  try {
    const { object } = await generateObject({
      model: judgeModel,
      schema: schema({
        reasoning: 'Brief explanation of your judgment (2-3 sentences)',
        winner: 'A | B | TIE',
      }),
      prompt,
      temperature: 0.3,
    })

    const answer = (object as { reasoning: string; winner: string }).winner.toUpperCase().trim()

    // Unrecognized answers count as a tie.
    return VERDICTS.find(verdict => verdict === answer) ?? 'TIE'
  } catch (err) {
    console.error(` Judge error: ${err}`)
    return 'TIE'
  }
}
166
+
// Generate marketing copy for a model
/**
 * Generates marketing copy for one test case with one model and times the call.
 *
 * @param model - model under evaluation; its `id` is passed to generateObject
 * @param testCase - scenario whose prompt is sent to the model
 * @returns the model, the test case, the generated copy and the elapsed milliseconds
 */
async function generateCopy(model: EvalModel, testCase: typeof TEST_CASES[0]): Promise<GeneratedCopy> {
  const startedAt = Date.now()

  const generation = await generateObject({
    model: model.id,
    schema: marketingCopySchema,
    prompt: testCase.prompt,
    temperature: 0.7,
  })

  const copy = generation.object as MarketingCopy
  const latencyMs = Date.now() - startedAt

  return { model, testCase, copy, latencyMs }
}
185
+
// Run pairwise comparisons and calculate ELO
/**
 * Runs a round-robin of pairwise judgments within each test case and folds the
 * outcomes into per-model ELO ratings.
 *
 * Every model starts at INITIAL_ELO. Within a test case each pair of copies is
 * judged once, and ratings are updated after every comparison, so the final
 * numbers depend on comparison order.
 *
 * @param copies - generated copies for all models and test cases
 * @param judgeModel - model id used for every pairwise judgment
 * @returns one rating per model, sorted from highest to lowest rating
 */
async function runEloTournament(
  copies: GeneratedCopy[],
  judgeModel: string
): Promise<ELORating[]> {
  // Initialize ELO ratings
  const ratings: Map<string, ELORating> = new Map()

  for (const copy of copies) {
    if (!ratings.has(copy.model.id)) {
      ratings.set(copy.model.id, {
        modelId: copy.model.id,
        modelName: copy.model.name,
        rating: INITIAL_ELO,
        wins: 0,
        losses: 0,
        draws: 0,
      })
    }
  }

  // Group copies by test case
  const byTestCase = new Map<string, GeneratedCopy[]>()
  for (const copy of copies) {
    const key = copy.testCase.name
    if (!byTestCase.has(key)) {
      byTestCase.set(key, [])
    }
    byTestCase.get(key)!.push(copy)
  }

  // Report the judge actually used for the comparisons (the parameter), not the module-level default.
  console.log(`\n⚖️ Running pairwise comparisons with ${judgeModel} as judge...\n`)

  let comparisonCount = 0
  // n copies in a test case yield n * (n - 1) / 2 unordered pairs.
  const totalComparisons = Array.from(byTestCase.values()).reduce(
    (sum, group) => sum + (group.length * (group.length - 1)) / 2,
    0
  )

  // Run pairwise comparisons within each test case
  for (const [testCaseName, testCaseCopies] of byTestCase) {
    console.log(` 📝 ${testCaseName}:`)

    for (let i = 0; i < testCaseCopies.length; i++) {
      for (let j = i + 1; j < testCaseCopies.length; j++) {
        const copyA = testCaseCopies[i]
        const copyB = testCaseCopies[j]

        comparisonCount++
        process.stdout.write(` ${comparisonCount}/${totalComparisons} ${copyA.model.name} vs ${copyB.model.name}... `)

        const winner = await judgePair(copyA.copy, copyB.copy, copyA.testCase, judgeModel)

        const ratingA = ratings.get(copyA.model.id)!
        const ratingB = ratings.get(copyB.model.id)!

        // scoreA is the result from A's side: 1 win, 0 loss, 0.5 draw.
        let scoreA: number
        if (winner === 'A') {
          scoreA = 1
          ratingA.wins++
          ratingB.losses++
          console.log(`${copyA.model.name} wins`)
        } else if (winner === 'B') {
          scoreA = 0
          ratingA.losses++
          ratingB.wins++
          console.log(`${copyB.model.name} wins`)
        } else {
          scoreA = 0.5
          ratingA.draws++
          ratingB.draws++
          console.log(`TIE`)
        }

        const { deltaA, deltaB } = calculateEloChange(ratingA.rating, ratingB.rating, scoreA)
        ratingA.rating += deltaA
        ratingB.rating += deltaB
      }
    }
  }

  // Sort by ELO rating
  return Array.from(ratings.values()).sort((a, b) => b.rating - a.rating)
}
270
+
// Main
/**
 * CLI entry point: generates copy from every selected model for every test case,
 * runs the ELO tournament with the configured judge, and prints the rankings,
 * timings and sample outputs from the top three models.
 */
async function main() {
  // Prints a three-line box around an already-formatted title line.
  const printBanner = (titleLine: string) => {
    console.log('╔════════════════════════════════════════════════════════════════╗')
    console.log(titleLine)
    console.log('╚════════════════════════════════════════════════════════════════╝')
  }
  const seconds = (ms: number) => (ms / 1000).toFixed(1)

  printBanner('║ Marketing Copy Eval (LLM-as-Judge) ║')
  console.log('')
  console.log(`Judge Model: ${JUDGE_MODEL}`)
  console.log(`Tiers: ${tiers.join(', ')}`)

  // Get models to test
  const models = EVAL_MODELS.filter(m => tiers.includes(m.tier))
  console.log(`Models: ${models.map(m => m.name).join(', ')}`)
  console.log(`Test Cases: ${TEST_CASES.length}`)
  console.log('')

  // Generate copy from each model for each test case
  console.log('🎨 Generating marketing copy...\n')

  const allCopies: GeneratedCopy[] = []
  const startTime = Date.now()

  for (const testCase of TEST_CASES) {
    console.log(` 📦 ${testCase.name}:`)

    // All models run concurrently for a test case; a failed model is logged and skipped.
    const results = await Promise.all(
      models.map(async (model): Promise<GeneratedCopy | null> => {
        try {
          const generated = await generateCopy(model, testCase)
          console.log(` ✓ ${model.name} (${generated.latencyMs}ms)`)
          return generated
        } catch (err) {
          console.log(` ✗ ${model.name}: ${err}`)
          return null
        }
      })
    )

    for (const result of results) {
      if (result !== null) {
        allCopies.push(result)
      }
    }
  }

  const generateTime = Date.now() - startTime
  console.log(`\n Generated ${allCopies.length} copies in ${seconds(generateTime)}s`)

  // Run ELO tournament
  const tournamentStart = Date.now()
  const eloRatings = await runEloTournament(allCopies, JUDGE_MODEL)
  const tournamentTime = Date.now() - tournamentStart

  // Display results
  console.log('')
  printBanner('║ ELO Rankings ║')
  console.log('')
  console.log(' Rank | Model | ELO | W | L | D |')
  console.log(' -----|------------------------|--------|-----|-----|-----|')

  for (const [idx, rating] of eloRatings.entries()) {
    const rank = String(idx + 1).padStart(4)
    const name = rating.modelName.padEnd(22)
    const elo = String(Math.round(rating.rating)).padStart(6)
    const wins = String(rating.wins).padStart(3)
    const losses = String(rating.losses).padStart(3)
    const draws = String(rating.draws).padStart(3)
    console.log(` ${rank} | ${name} | ${elo} | ${wins} | ${losses} | ${draws} |`)
  }

  console.log('')
  console.log(` Judge: ${JUDGE_MODEL}`)
  console.log(` Generation Time: ${seconds(generateTime)}s`)
  console.log(` Tournament Time: ${seconds(tournamentTime)}s`)
  console.log(` Total Time: ${seconds(generateTime + tournamentTime)}s`)

  // Show sample outputs from top 3
  console.log('')
  printBanner('║ Sample Outputs (Top 3) ║')

  const sampleTestCase = TEST_CASES[0]

  // Ratings hold one entry per model id, so position in the sorted list is the rank.
  for (const [idx, { modelId }] of eloRatings.slice(0, 3).entries()) {
    const sample = allCopies.find(c => c.model.id === modelId && c.testCase.name === sampleTestCase.name)
    if (!sample) {
      continue
    }

    const { title, description, hero } = sample.copy
    console.log(`\n #${idx + 1} ${sample.model.name} (${sampleTestCase.name}):`)
    console.log(` ─────────────────────────────────────────`)
    console.log(` Title: ${title}`)
    console.log(` Description: ${description}`)
    console.log(` Headline: ${hero.headline}`)
    console.log(` Subhead: ${hero.subhead}`)
    console.log(` Primary CTA: [${hero.primaryCTA}]`)
    console.log(` Secondary CTA: ${hero.secondaryCTA}`)
  }

  console.log('')
}
369
+
// Log any failure that escapes main() and mark the process as failed so that
// shells and CI see a non-zero exit status instead of a silent success.
main().catch(err => {
  console.error(err)
  process.exitCode = 1
})