ai-functions 0.2.19 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +5 -0
- package/.turbo/turbo-test.log +105 -0
- package/README.md +232 -37
- package/TODO.md +138 -0
- package/dist/ai-promise.d.ts +219 -0
- package/dist/ai-promise.d.ts.map +1 -0
- package/dist/ai-promise.js +610 -0
- package/dist/ai-promise.js.map +1 -0
- package/dist/ai.d.ts +285 -0
- package/dist/ai.d.ts.map +1 -0
- package/dist/ai.js +842 -0
- package/dist/ai.js.map +1 -0
- package/dist/batch/anthropic.d.ts +23 -0
- package/dist/batch/anthropic.d.ts.map +1 -0
- package/dist/batch/anthropic.js +257 -0
- package/dist/batch/anthropic.js.map +1 -0
- package/dist/batch/bedrock.d.ts +64 -0
- package/dist/batch/bedrock.d.ts.map +1 -0
- package/dist/batch/bedrock.js +586 -0
- package/dist/batch/bedrock.js.map +1 -0
- package/dist/batch/cloudflare.d.ts +37 -0
- package/dist/batch/cloudflare.d.ts.map +1 -0
- package/dist/batch/cloudflare.js +289 -0
- package/dist/batch/cloudflare.js.map +1 -0
- package/dist/batch/google.d.ts +41 -0
- package/dist/batch/google.d.ts.map +1 -0
- package/dist/batch/google.js +360 -0
- package/dist/batch/google.js.map +1 -0
- package/dist/batch/index.d.ts +31 -0
- package/dist/batch/index.d.ts.map +1 -0
- package/dist/batch/index.js +31 -0
- package/dist/batch/index.js.map +1 -0
- package/dist/batch/memory.d.ts +44 -0
- package/dist/batch/memory.d.ts.map +1 -0
- package/dist/batch/memory.js +188 -0
- package/dist/batch/memory.js.map +1 -0
- package/dist/batch/openai.d.ts +37 -0
- package/dist/batch/openai.d.ts.map +1 -0
- package/dist/batch/openai.js +403 -0
- package/dist/batch/openai.js.map +1 -0
- package/dist/batch-map.d.ts +125 -0
- package/dist/batch-map.d.ts.map +1 -0
- package/dist/batch-map.js +406 -0
- package/dist/batch-map.js.map +1 -0
- package/dist/batch-queue.d.ts +273 -0
- package/dist/batch-queue.d.ts.map +1 -0
- package/dist/batch-queue.js +271 -0
- package/dist/batch-queue.js.map +1 -0
- package/dist/context.d.ts +133 -0
- package/dist/context.d.ts.map +1 -0
- package/dist/context.js +267 -0
- package/dist/context.js.map +1 -0
- package/dist/embeddings.d.ts +123 -0
- package/dist/embeddings.d.ts.map +1 -0
- package/dist/embeddings.js +170 -0
- package/dist/embeddings.js.map +1 -0
- package/dist/eval/index.d.ts +8 -0
- package/dist/eval/index.d.ts.map +1 -0
- package/dist/eval/index.js +8 -0
- package/dist/eval/index.js.map +1 -0
- package/dist/eval/models.d.ts +66 -0
- package/dist/eval/models.d.ts.map +1 -0
- package/dist/eval/models.js +120 -0
- package/dist/eval/models.js.map +1 -0
- package/dist/eval/runner.d.ts +64 -0
- package/dist/eval/runner.d.ts.map +1 -0
- package/dist/eval/runner.js +148 -0
- package/dist/eval/runner.js.map +1 -0
- package/dist/generate.d.ts +168 -0
- package/dist/generate.d.ts.map +1 -0
- package/dist/generate.js +174 -0
- package/dist/generate.js.map +1 -0
- package/dist/index.d.ts +30 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +54 -0
- package/dist/index.js.map +1 -0
- package/dist/primitives.d.ts +292 -0
- package/dist/primitives.d.ts.map +1 -0
- package/dist/primitives.js +471 -0
- package/dist/primitives.js.map +1 -0
- package/dist/providers/cloudflare.d.ts +9 -0
- package/dist/providers/cloudflare.d.ts.map +1 -0
- package/dist/providers/cloudflare.js +9 -0
- package/dist/providers/cloudflare.js.map +1 -0
- package/dist/providers/index.d.ts +9 -0
- package/dist/providers/index.d.ts.map +1 -0
- package/dist/providers/index.js +9 -0
- package/dist/providers/index.js.map +1 -0
- package/dist/schema.d.ts +54 -0
- package/dist/schema.d.ts.map +1 -0
- package/dist/schema.js +109 -0
- package/dist/schema.js.map +1 -0
- package/dist/template.d.ts +73 -0
- package/dist/template.d.ts.map +1 -0
- package/dist/template.js +129 -0
- package/dist/template.js.map +1 -0
- package/dist/types.d.ts +481 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +5 -0
- package/dist/types.js.map +1 -0
- package/evalite.config.ts +19 -0
- package/evals/README.md +212 -0
- package/evals/classification.eval.ts +108 -0
- package/evals/marketing.eval.ts +370 -0
- package/evals/math.eval.ts +94 -0
- package/evals/run-evals.ts +166 -0
- package/evals/structured-output.eval.ts +143 -0
- package/evals/writing.eval.ts +117 -0
- package/examples/batch-blog-posts.ts +160 -0
- package/package.json +59 -43
- package/src/ai-promise.ts +784 -0
- package/src/ai.ts +1183 -0
- package/src/batch/anthropic.ts +375 -0
- package/src/batch/bedrock.ts +801 -0
- package/src/batch/cloudflare.ts +421 -0
- package/src/batch/google.ts +491 -0
- package/src/batch/index.ts +31 -0
- package/src/batch/memory.ts +253 -0
- package/src/batch/openai.ts +557 -0
- package/src/batch-map.ts +534 -0
- package/src/batch-queue.ts +493 -0
- package/src/context.ts +332 -0
- package/src/embeddings.ts +244 -0
- package/src/eval/index.ts +8 -0
- package/src/eval/models.ts +158 -0
- package/src/eval/runner.ts +217 -0
- package/src/generate.ts +245 -0
- package/src/index.ts +154 -0
- package/src/primitives.ts +612 -0
- package/src/providers/cloudflare.ts +15 -0
- package/src/providers/index.ts +14 -0
- package/src/schema.ts +147 -0
- package/src/template.ts +209 -0
- package/src/types.ts +540 -0
- package/test/README.md +105 -0
- package/test/ai-proxy.test.ts +192 -0
- package/test/async-iterators.test.ts +327 -0
- package/test/batch-background.test.ts +482 -0
- package/test/batch-blog-posts.test.ts +387 -0
- package/test/blog-generation.test.ts +510 -0
- package/test/browse-read.test.ts +611 -0
- package/test/core-functions.test.ts +694 -0
- package/test/decide.test.ts +393 -0
- package/test/define.test.ts +274 -0
- package/test/e2e-bedrock-manual.ts +163 -0
- package/test/e2e-bedrock.test.ts +191 -0
- package/test/e2e-flex-gateway.ts +157 -0
- package/test/e2e-flex-manual.ts +183 -0
- package/test/e2e-flex.test.ts +209 -0
- package/test/e2e-google-manual.ts +178 -0
- package/test/e2e-google.test.ts +216 -0
- package/test/embeddings.test.ts +284 -0
- package/test/evals/define-function.eval.test.ts +379 -0
- package/test/evals/primitives.eval.test.ts +384 -0
- package/test/function-types.test.ts +492 -0
- package/test/generate-core.test.ts +319 -0
- package/test/generate.test.ts +163 -0
- package/test/implicit-batch.test.ts +422 -0
- package/test/schema.test.ts +109 -0
- package/test/tagged-templates.test.ts +302 -0
- package/tsconfig.json +8 -6
- package/vitest.config.ts +42 -0
- package/LICENSE +0 -21
- package/db/cache.ts +0 -6
- package/db/mongo.ts +0 -75
- package/dist/mjs/db/cache.d.ts +0 -1
- package/dist/mjs/db/cache.js +0 -5
- package/dist/mjs/db/mongo.d.ts +0 -31
- package/dist/mjs/db/mongo.js +0 -48
- package/dist/mjs/examples/data.d.ts +0 -1105
- package/dist/mjs/examples/data.js +0 -1105
- package/dist/mjs/functions/ai.d.ts +0 -20
- package/dist/mjs/functions/ai.js +0 -83
- package/dist/mjs/functions/ai.test.d.ts +0 -1
- package/dist/mjs/functions/ai.test.js +0 -29
- package/dist/mjs/functions/gpt.d.ts +0 -4
- package/dist/mjs/functions/gpt.js +0 -10
- package/dist/mjs/functions/list.d.ts +0 -7
- package/dist/mjs/functions/list.js +0 -72
- package/dist/mjs/index.d.ts +0 -3
- package/dist/mjs/index.js +0 -3
- package/dist/mjs/queue/kafka.d.ts +0 -0
- package/dist/mjs/queue/kafka.js +0 -1
- package/dist/mjs/queue/memory.d.ts +0 -0
- package/dist/mjs/queue/memory.js +0 -1
- package/dist/mjs/queue/mongo.d.ts +0 -30
- package/dist/mjs/queue/mongo.js +0 -42
- package/dist/mjs/streams/kafka.d.ts +0 -0
- package/dist/mjs/streams/kafka.js +0 -1
- package/dist/mjs/streams/memory.d.ts +0 -0
- package/dist/mjs/streams/memory.js +0 -1
- package/dist/mjs/streams/mongo.d.ts +0 -0
- package/dist/mjs/streams/mongo.js +0 -1
- package/dist/mjs/streams/types.d.ts +0 -0
- package/dist/mjs/streams/types.js +0 -1
- package/dist/mjs/types.d.ts +0 -11
- package/dist/mjs/types.js +0 -1
- package/dist/mjs/utils/completion.d.ts +0 -9
- package/dist/mjs/utils/completion.js +0 -20
- package/dist/mjs/utils/schema.d.ts +0 -10
- package/dist/mjs/utils/schema.js +0 -72
- package/dist/mjs/utils/schema.test.d.ts +0 -1
- package/dist/mjs/utils/schema.test.js +0 -60
- package/dist/mjs/utils/state.d.ts +0 -1
- package/dist/mjs/utils/state.js +0 -19
- package/examples/data.ts +0 -1105
- package/fixup +0 -11
- package/functions/ai.test.ts +0 -41
- package/functions/ai.ts +0 -115
- package/functions/gpt.ts +0 -12
- package/functions/list.ts +0 -84
- package/index.ts +0 -3
- package/queue/kafka.ts +0 -0
- package/queue/memory.ts +0 -0
- package/queue/mongo.ts +0 -88
- package/streams/kafka.ts +0 -0
- package/streams/memory.ts +0 -0
- package/streams/mongo.ts +0 -0
- package/streams/types.ts +0 -0
- package/tsconfig-backup.json +0 -105
- package/tsconfig-base.json +0 -26
- package/tsconfig-cjs.json +0 -8
- package/types.ts +0 -12
- package/utils/completion.ts +0 -28
- package/utils/schema.test.ts +0 -69
- package/utils/schema.ts +0 -74
- package/utils/state.ts +0 -23
package/evals/README.md
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
# AI Functions Eval Suite
|
|
2
|
+
|
|
3
|
+
Evaluations for ai-functions using both vitest-based tests and a custom eval runner.
|
|
4
|
+
|
|
5
|
+
## Quick Start
|
|
6
|
+
|
|
7
|
+
### Vitest-Based Evals (Recommended)
|
|
8
|
+
|
|
9
|
+
Tests the core AI primitives (`code`, `ai`, `list`, `is`, `defineFunction`, etc.) with real AI calls:
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
# Run all eval tests
|
|
13
|
+
pnpm test:evals
|
|
14
|
+
|
|
15
|
+
# Run primitives eval (code, ai, list, is, etc.)
|
|
16
|
+
pnpm test:evals:primitives
|
|
17
|
+
|
|
18
|
+
# Run defineFunction eval
|
|
19
|
+
pnpm test:evals:define
|
|
20
|
+
|
|
21
|
+
# Run with specific model
|
|
22
|
+
MODEL=sonnet pnpm test:evals
|
|
23
|
+
|
|
24
|
+
# Run with specific tiers
|
|
25
|
+
EVAL_TIERS=best,fast pnpm test:evals
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
### Custom Runner Evals
|
|
29
|
+
|
|
30
|
+
Math and classification evals with detailed scoring:
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
# Run all evals (math + classification)
|
|
34
|
+
pnpm eval
|
|
35
|
+
|
|
36
|
+
# Run specific eval
|
|
37
|
+
pnpm eval:math
|
|
38
|
+
pnpm eval:class
|
|
39
|
+
|
|
40
|
+
# Run all tiers (best, fast, cheap)
|
|
41
|
+
pnpm eval:all
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Eval Suites
|
|
45
|
+
|
|
46
|
+
### Vitest Evals (test/evals/)
|
|
47
|
+
|
|
48
|
+
| Test Suite | Functions Tested | Test Cases |
|
|
49
|
+
|------------|------------------|------------|
|
|
50
|
+
| `primitives.eval.test.ts` | `code()`, `ai()`, `list()`, `is()`, `summarize()`, `extract()`, `write()`, `lists()` | Code generation, text generation, classification, extraction |
|
|
51
|
+
| `define-function.eval.test.ts` | `defineFunction()`, `define.generative()`, `define.code()` | Generative functions, code functions, structured outputs |
|
|
52
|
+
|
|
53
|
+
### Custom Runner Evals (evals/)
|
|
54
|
+
|
|
55
|
+
| Eval | Tests | Scoring |
|
|
56
|
+
|------|-------|---------|
|
|
57
|
+
| `Math` | Arithmetic, word problems | Correct answer + shows work |
|
|
58
|
+
| `Classification` | Sentiment, support tickets | Accuracy + calibration |
|
|
59
|
+
| `Marketing` | Marketing copy generation | LLM-as-judge ELO ranking |
|
|
60
|
+
|
|
61
|
+
### Marketing Copy Eval (LLM-as-Judge)
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
# Run marketing eval (fast tier only)
|
|
65
|
+
pnpm eval:marketing
|
|
66
|
+
|
|
67
|
+
# Run with all tiers
|
|
68
|
+
pnpm eval:marketing:all
|
|
69
|
+
|
|
70
|
+
# Use different judge model
|
|
71
|
+
pnpm eval:marketing -- --judge=opus
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Generates marketing copy (title, description, hero headline/subhead, CTAs) for different scenarios and uses pairwise LLM-as-judge comparisons to create ELO rankings.
|
|
75
|
+
|
|
76
|
+
## Latest Results (December 2025)
|
|
77
|
+
|
|
78
|
+
**Overall: 94.0%** | Cost: $0.06 | Time: 95s | 10 Models
|
|
79
|
+
|
|
80
|
+
### Performance Summary
|
|
81
|
+
|
|
82
|
+
| Model | Math | Class | Overall | Avg Latency | Notes |
|
|
83
|
+
|-------|------|-------|---------|-------------|-------|
|
|
84
|
+
| Claude Sonnet 4.5 | 100% | 100% | **100%** | ~380ms | Best overall |
|
|
85
|
+
| GPT-5 Mini | 100% | 91.7% | 95.9% | ~1850ms | Slower but accurate |
|
|
86
|
+
| Gemini 2.5 Flash | 100% | 91.7% | 95.9% | ~200ms | **Fastest** |
|
|
87
|
+
| DeepSeek Chat | 100% | 91.7% | 95.9% | ~210ms | Great value |
|
|
88
|
+
| Mistral Medium 3.1 | 96% | 100% | 98.0% | ~850ms | Strong classify |
|
|
89
|
+
| Grok 4.1 Fast | 100% | 91.7% | 95.9% | ~2300ms | 2M context |
|
|
90
|
+
| Grok 4 Fast | 92% | 100% | 96.0% | ~1800ms | Good balance |
|
|
91
|
+
| Qwen3 30B | 96% | 91.7% | 93.9% | ~8900ms | Slowest |
|
|
92
|
+
| Llama 3.3 70B | 90% | 91.7% | 90.9% | ~185ms | Fast open model |
|
|
93
|
+
| GPT-oss 20B | 72% | 83.3% | 77.7% | ~1200ms | Open source |
|
|
94
|
+
|
|
95
|
+
### Performance/$ Analysis (Fast Tier)
|
|
96
|
+
|
|
97
|
+
| Model | Score | Est $/1M tokens | Score/$ | Recommendation |
|
|
98
|
+
|-------|-------|-----------------|---------|----------------|
|
|
99
|
+
| DeepSeek Chat | 95.9% | $0.28 | **342** | Best value |
|
|
100
|
+
| Gemini 2.5 Flash | 95.9% | $0.30 | 320 | Fast + cheap |
|
|
101
|
+
| Llama 3.3 70B | 90.9% | $0.40 | 227 | Good OSS option |
|
|
102
|
+
| Claude Sonnet 4.5 | 100% | $3.00 | 33 | Best quality |
|
|
103
|
+
| Mistral Medium 3.1 | 98.0% | $2.50 | 39 | Strong balance |
|
|
104
|
+
| GPT-5 Mini | 95.9% | $1.00 | 96 | OpenAI ecosystem |
|
|
105
|
+
| Grok 4.1 Fast | 95.9% | $2.00 | 48 | 2M context |
|
|
106
|
+
|
|
107
|
+
### Math Eval (94.6%)
|
|
108
|
+
|
|
109
|
+
| Model | Score | Avg Latency |
|
|
110
|
+
|-------|-------|-------------|
|
|
111
|
+
| Claude Sonnet 4.5 | 100% | ~380ms |
|
|
112
|
+
| GPT-5 Mini | 100% | ~200ms |
|
|
113
|
+
| Gemini 2.5 Flash | 100% | ~170ms |
|
|
114
|
+
| DeepSeek Chat | 100% | ~220ms |
|
|
115
|
+
| Grok 4.1 Fast | 100% | ~2600ms |
|
|
116
|
+
| Mistral Medium 3.1 | 96% | ~1040ms |
|
|
117
|
+
| Qwen3 30B | 96% | ~13000ms |
|
|
118
|
+
| Grok 4 Fast | 92% | ~2000ms |
|
|
119
|
+
| Llama 3.3 70B | 90% | ~170ms |
|
|
120
|
+
| GPT-oss 20B | 72% | ~180ms |
|
|
121
|
+
|
|
122
|
+
### Classification Eval (93.3%)
|
|
123
|
+
|
|
124
|
+
| Model | Score | Avg Latency |
|
|
125
|
+
|-------|-------|-------------|
|
|
126
|
+
| Claude Sonnet 4.5 | 100% | ~205ms |
|
|
127
|
+
| Mistral Medium 3.1 | 100% | ~700ms |
|
|
128
|
+
| Grok 4 Fast | 100% | ~1670ms |
|
|
129
|
+
| GPT-5 Mini | 91.7% | ~3500ms |
|
|
130
|
+
| Gemini 2.5 Flash | 91.7% | ~235ms |
|
|
131
|
+
| Llama 3.3 70B | 91.7% | ~230ms |
|
|
132
|
+
| DeepSeek Chat | 91.7% | ~230ms |
|
|
133
|
+
| Qwen3 30B | 91.7% | ~3970ms |
|
|
134
|
+
| Grok 4.1 Fast | 91.7% | ~2170ms |
|
|
135
|
+
| GPT-oss 20B | 83.3% | ~2840ms |
|
|
136
|
+
|
|
137
|
+
### Marketing Copy Eval (ELO Rankings)
|
|
138
|
+
|
|
139
|
+
Uses LLM-as-judge (Claude Sonnet) for pairwise comparisons across 4 test scenarios.
|
|
140
|
+
|
|
141
|
+
| Rank | Model | ELO | W | L | D | Notes |
|
|
142
|
+
|------|-------|-----|---|---|---|-------|
|
|
143
|
+
| 1 | Claude Sonnet 4.5 | **1745** | 31 | 3 | 0 | Dominant winner |
|
|
144
|
+
| 2 | Grok 4.1 Fast | 1595 | 22 | 12 | 0 | Strong creative |
|
|
145
|
+
| 3 | GPT-5 Mini | 1593 | 26 | 8 | 0 | Consistent quality |
|
|
146
|
+
| 4 | Grok 4 Fast | 1558 | 17 | 17 | 0 | Good balance |
|
|
147
|
+
| 5 | Gemini 2.5 Flash | 1503 | 14 | 20 | 0 | Middle tier |
|
|
148
|
+
| 6 | Mistral Medium 3.1 | 1481 | 16 | 18 | 0 | Solid performer |
|
|
149
|
+
| 7 | GPT-oss 20B | 1471 | 19 | 15 | 0 | OSS option |
|
|
150
|
+
| 8 | DeepSeek Chat | 1449 | 10 | 16 | 0 | Value option |
|
|
151
|
+
| 9 | Qwen3 30B | 1371 | 6 | 20 | 0 | Below average |
|
|
152
|
+
| 10 | Llama 3.3 70B | 1231 | 1 | 33 | 0 | Struggled |
|
|
153
|
+
|
|
154
|
+
**Key Insights:**
|
|
155
|
+
- Claude Sonnet 4.5 won 31 of 34 comparisons (91%)
|
|
156
|
+
- Grok models performed unexpectedly well on creative tasks
|
|
157
|
+
- Llama 3.3 70B, despite being strong on classification, struggled with marketing copy
|
|
158
|
+
|
|
159
|
+
## Models
|
|
160
|
+
|
|
161
|
+
Uses model IDs from `language-models` package, routed via `ai-providers`:
|
|
162
|
+
|
|
163
|
+
### Model Tiers
|
|
164
|
+
|
|
165
|
+
| Tier | Description | Models |
|
|
166
|
+
|------|-------------|--------|
|
|
167
|
+
| `best` | Highest capability | opus, o3, gpt-5.1, gemini-pro, deepseek-v3.2, mistral-large-3, qwen3-coder, grok-4 |
|
|
168
|
+
| `fast` | Good balance | sonnet, gpt-5-mini, flash, llama-3.3-70b, mistral-medium-3.1, qwen3-30b, grok-4.1-fast |
|
|
169
|
+
| `cheap` | Cost-optimized | haiku, gpt-5-nano, ministral-14b |
|
|
170
|
+
|
|
171
|
+
### Full Model List (December 2025)
|
|
172
|
+
|
|
173
|
+
- **Anthropic**: `opus`, `sonnet`, `haiku`
|
|
174
|
+
- **OpenAI**: `openai/gpt-5.1`, `openai/gpt-5-mini`, `openai/gpt-5-nano`, `openai/o3`
|
|
175
|
+
- **OpenAI OSS**: `openai/gpt-oss-120b`, `openai/gpt-oss-20b` (open source models)
|
|
176
|
+
- **Google**: `gemini-pro`, `flash`
|
|
177
|
+
- **Meta**: `meta-llama/llama-4-maverick`, `meta-llama/llama-3.3-70b-instruct`
|
|
178
|
+
- **DeepSeek**: `deepseek/deepseek-v3.2`, `deepseek/deepseek-v3.2-speciale`, `deepseek/deepseek-chat`
|
|
179
|
+
- **Mistral**: `mistralai/mistral-large-2512` (Mistral Large 3), `mistralai/mistral-medium-3.1`, `mistralai/ministral-14b-2512`
|
|
180
|
+
- **Qwen**: `qwen/qwen3-coder`, `qwen/qwen3-30b-a3b`, `qwen/qwen3-next-80b-a3b-instruct`
|
|
181
|
+
- **xAI**: `x-ai/grok-4`, `x-ai/grok-4.1-fast`, `x-ai/grok-4-fast`
|
|
182
|
+
|
|
183
|
+
## Environment
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
# Use AI Gateway (recommended)
|
|
187
|
+
AI_GATEWAY_URL=https://gateway.ai.cloudflare.com/v1/...
|
|
188
|
+
AI_GATEWAY_TOKEN=...
|
|
189
|
+
|
|
190
|
+
# Or direct API keys
|
|
191
|
+
ANTHROPIC_API_KEY=sk-ant-...
|
|
192
|
+
OPENAI_API_KEY=sk-...
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
## Adding Evals
|
|
196
|
+
|
|
197
|
+
### Vitest-Based Evals
|
|
198
|
+
|
|
199
|
+
1. Create a new test file in `test/evals/`
|
|
200
|
+
2. Import functions and models:
|
|
201
|
+
```typescript
|
|
202
|
+
import { code, ai, list } from '../../src/primitives.js'
|
|
203
|
+
import { EVAL_MODELS, type EvalModel } from '../../src/eval/models.js'
|
|
204
|
+
```
|
|
205
|
+
3. Use `describe.skipIf(!hasAPI)` to skip when no API access
|
|
206
|
+
4. Loop over models with `for (const model of models)`
|
|
207
|
+
|
|
208
|
+
### Custom Runner Evals
|
|
209
|
+
|
|
210
|
+
1. Add test cases to `evals/run-evals.ts`
|
|
211
|
+
2. Use `runEval()` with `task` function and `scorers` array
|
|
212
|
+
3. Use `createModelVariants({ tiers: ['fast'] })` to filter models
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Classification Eval
|
|
3
|
+
*
|
|
4
|
+
* Tests model ability to classify inputs correctly.
|
|
5
|
+
* Includes sentiment analysis, category classification, and boolean questions.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { evalite } from 'evalite'
|
|
9
|
+
import { generateObject } from '../src/generate.js'
|
|
10
|
+
import { schema } from '../src/schema.js'
|
|
11
|
+
import { createModelVariants, type EvalModel } from '../src/eval/models.js'
|
|
12
|
+
|
|
13
|
+
// Classification test cases
|
|
14
|
+
const TEST_CASES = [
|
|
15
|
+
// Sentiment
|
|
16
|
+
{ text: 'This product exceeded my expectations!', expected: 'positive', options: ['positive', 'negative', 'neutral'] },
|
|
17
|
+
{ text: 'The delivery was late and packaging damaged.', expected: 'negative', options: ['positive', 'negative', 'neutral'] },
|
|
18
|
+
{ text: 'The product arrived as described.', expected: 'neutral', options: ['positive', 'negative', 'neutral'] },
|
|
19
|
+
|
|
20
|
+
// Support ticket classification
|
|
21
|
+
{ text: 'I need to reset my password', expected: 'account', options: ['account', 'billing', 'technical', 'shipping'] },
|
|
22
|
+
{ text: 'When will my refund be processed?', expected: 'billing', options: ['account', 'billing', 'technical', 'shipping'] },
|
|
23
|
+
{ text: 'The app crashes when uploading images', expected: 'technical', options: ['account', 'billing', 'technical', 'shipping'] },
|
|
24
|
+
{ text: 'My package shows delivered but I never received it', expected: 'shipping', options: ['account', 'billing', 'technical', 'shipping'] },
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
const modelVariants = createModelVariants({ tiers: ['fast', 'cheap'] })
|
|
28
|
+
|
|
29
|
+
evalite.each(modelVariants)('Classification', {
|
|
30
|
+
data: TEST_CASES.map(tc => ({ input: tc, expected: tc.expected })),
|
|
31
|
+
|
|
32
|
+
task: async (input, variant) => {
|
|
33
|
+
const model = variant as EvalModel
|
|
34
|
+
const startTime = Date.now()
|
|
35
|
+
|
|
36
|
+
const enumStr = input.options.join(' | ')
|
|
37
|
+
|
|
38
|
+
const { object, usage } = await generateObject({
|
|
39
|
+
model: model.id,
|
|
40
|
+
schema: schema({
|
|
41
|
+
category: enumStr,
|
|
42
|
+
confidence: 'Confidence 0-1 (number)',
|
|
43
|
+
}),
|
|
44
|
+
prompt: `Classify this text into one of: ${input.options.join(', ')}
|
|
45
|
+
|
|
46
|
+
Text: "${input.text}"`,
|
|
47
|
+
})
|
|
48
|
+
|
|
49
|
+
const latencyMs = Date.now() - startTime
|
|
50
|
+
|
|
51
|
+
return {
|
|
52
|
+
predicted: object.category,
|
|
53
|
+
confidence: object.confidence,
|
|
54
|
+
expected: input.expected,
|
|
55
|
+
text: input.text,
|
|
56
|
+
options: input.options,
|
|
57
|
+
modelId: model.id,
|
|
58
|
+
modelName: model.name,
|
|
59
|
+
latencyMs,
|
|
60
|
+
usage,
|
|
61
|
+
}
|
|
62
|
+
},
|
|
63
|
+
|
|
64
|
+
scorers: [
|
|
65
|
+
// Accuracy
|
|
66
|
+
{
|
|
67
|
+
name: 'Accuracy',
|
|
68
|
+
description: 'Whether classification is correct',
|
|
69
|
+
scorer: ({ output, expected }) => ({
|
|
70
|
+
score: output.predicted === expected ? 1 : 0,
|
|
71
|
+
}),
|
|
72
|
+
},
|
|
73
|
+
|
|
74
|
+
// Valid category
|
|
75
|
+
{
|
|
76
|
+
name: 'Valid Category',
|
|
77
|
+
description: 'Whether output is a valid option',
|
|
78
|
+
scorer: ({ output }) => ({
|
|
79
|
+
score: (output.options as string[]).includes(output.predicted as string) ? 1 : 0,
|
|
80
|
+
}),
|
|
81
|
+
},
|
|
82
|
+
|
|
83
|
+
// Calibration
|
|
84
|
+
{
|
|
85
|
+
name: 'Calibration',
|
|
86
|
+
description: 'Confidence matches accuracy',
|
|
87
|
+
scorer: ({ output, expected }) => {
|
|
88
|
+
const correct = output.predicted === expected
|
|
89
|
+
const conf = output.confidence as number
|
|
90
|
+
|
|
91
|
+
// High confidence when correct, low when wrong = well calibrated
|
|
92
|
+
if (correct && conf >= 0.7) return { score: 1 }
|
|
93
|
+
if (!correct && conf <= 0.5) return { score: 0.8 }
|
|
94
|
+
if (correct && conf < 0.5) return { score: 0.6 } // Underconfident
|
|
95
|
+
if (!correct && conf > 0.7) return { score: 0.2 } // Overconfident
|
|
96
|
+
return { score: 0.5 }
|
|
97
|
+
},
|
|
98
|
+
},
|
|
99
|
+
],
|
|
100
|
+
|
|
101
|
+
columns: ({ output, expected }) => [
|
|
102
|
+
{ label: 'Model', value: output.modelName },
|
|
103
|
+
{ label: 'Expected', value: expected },
|
|
104
|
+
{ label: 'Got', value: output.predicted },
|
|
105
|
+
{ label: 'Correct', value: output.predicted === expected ? 'Yes' : 'No' },
|
|
106
|
+
{ label: 'Confidence', value: `${((output.confidence as number) * 100).toFixed(0)}%` },
|
|
107
|
+
],
|
|
108
|
+
})
|
|
@@ -0,0 +1,370 @@
|
|
|
1
|
+
#!/usr/bin/env npx tsx
|
|
2
|
+
/**
|
|
3
|
+
* Marketing Copy Eval with LLM-as-Judge ELO Ranking
|
|
4
|
+
*
|
|
5
|
+
* Generates marketing copy (title, description, hero headline/subhead, CTAs)
|
|
6
|
+
* and uses pairwise comparison with an LLM judge to create ELO rankings.
|
|
7
|
+
*
|
|
8
|
+
* Usage:
|
|
9
|
+
* npx tsx evals/marketing.eval.ts
|
|
10
|
+
* npx tsx evals/marketing.eval.ts --judge=opus # Use specific judge model
|
|
11
|
+
* npx tsx evals/marketing.eval.ts --judge=haiku # Test cheap judge
|
|
12
|
+
* npx tsx evals/marketing.eval.ts --judge=flash # Test fast judge
|
|
13
|
+
* npx tsx evals/marketing.eval.ts --all # Run all tiers
|
|
14
|
+
* npx tsx evals/marketing.eval.ts --all --judge=haiku # All tiers + cheap judge
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
// Load .env from project root
|
|
18
|
+
import { config } from 'dotenv'
|
|
19
|
+
import { resolve } from 'path'
|
|
20
|
+
config({ path: resolve(import.meta.dirname, '../../../.env') })
|
|
21
|
+
|
|
22
|
+
import { generateObject } from '../src/generate.js'
|
|
23
|
+
import { schema } from '../src/schema.js'
|
|
24
|
+
import { EVAL_MODELS, type EvalModel, type ModelTier } from '../src/eval/models.js'
|
|
25
|
+
|
|
26
|
+
// Parse CLI args
|
|
27
|
+
const args = process.argv.slice(2)
|
|
28
|
+
const judgeArg = args.find(a => a.startsWith('--judge='))
|
|
29
|
+
const JUDGE_MODEL = judgeArg ? judgeArg.split('=')[1] : 'sonnet'
|
|
30
|
+
const runAll = args.includes('--all')
|
|
31
|
+
|
|
32
|
+
const tiers: ModelTier[] = runAll ? ['best', 'fast', 'cheap'] : ['fast']
|
|
33
|
+
|
|
34
|
+
// Marketing copy schema
|
|
35
|
+
const marketingCopySchema = schema({
|
|
36
|
+
title: 'Product/page title (5-10 words)',
|
|
37
|
+
description: 'Meta description for SEO (150-160 characters)',
|
|
38
|
+
hero: {
|
|
39
|
+
headline: 'Hero headline (5-8 words, compelling)',
|
|
40
|
+
subhead: 'Supporting subheadline (10-20 words)',
|
|
41
|
+
primaryCTA: 'Primary call-to-action button text (2-4 words)',
|
|
42
|
+
secondaryCTA: 'Secondary call-to-action link text (3-6 words)',
|
|
43
|
+
},
|
|
44
|
+
})
|
|
45
|
+
|
|
46
|
+
// Test cases - different product/service scenarios
|
|
47
|
+
const TEST_CASES = [
|
|
48
|
+
{
|
|
49
|
+
name: 'SaaS Analytics Platform',
|
|
50
|
+
prompt: `Create marketing copy for a B2B SaaS analytics platform called "InsightFlow" that helps companies understand their customer behavior with AI-powered insights. Target audience: Product managers and growth teams at mid-size tech companies.`,
|
|
51
|
+
},
|
|
52
|
+
{
|
|
53
|
+
name: 'E-commerce Fashion Brand',
|
|
54
|
+
prompt: `Create marketing copy for a sustainable fashion e-commerce brand called "EcoThread" that sells organic, ethically-made clothing. Target audience: Environmentally conscious millennials aged 25-35.`,
|
|
55
|
+
},
|
|
56
|
+
{
|
|
57
|
+
name: 'Developer Tool',
|
|
58
|
+
prompt: `Create marketing copy for a CLI tool called "DeployFast" that simplifies Kubernetes deployments with one-command deploys. Target audience: DevOps engineers and backend developers.`,
|
|
59
|
+
},
|
|
60
|
+
{
|
|
61
|
+
name: 'Mobile Fitness App',
|
|
62
|
+
prompt: `Create marketing copy for a fitness app called "FitPulse" that uses AI to create personalized workout plans and tracks progress with smart watch integration. Target audience: Busy professionals aged 30-45.`,
|
|
63
|
+
},
|
|
64
|
+
]
|
|
65
|
+
|
|
66
|
+
interface MarketingCopy {
|
|
67
|
+
title: string
|
|
68
|
+
description: string
|
|
69
|
+
hero: {
|
|
70
|
+
headline: string
|
|
71
|
+
subhead: string
|
|
72
|
+
primaryCTA: string
|
|
73
|
+
secondaryCTA: string
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
interface GeneratedCopy {
|
|
78
|
+
model: EvalModel
|
|
79
|
+
testCase: typeof TEST_CASES[0]
|
|
80
|
+
copy: MarketingCopy
|
|
81
|
+
latencyMs: number
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
interface ELORating {
|
|
85
|
+
modelId: string
|
|
86
|
+
modelName: string
|
|
87
|
+
rating: number
|
|
88
|
+
wins: number
|
|
89
|
+
losses: number
|
|
90
|
+
draws: number
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
// ELO calculation
|
|
94
|
+
const K_FACTOR = 32
|
|
95
|
+
const INITIAL_ELO = 1500
|
|
96
|
+
|
|
97
|
+
function calculateEloChange(ratingA: number, ratingB: number, scoreA: number): { deltaA: number; deltaB: number } {
|
|
98
|
+
const expectedA = 1 / (1 + Math.pow(10, (ratingB - ratingA) / 400))
|
|
99
|
+
const expectedB = 1 - expectedA
|
|
100
|
+
|
|
101
|
+
const deltaA = K_FACTOR * (scoreA - expectedA)
|
|
102
|
+
const deltaB = K_FACTOR * ((1 - scoreA) - expectedB)
|
|
103
|
+
|
|
104
|
+
return { deltaA, deltaB }
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
// LLM Judge for pairwise comparison
|
|
108
|
+
async function judgePair(
|
|
109
|
+
copyA: MarketingCopy,
|
|
110
|
+
copyB: MarketingCopy,
|
|
111
|
+
testCase: typeof TEST_CASES[0],
|
|
112
|
+
judgeModel: string
|
|
113
|
+
): Promise<'A' | 'B' | 'TIE'> {
|
|
114
|
+
const prompt = `You are an expert marketing copywriter and brand strategist. Compare these two marketing copy options for: ${testCase.name}
|
|
115
|
+
|
|
116
|
+
Context: ${testCase.prompt}
|
|
117
|
+
|
|
118
|
+
=== OPTION A ===
|
|
119
|
+
Title: ${copyA.title}
|
|
120
|
+
Description: ${copyA.description}
|
|
121
|
+
Hero Headline: ${copyA.hero.headline}
|
|
122
|
+
Hero Subhead: ${copyA.hero.subhead}
|
|
123
|
+
Primary CTA: ${copyA.hero.primaryCTA}
|
|
124
|
+
Secondary CTA: ${copyA.hero.secondaryCTA}
|
|
125
|
+
|
|
126
|
+
=== OPTION B ===
|
|
127
|
+
Title: ${copyB.title}
|
|
128
|
+
Description: ${copyB.description}
|
|
129
|
+
Hero Headline: ${copyB.hero.headline}
|
|
130
|
+
Hero Subhead: ${copyB.hero.subhead}
|
|
131
|
+
Primary CTA: ${copyB.hero.primaryCTA}
|
|
132
|
+
Secondary CTA: ${copyB.hero.secondaryCTA}
|
|
133
|
+
|
|
134
|
+
Evaluate based on:
|
|
135
|
+
1. Clarity and impact of messaging
|
|
136
|
+
2. Target audience alignment
|
|
137
|
+
3. Emotional appeal and persuasiveness
|
|
138
|
+
4. CTA effectiveness
|
|
139
|
+
5. Overall brand voice consistency
|
|
140
|
+
|
|
141
|
+
Which option is better? Answer A, B, or TIE if they're roughly equal.`
|
|
142
|
+
|
|
143
|
+
try {
|
|
144
|
+
const { object } = await generateObject({
|
|
145
|
+
model: judgeModel,
|
|
146
|
+
schema: schema({
|
|
147
|
+
reasoning: 'Brief explanation of your judgment (2-3 sentences)',
|
|
148
|
+
winner: 'A | B | TIE',
|
|
149
|
+
}),
|
|
150
|
+
prompt,
|
|
151
|
+
temperature: 0.3,
|
|
152
|
+
})
|
|
153
|
+
|
|
154
|
+
const result = object as { reasoning: string; winner: string }
|
|
155
|
+
const winner = result.winner.toUpperCase().trim()
|
|
156
|
+
|
|
157
|
+
if (winner === 'A' || winner === 'B' || winner === 'TIE') {
|
|
158
|
+
return winner
|
|
159
|
+
}
|
|
160
|
+
return 'TIE'
|
|
161
|
+
} catch (err) {
|
|
162
|
+
console.error(` Judge error: ${err}`)
|
|
163
|
+
return 'TIE'
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
// Generate marketing copy for a model
|
|
168
|
+
async function generateCopy(model: EvalModel, testCase: typeof TEST_CASES[0]): Promise<GeneratedCopy> {
|
|
169
|
+
const start = Date.now()
|
|
170
|
+
|
|
171
|
+
const { object } = await generateObject({
|
|
172
|
+
model: model.id,
|
|
173
|
+
schema: marketingCopySchema,
|
|
174
|
+
prompt: testCase.prompt,
|
|
175
|
+
temperature: 0.7,
|
|
176
|
+
})
|
|
177
|
+
|
|
178
|
+
return {
|
|
179
|
+
model,
|
|
180
|
+
testCase,
|
|
181
|
+
copy: object as MarketingCopy,
|
|
182
|
+
latencyMs: Date.now() - start,
|
|
183
|
+
}
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
// Run pairwise comparisons and calculate ELO
|
|
187
|
+
async function runEloTournament(
|
|
188
|
+
copies: GeneratedCopy[],
|
|
189
|
+
judgeModel: string
|
|
190
|
+
): Promise<ELORating[]> {
|
|
191
|
+
// Initialize ELO ratings
|
|
192
|
+
const ratings: Map<string, ELORating> = new Map()
|
|
193
|
+
|
|
194
|
+
for (const copy of copies) {
|
|
195
|
+
if (!ratings.has(copy.model.id)) {
|
|
196
|
+
ratings.set(copy.model.id, {
|
|
197
|
+
modelId: copy.model.id,
|
|
198
|
+
modelName: copy.model.name,
|
|
199
|
+
rating: INITIAL_ELO,
|
|
200
|
+
wins: 0,
|
|
201
|
+
losses: 0,
|
|
202
|
+
draws: 0,
|
|
203
|
+
})
|
|
204
|
+
}
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
// Group copies by test case
|
|
208
|
+
const byTestCase = new Map<string, GeneratedCopy[]>()
|
|
209
|
+
for (const copy of copies) {
|
|
210
|
+
const key = copy.testCase.name
|
|
211
|
+
if (!byTestCase.has(key)) {
|
|
212
|
+
byTestCase.set(key, [])
|
|
213
|
+
}
|
|
214
|
+
byTestCase.get(key)!.push(copy)
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
console.log(`\n⚖️ Running pairwise comparisons with ${JUDGE_MODEL} as judge...\n`)
|
|
218
|
+
|
|
219
|
+
let comparisonCount = 0
|
|
220
|
+
const totalComparisons = Array.from(byTestCase.values()).reduce(
|
|
221
|
+
(sum, copies) => sum + (copies.length * (copies.length - 1)) / 2,
|
|
222
|
+
0
|
|
223
|
+
)
|
|
224
|
+
|
|
225
|
+
// Run pairwise comparisons within each test case
|
|
226
|
+
for (const [testCaseName, testCaseCopies] of byTestCase) {
|
|
227
|
+
console.log(` 📝 ${testCaseName}:`)
|
|
228
|
+
|
|
229
|
+
for (let i = 0; i < testCaseCopies.length; i++) {
|
|
230
|
+
for (let j = i + 1; j < testCaseCopies.length; j++) {
|
|
231
|
+
const copyA = testCaseCopies[i]
|
|
232
|
+
const copyB = testCaseCopies[j]
|
|
233
|
+
|
|
234
|
+
comparisonCount++
|
|
235
|
+
process.stdout.write(` ${comparisonCount}/${totalComparisons} ${copyA.model.name} vs ${copyB.model.name}... `)
|
|
236
|
+
|
|
237
|
+
const winner = await judgePair(copyA.copy, copyB.copy, copyA.testCase, judgeModel)
|
|
238
|
+
|
|
239
|
+
const ratingA = ratings.get(copyA.model.id)!
|
|
240
|
+
const ratingB = ratings.get(copyB.model.id)!
|
|
241
|
+
|
|
242
|
+
let scoreA: number
|
|
243
|
+
if (winner === 'A') {
|
|
244
|
+
scoreA = 1
|
|
245
|
+
ratingA.wins++
|
|
246
|
+
ratingB.losses++
|
|
247
|
+
console.log(`${copyA.model.name} wins`)
|
|
248
|
+
} else if (winner === 'B') {
|
|
249
|
+
scoreA = 0
|
|
250
|
+
ratingA.losses++
|
|
251
|
+
ratingB.wins++
|
|
252
|
+
console.log(`${copyB.model.name} wins`)
|
|
253
|
+
} else {
|
|
254
|
+
scoreA = 0.5
|
|
255
|
+
ratingA.draws++
|
|
256
|
+
ratingB.draws++
|
|
257
|
+
console.log(`TIE`)
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
const { deltaA, deltaB } = calculateEloChange(ratingA.rating, ratingB.rating, scoreA)
|
|
261
|
+
ratingA.rating += deltaA
|
|
262
|
+
ratingB.rating += deltaB
|
|
263
|
+
}
|
|
264
|
+
}
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
// Sort by ELO rating
|
|
268
|
+
return Array.from(ratings.values()).sort((a, b) => b.rating - a.rating)
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
// Main
/**
 * Entry point for the marketing-copy eval.
 *
 * Pipeline: (1) select models by tier, (2) generate copy for every
 * model × test case in parallel per test case, (3) run the ELO tournament
 * with the LLM judge, (4) print rankings, timing, and sample outputs from
 * the top three models.
 */
async function main() {
  console.log('╔════════════════════════════════════════════════════════════════╗')
  console.log('║ Marketing Copy Eval (LLM-as-Judge) ║')
  console.log('╚════════════════════════════════════════════════════════════════╝')
  console.log('')
  console.log(`Judge Model: ${JUDGE_MODEL}`)
  // NOTE(review): `tiers` is module-level state — presumably parsed from CLI
  // args elsewhere in the file; confirm at its definition.
  console.log(`Tiers: ${tiers.join(', ')}`)

  // Get models to test — only those whose tier was requested.
  const models = EVAL_MODELS.filter(m => tiers.includes(m.tier))
  console.log(`Models: ${models.map(m => m.name).join(', ')}`)
  console.log(`Test Cases: ${TEST_CASES.length}`)
  console.log('')

  // Generate copy from each model for each test case
  console.log('🎨 Generating marketing copy...\n')

  const allCopies: GeneratedCopy[] = []
  const startTime = Date.now()

  // Test cases run sequentially; within a test case all models run in parallel.
  for (const testCase of TEST_CASES) {
    console.log(` 📦 ${testCase.name}:`)

    const jobs = models.map(async model => {
      try {
        const copy = await generateCopy(model, testCase)
        console.log(` ✓ ${model.name} (${copy.latencyMs}ms)`)
        return copy
      } catch (err) {
        // A failed generation is logged and dropped — the tournament proceeds
        // with whatever copies succeeded.
        console.log(` ✗ ${model.name}: ${err}`)
        return null
      }
    })

    const results = await Promise.all(jobs)
    // Type-guard filter strips the nulls from failed generations.
    allCopies.push(...results.filter((r): r is GeneratedCopy => r !== null))
  }

  const generateTime = Date.now() - startTime
  console.log(`\n Generated ${allCopies.length} copies in ${(generateTime / 1000).toFixed(1)}s`)

  // Run ELO tournament
  const tournamentStart = Date.now()
  const eloRatings = await runEloTournament(allCopies, JUDGE_MODEL)
  const tournamentTime = Date.now() - tournamentStart

  // Display results
  console.log('')
  console.log('╔════════════════════════════════════════════════════════════════╗')
  console.log('║ ELO Rankings ║')
  console.log('╚════════════════════════════════════════════════════════════════╝')
  console.log('')
  console.log(' Rank | Model | ELO | W | L | D |')
  console.log(' -----|------------------------|--------|-----|-----|-----|')

  // Fixed-width table: pad widths match the header rule above.
  eloRatings.forEach((rating, idx) => {
    const rank = `${idx + 1}`.padStart(4)
    const name = rating.modelName.padEnd(22)
    const elo = Math.round(rating.rating).toString().padStart(6)
    const wins = rating.wins.toString().padStart(3)
    const losses = rating.losses.toString().padStart(3)
    const draws = rating.draws.toString().padStart(3)
    console.log(` ${rank} | ${name} | ${elo} | ${wins} | ${losses} | ${draws} |`)
  })

  console.log('')
  console.log(` Judge: ${JUDGE_MODEL}`)
  console.log(` Generation Time: ${(generateTime / 1000).toFixed(1)}s`)
  console.log(` Tournament Time: ${(tournamentTime / 1000).toFixed(1)}s`)
  console.log(` Total Time: ${((generateTime + tournamentTime) / 1000).toFixed(1)}s`)

  // Show sample outputs from top 3
  console.log('')
  console.log('╔════════════════════════════════════════════════════════════════╗')
  console.log('║ Sample Outputs (Top 3) ║')
  console.log('╚════════════════════════════════════════════════════════════════╝')

  // Samples come from the first test case only, for each of the top 3 models.
  const top3Models = eloRatings.slice(0, 3).map(r => r.modelId)
  const sampleTestCase = TEST_CASES[0]

  for (const modelId of top3Models) {
    const copy = allCopies.find(c => c.model.id === modelId && c.testCase.name === sampleTestCase.name)
    // Skip silently if this model's generation for the sample case failed.
    if (copy) {
      const rank = eloRatings.findIndex(r => r.modelId === modelId) + 1
      console.log(`\n #${rank} ${copy.model.name} (${sampleTestCase.name}):`)
      console.log(` ─────────────────────────────────────────`)
      console.log(` Title: ${copy.copy.title}`)
      console.log(` Description: ${copy.copy.description}`)
      console.log(` Headline: ${copy.copy.hero.headline}`)
      console.log(` Subhead: ${copy.copy.hero.subhead}`)
      console.log(` Primary CTA: [${copy.copy.hero.primaryCTA}]`)
      console.log(` Secondary CTA: ${copy.copy.hero.secondaryCTA}`)
    }
  }

  console.log('')
}
|
|
369
|
+
|
|
370
|
+
// Kick off the eval; report any unhandled rejection on stderr.
main().catch((err) => console.error(err))
|