ai-functions 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +5 -0
- package/.turbo/turbo-test.log +105 -0
- package/README.md +190 -86
- package/TODO.md +138 -0
- package/dist/ai-promise.d.ts +219 -0
- package/dist/ai-promise.d.ts.map +1 -0
- package/dist/ai-promise.js +610 -0
- package/dist/ai-promise.js.map +1 -0
- package/dist/ai.d.ts +285 -0
- package/dist/ai.d.ts.map +1 -0
- package/dist/ai.js +842 -0
- package/dist/ai.js.map +1 -0
- package/dist/batch/anthropic.d.ts +23 -0
- package/dist/batch/anthropic.d.ts.map +1 -0
- package/dist/batch/anthropic.js +257 -0
- package/dist/batch/anthropic.js.map +1 -0
- package/dist/batch/bedrock.d.ts +64 -0
- package/dist/batch/bedrock.d.ts.map +1 -0
- package/dist/batch/bedrock.js +586 -0
- package/dist/batch/bedrock.js.map +1 -0
- package/dist/batch/cloudflare.d.ts +37 -0
- package/dist/batch/cloudflare.d.ts.map +1 -0
- package/dist/batch/cloudflare.js +289 -0
- package/dist/batch/cloudflare.js.map +1 -0
- package/dist/batch/google.d.ts +41 -0
- package/dist/batch/google.d.ts.map +1 -0
- package/dist/batch/google.js +360 -0
- package/dist/batch/google.js.map +1 -0
- package/dist/batch/index.d.ts +31 -0
- package/dist/batch/index.d.ts.map +1 -0
- package/dist/batch/index.js +31 -0
- package/dist/batch/index.js.map +1 -0
- package/dist/batch/memory.d.ts +44 -0
- package/dist/batch/memory.d.ts.map +1 -0
- package/dist/batch/memory.js +188 -0
- package/dist/batch/memory.js.map +1 -0
- package/dist/batch/openai.d.ts +37 -0
- package/dist/batch/openai.d.ts.map +1 -0
- package/dist/batch/openai.js +403 -0
- package/dist/batch/openai.js.map +1 -0
- package/dist/batch-map.d.ts +125 -0
- package/dist/batch-map.d.ts.map +1 -0
- package/dist/batch-map.js +406 -0
- package/dist/batch-map.js.map +1 -0
- package/dist/batch-queue.d.ts +273 -0
- package/dist/batch-queue.d.ts.map +1 -0
- package/dist/batch-queue.js +271 -0
- package/dist/batch-queue.js.map +1 -0
- package/dist/context.d.ts +133 -0
- package/dist/context.d.ts.map +1 -0
- package/dist/context.js +267 -0
- package/dist/context.js.map +1 -0
- package/dist/embeddings.d.ts +123 -0
- package/dist/embeddings.d.ts.map +1 -0
- package/dist/embeddings.js +170 -0
- package/dist/embeddings.js.map +1 -0
- package/dist/eval/index.d.ts +8 -0
- package/dist/eval/index.d.ts.map +1 -0
- package/dist/eval/index.js +8 -0
- package/dist/eval/index.js.map +1 -0
- package/dist/eval/models.d.ts +66 -0
- package/dist/eval/models.d.ts.map +1 -0
- package/dist/eval/models.js +120 -0
- package/dist/eval/models.js.map +1 -0
- package/dist/eval/runner.d.ts +64 -0
- package/dist/eval/runner.d.ts.map +1 -0
- package/dist/eval/runner.js +148 -0
- package/dist/eval/runner.js.map +1 -0
- package/dist/generate.d.ts +168 -0
- package/dist/generate.d.ts.map +1 -0
- package/dist/generate.js +174 -0
- package/dist/generate.js.map +1 -0
- package/dist/index.d.ts +29 -4
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +53 -52
- package/dist/index.js.map +1 -1
- package/dist/primitives.d.ts +292 -0
- package/dist/primitives.d.ts.map +1 -0
- package/dist/primitives.js +471 -0
- package/dist/primitives.js.map +1 -0
- package/dist/providers/cloudflare.d.ts +9 -0
- package/dist/providers/cloudflare.d.ts.map +1 -0
- package/dist/providers/cloudflare.js +9 -0
- package/dist/providers/cloudflare.js.map +1 -0
- package/dist/providers/index.d.ts +9 -0
- package/dist/providers/index.d.ts.map +1 -0
- package/dist/providers/index.js +9 -0
- package/dist/providers/index.js.map +1 -0
- package/dist/schema.d.ts +54 -0
- package/dist/schema.d.ts.map +1 -0
- package/dist/schema.js +109 -0
- package/dist/schema.js.map +1 -0
- package/dist/template.d.ts +73 -0
- package/dist/template.d.ts.map +1 -0
- package/dist/template.js +129 -0
- package/dist/template.js.map +1 -0
- package/dist/types.d.ts +474 -106
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +4 -8
- package/dist/types.js.map +1 -1
- package/evalite.config.ts +19 -0
- package/evals/README.md +212 -0
- package/evals/classification.eval.ts +108 -0
- package/evals/marketing.eval.ts +370 -0
- package/evals/math.eval.ts +94 -0
- package/evals/run-evals.ts +166 -0
- package/evals/structured-output.eval.ts +143 -0
- package/evals/writing.eval.ts +117 -0
- package/examples/batch-blog-posts.ts +160 -0
- package/package.json +57 -57
- package/src/ai-promise.ts +784 -0
- package/src/ai.ts +1183 -0
- package/src/batch/anthropic.ts +375 -0
- package/src/batch/bedrock.ts +801 -0
- package/src/batch/cloudflare.ts +421 -0
- package/src/batch/google.ts +491 -0
- package/src/batch/index.ts +31 -0
- package/src/batch/memory.ts +253 -0
- package/src/batch/openai.ts +557 -0
- package/src/batch-map.ts +534 -0
- package/src/batch-queue.ts +493 -0
- package/src/context.ts +332 -0
- package/src/embeddings.ts +244 -0
- package/src/eval/index.ts +8 -0
- package/src/eval/models.ts +158 -0
- package/src/eval/runner.ts +217 -0
- package/src/generate.ts +245 -0
- package/src/index.ts +154 -0
- package/src/primitives.ts +612 -0
- package/src/providers/cloudflare.ts +15 -0
- package/src/providers/index.ts +14 -0
- package/src/schema.ts +147 -0
- package/src/template.ts +209 -0
- package/src/types.ts +540 -0
- package/test/README.md +105 -0
- package/test/ai-proxy.test.ts +192 -0
- package/test/async-iterators.test.ts +327 -0
- package/test/batch-background.test.ts +482 -0
- package/test/batch-blog-posts.test.ts +387 -0
- package/test/blog-generation.test.ts +510 -0
- package/test/browse-read.test.ts +611 -0
- package/test/core-functions.test.ts +694 -0
- package/test/decide.test.ts +393 -0
- package/test/define.test.ts +274 -0
- package/test/e2e-bedrock-manual.ts +163 -0
- package/test/e2e-bedrock.test.ts +191 -0
- package/test/e2e-flex-gateway.ts +157 -0
- package/test/e2e-flex-manual.ts +183 -0
- package/test/e2e-flex.test.ts +209 -0
- package/test/e2e-google-manual.ts +178 -0
- package/test/e2e-google.test.ts +216 -0
- package/test/embeddings.test.ts +284 -0
- package/test/evals/define-function.eval.test.ts +379 -0
- package/test/evals/primitives.eval.test.ts +384 -0
- package/test/function-types.test.ts +492 -0
- package/test/generate-core.test.ts +319 -0
- package/test/generate.test.ts +163 -0
- package/test/implicit-batch.test.ts +422 -0
- package/test/schema.test.ts +109 -0
- package/test/tagged-templates.test.ts +302 -0
- package/tsconfig.json +10 -0
- package/vitest.config.ts +42 -0
- package/LICENSE +0 -21
- package/bin/cli.js +0 -5
- package/dist/cli/index.d.ts +0 -10
- package/dist/cli/index.d.ts.map +0 -1
- package/dist/cli/index.js +0 -38
- package/dist/cli/index.js.map +0 -1
- package/dist/cli/index.test.d.ts +0 -2
- package/dist/cli/index.test.d.ts.map +0 -1
- package/dist/cli/index.test.js +0 -35
- package/dist/cli/index.test.js.map +0 -1
- package/dist/constants/models.d.ts +0 -10
- package/dist/constants/models.d.ts.map +0 -1
- package/dist/constants/models.js +0 -12
- package/dist/constants/models.js.map +0 -1
- package/dist/converters/index.d.ts +0 -3
- package/dist/converters/index.d.ts.map +0 -1
- package/dist/converters/index.js +0 -3
- package/dist/converters/index.js.map +0 -1
- package/dist/converters/model.d.ts +0 -4
- package/dist/converters/model.d.ts.map +0 -1
- package/dist/converters/model.js +0 -19
- package/dist/converters/model.js.map +0 -1
- package/dist/converters/schema.d.ts +0 -4
- package/dist/converters/schema.d.ts.map +0 -1
- package/dist/converters/schema.js +0 -25
- package/dist/converters/schema.js.map +0 -1
- package/dist/core/responses.d.ts +0 -5
- package/dist/core/responses.d.ts.map +0 -1
- package/dist/core/responses.js +0 -16
- package/dist/core/responses.js.map +0 -1
- package/dist/core/responses.test.d.ts +0 -2
- package/dist/core/responses.test.d.ts.map +0 -1
- package/dist/core/responses.test.js +0 -31
- package/dist/core/responses.test.js.map +0 -1
- package/dist/errors.d.ts +0 -6
- package/dist/errors.d.ts.map +0 -1
- package/dist/errors.js +0 -9
- package/dist/errors.js.map +0 -1
- package/dist/examples/streaming.test.d.ts +0 -2
- package/dist/examples/streaming.test.d.ts.map +0 -1
- package/dist/examples/streaming.test.js +0 -176
- package/dist/examples/streaming.test.js.map +0 -1
- package/dist/factory/__tests__/index.test.d.ts +0 -2
- package/dist/factory/__tests__/index.test.d.ts.map +0 -1
- package/dist/factory/__tests__/index.test.js +0 -430
- package/dist/factory/__tests__/index.test.js.map +0 -1
- package/dist/factory/__tests__/list.test.d.ts +0 -2
- package/dist/factory/__tests__/list.test.d.ts.map +0 -1
- package/dist/factory/__tests__/list.test.js +0 -92
- package/dist/factory/__tests__/list.test.js.map +0 -1
- package/dist/factory/index.d.ts +0 -20
- package/dist/factory/index.d.ts.map +0 -1
- package/dist/factory/index.js +0 -287
- package/dist/factory/index.js.map +0 -1
- package/dist/factory/index.test.d.ts +0 -2
- package/dist/factory/index.test.d.ts.map +0 -1
- package/dist/factory/index.test.js +0 -287
- package/dist/factory/index.test.js.map +0 -1
- package/dist/factory/list.d.ts +0 -3
- package/dist/factory/list.d.ts.map +0 -1
- package/dist/factory/list.js +0 -221
- package/dist/factory/list.js.map +0 -1
- package/dist/factory/list.test.d.ts +0 -2
- package/dist/factory/list.test.d.ts.map +0 -1
- package/dist/factory/list.test.js +0 -84
- package/dist/factory/list.test.js.map +0 -1
- package/dist/generate/index.d.ts +0 -5
- package/dist/generate/index.d.ts.map +0 -1
- package/dist/generate/index.js +0 -17
- package/dist/generate/index.js.map +0 -1
- package/dist/index.test.d.ts +0 -2
- package/dist/index.test.d.ts.map +0 -1
- package/dist/index.test.js +0 -59
- package/dist/index.test.js.map +0 -1
- package/dist/list/await.d.ts +0 -3
- package/dist/list/await.d.ts.map +0 -1
- package/dist/list/await.js +0 -28
- package/dist/list/await.js.map +0 -1
- package/dist/list/constants.d.ts +0 -4
- package/dist/list/constants.d.ts.map +0 -1
- package/dist/list/constants.js +0 -5
- package/dist/list/constants.js.map +0 -1
- package/dist/list/create-function.d.ts +0 -3
- package/dist/list/create-function.d.ts.map +0 -1
- package/dist/list/create-function.js +0 -11
- package/dist/list/create-function.js.map +0 -1
- package/dist/list/index.d.ts +0 -4
- package/dist/list/index.d.ts.map +0 -1
- package/dist/list/index.js +0 -5
- package/dist/list/index.js.map +0 -1
- package/dist/list/prompt.d.ts +0 -3
- package/dist/list/prompt.d.ts.map +0 -1
- package/dist/list/prompt.js +0 -6
- package/dist/list/prompt.js.map +0 -1
- package/dist/list/schemas.d.ts +0 -4
- package/dist/list/schemas.d.ts.map +0 -1
- package/dist/list/schemas.js +0 -8
- package/dist/list/schemas.js.map +0 -1
- package/dist/list/stream.d.ts +0 -3
- package/dist/list/stream.d.ts.map +0 -1
- package/dist/list/stream.js +0 -33
- package/dist/list/stream.js.map +0 -1
- package/dist/list/types.d.ts +0 -11
- package/dist/list/types.d.ts.map +0 -1
- package/dist/list/types.js +0 -2
- package/dist/list/types.js.map +0 -1
- package/dist/list/validation.d.ts +0 -3
- package/dist/list/validation.d.ts.map +0 -1
- package/dist/list/validation.js +0 -12
- package/dist/list/validation.js.map +0 -1
- package/dist/providers/config.d.ts +0 -4
- package/dist/providers/config.d.ts.map +0 -1
- package/dist/providers/config.js +0 -21
- package/dist/providers/config.js.map +0 -1
- package/dist/providers/config.test.d.ts +0 -2
- package/dist/providers/config.test.d.ts.map +0 -1
- package/dist/providers/config.test.js +0 -37
- package/dist/providers/config.test.js.map +0 -1
- package/dist/proxy/constants.d.ts +0 -4
- package/dist/proxy/constants.d.ts.map +0 -1
- package/dist/proxy/constants.js +0 -5
- package/dist/proxy/constants.js.map +0 -1
- package/dist/proxy/create-function.d.ts +0 -4
- package/dist/proxy/create-function.d.ts.map +0 -1
- package/dist/proxy/create-function.js +0 -24
- package/dist/proxy/create-function.js.map +0 -1
- package/dist/proxy/create-proxy.d.ts +0 -2
- package/dist/proxy/create-proxy.d.ts.map +0 -1
- package/dist/proxy/create-proxy.js +0 -11
- package/dist/proxy/create-proxy.js.map +0 -1
- package/dist/proxy/function-generator.d.ts +0 -9
- package/dist/proxy/function-generator.d.ts.map +0 -1
- package/dist/proxy/function-generator.js +0 -29
- package/dist/proxy/function-generator.js.map +0 -1
- package/dist/proxy/index.d.ts +0 -4
- package/dist/proxy/index.d.ts.map +0 -1
- package/dist/proxy/index.js +0 -4
- package/dist/proxy/index.js.map +0 -1
- package/dist/proxy/prompt.d.ts +0 -2
- package/dist/proxy/prompt.d.ts.map +0 -1
- package/dist/proxy/prompt.js +0 -6
- package/dist/proxy/prompt.js.map +0 -1
- package/dist/proxy/types.d.ts +0 -7
- package/dist/proxy/types.d.ts.map +0 -1
- package/dist/proxy/types.js +0 -2
- package/dist/proxy/types.js.map +0 -1
- package/dist/queue/manager.d.ts +0 -5
- package/dist/queue/manager.d.ts.map +0 -1
- package/dist/queue/manager.js +0 -37
- package/dist/queue/manager.js.map +0 -1
- package/dist/queue/manager.test.d.ts +0 -2
- package/dist/queue/manager.test.d.ts.map +0 -1
- package/dist/queue/manager.test.js +0 -52
- package/dist/queue/manager.test.js.map +0 -1
- package/dist/schema-converter.d.ts +0 -4
- package/dist/schema-converter.d.ts.map +0 -1
- package/dist/schema-converter.js +0 -30
- package/dist/schema-converter.js.map +0 -1
- package/dist/stream/index.d.ts +0 -7
- package/dist/stream/index.d.ts.map +0 -1
- package/dist/stream/index.js +0 -23
- package/dist/stream/index.js.map +0 -1
- package/dist/streaming/utils.d.ts +0 -4
- package/dist/streaming/utils.d.ts.map +0 -1
- package/dist/streaming/utils.js +0 -131
- package/dist/streaming/utils.js.map +0 -1
- package/dist/streaming/utils.test.d.ts +0 -2
- package/dist/streaming/utils.test.d.ts.map +0 -1
- package/dist/streaming/utils.test.js +0 -84
- package/dist/streaming/utils.test.js.map +0 -1
- package/dist/templates/result.d.ts +0 -7
- package/dist/templates/result.d.ts.map +0 -1
- package/dist/templates/result.js +0 -40
- package/dist/templates/result.js.map +0 -1
- package/dist/templates/result.test.d.ts +0 -2
- package/dist/templates/result.test.d.ts.map +0 -1
- package/dist/templates/result.test.js +0 -75
- package/dist/templates/result.test.js.map +0 -1
- package/dist/test/setup.d.ts +0 -2
- package/dist/test/setup.d.ts.map +0 -1
- package/dist/test/setup.js +0 -21
- package/dist/test/setup.js.map +0 -1
- package/dist/test-types.d.ts +0 -13
- package/dist/test-types.d.ts.map +0 -1
- package/dist/test-types.js +0 -55
- package/dist/test-types.js.map +0 -1
- package/dist/types/index.d.ts +0 -4
- package/dist/types/index.d.ts.map +0 -1
- package/dist/types/index.js +0 -4
- package/dist/types/index.js.map +0 -1
- package/dist/types/list.d.ts +0 -10
- package/dist/types/list.d.ts.map +0 -1
- package/dist/types/list.js +0 -2
- package/dist/types/list.js.map +0 -1
- package/dist/types/model.d.ts +0 -7
- package/dist/types/model.d.ts.map +0 -1
- package/dist/types/model.js +0 -2
- package/dist/types/model.js.map +0 -1
- package/dist/types/options.d.ts +0 -25
- package/dist/types/options.d.ts.map +0 -1
- package/dist/types/options.js +0 -2
- package/dist/types/options.js.map +0 -1
- package/dist/types/schema.d.ts +0 -5
- package/dist/types/schema.d.ts.map +0 -1
- package/dist/types/schema.js +0 -2
- package/dist/types/schema.js.map +0 -1
- package/dist/utils/__tests__/request-handler.test.d.ts +0 -2
- package/dist/utils/__tests__/request-handler.test.d.ts.map +0 -1
- package/dist/utils/__tests__/request-handler.test.js +0 -134
- package/dist/utils/__tests__/request-handler.test.js.map +0 -1
- package/dist/utils/__tests__/schema.test.d.ts +0 -2
- package/dist/utils/__tests__/schema.test.d.ts.map +0 -1
- package/dist/utils/__tests__/schema.test.js +0 -49
- package/dist/utils/__tests__/schema.test.js.map +0 -1
- package/dist/utils/__tests__/stream-progress.test.d.ts +0 -2
- package/dist/utils/__tests__/stream-progress.test.d.ts.map +0 -1
- package/dist/utils/__tests__/stream-progress.test.js +0 -85
- package/dist/utils/__tests__/stream-progress.test.js.map +0 -1
- package/dist/utils/index.d.ts +0 -2
- package/dist/utils/index.d.ts.map +0 -1
- package/dist/utils/index.js +0 -2
- package/dist/utils/index.js.map +0 -1
- package/dist/utils/request-handler.d.ts +0 -17
- package/dist/utils/request-handler.d.ts.map +0 -1
- package/dist/utils/request-handler.js +0 -105
- package/dist/utils/request-handler.js.map +0 -1
- package/dist/utils/schema.d.ts +0 -11
- package/dist/utils/schema.d.ts.map +0 -1
- package/dist/utils/schema.js +0 -51
- package/dist/utils/schema.js.map +0 -1
- package/dist/utils/stream-progress.d.ts +0 -17
- package/dist/utils/stream-progress.d.ts.map +0 -1
- package/dist/utils/stream-progress.js +0 -86
- package/dist/utils/stream-progress.js.map +0 -1
- package/dist/utils/validation.d.ts +0 -3
- package/dist/utils/validation.d.ts.map +0 -1
- package/dist/utils/validation.js +0 -30
- package/dist/utils/validation.js.map +0 -1
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import { defineConfig } from 'evalite/config'
|
|
2
|
+
|
|
3
|
+
export default defineConfig({ // Evalite configuration for the eval suite; the literal values below are handed to defineConfig as-is
|
|
4
|
+
// Single trial by default (override per-eval for LLM-as-judge)
|
|
5
|
+
trialCount: 1,
|
|
6
|
+
|
|
7
|
+
// Allow longer timeouts for API calls (60_000 is presumably milliseconds, i.e. 60 s; confirm against the evalite docs)
|
|
8
|
+
testTimeout: 60_000,
|
|
9
|
+
|
|
10
|
+
// Run up to 5 evals in parallel (be nice to rate limits)
|
|
11
|
+
maxConcurrency: 5,
|
|
12
|
+
|
|
13
|
+
// Fail CI if average score drops below 70%
|
|
14
|
+
scoreThreshold: 70,
|
|
15
|
+
|
|
16
|
+
server: { // NOTE(review): presumably the port of evalite's local results server; confirm
|
|
17
|
+
port: 3006,
|
|
18
|
+
},
|
|
19
|
+
})
|
package/evals/README.md
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
# AI Functions Eval Suite
|
|
2
|
+
|
|
3
|
+
Evaluations for ai-functions using both vitest-based tests and a custom eval runner.
|
|
4
|
+
|
|
5
|
+
## Quick Start
|
|
6
|
+
|
|
7
|
+
### Vitest-Based Evals (Recommended)
|
|
8
|
+
|
|
9
|
+
Tests the core AI primitives (`code`, `ai`, `list`, `is`, `defineFunction`, etc.) with real AI calls:
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
# Run all eval tests
|
|
13
|
+
pnpm test:evals
|
|
14
|
+
|
|
15
|
+
# Run primitives eval (code, ai, list, is, etc.)
|
|
16
|
+
pnpm test:evals:primitives
|
|
17
|
+
|
|
18
|
+
# Run defineFunction eval
|
|
19
|
+
pnpm test:evals:define
|
|
20
|
+
|
|
21
|
+
# Run with specific model
|
|
22
|
+
MODEL=sonnet pnpm test:evals
|
|
23
|
+
|
|
24
|
+
# Run with specific tiers
|
|
25
|
+
EVAL_TIERS=best,fast pnpm test:evals
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
### Custom Runner Evals
|
|
29
|
+
|
|
30
|
+
Math and classification evals with detailed scoring:
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
# Run all evals (math + classification)
|
|
34
|
+
pnpm eval
|
|
35
|
+
|
|
36
|
+
# Run specific eval
|
|
37
|
+
pnpm eval:math
|
|
38
|
+
pnpm eval:class
|
|
39
|
+
|
|
40
|
+
# Run all tiers (best, fast, cheap)
|
|
41
|
+
pnpm eval:all
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Eval Suites
|
|
45
|
+
|
|
46
|
+
### Vitest Evals (test/evals/)
|
|
47
|
+
|
|
48
|
+
| Test Suite | Functions Tested | Test Cases |
|
|
49
|
+
|------------|------------------|------------|
|
|
50
|
+
| `primitives.eval.test.ts` | `code()`, `ai()`, `list()`, `is()`, `summarize()`, `extract()`, `write()`, `lists()` | Code generation, text generation, classification, extraction |
|
|
51
|
+
| `define-function.eval.test.ts` | `defineFunction()`, `define.generative()`, `define.code()` | Generative functions, code functions, structured outputs |
|
|
52
|
+
|
|
53
|
+
### Custom Runner Evals (evals/)
|
|
54
|
+
|
|
55
|
+
| Eval | Tests | Scoring |
|
|
56
|
+
|------|-------|---------|
|
|
57
|
+
| `Math` | Arithmetic, word problems | Correct answer + shows work |
|
|
58
|
+
| `Classification` | Sentiment, support tickets | Accuracy + calibration |
|
|
59
|
+
| `Marketing` | Marketing copy generation | LLM-as-judge ELO ranking |
|
|
60
|
+
|
|
61
|
+
### Marketing Copy Eval (LLM-as-Judge)
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
# Run marketing eval (fast tier only)
|
|
65
|
+
pnpm eval:marketing
|
|
66
|
+
|
|
67
|
+
# Run with all tiers
|
|
68
|
+
pnpm eval:marketing:all
|
|
69
|
+
|
|
70
|
+
# Use different judge model
|
|
71
|
+
pnpm eval:marketing -- --judge=opus
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Generates marketing copy (title, description, hero headline/subhead, CTAs) for different scenarios and uses pairwise LLM-as-judge comparisons to create ELO rankings.
|
|
75
|
+
|
|
76
|
+
## Latest Results (December 2025)
|
|
77
|
+
|
|
78
|
+
**Overall: 94.0%** | Cost: $0.06 | Time: 95s | 10 Models
|
|
79
|
+
|
|
80
|
+
### Performance Summary
|
|
81
|
+
|
|
82
|
+
| Model | Math | Class | Overall | Avg Latency | Notes |
|
|
83
|
+
|-------|------|-------|---------|-------------|-------|
|
|
84
|
+
| Claude Sonnet 4.5 | 100% | 100% | **100%** | ~380ms | Best overall |
|
|
85
|
+
| GPT-5 Mini | 100% | 91.7% | 95.9% | ~1850ms | Slower but accurate |
|
|
86
|
+
| Gemini 2.5 Flash | 100% | 91.7% | 95.9% | ~200ms | **Fastest** |
|
|
87
|
+
| DeepSeek Chat | 100% | 91.7% | 95.9% | ~210ms | Great value |
|
|
88
|
+
| Mistral Medium 3.1 | 96% | 100% | 98.0% | ~850ms | Strong classify |
|
|
89
|
+
| Grok 4.1 Fast | 100% | 91.7% | 95.9% | ~2300ms | 2M context |
|
|
90
|
+
| Grok 4 Fast | 92% | 100% | 96.0% | ~1800ms | Good balance |
|
|
91
|
+
| Qwen3 30B | 96% | 91.7% | 93.9% | ~8900ms | Slowest |
|
|
92
|
+
| Llama 3.3 70B | 90% | 91.7% | 90.9% | ~185ms | Fast open model |
|
|
93
|
+
| GPT-oss 20B | 72% | 83.3% | 77.7% | ~1200ms | Open source |
|
|
94
|
+
|
|
95
|
+
### Performance/$ Analysis (Fast Tier)
|
|
96
|
+
|
|
97
|
+
| Model | Score | Est $/1M tokens | Score/$ | Recommendation |
|
|
98
|
+
|-------|-------|-----------------|---------|----------------|
|
|
99
|
+
| DeepSeek Chat | 95.9% | $0.28 | **342** | Best value |
|
|
100
|
+
| Gemini 2.5 Flash | 95.9% | $0.30 | 320 | Fast + cheap |
|
|
101
|
+
| Llama 3.3 70B | 90.9% | $0.40 | 227 | Good OSS option |
|
|
102
|
+
| Claude Sonnet 4.5 | 100% | $3.00 | 33 | Best quality |
|
|
103
|
+
| Mistral Medium 3.1 | 98.0% | $2.50 | 39 | Strong balance |
|
|
104
|
+
| GPT-5 Mini | 95.9% | $1.00 | 96 | OpenAI ecosystem |
|
|
105
|
+
| Grok 4.1 Fast | 95.9% | $2.00 | 48 | 2M context |
|
|
106
|
+
|
|
107
|
+
### Math Eval (94.6%)
|
|
108
|
+
|
|
109
|
+
| Model | Score | Avg Latency |
|
|
110
|
+
|-------|-------|-------------|
|
|
111
|
+
| Claude Sonnet 4.5 | 100% | ~380ms |
|
|
112
|
+
| GPT-5 Mini | 100% | ~200ms |
|
|
113
|
+
| Gemini 2.5 Flash | 100% | ~170ms |
|
|
114
|
+
| DeepSeek Chat | 100% | ~220ms |
|
|
115
|
+
| Grok 4.1 Fast | 100% | ~2600ms |
|
|
116
|
+
| Mistral Medium 3.1 | 96% | ~1040ms |
|
|
117
|
+
| Qwen3 30B | 96% | ~13000ms |
|
|
118
|
+
| Grok 4 Fast | 92% | ~2000ms |
|
|
119
|
+
| Llama 3.3 70B | 90% | ~170ms |
|
|
120
|
+
| GPT-oss 20B | 72% | ~180ms |
|
|
121
|
+
|
|
122
|
+
### Classification Eval (93.3%)
|
|
123
|
+
|
|
124
|
+
| Model | Score | Avg Latency |
|
|
125
|
+
|-------|-------|-------------|
|
|
126
|
+
| Claude Sonnet 4.5 | 100% | ~205ms |
|
|
127
|
+
| Mistral Medium 3.1 | 100% | ~700ms |
|
|
128
|
+
| Grok 4 Fast | 100% | ~1670ms |
|
|
129
|
+
| GPT-5 Mini | 91.7% | ~3500ms |
|
|
130
|
+
| Gemini 2.5 Flash | 91.7% | ~235ms |
|
|
131
|
+
| Llama 3.3 70B | 91.7% | ~230ms |
|
|
132
|
+
| DeepSeek Chat | 91.7% | ~230ms |
|
|
133
|
+
| Qwen3 30B | 91.7% | ~3970ms |
|
|
134
|
+
| Grok 4.1 Fast | 91.7% | ~2170ms |
|
|
135
|
+
| GPT-oss 20B | 83.3% | ~2840ms |
|
|
136
|
+
|
|
137
|
+
### Marketing Copy Eval (ELO Rankings)
|
|
138
|
+
|
|
139
|
+
Uses LLM-as-judge (Claude Sonnet) for pairwise comparisons across 4 test scenarios.
|
|
140
|
+
|
|
141
|
+
| Rank | Model | ELO | W | L | D | Notes |
|
|
142
|
+
|------|-------|-----|---|---|---|-------|
|
|
143
|
+
| 1 | Claude Sonnet 4.5 | **1745** | 31 | 3 | 0 | Dominant winner |
|
|
144
|
+
| 2 | Grok 4.1 Fast | 1595 | 22 | 12 | 0 | Strong creative |
|
|
145
|
+
| 3 | GPT-5 Mini | 1593 | 26 | 8 | 0 | Consistent quality |
|
|
146
|
+
| 4 | Grok 4 Fast | 1558 | 17 | 17 | 0 | Good balance |
|
|
147
|
+
| 5 | Gemini 2.5 Flash | 1503 | 14 | 20 | 0 | Middle tier |
|
|
148
|
+
| 6 | Mistral Medium 3.1 | 1481 | 16 | 18 | 0 | Solid performer |
|
|
149
|
+
| 7 | GPT-oss 20B | 1471 | 19 | 15 | 0 | OSS option |
|
|
150
|
+
| 8 | DeepSeek Chat | 1449 | 10 | 16 | 0 | Value option |
|
|
151
|
+
| 9 | Qwen3 30B | 1371 | 6 | 20 | 0 | Below average |
|
|
152
|
+
| 10 | Llama 3.3 70B | 1231 | 1 | 33 | 0 | Struggled |
|
|
153
|
+
|
|
154
|
+
**Key Insights:**
|
|
155
|
+
- Claude Sonnet 4.5 won 31 of 34 comparisons (91%)
|
|
156
|
+
- Grok models performed unexpectedly well on creative tasks
|
|
157
|
+
- Llama 3.3 70B, despite being strong on classification, struggled with marketing copy
|
|
158
|
+
|
|
159
|
+
## Models
|
|
160
|
+
|
|
161
|
+
Model IDs come from the `language-models` package and are routed via `ai-providers`.
|
|
162
|
+
|
|
163
|
+
### Model Tiers
|
|
164
|
+
|
|
165
|
+
| Tier | Description | Models |
|
|
166
|
+
|------|-------------|--------|
|
|
167
|
+
| `best` | Highest capability | opus, o3, gpt-5.1, gemini-pro, deepseek-v3.2, mistral-large-3, qwen3-coder, grok-4 |
|
|
168
|
+
| `fast` | Good balance | sonnet, gpt-5-mini, flash, llama-3.3-70b, mistral-medium-3.1, qwen3-30b, grok-4.1-fast |
|
|
169
|
+
| `cheap` | Cost-optimized | haiku, gpt-5-nano, ministral-14b |
|
|
170
|
+
|
|
171
|
+
### Full Model List (December 2025)
|
|
172
|
+
|
|
173
|
+
- **Anthropic**: `opus`, `sonnet`, `haiku`
|
|
174
|
+
- **OpenAI**: `openai/gpt-5.1`, `openai/gpt-5-mini`, `openai/gpt-5-nano`, `openai/o3`
|
|
175
|
+
- **OpenAI OSS**: `openai/gpt-oss-120b`, `openai/gpt-oss-20b` (open source models)
|
|
176
|
+
- **Google**: `gemini-pro`, `flash`
|
|
177
|
+
- **Meta**: `meta-llama/llama-4-maverick`, `meta-llama/llama-3.3-70b-instruct`
|
|
178
|
+
- **DeepSeek**: `deepseek/deepseek-v3.2`, `deepseek/deepseek-v3.2-speciale`, `deepseek/deepseek-chat`
|
|
179
|
+
- **Mistral**: `mistralai/mistral-large-2512` (Mistral Large 3), `mistralai/mistral-medium-3.1`, `mistralai/ministral-14b-2512`
|
|
180
|
+
- **Qwen**: `qwen/qwen3-coder`, `qwen/qwen3-30b-a3b`, `qwen/qwen3-next-80b-a3b-instruct`
|
|
181
|
+
- **xAI**: `x-ai/grok-4`, `x-ai/grok-4.1-fast`, `x-ai/grok-4-fast`
|
|
182
|
+
|
|
183
|
+
## Environment
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
# Use AI Gateway (recommended)
|
|
187
|
+
AI_GATEWAY_URL=https://gateway.ai.cloudflare.com/v1/...
|
|
188
|
+
AI_GATEWAY_TOKEN=...
|
|
189
|
+
|
|
190
|
+
# Or direct API keys
|
|
191
|
+
ANTHROPIC_API_KEY=sk-ant-...
|
|
192
|
+
OPENAI_API_KEY=sk-...
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
## Adding Evals
|
|
196
|
+
|
|
197
|
+
### Vitest-Based Evals
|
|
198
|
+
|
|
199
|
+
1. Create a new test file in `test/evals/`
|
|
200
|
+
2. Import functions and models:
|
|
201
|
+
```typescript
|
|
202
|
+
import { code, ai, list } from '../../src/primitives.js'
|
|
203
|
+
import { EVAL_MODELS, type EvalModel } from '../../src/eval/models.js'
|
|
204
|
+
```
|
|
205
|
+
3. Use `describe.skipIf(!hasAPI)` to skip the suite when no API access is available
|
|
206
|
+
4. Loop over models with `for (const model of models)`
|
|
207
|
+
|
|
208
|
+
### Custom Runner Evals
|
|
209
|
+
|
|
210
|
+
1. Add test cases to `evals/run-evals.ts`
|
|
211
|
+
2. Use `runEval()` with `task` function and `scorers` array
|
|
212
|
+
3. Use `createModelVariants({ tiers: ['fast'] })` to filter models
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Classification Eval
|
|
3
|
+
*
|
|
4
|
+
* Tests model ability to classify inputs correctly.
|
|
5
|
+
* Includes sentiment analysis and support-ticket category classification.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { evalite } from 'evalite'
|
|
9
|
+
import { generateObject } from '../src/generate.js'
|
|
10
|
+
import { schema } from '../src/schema.js'
|
|
11
|
+
import { createModelVariants, type EvalModel } from '../src/eval/models.js'
|
|
12
|
+
|
|
13
|
+
// Classification test cases: each entry holds the input text, the expected label, and the candidate options offered to the model
|
|
14
|
+
const TEST_CASES = [
|
|
15
|
+
// Sentiment (options: positive / negative / neutral)
|
|
16
|
+
{ text: 'This product exceeded my expectations!', expected: 'positive', options: ['positive', 'negative', 'neutral'] },
|
|
17
|
+
{ text: 'The delivery was late and packaging damaged.', expected: 'negative', options: ['positive', 'negative', 'neutral'] },
|
|
18
|
+
{ text: 'The product arrived as described.', expected: 'neutral', options: ['positive', 'negative', 'neutral'] },
|
|
19
|
+
|
|
20
|
+
// Support ticket classification (options: account / billing / technical / shipping)
|
|
21
|
+
{ text: 'I need to reset my password', expected: 'account', options: ['account', 'billing', 'technical', 'shipping'] },
|
|
22
|
+
{ text: 'When will my refund be processed?', expected: 'billing', options: ['account', 'billing', 'technical', 'shipping'] },
|
|
23
|
+
{ text: 'The app crashes when uploading images', expected: 'technical', options: ['account', 'billing', 'technical', 'shipping'] },
|
|
24
|
+
{ text: 'My package shows delivered but I never received it', expected: 'shipping', options: ['account', 'billing', 'technical', 'shipping'] },
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
const modelVariants = createModelVariants({ tiers: ['fast', 'cheap'] }) // variants requested for the 'fast' and 'cheap' tiers only; passed to evalite.each below
|
|
28
|
+
|
|
29
|
+
evalite.each(modelVariants)('Classification', { // Classification eval parameterised over modelVariants (presumably one run per variant; confirm against the evalite docs)
|
|
30
|
+
data: TEST_CASES.map(tc => ({ input: tc, expected: tc.expected })), // the whole test case is the task input; its expected label is also supplied separately
|
|
31
|
+
|
|
32
|
+
task: async (input, variant) => { // classifies one test case with the variant's model and returns the prediction plus metadata
|
|
33
|
+
const model = variant as EvalModel // asserted, not validated, to be an EvalModel
|
|
34
|
+
const startTime = Date.now()
|
|
35
|
+
|
|
36
|
+
const enumStr = input.options.join(' | ') // e.g. "positive | negative | neutral"; presumably interpreted as an enum by schema(), confirm in src/schema.ts
|
|
37
|
+
|
|
38
|
+
const { object, usage } = await generateObject({
|
|
39
|
+
model: model.id,
|
|
40
|
+
schema: schema({
|
|
41
|
+
category: enumStr,
|
|
42
|
+
confidence: 'Confidence 0-1 (number)',
|
|
43
|
+
}),
|
|
44
|
+
prompt: `Classify this text into one of: ${input.options.join(', ')}
|
|
45
|
+
|
|
46
|
+
Text: "${input.text}"`,
|
|
47
|
+
})
|
|
48
|
+
|
|
49
|
+
const latencyMs = Date.now() - startTime // elapsed wall-clock milliseconds since startTime (covers the generateObject call)
|
|
50
|
+
|
|
51
|
+
return {
|
|
52
|
+
predicted: object.category,
|
|
53
|
+
confidence: object.confidence,
|
|
54
|
+
expected: input.expected,
|
|
55
|
+
text: input.text,
|
|
56
|
+
options: input.options, // carried through so the Valid Category scorer can check membership
|
|
57
|
+
modelId: model.id,
|
|
58
|
+
modelName: model.name,
|
|
59
|
+
latencyMs,
|
|
60
|
+
usage,
|
|
61
|
+
}
|
|
62
|
+
},
|
|
63
|
+
|
|
64
|
+
scorers: [
|
|
65
|
+
// Accuracy: 1 when the predicted category strictly equals the expected label, otherwise 0
|
|
66
|
+
{
|
|
67
|
+
name: 'Accuracy',
|
|
68
|
+
description: 'Whether classification is correct',
|
|
69
|
+
scorer: ({ output, expected }) => ({
|
|
70
|
+
score: output.predicted === expected ? 1 : 0,
|
|
71
|
+
}),
|
|
72
|
+
},
|
|
73
|
+
|
|
74
|
+
// Valid category: 1 when the prediction is one of the options offered for this test case, otherwise 0
|
|
75
|
+
{
|
|
76
|
+
name: 'Valid Category',
|
|
77
|
+
description: 'Whether output is a valid option',
|
|
78
|
+
scorer: ({ output }) => ({
|
|
79
|
+
score: (output.options as string[]).includes(output.predicted as string) ? 1 : 0,
|
|
80
|
+
}),
|
|
81
|
+
},
|
|
82
|
+
|
|
83
|
+
// Calibration: scores how well the reported confidence agrees with whether the prediction was correct
|
|
84
|
+
{
|
|
85
|
+
name: 'Calibration',
|
|
86
|
+
description: 'Confidence matches accuracy',
|
|
87
|
+
scorer: ({ output, expected }) => {
|
|
88
|
+
const correct = output.predicted === expected
|
|
89
|
+
const conf = output.confidence as number
|
|
90
|
+
|
|
91
|
+
// High confidence when correct, low when wrong = well calibrated; combinations matched by no branch below fall through to 0.5
|
|
92
|
+
if (correct && conf >= 0.7) return { score: 1 }
|
|
93
|
+
if (!correct && conf <= 0.5) return { score: 0.8 }
|
|
94
|
+
if (correct && conf < 0.5) return { score: 0.6 } // Underconfident: correct but confidence below 0.5
|
|
95
|
+
if (!correct && conf > 0.7) return { score: 0.2 } // Overconfident: wrong but confidence above 0.7
|
|
96
|
+
return { score: 0.5 } // remaining cases, e.g. correct with confidence in [0.5, 0.7) or wrong with confidence in (0.5, 0.7]
|
|
97
|
+
},
|
|
98
|
+
},
|
|
99
|
+
],
|
|
100
|
+
|
|
101
|
+
columns: ({ output, expected }) => [ // label/value pairs built per result (presumably rendered in evalite's results table; confirm)
|
|
102
|
+
{ label: 'Model', value: output.modelName },
|
|
103
|
+
{ label: 'Expected', value: expected },
|
|
104
|
+
{ label: 'Got', value: output.predicted },
|
|
105
|
+
{ label: 'Correct', value: output.predicted === expected ? 'Yes' : 'No' },
|
|
106
|
+
{ label: 'Confidence', value: `${((output.confidence as number) * 100).toFixed(0)}%` }, // confidence shown as a whole-number percentage
|
|
107
|
+
],
|
|
108
|
+
})
|