ai-functions 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +5 -0
- package/.turbo/turbo-test.log +105 -0
- package/README.md +190 -86
- package/TODO.md +138 -0
- package/dist/ai-promise.d.ts +219 -0
- package/dist/ai-promise.d.ts.map +1 -0
- package/dist/ai-promise.js +610 -0
- package/dist/ai-promise.js.map +1 -0
- package/dist/ai.d.ts +285 -0
- package/dist/ai.d.ts.map +1 -0
- package/dist/ai.js +842 -0
- package/dist/ai.js.map +1 -0
- package/dist/batch/anthropic.d.ts +23 -0
- package/dist/batch/anthropic.d.ts.map +1 -0
- package/dist/batch/anthropic.js +257 -0
- package/dist/batch/anthropic.js.map +1 -0
- package/dist/batch/bedrock.d.ts +64 -0
- package/dist/batch/bedrock.d.ts.map +1 -0
- package/dist/batch/bedrock.js +586 -0
- package/dist/batch/bedrock.js.map +1 -0
- package/dist/batch/cloudflare.d.ts +37 -0
- package/dist/batch/cloudflare.d.ts.map +1 -0
- package/dist/batch/cloudflare.js +289 -0
- package/dist/batch/cloudflare.js.map +1 -0
- package/dist/batch/google.d.ts +41 -0
- package/dist/batch/google.d.ts.map +1 -0
- package/dist/batch/google.js +360 -0
- package/dist/batch/google.js.map +1 -0
- package/dist/batch/index.d.ts +31 -0
- package/dist/batch/index.d.ts.map +1 -0
- package/dist/batch/index.js +31 -0
- package/dist/batch/index.js.map +1 -0
- package/dist/batch/memory.d.ts +44 -0
- package/dist/batch/memory.d.ts.map +1 -0
- package/dist/batch/memory.js +188 -0
- package/dist/batch/memory.js.map +1 -0
- package/dist/batch/openai.d.ts +37 -0
- package/dist/batch/openai.d.ts.map +1 -0
- package/dist/batch/openai.js +403 -0
- package/dist/batch/openai.js.map +1 -0
- package/dist/batch-map.d.ts +125 -0
- package/dist/batch-map.d.ts.map +1 -0
- package/dist/batch-map.js +406 -0
- package/dist/batch-map.js.map +1 -0
- package/dist/batch-queue.d.ts +273 -0
- package/dist/batch-queue.d.ts.map +1 -0
- package/dist/batch-queue.js +271 -0
- package/dist/batch-queue.js.map +1 -0
- package/dist/context.d.ts +133 -0
- package/dist/context.d.ts.map +1 -0
- package/dist/context.js +267 -0
- package/dist/context.js.map +1 -0
- package/dist/embeddings.d.ts +123 -0
- package/dist/embeddings.d.ts.map +1 -0
- package/dist/embeddings.js +170 -0
- package/dist/embeddings.js.map +1 -0
- package/dist/eval/index.d.ts +8 -0
- package/dist/eval/index.d.ts.map +1 -0
- package/dist/eval/index.js +8 -0
- package/dist/eval/index.js.map +1 -0
- package/dist/eval/models.d.ts +66 -0
- package/dist/eval/models.d.ts.map +1 -0
- package/dist/eval/models.js +120 -0
- package/dist/eval/models.js.map +1 -0
- package/dist/eval/runner.d.ts +64 -0
- package/dist/eval/runner.d.ts.map +1 -0
- package/dist/eval/runner.js +148 -0
- package/dist/eval/runner.js.map +1 -0
- package/dist/generate.d.ts +168 -0
- package/dist/generate.d.ts.map +1 -0
- package/dist/generate.js +174 -0
- package/dist/generate.js.map +1 -0
- package/dist/index.d.ts +29 -4
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +53 -52
- package/dist/index.js.map +1 -1
- package/dist/primitives.d.ts +292 -0
- package/dist/primitives.d.ts.map +1 -0
- package/dist/primitives.js +471 -0
- package/dist/primitives.js.map +1 -0
- package/dist/providers/cloudflare.d.ts +9 -0
- package/dist/providers/cloudflare.d.ts.map +1 -0
- package/dist/providers/cloudflare.js +9 -0
- package/dist/providers/cloudflare.js.map +1 -0
- package/dist/providers/index.d.ts +9 -0
- package/dist/providers/index.d.ts.map +1 -0
- package/dist/providers/index.js +9 -0
- package/dist/providers/index.js.map +1 -0
- package/dist/schema.d.ts +54 -0
- package/dist/schema.d.ts.map +1 -0
- package/dist/schema.js +109 -0
- package/dist/schema.js.map +1 -0
- package/dist/template.d.ts +73 -0
- package/dist/template.d.ts.map +1 -0
- package/dist/template.js +129 -0
- package/dist/template.js.map +1 -0
- package/dist/types.d.ts +474 -106
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +4 -8
- package/dist/types.js.map +1 -1
- package/evalite.config.ts +19 -0
- package/evals/README.md +212 -0
- package/evals/classification.eval.ts +108 -0
- package/evals/marketing.eval.ts +370 -0
- package/evals/math.eval.ts +94 -0
- package/evals/run-evals.ts +166 -0
- package/evals/structured-output.eval.ts +143 -0
- package/evals/writing.eval.ts +117 -0
- package/examples/batch-blog-posts.ts +160 -0
- package/package.json +57 -57
- package/src/ai-promise.ts +784 -0
- package/src/ai.ts +1183 -0
- package/src/batch/anthropic.ts +375 -0
- package/src/batch/bedrock.ts +801 -0
- package/src/batch/cloudflare.ts +421 -0
- package/src/batch/google.ts +491 -0
- package/src/batch/index.ts +31 -0
- package/src/batch/memory.ts +253 -0
- package/src/batch/openai.ts +557 -0
- package/src/batch-map.ts +534 -0
- package/src/batch-queue.ts +493 -0
- package/src/context.ts +332 -0
- package/src/embeddings.ts +244 -0
- package/src/eval/index.ts +8 -0
- package/src/eval/models.ts +158 -0
- package/src/eval/runner.ts +217 -0
- package/src/generate.ts +245 -0
- package/src/index.ts +154 -0
- package/src/primitives.ts +612 -0
- package/src/providers/cloudflare.ts +15 -0
- package/src/providers/index.ts +14 -0
- package/src/schema.ts +147 -0
- package/src/template.ts +209 -0
- package/src/types.ts +540 -0
- package/test/README.md +105 -0
- package/test/ai-proxy.test.ts +192 -0
- package/test/async-iterators.test.ts +327 -0
- package/test/batch-background.test.ts +482 -0
- package/test/batch-blog-posts.test.ts +387 -0
- package/test/blog-generation.test.ts +510 -0
- package/test/browse-read.test.ts +611 -0
- package/test/core-functions.test.ts +694 -0
- package/test/decide.test.ts +393 -0
- package/test/define.test.ts +274 -0
- package/test/e2e-bedrock-manual.ts +163 -0
- package/test/e2e-bedrock.test.ts +191 -0
- package/test/e2e-flex-gateway.ts +157 -0
- package/test/e2e-flex-manual.ts +183 -0
- package/test/e2e-flex.test.ts +209 -0
- package/test/e2e-google-manual.ts +178 -0
- package/test/e2e-google.test.ts +216 -0
- package/test/embeddings.test.ts +284 -0
- package/test/evals/define-function.eval.test.ts +379 -0
- package/test/evals/primitives.eval.test.ts +384 -0
- package/test/function-types.test.ts +492 -0
- package/test/generate-core.test.ts +319 -0
- package/test/generate.test.ts +163 -0
- package/test/implicit-batch.test.ts +422 -0
- package/test/schema.test.ts +109 -0
- package/test/tagged-templates.test.ts +302 -0
- package/tsconfig.json +10 -0
- package/vitest.config.ts +42 -0
- package/LICENSE +0 -21
- package/bin/cli.js +0 -5
- package/dist/cli/index.d.ts +0 -10
- package/dist/cli/index.d.ts.map +0 -1
- package/dist/cli/index.js +0 -38
- package/dist/cli/index.js.map +0 -1
- package/dist/cli/index.test.d.ts +0 -2
- package/dist/cli/index.test.d.ts.map +0 -1
- package/dist/cli/index.test.js +0 -35
- package/dist/cli/index.test.js.map +0 -1
- package/dist/constants/models.d.ts +0 -10
- package/dist/constants/models.d.ts.map +0 -1
- package/dist/constants/models.js +0 -12
- package/dist/constants/models.js.map +0 -1
- package/dist/converters/index.d.ts +0 -3
- package/dist/converters/index.d.ts.map +0 -1
- package/dist/converters/index.js +0 -3
- package/dist/converters/index.js.map +0 -1
- package/dist/converters/model.d.ts +0 -4
- package/dist/converters/model.d.ts.map +0 -1
- package/dist/converters/model.js +0 -19
- package/dist/converters/model.js.map +0 -1
- package/dist/converters/schema.d.ts +0 -4
- package/dist/converters/schema.d.ts.map +0 -1
- package/dist/converters/schema.js +0 -25
- package/dist/converters/schema.js.map +0 -1
- package/dist/core/responses.d.ts +0 -5
- package/dist/core/responses.d.ts.map +0 -1
- package/dist/core/responses.js +0 -16
- package/dist/core/responses.js.map +0 -1
- package/dist/core/responses.test.d.ts +0 -2
- package/dist/core/responses.test.d.ts.map +0 -1
- package/dist/core/responses.test.js +0 -31
- package/dist/core/responses.test.js.map +0 -1
- package/dist/errors.d.ts +0 -6
- package/dist/errors.d.ts.map +0 -1
- package/dist/errors.js +0 -9
- package/dist/errors.js.map +0 -1
- package/dist/examples/streaming.test.d.ts +0 -2
- package/dist/examples/streaming.test.d.ts.map +0 -1
- package/dist/examples/streaming.test.js +0 -176
- package/dist/examples/streaming.test.js.map +0 -1
- package/dist/factory/__tests__/index.test.d.ts +0 -2
- package/dist/factory/__tests__/index.test.d.ts.map +0 -1
- package/dist/factory/__tests__/index.test.js +0 -430
- package/dist/factory/__tests__/index.test.js.map +0 -1
- package/dist/factory/__tests__/list.test.d.ts +0 -2
- package/dist/factory/__tests__/list.test.d.ts.map +0 -1
- package/dist/factory/__tests__/list.test.js +0 -92
- package/dist/factory/__tests__/list.test.js.map +0 -1
- package/dist/factory/index.d.ts +0 -20
- package/dist/factory/index.d.ts.map +0 -1
- package/dist/factory/index.js +0 -287
- package/dist/factory/index.js.map +0 -1
- package/dist/factory/index.test.d.ts +0 -2
- package/dist/factory/index.test.d.ts.map +0 -1
- package/dist/factory/index.test.js +0 -287
- package/dist/factory/index.test.js.map +0 -1
- package/dist/factory/list.d.ts +0 -3
- package/dist/factory/list.d.ts.map +0 -1
- package/dist/factory/list.js +0 -221
- package/dist/factory/list.js.map +0 -1
- package/dist/factory/list.test.d.ts +0 -2
- package/dist/factory/list.test.d.ts.map +0 -1
- package/dist/factory/list.test.js +0 -84
- package/dist/factory/list.test.js.map +0 -1
- package/dist/generate/index.d.ts +0 -5
- package/dist/generate/index.d.ts.map +0 -1
- package/dist/generate/index.js +0 -17
- package/dist/generate/index.js.map +0 -1
- package/dist/index.test.d.ts +0 -2
- package/dist/index.test.d.ts.map +0 -1
- package/dist/index.test.js +0 -59
- package/dist/index.test.js.map +0 -1
- package/dist/list/await.d.ts +0 -3
- package/dist/list/await.d.ts.map +0 -1
- package/dist/list/await.js +0 -28
- package/dist/list/await.js.map +0 -1
- package/dist/list/constants.d.ts +0 -4
- package/dist/list/constants.d.ts.map +0 -1
- package/dist/list/constants.js +0 -5
- package/dist/list/constants.js.map +0 -1
- package/dist/list/create-function.d.ts +0 -3
- package/dist/list/create-function.d.ts.map +0 -1
- package/dist/list/create-function.js +0 -11
- package/dist/list/create-function.js.map +0 -1
- package/dist/list/index.d.ts +0 -4
- package/dist/list/index.d.ts.map +0 -1
- package/dist/list/index.js +0 -5
- package/dist/list/index.js.map +0 -1
- package/dist/list/prompt.d.ts +0 -3
- package/dist/list/prompt.d.ts.map +0 -1
- package/dist/list/prompt.js +0 -6
- package/dist/list/prompt.js.map +0 -1
- package/dist/list/schemas.d.ts +0 -4
- package/dist/list/schemas.d.ts.map +0 -1
- package/dist/list/schemas.js +0 -8
- package/dist/list/schemas.js.map +0 -1
- package/dist/list/stream.d.ts +0 -3
- package/dist/list/stream.d.ts.map +0 -1
- package/dist/list/stream.js +0 -33
- package/dist/list/stream.js.map +0 -1
- package/dist/list/types.d.ts +0 -11
- package/dist/list/types.d.ts.map +0 -1
- package/dist/list/types.js +0 -2
- package/dist/list/types.js.map +0 -1
- package/dist/list/validation.d.ts +0 -3
- package/dist/list/validation.d.ts.map +0 -1
- package/dist/list/validation.js +0 -12
- package/dist/list/validation.js.map +0 -1
- package/dist/providers/config.d.ts +0 -4
- package/dist/providers/config.d.ts.map +0 -1
- package/dist/providers/config.js +0 -21
- package/dist/providers/config.js.map +0 -1
- package/dist/providers/config.test.d.ts +0 -2
- package/dist/providers/config.test.d.ts.map +0 -1
- package/dist/providers/config.test.js +0 -37
- package/dist/providers/config.test.js.map +0 -1
- package/dist/proxy/constants.d.ts +0 -4
- package/dist/proxy/constants.d.ts.map +0 -1
- package/dist/proxy/constants.js +0 -5
- package/dist/proxy/constants.js.map +0 -1
- package/dist/proxy/create-function.d.ts +0 -4
- package/dist/proxy/create-function.d.ts.map +0 -1
- package/dist/proxy/create-function.js +0 -24
- package/dist/proxy/create-function.js.map +0 -1
- package/dist/proxy/create-proxy.d.ts +0 -2
- package/dist/proxy/create-proxy.d.ts.map +0 -1
- package/dist/proxy/create-proxy.js +0 -11
- package/dist/proxy/create-proxy.js.map +0 -1
- package/dist/proxy/function-generator.d.ts +0 -9
- package/dist/proxy/function-generator.d.ts.map +0 -1
- package/dist/proxy/function-generator.js +0 -29
- package/dist/proxy/function-generator.js.map +0 -1
- package/dist/proxy/index.d.ts +0 -4
- package/dist/proxy/index.d.ts.map +0 -1
- package/dist/proxy/index.js +0 -4
- package/dist/proxy/index.js.map +0 -1
- package/dist/proxy/prompt.d.ts +0 -2
- package/dist/proxy/prompt.d.ts.map +0 -1
- package/dist/proxy/prompt.js +0 -6
- package/dist/proxy/prompt.js.map +0 -1
- package/dist/proxy/types.d.ts +0 -7
- package/dist/proxy/types.d.ts.map +0 -1
- package/dist/proxy/types.js +0 -2
- package/dist/proxy/types.js.map +0 -1
- package/dist/queue/manager.d.ts +0 -5
- package/dist/queue/manager.d.ts.map +0 -1
- package/dist/queue/manager.js +0 -37
- package/dist/queue/manager.js.map +0 -1
- package/dist/queue/manager.test.d.ts +0 -2
- package/dist/queue/manager.test.d.ts.map +0 -1
- package/dist/queue/manager.test.js +0 -52
- package/dist/queue/manager.test.js.map +0 -1
- package/dist/schema-converter.d.ts +0 -4
- package/dist/schema-converter.d.ts.map +0 -1
- package/dist/schema-converter.js +0 -30
- package/dist/schema-converter.js.map +0 -1
- package/dist/stream/index.d.ts +0 -7
- package/dist/stream/index.d.ts.map +0 -1
- package/dist/stream/index.js +0 -23
- package/dist/stream/index.js.map +0 -1
- package/dist/streaming/utils.d.ts +0 -4
- package/dist/streaming/utils.d.ts.map +0 -1
- package/dist/streaming/utils.js +0 -131
- package/dist/streaming/utils.js.map +0 -1
- package/dist/streaming/utils.test.d.ts +0 -2
- package/dist/streaming/utils.test.d.ts.map +0 -1
- package/dist/streaming/utils.test.js +0 -84
- package/dist/streaming/utils.test.js.map +0 -1
- package/dist/templates/result.d.ts +0 -7
- package/dist/templates/result.d.ts.map +0 -1
- package/dist/templates/result.js +0 -40
- package/dist/templates/result.js.map +0 -1
- package/dist/templates/result.test.d.ts +0 -2
- package/dist/templates/result.test.d.ts.map +0 -1
- package/dist/templates/result.test.js +0 -75
- package/dist/templates/result.test.js.map +0 -1
- package/dist/test/setup.d.ts +0 -2
- package/dist/test/setup.d.ts.map +0 -1
- package/dist/test/setup.js +0 -21
- package/dist/test/setup.js.map +0 -1
- package/dist/test-types.d.ts +0 -13
- package/dist/test-types.d.ts.map +0 -1
- package/dist/test-types.js +0 -55
- package/dist/test-types.js.map +0 -1
- package/dist/types/index.d.ts +0 -4
- package/dist/types/index.d.ts.map +0 -1
- package/dist/types/index.js +0 -4
- package/dist/types/index.js.map +0 -1
- package/dist/types/list.d.ts +0 -10
- package/dist/types/list.d.ts.map +0 -1
- package/dist/types/list.js +0 -2
- package/dist/types/list.js.map +0 -1
- package/dist/types/model.d.ts +0 -7
- package/dist/types/model.d.ts.map +0 -1
- package/dist/types/model.js +0 -2
- package/dist/types/model.js.map +0 -1
- package/dist/types/options.d.ts +0 -25
- package/dist/types/options.d.ts.map +0 -1
- package/dist/types/options.js +0 -2
- package/dist/types/options.js.map +0 -1
- package/dist/types/schema.d.ts +0 -5
- package/dist/types/schema.d.ts.map +0 -1
- package/dist/types/schema.js +0 -2
- package/dist/types/schema.js.map +0 -1
- package/dist/utils/__tests__/request-handler.test.d.ts +0 -2
- package/dist/utils/__tests__/request-handler.test.d.ts.map +0 -1
- package/dist/utils/__tests__/request-handler.test.js +0 -134
- package/dist/utils/__tests__/request-handler.test.js.map +0 -1
- package/dist/utils/__tests__/schema.test.d.ts +0 -2
- package/dist/utils/__tests__/schema.test.d.ts.map +0 -1
- package/dist/utils/__tests__/schema.test.js +0 -49
- package/dist/utils/__tests__/schema.test.js.map +0 -1
- package/dist/utils/__tests__/stream-progress.test.d.ts +0 -2
- package/dist/utils/__tests__/stream-progress.test.d.ts.map +0 -1
- package/dist/utils/__tests__/stream-progress.test.js +0 -85
- package/dist/utils/__tests__/stream-progress.test.js.map +0 -1
- package/dist/utils/index.d.ts +0 -2
- package/dist/utils/index.d.ts.map +0 -1
- package/dist/utils/index.js +0 -2
- package/dist/utils/index.js.map +0 -1
- package/dist/utils/request-handler.d.ts +0 -17
- package/dist/utils/request-handler.d.ts.map +0 -1
- package/dist/utils/request-handler.js +0 -105
- package/dist/utils/request-handler.js.map +0 -1
- package/dist/utils/schema.d.ts +0 -11
- package/dist/utils/schema.d.ts.map +0 -1
- package/dist/utils/schema.js +0 -51
- package/dist/utils/schema.js.map +0 -1
- package/dist/utils/stream-progress.d.ts +0 -17
- package/dist/utils/stream-progress.d.ts.map +0 -1
- package/dist/utils/stream-progress.js +0 -86
- package/dist/utils/stream-progress.js.map +0 -1
- package/dist/utils/validation.d.ts +0 -3
- package/dist/utils/validation.d.ts.map +0 -1
- package/dist/utils/validation.js +0 -30
- package/dist/utils/validation.js.map +0 -1
|
@@ -0,0 +1,370 @@
|
|
|
1
|
+
#!/usr/bin/env npx tsx
|
|
2
|
+
/**
|
|
3
|
+
* Marketing Copy Eval with LLM-as-Judge ELO Ranking
|
|
4
|
+
*
|
|
5
|
+
* Generates marketing copy (title, description, hero headline/subhead, CTAs)
|
|
6
|
+
* and uses pairwise comparison with an LLM judge to create ELO rankings.
|
|
7
|
+
*
|
|
8
|
+
* Usage:
|
|
9
|
+
* npx tsx evals/marketing.eval.ts
|
|
10
|
+
* npx tsx evals/marketing.eval.ts --judge=opus # Use specific judge model
|
|
11
|
+
* npx tsx evals/marketing.eval.ts --judge=haiku # Test cheap judge
|
|
12
|
+
* npx tsx evals/marketing.eval.ts --judge=flash # Test fast judge
|
|
13
|
+
* npx tsx evals/marketing.eval.ts --all # Run all tiers
|
|
14
|
+
* npx tsx evals/marketing.eval.ts --all --judge=haiku # All tiers + cheap judge
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
// Load .env from project root
|
|
18
|
+
import { config } from 'dotenv'
|
|
19
|
+
import { resolve } from 'path'
|
|
20
|
+
config({ path: resolve(import.meta.dirname, '../../../.env') })
|
|
21
|
+
|
|
22
|
+
import { generateObject } from '../src/generate.js'
|
|
23
|
+
import { schema } from '../src/schema.js'
|
|
24
|
+
import { EVAL_MODELS, type EvalModel, type ModelTier } from '../src/eval/models.js'
|
|
25
|
+
|
|
26
|
+
// Parse CLI args

/**
 * Extracts the judge model from a `--judge=<model>` flag.
 *
 * @param argv - Command-line arguments (without the node/script entries)
 * @param fallback - Model used when the flag is absent or has an empty value
 * @returns The requested judge model, or `fallback`
 */
function parseJudgeModel(argv: string[], fallback = 'sonnet'): string {
  const prefix = '--judge='
  const flag = argv.find(a => a.startsWith(prefix))
  // slice() instead of split('=') so model ids that themselves contain '=' survive intact
  const value = flag?.slice(prefix.length)
  // `||` is deliberate: an empty value (`--judge=`) is not a usable model id
  return value || fallback
}

const args = process.argv.slice(2)
const JUDGE_MODEL = parseJudgeModel(args)
const runAll = args.includes('--all')

// Without --all only the 'fast' tier is evaluated
const tiers: ModelTier[] = runAll ? ['best', 'fast', 'cheap'] : ['fast']
|
|
33
|
+
|
|
34
|
+
// Marketing copy schema
// Every leaf is a plain-English description of the field to produce.
// NOTE(review): presumably schema() converts these descriptions into a
// structured-output schema — confirm against ../src/schema.js.
const heroCopyFields = {
  headline: 'Hero headline (5-8 words, compelling)',
  subhead: 'Supporting subheadline (10-20 words)',
  primaryCTA: 'Primary call-to-action button text (2-4 words)',
  secondaryCTA: 'Secondary call-to-action link text (3-6 words)',
}

const marketingCopySchema = schema({
  title: 'Product/page title (5-10 words)',
  description: 'Meta description for SEO (150-160 characters)',
  hero: heroCopyFields,
})
|
|
45
|
+
|
|
46
|
+
// Test cases - different product/service scenarios
// `name` labels the scenario in logs and groups copies in the tournament;
// `prompt` is sent verbatim to each model under test.
const TEST_CASES: Array<{ name: string; prompt: string }> = [
  {
    name: 'SaaS Analytics Platform',
    prompt:
      'Create marketing copy for a B2B SaaS analytics platform called "InsightFlow" that helps companies understand their customer behavior with AI-powered insights. Target audience: Product managers and growth teams at mid-size tech companies.',
  },
  {
    name: 'E-commerce Fashion Brand',
    prompt:
      'Create marketing copy for a sustainable fashion e-commerce brand called "EcoThread" that sells organic, ethically-made clothing. Target audience: Environmentally conscious millennials aged 25-35.',
  },
  {
    name: 'Developer Tool',
    prompt:
      'Create marketing copy for a CLI tool called "DeployFast" that simplifies Kubernetes deployments with one-command deploys. Target audience: DevOps engineers and backend developers.',
  },
  {
    name: 'Mobile Fitness App',
    prompt:
      'Create marketing copy for a fitness app called "FitPulse" that uses AI to create personalized workout plans and tracks progress with smart watch integration. Target audience: Busy professionals aged 30-45.',
  },
]
|
|
65
|
+
|
|
66
|
+
/** Hero-section fields of a generated marketing copy. */
interface HeroCopy {
  headline: string
  subhead: string
  primaryCTA: string
  secondaryCTA: string
}

/**
 * Marketing copy produced by a model under test.
 * Mirrors the field layout of `marketingCopySchema`.
 */
interface MarketingCopy {
  title: string
  description: string
  hero: HeroCopy
}
|
|
76
|
+
|
|
77
|
+
/** One model's output for one test case, plus how long the call took. */
interface GeneratedCopy {
  /** Model that produced the copy */
  model: EvalModel
  /** Scenario the copy was generated for */
  testCase: (typeof TEST_CASES)[number]
  /** The generated marketing copy itself */
  copy: MarketingCopy
  /** Wall-clock duration of the generation call, in milliseconds */
  latencyMs: number
}
|
|
83
|
+
|
|
84
|
+
/** Running tournament standing for a single model. */
interface ELORating {
  modelId: string
  modelName: string
  /** Current ELO score; starts at INITIAL_ELO and moves after every comparison */
  rating: number
  /** Pairwise comparison outcome counters */
  wins: number
  losses: number
  draws: number
}
|
|
92
|
+
|
|
93
|
+
// ELO calculation
const K_FACTOR = 32
const INITIAL_ELO = 1500

/**
 * Computes the rating adjustments for one pairwise result.
 *
 * @param ratingA - Current rating of competitor A
 * @param ratingB - Current rating of competitor B
 * @param scoreA - Outcome from A's point of view: 1 = A won, 0 = A lost, 0.5 = draw
 * @param kFactor - Maximum rating swing for a single comparison (defaults to K_FACTOR)
 * @returns The deltas to add to A's and B's ratings
 */
function calculateEloChange(
  ratingA: number,
  ratingB: number,
  scoreA: number,
  kFactor: number = K_FACTOR
): { deltaA: number; deltaB: number } {
  // Expected score of A: logistic curve over the rating gap on a 400-point scale
  const expectedA = 1 / (1 + 10 ** ((ratingB - ratingA) / 400))
  const expectedB = 1 - expectedA

  return {
    deltaA: kFactor * (scoreA - expectedA),
    // B's actual score is the complement of A's
    deltaB: kFactor * (1 - scoreA - expectedB),
  }
}
|
|
106
|
+
|
|
107
|
+
// LLM Judge for pairwise comparison

/** Renders one candidate as the labelled section shown to the judge. */
function renderOption(label: 'A' | 'B', copy: MarketingCopy): string {
  return [
    `=== OPTION ${label} ===`,
    `Title: ${copy.title}`,
    `Description: ${copy.description}`,
    `Hero Headline: ${copy.hero.headline}`,
    `Hero Subhead: ${copy.hero.subhead}`,
    `Primary CTA: ${copy.hero.primaryCTA}`,
    `Secondary CTA: ${copy.hero.secondaryCTA}`,
  ].join('\n')
}

/**
 * Asks the judge model which of two copies is better for the given test case.
 *
 * @param copyA - Candidate presented as "OPTION A"
 * @param copyB - Candidate presented as "OPTION B"
 * @param testCase - Scenario whose name and prompt give the judge its context
 * @param judgeModel - Model id handed to generateObject for judging
 * @returns 'A', 'B' or 'TIE'; an unrecognized verdict or a failed judge call
 *          also resolves to 'TIE' (failures are logged to stderr)
 */
async function judgePair(
  copyA: MarketingCopy,
  copyB: MarketingCopy,
  testCase: typeof TEST_CASES[0],
  judgeModel: string
): Promise<'A' | 'B' | 'TIE'> {
  const prompt = [
    `You are an expert marketing copywriter and brand strategist. Compare these two marketing copy options for: ${testCase.name}`,
    '',
    `Context: ${testCase.prompt}`,
    '',
    renderOption('A', copyA),
    '',
    renderOption('B', copyB),
    '',
    'Evaluate based on:',
    '1. Clarity and impact of messaging',
    '2. Target audience alignment',
    '3. Emotional appeal and persuasiveness',
    '4. CTA effectiveness',
    '5. Overall brand voice consistency',
    '',
    "Which option is better? Answer A, B, or TIE if they're roughly equal.",
  ].join('\n')

  let verdict: string
  try {
    const { object } = await generateObject({
      model: judgeModel,
      schema: schema({
        reasoning: 'Brief explanation of your judgment (2-3 sentences)',
        winner: 'A | B | TIE',
      }),
      prompt,
      temperature: 0.3,
    })

    const judged = object as { reasoning: string; winner: string }
    // Tolerate casing and stray whitespace in the judge's answer
    verdict = judged.winner.toUpperCase().trim()
  } catch (err) {
    console.error(` Judge error: ${err}`)
    return 'TIE'
  }

  // Anything other than an exact A / B / TIE is treated as a tie
  const allowed = ['A', 'B', 'TIE'] as const
  return allowed.find(option => option === verdict) ?? 'TIE'
}
|
|
166
|
+
|
|
167
|
+
// Generate marketing copy for a model

/**
 * Runs one test case against one model and times the call.
 *
 * @param model - Model under test; its `id` is passed to generateObject
 * @param testCase - Scenario whose prompt is sent to the model
 * @returns The generated copy together with the model, test case and latency
 */
async function generateCopy(model: EvalModel, testCase: typeof TEST_CASES[0]): Promise<GeneratedCopy> {
  const startedAt = Date.now()

  const response = await generateObject({
    model: model.id,
    schema: marketingCopySchema,
    prompt: testCase.prompt,
    temperature: 0.7,
  })

  const copy = response.object as MarketingCopy
  // Measured after the call resolves, so it covers the whole round trip
  const latencyMs = Date.now() - startedAt

  return { model, testCase, copy, latencyMs }
}
|
|
185
|
+
|
|
186
|
+
// Run pairwise comparisons and calculate ELO

/**
 * Judges every pair of copies within each test case and folds the outcomes
 * into per-model ELO ratings.
 *
 * Ratings are updated after each comparison, so later comparisons see the
 * ratings produced by earlier ones. Comparisons run one at a time.
 *
 * NOTE(review): each pair is judged once with a fixed A/B order, so any
 * position bias of the judge is not averaged out.
 *
 * @param copies - Generated copies from all models and test cases
 * @param judgeModel - Model id used for judging
 * @returns One rating per model, sorted from highest to lowest ELO
 */
async function runEloTournament(
  copies: GeneratedCopy[],
  judgeModel: string
): Promise<ELORating[]> {
  const ratings = new Map<string, ELORating>()
  const byTestCase = new Map<string, GeneratedCopy[]>()

  for (const copy of copies) {
    // First sighting of a model: start it at the initial rating
    if (!ratings.has(copy.model.id)) {
      ratings.set(copy.model.id, {
        modelId: copy.model.id,
        modelName: copy.model.name,
        rating: INITIAL_ELO,
        wins: 0,
        losses: 0,
        draws: 0,
      })
    }

    // Group copies by test case so only like-for-like copies are compared
    const group = byTestCase.get(copy.testCase.name)
    if (group) {
      group.push(copy)
    } else {
      byTestCase.set(copy.testCase.name, [copy])
    }
  }

  // Report the judge actually used by this call, not the module-level default
  console.log(`\n⚖️ Running pairwise comparisons with ${judgeModel} as judge...\n`)

  // n copies in a test case yield n * (n - 1) / 2 unordered pairs
  const totalComparisons = Array.from(byTestCase.values()).reduce(
    (sum, group) => sum + (group.length * (group.length - 1)) / 2,
    0
  )
  let comparisonCount = 0

  for (const [testCaseName, testCaseCopies] of byTestCase) {
    console.log(` 📝 ${testCaseName}:`)

    for (const [i, copyA] of testCaseCopies.entries()) {
      for (const copyB of testCaseCopies.slice(i + 1)) {
        comparisonCount++
        process.stdout.write(` ${comparisonCount}/${totalComparisons} ${copyA.model.name} vs ${copyB.model.name}... `)

        const winner = await judgePair(copyA.copy, copyB.copy, copyA.testCase, judgeModel)

        const ratingA = ratings.get(copyA.model.id)!
        const ratingB = ratings.get(copyB.model.id)!

        // Score from A's perspective: win = 1, loss = 0, tie = 0.5
        let scoreA: number
        if (winner === 'A') {
          scoreA = 1
          ratingA.wins++
          ratingB.losses++
          console.log(`${copyA.model.name} wins`)
        } else if (winner === 'B') {
          scoreA = 0
          ratingA.losses++
          ratingB.wins++
          console.log(`${copyB.model.name} wins`)
        } else {
          scoreA = 0.5
          ratingA.draws++
          ratingB.draws++
          console.log(`TIE`)
        }

        const { deltaA, deltaB } = calculateEloChange(ratingA.rating, ratingB.rating, scoreA)
        ratingA.rating += deltaA
        ratingB.rating += deltaB
      }
    }
  }

  // Sort by ELO rating
  return Array.from(ratings.values()).sort((a, b) => b.rating - a.rating)
}
|
|
270
|
+
|
|
271
|
+
// Main
|
|
272
|
+
async function main() {
|
|
273
|
+
console.log('╔════════════════════════════════════════════════════════════════╗')
|
|
274
|
+
console.log('║ Marketing Copy Eval (LLM-as-Judge) ║')
|
|
275
|
+
console.log('╚════════════════════════════════════════════════════════════════╝')
|
|
276
|
+
console.log('')
|
|
277
|
+
console.log(`Judge Model: ${JUDGE_MODEL}`)
|
|
278
|
+
console.log(`Tiers: ${tiers.join(', ')}`)
|
|
279
|
+
|
|
280
|
+
// Get models to test
|
|
281
|
+
const models = EVAL_MODELS.filter(m => tiers.includes(m.tier))
|
|
282
|
+
console.log(`Models: ${models.map(m => m.name).join(', ')}`)
|
|
283
|
+
console.log(`Test Cases: ${TEST_CASES.length}`)
|
|
284
|
+
console.log('')
|
|
285
|
+
|
|
286
|
+
// Generate copy from each model for each test case
|
|
287
|
+
console.log('🎨 Generating marketing copy...\n')
|
|
288
|
+
|
|
289
|
+
const allCopies: GeneratedCopy[] = []
|
|
290
|
+
const startTime = Date.now()
|
|
291
|
+
|
|
292
|
+
for (const testCase of TEST_CASES) {
|
|
293
|
+
console.log(` 📦 ${testCase.name}:`)
|
|
294
|
+
|
|
295
|
+
const jobs = models.map(async model => {
|
|
296
|
+
try {
|
|
297
|
+
const copy = await generateCopy(model, testCase)
|
|
298
|
+
console.log(` ✓ ${model.name} (${copy.latencyMs}ms)`)
|
|
299
|
+
return copy
|
|
300
|
+
} catch (err) {
|
|
301
|
+
console.log(` ✗ ${model.name}: ${err}`)
|
|
302
|
+
return null
|
|
303
|
+
}
|
|
304
|
+
})
|
|
305
|
+
|
|
306
|
+
const results = await Promise.all(jobs)
|
|
307
|
+
allCopies.push(...results.filter((r): r is GeneratedCopy => r !== null))
|
|
308
|
+
}
|
|
309
|
+
|
|
310
|
+
const generateTime = Date.now() - startTime
|
|
311
|
+
console.log(`\n Generated ${allCopies.length} copies in ${(generateTime / 1000).toFixed(1)}s`)
|
|
312
|
+
|
|
313
|
+
// Run ELO tournament
|
|
314
|
+
const tournamentStart = Date.now()
|
|
315
|
+
const eloRatings = await runEloTournament(allCopies, JUDGE_MODEL)
|
|
316
|
+
const tournamentTime = Date.now() - tournamentStart
|
|
317
|
+
|
|
318
|
+
// Display results
|
|
319
|
+
console.log('')
|
|
320
|
+
console.log('╔════════════════════════════════════════════════════════════════╗')
|
|
321
|
+
console.log('║ ELO Rankings ║')
|
|
322
|
+
console.log('╚════════════════════════════════════════════════════════════════╝')
|
|
323
|
+
console.log('')
|
|
324
|
+
console.log(' Rank | Model | ELO | W | L | D |')
|
|
325
|
+
console.log(' -----|------------------------|--------|-----|-----|-----|')
|
|
326
|
+
|
|
327
|
+
eloRatings.forEach((rating, idx) => {
|
|
328
|
+
const rank = `${idx + 1}`.padStart(4)
|
|
329
|
+
const name = rating.modelName.padEnd(22)
|
|
330
|
+
const elo = Math.round(rating.rating).toString().padStart(6)
|
|
331
|
+
const wins = rating.wins.toString().padStart(3)
|
|
332
|
+
const losses = rating.losses.toString().padStart(3)
|
|
333
|
+
const draws = rating.draws.toString().padStart(3)
|
|
334
|
+
console.log(` ${rank} | ${name} | ${elo} | ${wins} | ${losses} | ${draws} |`)
|
|
335
|
+
})
|
|
336
|
+
|
|
337
|
+
console.log('')
|
|
338
|
+
console.log(` Judge: ${JUDGE_MODEL}`)
|
|
339
|
+
console.log(` Generation Time: ${(generateTime / 1000).toFixed(1)}s`)
|
|
340
|
+
console.log(` Tournament Time: ${(tournamentTime / 1000).toFixed(1)}s`)
|
|
341
|
+
console.log(` Total Time: ${((generateTime + tournamentTime) / 1000).toFixed(1)}s`)
|
|
342
|
+
|
|
343
|
+
// Show sample outputs from top 3
|
|
344
|
+
console.log('')
|
|
345
|
+
console.log('╔════════════════════════════════════════════════════════════════╗')
|
|
346
|
+
console.log('║ Sample Outputs (Top 3) ║')
|
|
347
|
+
console.log('╚════════════════════════════════════════════════════════════════╝')
|
|
348
|
+
|
|
349
|
+
const top3Models = eloRatings.slice(0, 3).map(r => r.modelId)
|
|
350
|
+
const sampleTestCase = TEST_CASES[0]
|
|
351
|
+
|
|
352
|
+
for (const modelId of top3Models) {
|
|
353
|
+
const copy = allCopies.find(c => c.model.id === modelId && c.testCase.name === sampleTestCase.name)
|
|
354
|
+
if (copy) {
|
|
355
|
+
const rank = eloRatings.findIndex(r => r.modelId === modelId) + 1
|
|
356
|
+
console.log(`\n #${rank} ${copy.model.name} (${sampleTestCase.name}):`)
|
|
357
|
+
console.log(` ─────────────────────────────────────────`)
|
|
358
|
+
console.log(` Title: ${copy.copy.title}`)
|
|
359
|
+
console.log(` Description: ${copy.copy.description}`)
|
|
360
|
+
console.log(` Headline: ${copy.copy.hero.headline}`)
|
|
361
|
+
console.log(` Subhead: ${copy.copy.hero.subhead}`)
|
|
362
|
+
console.log(` Primary CTA: [${copy.copy.hero.primaryCTA}]`)
|
|
363
|
+
console.log(` Secondary CTA: ${copy.copy.hero.secondaryCTA}`)
|
|
364
|
+
}
|
|
365
|
+
}
|
|
366
|
+
|
|
367
|
+
console.log('')
|
|
368
|
+
}
|
|
369
|
+
|
|
370
|
+
// Entry point: run the script and surface any unhandled failure.
// Logging alone would leave the exit status at 0, so a failed run would look
// successful to shells and CI; mark the process as failed as well.
main().catch((error) => {
  console.error(error)
  process.exitCode = 1
})
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Math Eval
|
|
3
|
+
*
|
|
4
|
+
* Tests model mathematical reasoning from simple arithmetic
|
|
5
|
+
* to word problems.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { evalite } from 'evalite'
|
|
9
|
+
import { generateObject } from '../src/generate.js'
|
|
10
|
+
import { schema } from '../src/schema.js'
|
|
11
|
+
import { createModelVariants, type EvalModel } from '../src/eval/models.js'
|
|
12
|
+
|
|
13
|
+
// Math test cases
|
|
14
|
+
// Math problems used by this eval. Each entry carries the prompt text, the
// exact numeric answer the model is expected to produce, and a difficulty label.
const TEST_CASES = [
  // Arithmetic
  {
    problem: 'What is 15 + 27?',
    expected: 42,
    difficulty: 'easy',
  },
  {
    problem: 'What is 144 / 12?',
    expected: 12,
    difficulty: 'easy',
  },
  {
    problem: 'What is 7 * 8?',
    expected: 56,
    difficulty: 'easy',
  },

  // Word problems
  {
    problem: 'A store sells 45 apples at $2 each. What is the total revenue?',
    expected: 90,
    difficulty: 'medium',
  },
  {
    problem: 'A train travels 240 miles in 4 hours. What is the average speed in mph?',
    expected: 60,
    difficulty: 'medium',
  },

  // Multi-step
  {
    problem: 'A company has 120 employees. 40% work in engineering, and 25% of engineers are senior. How many senior engineers?',
    expected: 12,
    difficulty: 'hard',
  },
]
|
|
27
|
+
|
|
28
|
+
// Only the fast tier of models is requested for this eval.
const modelVariants = createModelVariants({ tiers: ['fast'] })

// Register the "Math" eval once per model variant.
evalite.each(modelVariants)('Math', {
  // Every test case is passed through whole as the input; its numeric answer
  // is additionally exposed as `expected` for the scorers and columns.
  data: TEST_CASES.map(testCase => ({ input: testCase, expected: testCase.expected })),

  // Ask the variant's model for a structured { answer, reasoning } object and
  // return it together with the case metadata, wall-clock latency and usage.
  task: async (input, variant) => {
    const model = variant as EvalModel
    const startedAt = Date.now()

    const generation = await generateObject({
      model: model.id,
      schema: schema({
        answer: 'The numeric answer (number)',
        reasoning: 'Step by step reasoning',
      }),
      prompt: `Solve this math problem:\n\n${input.problem}`,
    })

    // Latency is measured immediately after the generation call resolves.
    const latencyMs = Date.now() - startedAt
    const { object, usage } = generation

    return {
      answer: object.answer,
      reasoning: object.reasoning,
      expected: input.expected,
      problem: input.problem,
      difficulty: input.difficulty,
      modelId: model.id,
      modelName: model.name,
      latencyMs,
      usage,
    }
  },

  scorers: [
    // Exact answer
    {
      name: 'Correct Answer',
      description: 'Whether the numeric answer is correct',
      scorer: ({ output, expected }) => {
        // Compare with a small tolerance rather than strict equality.
        const delta = Math.abs((output.answer as number) - (expected as number))
        return { score: delta < 0.01 ? 1 : 0 }
      },
    },

    // Shows reasoning
    {
      name: 'Shows Work',
      description: 'Whether model explains reasoning',
      scorer: ({ output }) => {
        // Score purely on the length of the reasoning text:
        // missing or under 20 chars -> 0.2, over 50 chars -> 1, otherwise 0.6.
        const reasoning = output.reasoning as string
        const length = reasoning ? reasoning.length : 0
        const score = length < 20 ? 0.2 : length > 50 ? 1 : 0.6
        return { score }
      },
    },
  ],

  // Extra per-row columns for the results table.
  columns: ({ output, expected }) => {
    const delta = Math.abs((output.answer as number) - (expected as number))
    return [
      { label: 'Model', value: output.modelName },
      { label: 'Difficulty', value: output.difficulty },
      { label: 'Expected', value: expected },
      { label: 'Got', value: output.answer },
      { label: 'Correct', value: delta < 0.01 ? 'Yes' : 'No' },
    ]
  },
})
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
#!/usr/bin/env npx tsx
|
|
2
|
+
/**
|
|
3
|
+
* Run AI Functions Eval Suite
|
|
4
|
+
*
|
|
5
|
+
* Usage:
|
|
6
|
+
* npx tsx evals/run-evals.ts [--fast] [--all] [--math] [--class]
|
|
7
|
+
*
|
|
8
|
+
* Options:
|
|
9
|
+
* --fast Only run fast-tier models (default)
|
|
10
|
+
* --all Run all models
|
|
11
|
+
* --math Run only math eval
|
|
12
|
+
* --class Run only classification eval
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
import { runEval, generateObject, generateText, schema } from '../src/eval/runner.js'
|
|
16
|
+
import type { EvalModel, ModelTier } from '../src/eval/models.js'
|
|
17
|
+
|
|
18
|
+
// Parse CLI args
|
|
19
|
+
// Everything after the interpreter and script path is treated as a flag.
const args = process.argv.slice(2)
const hasFlag = (flag: string) => args.includes(flag)

const runAll = hasFlag('--all')
const runMath = hasFlag('--math')
const runClass = hasFlag('--class')
// True when at least one eval-specific flag was given.
const runSingle = [runMath, runClass].some(Boolean)

// `--all` widens the run to every tier; otherwise only the fast tier is used.
const tiers: ModelTier[] = runAll ? ['best', 'fast', 'cheap'] : ['fast']

// Startup banner followed by the selected tiers.
for (const line of [
  '╔════════════════════════════════════════════════════════════════╗',
  '║ AI Functions Eval Suite ║',
  '╚════════════════════════════════════════════════════════════════╝',
  '',
]) {
  console.log(line)
}
console.log(`Tiers: ${tiers.join(', ')}`)
|
|
32
|
+
|
|
33
|
+
// Math eval
|
|
34
|
+
/**
 * Run the math eval: each model in the selected tiers is asked to solve a
 * handful of arithmetic and word problems, and the structured reply is scored
 * on numeric correctness and on how much reasoning text it contains.
 *
 * @returns whatever `runEval` resolves to for the "Math" eval
 */
async function runMathEval() {
  const cases = [
    {
      name: 'Simple addition',
      input: { problem: 'What is 15 + 27?' },
      expected: 42,
    },
    {
      name: 'Division',
      input: { problem: 'What is 144 / 12?' },
      expected: 12,
    },
    {
      name: 'Multiplication',
      input: { problem: 'What is 7 * 8?' },
      expected: 56,
    },
    {
      name: 'Word problem',
      input: { problem: 'A store sells 45 apples at $2 each. What is the total revenue?' },
      expected: 90,
    },
    {
      name: 'Multi-step',
      input: { problem: 'A company has 120 employees. 40% work in engineering, and 25% of engineers are senior. How many senior engineers?' },
      expected: 12,
    },
  ]

  // 1 when the answer is within 0.01 of the expected value, otherwise 0.
  const correctAnswer = {
    name: 'Correct Answer',
    description: 'Whether the numeric answer is correct',
    scorer: ({ output, expected }) => {
      const { answer } = output as { answer: number }
      const delta = Math.abs(answer - (expected as number))
      return delta < 0.01 ? 1 : 0
    },
  }

  // Length-based score: missing or under 20 chars -> 0.2, over 50 chars -> 1, otherwise 0.6.
  const showsWork = {
    name: 'Shows Work',
    description: 'Whether model explains reasoning',
    scorer: ({ output }) => {
      const { reasoning } = output as { reasoning: string }
      const length = reasoning ? reasoning.length : 0
      return length < 20 ? 0.2 : length > 50 ? 1 : 0.6
    },
  }

  return runEval({
    name: 'Math',
    cases,
    tiers,
    // Request a structured { answer, reasoning } object from the model.
    task: async (input, model) => {
      const generation = await generateObject({
        model: model.id,
        schema: schema({
          answer: 'The numeric answer (number)',
          reasoning: 'Step by step reasoning',
        }),
        prompt: `Solve this math problem:\n\n${input.problem}`,
      })
      return generation.object
    },
    scorers: [correctAnswer, showsWork],
  })
}
|
|
81
|
+
|
|
82
|
+
// Classification eval
|
|
83
|
+
/**
 * Run the classification eval: each model in the selected tiers must assign a
 * short text to one of the options supplied with the case (sentiment labels or
 * support-ticket categories). Replies are scored on matching the expected
 * label and on being one of the offered options at all.
 *
 * @returns whatever `runEval` resolves to for the "Classification" eval
 */
async function runClassificationEval() {
  // Factories so that every case receives its own options array.
  const sentiment = () => ['positive', 'negative', 'neutral']
  const ticket = () => ['account', 'billing', 'technical', 'shipping']

  const cases = [
    {
      name: 'Positive sentiment',
      input: { text: 'This product exceeded my expectations!', options: sentiment() },
      expected: 'positive',
    },
    {
      name: 'Negative sentiment',
      input: { text: 'The delivery was late and packaging damaged.', options: sentiment() },
      expected: 'negative',
    },
    {
      name: 'Neutral sentiment',
      input: { text: 'The product arrived as described.', options: sentiment() },
      expected: 'neutral',
    },
    {
      name: 'Account ticket',
      input: { text: 'I need to reset my password', options: ticket() },
      expected: 'account',
    },
    {
      name: 'Billing ticket',
      input: { text: 'When will my refund be processed?', options: ticket() },
      expected: 'billing',
    },
    {
      name: 'Technical ticket',
      input: { text: 'The app crashes when uploading images', options: ticket() },
      expected: 'technical',
    },
  ]

  // 1 when the predicted category equals the expected label, otherwise 0.
  const accuracy = {
    name: 'Accuracy',
    description: 'Whether classification is correct',
    scorer: ({ output, expected }) => {
      const { category } = output as { category: string }
      return Number(category === expected)
    },
  }

  // 1 when the predicted category is one of the options offered, otherwise 0.
  const validCategory = {
    name: 'Valid Category',
    description: 'Whether output is a valid option',
    scorer: ({ input, output }) => {
      const { category } = output as { category: string }
      const { options } = input as { options: string[] }
      return Number(options.includes(category))
    },
  }

  return runEval({
    name: 'Classification',
    cases,
    tiers,
    task: async (input, model) => {
      // The schema field lists the options joined with " | "; the prompt lists them joined with ", ".
      const enumStr = input.options.join(' | ')
      const generation = await generateObject({
        model: model.id,
        schema: schema({
          category: enumStr,
          confidence: 'Confidence 0-1 (number)',
        }),
        prompt: `Classify this text into one of: ${input.options.join(', ')}\n\nText: "${input.text}"`,
      })
      return generation.object
    },
    scorers: [accuracy, validCategory],
  })
}
|
|
130
|
+
|
|
131
|
+
// Run evals
|
|
132
|
+
/**
 * Run the selected evals one after another and print a summary: one line per
 * eval with its average score, then the mean of those averages, the summed
 * cost and the summed time.
 */
async function main() {
  const results = []

  // With no eval-specific flag every eval runs; otherwise only the requested ones.
  const runEverything = !runSingle
  if (runEverything || runMath) results.push(await runMathEval())
  if (runEverything || runClass) results.push(await runClassificationEval())

  // Overall summary
  const banner = [
    '',
    '╔════════════════════════════════════════════════════════════════╗',
    '║ Summary ║',
    '╚════════════════════════════════════════════════════════════════╝',
  ]
  for (const line of banner) console.log(line)

  const totals = { score: 0, cost: 0, time: 0 }
  results.forEach(result => {
    console.log(`\n${result.name}: ${(result.avgScore * 100).toFixed(1)}%`)
    totals.score += result.avgScore
    totals.cost += result.totalCost
    totals.time += result.totalTime
  })

  // Mean of the per-eval averages, shown as a percentage; time is divided by 1000 for display in seconds.
  const overallPercent = (totals.score / results.length) * 100
  const totalSeconds = totals.time / 1000

  console.log('')
  console.log(`Overall: ${overallPercent.toFixed(1)}%`)
  console.log(`Total Cost: $${totals.cost.toFixed(4)}`)
  console.log(`Total Time: ${totalSeconds.toFixed(1)}s`)
}
|
|
165
|
+
|
|
166
|
+
// Entry point: run the suite and surface any unhandled failure.
// Logging alone would leave the exit status at 0, so a failed eval run would
// look successful to shells and CI; mark the process as failed as well.
main().catch((error) => {
  console.error(error)
  process.exitCode = 1
})
|