ai-functions 0.2.19 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +5 -0
- package/.turbo/turbo-test.log +105 -0
- package/README.md +232 -37
- package/TODO.md +138 -0
- package/dist/ai-promise.d.ts +219 -0
- package/dist/ai-promise.d.ts.map +1 -0
- package/dist/ai-promise.js +610 -0
- package/dist/ai-promise.js.map +1 -0
- package/dist/ai.d.ts +285 -0
- package/dist/ai.d.ts.map +1 -0
- package/dist/ai.js +842 -0
- package/dist/ai.js.map +1 -0
- package/dist/batch/anthropic.d.ts +23 -0
- package/dist/batch/anthropic.d.ts.map +1 -0
- package/dist/batch/anthropic.js +257 -0
- package/dist/batch/anthropic.js.map +1 -0
- package/dist/batch/bedrock.d.ts +64 -0
- package/dist/batch/bedrock.d.ts.map +1 -0
- package/dist/batch/bedrock.js +586 -0
- package/dist/batch/bedrock.js.map +1 -0
- package/dist/batch/cloudflare.d.ts +37 -0
- package/dist/batch/cloudflare.d.ts.map +1 -0
- package/dist/batch/cloudflare.js +289 -0
- package/dist/batch/cloudflare.js.map +1 -0
- package/dist/batch/google.d.ts +41 -0
- package/dist/batch/google.d.ts.map +1 -0
- package/dist/batch/google.js +360 -0
- package/dist/batch/google.js.map +1 -0
- package/dist/batch/index.d.ts +31 -0
- package/dist/batch/index.d.ts.map +1 -0
- package/dist/batch/index.js +31 -0
- package/dist/batch/index.js.map +1 -0
- package/dist/batch/memory.d.ts +44 -0
- package/dist/batch/memory.d.ts.map +1 -0
- package/dist/batch/memory.js +188 -0
- package/dist/batch/memory.js.map +1 -0
- package/dist/batch/openai.d.ts +37 -0
- package/dist/batch/openai.d.ts.map +1 -0
- package/dist/batch/openai.js +403 -0
- package/dist/batch/openai.js.map +1 -0
- package/dist/batch-map.d.ts +125 -0
- package/dist/batch-map.d.ts.map +1 -0
- package/dist/batch-map.js +406 -0
- package/dist/batch-map.js.map +1 -0
- package/dist/batch-queue.d.ts +273 -0
- package/dist/batch-queue.d.ts.map +1 -0
- package/dist/batch-queue.js +271 -0
- package/dist/batch-queue.js.map +1 -0
- package/dist/context.d.ts +133 -0
- package/dist/context.d.ts.map +1 -0
- package/dist/context.js +267 -0
- package/dist/context.js.map +1 -0
- package/dist/embeddings.d.ts +123 -0
- package/dist/embeddings.d.ts.map +1 -0
- package/dist/embeddings.js +170 -0
- package/dist/embeddings.js.map +1 -0
- package/dist/eval/index.d.ts +8 -0
- package/dist/eval/index.d.ts.map +1 -0
- package/dist/eval/index.js +8 -0
- package/dist/eval/index.js.map +1 -0
- package/dist/eval/models.d.ts +66 -0
- package/dist/eval/models.d.ts.map +1 -0
- package/dist/eval/models.js +120 -0
- package/dist/eval/models.js.map +1 -0
- package/dist/eval/runner.d.ts +64 -0
- package/dist/eval/runner.d.ts.map +1 -0
- package/dist/eval/runner.js +148 -0
- package/dist/eval/runner.js.map +1 -0
- package/dist/generate.d.ts +168 -0
- package/dist/generate.d.ts.map +1 -0
- package/dist/generate.js +174 -0
- package/dist/generate.js.map +1 -0
- package/dist/index.d.ts +30 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +54 -0
- package/dist/index.js.map +1 -0
- package/dist/primitives.d.ts +292 -0
- package/dist/primitives.d.ts.map +1 -0
- package/dist/primitives.js +471 -0
- package/dist/primitives.js.map +1 -0
- package/dist/providers/cloudflare.d.ts +9 -0
- package/dist/providers/cloudflare.d.ts.map +1 -0
- package/dist/providers/cloudflare.js +9 -0
- package/dist/providers/cloudflare.js.map +1 -0
- package/dist/providers/index.d.ts +9 -0
- package/dist/providers/index.d.ts.map +1 -0
- package/dist/providers/index.js +9 -0
- package/dist/providers/index.js.map +1 -0
- package/dist/schema.d.ts +54 -0
- package/dist/schema.d.ts.map +1 -0
- package/dist/schema.js +109 -0
- package/dist/schema.js.map +1 -0
- package/dist/template.d.ts +73 -0
- package/dist/template.d.ts.map +1 -0
- package/dist/template.js +129 -0
- package/dist/template.js.map +1 -0
- package/dist/types.d.ts +481 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +5 -0
- package/dist/types.js.map +1 -0
- package/evalite.config.ts +19 -0
- package/evals/README.md +212 -0
- package/evals/classification.eval.ts +108 -0
- package/evals/marketing.eval.ts +370 -0
- package/evals/math.eval.ts +94 -0
- package/evals/run-evals.ts +166 -0
- package/evals/structured-output.eval.ts +143 -0
- package/evals/writing.eval.ts +117 -0
- package/examples/batch-blog-posts.ts +160 -0
- package/package.json +59 -43
- package/src/ai-promise.ts +784 -0
- package/src/ai.ts +1183 -0
- package/src/batch/anthropic.ts +375 -0
- package/src/batch/bedrock.ts +801 -0
- package/src/batch/cloudflare.ts +421 -0
- package/src/batch/google.ts +491 -0
- package/src/batch/index.ts +31 -0
- package/src/batch/memory.ts +253 -0
- package/src/batch/openai.ts +557 -0
- package/src/batch-map.ts +534 -0
- package/src/batch-queue.ts +493 -0
- package/src/context.ts +332 -0
- package/src/embeddings.ts +244 -0
- package/src/eval/index.ts +8 -0
- package/src/eval/models.ts +158 -0
- package/src/eval/runner.ts +217 -0
- package/src/generate.ts +245 -0
- package/src/index.ts +154 -0
- package/src/primitives.ts +612 -0
- package/src/providers/cloudflare.ts +15 -0
- package/src/providers/index.ts +14 -0
- package/src/schema.ts +147 -0
- package/src/template.ts +209 -0
- package/src/types.ts +540 -0
- package/test/README.md +105 -0
- package/test/ai-proxy.test.ts +192 -0
- package/test/async-iterators.test.ts +327 -0
- package/test/batch-background.test.ts +482 -0
- package/test/batch-blog-posts.test.ts +387 -0
- package/test/blog-generation.test.ts +510 -0
- package/test/browse-read.test.ts +611 -0
- package/test/core-functions.test.ts +694 -0
- package/test/decide.test.ts +393 -0
- package/test/define.test.ts +274 -0
- package/test/e2e-bedrock-manual.ts +163 -0
- package/test/e2e-bedrock.test.ts +191 -0
- package/test/e2e-flex-gateway.ts +157 -0
- package/test/e2e-flex-manual.ts +183 -0
- package/test/e2e-flex.test.ts +209 -0
- package/test/e2e-google-manual.ts +178 -0
- package/test/e2e-google.test.ts +216 -0
- package/test/embeddings.test.ts +284 -0
- package/test/evals/define-function.eval.test.ts +379 -0
- package/test/evals/primitives.eval.test.ts +384 -0
- package/test/function-types.test.ts +492 -0
- package/test/generate-core.test.ts +319 -0
- package/test/generate.test.ts +163 -0
- package/test/implicit-batch.test.ts +422 -0
- package/test/schema.test.ts +109 -0
- package/test/tagged-templates.test.ts +302 -0
- package/tsconfig.json +8 -6
- package/vitest.config.ts +42 -0
- package/LICENSE +0 -21
- package/db/cache.ts +0 -6
- package/db/mongo.ts +0 -75
- package/dist/mjs/db/cache.d.ts +0 -1
- package/dist/mjs/db/cache.js +0 -5
- package/dist/mjs/db/mongo.d.ts +0 -31
- package/dist/mjs/db/mongo.js +0 -48
- package/dist/mjs/examples/data.d.ts +0 -1105
- package/dist/mjs/examples/data.js +0 -1105
- package/dist/mjs/functions/ai.d.ts +0 -20
- package/dist/mjs/functions/ai.js +0 -83
- package/dist/mjs/functions/ai.test.d.ts +0 -1
- package/dist/mjs/functions/ai.test.js +0 -29
- package/dist/mjs/functions/gpt.d.ts +0 -4
- package/dist/mjs/functions/gpt.js +0 -10
- package/dist/mjs/functions/list.d.ts +0 -7
- package/dist/mjs/functions/list.js +0 -72
- package/dist/mjs/index.d.ts +0 -3
- package/dist/mjs/index.js +0 -3
- package/dist/mjs/queue/kafka.d.ts +0 -0
- package/dist/mjs/queue/kafka.js +0 -1
- package/dist/mjs/queue/memory.d.ts +0 -0
- package/dist/mjs/queue/memory.js +0 -1
- package/dist/mjs/queue/mongo.d.ts +0 -30
- package/dist/mjs/queue/mongo.js +0 -42
- package/dist/mjs/streams/kafka.d.ts +0 -0
- package/dist/mjs/streams/kafka.js +0 -1
- package/dist/mjs/streams/memory.d.ts +0 -0
- package/dist/mjs/streams/memory.js +0 -1
- package/dist/mjs/streams/mongo.d.ts +0 -0
- package/dist/mjs/streams/mongo.js +0 -1
- package/dist/mjs/streams/types.d.ts +0 -0
- package/dist/mjs/streams/types.js +0 -1
- package/dist/mjs/types.d.ts +0 -11
- package/dist/mjs/types.js +0 -1
- package/dist/mjs/utils/completion.d.ts +0 -9
- package/dist/mjs/utils/completion.js +0 -20
- package/dist/mjs/utils/schema.d.ts +0 -10
- package/dist/mjs/utils/schema.js +0 -72
- package/dist/mjs/utils/schema.test.d.ts +0 -1
- package/dist/mjs/utils/schema.test.js +0 -60
- package/dist/mjs/utils/state.d.ts +0 -1
- package/dist/mjs/utils/state.js +0 -19
- package/examples/data.ts +0 -1105
- package/fixup +0 -11
- package/functions/ai.test.ts +0 -41
- package/functions/ai.ts +0 -115
- package/functions/gpt.ts +0 -12
- package/functions/list.ts +0 -84
- package/index.ts +0 -3
- package/queue/kafka.ts +0 -0
- package/queue/memory.ts +0 -0
- package/queue/mongo.ts +0 -88
- package/streams/kafka.ts +0 -0
- package/streams/memory.ts +0 -0
- package/streams/mongo.ts +0 -0
- package/streams/types.ts +0 -0
- package/tsconfig-backup.json +0 -105
- package/tsconfig-base.json +0 -26
- package/tsconfig-cjs.json +0 -8
- package/types.ts +0 -12
- package/utils/completion.ts +0 -28
- package/utils/schema.test.ts +0 -69
- package/utils/schema.ts +0 -74
- package/utils/state.ts +0 -23
package/evals/math.eval.ts
@@ -0,0 +1,94 @@
+/**
+ * Math Eval
+ *
+ * Tests model mathematical reasoning from simple arithmetic
+ * to word problems.
+ */
+
+import { evalite } from 'evalite'
+import { generateObject } from '../src/generate.js'
+import { schema } from '../src/schema.js'
+import { createModelVariants, type EvalModel } from '../src/eval/models.js'
+
+// Math test cases
+const TEST_CASES = [
+  // Arithmetic
+  { problem: 'What is 15 + 27?', expected: 42, difficulty: 'easy' },
+  { problem: 'What is 144 / 12?', expected: 12, difficulty: 'easy' },
+  { problem: 'What is 7 * 8?', expected: 56, difficulty: 'easy' },
+
+  // Word problems
+  { problem: 'A store sells 45 apples at $2 each. What is the total revenue?', expected: 90, difficulty: 'medium' },
+  { problem: 'A train travels 240 miles in 4 hours. What is the average speed in mph?', expected: 60, difficulty: 'medium' },
+
+  // Multi-step
+  { problem: 'A company has 120 employees. 40% work in engineering, and 25% of engineers are senior. How many senior engineers?', expected: 12, difficulty: 'hard' },
+]
+
+const modelVariants = createModelVariants({ tiers: ['fast'] })
+
+evalite.each(modelVariants)('Math', {
+  data: TEST_CASES.map(tc => ({ input: tc, expected: tc.expected })),
+
+  task: async (input, variant) => {
+    const model = variant as EvalModel
+    const startTime = Date.now()
+
+    const { object, usage } = await generateObject({
+      model: model.id,
+      schema: schema({
+        answer: 'The numeric answer (number)',
+        reasoning: 'Step by step reasoning',
+      }),
+      prompt: `Solve this math problem:\n\n${input.problem}`,
+    })
+
+    const latencyMs = Date.now() - startTime
+
+    return {
+      answer: object.answer,
+      reasoning: object.reasoning,
+      expected: input.expected,
+      problem: input.problem,
+      difficulty: input.difficulty,
+      modelId: model.id,
+      modelName: model.name,
+      latencyMs,
+      usage,
+    }
+  },
+
+  scorers: [
+    // Exact answer
+    {
+      name: 'Correct Answer',
+      description: 'Whether the numeric answer is correct',
+      scorer: ({ output, expected }) => {
+        const answer = output.answer as number
+        const exp = expected as number
+        // Allow small floating point tolerance
+        return { score: Math.abs(answer - exp) < 0.01 ? 1 : 0 }
+      },
+    },
+
+    // Shows reasoning
+    {
+      name: 'Shows Work',
+      description: 'Whether model explains reasoning',
+      scorer: ({ output }) => {
+        const reasoning = output.reasoning as string
+        if (!reasoning || reasoning.length < 20) return { score: 0.2 }
+        if (reasoning.length > 50) return { score: 1 }
+        return { score: 0.6 }
+      },
+    },
+  ],
+
+  columns: ({ output, expected }) => [
+    { label: 'Model', value: output.modelName },
+    { label: 'Difficulty', value: output.difficulty },
+    { label: 'Expected', value: expected },
+    { label: 'Got', value: output.answer },
+    { label: 'Correct', value: Math.abs((output.answer as number) - (expected as number)) < 0.01 ? 'Yes' : 'No' },
+  ],
+})
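All of the new evals build their output schemas with the package's `schema()` helper, passing plain-English field descriptions with lightweight type hints such as `(number)` or `positive | negative | neutral`. As an illustration only (this is an assumed mapping, not the library's implementation), the math eval's schema corresponds roughly to the following Zod shape:

```ts
import { z } from 'zod'

// Hypothetical equivalent of schema({ answer: 'The numeric answer (number)', reasoning: 'Step by step reasoning' }).
// Assumes a '(number)' hint maps to a numeric field and plain descriptions default to strings.
const mathAnswerSchema = z.object({
  answer: z.number().describe('The numeric answer'),
  reasoning: z.string().describe('Step by step reasoning'),
})

type MathAnswer = z.infer<typeof mathAnswerSchema> // { answer: number; reasoning: string }
```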
package/evals/run-evals.ts
@@ -0,0 +1,166 @@
+#!/usr/bin/env npx tsx
+/**
+ * Run AI Functions Eval Suite
+ *
+ * Usage:
+ *   npx tsx evals/run-evals.ts [--fast] [--all]
+ *
+ * Options:
+ *   --fast   Only run fast-tier models (default)
+ *   --all    Run all models
+ *   --math   Run only math eval
+ *   --class  Run only classification eval
+ */
+
+import { runEval, generateObject, generateText, schema } from '../src/eval/runner.js'
+import type { EvalModel, ModelTier } from '../src/eval/models.js'
+
+// Parse CLI args
+const args = process.argv.slice(2)
+const runAll = args.includes('--all')
+const runMath = args.includes('--math')
+const runClass = args.includes('--class')
+const runSingle = runMath || runClass
+
+const tiers: ModelTier[] = runAll ? ['best', 'fast', 'cheap'] : ['fast']
+
+console.log('╔════════════════════════════════════════════════════════════════╗')
+console.log('║                    AI Functions Eval Suite                      ║')
+console.log('╚════════════════════════════════════════════════════════════════╝')
+console.log('')
+console.log(`Tiers: ${tiers.join(', ')}`)
+
+// Math eval
+async function runMathEval() {
+  const cases = [
+    { name: 'Simple addition', input: { problem: 'What is 15 + 27?' }, expected: 42 },
+    { name: 'Division', input: { problem: 'What is 144 / 12?' }, expected: 12 },
+    { name: 'Multiplication', input: { problem: 'What is 7 * 8?' }, expected: 56 },
+    { name: 'Word problem', input: { problem: 'A store sells 45 apples at $2 each. What is the total revenue?' }, expected: 90 },
+    { name: 'Multi-step', input: { problem: 'A company has 120 employees. 40% work in engineering, and 25% of engineers are senior. How many senior engineers?' }, expected: 12 },
+  ]
+
+  return runEval({
+    name: 'Math',
+    cases,
+    tiers,
+    task: async (input, model) => {
+      const { object } = await generateObject({
+        model: model.id,
+        schema: schema({
+          answer: 'The numeric answer (number)',
+          reasoning: 'Step by step reasoning',
+        }),
+        prompt: `Solve this math problem:\n\n${input.problem}`,
+      })
+      return object
+    },
+    scorers: [
+      {
+        name: 'Correct Answer',
+        description: 'Whether the numeric answer is correct',
+        scorer: ({ output, expected }) => {
+          const answer = (output as { answer: number }).answer
+          const exp = expected as number
+          return Math.abs(answer - exp) < 0.01 ? 1 : 0
+        },
+      },
+      {
+        name: 'Shows Work',
+        description: 'Whether model explains reasoning',
+        scorer: ({ output }) => {
+          const reasoning = (output as { reasoning: string }).reasoning
+          if (!reasoning || reasoning.length < 20) return 0.2
+          if (reasoning.length > 50) return 1
+          return 0.6
+        },
+      },
+    ],
+  })
+}
+
+// Classification eval
+async function runClassificationEval() {
+  const cases = [
+    { name: 'Positive sentiment', input: { text: 'This product exceeded my expectations!', options: ['positive', 'negative', 'neutral'] }, expected: 'positive' },
+    { name: 'Negative sentiment', input: { text: 'The delivery was late and packaging damaged.', options: ['positive', 'negative', 'neutral'] }, expected: 'negative' },
+    { name: 'Neutral sentiment', input: { text: 'The product arrived as described.', options: ['positive', 'negative', 'neutral'] }, expected: 'neutral' },
+    { name: 'Account ticket', input: { text: 'I need to reset my password', options: ['account', 'billing', 'technical', 'shipping'] }, expected: 'account' },
+    { name: 'Billing ticket', input: { text: 'When will my refund be processed?', options: ['account', 'billing', 'technical', 'shipping'] }, expected: 'billing' },
+    { name: 'Technical ticket', input: { text: 'The app crashes when uploading images', options: ['account', 'billing', 'technical', 'shipping'] }, expected: 'technical' },
+  ]
+
+  return runEval({
+    name: 'Classification',
+    cases,
+    tiers,
+    task: async (input, model) => {
+      const enumStr = input.options.join(' | ')
+      const { object } = await generateObject({
+        model: model.id,
+        schema: schema({
+          category: enumStr,
+          confidence: 'Confidence 0-1 (number)',
+        }),
+        prompt: `Classify this text into one of: ${input.options.join(', ')}\n\nText: "${input.text}"`,
+      })
+      return object
+    },
+    scorers: [
+      {
+        name: 'Accuracy',
+        description: 'Whether classification is correct',
+        scorer: ({ output, expected }) => {
+          const predicted = (output as { category: string }).category
+          return predicted === expected ? 1 : 0
+        },
+      },
+      {
+        name: 'Valid Category',
+        description: 'Whether output is a valid option',
+        scorer: ({ input, output }) => {
+          const predicted = (output as { category: string }).category
+          const options = (input as { options: string[] }).options
+          return options.includes(predicted) ? 1 : 0
+        },
+      },
+    ],
+  })
+}
+
+// Run evals
+async function main() {
+  const results = []
+
+  if (!runSingle || runMath) {
+    results.push(await runMathEval())
+  }
+
+  if (!runSingle || runClass) {
+    results.push(await runClassificationEval())
+  }
+
+  // Overall summary
+  console.log('')
+  console.log('╔════════════════════════════════════════════════════════════════╗')
+  console.log('║                             Summary                            ║')
+  console.log('╚════════════════════════════════════════════════════════════════╝')
+
+  let totalScore = 0
+  let totalCost = 0
+  let totalTime = 0
+
+  for (const result of results) {
+    console.log(`\n${result.name}: ${(result.avgScore * 100).toFixed(1)}%`)
+    totalScore += result.avgScore
+    totalCost += result.totalCost
+    totalTime += result.totalTime
+  }
+
+  console.log('')
+  console.log(`Overall: ${((totalScore / results.length) * 100).toFixed(1)}%`)
+  console.log(`Total Cost: $${totalCost.toFixed(4)}`)
+  console.log(`Total Time: ${(totalTime / 1000).toFixed(1)}s`)
+}
+
+main().catch(console.error)
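For reference, `main()` above only relies on a handful of fields from whatever `runEval` resolves to. A minimal sketch of that assumed result shape, inferred purely from the summary loop (the authoritative type lives in `package/src/eval/runner.ts`):

```ts
// Assumed shape, inferred from how main() consumes each result above.
interface EvalRunResult {
  name: string      // eval name, e.g. 'Math' or 'Classification'
  avgScore: number  // average score in 0-1, printed as a percentage
  totalCost: number // USD, printed with toFixed(4)
  totalTime: number // milliseconds, printed as seconds via totalTime / 1000
}
```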
package/evals/structured-output.eval.ts
@@ -0,0 +1,143 @@
+/**
+ * Structured Output Eval
+ *
+ * Tests model ability to generate valid structured JSON output
+ * matching specified schemas across all providers.
+ */
+
+import { evalite } from 'evalite'
+import { generateObject } from '../src/generate.js'
+import { schema } from '../src/schema.js'
+import { createModelVariants, getModelPricing, type EvalModel } from '../src/eval/models.js'
+
+// Test cases for structured output
+const TEST_CASES = [
+  {
+    name: 'Simple object',
+    prompt: 'Generate a greeting in French',
+    schema: {
+      greeting: 'A friendly greeting',
+      language: 'The language of the greeting',
+    },
+    expectedTypes: { greeting: 'string', language: 'string' },
+  },
+  {
+    name: 'With numbers',
+    prompt: 'Generate info about Tokyo',
+    schema: {
+      name: 'City name',
+      population: 'Population in millions (number)',
+      area: 'Area in square kilometers (number)',
+    },
+    expectedTypes: { name: 'string', population: 'number', area: 'number' },
+  },
+  {
+    name: 'With arrays',
+    prompt: 'Generate a simple pasta recipe',
+    schema: {
+      title: 'Recipe title',
+      ingredients: ['List of ingredients'],
+      steps: ['Cooking steps'],
+    },
+    expectedTypes: { title: 'string', ingredients: 'array', steps: 'array' },
+  },
+  {
+    name: 'With enum',
+    prompt: 'Analyze sentiment: "I love this product!"',
+    schema: {
+      sentiment: 'positive | negative | neutral',
+      confidence: 'Confidence score 0-1 (number)',
+    },
+    expectedTypes: { sentiment: 'string', confidence: 'number' },
+  },
+  {
+    name: 'Nested object',
+    prompt: 'Generate a fictional person living in Japan',
+    schema: {
+      person: { name: 'Full name', age: 'Age (number)' },
+      address: { city: 'City name', country: 'Country name' },
+    },
+    expectedTypes: { person: 'object', address: 'object' },
+  },
+]
+
+// Test across models - start with fast tier for quick iteration
+const modelVariants = createModelVariants({ tiers: ['fast'] })
+
+evalite.each(modelVariants)('Structured Output', {
+  data: TEST_CASES.map(tc => ({ input: tc })),
+
+  task: async (input, variant) => {
+    const model = variant as EvalModel
+    const startTime = Date.now()
+
+    const { object, usage } = await generateObject({
+      model: model.id,
+      schema: schema(input.schema),
+      prompt: input.prompt,
+    })
+
+    const latencyMs = Date.now() - startTime
+
+    // Calculate cost from language-models pricing
+    const pricing = getModelPricing(model.id)
+    const cost = pricing
+      ? ((usage?.promptTokens ?? 0) * pricing.prompt + (usage?.completionTokens ?? 0) * pricing.completion) / 1_000_000
+      : 0
+
+    return {
+      object,
+      expectedTypes: input.expectedTypes,
+      testName: input.name,
+      modelId: model.id,
+      modelName: model.name,
+      provider: model.provider,
+      latencyMs,
+      cost,
+      usage,
+    }
+  },
+
+  scorers: [
+    // Type accuracy
+    {
+      name: 'Type Accuracy',
+      description: 'Whether output fields have correct types',
+      scorer: ({ output }) => {
+        const obj = output.object as Record<string, unknown>
+        const expected = output.expectedTypes as Record<string, string>
+        const fields = Object.keys(expected)
+
+        let correct = 0
+        for (const field of fields) {
+          const val = obj[field]
+          const expectedType = expected[field]
+          const actualType = Array.isArray(val) ? 'array' : typeof val
+
+          if (actualType === expectedType) correct++
+        }
+
+        return { score: correct / fields.length }
+      },
+    },
+
+    // Latency
+    {
+      name: 'Latency',
+      description: 'Response time (target < 3s)',
+      scorer: ({ output }) => {
+        const ms = output.latencyMs as number
+        if (ms < 2000) return { score: 1 }
+        if (ms > 10000) return { score: 0 }
+        return { score: 1 - (ms - 2000) / 8000 }
+      },
+    },
+  ],
+
+  columns: ({ output, scores }) => [
+    { label: 'Model', value: output.modelName },
+    { label: 'Test', value: output.testName },
+    { label: 'Latency', value: `${output.latencyMs}ms` },
+    { label: 'Cost', value: `$${(output.cost as number).toFixed(6)}` },
+  ],
+})
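The `cost` calculation in the task above applies per-million-token pricing to the reported usage. A quick worked example with made-up pricing numbers (the real figures come from `getModelPricing`):

```ts
// Hypothetical pricing in USD per 1M tokens -- illustration only.
const pricing = { prompt: 0.15, completion: 0.6 }
const usage = { promptTokens: 1_200, completionTokens: 300 }

const cost =
  (usage.promptTokens * pricing.prompt + usage.completionTokens * pricing.completion) / 1_000_000
// (1200 * 0.15 + 300 * 0.6) / 1_000_000 = 360 / 1_000_000 = 0.00036

console.log(`$${cost.toFixed(6)}`) // "$0.000360" -- same format as the Cost column
```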
package/evals/writing.eval.ts
@@ -0,0 +1,117 @@
+/**
+ * Writing Quality Eval (LLM-as-Judge)
+ *
+ * Tests model writing capabilities using LLM-as-judge scoring.
+ * Uses a strong model (sonnet) to judge output quality.
+ */
+
+import { evalite } from 'evalite'
+import { generateText, generateObject } from '../src/generate.js'
+import { schema } from '../src/schema.js'
+import { createModelVariants, type EvalModel } from '../src/eval/models.js'
+
+// Use sonnet as the judge model
+const JUDGE_MODEL = 'sonnet'
+
+// Writing test cases
+const TEST_CASES = [
+  {
+    name: 'Professional email',
+    prompt: 'Write a professional email declining a meeting invitation politely.',
+    criteria: ['Polite tone', 'Clear explanation', 'Proper email format'],
+  },
+  {
+    name: 'Product description',
+    prompt: 'Write a product description for wireless earbuds targeting tech-savvy consumers.',
+    criteria: ['Highlights features', 'Compelling language', 'Clear value proposition'],
+  },
+  {
+    name: 'Explanation',
+    prompt: 'Explain how photosynthesis works in simple terms for a high school student.',
+    criteria: ['Accurate content', 'Clear language', 'Logical flow'],
+  },
+]
+
+const modelVariants = createModelVariants({ tiers: ['fast'] })
+
+evalite.each(modelVariants)('Writing Quality', {
+  data: TEST_CASES.map(tc => ({ input: tc })),
+
+  task: async (input, variant) => {
+    const model = variant as EvalModel
+    const startTime = Date.now()
+
+    // Generate the writing
+    const { text, usage } = await generateText({
+      model: model.id,
+      prompt: input.prompt,
+    })
+
+    const latencyMs = Date.now() - startTime
+
+    return {
+      text,
+      testName: input.name,
+      criteria: input.criteria,
+      modelId: model.id,
+      modelName: model.name,
+      provider: model.provider,
+      latencyMs,
+      usage,
+    }
+  },
+
+  scorers: [
+    // LLM-as-judge for quality
+    {
+      name: 'Writing Quality',
+      description: 'LLM judge evaluation of writing quality',
+      scorer: async ({ input, output }) => {
+        const { object } = await generateObject({
+          model: JUDGE_MODEL,
+          schema: schema({
+            clarity: 'How clear is the writing? (number 0-1)',
+            engagement: 'How engaging is the content? (number 0-1)',
+            accuracy: 'How well does it meet the criteria? (number 0-1)',
+            reasoning: 'Brief explanation',
+          }),
+          prompt: `Evaluate this writing on a scale of 0-1.
+
+Criteria: ${(input.criteria as string[]).join(', ')}
+
+Writing:
+"""
+${output.text}
+"""`,
+        })
+
+        const avg = ((object.clarity as number) + (object.engagement as number) + (object.accuracy as number)) / 3
+        return {
+          score: avg,
+          metadata: object,
+        }
+      },
+    },
+
+    // Word count check
+    {
+      name: 'Appropriate Length',
+      description: 'Whether output has reasonable length',
+      scorer: ({ output }) => {
+        const words = (output.text as string).split(/\s+/).length
+        if (words < 20) return { score: 0.3, metadata: { words } }
+        if (words > 500) return { score: 0.7, metadata: { words } }
+        return { score: 1, metadata: { words } }
+      },
+    },
+  ],
+
+  columns: ({ output }) => [
+    { label: 'Model', value: output.modelName },
+    { label: 'Test', value: output.testName },
+    { label: 'Words', value: (output.text as string).split(/\s+/).length },
+    { label: 'Latency', value: `${output.latencyMs}ms` },
+  ],
+
+  trialCount: 2, // Run twice for variance
+})
package/examples/batch-blog-posts.ts
@@ -0,0 +1,160 @@
+/**
+ * Batch Blog Post Generation Example
+ *
+ * This example demonstrates the new IMPLICIT batch processing:
+ *
+ * ```ts
+ * // Configure once (or use environment variables)
+ * configure({ provider: 'openai', model: 'gpt-4o', batchMode: 'auto' })
+ *
+ * // Use naturally - batching is automatic!
+ * const titles = await list`10 blog post titles about startups`
+ * const posts = titles.map(title => write`blog post: # ${title}`)
+ * console.log(await posts) // Batched automatically!
+ * ```
+ *
+ * Environment variables:
+ * - AI_PROVIDER: openai | anthropic | cloudflare | bedrock
+ * - AI_MODEL: model name (e.g., gpt-4o, claude-sonnet-4-20250514)
+ * - AI_BATCH_MODE: auto | immediate | deferred
+ * - AI_BATCH_THRESHOLD: minimum items for auto batch (default: 5)
+ *
+ * @example
+ * ```bash
+ * # Using environment variables
+ * AI_PROVIDER=openai AI_MODEL=gpt-4o npx tsx examples/batch-blog-posts.ts
+ *
+ * # Or with API keys
+ * OPENAI_API_KEY=sk-... npx tsx examples/batch-blog-posts.ts
+ * ```
+ */
+
+import {
+  list,
+  write,
+  configure,
+  withContext,
+  type BatchProvider,
+} from '../src/index.js'
+
+// Import the batch adapter for your provider
+// import '../src/batch/openai.js'
+// import '../src/batch/anthropic.js'
+// import '../src/batch/cloudflare.js'
+// import '../src/batch/bedrock.js'
+
+// For testing, use the memory adapter
+import '../src/batch/memory.js'
+
+async function main() {
+  console.log('\n🚀 Implicit Batch Blog Post Generation\n')
+
+  // ============================================================================
+  // Option 1: Global Configuration (recommended)
+  // ============================================================================
+
+  configure({
+    provider: 'openai',
+    model: 'gpt-4o',
+    batchMode: 'auto', // 'auto' | 'immediate' | 'deferred'
+    batchThreshold: 5, // Use batch API when >= 5 items
+  })
+
+  console.log('📝 Step 1: Generate titles (executes immediately)...')
+  const titles = await list`10 blog post titles about building startups in 2026`
+
+  console.log(`\nGenerated ${(titles as any).length || 10} titles`)
+
+  // ============================================================================
+  // Option 2: The Clean API (what you asked for!)
+  // ============================================================================
+
+  console.log('\n⚡ Step 2: Map titles to blog posts (automatic batching)...')
+  console.log('   Code: titles.map(title => write`blog post: # ${title}`)')
+
+  // This is the API you wanted!
+  // - No explicit batch creation
+  // - No provider/model in the code
+  // - Automatic batch detection based on context
+  const posts = (titles as string[]).map(title =>
+    write`Write a comprehensive blog post for startup founders:
+
+# ${title}
+
+Include:
+- Attention-grabbing introduction
+- 3-5 key sections with actionable insights
+- Real-world examples
+- Compelling conclusion with call-to-action`
+  )
+
+  console.log(`   Created ${posts.length} deferred operations`)
+
+  // When you await, it resolves via batch API if beneficial
+  console.log('\n⏳ Step 3: Await results (batched automatically)...')
+  // Note: Each item is an AIPromise, we'd await them all
+  // const results = await Promise.all(posts)
+
+  console.log('\n✅ Done!')
+
+  // ============================================================================
+  // Option 3: Scoped Context (for different providers in same code)
+  // ============================================================================
+
+  console.log('\n🌐 Bonus: Using withContext for scoped configuration...')
+
+  await withContext(
+    { provider: 'anthropic', model: 'claude-sonnet-4-20250514', batchMode: 'deferred' },
+    async () => {
+      console.log('  Inside context: Using Anthropic with deferred batching')
+      // All operations here use Anthropic
+      // const summaries = titles.map(title => write`summarize: ${title}`)
+    }
+  )
+
+  console.log('  Outside context: Back to OpenAI')
+}
+
+// ============================================================================
+// Summary of the API
+// ============================================================================
+
+/*
+The new API is clean and implicit:
+
+1. Configure once (globally or via environment):
+   ```ts
+   configure({ provider: 'openai', model: 'gpt-4o', batchMode: 'auto' })
+   ```
+
+2. Use naturally:
+   ```ts
+   const titles = await list`10 blog post titles`
+   const posts = titles.map(title => write`blog post: # ${title}`)
+   ```
+
+3. Batching happens automatically when:
+   - batchMode is 'auto' and items >= batchThreshold
+   - batchMode is 'deferred' (always batch)
+
+4. No batching when:
+   - batchMode is 'immediate'
+   - batchMode is 'auto' and items < batchThreshold
+
+5. Provider batch APIs supported:
+   - OpenAI: 50% discount, 24hr turnaround
+   - Anthropic: 50% discount, 24hr turnaround
+   - Cloudflare: Via AI Gateway
+   - AWS Bedrock: Native batch inference
+*/
+
+// Run the example
+main()
+  .then(() => {
+    console.log('\n✨ Example complete!\n')
+    process.exit(0)
+  })
+  .catch((error) => {
+    console.error('\n❌ Error:', error.message)
+    process.exit(1)
+  })
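The "Summary of the API" comment above spells out when an awaited `.map(...)` of `write` calls is routed through a provider batch API. A minimal sketch of that decision rule, using the `batchMode` and `batchThreshold` settings shown in `configure()` (a restatement of the documented rules, not the package's internal code):

```ts
type BatchMode = 'auto' | 'immediate' | 'deferred'

// Decision rule as described in the summary comment above.
function shouldBatch(mode: BatchMode, pendingItems: number, threshold = 5): boolean {
  if (mode === 'deferred') return true    // always go through the batch API
  if (mode === 'immediate') return false  // never batch; each call resolves directly
  return pendingItems >= threshold        // 'auto': batch once enough items are pending
}

shouldBatch('auto', 10)    // true  -- 10 >= default threshold of 5
shouldBatch('auto', 3)     // false -- below threshold, resolved immediately
shouldBatch('deferred', 1) // true
```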