ai-functions 2.1.1 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +1 -4
- package/CHANGELOG.md +68 -1
- package/README.md +397 -157
- package/dist/ai-promise.d.ts +50 -3
- package/dist/ai-promise.d.ts.map +1 -1
- package/dist/ai-promise.js +410 -51
- package/dist/ai-promise.js.map +1 -1
- package/dist/ai-schemas.d.ts +56 -0
- package/dist/ai-schemas.d.ts.map +1 -0
- package/dist/ai-schemas.js +53 -0
- package/dist/ai-schemas.js.map +1 -0
- package/dist/ai.d.ts +16 -242
- package/dist/ai.d.ts.map +1 -1
- package/dist/ai.js +54 -837
- package/dist/ai.js.map +1 -1
- package/dist/batch/anthropic.d.ts +6 -4
- package/dist/batch/anthropic.d.ts.map +1 -1
- package/dist/batch/anthropic.js +83 -145
- package/dist/batch/anthropic.js.map +1 -1
- package/dist/batch/bedrock.d.ts +8 -30
- package/dist/batch/bedrock.d.ts.map +1 -1
- package/dist/batch/bedrock.js +155 -338
- package/dist/batch/bedrock.js.map +1 -1
- package/dist/batch/cloudflare.d.ts +8 -20
- package/dist/batch/cloudflare.d.ts.map +1 -1
- package/dist/batch/cloudflare.js +68 -189
- package/dist/batch/cloudflare.js.map +1 -1
- package/dist/batch/google.d.ts +6 -20
- package/dist/batch/google.d.ts.map +1 -1
- package/dist/batch/google.js +70 -238
- package/dist/batch/google.js.map +1 -1
- package/dist/batch/index.d.ts +4 -1
- package/dist/batch/index.d.ts.map +1 -1
- package/dist/batch/index.js +4 -1
- package/dist/batch/index.js.map +1 -1
- package/dist/batch/memory.d.ts +1 -1
- package/dist/batch/memory.d.ts.map +1 -1
- package/dist/batch/memory.js +14 -10
- package/dist/batch/memory.js.map +1 -1
- package/dist/batch/openai.d.ts +11 -14
- package/dist/batch/openai.d.ts.map +1 -1
- package/dist/batch/openai.js +52 -156
- package/dist/batch/openai.js.map +1 -1
- package/dist/batch/provider.d.ts +111 -0
- package/dist/batch/provider.d.ts.map +1 -0
- package/dist/batch/provider.js +233 -0
- package/dist/batch/provider.js.map +1 -0
- package/dist/batch-map.d.ts.map +1 -1
- package/dist/batch-map.js +23 -17
- package/dist/batch-map.js.map +1 -1
- package/dist/batch-queue.d.ts +65 -0
- package/dist/batch-queue.d.ts.map +1 -1
- package/dist/batch-queue.js +169 -14
- package/dist/batch-queue.js.map +1 -1
- package/dist/budget.d.ts +272 -0
- package/dist/budget.d.ts.map +1 -0
- package/dist/budget.js +513 -0
- package/dist/budget.js.map +1 -0
- package/dist/cache.d.ts +295 -0
- package/dist/cache.d.ts.map +1 -0
- package/dist/cache.js +433 -0
- package/dist/cache.js.map +1 -0
- package/dist/context.d.ts +42 -8
- package/dist/context.d.ts.map +1 -1
- package/dist/context.js +64 -62
- package/dist/context.js.map +1 -1
- package/dist/digital-objects-registry.d.ts +229 -0
- package/dist/digital-objects-registry.d.ts.map +1 -0
- package/dist/digital-objects-registry.js +617 -0
- package/dist/digital-objects-registry.js.map +1 -0
- package/dist/embeddings.d.ts +2 -2
- package/dist/embeddings.d.ts.map +1 -1
- package/dist/errors.d.ts +22 -0
- package/dist/errors.d.ts.map +1 -0
- package/dist/errors.js +35 -0
- package/dist/errors.js.map +1 -0
- package/dist/eval/runner.d.ts +10 -1
- package/dist/eval/runner.d.ts.map +1 -1
- package/dist/eval/runner.js +41 -35
- package/dist/eval/runner.js.map +1 -1
- package/dist/eval-log/in-memory.d.ts +34 -0
- package/dist/eval-log/in-memory.d.ts.map +1 -0
- package/dist/eval-log/in-memory.js +84 -0
- package/dist/eval-log/in-memory.js.map +1 -0
- package/dist/eval-log/index.d.ts +29 -0
- package/dist/eval-log/index.d.ts.map +1 -0
- package/dist/eval-log/index.js +39 -0
- package/dist/eval-log/index.js.map +1 -0
- package/dist/eval-log/types.d.ts +101 -0
- package/dist/eval-log/types.d.ts.map +1 -0
- package/dist/eval-log/types.js +16 -0
- package/dist/eval-log/types.js.map +1 -0
- package/dist/function-registry.d.ts +116 -0
- package/dist/function-registry.d.ts.map +1 -0
- package/dist/function-registry.js +546 -0
- package/dist/function-registry.js.map +1 -0
- package/dist/generate.d.ts +9 -3
- package/dist/generate.d.ts.map +1 -1
- package/dist/generate.js +18 -22
- package/dist/generate.js.map +1 -1
- package/dist/index.d.ts +35 -20
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +89 -42
- package/dist/index.js.map +1 -1
- package/dist/logger.d.ts +118 -0
- package/dist/logger.d.ts.map +1 -0
- package/dist/logger.js +187 -0
- package/dist/logger.js.map +1 -0
- package/dist/middleware/budget.d.ts +84 -0
- package/dist/middleware/budget.d.ts.map +1 -0
- package/dist/middleware/budget.js +110 -0
- package/dist/middleware/budget.js.map +1 -0
- package/dist/middleware/cache.d.ts +103 -0
- package/dist/middleware/cache.d.ts.map +1 -0
- package/dist/middleware/cache.js +228 -0
- package/dist/middleware/cache.js.map +1 -0
- package/dist/middleware/embed-cache.d.ts +99 -0
- package/dist/middleware/embed-cache.d.ts.map +1 -0
- package/dist/middleware/embed-cache.js +128 -0
- package/dist/middleware/embed-cache.js.map +1 -0
- package/dist/middleware/index.d.ts +11 -0
- package/dist/middleware/index.d.ts.map +1 -0
- package/dist/middleware/index.js +11 -0
- package/dist/middleware/index.js.map +1 -0
- package/dist/middleware/trace.d.ts +103 -0
- package/dist/middleware/trace.d.ts.map +1 -0
- package/dist/middleware/trace.js +176 -0
- package/dist/middleware/trace.js.map +1 -0
- package/dist/primitives.d.ts +120 -1
- package/dist/primitives.d.ts.map +1 -1
- package/dist/primitives.js +398 -26
- package/dist/primitives.js.map +1 -1
- package/dist/retry.d.ts +368 -0
- package/dist/retry.d.ts.map +1 -0
- package/dist/retry.js +646 -0
- package/dist/retry.js.map +1 -0
- package/dist/schema.d.ts.map +1 -1
- package/dist/schema.js +2 -10
- package/dist/schema.js.map +1 -1
- package/dist/telemetry.d.ts +128 -0
- package/dist/telemetry.d.ts.map +1 -0
- package/dist/telemetry.js +285 -0
- package/dist/telemetry.js.map +1 -0
- package/dist/template.d.ts.map +1 -1
- package/dist/template.js +6 -1
- package/dist/template.js.map +1 -1
- package/dist/tool-orchestration.d.ts +453 -0
- package/dist/tool-orchestration.d.ts.map +1 -0
- package/dist/tool-orchestration.js +763 -0
- package/dist/tool-orchestration.js.map +1 -0
- package/dist/type-guards.d.ts +28 -0
- package/dist/type-guards.d.ts.map +1 -0
- package/dist/type-guards.js +29 -0
- package/dist/type-guards.js.map +1 -0
- package/dist/types.d.ts +135 -17
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +36 -1
- package/dist/types.js.map +1 -1
- package/dist/wrap-for-v3.d.ts +80 -0
- package/dist/wrap-for-v3.d.ts.map +1 -0
- package/dist/wrap-for-v3.js +89 -0
- package/dist/wrap-for-v3.js.map +1 -0
- package/examples/00-quickstart.ts +232 -0
- package/examples/01-rag-chatbot.ts +212 -0
- package/examples/02-multi-agent-research.ts +290 -0
- package/examples/03-email-classification.ts +379 -0
- package/examples/04-content-moderation.ts +400 -0
- package/examples/05-document-extraction.ts +455 -0
- package/examples/06-streaming-chat-nextjs.ts +437 -0
- package/examples/07-cloudflare-worker.ts +483 -0
- package/examples/08-batch-processing.ts +491 -0
- package/examples/09-budget-constrained.ts +527 -0
- package/examples/10-tool-orchestration.ts +565 -0
- package/examples/11-retry-resilience.ts +403 -0
- package/examples/12-caching-strategies.ts +422 -0
- package/examples/README.md +145 -0
- package/package.json +10 -6
- package/src/ai-promise.ts +528 -99
- package/src/ai-schemas.ts +122 -0
- package/src/ai.ts +69 -1153
- package/src/batch/anthropic.ts +96 -161
- package/src/batch/bedrock.ts +203 -454
- package/src/batch/cloudflare.ts +99 -282
- package/src/batch/google.ts +91 -297
- package/src/batch/index.ts +4 -1
- package/src/batch/memory.ts +15 -10
- package/src/batch/openai.ts +65 -193
- package/src/batch/provider.ts +336 -0
- package/src/batch-map.ts +29 -24
- package/src/batch-queue.ts +200 -11
- package/src/budget.ts +740 -0
- package/src/cache.ts +681 -0
- package/src/context.ts +122 -76
- package/src/digital-objects-registry.ts +750 -0
- package/src/errors.ts +37 -0
- package/src/eval/runner.ts +63 -38
- package/src/eval-log/in-memory.ts +90 -0
- package/src/eval-log/index.ts +46 -0
- package/src/eval-log/types.ts +110 -0
- package/src/function-registry.ts +671 -0
- package/src/generate.ts +33 -33
- package/src/index.ts +325 -49
- package/src/logger.ts +232 -0
- package/src/middleware/budget.ts +171 -0
- package/src/middleware/cache.ts +299 -0
- package/src/middleware/embed-cache.ts +195 -0
- package/src/middleware/index.ts +23 -0
- package/src/middleware/trace.ts +248 -0
- package/src/primitives.ts +589 -62
- package/src/retry.ts +902 -0
- package/src/schema.ts +8 -17
- package/src/telemetry.ts +403 -0
- package/src/template.ts +8 -4
- package/src/tool-orchestration.ts +1173 -0
- package/src/type-guards.ts +31 -0
- package/src/types.ts +164 -25
- package/src/wrap-for-v3.ts +105 -0
- package/test/ai-promise.test.ts +1080 -0
- package/test/ai-proxy.test.ts +1 -1
- package/test/backward-compat.test.ts +147 -0
- package/test/batch-autosubmit-errors.test.ts +610 -0
- package/test/batch-blog-posts.test.ts +87 -129
- package/test/budget-tracking.test.ts +800 -0
- package/test/cache.test.ts +712 -0
- package/test/context-isolation.test.ts +687 -0
- package/test/core-functions.test.ts +183 -579
- package/test/decide.test.ts +154 -322
- package/test/define.test.ts +211 -8
- package/test/digital-objects-registry.test.ts +760 -0
- package/test/embedding-cache-middleware.test.ts +140 -0
- package/test/evals/deterministic.eval.test.ts +376 -0
- package/test/generate-core.test.ts +140 -229
- package/test/implicit-batch.test.ts +22 -65
- package/test/json-parse-error-handling.test.ts +463 -0
- package/test/retry-policy-integration.test.ts +117 -0
- package/test/retry.test.ts +1016 -0
- package/test/schema.test.ts +55 -19
- package/test/streaming.test.ts +316 -0
- package/test/template.test.ts +1164 -0
- package/test/tool-orchestration.test.ts +1040 -0
- package/test/wrap-for-v3.test.ts +612 -0
- package/vitest.config.js +6 -0
- package/vitest.config.ts +20 -0
- package/dist/rpc/auth.d.ts +0 -69
- package/dist/rpc/auth.d.ts.map +0 -1
- package/dist/rpc/auth.js +0 -136
- package/dist/rpc/auth.js.map +0 -1
- package/dist/rpc/client.d.ts +0 -62
- package/dist/rpc/client.d.ts.map +0 -1
- package/dist/rpc/client.js +0 -103
- package/dist/rpc/client.js.map +0 -1
- package/dist/rpc/deferred.d.ts +0 -60
- package/dist/rpc/deferred.d.ts.map +0 -1
- package/dist/rpc/deferred.js +0 -96
- package/dist/rpc/deferred.js.map +0 -1
- package/dist/rpc/index.d.ts +0 -22
- package/dist/rpc/index.d.ts.map +0 -1
- package/dist/rpc/index.js +0 -38
- package/dist/rpc/index.js.map +0 -1
- package/dist/rpc/local.d.ts +0 -42
- package/dist/rpc/local.d.ts.map +0 -1
- package/dist/rpc/local.js +0 -50
- package/dist/rpc/local.js.map +0 -1
- package/dist/rpc/server.d.ts +0 -165
- package/dist/rpc/server.d.ts.map +0 -1
- package/dist/rpc/server.js +0 -405
- package/dist/rpc/server.js.map +0 -1
- package/dist/rpc/session.d.ts +0 -32
- package/dist/rpc/session.d.ts.map +0 -1
- package/dist/rpc/session.js +0 -43
- package/dist/rpc/session.js.map +0 -1
- package/dist/rpc/transport.d.ts +0 -306
- package/dist/rpc/transport.d.ts.map +0 -1
- package/dist/rpc/transport.js +0 -731
- package/dist/rpc/transport.js.map +0 -1
- package/src/batch/anthropic.js +0 -256
- package/src/batch/bedrock.js +0 -584
- package/src/batch/cloudflare.js +0 -287
- package/src/batch/google.js +0 -359
- package/src/batch/index.js +0 -30
- package/src/batch/memory.js +0 -187
- package/src/batch/openai.js +0 -402
- package/src/eval/index.js +0 -7
- package/src/eval/models.js +0 -119
- package/src/eval/runner.js +0 -147
- package/test/schema.test.js +0 -96
package/src/errors.ts
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Error classes for AI primitives
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
/**
 * Signals that a function is part of the API surface but has no working
 * implementation yet.
 *
 * The message always names the function; a non-empty `details` string is
 * appended after a colon.
 *
 * @example
 * ```ts
 * throw new NotImplementedError('human', 'Human-in-the-loop functions require channel integrations')
 * ```
 */
export class NotImplementedError extends Error {
  /** Name of the function that has no implementation. */
  readonly functionName: string

  /** Optional explanation of why it is missing or what it still needs. */
  readonly details?: string

  /**
   * @param functionName - Name of the unimplemented function.
   * @param details - Optional extra context appended to the message.
   */
  constructor(functionName: string, details?: string) {
    // Start from the base sentence and extend it only for truthy details,
    // so an empty string does not leave a dangling ": " in the message.
    let text = `Function '${functionName}' is not implemented`
    if (details) {
      text += `: ${details}`
    }
    super(text)

    this.name = 'NotImplementedError'
    this.functionName = functionName
    // Assigned only when the caller actually passed a value (including '').
    if (details !== undefined) {
      this.details = details
    }

    // V8 engines only: anchor the stack trace at the throw site rather than
    // inside this constructor.
    if (Error.captureStackTrace) {
      Error.captureStackTrace(this, NotImplementedError)
    }
  }
}
|
package/src/eval/runner.ts
CHANGED
|
@@ -8,6 +8,17 @@
|
|
|
8
8
|
import { generateObject, generateText } from '../generate.js'
|
|
9
9
|
import { schema } from '../schema.js'
|
|
10
10
|
import { createModelVariants, getModelPricing, type EvalModel, type ModelTier } from './models.js'
|
|
11
|
+
import { getLogger } from '../logger.js'
|
|
12
|
+
|
|
13
|
+
/**
|
|
14
|
+
* Output function type for eval progress reporting
|
|
15
|
+
*/
|
|
16
|
+
export type EvalOutputFn = (message: string) => void
|
|
17
|
+
|
|
18
|
+
/**
|
|
19
|
+
* Default output function uses logger.info
|
|
20
|
+
*/
|
|
21
|
+
const defaultOutput: EvalOutputFn = (message: string) => getLogger().info(message)
|
|
11
22
|
|
|
12
23
|
export interface EvalCase<TInput = unknown, TExpected = unknown> {
|
|
13
24
|
name: string
|
|
@@ -25,7 +36,8 @@ export interface EvalScore {
|
|
|
25
36
|
export interface EvalResult<TOutput = unknown> {
|
|
26
37
|
model: EvalModel
|
|
27
38
|
case: EvalCase
|
|
28
|
-
output
|
|
39
|
+
/** The output from the task. Will be null if an error occurred. */
|
|
40
|
+
output: TOutput | null
|
|
29
41
|
scores: EvalScore[]
|
|
30
42
|
latencyMs: number
|
|
31
43
|
cost: number
|
|
@@ -48,12 +60,20 @@ export interface RunEvalOptions<TInput, TOutput, TExpected> {
|
|
|
48
60
|
scorers: Array<{
|
|
49
61
|
name: string
|
|
50
62
|
description?: string
|
|
51
|
-
scorer: (args: {
|
|
63
|
+
scorer: (args: {
|
|
64
|
+
input: TInput
|
|
65
|
+
output: TOutput
|
|
66
|
+
expected?: TExpected
|
|
67
|
+
}) => number | Promise<number>
|
|
52
68
|
}>
|
|
53
69
|
models?: EvalModel[]
|
|
54
70
|
tiers?: ModelTier[]
|
|
55
71
|
providers?: string[]
|
|
56
72
|
concurrency?: number
|
|
73
|
+
/** Custom output function for progress reporting (defaults to logger.info) */
|
|
74
|
+
output?: EvalOutputFn
|
|
75
|
+
/** Whether to suppress progress output (defaults to false) */
|
|
76
|
+
quiet?: boolean
|
|
57
77
|
}
|
|
58
78
|
|
|
59
79
|
/**
|
|
@@ -62,21 +82,22 @@ export interface RunEvalOptions<TInput, TOutput, TExpected> {
|
|
|
62
82
|
export async function runEval<TInput, TOutput, TExpected>(
|
|
63
83
|
options: RunEvalOptions<TInput, TOutput, TExpected>
|
|
64
84
|
): Promise<EvalSummary> {
|
|
65
|
-
const { name, cases, task, scorers, concurrency = 3 } = options
|
|
85
|
+
const { name, cases, task, scorers, concurrency = 3, quiet = false } = options
|
|
86
|
+
const log = quiet ? () => {} : options.output ?? defaultOutput
|
|
66
87
|
|
|
67
88
|
// Get models to test
|
|
68
|
-
const
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
89
|
+
const variantOptions: { tiers?: ModelTier[]; providers?: string[] } = {}
|
|
90
|
+
if (options.tiers !== undefined) variantOptions.tiers = options.tiers
|
|
91
|
+
if (options.providers !== undefined) variantOptions.providers = options.providers
|
|
92
|
+
const models = options.models ?? createModelVariants(variantOptions).map((v) => v.input)
|
|
72
93
|
|
|
73
94
|
const results: EvalResult<TOutput>[] = []
|
|
74
95
|
const startTime = Date.now()
|
|
75
96
|
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
97
|
+
log(`\nRunning eval: ${name}`)
|
|
98
|
+
log(` Models: ${models.map((m) => m.name).join(', ')}`)
|
|
99
|
+
log(` Cases: ${cases.length}`)
|
|
100
|
+
log('')
|
|
80
101
|
|
|
81
102
|
// Run all model/case combinations
|
|
82
103
|
const jobs: Array<{ model: EvalModel; case: EvalCase<TInput, TExpected> }> = []
|
|
@@ -96,7 +117,7 @@ export async function runEval<TInput, TOutput, TExpected>(
|
|
|
96
117
|
|
|
97
118
|
try {
|
|
98
119
|
// Run the task
|
|
99
|
-
const
|
|
120
|
+
const taskOutput = await task(job.case.input, job.model)
|
|
100
121
|
const latencyMs = Date.now() - caseStart
|
|
101
122
|
|
|
102
123
|
// Run scorers
|
|
@@ -105,19 +126,19 @@ export async function runEval<TInput, TOutput, TExpected>(
|
|
|
105
126
|
try {
|
|
106
127
|
const score = await s.scorer({
|
|
107
128
|
input: job.case.input,
|
|
108
|
-
output,
|
|
109
|
-
expected: job.case.expected,
|
|
129
|
+
output: taskOutput,
|
|
130
|
+
...(job.case.expected !== undefined && { expected: job.case.expected }),
|
|
110
131
|
})
|
|
111
132
|
scores.push({
|
|
112
133
|
name: s.name,
|
|
113
134
|
score: Math.max(0, Math.min(1, score)),
|
|
114
|
-
description: s.description,
|
|
135
|
+
...(s.description && { description: s.description }),
|
|
115
136
|
})
|
|
116
137
|
} catch (err) {
|
|
117
138
|
scores.push({
|
|
118
139
|
name: s.name,
|
|
119
140
|
score: 0,
|
|
120
|
-
description: s.description,
|
|
141
|
+
...(s.description && { description: s.description }),
|
|
121
142
|
metadata: { error: String(err) },
|
|
122
143
|
})
|
|
123
144
|
}
|
|
@@ -129,32 +150,37 @@ export async function runEval<TInput, TOutput, TExpected>(
|
|
|
129
150
|
const estimatedPromptTokens = 100
|
|
130
151
|
const estimatedCompletionTokens = 200
|
|
131
152
|
const cost = pricing
|
|
132
|
-
? (estimatedPromptTokens * pricing.prompt +
|
|
153
|
+
? (estimatedPromptTokens * pricing.prompt +
|
|
154
|
+
estimatedCompletionTokens * pricing.completion) /
|
|
155
|
+
1_000_000
|
|
133
156
|
: 0
|
|
134
157
|
|
|
135
|
-
const avgScore =
|
|
136
|
-
? scores.reduce((sum, s) => sum + s.score, 0) / scores.length
|
|
137
|
-
: 0
|
|
158
|
+
const avgScore =
|
|
159
|
+
scores.length > 0 ? scores.reduce((sum, s) => sum + s.score, 0) / scores.length : 0
|
|
138
160
|
|
|
139
|
-
const symbol = avgScore >= 0.8 ? '
|
|
140
|
-
|
|
161
|
+
const symbol = avgScore >= 0.8 ? 'PASS' : avgScore >= 0.5 ? 'WARN' : 'FAIL'
|
|
162
|
+
log(
|
|
163
|
+
` ${symbol} ${job.model.name} | ${job.case.name} | ${(avgScore * 100).toFixed(
|
|
164
|
+
0
|
|
165
|
+
)}% | ${latencyMs}ms`
|
|
166
|
+
)
|
|
141
167
|
|
|
142
168
|
return {
|
|
143
169
|
model: job.model,
|
|
144
170
|
case: job.case,
|
|
145
|
-
output,
|
|
171
|
+
output: taskOutput,
|
|
146
172
|
scores,
|
|
147
173
|
latencyMs,
|
|
148
174
|
cost,
|
|
149
175
|
}
|
|
150
176
|
} catch (err) {
|
|
151
|
-
|
|
177
|
+
log(` FAIL ${job.model.name} | ${job.case.name} | ERROR: ${err}`)
|
|
152
178
|
|
|
153
179
|
return {
|
|
154
180
|
model: job.model,
|
|
155
181
|
case: job.case,
|
|
156
|
-
output: null
|
|
157
|
-
scores: scorers.map(s => ({ name: s.name, score: 0 })),
|
|
182
|
+
output: null,
|
|
183
|
+
scores: scorers.map((s) => ({ name: s.name, score: 0 })),
|
|
158
184
|
latencyMs: Date.now() - caseStart,
|
|
159
185
|
cost: 0,
|
|
160
186
|
error: String(err),
|
|
@@ -169,10 +195,9 @@ export async function runEval<TInput, TOutput, TExpected>(
|
|
|
169
195
|
// Calculate summary
|
|
170
196
|
const totalTime = Date.now() - startTime
|
|
171
197
|
const totalCost = results.reduce((sum, r) => sum + r.cost, 0)
|
|
172
|
-
const allScores = results.flatMap(r => r.scores.map(s => s.score))
|
|
173
|
-
const avgScore =
|
|
174
|
-
? allScores.reduce((a, b) => a + b, 0) / allScores.length
|
|
175
|
-
: 0
|
|
198
|
+
const allScores = results.flatMap((r) => r.scores.map((s) => s.score))
|
|
199
|
+
const avgScore =
|
|
200
|
+
allScores.length > 0 ? allScores.reduce((a, b) => a + b, 0) / allScores.length : 0
|
|
176
201
|
|
|
177
202
|
// Group by model
|
|
178
203
|
const byModel: Record<string, { avgScore: number; count: number }> = {}
|
|
@@ -192,15 +217,15 @@ export async function runEval<TInput, TOutput, TExpected>(
|
|
|
192
217
|
}
|
|
193
218
|
}
|
|
194
219
|
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
220
|
+
log('')
|
|
221
|
+
log(`Results:`)
|
|
222
|
+
log(` Overall: ${(avgScore * 100).toFixed(1)}%`)
|
|
223
|
+
log(` Time: ${(totalTime / 1000).toFixed(1)}s`)
|
|
224
|
+
log(` Cost: $${totalCost.toFixed(4)}`)
|
|
225
|
+
log('')
|
|
226
|
+
log(' By Model:')
|
|
202
227
|
for (const [modelId, stats] of Object.entries(byModel)) {
|
|
203
|
-
|
|
228
|
+
log(` - ${modelId}: ${(stats.avgScore * 100).toFixed(1)}%`)
|
|
204
229
|
}
|
|
205
230
|
|
|
206
231
|
return {
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* InMemoryEvalLogStore — Map-backed default implementation of
|
|
3
|
+
* {@link EvalLogStore}.
|
|
4
|
+
*
|
|
5
|
+
* Matches Evalite v1's default backend: process-local Map keyed on `$id`,
|
|
6
|
+
* insertion-ordered for "most recent first" listing without sorting. Suitable
|
|
7
|
+
* for single-process tests, evals, and the cascade walker's in-flight log;
|
|
8
|
+
* not suitable for cross-process or multi-worker setups (use a disk/SQLite
|
|
9
|
+
* backend for those — same contract).
|
|
10
|
+
*
|
|
11
|
+
* @packageDocumentation
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { randomUUID } from 'crypto'
|
|
15
|
+
import type { EvalLogEntry, EvalLogListOptions, EvalLogStore } from './types.js'
|
|
16
|
+
|
|
17
|
+
/**
 * In-memory implementation of {@link EvalLogStore}.
 *
 * Entries are held in a process-local `Map` keyed on `$id`. Listing walks the
 * map in reverse insertion order, so the most recently inserted entry comes
 * first without any sorting.
 */
export class InMemoryEvalLogStore implements EvalLogStore {
  /**
   * Backing storage keyed on `$id`. A JS `Map` preserves insertion order,
   * which `list()` relies on for "most recent first".
   */
  private readonly entries: Map<string, EvalLogEntry> = new Map()

  /**
   * Store an entry and return the stored object.
   *
   * `$id` defaults to a random UUID and `createdAt` to `Date.now()` when the
   * caller omits them. `traceId` and `tags` are copied only when defined, so
   * the stored object never carries explicit `undefined` properties.
   *
   * Recording with an `$id` that already exists overwrites that entry;
   * `Map#set` keeps the key at its original insertion position.
   */
  async record(
    entry: Omit<EvalLogEntry, '$id' | 'createdAt'> &
      Partial<Pick<EvalLogEntry, '$id' | 'createdAt'>>
  ): Promise<EvalLogEntry> {
    const $id = entry.$id ?? randomUUID()
    const createdAt = entry.createdAt ?? Date.now()
    const stored: EvalLogEntry = {
      $id,
      createdAt,
      model: entry.model,
      prompt: entry.prompt,
      response: entry.response,
      usage: entry.usage,
      costUsd: entry.costUsd,
      durationMs: entry.durationMs,
      ...(entry.traceId !== undefined ? { traceId: entry.traceId } : {}),
      ...(entry.tags !== undefined ? { tags: entry.tags } : {}),
    }
    this.entries.set($id, stored)
    return stored
  }

  /** Look up an entry by `$id`; resolves to `undefined` when absent. */
  async get(id: string): Promise<EvalLogEntry | undefined> {
    return this.entries.get(id)
  }

  /**
   * List entries, most recently inserted first.
   *
   * All supplied filters are AND-combined: `traceId` and `model` must match
   * exactly, and every key in `tags` must be present on the entry with the
   * same value. `limit` caps the number of returned entries; a limit of zero
   * or less yields an empty list.
   */
  async list(options: EvalLogListOptions = {}): Promise<EvalLogEntry[]> {
    const { traceId, model, tags, limit } = options

    // The in-loop limit check runs only after an entry has been pushed, so a
    // non-positive limit must be rejected up front or it would return one
    // entry instead of none.
    if (limit !== undefined && limit <= 0) return []

    const out: EvalLogEntry[] = []
    // Map iteration follows insertion order; reverse a snapshot to get
    // most-recent-first.
    const all = Array.from(this.entries.values()).reverse()
    for (const entry of all) {
      if (traceId !== undefined && entry.traceId !== traceId) continue
      if (model !== undefined && entry.model !== model) continue
      if (tags !== undefined) {
        // The entry's tags must be a superset of the requested tags.
        const matchesAll = Object.keys(tags).every((k) => entry.tags?.[k] === tags[k])
        if (!matchesAll) continue
      }
      out.push(entry)
      if (limit !== undefined && out.length >= limit) break
    }
    return out
  }

  /** Remove an entry by `$id`; resolves to `true` only if one was removed. */
  async delete(id: string): Promise<boolean> {
    return this.entries.delete(id)
  }

  /**
   * Convenience for tests: drop every entry. Not part of the
   * {@link EvalLogStore} interface.
   */
  clear(): void {
    this.entries.clear()
  }
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* EvalLogStore — pluggable persistence primitive for trace/eval entries.
|
|
3
|
+
*
|
|
4
|
+
* Exports the {@link EvalLogStore} contract, the
|
|
5
|
+
* {@link InMemoryEvalLogStore} default implementation, and a global
|
|
6
|
+
* accessor pair (`getEvalLogStore` / `configureEvalLogStore`) mirroring the
|
|
7
|
+
* marketplace persistence pattern from round 9.
|
|
8
|
+
*
|
|
9
|
+
* @packageDocumentation
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import { InMemoryEvalLogStore } from './in-memory.js'
|
|
13
|
+
import type { EvalLogStore } from './types.js'
|
|
14
|
+
|
|
15
|
+
export type { EvalLogEntry, EvalLogListOptions, EvalLogStore } from './types.js'
|
|
16
|
+
export { InMemoryEvalLogStore } from './in-memory.js'
|
|
17
|
+
|
|
18
|
+
// ============================================================================
|
|
19
|
+
// Global accessor (lazy default + override)
|
|
20
|
+
// ============================================================================
|
|
21
|
+
|
|
22
|
+
/** Currently installed global store; stays `null` until configured or first requested. */
let _store: EvalLogStore | null = null

/**
 * Return the global {@link EvalLogStore}.
 *
 * When no store has been installed, an {@link InMemoryEvalLogStore} is
 * created on first call and reused on later calls. Callers that need
 * isolation (tests, multi-tenant apps) can install their own store via
 * {@link configureEvalLogStore}.
 */
export function getEvalLogStore(): EvalLogStore {
  // Reuse the installed store if there is one; otherwise build the default.
  const active = _store ?? new InMemoryEvalLogStore()
  _store = active
  return active
}
|
|
39
|
+
|
|
40
|
+
/**
 * Replace the global {@link EvalLogStore}.
 *
 * @param store - The store to install, or `null` to clear the current one so
 *   the next `getEvalLogStore()` call lazily creates a fresh in-memory
 *   default (handy in test teardown).
 */
export function configureEvalLogStore(store: EvalLogStore | null): void {
  _store = store
}
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* EvalLogStore — pluggable persistence primitive for trace/eval entries.
|
|
3
|
+
*
|
|
4
|
+
* Forward-looking primitive matching Evalite v1's EvalLogStore pattern:
|
|
5
|
+
* the in-memory default ships today; the disk/SQLite/durable backends can
|
|
6
|
+
* land later without breaking the trace middleware contract.
|
|
7
|
+
*
|
|
8
|
+
* Used downstream by `traceMiddleware` (in `../middleware/trace.ts`) as the
|
|
9
|
+
* sink for per-call prompt+response+usage records. The cascade-walker in
|
|
10
|
+
* services-as-software will consume `list()` / `get()` to populate the
|
|
11
|
+
* InvocationEvent stream once round 16+ adds the `'persona-trace'` variant.
|
|
12
|
+
*
|
|
13
|
+
* @packageDocumentation
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
// ============================================================================
|
|
17
|
+
// Types
|
|
18
|
+
// ============================================================================
|
|
19
|
+
|
|
20
|
+
/**
 * One record in the eval log: a single LLM call with its full payload.
 *
 * The shape mirrors what `traceMiddleware` emits, plus optional `tags` for
 * caller-supplied dimensions (persona name, evaluator role, cascade depth).
 */
export interface EvalLogEntry {
  /** MDXLD identity; typically a UUID generated at insert time. */
  $id: string
  /**
   * Optional caller-supplied correlation ID. Entries recorded for several
   * LLM calls under one user request share the same `traceId`, so
   * `list({ traceId })` returns them together.
   */
  traceId?: string
  /** Model identifier (e.g. `'anthropic/claude-sonnet-4.5'` or `'sonnet'`). */
  model: string
  /**
   * The prompt as submitted to the model, stringified. The structured
   * `LanguageModelV3Prompt` shape is not stored: it is bulky, and downstream
   * consumers (replay, fixture diff) only need the text payload.
   */
  prompt: string
  /** Text response from the model. Tool calls and files are not stored here. */
  response: string
  /** Token usage as reported by the AI SDK. */
  usage: { inputTokens: number; outputTokens: number }
  /** Computed cost in USD (caller-supplied via the `pricing` overlay). */
  costUsd: number
  /** Wall-clock duration of the underlying `doGenerate` / `doStream` call. */
  durationMs: number
  /** Caller-supplied dimensions (persona, evaluator role, cascade step). */
  tags?: Record<string, string>
  /** Insert timestamp in epoch milliseconds. */
  createdAt: number
}
|
|
59
|
+
|
|
60
|
+
/**
 * Filter options for `EvalLogStore.list`. Every supplied field must match
 * (AND semantics).
 */
export interface EvalLogListOptions {
  /** Keep only entries with this trace correlation ID. */
  traceId?: string
  /** Keep only entries recorded for this model. */
  model?: string
  /**
   * Keep only entries whose `tags` are a *superset* of this object.
   * For example `{ persona: 'cfo' }` matches an entry tagged
   * `{ persona: 'cfo', step: '3' }` but not one tagged `{ persona: 'cto' }`.
   */
  tags?: Record<string, string>
  /** Upper bound on the number of entries returned (most recent first). */
  limit?: number
}
|
|
78
|
+
|
|
79
|
+
/**
 * Contract for pluggable eval-log persistence.
 *
 * Modeled after the Evalite v1 EvalLogStore contract: an in-memory default,
 * with disk JSON / SQLite / durable backends supplied via
 * `configureEvalLogStore`.
 *
 * Every method is async so the contract stays uniform across backends, even
 * though the in-memory implementation does its work synchronously.
 */
export interface EvalLogStore {
  /**
   * Persist a new entry and return it as stored, with `$id` and `createdAt`
   * filled in when the caller left them out.
   */
  record(
    entry: Omit<EvalLogEntry, '$id' | 'createdAt'> &
      Partial<Pick<EvalLogEntry, '$id' | 'createdAt'>>
  ): Promise<EvalLogEntry>
  /** Fetch an entry by `$id`; resolves to `undefined` when it does not exist. */
  get(id: string): Promise<EvalLogEntry | undefined>
  /** Return the entries matching the supplied filter, most recent first. */
  list(options?: EvalLogListOptions): Promise<EvalLogEntry[]>
  /** Delete an entry; resolves to `true` if an entry was actually removed. */
  delete(id: string): Promise<boolean>
}
|