ai-functions 2.1.1 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (286)
  1. package/.turbo/turbo-build.log +1 -4
  2. package/CHANGELOG.md +68 -1
  3. package/README.md +397 -157
  4. package/dist/ai-promise.d.ts +50 -3
  5. package/dist/ai-promise.d.ts.map +1 -1
  6. package/dist/ai-promise.js +410 -51
  7. package/dist/ai-promise.js.map +1 -1
  8. package/dist/ai-schemas.d.ts +56 -0
  9. package/dist/ai-schemas.d.ts.map +1 -0
  10. package/dist/ai-schemas.js +53 -0
  11. package/dist/ai-schemas.js.map +1 -0
  12. package/dist/ai.d.ts +16 -242
  13. package/dist/ai.d.ts.map +1 -1
  14. package/dist/ai.js +54 -837
  15. package/dist/ai.js.map +1 -1
  16. package/dist/batch/anthropic.d.ts +6 -4
  17. package/dist/batch/anthropic.d.ts.map +1 -1
  18. package/dist/batch/anthropic.js +83 -145
  19. package/dist/batch/anthropic.js.map +1 -1
  20. package/dist/batch/bedrock.d.ts +8 -30
  21. package/dist/batch/bedrock.d.ts.map +1 -1
  22. package/dist/batch/bedrock.js +155 -338
  23. package/dist/batch/bedrock.js.map +1 -1
  24. package/dist/batch/cloudflare.d.ts +8 -20
  25. package/dist/batch/cloudflare.d.ts.map +1 -1
  26. package/dist/batch/cloudflare.js +68 -189
  27. package/dist/batch/cloudflare.js.map +1 -1
  28. package/dist/batch/google.d.ts +6 -20
  29. package/dist/batch/google.d.ts.map +1 -1
  30. package/dist/batch/google.js +70 -238
  31. package/dist/batch/google.js.map +1 -1
  32. package/dist/batch/index.d.ts +4 -1
  33. package/dist/batch/index.d.ts.map +1 -1
  34. package/dist/batch/index.js +4 -1
  35. package/dist/batch/index.js.map +1 -1
  36. package/dist/batch/memory.d.ts +1 -1
  37. package/dist/batch/memory.d.ts.map +1 -1
  38. package/dist/batch/memory.js +14 -10
  39. package/dist/batch/memory.js.map +1 -1
  40. package/dist/batch/openai.d.ts +11 -14
  41. package/dist/batch/openai.d.ts.map +1 -1
  42. package/dist/batch/openai.js +52 -156
  43. package/dist/batch/openai.js.map +1 -1
  44. package/dist/batch/provider.d.ts +111 -0
  45. package/dist/batch/provider.d.ts.map +1 -0
  46. package/dist/batch/provider.js +233 -0
  47. package/dist/batch/provider.js.map +1 -0
  48. package/dist/batch-map.d.ts.map +1 -1
  49. package/dist/batch-map.js +23 -17
  50. package/dist/batch-map.js.map +1 -1
  51. package/dist/batch-queue.d.ts +65 -0
  52. package/dist/batch-queue.d.ts.map +1 -1
  53. package/dist/batch-queue.js +169 -14
  54. package/dist/batch-queue.js.map +1 -1
  55. package/dist/budget.d.ts +272 -0
  56. package/dist/budget.d.ts.map +1 -0
  57. package/dist/budget.js +513 -0
  58. package/dist/budget.js.map +1 -0
  59. package/dist/cache.d.ts +295 -0
  60. package/dist/cache.d.ts.map +1 -0
  61. package/dist/cache.js +433 -0
  62. package/dist/cache.js.map +1 -0
  63. package/dist/context.d.ts +42 -8
  64. package/dist/context.d.ts.map +1 -1
  65. package/dist/context.js +64 -62
  66. package/dist/context.js.map +1 -1
  67. package/dist/digital-objects-registry.d.ts +229 -0
  68. package/dist/digital-objects-registry.d.ts.map +1 -0
  69. package/dist/digital-objects-registry.js +617 -0
  70. package/dist/digital-objects-registry.js.map +1 -0
  71. package/dist/embeddings.d.ts +2 -2
  72. package/dist/embeddings.d.ts.map +1 -1
  73. package/dist/errors.d.ts +22 -0
  74. package/dist/errors.d.ts.map +1 -0
  75. package/dist/errors.js +35 -0
  76. package/dist/errors.js.map +1 -0
  77. package/dist/eval/runner.d.ts +10 -1
  78. package/dist/eval/runner.d.ts.map +1 -1
  79. package/dist/eval/runner.js +41 -35
  80. package/dist/eval/runner.js.map +1 -1
  81. package/dist/eval-log/in-memory.d.ts +34 -0
  82. package/dist/eval-log/in-memory.d.ts.map +1 -0
  83. package/dist/eval-log/in-memory.js +84 -0
  84. package/dist/eval-log/in-memory.js.map +1 -0
  85. package/dist/eval-log/index.d.ts +29 -0
  86. package/dist/eval-log/index.d.ts.map +1 -0
  87. package/dist/eval-log/index.js +39 -0
  88. package/dist/eval-log/index.js.map +1 -0
  89. package/dist/eval-log/types.d.ts +101 -0
  90. package/dist/eval-log/types.d.ts.map +1 -0
  91. package/dist/eval-log/types.js +16 -0
  92. package/dist/eval-log/types.js.map +1 -0
  93. package/dist/function-registry.d.ts +116 -0
  94. package/dist/function-registry.d.ts.map +1 -0
  95. package/dist/function-registry.js +546 -0
  96. package/dist/function-registry.js.map +1 -0
  97. package/dist/generate.d.ts +9 -3
  98. package/dist/generate.d.ts.map +1 -1
  99. package/dist/generate.js +18 -22
  100. package/dist/generate.js.map +1 -1
  101. package/dist/index.d.ts +35 -20
  102. package/dist/index.d.ts.map +1 -1
  103. package/dist/index.js +89 -42
  104. package/dist/index.js.map +1 -1
  105. package/dist/logger.d.ts +118 -0
  106. package/dist/logger.d.ts.map +1 -0
  107. package/dist/logger.js +187 -0
  108. package/dist/logger.js.map +1 -0
  109. package/dist/middleware/budget.d.ts +84 -0
  110. package/dist/middleware/budget.d.ts.map +1 -0
  111. package/dist/middleware/budget.js +110 -0
  112. package/dist/middleware/budget.js.map +1 -0
  113. package/dist/middleware/cache.d.ts +103 -0
  114. package/dist/middleware/cache.d.ts.map +1 -0
  115. package/dist/middleware/cache.js +228 -0
  116. package/dist/middleware/cache.js.map +1 -0
  117. package/dist/middleware/embed-cache.d.ts +99 -0
  118. package/dist/middleware/embed-cache.d.ts.map +1 -0
  119. package/dist/middleware/embed-cache.js +128 -0
  120. package/dist/middleware/embed-cache.js.map +1 -0
  121. package/dist/middleware/index.d.ts +11 -0
  122. package/dist/middleware/index.d.ts.map +1 -0
  123. package/dist/middleware/index.js +11 -0
  124. package/dist/middleware/index.js.map +1 -0
  125. package/dist/middleware/trace.d.ts +103 -0
  126. package/dist/middleware/trace.d.ts.map +1 -0
  127. package/dist/middleware/trace.js +176 -0
  128. package/dist/middleware/trace.js.map +1 -0
  129. package/dist/primitives.d.ts +120 -1
  130. package/dist/primitives.d.ts.map +1 -1
  131. package/dist/primitives.js +398 -26
  132. package/dist/primitives.js.map +1 -1
  133. package/dist/retry.d.ts +368 -0
  134. package/dist/retry.d.ts.map +1 -0
  135. package/dist/retry.js +646 -0
  136. package/dist/retry.js.map +1 -0
  137. package/dist/schema.d.ts.map +1 -1
  138. package/dist/schema.js +2 -10
  139. package/dist/schema.js.map +1 -1
  140. package/dist/telemetry.d.ts +128 -0
  141. package/dist/telemetry.d.ts.map +1 -0
  142. package/dist/telemetry.js +285 -0
  143. package/dist/telemetry.js.map +1 -0
  144. package/dist/template.d.ts.map +1 -1
  145. package/dist/template.js +6 -1
  146. package/dist/template.js.map +1 -1
  147. package/dist/tool-orchestration.d.ts +453 -0
  148. package/dist/tool-orchestration.d.ts.map +1 -0
  149. package/dist/tool-orchestration.js +763 -0
  150. package/dist/tool-orchestration.js.map +1 -0
  151. package/dist/type-guards.d.ts +28 -0
  152. package/dist/type-guards.d.ts.map +1 -0
  153. package/dist/type-guards.js +29 -0
  154. package/dist/type-guards.js.map +1 -0
  155. package/dist/types.d.ts +135 -17
  156. package/dist/types.d.ts.map +1 -1
  157. package/dist/types.js +36 -1
  158. package/dist/types.js.map +1 -1
  159. package/dist/wrap-for-v3.d.ts +80 -0
  160. package/dist/wrap-for-v3.d.ts.map +1 -0
  161. package/dist/wrap-for-v3.js +89 -0
  162. package/dist/wrap-for-v3.js.map +1 -0
  163. package/examples/00-quickstart.ts +232 -0
  164. package/examples/01-rag-chatbot.ts +212 -0
  165. package/examples/02-multi-agent-research.ts +290 -0
  166. package/examples/03-email-classification.ts +379 -0
  167. package/examples/04-content-moderation.ts +400 -0
  168. package/examples/05-document-extraction.ts +455 -0
  169. package/examples/06-streaming-chat-nextjs.ts +437 -0
  170. package/examples/07-cloudflare-worker.ts +483 -0
  171. package/examples/08-batch-processing.ts +491 -0
  172. package/examples/09-budget-constrained.ts +527 -0
  173. package/examples/10-tool-orchestration.ts +565 -0
  174. package/examples/11-retry-resilience.ts +403 -0
  175. package/examples/12-caching-strategies.ts +422 -0
  176. package/examples/README.md +145 -0
  177. package/package.json +10 -6
  178. package/src/ai-promise.ts +528 -99
  179. package/src/ai-schemas.ts +122 -0
  180. package/src/ai.ts +69 -1153
  181. package/src/batch/anthropic.ts +96 -161
  182. package/src/batch/bedrock.ts +203 -454
  183. package/src/batch/cloudflare.ts +99 -282
  184. package/src/batch/google.ts +91 -297
  185. package/src/batch/index.ts +4 -1
  186. package/src/batch/memory.ts +15 -10
  187. package/src/batch/openai.ts +65 -193
  188. package/src/batch/provider.ts +336 -0
  189. package/src/batch-map.ts +29 -24
  190. package/src/batch-queue.ts +200 -11
  191. package/src/budget.ts +740 -0
  192. package/src/cache.ts +681 -0
  193. package/src/context.ts +122 -76
  194. package/src/digital-objects-registry.ts +750 -0
  195. package/src/errors.ts +37 -0
  196. package/src/eval/runner.ts +63 -38
  197. package/src/eval-log/in-memory.ts +90 -0
  198. package/src/eval-log/index.ts +46 -0
  199. package/src/eval-log/types.ts +110 -0
  200. package/src/function-registry.ts +671 -0
  201. package/src/generate.ts +33 -33
  202. package/src/index.ts +325 -49
  203. package/src/logger.ts +232 -0
  204. package/src/middleware/budget.ts +171 -0
  205. package/src/middleware/cache.ts +299 -0
  206. package/src/middleware/embed-cache.ts +195 -0
  207. package/src/middleware/index.ts +23 -0
  208. package/src/middleware/trace.ts +248 -0
  209. package/src/primitives.ts +589 -62
  210. package/src/retry.ts +902 -0
  211. package/src/schema.ts +8 -17
  212. package/src/telemetry.ts +403 -0
  213. package/src/template.ts +8 -4
  214. package/src/tool-orchestration.ts +1173 -0
  215. package/src/type-guards.ts +31 -0
  216. package/src/types.ts +164 -25
  217. package/src/wrap-for-v3.ts +105 -0
  218. package/test/ai-promise.test.ts +1080 -0
  219. package/test/ai-proxy.test.ts +1 -1
  220. package/test/backward-compat.test.ts +147 -0
  221. package/test/batch-autosubmit-errors.test.ts +610 -0
  222. package/test/batch-blog-posts.test.ts +87 -129
  223. package/test/budget-tracking.test.ts +800 -0
  224. package/test/cache.test.ts +712 -0
  225. package/test/context-isolation.test.ts +687 -0
  226. package/test/core-functions.test.ts +183 -579
  227. package/test/decide.test.ts +154 -322
  228. package/test/define.test.ts +211 -8
  229. package/test/digital-objects-registry.test.ts +760 -0
  230. package/test/embedding-cache-middleware.test.ts +140 -0
  231. package/test/evals/deterministic.eval.test.ts +376 -0
  232. package/test/generate-core.test.ts +140 -229
  233. package/test/implicit-batch.test.ts +22 -65
  234. package/test/json-parse-error-handling.test.ts +463 -0
  235. package/test/retry-policy-integration.test.ts +117 -0
  236. package/test/retry.test.ts +1016 -0
  237. package/test/schema.test.ts +55 -19
  238. package/test/streaming.test.ts +316 -0
  239. package/test/template.test.ts +1164 -0
  240. package/test/tool-orchestration.test.ts +1040 -0
  241. package/test/wrap-for-v3.test.ts +612 -0
  242. package/vitest.config.js +6 -0
  243. package/vitest.config.ts +20 -0
  244. package/dist/rpc/auth.d.ts +0 -69
  245. package/dist/rpc/auth.d.ts.map +0 -1
  246. package/dist/rpc/auth.js +0 -136
  247. package/dist/rpc/auth.js.map +0 -1
  248. package/dist/rpc/client.d.ts +0 -62
  249. package/dist/rpc/client.d.ts.map +0 -1
  250. package/dist/rpc/client.js +0 -103
  251. package/dist/rpc/client.js.map +0 -1
  252. package/dist/rpc/deferred.d.ts +0 -60
  253. package/dist/rpc/deferred.d.ts.map +0 -1
  254. package/dist/rpc/deferred.js +0 -96
  255. package/dist/rpc/deferred.js.map +0 -1
  256. package/dist/rpc/index.d.ts +0 -22
  257. package/dist/rpc/index.d.ts.map +0 -1
  258. package/dist/rpc/index.js +0 -38
  259. package/dist/rpc/index.js.map +0 -1
  260. package/dist/rpc/local.d.ts +0 -42
  261. package/dist/rpc/local.d.ts.map +0 -1
  262. package/dist/rpc/local.js +0 -50
  263. package/dist/rpc/local.js.map +0 -1
  264. package/dist/rpc/server.d.ts +0 -165
  265. package/dist/rpc/server.d.ts.map +0 -1
  266. package/dist/rpc/server.js +0 -405
  267. package/dist/rpc/server.js.map +0 -1
  268. package/dist/rpc/session.d.ts +0 -32
  269. package/dist/rpc/session.d.ts.map +0 -1
  270. package/dist/rpc/session.js +0 -43
  271. package/dist/rpc/session.js.map +0 -1
  272. package/dist/rpc/transport.d.ts +0 -306
  273. package/dist/rpc/transport.d.ts.map +0 -1
  274. package/dist/rpc/transport.js +0 -731
  275. package/dist/rpc/transport.js.map +0 -1
  276. package/src/batch/anthropic.js +0 -256
  277. package/src/batch/bedrock.js +0 -584
  278. package/src/batch/cloudflare.js +0 -287
  279. package/src/batch/google.js +0 -359
  280. package/src/batch/index.js +0 -30
  281. package/src/batch/memory.js +0 -187
  282. package/src/batch/openai.js +0 -402
  283. package/src/eval/index.js +0 -7
  284. package/src/eval/models.js +0 -119
  285. package/src/eval/runner.js +0 -147
  286. package/test/schema.test.js +0 -96
package/src/errors.ts ADDED
@@ -0,0 +1,37 @@
1
+ /**
2
+ * Error classes for AI primitives
3
+ */
4
+
5
/**
 * Error thrown when a function is not yet implemented.
 *
 * Signals at runtime that a function exists in the API surface but has no
 * working implementation behind it yet.
 *
 * @example
 * ```ts
 * throw new NotImplementedError('human', 'Human-in-the-loop functions require channel integrations')
 * ```
 */
export class NotImplementedError extends Error {
  /** The name of the function that is not implemented */
  readonly functionName: string

  /** Additional details about why it's not implemented or what's needed */
  readonly details?: string

  /**
   * @param functionName - Name of the unimplemented function; embedded in the message.
   * @param details - Optional explanation. When truthy it is appended to the
   *   message after a colon; when not `undefined` it is stored on `details`.
   */
  constructor(functionName: string, details?: string) {
    const message = details
      ? `Function '${functionName}' is not implemented: ${details}`
      : `Function '${functionName}' is not implemented`
    super(message)

    // Restore the prototype chain. When a build downlevels classes to ES5,
    // `super(...)` on a built-in yields a plain Error and
    // `instanceof NotImplementedError` would be false without this.
    Object.setPrototypeOf(this, new.target.prototype)

    this.name = 'NotImplementedError'
    this.functionName = functionName
    if (details !== undefined) this.details = details

    // Maintain proper stack trace for where the error was thrown (V8 engines)
    if (Error.captureStackTrace) {
      Error.captureStackTrace(this, NotImplementedError)
    }
  }
}
package/src/eval/runner.ts CHANGED
@@ -8,6 +8,17 @@
8
8
  import { generateObject, generateText } from '../generate.js'
9
9
  import { schema } from '../schema.js'
10
10
  import { createModelVariants, getModelPricing, type EvalModel, type ModelTier } from './models.js'
11
+ import { getLogger } from '../logger.js'
12
+
13
+ /**
14
+ * Output function type for eval progress reporting
15
+ */
16
+ export type EvalOutputFn = (message: string) => void
17
+
18
+ /**
19
+ * Default output function uses logger.info
20
+ */
21
+ const defaultOutput: EvalOutputFn = (message: string) => getLogger().info(message)
11
22
 
12
23
  export interface EvalCase<TInput = unknown, TExpected = unknown> {
13
24
  name: string
@@ -25,7 +36,8 @@ export interface EvalScore {
25
36
  export interface EvalResult<TOutput = unknown> {
26
37
  model: EvalModel
27
38
  case: EvalCase
28
- output: TOutput
39
+ /** The output from the task. Will be null if an error occurred. */
40
+ output: TOutput | null
29
41
  scores: EvalScore[]
30
42
  latencyMs: number
31
43
  cost: number
@@ -48,12 +60,20 @@ export interface RunEvalOptions<TInput, TOutput, TExpected> {
48
60
  scorers: Array<{
49
61
  name: string
50
62
  description?: string
51
- scorer: (args: { input: TInput; output: TOutput; expected?: TExpected }) => number | Promise<number>
63
+ scorer: (args: {
64
+ input: TInput
65
+ output: TOutput
66
+ expected?: TExpected
67
+ }) => number | Promise<number>
52
68
  }>
53
69
  models?: EvalModel[]
54
70
  tiers?: ModelTier[]
55
71
  providers?: string[]
56
72
  concurrency?: number
73
+ /** Custom output function for progress reporting (defaults to logger.info) */
74
+ output?: EvalOutputFn
75
+ /** Whether to suppress progress output (defaults to false) */
76
+ quiet?: boolean
57
77
  }
58
78
 
59
79
  /**
@@ -62,21 +82,22 @@ export interface RunEvalOptions<TInput, TOutput, TExpected> {
62
82
  export async function runEval<TInput, TOutput, TExpected>(
63
83
  options: RunEvalOptions<TInput, TOutput, TExpected>
64
84
  ): Promise<EvalSummary> {
65
- const { name, cases, task, scorers, concurrency = 3 } = options
85
+ const { name, cases, task, scorers, concurrency = 3, quiet = false } = options
86
+ const log = quiet ? () => {} : options.output ?? defaultOutput
66
87
 
67
88
  // Get models to test
68
- const models = options.models ?? createModelVariants({
69
- tiers: options.tiers,
70
- providers: options.providers,
71
- }).map(v => v.input)
89
+ const variantOptions: { tiers?: ModelTier[]; providers?: string[] } = {}
90
+ if (options.tiers !== undefined) variantOptions.tiers = options.tiers
91
+ if (options.providers !== undefined) variantOptions.providers = options.providers
92
+ const models = options.models ?? createModelVariants(variantOptions).map((v) => v.input)
72
93
 
73
94
  const results: EvalResult<TOutput>[] = []
74
95
  const startTime = Date.now()
75
96
 
76
- console.log(`\n🧪 Running eval: ${name}`)
77
- console.log(` Models: ${models.map(m => m.name).join(', ')}`)
78
- console.log(` Cases: ${cases.length}`)
79
- console.log('')
97
+ log(`\nRunning eval: ${name}`)
98
+ log(` Models: ${models.map((m) => m.name).join(', ')}`)
99
+ log(` Cases: ${cases.length}`)
100
+ log('')
80
101
 
81
102
  // Run all model/case combinations
82
103
  const jobs: Array<{ model: EvalModel; case: EvalCase<TInput, TExpected> }> = []
@@ -96,7 +117,7 @@ export async function runEval<TInput, TOutput, TExpected>(
96
117
 
97
118
  try {
98
119
  // Run the task
99
- const output = await task(job.case.input, job.model)
120
+ const taskOutput = await task(job.case.input, job.model)
100
121
  const latencyMs = Date.now() - caseStart
101
122
 
102
123
  // Run scorers
@@ -105,19 +126,19 @@ export async function runEval<TInput, TOutput, TExpected>(
105
126
  try {
106
127
  const score = await s.scorer({
107
128
  input: job.case.input,
108
- output,
109
- expected: job.case.expected,
129
+ output: taskOutput,
130
+ ...(job.case.expected !== undefined && { expected: job.case.expected }),
110
131
  })
111
132
  scores.push({
112
133
  name: s.name,
113
134
  score: Math.max(0, Math.min(1, score)),
114
- description: s.description,
135
+ ...(s.description && { description: s.description }),
115
136
  })
116
137
  } catch (err) {
117
138
  scores.push({
118
139
  name: s.name,
119
140
  score: 0,
120
- description: s.description,
141
+ ...(s.description && { description: s.description }),
121
142
  metadata: { error: String(err) },
122
143
  })
123
144
  }
@@ -129,32 +150,37 @@ export async function runEval<TInput, TOutput, TExpected>(
129
150
  const estimatedPromptTokens = 100
130
151
  const estimatedCompletionTokens = 200
131
152
  const cost = pricing
132
- ? (estimatedPromptTokens * pricing.prompt + estimatedCompletionTokens * pricing.completion) / 1_000_000
153
+ ? (estimatedPromptTokens * pricing.prompt +
154
+ estimatedCompletionTokens * pricing.completion) /
155
+ 1_000_000
133
156
  : 0
134
157
 
135
- const avgScore = scores.length > 0
136
- ? scores.reduce((sum, s) => sum + s.score, 0) / scores.length
137
- : 0
158
+ const avgScore =
159
+ scores.length > 0 ? scores.reduce((sum, s) => sum + s.score, 0) / scores.length : 0
138
160
 
139
- const symbol = avgScore >= 0.8 ? '' : avgScore >= 0.5 ? '~' : ''
140
- console.log(` ${symbol} ${job.model.name} | ${job.case.name} | ${(avgScore * 100).toFixed(0)}% | ${latencyMs}ms`)
161
+ const symbol = avgScore >= 0.8 ? 'PASS' : avgScore >= 0.5 ? 'WARN' : 'FAIL'
162
+ log(
163
+ ` ${symbol} ${job.model.name} | ${job.case.name} | ${(avgScore * 100).toFixed(
164
+ 0
165
+ )}% | ${latencyMs}ms`
166
+ )
141
167
 
142
168
  return {
143
169
  model: job.model,
144
170
  case: job.case,
145
- output,
171
+ output: taskOutput,
146
172
  scores,
147
173
  latencyMs,
148
174
  cost,
149
175
  }
150
176
  } catch (err) {
151
- console.log(` ${job.model.name} | ${job.case.name} | ERROR: ${err}`)
177
+ log(` FAIL ${job.model.name} | ${job.case.name} | ERROR: ${err}`)
152
178
 
153
179
  return {
154
180
  model: job.model,
155
181
  case: job.case,
156
- output: null as unknown as TOutput,
157
- scores: scorers.map(s => ({ name: s.name, score: 0 })),
182
+ output: null,
183
+ scores: scorers.map((s) => ({ name: s.name, score: 0 })),
158
184
  latencyMs: Date.now() - caseStart,
159
185
  cost: 0,
160
186
  error: String(err),
@@ -169,10 +195,9 @@ export async function runEval<TInput, TOutput, TExpected>(
169
195
  // Calculate summary
170
196
  const totalTime = Date.now() - startTime
171
197
  const totalCost = results.reduce((sum, r) => sum + r.cost, 0)
172
- const allScores = results.flatMap(r => r.scores.map(s => s.score))
173
- const avgScore = allScores.length > 0
174
- ? allScores.reduce((a, b) => a + b, 0) / allScores.length
175
- : 0
198
+ const allScores = results.flatMap((r) => r.scores.map((s) => s.score))
199
+ const avgScore =
200
+ allScores.length > 0 ? allScores.reduce((a, b) => a + b, 0) / allScores.length : 0
176
201
 
177
202
  // Group by model
178
203
  const byModel: Record<string, { avgScore: number; count: number }> = {}
@@ -192,15 +217,15 @@ export async function runEval<TInput, TOutput, TExpected>(
192
217
  }
193
218
  }
194
219
 
195
- console.log('')
196
- console.log(`📊 Results:`)
197
- console.log(` Overall: ${(avgScore * 100).toFixed(1)}%`)
198
- console.log(` Time: ${(totalTime / 1000).toFixed(1)}s`)
199
- console.log(` Cost: $${totalCost.toFixed(4)}`)
200
- console.log('')
201
- console.log(' By Model:')
220
+ log('')
221
+ log(`Results:`)
222
+ log(` Overall: ${(avgScore * 100).toFixed(1)}%`)
223
+ log(` Time: ${(totalTime / 1000).toFixed(1)}s`)
224
+ log(` Cost: $${totalCost.toFixed(4)}`)
225
+ log('')
226
+ log(' By Model:')
202
227
  for (const [modelId, stats] of Object.entries(byModel)) {
203
- console.log(` - ${modelId}: ${(stats.avgScore * 100).toFixed(1)}%`)
228
+ log(` - ${modelId}: ${(stats.avgScore * 100).toFixed(1)}%`)
204
229
  }
205
230
 
206
231
  return {
package/src/eval-log/in-memory.ts ADDED
@@ -0,0 +1,90 @@
1
+ /**
2
+ * InMemoryEvalLogStore — Map-backed default implementation of
3
+ * {@link EvalLogStore}.
4
+ *
5
+ * Matches Evalite v1's default backend: process-local Map keyed on `$id`,
6
+ * insertion-ordered for "most recent first" listing without sorting. Suitable
7
+ * for single-process tests, evals, and the cascade walker's in-flight log;
8
+ * not suitable for cross-process or multi-worker setups (use a disk/SQLite
9
+ * backend for those — same contract).
10
+ *
11
+ * @packageDocumentation
12
+ */
13
+
14
+ import { randomUUID } from 'crypto'
15
+ import type { EvalLogEntry, EvalLogListOptions, EvalLogStore } from './types.js'
16
+
17
+ /**
18
+ * In-memory implementation of {@link EvalLogStore}.
19
+ */
20
+ export class InMemoryEvalLogStore implements EvalLogStore {
21
+ /**
22
+ * Map keyed on `$id`. Insertion order on a JS Map is preserved, so we
23
+ * walk it in reverse for "most recent first" listing.
24
+ */
25
+ private readonly entries: Map<string, EvalLogEntry> = new Map()
26
+
27
+ async record(
28
+ entry: Omit<EvalLogEntry, '$id' | 'createdAt'> &
29
+ Partial<Pick<EvalLogEntry, '$id' | 'createdAt'>>
30
+ ): Promise<EvalLogEntry> {
31
+ const $id = entry.$id ?? randomUUID()
32
+ const createdAt = entry.createdAt ?? Date.now()
33
+ const stored: EvalLogEntry = {
34
+ $id,
35
+ createdAt,
36
+ model: entry.model,
37
+ prompt: entry.prompt,
38
+ response: entry.response,
39
+ usage: entry.usage,
40
+ costUsd: entry.costUsd,
41
+ durationMs: entry.durationMs,
42
+ ...(entry.traceId !== undefined ? { traceId: entry.traceId } : {}),
43
+ ...(entry.tags !== undefined ? { tags: entry.tags } : {}),
44
+ }
45
+ this.entries.set($id, stored)
46
+ return stored
47
+ }
48
+
49
+ async get(id: string): Promise<EvalLogEntry | undefined> {
50
+ return this.entries.get(id)
51
+ }
52
+
53
+ async list(options: EvalLogListOptions = {}): Promise<EvalLogEntry[]> {
54
+ const { traceId, model, tags, limit } = options
55
+ const out: EvalLogEntry[] = []
56
+ // Iterate in reverse insertion order — Map preserves order; we walk
57
+ // values into an array, then reverse for most-recent-first.
58
+ const all = Array.from(this.entries.values()).reverse()
59
+ for (const entry of all) {
60
+ if (traceId !== undefined && entry.traceId !== traceId) continue
61
+ if (model !== undefined && entry.model !== model) continue
62
+ if (tags !== undefined) {
63
+ let matchesAll = true
64
+ for (const k of Object.keys(tags)) {
65
+ if (entry.tags?.[k] !== tags[k]) {
66
+ matchesAll = false
67
+ break
68
+ }
69
+ }
70
+ if (!matchesAll) continue
71
+ }
72
+ out.push(entry)
73
+ if (limit !== undefined && out.length >= limit) break
74
+ }
75
+ return out
76
+ }
77
+
78
+ async delete(id: string): Promise<boolean> {
79
+ return this.entries.delete(id)
80
+ }
81
+
82
+ /**
83
+ * Convenience for tests: drop every entry. Not on the public
84
+ * {@link EvalLogStore} interface because the disk/SQLite backends may not
85
+ * want to expose a one-shot wipe.
86
+ */
87
+ clear(): void {
88
+ this.entries.clear()
89
+ }
90
+ }
package/src/eval-log/index.ts ADDED
@@ -0,0 +1,46 @@
1
+ /**
2
+ * EvalLogStore — pluggable persistence primitive for trace/eval entries.
3
+ *
4
+ * Exports the {@link EvalLogStore} contract, the
5
+ * {@link InMemoryEvalLogStore} default implementation, and a global
6
+ * accessor pair (`getEvalLogStore` / `configureEvalLogStore`) mirroring the
7
+ * marketplace persistence pattern from round 9.
8
+ *
9
+ * @packageDocumentation
10
+ */
11
+
12
+ import { InMemoryEvalLogStore } from './in-memory.js'
13
+ import type { EvalLogStore } from './types.js'
14
+
15
+ export type { EvalLogEntry, EvalLogListOptions, EvalLogStore } from './types.js'
16
+ export { InMemoryEvalLogStore } from './in-memory.js'
17
+
18
+ // ============================================================================
19
+ // Global accessor (lazy default + override)
20
+ // ============================================================================
21
+
22
+ let _store: EvalLogStore | null = null
23
+
24
+ /**
25
+ * Get the global {@link EvalLogStore}. Lazily constructs an
26
+ * {@link InMemoryEvalLogStore} on first call when no store has been
27
+ * configured.
28
+ *
29
+ * Match the round-9 marketplace persistence accessor: callers that don't
30
+ * care about isolation read the global; callers that do (tests, multi-tenant
31
+ * apps) install their own via {@link configureEvalLogStore}.
32
+ */
33
+ export function getEvalLogStore(): EvalLogStore {
34
+ if (_store === null) {
35
+ _store = new InMemoryEvalLogStore()
36
+ }
37
+ return _store
38
+ }
39
+
40
+ /**
41
+ * Install a global {@link EvalLogStore}. Pass `null` to reset to the lazy
42
+ * in-memory default (useful in test teardown).
43
+ */
44
+ export function configureEvalLogStore(store: EvalLogStore | null): void {
45
+ _store = store
46
+ }
package/src/eval-log/types.ts ADDED
@@ -0,0 +1,110 @@
1
+ /**
2
+ * EvalLogStore — pluggable persistence primitive for trace/eval entries.
3
+ *
4
+ * Forward-looking primitive matching Evalite v1's EvalLogStore pattern:
5
+ * the in-memory default ships today; the disk/SQLite/durable backends can
6
+ * land later without breaking the trace middleware contract.
7
+ *
8
+ * Used downstream by `traceMiddleware` (in `../middleware/trace.ts`) as the
9
+ * sink for per-call prompt+response+usage records. The cascade-walker in
10
+ * services-as-software will consume `list()` / `get()` to populate the
11
+ * InvocationEvent stream once round 16+ adds the `'persona-trace'` variant.
12
+ *
13
+ * @packageDocumentation
14
+ */
15
+
16
+ // ============================================================================
17
+ // Types
18
+ // ============================================================================
19
+
20
+ /**
21
+ * A single entry in the eval log — one LLM call with its full payload.
22
+ *
23
+ * Shape mirrors what `traceMiddleware` emits, with optional `tags` for
24
+ * caller-supplied dimensions (persona name, evaluator role, cascade depth).
25
+ */
26
+ export interface EvalLogEntry {
27
+ /** MDXLD identity — typically a UUID generated at insert time. */
28
+ $id: string
29
+ /**
30
+ * Optional caller-supplied trace correlation ID. When the cascade walker
31
+ * spans multiple LLM calls under one user request, all entries share the
32
+ * same `traceId` so `list({ traceId })` rolls them up.
33
+ */
34
+ traceId?: string
35
+ /** Model identifier (e.g. `'anthropic/claude-sonnet-4.5'` or `'sonnet'`). */
36
+ model: string
37
+ /**
38
+ * Stringified prompt as submitted to the model. We don't store the
39
+ * structured `LanguageModelV3Prompt` shape because (a) it's bulky and (b)
40
+ * downstream consumers (replay, fixture diff) only need the text payload.
41
+ */
42
+ prompt: string
43
+ /** The model's text response. Tool calls/files are not stored here. */
44
+ response: string
45
+ /** Token usage as reported by the AI SDK. */
46
+ usage: {
47
+ inputTokens: number
48
+ outputTokens: number
49
+ }
50
+ /** Computed USD cost (caller-supplied via the `pricing` overlay). */
51
+ costUsd: number
52
+ /** Wall-clock duration of the underlying `doGenerate` / `doStream` call. */
53
+ durationMs: number
54
+ /** Caller-supplied dimensions (persona, evaluator role, cascade step). */
55
+ tags?: Record<string, string>
56
+ /** Insert timestamp (epoch ms). */
57
+ createdAt: number
58
+ }
59
+
60
/**
 * Filter options for `EvalLogStore.list`. Every supplied field must match
 * (the fields are AND-combined).
 */
export interface EvalLogListOptions {
  /** Only return entries carrying this trace correlation ID. */
  traceId?: string
  /** Only return entries recorded for this model. */
  model?: string
  /**
   * Only return entries whose `tags` contain every key/value pair given here,
   * i.e. the entry's tags are a *superset* of this object. For example
   * `{ persona: 'cfo' }` matches an entry tagged
   * `{ persona: 'cfo', step: '3' }` but not one tagged `{ persona: 'cto' }`.
   */
  tags?: Record<string, string>
  /** Maximum number of entries to return (most recent first). */
  limit?: number
}
78
+
79
/**
 * Storage contract for eval log entries; implementations are pluggable.
 *
 * Modeled after the Evalite v1 EvalLogStore contract: an in-memory default,
 * with disk JSON / SQLite / durable backends supplied via
 * `configureEvalLogStore`.
 *
 * Every method returns a promise so the contract is the same for all
 * backends, including the in-memory one.
 */
export interface EvalLogStore {
  /**
   * Persist a new entry.
   *
   * @param entry - Entry payload; `$id` and `createdAt` may be omitted.
   * @returns The stored entry, with `$id` and `createdAt` filled in when the
   *   caller left them out.
   */
  record(
    entry: Omit<EvalLogEntry, '$id' | 'createdAt'> &
      Partial<Pick<EvalLogEntry, '$id' | 'createdAt'>>
  ): Promise<EvalLogEntry>
  /**
   * Read an entry by `$id`.
   *
   * @returns The entry, or `undefined` when not found.
   */
  get(id: string): Promise<EvalLogEntry | undefined>
  /**
   * List entries matching the supplied filter.
   *
   * @returns Matching entries, most recent first.
   */
  list(options?: EvalLogListOptions): Promise<EvalLogEntry[]>
  /**
   * Delete an entry.
   *
   * @returns `true` if an entry was actually removed.
   */
  delete(id: string): Promise<boolean>
}