ai-experiments 0.1.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,358 @@
1
+ /**
2
+ * Experiment execution and management
3
+ */
4
+
5
+ import { randomUUID } from 'crypto'
6
+ import type {
7
+ ExperimentConfig,
8
+ ExperimentContext,
9
+ ExperimentResult,
10
+ ExperimentSummary,
11
+ RunExperimentOptions,
12
+ ExperimentVariant,
13
+ } from './types.js'
14
+ import { track } from './tracking.js'
15
+
16
/**
 * Create and run an A/B experiment with multiple variants
 *
 * Every variant is executed through `runVariant`, the per-variant outcomes are
 * collected in execution order, and the variant with the highest metric value
 * is reported as `bestVariant`.
 *
 * Execution modes (selected through `options`):
 * - `parallel: true` (default) without `maxConcurrency`: all variants start at
 *   once. `stopOnError` has no effect in this mode because nothing is left to
 *   cancel once every variant has been started.
 * - `parallel: true` with a positive `maxConcurrency`: variants run in
 *   consecutive batches of at most `maxConcurrency`; a batch is awaited in full
 *   before the next one starts. With `stopOnError`, no further batch is started
 *   after a batch that contains a failed variant.
 * - `parallel: false`: variants run one after another. With `stopOnError`, the
 *   loop ends after the first failed variant.
 *
 * An `experiment.start` event is tracked before execution and an
 * `experiment.complete` event afterwards; `config.metadata` is spread into the
 * data of both events.
 *
 * @param config - Experiment definition: id, name, variants, `execute` and an
 *   optional `metric`.
 * @param options - Execution options and per-variant lifecycle callbacks.
 * @returns Summary holding every collected result, success/failure counts,
 *   timing information and the best variant. `bestVariant` is `undefined` when
 *   no successful result carries a usable (non-NaN) metric value.
 *
 * @example
 * ```ts
 * import { Experiment } from 'ai-experiments'
 *
 * const results = await Experiment({
 *   id: 'prompt-comparison',
 *   name: 'Prompt Engineering Test',
 *   variants: [
 *     {
 *       id: 'baseline',
 *       name: 'Baseline Prompt',
 *       config: { prompt: 'Summarize this text.' },
 *     },
 *     {
 *       id: 'detailed',
 *       name: 'Detailed Prompt',
 *       config: { prompt: 'Provide a comprehensive summary...' },
 *     },
 *   ],
 *   execute: async (config) => {
 *     return await ai.generate({ prompt: config.prompt })
 *   },
 *   metric: (result) => result.quality_score,
 * })
 *
 * console.log('Best variant:', results.bestVariant)
 * ```
 */
export async function Experiment<TConfig = unknown, TResult = unknown>(
  config: ExperimentConfig<TConfig, TResult>,
  options: RunExperimentOptions = {}
): Promise<ExperimentSummary<TResult>> {
  const {
    parallel = true,
    maxConcurrency,
    stopOnError = false,
    context: contextData,
    onVariantStart,
    onVariantComplete,
    onVariantError,
  } = options

  const experimentStartTime = new Date()

  // Track experiment start
  track({
    type: 'experiment.start',
    timestamp: experimentStartTime,
    data: {
      experimentId: config.id,
      experimentName: config.name,
      variantCount: config.variants.length,
      parallel,
      ...config.metadata,
    },
  })

  const results: ExperimentResult<TResult>[] = []

  // Shared by every execution mode so all variants see the same callbacks.
  const executeVariant = (
    variant: ExperimentVariant<TConfig>
  ): Promise<ExperimentResult<TResult>> =>
    runVariant(config, variant, contextData, {
      onVariantStart,
      onVariantComplete,
      onVariantError,
    })

  if (parallel) {
    if (maxConcurrency && maxConcurrency > 0) {
      // Bounded parallelism: run fixed-size batches one after another.
      for (const chunk of chunkArray(config.variants, maxConcurrency)) {
        const chunkResults = await Promise.all(chunk.map(executeVariant))
        results.push(...chunkResults)

        // Variants of the current batch have already run; only later batches are skipped.
        if (stopOnError && chunkResults.some((r) => !r.success)) {
          break
        }
      }
    } else {
      // Unbounded parallelism: everything starts immediately, so stopOnError cannot apply.
      const allResults = await Promise.all(config.variants.map(executeVariant))
      results.push(...allResults)
    }
  } else {
    // Sequential execution
    for (const variant of config.variants) {
      const result = await executeVariant(variant)
      results.push(result)

      // Stop on first error if configured
      if (stopOnError && !result.success) {
        break
      }
    }
  }

  const experimentEndTime = new Date()
  const totalDuration = experimentEndTime.getTime() - experimentStartTime.getTime()

  const bestVariant = pickBestVariant(results)

  const summary: ExperimentSummary<TResult> = {
    experimentId: config.id,
    experimentName: config.name,
    results,
    bestVariant,
    totalDuration,
    successCount: results.filter((r) => r.success).length,
    failureCount: results.filter((r) => !r.success).length,
    startedAt: experimentStartTime,
    completedAt: experimentEndTime,
  }

  // Track experiment completion
  track({
    type: 'experiment.complete',
    timestamp: experimentEndTime,
    data: {
      experimentId: config.id,
      experimentName: config.name,
      successCount: summary.successCount,
      failureCount: summary.failureCount,
      totalDuration,
      bestVariant: bestVariant?.variantId,
      ...config.metadata,
    },
  })

  return summary
}

/**
 * Select the successful result with the highest metric value.
 *
 * Results without a metric value are ignored. NaN metric values are ignored as
 * well: every comparison against NaN is false, so a NaN on an early result
 * would otherwise win over all real scores. Ties keep the earliest result.
 *
 * @param results - Collected variant results, in execution order.
 * @returns Identity and metric of the best result, or `undefined` when no
 *   successful result has a usable metric value.
 */
function pickBestVariant<TResult>(
  results: ExperimentResult<TResult>[]
): ExperimentSummary<TResult>['bestVariant'] {
  let best: ExperimentResult<TResult> | undefined
  let bestValue = -Infinity

  for (const candidate of results) {
    const value = candidate.metricValue
    if (!candidate.success || value === undefined || Number.isNaN(value)) {
      continue
    }
    // The `best === undefined` arm lets a legitimate -Infinity score be selected.
    if (best === undefined || value > bestValue) {
      best = candidate
      bestValue = value
    }
  }

  if (best === undefined) {
    return undefined
  }

  return {
    variantId: best.variantId,
    variantName: best.variantName,
    metricValue: bestValue,
  }
}
171
+
172
/**
 * Execute one variant and describe the outcome as an `ExperimentResult`.
 *
 * A fresh run id is generated per call. A `variant.start` event is tracked and
 * `onVariantStart` is invoked before `config.execute` runs. On success the
 * optional metric is evaluated (tracked as `metric.computed`), a
 * `variant.complete` event is tracked and `onVariantComplete` is invoked.
 * Anything thrown inside the guarded section is normalised to an `Error`,
 * tracked as `variant.error`, passed to `onVariantError`, and returned as a
 * result with `success: false` instead of being rethrown.
 *
 * @param config - Experiment definition providing `execute` and optional `metric`.
 * @param variant - The variant to execute.
 * @param contextData - Caller-supplied data exposed to `execute` as `context.data`.
 * @param callbacks - Optional lifecycle hooks.
 * @returns The result record for this run.
 */
async function runVariant<TConfig, TResult>(
  config: ExperimentConfig<TConfig, TResult>,
  variant: ExperimentVariant<TConfig>,
  contextData: Record<string, unknown> | undefined,
  callbacks: {
    onVariantStart?: (variantId: string, variantName: string) => void
    onVariantComplete?: (result: ExperimentResult<TResult>) => void
    onVariantError?: (variantId: string, error: Error) => void
  }
): Promise<ExperimentResult<TResult>> {
  const runId = randomUUID()
  const startTime = new Date()

  // Milliseconds between the start of this run and the given instant.
  const elapsedSinceStart = (instant: Date): number => instant.getTime() - startTime.getTime()

  const context: ExperimentContext = {
    experimentId: config.id,
    variantId: variant.id,
    runId,
    startedAt: startTime,
    data: contextData,
  }

  // Announce the run before any user code executes.
  track({
    type: 'variant.start',
    timestamp: startTime,
    data: {
      experimentId: config.id,
      variantId: variant.id,
      variantName: variant.name,
      runId,
    },
  })

  callbacks.onVariantStart?.(variant.id, variant.name)

  try {
    const output = await config.execute(variant.config, context)

    // Duration covers execution only; metric evaluation below is not included.
    const finishedAt = new Date()
    const elapsed = elapsedSinceStart(finishedAt)

    let score: number | undefined
    if (config.metric) {
      score = await config.metric(output)

      track({
        type: 'metric.computed',
        timestamp: new Date(),
        data: {
          experimentId: config.id,
          variantId: variant.id,
          runId,
          metricValue: score,
        },
      })
    }

    const outcome: ExperimentResult<TResult> = {
      experimentId: config.id,
      variantId: variant.id,
      variantName: variant.name,
      runId,
      result: output,
      metricValue: score,
      duration: elapsed,
      startedAt: startTime,
      completedAt: finishedAt,
      success: true,
    }

    track({
      type: 'variant.complete',
      timestamp: finishedAt,
      data: {
        experimentId: config.id,
        variantId: variant.id,
        variantName: variant.name,
        runId,
        duration: elapsed,
        metricValue: score,
        success: true,
      },
    })

    callbacks.onVariantComplete?.(outcome)

    return outcome
  } catch (thrown) {
    const failedAt = new Date()
    const elapsed = elapsedSinceStart(failedAt)

    // Non-Error throwables are wrapped so consumers always receive an Error.
    let failure: Error
    if (thrown instanceof Error) {
      failure = thrown
    } else {
      failure = new Error(String(thrown))
    }

    track({
      type: 'variant.error',
      timestamp: failedAt,
      data: {
        experimentId: config.id,
        variantId: variant.id,
        variantName: variant.name,
        runId,
        duration: elapsed,
        error: failure.message,
        stack: failure.stack,
      },
    })

    callbacks.onVariantError?.(variant.id, failure)

    const failedOutcome: ExperimentResult<TResult> = {
      experimentId: config.id,
      variantId: variant.id,
      variantName: variant.name,
      runId,
      // No value was produced; consumers must check `success` before reading `result`.
      result: undefined as unknown as TResult,
      duration: elapsed,
      startedAt: startTime,
      completedAt: failedAt,
      error: failure,
      success: false,
    }
    return failedOutcome
  }
}
300
+
301
/**
 * Split an array into consecutive chunks of at most `size` elements.
 *
 * The input array is not modified. A fractional `size` is rounded down (but
 * never below 1) so that no chunk exceeds the requested limit.
 *
 * @param array - Items to split.
 * @param size - Maximum chunk length; must be greater than 0.
 * @returns The chunks in original order; an empty array for empty input.
 * @throws RangeError when `array` is non-empty and `size` is NaN, zero or
 *   negative (a non-positive step would otherwise never terminate).
 */
function chunkArray<T>(array: T[], size: number): T[][] {
  if (array.length === 0) {
    return []
  }
  // `!(size > 0)` also rejects NaN, which a plain `size <= 0` check would let through.
  if (!(size > 0)) {
    throw new RangeError(`chunkArray: size must be a positive number, received ${size}`)
  }

  const step = Math.max(1, Math.floor(size))
  const chunks: T[][] = []
  for (let i = 0; i < array.length; i += step) {
    chunks.push(array.slice(i, i + step))
  }
  return chunks
}
311
+
312
/**
 * Build one experiment variant for every combination of values in a parameter grid.
 *
 * Each key of `grid` maps to the list of values to try for that parameter.
 * Variants receive ids `variant-0`, `variant-1`, ... in combination order, and
 * a name made of `key=value` pairs joined with `, `.
 *
 * @param grid - Parameter names mapped to their candidate values.
 * @returns One variant per combination, with `config` holding the chosen value
 *   for every key.
 *
 * @example
 * ```ts
 * const variants = createVariantsFromGrid({
 *   temperature: [0.3, 0.7, 1.0],
 *   model: ['sonnet', 'opus'],
 *   maxTokens: [100, 500],
 * })
 * // Returns 12 variants (3 * 2 * 2 combinations)
 * ```
 */
export function createVariantsFromGrid<T extends Record<string, unknown[]>>(
  grid: T
): ExperimentVariant<{ [K in keyof T]: T[K][number] }>[] {
  type GridConfig = { [K in keyof T]: T[K][number] }

  const paramNames = Object.keys(grid) as (keyof T)[]
  const axes = paramNames.map((paramName) => grid[paramName])

  // Every combination lists its values in the same order as `paramNames`.
  return cartesianProduct(axes).map((choice, index) => {
    const entries = paramNames.map((paramName, position) => [paramName, choice[position]])
    const config = Object.fromEntries(entries) as GridConfig

    const labels = paramNames.map(
      (paramName, position) => `${String(paramName)}=${choice[position]}`
    )

    return {
      id: `variant-${index}`,
      name: labels.join(', '),
      config,
    }
  })
}
346
+
347
/**
 * Compute the cartesian product of a list of arrays.
 *
 * Each output tuple takes one element from every input array, in input order.
 * The first array varies slowest and the last array varies fastest. With no
 * input arrays the result is a single empty tuple.
 *
 * @param arrays - The value lists to combine.
 * @returns Every combination as an array of tuples.
 */
function cartesianProduct<T>(arrays: T[][]): T[][] {
  if (arrays.length === 0) {
    return [[]]
  }

  // Seed with singletons from the last array, then prepend earlier arrays right-to-left.
  let product: T[][] = arrays[arrays.length - 1]!.map((value) => [value])

  for (let position = arrays.length - 2; position >= 0; position--) {
    const tails = product
    product = arrays[position]!.flatMap((head) => tails.map((tail) => [head, ...tail]))
  }

  return product
}
package/src/index.ts ADDED
@@ -0,0 +1,44 @@
1
+ /**
2
+ * ai-experiments - AI-powered experimentation primitives for testing and evaluating models
3
+ *
4
+ * This package provides tools for A/B testing, parameter exploration, decision making,
5
+ * and tracking in AI applications.
6
+ *
7
+ * @packageDocumentation
8
+ */
9
+
10
+ // Export core types
11
+ export * from './types.js'
12
+
13
+ // Export experiment functionality
14
+ export { Experiment, createVariantsFromGrid } from './experiment.js'
15
+
16
+ // Export cartesian product utilities
17
+ export {
18
+ cartesian,
19
+ cartesianFilter,
20
+ cartesianSample,
21
+ cartesianCount,
22
+ cartesianWithLabels,
23
+ } from './cartesian.js'
24
+
25
+ // Export decision making utilities
26
+ export {
27
+ decide,
28
+ decideWeighted,
29
+ decideEpsilonGreedy,
30
+ decideThompsonSampling,
31
+ decideUCB,
32
+ } from './decide.js'
33
+
34
+ // Export tracking utilities
35
+ export {
36
+ track,
37
+ flush,
38
+ configureTracking,
39
+ getTrackingConfig,
40
+ createConsoleBackend,
41
+ createMemoryBackend,
42
+ createBatchBackend,
43
+ createFileBackend,
44
+ } from './tracking.js'