ai-experiments 0.1.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +5 -0
- package/CHANGELOG.md +8 -0
- package/README.md +306 -91
- package/dist/cartesian.d.ts +140 -0
- package/dist/cartesian.d.ts.map +1 -0
- package/dist/cartesian.js +216 -0
- package/dist/cartesian.js.map +1 -0
- package/dist/decide.d.ts +152 -0
- package/dist/decide.d.ts.map +1 -0
- package/dist/decide.js +329 -0
- package/dist/decide.js.map +1 -0
- package/dist/experiment.d.ts +53 -0
- package/dist/experiment.d.ts.map +1 -0
- package/dist/experiment.js +292 -0
- package/dist/experiment.js.map +1 -0
- package/dist/index.d.ts +14 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +19 -0
- package/dist/index.js.map +1 -0
- package/dist/tracking.d.ts +159 -0
- package/dist/tracking.d.ts.map +1 -0
- package/dist/tracking.js +310 -0
- package/dist/tracking.js.map +1 -0
- package/dist/types.d.ts +198 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +5 -0
- package/dist/types.js.map +1 -0
- package/examples.ts +261 -0
- package/package.json +21 -39
- package/src/cartesian.ts +259 -0
- package/src/decide.ts +398 -0
- package/src/experiment.ts +358 -0
- package/src/index.ts +44 -0
- package/src/tracking.ts +339 -0
- package/src/types.ts +215 -0
- package/tsconfig.json +9 -0
|
@@ -0,0 +1,358 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Experiment execution and management
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { randomUUID } from 'crypto'
|
|
6
|
+
import type {
|
|
7
|
+
ExperimentConfig,
|
|
8
|
+
ExperimentContext,
|
|
9
|
+
ExperimentResult,
|
|
10
|
+
ExperimentSummary,
|
|
11
|
+
RunExperimentOptions,
|
|
12
|
+
ExperimentVariant,
|
|
13
|
+
} from './types.js'
|
|
14
|
+
import { track } from './tracking.js'
|
|
15
|
+
|
|
16
|
+
/**
 * Create and run an A/B experiment with multiple variants.
 *
 * Each variant in `config.variants` is executed through `runVariant`, and the
 * per-variant results are collected into an `ExperimentSummary`. An
 * `experiment.start` event is tracked before any variant runs and an
 * `experiment.complete` event is tracked once all executed variants finish.
 *
 * Execution modes (taken from `options`):
 * - `parallel` true (the default) without a positive `maxConcurrency`: every
 *   variant is started at once. `stopOnError` has no effect in this mode,
 *   because all variants are already in flight.
 * - `parallel` true with a positive `maxConcurrency`: variants run in
 *   consecutive batches of at most that many. With `stopOnError`, no further
 *   batch is started after a batch that contains a failed result.
 * - `parallel` false: variants run one at a time. With `stopOnError`, the loop
 *   stops after the first failed result.
 *
 * Best variant: the successful result with the highest `metricValue`. Results
 * whose metric is missing or `NaN` are not ranked, and ties go to the result
 * that appears first in `results`. `bestVariant` is left undefined when no
 * result can be ranked.
 *
 * @param config - Experiment definition: id, name, variants, `execute`, and an
 *   optional `metric`. `config.metadata` is spread into the tracked event data.
 * @param options - Execution options and per-variant callbacks.
 * @returns Summary with all collected results, success/failure counts, timing
 *   information and the best variant (if any).
 *
 * @example
 * ```ts
 * import { Experiment } from 'ai-experiments'
 *
 * const results = await Experiment({
 *   id: 'prompt-comparison',
 *   name: 'Prompt Engineering Test',
 *   variants: [
 *     {
 *       id: 'baseline',
 *       name: 'Baseline Prompt',
 *       config: { prompt: 'Summarize this text.' },
 *     },
 *     {
 *       id: 'detailed',
 *       name: 'Detailed Prompt',
 *       config: { prompt: 'Provide a comprehensive summary...' },
 *     },
 *   ],
 *   execute: async (config) => {
 *     return await ai.generate({ prompt: config.prompt })
 *   },
 *   metric: (result) => result.quality_score,
 * })
 *
 * console.log('Best variant:', results.bestVariant)
 * ```
 */
export async function Experiment<TConfig = unknown, TResult = unknown>(
  config: ExperimentConfig<TConfig, TResult>,
  options: RunExperimentOptions = {}
): Promise<ExperimentSummary<TResult>> {
  const {
    parallel = true,
    maxConcurrency,
    stopOnError = false,
    context: contextData,
    onVariantStart,
    onVariantComplete,
    onVariantError,
  } = options

  const experimentStartTime = new Date()

  // Track experiment start
  track({
    type: 'experiment.start',
    timestamp: experimentStartTime,
    data: {
      experimentId: config.id,
      experimentName: config.name,
      variantCount: config.variants.length,
      parallel,
      ...config.metadata,
    },
  })

  const results: ExperimentResult<TResult>[] = []

  // Single place that wires the caller's callbacks into a variant run.
  const executeVariant = (
    variant: ExperimentVariant<TConfig>
  ): Promise<ExperimentResult<TResult>> =>
    runVariant(config, variant, contextData, {
      onVariantStart,
      onVariantComplete,
      onVariantError,
    })

  if (parallel) {
    if (maxConcurrency && maxConcurrency > 0) {
      // Bounded parallelism: run fixed-size batches one after another.
      const chunks = chunkArray(config.variants, maxConcurrency)
      for (const chunk of chunks) {
        const chunkResults = await Promise.all(chunk.map(executeVariant))
        results.push(...chunkResults)

        // Stop before the next batch if this one contained a failure.
        if (stopOnError && chunkResults.some((r) => !r.success)) {
          break
        }
      }
    } else {
      // Unbounded parallelism: everything starts at once, so stopOnError
      // cannot prevent any variant from running here.
      const allResults = await Promise.all(config.variants.map(executeVariant))
      results.push(...allResults)
    }
  } else {
    // Sequential execution
    for (const variant of config.variants) {
      const result = await executeVariant(variant)
      results.push(result)

      // Stop on first error if configured
      if (stopOnError && !result.success) {
        break
      }
    }
  }

  const experimentEndTime = new Date()
  const totalDuration = experimentEndTime.getTime() - experimentStartTime.getTime()

  // Find best variant by metric. Only real numbers are ranked: NaN compares
  // false against everything, so letting it in would make the winner depend
  // on result order instead of on the scores.
  let bestVariant: ExperimentSummary<TResult>['bestVariant']
  let bestMetric = -Infinity
  for (const candidate of results) {
    const metric = candidate.metricValue
    if (!candidate.success || typeof metric !== 'number' || Number.isNaN(metric)) {
      continue
    }
    // Strict ">" keeps the earliest result on ties.
    if (bestVariant === undefined || metric > bestMetric) {
      bestMetric = metric
      bestVariant = {
        variantId: candidate.variantId,
        variantName: candidate.variantName,
        metricValue: metric,
      }
    }
  }

  const summary: ExperimentSummary<TResult> = {
    experimentId: config.id,
    experimentName: config.name,
    results,
    bestVariant,
    totalDuration,
    successCount: results.filter((r) => r.success).length,
    failureCount: results.filter((r) => !r.success).length,
    startedAt: experimentStartTime,
    completedAt: experimentEndTime,
  }

  // Track experiment completion
  track({
    type: 'experiment.complete',
    timestamp: experimentEndTime,
    data: {
      experimentId: config.id,
      experimentName: config.name,
      successCount: summary.successCount,
      failureCount: summary.failureCount,
      totalDuration,
      bestVariant: bestVariant?.variantId,
      ...config.metadata,
    },
  })

  return summary
}
|
|
171
|
+
|
|
172
|
+
/**
 * Run a single variant and describe the outcome as an `ExperimentResult`.
 *
 * A fresh run id is generated, a `variant.start` event is tracked and
 * `onVariantStart` is invoked before the guarded section, so an exception
 * thrown by `onVariantStart` propagates to the caller.
 *
 * Inside the guarded section the variant is executed, the optional metric is
 * computed (tracked as `metric.computed`), a `variant.complete` event is
 * tracked and `onVariantComplete` is invoked. Anything thrown there, including
 * by `config.metric` or by `onVariantComplete` itself, is caught and reported
 * as a failed result: a `variant.error` event is tracked, `onVariantError` is
 * invoked, and the returned result has `success: false`. Thrown values that
 * are not `Error` instances are wrapped via `new Error(String(value))`.
 *
 * `duration` on a successful result is measured up to the moment `execute`
 * resolves; time spent computing the metric is not included.
 *
 * @param config - Experiment definition providing `id`, `execute` and the
 *   optional `metric`.
 * @param variant - The variant to run.
 * @param contextData - Caller-supplied data exposed as `context.data`.
 * @param callbacks - Optional lifecycle hooks for this run.
 * @returns The result of the run; failures are returned, not thrown.
 */
async function runVariant<TConfig, TResult>(
  config: ExperimentConfig<TConfig, TResult>,
  variant: ExperimentVariant<TConfig>,
  contextData: Record<string, unknown> | undefined,
  callbacks: {
    onVariantStart?: (variantId: string, variantName: string) => void
    onVariantComplete?: (result: ExperimentResult<TResult>) => void
    onVariantError?: (variantId: string, error: Error) => void
  }
): Promise<ExperimentResult<TResult>> {
  const runId = randomUUID()
  const startTime = new Date()

  // Fields shared by most tracked events and by both result shapes.
  const identity = {
    experimentId: config.id,
    variantId: variant.id,
    variantName: variant.name,
    runId,
  }

  // Milliseconds between the start of this run and the given instant.
  const elapsedUntil = (instant: Date): number => instant.getTime() - startTime.getTime()

  const context: ExperimentContext = {
    experimentId: identity.experimentId,
    variantId: identity.variantId,
    runId,
    startedAt: startTime,
    data: contextData,
  }

  track({
    type: 'variant.start',
    timestamp: startTime,
    data: { ...identity },
  })

  callbacks.onVariantStart?.(variant.id, variant.name)

  try {
    const output = await config.execute(variant.config, context)

    // The end time is captured before the metric runs, so the metric's cost
    // is excluded from the reported duration.
    const endTime = new Date()
    const duration = elapsedUntil(endTime)

    let metricValue: number | undefined
    if (config.metric) {
      metricValue = await config.metric(output)

      track({
        type: 'metric.computed',
        timestamp: new Date(),
        data: {
          experimentId: identity.experimentId,
          variantId: identity.variantId,
          runId,
          metricValue,
        },
      })
    }

    const outcome: ExperimentResult<TResult> = {
      ...identity,
      result: output,
      metricValue,
      duration,
      startedAt: startTime,
      completedAt: endTime,
      success: true,
    }

    track({
      type: 'variant.complete',
      timestamp: endTime,
      data: {
        ...identity,
        duration,
        metricValue,
        success: true,
      },
    })

    // Still inside the try: a throw from this callback is handled below and
    // yields a failed result.
    callbacks.onVariantComplete?.(outcome)

    return outcome
  } catch (error) {
    const endTime = new Date()
    const duration = elapsedUntil(endTime)
    const err = error instanceof Error ? error : new Error(String(error))

    track({
      type: 'variant.error',
      timestamp: endTime,
      data: {
        ...identity,
        duration,
        error: err.message,
        stack: err.stack,
      },
    })

    callbacks.onVariantError?.(variant.id, err)

    return {
      ...identity,
      // No value was produced; the field is filled with undefined.
      result: undefined as unknown as TResult,
      duration,
      startedAt: startTime,
      completedAt: endTime,
      error: err,
      success: false,
    }
  }
}
|
|
300
|
+
|
|
301
|
+
/**
 * Split an array into consecutive chunks of at most `size` elements.
 *
 * The input array is not modified; each chunk is a new array and the last
 * chunk may be shorter than the others.
 *
 * @param array - Items to split.
 * @param size - Maximum chunk length. Fractional values are rounded down (to
 *   at least 1); `Infinity` yields a single chunk.
 * @returns The chunks in original order; an empty array for empty input.
 * @throws RangeError if `size` is `NaN`, zero or negative. A step of zero
 *   would never advance the loop.
 */
function chunkArray<T>(array: T[], size: number): T[][] {
  if (Number.isNaN(size) || size <= 0) {
    throw new RangeError(`chunkArray: size must be a positive number, got ${size}`)
  }

  // slice() truncates fractional bounds, so a fractional step would produce
  // empty or uneven chunks; normalise to a whole step of at least 1.
  const step = Math.max(1, Math.floor(size))

  const chunks: T[][] = []
  for (let i = 0; i < array.length; i += step) {
    chunks.push(array.slice(i, i + step))
  }
  return chunks
}
|
|
311
|
+
|
|
312
|
+
/**
 * Build one experiment variant for every combination of values in a
 * parameter grid.
 *
 * Parameters are taken in `Object.keys(grid)` order. Each variant gets the id
 * `variant-<index>` (zero-based, in combination order) and a name of the form
 * `key=value, key=value`, where values are converted with template-literal
 * string conversion. The first parameter varies slowest.
 *
 * An empty grid yields a single variant with an empty config and an empty
 * name; a parameter with an empty value list yields no variants.
 *
 * @param grid - Map from parameter name to the list of values to try.
 * @returns One variant per combination of parameter values.
 *
 * @example
 * ```ts
 * const variants = createVariantsFromGrid({
 *   temperature: [0.3, 0.7, 1.0],
 *   model: ['sonnet', 'opus'],
 *   maxTokens: [100, 500],
 * })
 * // Returns 12 variants (3 * 2 * 2 combinations)
 * ```
 */
export function createVariantsFromGrid<T extends Record<string, unknown[]>>(
  grid: T
): ExperimentVariant<{ [K in keyof T]: T[K][number] }>[] {
  const paramNames = Object.keys(grid) as (keyof T)[]
  const paramValues = paramNames.map((paramName) => grid[paramName])

  return cartesianProduct(paramValues).map((combo, index) => {
    // Walk the parameters once, collecting config entries and label parts.
    const entries: [keyof T, unknown][] = []
    const labelParts: string[] = []
    paramNames.forEach((paramName, position) => {
      const value = combo[position]
      entries.push([paramName, value])
      labelParts.push(`${String(paramName)}=${value}`)
    })

    const variantConfig = Object.fromEntries(entries) as { [K in keyof T]: T[K][number] }

    return {
      id: `variant-${index}`,
      name: labelParts.join(', '),
      config: variantConfig,
    }
  })
}
|
|
346
|
+
|
|
347
|
+
/**
 * Compute the cartesian product of a list of arrays.
 *
 * Each output tuple holds one element from every input array, in input order.
 * Tuples are ordered so that the first array varies slowest and the last
 * array varies fastest.
 *
 * @param arrays - The arrays to combine.
 * @returns Every combination; `[[]]` when `arrays` is empty, and `[]` when
 *   any input array is empty.
 */
function cartesianProduct<T>(arrays: T[][]): T[][] {
  // Start from the single empty tuple and extend every partial tuple with
  // each option of the next array.
  return arrays.reduce<T[][]>(
    (partials, options) =>
      partials.flatMap((prefix) => options.map((option) => [...prefix, option])),
    [[]]
  )
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ai-experiments - AI-powered experimentation primitives for testing and evaluating models
|
|
3
|
+
*
|
|
4
|
+
* This package provides tools for A/B testing, parameter exploration, decision making,
|
|
5
|
+
* and tracking in AI applications.
|
|
6
|
+
*
|
|
7
|
+
* @packageDocumentation
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
// Export core types
|
|
11
|
+
export * from './types.js'
|
|
12
|
+
|
|
13
|
+
// Export experiment functionality
|
|
14
|
+
export { Experiment, createVariantsFromGrid } from './experiment.js'
|
|
15
|
+
|
|
16
|
+
// Export cartesian product utilities
|
|
17
|
+
export {
|
|
18
|
+
cartesian,
|
|
19
|
+
cartesianFilter,
|
|
20
|
+
cartesianSample,
|
|
21
|
+
cartesianCount,
|
|
22
|
+
cartesianWithLabels,
|
|
23
|
+
} from './cartesian.js'
|
|
24
|
+
|
|
25
|
+
// Export decision making utilities
|
|
26
|
+
export {
|
|
27
|
+
decide,
|
|
28
|
+
decideWeighted,
|
|
29
|
+
decideEpsilonGreedy,
|
|
30
|
+
decideThompsonSampling,
|
|
31
|
+
decideUCB,
|
|
32
|
+
} from './decide.js'
|
|
33
|
+
|
|
34
|
+
// Export tracking utilities
|
|
35
|
+
export {
|
|
36
|
+
track,
|
|
37
|
+
flush,
|
|
38
|
+
configureTracking,
|
|
39
|
+
getTrackingConfig,
|
|
40
|
+
createConsoleBackend,
|
|
41
|
+
createMemoryBackend,
|
|
42
|
+
createBatchBackend,
|
|
43
|
+
createFileBackend,
|
|
44
|
+
} from './tracking.js'
|