@langfuse/client 4.0.0 → 4.1.0-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1,17 +1,404 @@
1
1
  import * as _langfuse_core from '@langfuse/core';
2
- import { LangfuseAPIClient, Dataset, DatasetItem, DatasetRunItem, ParsedMediaReference, CreatePromptRequest, ChatMessage, ChatMessageWithPlaceholders, PlaceholderMessage, BasePrompt, Prompt, ScoreBody } from '@langfuse/core';
2
+ import { DatasetItem, ScoreBody, Dataset, DatasetRunItem, LangfuseAPIClient, ParsedMediaReference, CreatePromptRequest, ChatMessage, ChatMessageWithPlaceholders, PlaceholderMessage, BasePrompt, Prompt } from '@langfuse/core';
3
3
  import { Span } from '@opentelemetry/api';
4
4
 
5
+ type ExperimentItem<Input = any, ExpectedOutput = any, Metadata extends Record<string, any> = Record<string, any>> = {
6
+ /**
7
+ * The input data to pass to the task function.
8
+ *
9
+ * Can be any type - string, object, array, etc. This data will be passed
10
+ * to your task function as the `input` parameter. Structure it according
11
+ * to your task's requirements.
12
+ */
13
+ input?: Input;
14
+ /**
15
+ * The expected output for evaluation purposes.
16
+ *
17
+ * Optional ground truth or reference output for this input.
18
+ * Used by evaluators to assess task performance. If not provided,
19
+ * only evaluators that don't require expected output can be used.
20
+ */
21
+ expectedOutput?: ExpectedOutput;
22
+ /**
23
+ * Optional metadata to attach to the experiment item.
24
+ *
25
+ * Store additional context, tags, or custom data related to this specific item.
26
+ * This metadata will be available in traces and can be used for filtering,
27
+ * analysis, or custom evaluator logic.
28
+ */
29
+ metadata?: Metadata;
30
+ } | DatasetItem;
31
+ /**
32
+ * Parameters passed to an experiment task function.
33
+ *
34
+ * Can be either an ExperimentItem (for custom datasets) or a DatasetItem
35
+ * (for Langfuse datasets). The task function should handle both types.
36
+ *
37
+ * @public
38
+ * @since 4.1.0
39
+ */
40
+ type ExperimentTaskParams<Input = any, ExpectedOutput = any, Metadata extends Record<string, any> = Record<string, any>> = ExperimentItem<Input, ExpectedOutput, Metadata>;
41
+ /**
42
+ * Function type for experiment tasks that process input data and return output.
43
+ *
44
+ * The task function is the core component being tested in an experiment.
45
+ * It receives either an ExperimentItem or DatasetItem and produces output
46
+ * that will be evaluated.
47
+ *
48
+ * @param params - Either an ExperimentItem or DatasetItem containing input and metadata
49
+ * @returns Promise resolving to the task's output (any type)
50
+ *
51
+ * @example Task handling both item types
52
+ * ```typescript
53
+ * const universalTask: ExperimentTask = async (item) => {
54
+ * // Works with both ExperimentItem and DatasetItem
55
+ * const input = item.input;
56
+ * const metadata = item.metadata;
57
+ *
58
+ * const response = await openai.chat.completions.create({
59
+ * model: "gpt-4",
60
+ * messages: [{ role: "user", content: input }]
61
+ * });
62
+ *
63
+ * return response.choices[0].message.content;
64
+ * };
65
+ * ```
66
+ *
67
+ * @public
68
+ * @since 4.1.0
69
+ */
70
+ type ExperimentTask<Input = any, ExpectedOutput = any, Metadata extends Record<string, any> = Record<string, any>> = (params: ExperimentTaskParams<Input, ExpectedOutput, Metadata>) => Promise<any>;
71
+ type Evaluation = Pick<ScoreBody, "name" | "value" | "comment" | "metadata" | "dataType">;
72
+ type EvaluatorParams<Input = any, ExpectedOutput = any, Metadata extends Record<string, any> = Record<string, any>> = {
73
+ /**
74
+ * The original input data passed to the task.
75
+ *
76
+ * This is the same input that was provided to the task function.
77
+ * Use this for context-aware evaluations or input-output relationship analysis.
78
+ */
79
+ input: Input;
80
+ /**
81
+ * The output produced by the task.
82
+ *
83
+ * This is the actual result returned by your task function.
84
+ * This is the primary value to evaluate against expectations.
85
+ */
86
+ output: any;
87
+ /**
88
+ * The expected output for comparison (optional).
89
+ *
90
+ * This is the ground truth or expected result for the given input.
91
+ * May not be available for all evaluation scenarios.
92
+ */
93
+ expectedOutput?: ExpectedOutput;
94
+ /**
95
+ * Optional metadata about the evaluation context.
96
+ *
97
+ * Contains additional information from the experiment item or dataset item
98
+ * that may be useful for evaluation logic, such as tags, categories,
99
+ * or other contextual data.
100
+ */
101
+ metadata?: Metadata;
102
+ };
103
+ type Evaluator<Input = any, ExpectedOutput = any, Metadata extends Record<string, any> = Record<string, any>> = (params: EvaluatorParams<Input, ExpectedOutput, Metadata>) => Promise<Evaluation[] | Evaluation>;
104
+ type RunEvaluatorParams<Input = any, ExpectedOutput = any, Metadata extends Record<string, any> = Record<string, any>> = {
105
+ /**
106
+ * Results from all processed experiment items.
107
+ *
108
+ * Each item contains the input, output, evaluations, and metadata from
109
+ * processing a single data item. Use this for aggregate analysis,
110
+ * statistical calculations, and cross-item comparisons.
111
+ */
112
+ itemResults: ExperimentItemResult<Input, ExpectedOutput, Metadata>[];
113
+ };
114
+ type RunEvaluator<Input = any, ExpectedOutput = any, Metadata extends Record<string, any> = Record<string, any>> = (params: RunEvaluatorParams<Input, ExpectedOutput, Metadata>) => Promise<Evaluation[] | Evaluation>;
115
+ type ExperimentParams<Input = any, ExpectedOutput = any, Metadata extends Record<string, any> = Record<string, any>> = {
116
+ /**
117
+ * Human-readable name for the experiment.
118
+ *
119
+ * This name will appear in Langfuse UI and experiment results.
120
+ * Choose a descriptive name that identifies the experiment's purpose.
121
+ */
122
+ name: string;
123
+ /**
124
+ * Optional description explaining the experiment's purpose.
125
+ *
126
+ * Provide context about what you're testing, methodology, or goals.
127
+ * This helps with experiment tracking and result interpretation.
128
+ */
129
+ description?: string;
130
+ /**
131
+ * Optional metadata to attach to the experiment run.
132
+ *
133
+ * Store additional context like model versions, hyperparameters,
134
+ * or any other relevant information for analysis and comparison.
135
+ */
136
+ metadata?: Record<string, any>;
137
+ /**
138
+ * Array of data items to process.
139
+ *
140
+ * Can be either custom ExperimentItem[] or DatasetItem[] from Langfuse.
141
+ * Each item should contain input data and optionally expected output.
142
+ */
143
+ data: ExperimentItem<Input, ExpectedOutput, Metadata>[];
144
+ /**
145
+ * The task function to execute on each data item.
146
+ *
147
+ * This function receives input data and produces output that will be evaluated.
148
+ * It should encapsulate the model or system being tested.
149
+ */
150
+ task: ExperimentTask<Input, ExpectedOutput, Metadata>;
151
+ /**
152
+ * Optional array of evaluator functions to assess each item's output.
153
+ *
154
+ * Each evaluator receives input, output, and expected output (if available)
155
+ * and returns evaluation results. Multiple evaluators enable comprehensive assessment.
156
+ */
157
+ evaluators?: Evaluator<Input, ExpectedOutput, Metadata>[];
158
+ /**
159
+ * Optional array of run-level evaluators to assess the entire experiment.
160
+ *
161
+ * These evaluators receive all item results and can perform aggregate analysis
162
+ * like calculating averages, detecting patterns, or statistical analysis.
163
+ */
164
+ runEvaluators?: RunEvaluator<Input, ExpectedOutput, Metadata>[];
165
+ /**
166
+ * Maximum number of concurrent task executions (default: Infinity).
167
+ *
168
+ * Controls parallelism to manage resource usage and API rate limits.
169
+ * Set lower values for expensive operations or rate-limited services.
170
+ */
171
+ maxConcurrency?: number;
172
+ };
173
+ type ExperimentItemResult<Input = any, ExpectedOutput = any, Metadata extends Record<string, any> = Record<string, any>> = Pick<ExperimentItem<Input, ExpectedOutput, Metadata>, "input" | "expectedOutput"> & {
174
+ /**
175
+ * The original experiment or dataset item that was processed.
176
+ *
177
+ * Contains the complete original item data including input, expected output,
178
+ * metadata, and any additional fields. Useful for accessing item-specific
179
+ * context or metadata in result analysis.
180
+ */
181
+ item: ExperimentItem<Input, ExpectedOutput, Metadata>;
182
+ /**
183
+ * The actual output produced by the task.
184
+ *
185
+ * This is the result returned by your task function for this specific input.
186
+ * It will be passed to evaluators for assessment against expected outputs.
187
+ */
188
+ output: any;
189
+ /**
190
+ * Results from all evaluators that ran on this item.
191
+ *
192
+ * Contains evaluation scores, comments, and metadata from each evaluator
193
+ * that successfully processed this item. Failed evaluators are excluded.
194
+ */
195
+ evaluations: Evaluation[];
196
+ /**
197
+ * Langfuse trace ID for this item's execution (for debugging and analysis).
198
+ *
199
+ * Use this ID to view detailed execution traces in the Langfuse UI,
200
+ * including timing, inputs, outputs, and any nested observations.
201
+ */
202
+ traceId?: string;
203
+ /**
204
+ * Dataset run ID if this item was part of a Langfuse dataset.
205
+ *
206
+ * Present only when running experiments on Langfuse datasets.
207
+ * Links this item result to a specific dataset run for tracking and comparison.
208
+ */
209
+ datasetRunId?: string;
210
+ };
211
+ /**
212
+ * Complete result of an experiment execution.
213
+ *
214
+ * Contains all results from processing the experiment data,
215
+ * including individual item results, run-level evaluations,
216
+ * and utilities for result visualization.
217
+ *
218
+ * @example Using experiment results
219
+ * ```typescript
220
+ * const result = await langfuse.experiment.run(config);
221
+ *
222
+ * // Access individual results
223
+ * console.log(`Processed ${result.itemResults.length} items`);
224
+ *
225
+ * // Check run-level evaluations
226
+ * const avgScore = result.runEvaluations.find(e => e.name === 'average_score');
227
+ * console.log(`Average score: ${avgScore?.value}`);
228
+ *
229
+ * // Print formatted results
230
+ * console.log(await result.prettyPrint());
231
+ *
232
+ * // Print summary with individual item results
233
+ * console.log(await result.prettyPrint({ includeItemResults: true }));
234
+ *
235
+ * // Link to dataset run (if available)
236
+ * if (result.datasetRunUrl) {
237
+ * console.log(`View in Langfuse: dataset run ${result.datasetRunUrl}`);
238
+ * }
239
+ * ```
240
+ *
241
+ * @public
242
+ */
243
+ type ExperimentResult<Input = any, ExpectedOutput = any, Metadata extends Record<string, any> = Record<string, any>> = {
244
+ /**
245
+ * ID of the dataset run in Langfuse (only for experiments on Langfuse datasets).
246
+ *
247
+ * Present only when running experiments on Langfuse datasets.
248
+ * Use this ID to access the dataset run via the Langfuse API or UI
249
+ * for detailed analysis and comparison with other runs.
250
+ */
251
+ datasetRunId?: string;
252
+ /**
253
+ * URL to the dataset run in the Langfuse UI (only for experiments on Langfuse datasets).
254
+ *
255
+ * Direct link to view the complete dataset run in the Langfuse web interface,
256
+ * including all experiment results, traces, and analytics. Provides easy access
257
+ * to detailed analysis and visualization of the experiment.
258
+ */
259
+ datasetRunUrl?: string;
260
+ /**
261
+ * Results from processing each individual data item.
262
+ *
263
+ * Contains the complete results for every item in your experiment data,
264
+ * including inputs, outputs, evaluations, and trace information.
265
+ * Use this for detailed analysis of individual item performance.
266
+ */
267
+ itemResults: ExperimentItemResult<Input, ExpectedOutput, Metadata>[];
268
+ /**
269
+ * Results from run-level evaluators that assessed the entire experiment.
270
+ *
271
+ * Contains aggregate evaluations that analyze the complete experiment,
272
+ * such as average scores, statistical measures, or overall quality assessments.
273
+ */
274
+ runEvaluations: Evaluation[];
275
+ /**
276
+ * Function to format and display experiment results in a human-readable format.
277
+ *
278
+ * Generates a comprehensive, nicely formatted summary including individual results,
279
+ * aggregate statistics, evaluation scores, and links to traces and dataset runs.
280
+ *
281
+ * @param options - Formatting options
282
+ * @param options.includeItemResults - Whether to include individual item details (default: false)
283
+ * @returns Promise resolving to formatted string representation
284
+ */
285
+ prettyPrint: (options?: {
286
+ includeItemResults?: boolean;
287
+ }) => Promise<string>;
288
+ };
289
+
290
+ /**
291
+ * Function type for running experiments on Langfuse datasets.
292
+ *
293
+ * This function type is attached to fetched datasets to enable convenient
294
+ * experiment execution directly on dataset objects.
295
+ *
296
+ * @param params - Experiment parameters excluding data (since data comes from the dataset)
297
+ * @returns Promise resolving to experiment results
298
+ *
299
+ * @example
300
+ * ```typescript
301
+ * const dataset = await langfuse.dataset.get("my-dataset");
302
+ * const result = await dataset.runExperiment({
303
+ * name: "Model Evaluation",
304
+ * task: myTask,
305
+ * evaluators: [myEvaluator]
306
+ * });
307
+ * ```
308
+ *
309
+ * @public
310
+ * @since 4.1.0
311
+ */
312
+ type RunExperimentOnDataset = (params: Omit<ExperimentParams<any, any, Record<string, any>>, "data">) => Promise<ExperimentResult<any, any, Record<string, any>>>;
313
+ /**
314
+ * Enhanced dataset object with additional methods for linking and experiments.
315
+ *
316
+ * This type extends the base Dataset with functionality for:
317
+ * - Linking dataset items to traces/observations
318
+ * - Running experiments directly on the dataset
319
+ *
320
+ * @example Working with a fetched dataset
321
+ * ```typescript
322
+ * const dataset = await langfuse.dataset.get("my-evaluation-dataset");
323
+ *
324
+ * // Access dataset metadata
325
+ * console.log(dataset.name, dataset.description);
326
+ *
327
+ * // Work with individual items
328
+ * for (const item of dataset.items) {
329
+ * console.log(item.input, item.expectedOutput);
330
+ *
331
+ * // Link item to a trace
332
+ * await item.link(myObservation, "experiment-run-1");
333
+ * }
334
+ *
335
+ * // Run experiments on the entire dataset
336
+ * const result = await dataset.runExperiment({
337
+ * name: "Model Comparison",
338
+ * task: myTask,
339
+ * evaluators: [accuracyEvaluator]
340
+ * });
341
+ * ```
342
+ *
343
+ * @public
344
+ * @since 4.1.0
345
+ */
346
+ type FetchedDataset = Dataset & {
347
+ /** Dataset items with additional linking functionality */
348
+ items: (DatasetItem & {
349
+ link: LinkDatasetItemFunction;
350
+ })[];
351
+ /** Function to run experiments directly on this dataset */
352
+ runExperiment: RunExperimentOnDataset;
353
+ };
5
354
  /**
6
355
  * Function type for linking dataset items to OpenTelemetry spans.
7
- * This allows dataset items to be associated with specific traces for experiment tracking.
8
356
  *
9
- * @param obj - Object containing the OpenTelemetry span
10
- * @param runName - Name of the dataset run
11
- * @param runArgs - Optional arguments for the dataset run
357
+ * This function creates a connection between a dataset item and a trace/observation,
358
+ * enabling tracking of which dataset items were used in which experiments or runs.
359
+ * This is essential for creating dataset runs and tracking experiment lineage.
360
+ *
361
+ * @param obj - Object containing the OpenTelemetry span to link to
362
+ * @param obj.otelSpan - The OpenTelemetry span from a Langfuse observation
363
+ * @param runName - Name of the experiment run for grouping related items
364
+ * @param runArgs - Optional configuration for the dataset run
365
+ * @param runArgs.description - Description of the experiment run
366
+ * @param runArgs.metadata - Additional metadata to attach to the run
12
367
  * @returns Promise that resolves to the created dataset run item
13
368
  *
369
+ * @example Basic linking
370
+ * ```typescript
371
+ * const dataset = await langfuse.dataset.get("my-dataset");
372
+ * const span = startObservation("my-task", { input: "test" });
373
+ * span.update({ output: "result" });
374
+ * span.end();
375
+ *
376
+ * // Link the dataset item to this execution
377
+ * await dataset.items[0].link(
378
+ * { otelSpan: span.otelSpan },
379
+ * "experiment-run-1"
380
+ * );
381
+ * ```
382
+ *
383
+ * @example Linking with metadata
384
+ * ```typescript
385
+ * await dataset.items[0].link(
386
+ * { otelSpan: span.otelSpan },
387
+ * "model-comparison-v2",
388
+ * {
389
+ * description: "Comparing GPT-4 vs Claude performance",
390
+ * metadata: {
391
+ * modelVersion: "gpt-4-1106-preview",
392
+ * temperature: 0.7,
393
+ * timestamp: new Date().toISOString()
394
+ * }
395
+ * }
396
+ * );
397
+ * ```
398
+ *
399
+ * @see {@link https://langfuse.com/docs/datasets} Langfuse datasets documentation
14
400
  * @public
401
+ * @since 4.0.0
15
402
  */
16
403
  type LinkDatasetItemFunction = (obj: {
17
404
  otelSpan: Span;
@@ -30,7 +417,7 @@ type LinkDatasetItemFunction = (obj: {
30
417
  * @public
31
418
  */
32
419
  declare class DatasetManager {
33
- private apiClient;
420
+ private langfuseClient;
34
421
  /**
35
422
  * Creates a new DatasetManager instance.
36
423
  *
@@ -38,44 +425,87 @@ declare class DatasetManager {
38
425
  * @internal
39
426
  */
40
427
  constructor(params: {
41
- apiClient: LangfuseAPIClient;
428
+ langfuseClient: LangfuseClient;
42
429
  });
43
430
  /**
44
- * Retrieves a dataset by name along with all its items.
431
+ * Retrieves a dataset by name with all its items and experiment functionality.
45
432
  *
46
- * This method automatically handles pagination to fetch all dataset items
47
- * and enhances each item with a `link` function for easy experiment tracking.
433
+ * This method fetches a dataset and all its associated items, with support
434
+ * for automatic pagination to handle large datasets efficiently. The returned
435
+ * dataset object includes enhanced functionality for linking items to traces
436
+ * and running experiments directly on the dataset.
48
437
  *
49
438
  * @param name - The name of the dataset to retrieve
50
- * @param options - Optional configuration for fetching
439
+ * @param options - Optional configuration for data fetching
51
440
  * @param options.fetchItemsPageSize - Number of items to fetch per page (default: 50)
441
+ * @returns Promise resolving to enhanced dataset with items, linking, and experiment capabilities
52
442
  *
53
- * @returns Promise that resolves to the dataset with enhanced items
443
+ * @example Basic dataset retrieval
444
+ * ```typescript
445
+ * const dataset = await langfuse.dataset.get("my-evaluation-dataset");
446
+ * console.log(`Dataset ${dataset.name} has ${dataset.items.length} items`);
54
447
  *
55
- * @example
448
+ * // Access dataset properties
449
+ * console.log(dataset.description);
450
+ * console.log(dataset.metadata);
451
+ * ```
452
+ *
453
+ * @example Working with dataset items
56
454
  * ```typescript
57
- * const dataset = await langfuse.dataset.get("my-dataset");
455
+ * const dataset = await langfuse.dataset.get("qa-dataset");
58
456
  *
59
457
  * for (const item of dataset.items) {
60
- * // Use the item data for your experiment
61
- * const result = await processItem(item.input);
62
- *
63
- * // Link the result to the dataset item
64
- * await item.link(
65
- * { otelSpan: currentSpan },
66
- * "experiment-run-1",
67
- * { description: "Testing new model" }
68
- * );
458
+ * console.log("Question:", item.input);
459
+ * console.log("Expected Answer:", item.expectedOutput);
460
+ *
461
+ * // Each item has a link function for connecting to traces
462
+ * // await item.link(span, "experiment-name");
69
463
  * }
70
464
  * ```
465
+ *
466
+ * @example Running experiments on datasets
467
+ * ```typescript
468
+ * const dataset = await langfuse.dataset.get("benchmark-dataset");
469
+ *
470
+ * const result = await dataset.runExperiment({
471
+ * name: "GPT-4 Benchmark",
472
+ * description: "Evaluating GPT-4 on our benchmark tasks",
473
+ * task: async ({ input }) => {
474
+ * const response = await openai.chat.completions.create({
475
+ * model: "gpt-4",
476
+ * messages: [{ role: "user", content: input }]
477
+ * });
478
+ * return response.choices[0].message.content;
479
+ * },
480
+ * evaluators: [
481
+ * async ({ output, expectedOutput }) => ({
482
+ * name: "exact_match",
483
+ * value: output === expectedOutput ? 1 : 0
484
+ * })
485
+ * ]
486
+ * });
487
+ *
488
+ * console.log(await result.prettyPrint());
489
+ * ```
490
+ *
491
+ * @example Handling large datasets
492
+ * ```typescript
493
+ * // For very large datasets, tune the per-request page size
494
+ * const largeDataset = await langfuse.dataset.get(
495
+ * "large-dataset",
496
+ * { fetchItemsPageSize: 100 }
497
+ * );
498
+ * ```
499
+ *
500
+ * @throws {Error} If the dataset does not exist or cannot be accessed
501
+ * @see {@link FetchedDataset} for the complete return type specification
502
+ * @see {@link RunExperimentOnDataset} for experiment execution details
503
+ * @public
504
+ * @since 4.0.0
71
505
  */
72
506
  get(name: string, options?: {
73
507
  fetchItemsPageSize: number;
74
- }): Promise<Dataset & {
75
- items: (DatasetItem & {
76
- link: LinkDatasetItemFunction;
77
- })[];
78
- }>;
508
+ }): Promise<FetchedDataset>;
79
509
  /**
80
510
  * Creates a link function for a specific dataset item.
81
511
  *
@@ -86,6 +516,250 @@ declare class DatasetManager {
86
516
  private createDatasetItemLinkFunction;
87
517
  }
88
518
 
519
+ /**
520
+ * Manages the execution and evaluation of experiments on datasets.
521
+ *
522
+ * The ExperimentManager provides a comprehensive framework for running experiments
523
+ * that test models or tasks against datasets, with support for automatic evaluation,
524
+ * and scoring.
525
+ *
526
+ * @example Basic experiment usage
527
+ * ```typescript
528
+ * const langfuse = new LangfuseClient();
529
+ *
530
+ * const result = await langfuse.experiment.run({
531
+ * name: "Capital Cities Test",
532
+ * description: "Testing model knowledge of world capitals",
533
+ * data: [
534
+ * { input: "France", expectedOutput: "Paris" },
535
+ * { input: "Germany", expectedOutput: "Berlin" }
536
+ * ],
537
+ * task: async ({ input }) => {
538
+ * const response = await openai.chat.completions.create({
539
+ * model: "gpt-4",
540
+ * messages: [{ role: "user", content: `What is the capital of ${input}?` }]
541
+ * });
542
+ * return response.choices[0].message.content;
543
+ * },
544
+ * evaluators: [
545
+ * async ({ input, output, expectedOutput }) => ({
546
+ * name: "exact_match",
547
+ * value: output === expectedOutput ? 1 : 0
548
+ * })
549
+ * ]
550
+ * });
551
+ *
552
+ * console.log(await result.prettyPrint());
553
+ * ```
554
+ *
555
+ * @example Using with Langfuse datasets
556
+ * ```typescript
557
+ * const dataset = await langfuse.dataset.get("my-dataset");
558
+ *
559
+ * const result = await dataset.runExperiment({
560
+ * name: "Model Comparison",
561
+ * task: myTask,
562
+ * evaluators: [myEvaluator],
563
+ * runEvaluators: [averageScoreEvaluator]
564
+ * });
565
+ * ```
566
+ *
567
+ * @public
568
+ */
569
+ declare class ExperimentManager {
570
+ private langfuseClient;
571
+ /**
572
+ * Creates a new ExperimentManager instance.
573
+ *
574
+ * @param params - Configuration object
575
+ * @param params.langfuseClient - The Langfuse client instance for API communication
576
+ * @internal
577
+ */
578
+ constructor(params: {
579
+ langfuseClient: LangfuseClient;
580
+ });
581
+ /**
582
+ * Gets the global logger instance for experiment-related logging.
583
+ *
584
+ * @returns The global logger instance
585
+ * @internal
586
+ */
587
+ get logger(): _langfuse_core.Logger;
588
+ /**
589
+ * Executes an experiment by running a task on each data item and evaluating the results.
590
+ *
591
+ * This method orchestrates the complete experiment lifecycle:
592
+ * 1. Executes the task function on each data item with proper tracing
593
+ * 2. Runs item-level evaluators on each task output
594
+ * 3. Executes run-level evaluators on the complete result set
595
+ * 4. Links results to dataset runs (for Langfuse datasets)
596
+ * 5. Stores all scores and traces in Langfuse
597
+ *
598
+ * @param config - The experiment configuration
599
+ * @param config.name - Human-readable name for the experiment
600
+ * @param config.description - Optional description of the experiment's purpose
601
+ * @param config.metadata - Optional metadata to attach to the experiment run
602
+ * @param config.data - Array of data items to process (ExperimentItem[] or DatasetItem[])
603
+ * @param config.task - Function that processes each data item and returns output
604
+ * @param config.evaluators - Optional array of functions to evaluate each item's output
605
+ * @param config.runEvaluators - Optional array of functions to evaluate the entire run
606
+ * @param config.maxConcurrency - Maximum number of concurrent task executions (default: Infinity)
607
+ *
608
+ * @returns Promise that resolves to experiment results including:
609
+ * - itemResults: Results for each processed data item
610
+ * - runEvaluations: Results from run-level evaluators
611
+ * - datasetRunId: ID of the dataset run (if using Langfuse datasets)
612
+ * - prettyPrint: Function to format and display results
613
+ *
614
+ * @throws {Error} When task execution fails and cannot be handled gracefully
615
+ * @throws {Error} When required evaluators fail critically
616
+ *
617
+ * @example Simple experiment
618
+ * ```typescript
619
+ * const result = await langfuse.experiment.run({
620
+ * name: "Translation Quality Test",
621
+ * data: [
622
+ * { input: "Hello world", expectedOutput: "Hola mundo" },
623
+ * { input: "Good morning", expectedOutput: "Buenos días" }
624
+ * ],
625
+ * task: async ({ input }) => translateText(input, 'es'),
626
+ * evaluators: [
627
+ * async ({ output, expectedOutput }) => ({
628
+ * name: "bleu_score",
629
+ * value: calculateBleuScore(output, expectedOutput)
630
+ * })
631
+ * ]
632
+ * });
633
+ * ```
634
+ *
635
+ * @example Experiment with concurrency control
636
+ * ```typescript
637
+ * const result = await langfuse.experiment.run({
638
+ * name: "Large Scale Evaluation",
639
+ * data: largeBatchOfItems,
640
+ * task: expensiveModelCall,
641
+ * maxConcurrency: 5, // Process max 5 items simultaneously
642
+ * evaluators: [myEvaluator],
643
+ * runEvaluators: [
644
+ * async ({ itemResults }) => ({
645
+ * name: "average_score",
646
+ * value: itemResults.reduce((acc, r) => acc + r.evaluations[0].value, 0) / itemResults.length
647
+ * })
648
+ * ]
649
+ * });
650
+ * ```
651
+ *
652
+ * @see {@link ExperimentParams} for detailed parameter documentation
653
+ * @see {@link ExperimentResult} for detailed return value documentation
654
+ * @see {@link Evaluator} for evaluator function specifications
655
+ * @see {@link RunEvaluator} for run evaluator function specifications
656
+ *
657
+ * @public
658
+ */
659
+ run<Input = any, ExpectedOutput = any, Metadata extends Record<string, any> = Record<string, any>>(config: ExperimentParams<Input, ExpectedOutput, Metadata>): Promise<ExperimentResult<Input, ExpectedOutput, Metadata>>;
660
+ /**
661
+ * Executes the task and evaluators for a single data item.
662
+ *
663
+ * This method handles the complete processing pipeline for one data item:
664
+ * 1. Executes the task within a traced observation span
665
+ * 2. Links the result to a dataset run (if applicable)
666
+ * 3. Runs all item-level evaluators on the output
667
+ * 4. Stores evaluation scores in Langfuse
668
+ * 5. Handles errors gracefully by continuing with remaining evaluators
669
+ *
670
+ * @param params - Parameters for item execution
671
+ * @param params.experimentName - Name of the parent experiment
672
+ * @param params.experimentDescription - Description of the parent experiment
673
+ * @param params.experimentMetadata - Metadata for the parent experiment
674
+ * @param params.item - The data item to process
675
+ * @param params.task - The task function to execute
676
+ * @param params.evaluators - Optional evaluators to run on the output
677
+ *
678
+ * @returns Promise resolving to the item result with output, evaluations, and trace info
679
+ *
680
+ * @throws {Error} When task execution fails (propagated from task function)
681
+ *
682
+ * @internal
683
+ */
684
+ private runItem;
685
+ /**
686
+ * Formats experiment results into a human-readable string representation.
687
+ *
688
+ * Creates a comprehensive, nicely formatted summary of the experiment including:
689
+ * - Individual item results with inputs, outputs, expected values, and scores
690
+ * - Dataset item and trace links (when available)
691
+ * - Experiment overview with aggregate statistics
692
+ * - Average scores across all evaluations
693
+ * - Run-level evaluation results
694
+ * - Links to dataset runs in the Langfuse UI
695
+ *
696
+ * @param params - Formatting parameters
697
+ * @param params.datasetRunUrl - Optional URL to the dataset run in Langfuse UI
698
+ * @param params.itemResults - Results from processing each data item
699
+ * @param params.originalData - The original input data items
700
+ * @param params.runEvaluations - Results from run-level evaluators
701
+ * @param params.name - Name of the experiment
702
+ * @param params.description - Optional description of the experiment
703
+ * @param params.includeItemResults - Whether to include individual item details (default: false)
704
+ *
705
+ * @returns Promise resolving to formatted string representation
706
+ *
707
+ * @example Output format
708
+ * ```
709
+ * 1. Item 1:
710
+ * Input: What is the capital of France?
711
+ * Expected: Paris
712
+ * Actual: Paris
713
+ * Scores:
714
+ * • exact_match: 1.000
715
+ * • similarity: 0.95
716
+ * 💭 Very close match with expected output
717
+ *
718
+ * Dataset Item:
719
+ * https://cloud.langfuse.com/project/123/datasets/456/items/789
720
+ *
721
+ * Trace:
722
+ * https://cloud.langfuse.com/project/123/traces/abc123
723
+ *
724
+ * ──────────────────────────────────────────────────
725
+ * 📊 Translation Quality Test - Testing model accuracy
726
+ * 2 items
727
+ * Evaluations:
728
+ * • exact_match
729
+ * • similarity
730
+ *
731
+ * Average Scores:
732
+ * • exact_match: 0.850
733
+ * • similarity: 0.923
734
+ *
735
+ * Run Evaluations:
736
+ * • overall_quality: 0.887
737
+ * 💭 Good performance with room for improvement
738
+ *
739
+ * 🔗 Dataset Run:
740
+ * https://cloud.langfuse.com/project/123/datasets/456/runs/def456
741
+ * ```
742
+ *
743
+ * @internal
744
+ */
745
+ private prettyPrintResults;
746
+ /**
747
+ * Formats a value for display in pretty-printed output.
748
+ *
749
+ * Handles different value types appropriately:
750
+ * - Strings: Truncates long strings to 50 characters with "..."
751
+ * - Objects/Arrays: Converts to JSON string representation
752
+ * - Primitives: Uses toString() representation
753
+ *
754
+ * @param value - The value to format
755
+ * @returns Formatted string representation suitable for display
756
+ *
757
+ * @internal
758
+ */
759
+ private formatValue;
760
+ private isOtelRegistered;
761
+ }
762
+
89
763
  /**
90
764
  * Parameters for resolving media references in objects.
91
765
  *
@@ -793,6 +1467,61 @@ declare class LangfuseClient {
793
1467
  * Manager for media upload and reference resolution.
794
1468
  */
795
1469
  media: MediaManager;
1470
+ /**
1471
+ * Manager for running experiments on datasets and data items.
1472
+ *
1473
+ * The experiment manager provides comprehensive functionality for:
1474
+ * - Running tasks on datasets or custom data arrays
1475
+ * - Evaluating outputs with custom or pre-built evaluators
1476
+ * - Tracking experiment runs with automatic tracing
1477
+ * - Generating formatted result summaries
1478
+ * - Integrating with AutoEvals library evaluators
1479
+ *
1480
+ * @example Basic experiment execution
1481
+ * ```typescript
1482
+ * const langfuse = new LangfuseClient();
1483
+ *
1484
+ * const result = await langfuse.experiment.run({
1485
+ * name: "Model Evaluation",
1486
+ * description: "Testing model performance on Q&A tasks",
1487
+ * data: [
1488
+ * { input: "What is 2+2?", expectedOutput: "4" },
1489
+ * { input: "What is the capital of France?", expectedOutput: "Paris" }
1490
+ * ],
1491
+ * task: async ({ input }) => {
1492
+ * // Your model/task implementation
1493
+ * const response = await myModel.generate(input);
1494
+ * return response;
1495
+ * },
1496
+ * evaluators: [
1497
+ * async ({ output, expectedOutput }) => ({
1498
+ * name: "exact_match",
1499
+ * value: output.trim().toLowerCase() === expectedOutput.toLowerCase() ? 1 : 0
1500
+ * })
1501
+ * ]
1502
+ * });
1503
+ *
1504
+ * console.log(await result.prettyPrint());
1505
+ * ```
1506
+ *
1507
+ * @example Using with datasets
1508
+ * ```typescript
1509
+ * const dataset = await langfuse.dataset.get("my-test-dataset");
1510
+ * const result = await dataset.runExperiment({
1511
+ * name: "Production Readiness Test",
1512
+ * task: myTask,
1513
+ * evaluators: [accuracyEvaluator, latencyEvaluator],
1514
+ * runEvaluators: [overallQualityEvaluator]
1515
+ * });
1516
+ * ```
1517
+ *
1518
+ * @see {@link ExperimentManager} for detailed API documentation
1519
+ * @see {@link ExperimentParams} for configuration options
1520
+ * @see {@link ExperimentResult} for result structure
1521
+ * @public
1522
+ * @since 4.1.0
1523
+ */
1524
+ experiment: ExperimentManager;
796
1525
  private baseUrl;
797
1526
  private projectId;
798
1527
  /**
@@ -926,4 +1655,72 @@ declare class LangfuseClient {
926
1655
  getTraceUrl(traceId: string): Promise<string>;
927
1656
  }
928
1657
 
929
- export { type ChatMessageOrPlaceholder, ChatMessageType, ChatPromptClient, type CreateChatPromptBodyWithPlaceholders, DatasetManager, type LangchainMessagesPlaceholder, LangfuseClient, type LangfuseClientParams, type LangfuseMediaResolveMediaReferencesParams, type LinkDatasetItemFunction, MediaManager, PromptManager, ScoreManager, TextPromptClient };
1658
+ /**
1659
+ * Converts an AutoEvals evaluator to a Langfuse-compatible evaluator function.
1660
+ *
1661
+ * This adapter function bridges the gap between AutoEvals library evaluators
1662
+ * and Langfuse experiment evaluators, handling parameter mapping and result
1663
+ * formatting automatically.
1664
+ *
1665
+ * AutoEvals evaluators expect `input`, `output`, and `expected` parameters,
1666
+ * while Langfuse evaluators use `input`, `output`, and `expectedOutput`.
1667
+ * This function handles the parameter name mapping.
1668
+ *
1669
+ * @template E - Type of the AutoEvals evaluator function
1670
+ * @param autoevalEvaluator - The AutoEvals evaluator function to convert
1671
+ * @param params - Optional additional parameters to pass to the AutoEvals evaluator
1672
+ * @returns A Langfuse-compatible evaluator function
1673
+ *
1674
+ * @example Basic usage with AutoEvals
1675
+ * ```typescript
1676
+ * import { Factuality, Levenshtein } from 'autoevals';
1677
+ * import { autoevalsToLangfuseEvaluator } from '@langfuse/client';
1678
+ *
1679
+ * const factualityEvaluator = autoevalsToLangfuseEvaluator(Factuality);
1680
+ * const levenshteinEvaluator = autoevalsToLangfuseEvaluator(Levenshtein);
1681
+ *
1682
+ * await langfuse.experiment.run({
1683
+ * name: "AutoEvals Integration Test",
1684
+ * data: myDataset,
1685
+ * task: myTask,
1686
+ * evaluators: [factualityEvaluator, levenshteinEvaluator]
1687
+ * });
1688
+ * ```
1689
+ *
1690
+ * @example Using with additional parameters
1691
+ * ```typescript
1692
+ * import { Factuality } from 'autoevals';
1693
+ *
1694
+ * const factualityEvaluator = autoevalsToLangfuseEvaluator(
1695
+ * Factuality,
1696
+ * { model: 'gpt-4o' } // Additional params for AutoEvals
1697
+ * );
1698
+ *
1699
+ * await langfuse.experiment.run({
1700
+ * name: "Factuality Test",
1701
+ * data: myDataset,
1702
+ * task: myTask,
1703
+ * evaluators: [factualityEvaluator]
1704
+ * });
1705
+ * ```
1706
+ *
1707
+ * @see {@link https://github.com/braintrustdata/autoevals} AutoEvals library documentation
1708
+ * @see {@link Evaluator} for Langfuse evaluator specifications
1709
+ *
1710
+ * @public
1711
+ * @since 4.0.0
1712
+ */
1713
+ declare function autoevalsToLangfuseEvaluator<E extends CallableFunction>(autoevalEvaluator: E, params?: Params<E>): Evaluator;
1714
+ /**
1715
+ * Utility type to extract parameter types from AutoEvals evaluator functions.
1716
+ *
1717
+ * This type helper extracts the parameter type from an AutoEvals evaluator
1718
+ * and omits the standard parameters (input, output, expected) that are
1719
+ * handled by the adapter, leaving only the additional configuration parameters.
1720
+ *
1721
+ * @template E - The AutoEvals evaluator function type
1722
+ * @internal
1723
+ */
1724
+ type Params<E> = Parameters<E extends (...args: any[]) => any ? E : never>[0] extends infer P ? Omit<P, "input" | "output" | "expected"> : never;
1725
+
1726
+ export { type ChatMessageOrPlaceholder, ChatMessageType, ChatPromptClient, type CreateChatPromptBodyWithPlaceholders, DatasetManager, type Evaluation, type Evaluator, type EvaluatorParams, type ExperimentItem, type ExperimentItemResult, ExperimentManager, type ExperimentParams, type ExperimentResult, type ExperimentTask, type ExperimentTaskParams, type FetchedDataset, type LangchainMessagesPlaceholder, LangfuseClient, type LangfuseClientParams, type LangfuseMediaResolveMediaReferencesParams, type LinkDatasetItemFunction, MediaManager, PromptManager, type RunEvaluator, type RunEvaluatorParams, type RunExperimentOnDataset, ScoreManager, TextPromptClient, autoevalsToLangfuseEvaluator };